xref: /openbmc/linux/fs/btrfs/extent-tree.c (revision cdfce539)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "raid56.h"
35 #include "locking.h"
36 #include "free-space-cache.h"
37 #include "math.h"
38 
39 #undef SCRAMBLE_DELAYED_REFS
40 
41 /*
42  * control flags for do_chunk_alloc's force field
43  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
44  * if we really need one.
45  *
46  * CHUNK_ALLOC_LIMITED means to only try and allocate one
47  * if we have very few chunks already allocated.  This is
48  * used as part of the clustering code to help make sure
49  * we have a good pool of storage to cluster in, without
50  * filling the FS with empty chunks
51  *
52  * CHUNK_ALLOC_FORCE means it must try to allocate one
53  *
54  */
55 enum {
56 	CHUNK_ALLOC_NO_FORCE = 0,
57 	CHUNK_ALLOC_LIMITED = 1,
58 	CHUNK_ALLOC_FORCE = 2,
59 };
60 
61 /*
62  * Control how reservations are dealt with.
63  *
64  * RESERVE_FREE - freeing a reservation.
65  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
66  *   ENOSPC accounting
67  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
68  *   bytes_may_use as the ENOSPC accounting is done elsewhere
69  */
70 enum {
71 	RESERVE_FREE = 0,
72 	RESERVE_ALLOC = 1,
73 	RESERVE_ALLOC_NO_ACCOUNT = 2,
74 };
75 
76 static int update_block_group(struct btrfs_root *root,
77 			      u64 bytenr, u64 num_bytes, int alloc);
78 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79 				struct btrfs_root *root,
80 				u64 bytenr, u64 num_bytes, u64 parent,
81 				u64 root_objectid, u64 owner_objectid,
82 				u64 owner_offset, int refs_to_drop,
83 				struct btrfs_delayed_extent_op *extra_op);
84 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
85 				    struct extent_buffer *leaf,
86 				    struct btrfs_extent_item *ei);
87 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
88 				      struct btrfs_root *root,
89 				      u64 parent, u64 root_objectid,
90 				      u64 flags, u64 owner, u64 offset,
91 				      struct btrfs_key *ins, int ref_mod);
92 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
93 				     struct btrfs_root *root,
94 				     u64 parent, u64 root_objectid,
95 				     u64 flags, struct btrfs_disk_key *key,
96 				     int level, struct btrfs_key *ins);
97 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
98 			  struct btrfs_root *extent_root, u64 flags,
99 			  int force);
100 static int find_next_key(struct btrfs_path *path, int level,
101 			 struct btrfs_key *key);
102 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103 			    int dump_block_groups);
104 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105 				       u64 num_bytes, int reserve);
106 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107 			       u64 num_bytes);
108 int btrfs_pin_extent(struct btrfs_root *root,
109 		     u64 bytenr, u64 num_bytes, int reserved);
110 
111 static noinline int
112 block_group_cache_done(struct btrfs_block_group_cache *cache)
113 {
114 	smp_mb();
115 	return cache->cached == BTRFS_CACHE_FINISHED;
116 }
117 
118 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
119 {
120 	return (cache->flags & bits) == bits;
121 }
122 
123 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
124 {
125 	atomic_inc(&cache->count);
126 }
127 
128 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
129 {
130 	if (atomic_dec_and_test(&cache->count)) {
131 		WARN_ON(cache->pinned > 0);
132 		WARN_ON(cache->reserved > 0);
133 		kfree(cache->free_space_ctl);
134 		kfree(cache);
135 	}
136 }
137 
138 /*
139  * this adds the block group to the fs_info rb tree for the block group
140  * cache
141  */
142 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
143 				struct btrfs_block_group_cache *block_group)
144 {
145 	struct rb_node **p;
146 	struct rb_node *parent = NULL;
147 	struct btrfs_block_group_cache *cache;
148 
149 	spin_lock(&info->block_group_cache_lock);
150 	p = &info->block_group_cache_tree.rb_node;
151 
152 	while (*p) {
153 		parent = *p;
154 		cache = rb_entry(parent, struct btrfs_block_group_cache,
155 				 cache_node);
156 		if (block_group->key.objectid < cache->key.objectid) {
157 			p = &(*p)->rb_left;
158 		} else if (block_group->key.objectid > cache->key.objectid) {
159 			p = &(*p)->rb_right;
160 		} else {
161 			spin_unlock(&info->block_group_cache_lock);
162 			return -EEXIST;
163 		}
164 	}
165 
166 	rb_link_node(&block_group->cache_node, parent, p);
167 	rb_insert_color(&block_group->cache_node,
168 			&info->block_group_cache_tree);
169 
170 	if (info->first_logical_byte > block_group->key.objectid)
171 		info->first_logical_byte = block_group->key.objectid;
172 
173 	spin_unlock(&info->block_group_cache_lock);
174 
175 	return 0;
176 }
177 
178 /*
179  * This will return the block group at or after bytenr if contains is 0, else
180  * it will return the block group that contains the bytenr
181  */
182 static struct btrfs_block_group_cache *
183 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
184 			      int contains)
185 {
186 	struct btrfs_block_group_cache *cache, *ret = NULL;
187 	struct rb_node *n;
188 	u64 end, start;
189 
190 	spin_lock(&info->block_group_cache_lock);
191 	n = info->block_group_cache_tree.rb_node;
192 
193 	while (n) {
194 		cache = rb_entry(n, struct btrfs_block_group_cache,
195 				 cache_node);
196 		end = cache->key.objectid + cache->key.offset - 1;
197 		start = cache->key.objectid;
198 
199 		if (bytenr < start) {
200 			if (!contains && (!ret || start < ret->key.objectid))
201 				ret = cache;
202 			n = n->rb_left;
203 		} else if (bytenr > start) {
204 			if (contains && bytenr <= end) {
205 				ret = cache;
206 				break;
207 			}
208 			n = n->rb_right;
209 		} else {
210 			ret = cache;
211 			break;
212 		}
213 	}
214 	if (ret) {
215 		btrfs_get_block_group(ret);
216 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
217 			info->first_logical_byte = ret->key.objectid;
218 	}
219 	spin_unlock(&info->block_group_cache_lock);
220 
221 	return ret;
222 }
223 
224 static int add_excluded_extent(struct btrfs_root *root,
225 			       u64 start, u64 num_bytes)
226 {
227 	u64 end = start + num_bytes - 1;
228 	set_extent_bits(&root->fs_info->freed_extents[0],
229 			start, end, EXTENT_UPTODATE, GFP_NOFS);
230 	set_extent_bits(&root->fs_info->freed_extents[1],
231 			start, end, EXTENT_UPTODATE, GFP_NOFS);
232 	return 0;
233 }
234 
235 static void free_excluded_extents(struct btrfs_root *root,
236 				  struct btrfs_block_group_cache *cache)
237 {
238 	u64 start, end;
239 
240 	start = cache->key.objectid;
241 	end = start + cache->key.offset - 1;
242 
243 	clear_extent_bits(&root->fs_info->freed_extents[0],
244 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
245 	clear_extent_bits(&root->fs_info->freed_extents[1],
246 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
247 }
248 
249 static int exclude_super_stripes(struct btrfs_root *root,
250 				 struct btrfs_block_group_cache *cache)
251 {
252 	u64 bytenr;
253 	u64 *logical;
254 	int stripe_len;
255 	int i, nr, ret;
256 
257 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
258 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
259 		cache->bytes_super += stripe_len;
260 		ret = add_excluded_extent(root, cache->key.objectid,
261 					  stripe_len);
262 		if (ret)
263 			return ret;
264 	}
265 
266 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
267 		bytenr = btrfs_sb_offset(i);
268 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
269 				       cache->key.objectid, bytenr,
270 				       0, &logical, &nr, &stripe_len);
271 		if (ret)
272 			return ret;
273 
274 		while (nr--) {
275 			u64 start, len;
276 
277 			if (logical[nr] > cache->key.objectid +
278 			    cache->key.offset)
279 				continue;
280 
281 			if (logical[nr] + stripe_len <= cache->key.objectid)
282 				continue;
283 
284 			start = logical[nr];
285 			if (start < cache->key.objectid) {
286 				start = cache->key.objectid;
287 				len = (logical[nr] + stripe_len) - start;
288 			} else {
289 				len = min_t(u64, stripe_len,
290 					    cache->key.objectid +
291 					    cache->key.offset - start);
292 			}
293 
294 			cache->bytes_super += len;
295 			ret = add_excluded_extent(root, start, len);
296 			if (ret) {
297 				kfree(logical);
298 				return ret;
299 			}
300 		}
301 
302 		kfree(logical);
303 	}
304 	return 0;
305 }
306 
307 static struct btrfs_caching_control *
308 get_caching_control(struct btrfs_block_group_cache *cache)
309 {
310 	struct btrfs_caching_control *ctl;
311 
312 	spin_lock(&cache->lock);
313 	if (cache->cached != BTRFS_CACHE_STARTED) {
314 		spin_unlock(&cache->lock);
315 		return NULL;
316 	}
317 
318 	/* We're loading it the fast way, so we don't have a caching_ctl. */
319 	if (!cache->caching_ctl) {
320 		spin_unlock(&cache->lock);
321 		return NULL;
322 	}
323 
324 	ctl = cache->caching_ctl;
325 	atomic_inc(&ctl->count);
326 	spin_unlock(&cache->lock);
327 	return ctl;
328 }
329 
330 static void put_caching_control(struct btrfs_caching_control *ctl)
331 {
332 	if (atomic_dec_and_test(&ctl->count))
333 		kfree(ctl);
334 }
335 
336 /*
337  * this is only called by cache_block_group, since we could have freed extents
338  * we need to check the pinned_extents for any extents that can't be used yet
339  * since their free space will be released as soon as the transaction commits.
340  */
341 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
342 			      struct btrfs_fs_info *info, u64 start, u64 end)
343 {
344 	u64 extent_start, extent_end, size, total_added = 0;
345 	int ret;
346 
347 	while (start < end) {
348 		ret = find_first_extent_bit(info->pinned_extents, start,
349 					    &extent_start, &extent_end,
350 					    EXTENT_DIRTY | EXTENT_UPTODATE,
351 					    NULL);
352 		if (ret)
353 			break;
354 
355 		if (extent_start <= start) {
356 			start = extent_end + 1;
357 		} else if (extent_start > start && extent_start < end) {
358 			size = extent_start - start;
359 			total_added += size;
360 			ret = btrfs_add_free_space(block_group, start,
361 						   size);
362 			BUG_ON(ret); /* -ENOMEM or logic error */
363 			start = extent_end + 1;
364 		} else {
365 			break;
366 		}
367 	}
368 
369 	if (start < end) {
370 		size = end - start;
371 		total_added += size;
372 		ret = btrfs_add_free_space(block_group, start, size);
373 		BUG_ON(ret); /* -ENOMEM or logic error */
374 	}
375 
376 	return total_added;
377 }
378 
379 static noinline void caching_thread(struct btrfs_work *work)
380 {
381 	struct btrfs_block_group_cache *block_group;
382 	struct btrfs_fs_info *fs_info;
383 	struct btrfs_caching_control *caching_ctl;
384 	struct btrfs_root *extent_root;
385 	struct btrfs_path *path;
386 	struct extent_buffer *leaf;
387 	struct btrfs_key key;
388 	u64 total_found = 0;
389 	u64 last = 0;
390 	u32 nritems;
391 	int ret = 0;
392 
393 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
394 	block_group = caching_ctl->block_group;
395 	fs_info = block_group->fs_info;
396 	extent_root = fs_info->extent_root;
397 
398 	path = btrfs_alloc_path();
399 	if (!path)
400 		goto out;
401 
402 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
403 
404 	/*
405 	 * We don't want to deadlock with somebody trying to allocate a new
406 	 * extent for the extent root while also trying to search the extent
407 	 * root to add free space.  So we skip locking and search the commit
408 	 * root, since its read-only
409 	 */
410 	path->skip_locking = 1;
411 	path->search_commit_root = 1;
412 	path->reada = 1;
413 
414 	key.objectid = last;
415 	key.offset = 0;
416 	key.type = BTRFS_EXTENT_ITEM_KEY;
417 again:
418 	mutex_lock(&caching_ctl->mutex);
419 	/* need to make sure the commit_root doesn't disappear */
420 	down_read(&fs_info->extent_commit_sem);
421 
422 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
423 	if (ret < 0)
424 		goto err;
425 
426 	leaf = path->nodes[0];
427 	nritems = btrfs_header_nritems(leaf);
428 
429 	while (1) {
430 		if (btrfs_fs_closing(fs_info) > 1) {
431 			last = (u64)-1;
432 			break;
433 		}
434 
435 		if (path->slots[0] < nritems) {
436 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
437 		} else {
438 			ret = find_next_key(path, 0, &key);
439 			if (ret)
440 				break;
441 
442 			if (need_resched()) {
443 				caching_ctl->progress = last;
444 				btrfs_release_path(path);
445 				up_read(&fs_info->extent_commit_sem);
446 				mutex_unlock(&caching_ctl->mutex);
447 				cond_resched();
448 				goto again;
449 			}
450 
451 			ret = btrfs_next_leaf(extent_root, path);
452 			if (ret < 0)
453 				goto err;
454 			if (ret)
455 				break;
456 			leaf = path->nodes[0];
457 			nritems = btrfs_header_nritems(leaf);
458 			continue;
459 		}
460 
461 		if (key.objectid < block_group->key.objectid) {
462 			path->slots[0]++;
463 			continue;
464 		}
465 
466 		if (key.objectid >= block_group->key.objectid +
467 		    block_group->key.offset)
468 			break;
469 
470 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
471 		    key.type == BTRFS_METADATA_ITEM_KEY) {
472 			total_found += add_new_free_space(block_group,
473 							  fs_info, last,
474 							  key.objectid);
475 			if (key.type == BTRFS_METADATA_ITEM_KEY)
476 				last = key.objectid +
477 					fs_info->tree_root->leafsize;
478 			else
479 				last = key.objectid + key.offset;
480 
481 			if (total_found > (1024 * 1024 * 2)) {
482 				total_found = 0;
483 				wake_up(&caching_ctl->wait);
484 			}
485 		}
486 		path->slots[0]++;
487 	}
488 	ret = 0;
489 
490 	total_found += add_new_free_space(block_group, fs_info, last,
491 					  block_group->key.objectid +
492 					  block_group->key.offset);
493 	caching_ctl->progress = (u64)-1;
494 
495 	spin_lock(&block_group->lock);
496 	block_group->caching_ctl = NULL;
497 	block_group->cached = BTRFS_CACHE_FINISHED;
498 	spin_unlock(&block_group->lock);
499 
500 err:
501 	btrfs_free_path(path);
502 	up_read(&fs_info->extent_commit_sem);
503 
504 	free_excluded_extents(extent_root, block_group);
505 
506 	mutex_unlock(&caching_ctl->mutex);
507 out:
508 	wake_up(&caching_ctl->wait);
509 
510 	put_caching_control(caching_ctl);
511 	btrfs_put_block_group(block_group);
512 }
513 
514 static int cache_block_group(struct btrfs_block_group_cache *cache,
515 			     int load_cache_only)
516 {
517 	DEFINE_WAIT(wait);
518 	struct btrfs_fs_info *fs_info = cache->fs_info;
519 	struct btrfs_caching_control *caching_ctl;
520 	int ret = 0;
521 
522 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
523 	if (!caching_ctl)
524 		return -ENOMEM;
525 
526 	INIT_LIST_HEAD(&caching_ctl->list);
527 	mutex_init(&caching_ctl->mutex);
528 	init_waitqueue_head(&caching_ctl->wait);
529 	caching_ctl->block_group = cache;
530 	caching_ctl->progress = cache->key.objectid;
531 	atomic_set(&caching_ctl->count, 1);
532 	caching_ctl->work.func = caching_thread;
533 
534 	spin_lock(&cache->lock);
535 	/*
536 	 * This should be a rare occasion, but this could happen I think in the
537 	 * case where one thread starts to load the space cache info, and then
538 	 * some other thread starts a transaction commit which tries to do an
539 	 * allocation while the other thread is still loading the space cache
540 	 * info.  The previous loop should have kept us from choosing this block
541 	 * group, but if we've moved to the state where we will wait on caching
542 	 * block groups we need to first check if we're doing a fast load here,
543 	 * so we can wait for it to finish, otherwise we could end up allocating
544 	 * from a block group who's cache gets evicted for one reason or
545 	 * another.
546 	 */
547 	while (cache->cached == BTRFS_CACHE_FAST) {
548 		struct btrfs_caching_control *ctl;
549 
550 		ctl = cache->caching_ctl;
551 		atomic_inc(&ctl->count);
552 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
553 		spin_unlock(&cache->lock);
554 
555 		schedule();
556 
557 		finish_wait(&ctl->wait, &wait);
558 		put_caching_control(ctl);
559 		spin_lock(&cache->lock);
560 	}
561 
562 	if (cache->cached != BTRFS_CACHE_NO) {
563 		spin_unlock(&cache->lock);
564 		kfree(caching_ctl);
565 		return 0;
566 	}
567 	WARN_ON(cache->caching_ctl);
568 	cache->caching_ctl = caching_ctl;
569 	cache->cached = BTRFS_CACHE_FAST;
570 	spin_unlock(&cache->lock);
571 
572 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
573 		ret = load_free_space_cache(fs_info, cache);
574 
575 		spin_lock(&cache->lock);
576 		if (ret == 1) {
577 			cache->caching_ctl = NULL;
578 			cache->cached = BTRFS_CACHE_FINISHED;
579 			cache->last_byte_to_unpin = (u64)-1;
580 		} else {
581 			if (load_cache_only) {
582 				cache->caching_ctl = NULL;
583 				cache->cached = BTRFS_CACHE_NO;
584 			} else {
585 				cache->cached = BTRFS_CACHE_STARTED;
586 			}
587 		}
588 		spin_unlock(&cache->lock);
589 		wake_up(&caching_ctl->wait);
590 		if (ret == 1) {
591 			put_caching_control(caching_ctl);
592 			free_excluded_extents(fs_info->extent_root, cache);
593 			return 0;
594 		}
595 	} else {
596 		/*
597 		 * We are not going to do the fast caching, set cached to the
598 		 * appropriate value and wakeup any waiters.
599 		 */
600 		spin_lock(&cache->lock);
601 		if (load_cache_only) {
602 			cache->caching_ctl = NULL;
603 			cache->cached = BTRFS_CACHE_NO;
604 		} else {
605 			cache->cached = BTRFS_CACHE_STARTED;
606 		}
607 		spin_unlock(&cache->lock);
608 		wake_up(&caching_ctl->wait);
609 	}
610 
611 	if (load_cache_only) {
612 		put_caching_control(caching_ctl);
613 		return 0;
614 	}
615 
616 	down_write(&fs_info->extent_commit_sem);
617 	atomic_inc(&caching_ctl->count);
618 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
619 	up_write(&fs_info->extent_commit_sem);
620 
621 	btrfs_get_block_group(cache);
622 
623 	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
624 
625 	return ret;
626 }
627 
628 /*
629  * return the block group that starts at or after bytenr
630  */
631 static struct btrfs_block_group_cache *
632 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
633 {
634 	struct btrfs_block_group_cache *cache;
635 
636 	cache = block_group_cache_tree_search(info, bytenr, 0);
637 
638 	return cache;
639 }
640 
641 /*
642  * return the block group that contains the given bytenr
643  */
644 struct btrfs_block_group_cache *btrfs_lookup_block_group(
645 						 struct btrfs_fs_info *info,
646 						 u64 bytenr)
647 {
648 	struct btrfs_block_group_cache *cache;
649 
650 	cache = block_group_cache_tree_search(info, bytenr, 1);
651 
652 	return cache;
653 }
654 
655 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
656 						  u64 flags)
657 {
658 	struct list_head *head = &info->space_info;
659 	struct btrfs_space_info *found;
660 
661 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
662 
663 	rcu_read_lock();
664 	list_for_each_entry_rcu(found, head, list) {
665 		if (found->flags & flags) {
666 			rcu_read_unlock();
667 			return found;
668 		}
669 	}
670 	rcu_read_unlock();
671 	return NULL;
672 }
673 
674 /*
675  * after adding space to the filesystem, we need to clear the full flags
676  * on all the space infos.
677  */
678 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
679 {
680 	struct list_head *head = &info->space_info;
681 	struct btrfs_space_info *found;
682 
683 	rcu_read_lock();
684 	list_for_each_entry_rcu(found, head, list)
685 		found->full = 0;
686 	rcu_read_unlock();
687 }
688 
689 /* simple helper to search for an existing extent at a given offset */
690 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
691 {
692 	int ret;
693 	struct btrfs_key key;
694 	struct btrfs_path *path;
695 
696 	path = btrfs_alloc_path();
697 	if (!path)
698 		return -ENOMEM;
699 
700 	key.objectid = start;
701 	key.offset = len;
702 	key.type = BTRFS_EXTENT_ITEM_KEY;
703 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
704 				0, 0);
705 	if (ret > 0) {
706 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
707 		if (key.objectid == start &&
708 		    key.type == BTRFS_METADATA_ITEM_KEY)
709 			ret = 0;
710 	}
711 	btrfs_free_path(path);
712 	return ret;
713 }
714 
715 /*
716  * helper function to lookup reference count and flags of a tree block.
717  *
718  * the head node for delayed ref is used to store the sum of all the
719  * reference count modifications queued up in the rbtree. the head
720  * node may also store the extent flags to set. This way you can check
721  * to see what the reference count and extent flags would be if all of
722  * the delayed refs are not processed.
723  */
724 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
725 			     struct btrfs_root *root, u64 bytenr,
726 			     u64 offset, int metadata, u64 *refs, u64 *flags)
727 {
728 	struct btrfs_delayed_ref_head *head;
729 	struct btrfs_delayed_ref_root *delayed_refs;
730 	struct btrfs_path *path;
731 	struct btrfs_extent_item *ei;
732 	struct extent_buffer *leaf;
733 	struct btrfs_key key;
734 	u32 item_size;
735 	u64 num_refs;
736 	u64 extent_flags;
737 	int ret;
738 
739 	/*
740 	 * If we don't have skinny metadata, don't bother doing anything
741 	 * different
742 	 */
743 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
744 		offset = root->leafsize;
745 		metadata = 0;
746 	}
747 
748 	path = btrfs_alloc_path();
749 	if (!path)
750 		return -ENOMEM;
751 
752 	if (metadata) {
753 		key.objectid = bytenr;
754 		key.type = BTRFS_METADATA_ITEM_KEY;
755 		key.offset = offset;
756 	} else {
757 		key.objectid = bytenr;
758 		key.type = BTRFS_EXTENT_ITEM_KEY;
759 		key.offset = offset;
760 	}
761 
762 	if (!trans) {
763 		path->skip_locking = 1;
764 		path->search_commit_root = 1;
765 	}
766 again:
767 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
768 				&key, path, 0, 0);
769 	if (ret < 0)
770 		goto out_free;
771 
772 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
773 		key.type = BTRFS_EXTENT_ITEM_KEY;
774 		key.offset = root->leafsize;
775 		btrfs_release_path(path);
776 		goto again;
777 	}
778 
779 	if (ret == 0) {
780 		leaf = path->nodes[0];
781 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
782 		if (item_size >= sizeof(*ei)) {
783 			ei = btrfs_item_ptr(leaf, path->slots[0],
784 					    struct btrfs_extent_item);
785 			num_refs = btrfs_extent_refs(leaf, ei);
786 			extent_flags = btrfs_extent_flags(leaf, ei);
787 		} else {
788 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
789 			struct btrfs_extent_item_v0 *ei0;
790 			BUG_ON(item_size != sizeof(*ei0));
791 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
792 					     struct btrfs_extent_item_v0);
793 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
794 			/* FIXME: this isn't correct for data */
795 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
796 #else
797 			BUG();
798 #endif
799 		}
800 		BUG_ON(num_refs == 0);
801 	} else {
802 		num_refs = 0;
803 		extent_flags = 0;
804 		ret = 0;
805 	}
806 
807 	if (!trans)
808 		goto out;
809 
810 	delayed_refs = &trans->transaction->delayed_refs;
811 	spin_lock(&delayed_refs->lock);
812 	head = btrfs_find_delayed_ref_head(trans, bytenr);
813 	if (head) {
814 		if (!mutex_trylock(&head->mutex)) {
815 			atomic_inc(&head->node.refs);
816 			spin_unlock(&delayed_refs->lock);
817 
818 			btrfs_release_path(path);
819 
820 			/*
821 			 * Mutex was contended, block until it's released and try
822 			 * again
823 			 */
824 			mutex_lock(&head->mutex);
825 			mutex_unlock(&head->mutex);
826 			btrfs_put_delayed_ref(&head->node);
827 			goto again;
828 		}
829 		if (head->extent_op && head->extent_op->update_flags)
830 			extent_flags |= head->extent_op->flags_to_set;
831 		else
832 			BUG_ON(num_refs == 0);
833 
834 		num_refs += head->node.ref_mod;
835 		mutex_unlock(&head->mutex);
836 	}
837 	spin_unlock(&delayed_refs->lock);
838 out:
839 	WARN_ON(num_refs == 0);
840 	if (refs)
841 		*refs = num_refs;
842 	if (flags)
843 		*flags = extent_flags;
844 out_free:
845 	btrfs_free_path(path);
846 	return ret;
847 }
848 
849 /*
850  * Back reference rules.  Back refs have three main goals:
851  *
852  * 1) differentiate between all holders of references to an extent so that
853  *    when a reference is dropped we can make sure it was a valid reference
854  *    before freeing the extent.
855  *
856  * 2) Provide enough information to quickly find the holders of an extent
857  *    if we notice a given block is corrupted or bad.
858  *
859  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
860  *    maintenance.  This is actually the same as #2, but with a slightly
861  *    different use case.
862  *
863  * There are two kinds of back refs. The implicit back refs is optimized
864  * for pointers in non-shared tree blocks. For a given pointer in a block,
865  * back refs of this kind provide information about the block's owner tree
866  * and the pointer's key. These information allow us to find the block by
867  * b-tree searching. The full back refs is for pointers in tree blocks not
868  * referenced by their owner trees. The location of tree block is recorded
869  * in the back refs. Actually the full back refs is generic, and can be
870  * used in all cases the implicit back refs is used. The major shortcoming
871  * of the full back refs is its overhead. Every time a tree block gets
872  * COWed, we have to update back refs entry for all pointers in it.
873  *
874  * For a newly allocated tree block, we use implicit back refs for
875  * pointers in it. This means most tree related operations only involve
876  * implicit back refs. For a tree block created in old transaction, the
877  * only way to drop a reference to it is COW it. So we can detect the
878  * event that tree block loses its owner tree's reference and do the
879  * back refs conversion.
880  *
881  * When a tree block is COW'd through a tree, there are four cases:
882  *
883  * The reference count of the block is one and the tree is the block's
884  * owner tree. Nothing to do in this case.
885  *
886  * The reference count of the block is one and the tree is not the
887  * block's owner tree. In this case, full back refs is used for pointers
888  * in the block. Remove these full back refs, add implicit back refs for
889  * every pointers in the new block.
890  *
891  * The reference count of the block is greater than one and the tree is
892  * the block's owner tree. In this case, implicit back refs is used for
893  * pointers in the block. Add full back refs for every pointers in the
894  * block, increase lower level extents' reference counts. The original
895  * implicit back refs are entailed to the new block.
896  *
897  * The reference count of the block is greater than one and the tree is
898  * not the block's owner tree. Add implicit back refs for every pointer in
899  * the new block, increase lower level extents' reference count.
900  *
901  * Back Reference Key composing:
902  *
903  * The key objectid corresponds to the first byte in the extent,
904  * The key type is used to differentiate between types of back refs.
905  * There are different meanings of the key offset for different types
906  * of back refs.
907  *
908  * File extents can be referenced by:
909  *
910  * - multiple snapshots, subvolumes, or different generations in one subvol
911  * - different files inside a single subvolume
912  * - different offsets inside a file (bookend extents in file.c)
913  *
914  * The extent ref structure for the implicit back refs has fields for:
915  *
916  * - Objectid of the subvolume root
917  * - objectid of the file holding the reference
918  * - original offset in the file
919  * - how many bookend extents
920  *
921  * The key offset for the implicit back refs is hash of the first
922  * three fields.
923  *
924  * The extent ref structure for the full back refs has field for:
925  *
926  * - number of pointers in the tree leaf
927  *
928  * The key offset for the implicit back refs is the first byte of
929  * the tree leaf
930  *
931  * When a file extent is allocated, The implicit back refs is used.
932  * the fields are filled in:
933  *
934  *     (root_key.objectid, inode objectid, offset in file, 1)
935  *
936  * When a file extent is removed file truncation, we find the
937  * corresponding implicit back refs and check the following fields:
938  *
939  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
940  *
941  * Btree extents can be referenced by:
942  *
943  * - Different subvolumes
944  *
945  * Both the implicit back refs and the full back refs for tree blocks
946  * only consist of key. The key offset for the implicit back refs is
947  * objectid of block's owner tree. The key offset for the full back refs
948  * is the first byte of parent block.
949  *
950  * When implicit back refs is used, information about the lowest key and
951  * level of the tree block are required. These information are stored in
952  * tree block info structure.
953  */
954 
955 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
956 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
957 				  struct btrfs_root *root,
958 				  struct btrfs_path *path,
959 				  u64 owner, u32 extra_size)
960 {
961 	struct btrfs_extent_item *item;
962 	struct btrfs_extent_item_v0 *ei0;
963 	struct btrfs_extent_ref_v0 *ref0;
964 	struct btrfs_tree_block_info *bi;
965 	struct extent_buffer *leaf;
966 	struct btrfs_key key;
967 	struct btrfs_key found_key;
968 	u32 new_size = sizeof(*item);
969 	u64 refs;
970 	int ret;
971 
972 	leaf = path->nodes[0];
973 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
974 
975 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
976 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
977 			     struct btrfs_extent_item_v0);
978 	refs = btrfs_extent_refs_v0(leaf, ei0);
979 
980 	if (owner == (u64)-1) {
981 		while (1) {
982 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
983 				ret = btrfs_next_leaf(root, path);
984 				if (ret < 0)
985 					return ret;
986 				BUG_ON(ret > 0); /* Corruption */
987 				leaf = path->nodes[0];
988 			}
989 			btrfs_item_key_to_cpu(leaf, &found_key,
990 					      path->slots[0]);
991 			BUG_ON(key.objectid != found_key.objectid);
992 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
993 				path->slots[0]++;
994 				continue;
995 			}
996 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
997 					      struct btrfs_extent_ref_v0);
998 			owner = btrfs_ref_objectid_v0(leaf, ref0);
999 			break;
1000 		}
1001 	}
1002 	btrfs_release_path(path);
1003 
1004 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1005 		new_size += sizeof(*bi);
1006 
1007 	new_size -= sizeof(*ei0);
1008 	ret = btrfs_search_slot(trans, root, &key, path,
1009 				new_size + extra_size, 1);
1010 	if (ret < 0)
1011 		return ret;
1012 	BUG_ON(ret); /* Corruption */
1013 
1014 	btrfs_extend_item(root, path, new_size);
1015 
1016 	leaf = path->nodes[0];
1017 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1018 	btrfs_set_extent_refs(leaf, item, refs);
1019 	/* FIXME: get real generation */
1020 	btrfs_set_extent_generation(leaf, item, 0);
1021 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1022 		btrfs_set_extent_flags(leaf, item,
1023 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1024 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1025 		bi = (struct btrfs_tree_block_info *)(item + 1);
1026 		/* FIXME: get first key of the block */
1027 		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1028 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
1029 	} else {
1030 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1031 	}
1032 	btrfs_mark_buffer_dirty(leaf);
1033 	return 0;
1034 }
1035 #endif
1036 
1037 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1038 {
1039 	u32 high_crc = ~(u32)0;
1040 	u32 low_crc = ~(u32)0;
1041 	__le64 lenum;
1042 
1043 	lenum = cpu_to_le64(root_objectid);
1044 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1045 	lenum = cpu_to_le64(owner);
1046 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1047 	lenum = cpu_to_le64(offset);
1048 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1049 
1050 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1051 }
1052 
1053 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1054 				     struct btrfs_extent_data_ref *ref)
1055 {
1056 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1057 				    btrfs_extent_data_ref_objectid(leaf, ref),
1058 				    btrfs_extent_data_ref_offset(leaf, ref));
1059 }
1060 
1061 static int match_extent_data_ref(struct extent_buffer *leaf,
1062 				 struct btrfs_extent_data_ref *ref,
1063 				 u64 root_objectid, u64 owner, u64 offset)
1064 {
1065 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1066 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1067 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1068 		return 0;
1069 	return 1;
1070 }
1071 
1072 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1073 					   struct btrfs_root *root,
1074 					   struct btrfs_path *path,
1075 					   u64 bytenr, u64 parent,
1076 					   u64 root_objectid,
1077 					   u64 owner, u64 offset)
1078 {
1079 	struct btrfs_key key;
1080 	struct btrfs_extent_data_ref *ref;
1081 	struct extent_buffer *leaf;
1082 	u32 nritems;
1083 	int ret;
1084 	int recow;
1085 	int err = -ENOENT;
1086 
1087 	key.objectid = bytenr;
1088 	if (parent) {
1089 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1090 		key.offset = parent;
1091 	} else {
1092 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1093 		key.offset = hash_extent_data_ref(root_objectid,
1094 						  owner, offset);
1095 	}
1096 again:
1097 	recow = 0;
1098 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1099 	if (ret < 0) {
1100 		err = ret;
1101 		goto fail;
1102 	}
1103 
1104 	if (parent) {
1105 		if (!ret)
1106 			return 0;
1107 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1108 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1109 		btrfs_release_path(path);
1110 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1111 		if (ret < 0) {
1112 			err = ret;
1113 			goto fail;
1114 		}
1115 		if (!ret)
1116 			return 0;
1117 #endif
1118 		goto fail;
1119 	}
1120 
1121 	leaf = path->nodes[0];
1122 	nritems = btrfs_header_nritems(leaf);
1123 	while (1) {
1124 		if (path->slots[0] >= nritems) {
1125 			ret = btrfs_next_leaf(root, path);
1126 			if (ret < 0)
1127 				err = ret;
1128 			if (ret)
1129 				goto fail;
1130 
1131 			leaf = path->nodes[0];
1132 			nritems = btrfs_header_nritems(leaf);
1133 			recow = 1;
1134 		}
1135 
1136 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1137 		if (key.objectid != bytenr ||
1138 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1139 			goto fail;
1140 
1141 		ref = btrfs_item_ptr(leaf, path->slots[0],
1142 				     struct btrfs_extent_data_ref);
1143 
1144 		if (match_extent_data_ref(leaf, ref, root_objectid,
1145 					  owner, offset)) {
1146 			if (recow) {
1147 				btrfs_release_path(path);
1148 				goto again;
1149 			}
1150 			err = 0;
1151 			break;
1152 		}
1153 		path->slots[0]++;
1154 	}
1155 fail:
1156 	return err;
1157 }
1158 
1159 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1160 					   struct btrfs_root *root,
1161 					   struct btrfs_path *path,
1162 					   u64 bytenr, u64 parent,
1163 					   u64 root_objectid, u64 owner,
1164 					   u64 offset, int refs_to_add)
1165 {
1166 	struct btrfs_key key;
1167 	struct extent_buffer *leaf;
1168 	u32 size;
1169 	u32 num_refs;
1170 	int ret;
1171 
1172 	key.objectid = bytenr;
1173 	if (parent) {
1174 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1175 		key.offset = parent;
1176 		size = sizeof(struct btrfs_shared_data_ref);
1177 	} else {
1178 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1179 		key.offset = hash_extent_data_ref(root_objectid,
1180 						  owner, offset);
1181 		size = sizeof(struct btrfs_extent_data_ref);
1182 	}
1183 
1184 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1185 	if (ret && ret != -EEXIST)
1186 		goto fail;
1187 
1188 	leaf = path->nodes[0];
1189 	if (parent) {
1190 		struct btrfs_shared_data_ref *ref;
1191 		ref = btrfs_item_ptr(leaf, path->slots[0],
1192 				     struct btrfs_shared_data_ref);
1193 		if (ret == 0) {
1194 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1195 		} else {
1196 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1197 			num_refs += refs_to_add;
1198 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1199 		}
1200 	} else {
1201 		struct btrfs_extent_data_ref *ref;
1202 		while (ret == -EEXIST) {
1203 			ref = btrfs_item_ptr(leaf, path->slots[0],
1204 					     struct btrfs_extent_data_ref);
1205 			if (match_extent_data_ref(leaf, ref, root_objectid,
1206 						  owner, offset))
1207 				break;
1208 			btrfs_release_path(path);
1209 			key.offset++;
1210 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1211 						      size);
1212 			if (ret && ret != -EEXIST)
1213 				goto fail;
1214 
1215 			leaf = path->nodes[0];
1216 		}
1217 		ref = btrfs_item_ptr(leaf, path->slots[0],
1218 				     struct btrfs_extent_data_ref);
1219 		if (ret == 0) {
1220 			btrfs_set_extent_data_ref_root(leaf, ref,
1221 						       root_objectid);
1222 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1223 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1224 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1225 		} else {
1226 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1227 			num_refs += refs_to_add;
1228 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1229 		}
1230 	}
1231 	btrfs_mark_buffer_dirty(leaf);
1232 	ret = 0;
1233 fail:
1234 	btrfs_release_path(path);
1235 	return ret;
1236 }
1237 
1238 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1239 					   struct btrfs_root *root,
1240 					   struct btrfs_path *path,
1241 					   int refs_to_drop)
1242 {
1243 	struct btrfs_key key;
1244 	struct btrfs_extent_data_ref *ref1 = NULL;
1245 	struct btrfs_shared_data_ref *ref2 = NULL;
1246 	struct extent_buffer *leaf;
1247 	u32 num_refs = 0;
1248 	int ret = 0;
1249 
1250 	leaf = path->nodes[0];
1251 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1252 
1253 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1254 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1255 				      struct btrfs_extent_data_ref);
1256 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1257 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1258 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1259 				      struct btrfs_shared_data_ref);
1260 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1261 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1262 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1263 		struct btrfs_extent_ref_v0 *ref0;
1264 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1265 				      struct btrfs_extent_ref_v0);
1266 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1267 #endif
1268 	} else {
1269 		BUG();
1270 	}
1271 
1272 	BUG_ON(num_refs < refs_to_drop);
1273 	num_refs -= refs_to_drop;
1274 
1275 	if (num_refs == 0) {
1276 		ret = btrfs_del_item(trans, root, path);
1277 	} else {
1278 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1279 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1280 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1281 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1282 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1283 		else {
1284 			struct btrfs_extent_ref_v0 *ref0;
1285 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1286 					struct btrfs_extent_ref_v0);
1287 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1288 		}
1289 #endif
1290 		btrfs_mark_buffer_dirty(leaf);
1291 	}
1292 	return ret;
1293 }
1294 
1295 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1296 					  struct btrfs_path *path,
1297 					  struct btrfs_extent_inline_ref *iref)
1298 {
1299 	struct btrfs_key key;
1300 	struct extent_buffer *leaf;
1301 	struct btrfs_extent_data_ref *ref1;
1302 	struct btrfs_shared_data_ref *ref2;
1303 	u32 num_refs = 0;
1304 
1305 	leaf = path->nodes[0];
1306 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1307 	if (iref) {
1308 		if (btrfs_extent_inline_ref_type(leaf, iref) ==
1309 		    BTRFS_EXTENT_DATA_REF_KEY) {
1310 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1311 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1312 		} else {
1313 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1314 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1315 		}
1316 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1317 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1318 				      struct btrfs_extent_data_ref);
1319 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1320 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1321 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1322 				      struct btrfs_shared_data_ref);
1323 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1324 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1325 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1326 		struct btrfs_extent_ref_v0 *ref0;
1327 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1328 				      struct btrfs_extent_ref_v0);
1329 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1330 #endif
1331 	} else {
1332 		WARN_ON(1);
1333 	}
1334 	return num_refs;
1335 }
1336 
1337 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1338 					  struct btrfs_root *root,
1339 					  struct btrfs_path *path,
1340 					  u64 bytenr, u64 parent,
1341 					  u64 root_objectid)
1342 {
1343 	struct btrfs_key key;
1344 	int ret;
1345 
1346 	key.objectid = bytenr;
1347 	if (parent) {
1348 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1349 		key.offset = parent;
1350 	} else {
1351 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1352 		key.offset = root_objectid;
1353 	}
1354 
1355 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1356 	if (ret > 0)
1357 		ret = -ENOENT;
1358 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1359 	if (ret == -ENOENT && parent) {
1360 		btrfs_release_path(path);
1361 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1362 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1363 		if (ret > 0)
1364 			ret = -ENOENT;
1365 	}
1366 #endif
1367 	return ret;
1368 }
1369 
1370 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1371 					  struct btrfs_root *root,
1372 					  struct btrfs_path *path,
1373 					  u64 bytenr, u64 parent,
1374 					  u64 root_objectid)
1375 {
1376 	struct btrfs_key key;
1377 	int ret;
1378 
1379 	key.objectid = bytenr;
1380 	if (parent) {
1381 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1382 		key.offset = parent;
1383 	} else {
1384 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1385 		key.offset = root_objectid;
1386 	}
1387 
1388 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1389 	btrfs_release_path(path);
1390 	return ret;
1391 }
1392 
1393 static inline int extent_ref_type(u64 parent, u64 owner)
1394 {
1395 	int type;
1396 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1397 		if (parent > 0)
1398 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1399 		else
1400 			type = BTRFS_TREE_BLOCK_REF_KEY;
1401 	} else {
1402 		if (parent > 0)
1403 			type = BTRFS_SHARED_DATA_REF_KEY;
1404 		else
1405 			type = BTRFS_EXTENT_DATA_REF_KEY;
1406 	}
1407 	return type;
1408 }
1409 
1410 static int find_next_key(struct btrfs_path *path, int level,
1411 			 struct btrfs_key *key)
1412 
1413 {
1414 	for (; level < BTRFS_MAX_LEVEL; level++) {
1415 		if (!path->nodes[level])
1416 			break;
1417 		if (path->slots[level] + 1 >=
1418 		    btrfs_header_nritems(path->nodes[level]))
1419 			continue;
1420 		if (level == 0)
1421 			btrfs_item_key_to_cpu(path->nodes[level], key,
1422 					      path->slots[level] + 1);
1423 		else
1424 			btrfs_node_key_to_cpu(path->nodes[level], key,
1425 					      path->slots[level] + 1);
1426 		return 0;
1427 	}
1428 	return 1;
1429 }
1430 
1431 /*
1432  * look for inline back ref. if back ref is found, *ref_ret is set
1433  * to the address of inline back ref, and 0 is returned.
1434  *
1435  * if back ref isn't found, *ref_ret is set to the address where it
1436  * should be inserted, and -ENOENT is returned.
1437  *
1438  * if insert is true and there are too many inline back refs, the path
1439  * points to the extent item, and -EAGAIN is returned.
1440  *
1441  * NOTE: inline back refs are ordered in the same way that back ref
1442  *	 items in the tree are ordered.
1443  */
1444 static noinline_for_stack
1445 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1446 				 struct btrfs_root *root,
1447 				 struct btrfs_path *path,
1448 				 struct btrfs_extent_inline_ref **ref_ret,
1449 				 u64 bytenr, u64 num_bytes,
1450 				 u64 parent, u64 root_objectid,
1451 				 u64 owner, u64 offset, int insert)
1452 {
1453 	struct btrfs_key key;
1454 	struct extent_buffer *leaf;
1455 	struct btrfs_extent_item *ei;
1456 	struct btrfs_extent_inline_ref *iref;
1457 	u64 flags;
1458 	u64 item_size;
1459 	unsigned long ptr;
1460 	unsigned long end;
1461 	int extra_size;
1462 	int type;
1463 	int want;
1464 	int ret;
1465 	int err = 0;
1466 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1467 						 SKINNY_METADATA);
1468 
1469 	key.objectid = bytenr;
1470 	key.type = BTRFS_EXTENT_ITEM_KEY;
1471 	key.offset = num_bytes;
1472 
1473 	want = extent_ref_type(parent, owner);
1474 	if (insert) {
1475 		extra_size = btrfs_extent_inline_ref_size(want);
1476 		path->keep_locks = 1;
1477 	} else
1478 		extra_size = -1;
1479 
1480 	/*
1481 	 * Owner is our parent level, so we can just add one to get the level
1482 	 * for the block we are interested in.
1483 	 */
1484 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1485 		key.type = BTRFS_METADATA_ITEM_KEY;
1486 		key.offset = owner;
1487 	}
1488 
1489 again:
1490 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1491 	if (ret < 0) {
1492 		err = ret;
1493 		goto out;
1494 	}
1495 
1496 	/*
1497 	 * We may be a newly converted file system which still has the old fat
1498 	 * extent entries for metadata, so try and see if we have one of those.
1499 	 */
1500 	if (ret > 0 && skinny_metadata) {
1501 		skinny_metadata = false;
1502 		if (path->slots[0]) {
1503 			path->slots[0]--;
1504 			btrfs_item_key_to_cpu(path->nodes[0], &key,
1505 					      path->slots[0]);
1506 			if (key.objectid == bytenr &&
1507 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
1508 			    key.offset == num_bytes)
1509 				ret = 0;
1510 		}
1511 		if (ret) {
1512 			key.type = BTRFS_EXTENT_ITEM_KEY;
1513 			key.offset = num_bytes;
1514 			btrfs_release_path(path);
1515 			goto again;
1516 		}
1517 	}
1518 
1519 	if (ret && !insert) {
1520 		err = -ENOENT;
1521 		goto out;
1522 	} else if (ret) {
1523 		err = -EIO;
1524 		WARN_ON(1);
1525 		goto out;
1526 	}
1527 
1528 	leaf = path->nodes[0];
1529 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1530 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1531 	if (item_size < sizeof(*ei)) {
1532 		if (!insert) {
1533 			err = -ENOENT;
1534 			goto out;
1535 		}
1536 		ret = convert_extent_item_v0(trans, root, path, owner,
1537 					     extra_size);
1538 		if (ret < 0) {
1539 			err = ret;
1540 			goto out;
1541 		}
1542 		leaf = path->nodes[0];
1543 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1544 	}
1545 #endif
1546 	BUG_ON(item_size < sizeof(*ei));
1547 
1548 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1549 	flags = btrfs_extent_flags(leaf, ei);
1550 
1551 	ptr = (unsigned long)(ei + 1);
1552 	end = (unsigned long)ei + item_size;
1553 
1554 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1555 		ptr += sizeof(struct btrfs_tree_block_info);
1556 		BUG_ON(ptr > end);
1557 	}
1558 
1559 	err = -ENOENT;
1560 	while (1) {
1561 		if (ptr >= end) {
1562 			WARN_ON(ptr > end);
1563 			break;
1564 		}
1565 		iref = (struct btrfs_extent_inline_ref *)ptr;
1566 		type = btrfs_extent_inline_ref_type(leaf, iref);
1567 		if (want < type)
1568 			break;
1569 		if (want > type) {
1570 			ptr += btrfs_extent_inline_ref_size(type);
1571 			continue;
1572 		}
1573 
1574 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1575 			struct btrfs_extent_data_ref *dref;
1576 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1577 			if (match_extent_data_ref(leaf, dref, root_objectid,
1578 						  owner, offset)) {
1579 				err = 0;
1580 				break;
1581 			}
1582 			if (hash_extent_data_ref_item(leaf, dref) <
1583 			    hash_extent_data_ref(root_objectid, owner, offset))
1584 				break;
1585 		} else {
1586 			u64 ref_offset;
1587 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1588 			if (parent > 0) {
1589 				if (parent == ref_offset) {
1590 					err = 0;
1591 					break;
1592 				}
1593 				if (ref_offset < parent)
1594 					break;
1595 			} else {
1596 				if (root_objectid == ref_offset) {
1597 					err = 0;
1598 					break;
1599 				}
1600 				if (ref_offset < root_objectid)
1601 					break;
1602 			}
1603 		}
1604 		ptr += btrfs_extent_inline_ref_size(type);
1605 	}
1606 	if (err == -ENOENT && insert) {
1607 		if (item_size + extra_size >=
1608 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1609 			err = -EAGAIN;
1610 			goto out;
1611 		}
1612 		/*
1613 		 * To add new inline back ref, we have to make sure
1614 		 * there is no corresponding back ref item.
1615 		 * For simplicity, we just do not add new inline back
1616 		 * ref if there is any kind of item for this block
1617 		 */
1618 		if (find_next_key(path, 0, &key) == 0 &&
1619 		    key.objectid == bytenr &&
1620 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1621 			err = -EAGAIN;
1622 			goto out;
1623 		}
1624 	}
1625 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1626 out:
1627 	if (insert) {
1628 		path->keep_locks = 0;
1629 		btrfs_unlock_up_safe(path, 1);
1630 	}
1631 	return err;
1632 }
1633 
1634 /*
1635  * helper to add new inline back ref
1636  */
1637 static noinline_for_stack
1638 void setup_inline_extent_backref(struct btrfs_root *root,
1639 				 struct btrfs_path *path,
1640 				 struct btrfs_extent_inline_ref *iref,
1641 				 u64 parent, u64 root_objectid,
1642 				 u64 owner, u64 offset, int refs_to_add,
1643 				 struct btrfs_delayed_extent_op *extent_op)
1644 {
1645 	struct extent_buffer *leaf;
1646 	struct btrfs_extent_item *ei;
1647 	unsigned long ptr;
1648 	unsigned long end;
1649 	unsigned long item_offset;
1650 	u64 refs;
1651 	int size;
1652 	int type;
1653 
1654 	leaf = path->nodes[0];
1655 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1656 	item_offset = (unsigned long)iref - (unsigned long)ei;
1657 
1658 	type = extent_ref_type(parent, owner);
1659 	size = btrfs_extent_inline_ref_size(type);
1660 
1661 	btrfs_extend_item(root, path, size);
1662 
1663 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1664 	refs = btrfs_extent_refs(leaf, ei);
1665 	refs += refs_to_add;
1666 	btrfs_set_extent_refs(leaf, ei, refs);
1667 	if (extent_op)
1668 		__run_delayed_extent_op(extent_op, leaf, ei);
1669 
1670 	ptr = (unsigned long)ei + item_offset;
1671 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1672 	if (ptr < end - size)
1673 		memmove_extent_buffer(leaf, ptr + size, ptr,
1674 				      end - size - ptr);
1675 
1676 	iref = (struct btrfs_extent_inline_ref *)ptr;
1677 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1678 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1679 		struct btrfs_extent_data_ref *dref;
1680 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1681 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1682 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1683 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1684 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1685 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1686 		struct btrfs_shared_data_ref *sref;
1687 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1688 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1689 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1690 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1691 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1692 	} else {
1693 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1694 	}
1695 	btrfs_mark_buffer_dirty(leaf);
1696 }
1697 
1698 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1699 				 struct btrfs_root *root,
1700 				 struct btrfs_path *path,
1701 				 struct btrfs_extent_inline_ref **ref_ret,
1702 				 u64 bytenr, u64 num_bytes, u64 parent,
1703 				 u64 root_objectid, u64 owner, u64 offset)
1704 {
1705 	int ret;
1706 
1707 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1708 					   bytenr, num_bytes, parent,
1709 					   root_objectid, owner, offset, 0);
1710 	if (ret != -ENOENT)
1711 		return ret;
1712 
1713 	btrfs_release_path(path);
1714 	*ref_ret = NULL;
1715 
1716 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1717 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1718 					    root_objectid);
1719 	} else {
1720 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1721 					     root_objectid, owner, offset);
1722 	}
1723 	return ret;
1724 }
1725 
1726 /*
1727  * helper to update/remove inline back ref
1728  */
1729 static noinline_for_stack
1730 void update_inline_extent_backref(struct btrfs_root *root,
1731 				  struct btrfs_path *path,
1732 				  struct btrfs_extent_inline_ref *iref,
1733 				  int refs_to_mod,
1734 				  struct btrfs_delayed_extent_op *extent_op)
1735 {
1736 	struct extent_buffer *leaf;
1737 	struct btrfs_extent_item *ei;
1738 	struct btrfs_extent_data_ref *dref = NULL;
1739 	struct btrfs_shared_data_ref *sref = NULL;
1740 	unsigned long ptr;
1741 	unsigned long end;
1742 	u32 item_size;
1743 	int size;
1744 	int type;
1745 	u64 refs;
1746 
1747 	leaf = path->nodes[0];
1748 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1749 	refs = btrfs_extent_refs(leaf, ei);
1750 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1751 	refs += refs_to_mod;
1752 	btrfs_set_extent_refs(leaf, ei, refs);
1753 	if (extent_op)
1754 		__run_delayed_extent_op(extent_op, leaf, ei);
1755 
1756 	type = btrfs_extent_inline_ref_type(leaf, iref);
1757 
1758 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1759 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1760 		refs = btrfs_extent_data_ref_count(leaf, dref);
1761 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1762 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1763 		refs = btrfs_shared_data_ref_count(leaf, sref);
1764 	} else {
1765 		refs = 1;
1766 		BUG_ON(refs_to_mod != -1);
1767 	}
1768 
1769 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1770 	refs += refs_to_mod;
1771 
1772 	if (refs > 0) {
1773 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1774 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1775 		else
1776 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1777 	} else {
1778 		size =  btrfs_extent_inline_ref_size(type);
1779 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1780 		ptr = (unsigned long)iref;
1781 		end = (unsigned long)ei + item_size;
1782 		if (ptr + size < end)
1783 			memmove_extent_buffer(leaf, ptr, ptr + size,
1784 					      end - ptr - size);
1785 		item_size -= size;
1786 		btrfs_truncate_item(root, path, item_size, 1);
1787 	}
1788 	btrfs_mark_buffer_dirty(leaf);
1789 }
1790 
1791 static noinline_for_stack
1792 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1793 				 struct btrfs_root *root,
1794 				 struct btrfs_path *path,
1795 				 u64 bytenr, u64 num_bytes, u64 parent,
1796 				 u64 root_objectid, u64 owner,
1797 				 u64 offset, int refs_to_add,
1798 				 struct btrfs_delayed_extent_op *extent_op)
1799 {
1800 	struct btrfs_extent_inline_ref *iref;
1801 	int ret;
1802 
1803 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1804 					   bytenr, num_bytes, parent,
1805 					   root_objectid, owner, offset, 1);
1806 	if (ret == 0) {
1807 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1808 		update_inline_extent_backref(root, path, iref,
1809 					     refs_to_add, extent_op);
1810 	} else if (ret == -ENOENT) {
1811 		setup_inline_extent_backref(root, path, iref, parent,
1812 					    root_objectid, owner, offset,
1813 					    refs_to_add, extent_op);
1814 		ret = 0;
1815 	}
1816 	return ret;
1817 }
1818 
1819 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1820 				 struct btrfs_root *root,
1821 				 struct btrfs_path *path,
1822 				 u64 bytenr, u64 parent, u64 root_objectid,
1823 				 u64 owner, u64 offset, int refs_to_add)
1824 {
1825 	int ret;
1826 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1827 		BUG_ON(refs_to_add != 1);
1828 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1829 					    parent, root_objectid);
1830 	} else {
1831 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1832 					     parent, root_objectid,
1833 					     owner, offset, refs_to_add);
1834 	}
1835 	return ret;
1836 }
1837 
1838 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1839 				 struct btrfs_root *root,
1840 				 struct btrfs_path *path,
1841 				 struct btrfs_extent_inline_ref *iref,
1842 				 int refs_to_drop, int is_data)
1843 {
1844 	int ret = 0;
1845 
1846 	BUG_ON(!is_data && refs_to_drop != 1);
1847 	if (iref) {
1848 		update_inline_extent_backref(root, path, iref,
1849 					     -refs_to_drop, NULL);
1850 	} else if (is_data) {
1851 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1852 	} else {
1853 		ret = btrfs_del_item(trans, root, path);
1854 	}
1855 	return ret;
1856 }
1857 
1858 static int btrfs_issue_discard(struct block_device *bdev,
1859 				u64 start, u64 len)
1860 {
1861 	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1862 }
1863 
1864 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1865 				u64 num_bytes, u64 *actual_bytes)
1866 {
1867 	int ret;
1868 	u64 discarded_bytes = 0;
1869 	struct btrfs_bio *bbio = NULL;
1870 
1871 
1872 	/* Tell the block device(s) that the sectors can be discarded */
1873 	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1874 			      bytenr, &num_bytes, &bbio, 0);
1875 	/* Error condition is -ENOMEM */
1876 	if (!ret) {
1877 		struct btrfs_bio_stripe *stripe = bbio->stripes;
1878 		int i;
1879 
1880 
1881 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1882 			if (!stripe->dev->can_discard)
1883 				continue;
1884 
1885 			ret = btrfs_issue_discard(stripe->dev->bdev,
1886 						  stripe->physical,
1887 						  stripe->length);
1888 			if (!ret)
1889 				discarded_bytes += stripe->length;
1890 			else if (ret != -EOPNOTSUPP)
1891 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1892 
1893 			/*
1894 			 * Just in case we get back EOPNOTSUPP for some reason,
1895 			 * just ignore the return value so we don't screw up
1896 			 * people calling discard_extent.
1897 			 */
1898 			ret = 0;
1899 		}
1900 		kfree(bbio);
1901 	}
1902 
1903 	if (actual_bytes)
1904 		*actual_bytes = discarded_bytes;
1905 
1906 
1907 	if (ret == -EOPNOTSUPP)
1908 		ret = 0;
1909 	return ret;
1910 }
1911 
1912 /* Can return -ENOMEM */
1913 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1914 			 struct btrfs_root *root,
1915 			 u64 bytenr, u64 num_bytes, u64 parent,
1916 			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1917 {
1918 	int ret;
1919 	struct btrfs_fs_info *fs_info = root->fs_info;
1920 
1921 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1922 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
1923 
1924 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1925 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1926 					num_bytes,
1927 					parent, root_objectid, (int)owner,
1928 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1929 	} else {
1930 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1931 					num_bytes,
1932 					parent, root_objectid, owner, offset,
1933 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1934 	}
1935 	return ret;
1936 }
1937 
1938 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1939 				  struct btrfs_root *root,
1940 				  u64 bytenr, u64 num_bytes,
1941 				  u64 parent, u64 root_objectid,
1942 				  u64 owner, u64 offset, int refs_to_add,
1943 				  struct btrfs_delayed_extent_op *extent_op)
1944 {
1945 	struct btrfs_path *path;
1946 	struct extent_buffer *leaf;
1947 	struct btrfs_extent_item *item;
1948 	u64 refs;
1949 	int ret;
1950 	int err = 0;
1951 
1952 	path = btrfs_alloc_path();
1953 	if (!path)
1954 		return -ENOMEM;
1955 
1956 	path->reada = 1;
1957 	path->leave_spinning = 1;
1958 	/* this will setup the path even if it fails to insert the back ref */
1959 	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1960 					   path, bytenr, num_bytes, parent,
1961 					   root_objectid, owner, offset,
1962 					   refs_to_add, extent_op);
1963 	if (ret == 0)
1964 		goto out;
1965 
1966 	if (ret != -EAGAIN) {
1967 		err = ret;
1968 		goto out;
1969 	}
1970 
1971 	leaf = path->nodes[0];
1972 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1973 	refs = btrfs_extent_refs(leaf, item);
1974 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1975 	if (extent_op)
1976 		__run_delayed_extent_op(extent_op, leaf, item);
1977 
1978 	btrfs_mark_buffer_dirty(leaf);
1979 	btrfs_release_path(path);
1980 
1981 	path->reada = 1;
1982 	path->leave_spinning = 1;
1983 
1984 	/* now insert the actual backref */
1985 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1986 				    path, bytenr, parent, root_objectid,
1987 				    owner, offset, refs_to_add);
1988 	if (ret)
1989 		btrfs_abort_transaction(trans, root, ret);
1990 out:
1991 	btrfs_free_path(path);
1992 	return err;
1993 }
1994 
1995 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1996 				struct btrfs_root *root,
1997 				struct btrfs_delayed_ref_node *node,
1998 				struct btrfs_delayed_extent_op *extent_op,
1999 				int insert_reserved)
2000 {
2001 	int ret = 0;
2002 	struct btrfs_delayed_data_ref *ref;
2003 	struct btrfs_key ins;
2004 	u64 parent = 0;
2005 	u64 ref_root = 0;
2006 	u64 flags = 0;
2007 
2008 	ins.objectid = node->bytenr;
2009 	ins.offset = node->num_bytes;
2010 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2011 
2012 	ref = btrfs_delayed_node_to_data_ref(node);
2013 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2014 		parent = ref->parent;
2015 	else
2016 		ref_root = ref->root;
2017 
2018 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2019 		if (extent_op)
2020 			flags |= extent_op->flags_to_set;
2021 		ret = alloc_reserved_file_extent(trans, root,
2022 						 parent, ref_root, flags,
2023 						 ref->objectid, ref->offset,
2024 						 &ins, node->ref_mod);
2025 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2026 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2027 					     node->num_bytes, parent,
2028 					     ref_root, ref->objectid,
2029 					     ref->offset, node->ref_mod,
2030 					     extent_op);
2031 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2032 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2033 					  node->num_bytes, parent,
2034 					  ref_root, ref->objectid,
2035 					  ref->offset, node->ref_mod,
2036 					  extent_op);
2037 	} else {
2038 		BUG();
2039 	}
2040 	return ret;
2041 }
2042 
2043 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2044 				    struct extent_buffer *leaf,
2045 				    struct btrfs_extent_item *ei)
2046 {
2047 	u64 flags = btrfs_extent_flags(leaf, ei);
2048 	if (extent_op->update_flags) {
2049 		flags |= extent_op->flags_to_set;
2050 		btrfs_set_extent_flags(leaf, ei, flags);
2051 	}
2052 
2053 	if (extent_op->update_key) {
2054 		struct btrfs_tree_block_info *bi;
2055 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2056 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2057 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2058 	}
2059 }
2060 
2061 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2062 				 struct btrfs_root *root,
2063 				 struct btrfs_delayed_ref_node *node,
2064 				 struct btrfs_delayed_extent_op *extent_op)
2065 {
2066 	struct btrfs_key key;
2067 	struct btrfs_path *path;
2068 	struct btrfs_extent_item *ei;
2069 	struct extent_buffer *leaf;
2070 	u32 item_size;
2071 	int ret;
2072 	int err = 0;
2073 	int metadata = !extent_op->is_data;
2074 
2075 	if (trans->aborted)
2076 		return 0;
2077 
2078 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2079 		metadata = 0;
2080 
2081 	path = btrfs_alloc_path();
2082 	if (!path)
2083 		return -ENOMEM;
2084 
2085 	key.objectid = node->bytenr;
2086 
2087 	if (metadata) {
2088 		key.type = BTRFS_METADATA_ITEM_KEY;
2089 		key.offset = extent_op->level;
2090 	} else {
2091 		key.type = BTRFS_EXTENT_ITEM_KEY;
2092 		key.offset = node->num_bytes;
2093 	}
2094 
2095 again:
2096 	path->reada = 1;
2097 	path->leave_spinning = 1;
2098 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2099 				path, 0, 1);
2100 	if (ret < 0) {
2101 		err = ret;
2102 		goto out;
2103 	}
2104 	if (ret > 0) {
2105 		if (metadata) {
2106 			btrfs_release_path(path);
2107 			metadata = 0;
2108 
2109 			key.offset = node->num_bytes;
2110 			key.type = BTRFS_EXTENT_ITEM_KEY;
2111 			goto again;
2112 		}
2113 		err = -EIO;
2114 		goto out;
2115 	}
2116 
2117 	leaf = path->nodes[0];
2118 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2119 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2120 	if (item_size < sizeof(*ei)) {
2121 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2122 					     path, (u64)-1, 0);
2123 		if (ret < 0) {
2124 			err = ret;
2125 			goto out;
2126 		}
2127 		leaf = path->nodes[0];
2128 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2129 	}
2130 #endif
2131 	BUG_ON(item_size < sizeof(*ei));
2132 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2133 	__run_delayed_extent_op(extent_op, leaf, ei);
2134 
2135 	btrfs_mark_buffer_dirty(leaf);
2136 out:
2137 	btrfs_free_path(path);
2138 	return err;
2139 }
2140 
2141 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2142 				struct btrfs_root *root,
2143 				struct btrfs_delayed_ref_node *node,
2144 				struct btrfs_delayed_extent_op *extent_op,
2145 				int insert_reserved)
2146 {
2147 	int ret = 0;
2148 	struct btrfs_delayed_tree_ref *ref;
2149 	struct btrfs_key ins;
2150 	u64 parent = 0;
2151 	u64 ref_root = 0;
2152 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2153 						 SKINNY_METADATA);
2154 
2155 	ref = btrfs_delayed_node_to_tree_ref(node);
2156 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2157 		parent = ref->parent;
2158 	else
2159 		ref_root = ref->root;
2160 
2161 	ins.objectid = node->bytenr;
2162 	if (skinny_metadata) {
2163 		ins.offset = ref->level;
2164 		ins.type = BTRFS_METADATA_ITEM_KEY;
2165 	} else {
2166 		ins.offset = node->num_bytes;
2167 		ins.type = BTRFS_EXTENT_ITEM_KEY;
2168 	}
2169 
2170 	BUG_ON(node->ref_mod != 1);
2171 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2172 		BUG_ON(!extent_op || !extent_op->update_flags);
2173 		ret = alloc_reserved_tree_block(trans, root,
2174 						parent, ref_root,
2175 						extent_op->flags_to_set,
2176 						&extent_op->key,
2177 						ref->level, &ins);
2178 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2179 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2180 					     node->num_bytes, parent, ref_root,
2181 					     ref->level, 0, 1, extent_op);
2182 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2183 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2184 					  node->num_bytes, parent, ref_root,
2185 					  ref->level, 0, 1, extent_op);
2186 	} else {
2187 		BUG();
2188 	}
2189 	return ret;
2190 }
2191 
2192 /* helper function to actually process a single delayed ref entry */
2193 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2194 			       struct btrfs_root *root,
2195 			       struct btrfs_delayed_ref_node *node,
2196 			       struct btrfs_delayed_extent_op *extent_op,
2197 			       int insert_reserved)
2198 {
2199 	int ret = 0;
2200 
2201 	if (trans->aborted)
2202 		return 0;
2203 
2204 	if (btrfs_delayed_ref_is_head(node)) {
2205 		struct btrfs_delayed_ref_head *head;
2206 		/*
2207 		 * we've hit the end of the chain and we were supposed
2208 		 * to insert this extent into the tree.  But, it got
2209 		 * deleted before we ever needed to insert it, so all
2210 		 * we have to do is clean up the accounting
2211 		 */
2212 		BUG_ON(extent_op);
2213 		head = btrfs_delayed_node_to_head(node);
2214 		if (insert_reserved) {
2215 			btrfs_pin_extent(root, node->bytenr,
2216 					 node->num_bytes, 1);
2217 			if (head->is_data) {
2218 				ret = btrfs_del_csums(trans, root,
2219 						      node->bytenr,
2220 						      node->num_bytes);
2221 			}
2222 		}
2223 		return ret;
2224 	}
2225 
2226 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2227 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2228 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2229 					   insert_reserved);
2230 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2231 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2232 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2233 					   insert_reserved);
2234 	else
2235 		BUG();
2236 	return ret;
2237 }
2238 
2239 static noinline struct btrfs_delayed_ref_node *
2240 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2241 {
2242 	struct rb_node *node;
2243 	struct btrfs_delayed_ref_node *ref;
2244 	int action = BTRFS_ADD_DELAYED_REF;
2245 again:
2246 	/*
2247 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2248 	 * this prevents ref count from going down to zero when
2249 	 * there still are pending delayed ref.
2250 	 */
2251 	node = rb_prev(&head->node.rb_node);
2252 	while (1) {
2253 		if (!node)
2254 			break;
2255 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
2256 				rb_node);
2257 		if (ref->bytenr != head->node.bytenr)
2258 			break;
2259 		if (ref->action == action)
2260 			return ref;
2261 		node = rb_prev(node);
2262 	}
2263 	if (action == BTRFS_ADD_DELAYED_REF) {
2264 		action = BTRFS_DROP_DELAYED_REF;
2265 		goto again;
2266 	}
2267 	return NULL;
2268 }
2269 
2270 /*
2271  * Returns 0 on success or if called with an already aborted transaction.
2272  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2273  */
2274 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2275 				       struct btrfs_root *root,
2276 				       struct list_head *cluster)
2277 {
2278 	struct btrfs_delayed_ref_root *delayed_refs;
2279 	struct btrfs_delayed_ref_node *ref;
2280 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2281 	struct btrfs_delayed_extent_op *extent_op;
2282 	struct btrfs_fs_info *fs_info = root->fs_info;
2283 	int ret;
2284 	int count = 0;
2285 	int must_insert_reserved = 0;
2286 
2287 	delayed_refs = &trans->transaction->delayed_refs;
2288 	while (1) {
2289 		if (!locked_ref) {
2290 			/* pick a new head ref from the cluster list */
2291 			if (list_empty(cluster))
2292 				break;
2293 
2294 			locked_ref = list_entry(cluster->next,
2295 				     struct btrfs_delayed_ref_head, cluster);
2296 
2297 			/* grab the lock that says we are going to process
2298 			 * all the refs for this head */
2299 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2300 
2301 			/*
2302 			 * we may have dropped the spin lock to get the head
2303 			 * mutex lock, and that might have given someone else
2304 			 * time to free the head.  If that's true, it has been
2305 			 * removed from our list and we can move on.
2306 			 */
2307 			if (ret == -EAGAIN) {
2308 				locked_ref = NULL;
2309 				count++;
2310 				continue;
2311 			}
2312 		}
2313 
2314 		/*
2315 		 * We need to try and merge add/drops of the same ref since we
2316 		 * can run into issues with relocate dropping the implicit ref
2317 		 * and then it being added back again before the drop can
2318 		 * finish.  If we merged anything we need to re-loop so we can
2319 		 * get a good ref.
2320 		 */
2321 		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2322 					 locked_ref);
2323 
2324 		/*
2325 		 * locked_ref is the head node, so we have to go one
2326 		 * node back for any delayed ref updates
2327 		 */
2328 		ref = select_delayed_ref(locked_ref);
2329 
2330 		if (ref && ref->seq &&
2331 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2332 			/*
2333 			 * there are still refs with lower seq numbers in the
2334 			 * process of being added. Don't run this ref yet.
2335 			 */
2336 			list_del_init(&locked_ref->cluster);
2337 			btrfs_delayed_ref_unlock(locked_ref);
2338 			locked_ref = NULL;
2339 			delayed_refs->num_heads_ready++;
2340 			spin_unlock(&delayed_refs->lock);
2341 			cond_resched();
2342 			spin_lock(&delayed_refs->lock);
2343 			continue;
2344 		}
2345 
2346 		/*
2347 		 * record the must insert reserved flag before we
2348 		 * drop the spin lock.
2349 		 */
2350 		must_insert_reserved = locked_ref->must_insert_reserved;
2351 		locked_ref->must_insert_reserved = 0;
2352 
2353 		extent_op = locked_ref->extent_op;
2354 		locked_ref->extent_op = NULL;
2355 
2356 		if (!ref) {
2357 			/* All delayed refs have been processed, Go ahead
2358 			 * and send the head node to run_one_delayed_ref,
2359 			 * so that any accounting fixes can happen
2360 			 */
2361 			ref = &locked_ref->node;
2362 
2363 			if (extent_op && must_insert_reserved) {
2364 				btrfs_free_delayed_extent_op(extent_op);
2365 				extent_op = NULL;
2366 			}
2367 
2368 			if (extent_op) {
2369 				spin_unlock(&delayed_refs->lock);
2370 
2371 				ret = run_delayed_extent_op(trans, root,
2372 							    ref, extent_op);
2373 				btrfs_free_delayed_extent_op(extent_op);
2374 
2375 				if (ret) {
2376 					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2377 					spin_lock(&delayed_refs->lock);
2378 					btrfs_delayed_ref_unlock(locked_ref);
2379 					return ret;
2380 				}
2381 
2382 				goto next;
2383 			}
2384 		}
2385 
2386 		ref->in_tree = 0;
2387 		rb_erase(&ref->rb_node, &delayed_refs->root);
2388 		delayed_refs->num_entries--;
2389 		if (!btrfs_delayed_ref_is_head(ref)) {
2390 			/*
2391 			 * when we play the delayed ref, also correct the
2392 			 * ref_mod on head
2393 			 */
2394 			switch (ref->action) {
2395 			case BTRFS_ADD_DELAYED_REF:
2396 			case BTRFS_ADD_DELAYED_EXTENT:
2397 				locked_ref->node.ref_mod -= ref->ref_mod;
2398 				break;
2399 			case BTRFS_DROP_DELAYED_REF:
2400 				locked_ref->node.ref_mod += ref->ref_mod;
2401 				break;
2402 			default:
2403 				WARN_ON(1);
2404 			}
2405 		}
2406 		spin_unlock(&delayed_refs->lock);
2407 
2408 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2409 					  must_insert_reserved);
2410 
2411 		btrfs_free_delayed_extent_op(extent_op);
2412 		if (ret) {
2413 			btrfs_delayed_ref_unlock(locked_ref);
2414 			btrfs_put_delayed_ref(ref);
2415 			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2416 			spin_lock(&delayed_refs->lock);
2417 			return ret;
2418 		}
2419 
2420 		/*
2421 		 * If this node is a head, that means all the refs in this head
2422 		 * have been dealt with, and we will pick the next head to deal
2423 		 * with, so we must unlock the head and drop it from the cluster
2424 		 * list before we release it.
2425 		 */
2426 		if (btrfs_delayed_ref_is_head(ref)) {
2427 			list_del_init(&locked_ref->cluster);
2428 			btrfs_delayed_ref_unlock(locked_ref);
2429 			locked_ref = NULL;
2430 		}
2431 		btrfs_put_delayed_ref(ref);
2432 		count++;
2433 next:
2434 		cond_resched();
2435 		spin_lock(&delayed_refs->lock);
2436 	}
2437 	return count;
2438 }
2439 
2440 #ifdef SCRAMBLE_DELAYED_REFS
2441 /*
2442  * Normally delayed refs get processed in ascending bytenr order. This
2443  * correlates in most cases to the order added. To expose dependencies on this
2444  * order, we start to process the tree in the middle instead of the beginning
2445  */
2446 static u64 find_middle(struct rb_root *root)
2447 {
2448 	struct rb_node *n = root->rb_node;
2449 	struct btrfs_delayed_ref_node *entry;
2450 	int alt = 1;
2451 	u64 middle;
2452 	u64 first = 0, last = 0;
2453 
2454 	n = rb_first(root);
2455 	if (n) {
2456 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2457 		first = entry->bytenr;
2458 	}
2459 	n = rb_last(root);
2460 	if (n) {
2461 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2462 		last = entry->bytenr;
2463 	}
2464 	n = root->rb_node;
2465 
2466 	while (n) {
2467 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2468 		WARN_ON(!entry->in_tree);
2469 
2470 		middle = entry->bytenr;
2471 
2472 		if (alt)
2473 			n = n->rb_left;
2474 		else
2475 			n = n->rb_right;
2476 
2477 		alt = 1 - alt;
2478 	}
2479 	return middle;
2480 }
2481 #endif
2482 
2483 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2484 					 struct btrfs_fs_info *fs_info)
2485 {
2486 	struct qgroup_update *qgroup_update;
2487 	int ret = 0;
2488 
2489 	if (list_empty(&trans->qgroup_ref_list) !=
2490 	    !trans->delayed_ref_elem.seq) {
2491 		/* list without seq or seq without list */
2492 		btrfs_err(fs_info,
2493 			"qgroup accounting update error, list is%s empty, seq is %#x.%x",
2494 			list_empty(&trans->qgroup_ref_list) ? "" : " not",
2495 			(u32)(trans->delayed_ref_elem.seq >> 32),
2496 			(u32)trans->delayed_ref_elem.seq);
2497 		BUG();
2498 	}
2499 
2500 	if (!trans->delayed_ref_elem.seq)
2501 		return 0;
2502 
2503 	while (!list_empty(&trans->qgroup_ref_list)) {
2504 		qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2505 						 struct qgroup_update, list);
2506 		list_del(&qgroup_update->list);
2507 		if (!ret)
2508 			ret = btrfs_qgroup_account_ref(
2509 					trans, fs_info, qgroup_update->node,
2510 					qgroup_update->extent_op);
2511 		kfree(qgroup_update);
2512 	}
2513 
2514 	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2515 
2516 	return ret;
2517 }
2518 
2519 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2520 		      int count)
2521 {
2522 	int val = atomic_read(&delayed_refs->ref_seq);
2523 
2524 	if (val < seq || val >= seq + count)
2525 		return 1;
2526 	return 0;
2527 }
2528 
2529 /*
2530  * this starts processing the delayed reference count updates and
2531  * extent insertions we have queued up so far.  count can be
2532  * 0, which means to process everything in the tree at the start
2533  * of the run (but not newly added entries), or it can be some target
2534  * number you'd like to process.
2535  *
2536  * Returns 0 on success or if called with an aborted transaction
2537  * Returns <0 on error and aborts the transaction
2538  */
2539 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2540 			   struct btrfs_root *root, unsigned long count)
2541 {
2542 	struct rb_node *node;
2543 	struct btrfs_delayed_ref_root *delayed_refs;
2544 	struct btrfs_delayed_ref_node *ref;
2545 	struct list_head cluster;
2546 	int ret;
2547 	u64 delayed_start;
2548 	int run_all = count == (unsigned long)-1;
2549 	int run_most = 0;
2550 	int loops;
2551 
2552 	/* We'll clean this up in btrfs_cleanup_transaction */
2553 	if (trans->aborted)
2554 		return 0;
2555 
2556 	if (root == root->fs_info->extent_root)
2557 		root = root->fs_info->tree_root;
2558 
2559 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2560 
2561 	delayed_refs = &trans->transaction->delayed_refs;
2562 	INIT_LIST_HEAD(&cluster);
2563 	if (count == 0) {
2564 		count = delayed_refs->num_entries * 2;
2565 		run_most = 1;
2566 	}
2567 
2568 	if (!run_all && !run_most) {
2569 		int old;
2570 		int seq = atomic_read(&delayed_refs->ref_seq);
2571 
2572 progress:
2573 		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2574 		if (old) {
2575 			DEFINE_WAIT(__wait);
2576 			if (delayed_refs->num_entries < 16348)
2577 				return 0;
2578 
2579 			prepare_to_wait(&delayed_refs->wait, &__wait,
2580 					TASK_UNINTERRUPTIBLE);
2581 
2582 			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2583 			if (old) {
2584 				schedule();
2585 				finish_wait(&delayed_refs->wait, &__wait);
2586 
2587 				if (!refs_newer(delayed_refs, seq, 256))
2588 					goto progress;
2589 				else
2590 					return 0;
2591 			} else {
2592 				finish_wait(&delayed_refs->wait, &__wait);
2593 				goto again;
2594 			}
2595 		}
2596 
2597 	} else {
2598 		atomic_inc(&delayed_refs->procs_running_refs);
2599 	}
2600 
2601 again:
2602 	loops = 0;
2603 	spin_lock(&delayed_refs->lock);
2604 
2605 #ifdef SCRAMBLE_DELAYED_REFS
2606 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2607 #endif
2608 
2609 	while (1) {
2610 		if (!(run_all || run_most) &&
2611 		    delayed_refs->num_heads_ready < 64)
2612 			break;
2613 
2614 		/*
2615 		 * go find something we can process in the rbtree.  We start at
2616 		 * the beginning of the tree, and then build a cluster
2617 		 * of refs to process starting at the first one we are able to
2618 		 * lock
2619 		 */
2620 		delayed_start = delayed_refs->run_delayed_start;
2621 		ret = btrfs_find_ref_cluster(trans, &cluster,
2622 					     delayed_refs->run_delayed_start);
2623 		if (ret)
2624 			break;
2625 
2626 		ret = run_clustered_refs(trans, root, &cluster);
2627 		if (ret < 0) {
2628 			btrfs_release_ref_cluster(&cluster);
2629 			spin_unlock(&delayed_refs->lock);
2630 			btrfs_abort_transaction(trans, root, ret);
2631 			atomic_dec(&delayed_refs->procs_running_refs);
2632 			return ret;
2633 		}
2634 
2635 		atomic_add(ret, &delayed_refs->ref_seq);
2636 
2637 		count -= min_t(unsigned long, ret, count);
2638 
2639 		if (count == 0)
2640 			break;
2641 
2642 		if (delayed_start >= delayed_refs->run_delayed_start) {
2643 			if (loops == 0) {
2644 				/*
2645 				 * btrfs_find_ref_cluster looped. let's do one
2646 				 * more cycle. if we don't run any delayed ref
2647 				 * during that cycle (because we can't because
2648 				 * all of them are blocked), bail out.
2649 				 */
2650 				loops = 1;
2651 			} else {
2652 				/*
2653 				 * no runnable refs left, stop trying
2654 				 */
2655 				BUG_ON(run_all);
2656 				break;
2657 			}
2658 		}
2659 		if (ret) {
2660 			/* refs were run, let's reset staleness detection */
2661 			loops = 0;
2662 		}
2663 	}
2664 
2665 	if (run_all) {
2666 		if (!list_empty(&trans->new_bgs)) {
2667 			spin_unlock(&delayed_refs->lock);
2668 			btrfs_create_pending_block_groups(trans, root);
2669 			spin_lock(&delayed_refs->lock);
2670 		}
2671 
2672 		node = rb_first(&delayed_refs->root);
2673 		if (!node)
2674 			goto out;
2675 		count = (unsigned long)-1;
2676 
2677 		while (node) {
2678 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
2679 				       rb_node);
2680 			if (btrfs_delayed_ref_is_head(ref)) {
2681 				struct btrfs_delayed_ref_head *head;
2682 
2683 				head = btrfs_delayed_node_to_head(ref);
2684 				atomic_inc(&ref->refs);
2685 
2686 				spin_unlock(&delayed_refs->lock);
2687 				/*
2688 				 * Mutex was contended, block until it's
2689 				 * released and try again
2690 				 */
2691 				mutex_lock(&head->mutex);
2692 				mutex_unlock(&head->mutex);
2693 
2694 				btrfs_put_delayed_ref(ref);
2695 				cond_resched();
2696 				goto again;
2697 			}
2698 			node = rb_next(node);
2699 		}
2700 		spin_unlock(&delayed_refs->lock);
2701 		schedule_timeout(1);
2702 		goto again;
2703 	}
2704 out:
2705 	atomic_dec(&delayed_refs->procs_running_refs);
2706 	smp_mb();
2707 	if (waitqueue_active(&delayed_refs->wait))
2708 		wake_up(&delayed_refs->wait);
2709 
2710 	spin_unlock(&delayed_refs->lock);
2711 	assert_qgroups_uptodate(trans);
2712 	return 0;
2713 }
2714 
2715 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2716 				struct btrfs_root *root,
2717 				u64 bytenr, u64 num_bytes, u64 flags,
2718 				int level, int is_data)
2719 {
2720 	struct btrfs_delayed_extent_op *extent_op;
2721 	int ret;
2722 
2723 	extent_op = btrfs_alloc_delayed_extent_op();
2724 	if (!extent_op)
2725 		return -ENOMEM;
2726 
2727 	extent_op->flags_to_set = flags;
2728 	extent_op->update_flags = 1;
2729 	extent_op->update_key = 0;
2730 	extent_op->is_data = is_data ? 1 : 0;
2731 	extent_op->level = level;
2732 
2733 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2734 					  num_bytes, extent_op);
2735 	if (ret)
2736 		btrfs_free_delayed_extent_op(extent_op);
2737 	return ret;
2738 }
2739 
2740 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2741 				      struct btrfs_root *root,
2742 				      struct btrfs_path *path,
2743 				      u64 objectid, u64 offset, u64 bytenr)
2744 {
2745 	struct btrfs_delayed_ref_head *head;
2746 	struct btrfs_delayed_ref_node *ref;
2747 	struct btrfs_delayed_data_ref *data_ref;
2748 	struct btrfs_delayed_ref_root *delayed_refs;
2749 	struct rb_node *node;
2750 	int ret = 0;
2751 
2752 	ret = -ENOENT;
2753 	delayed_refs = &trans->transaction->delayed_refs;
2754 	spin_lock(&delayed_refs->lock);
2755 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2756 	if (!head)
2757 		goto out;
2758 
2759 	if (!mutex_trylock(&head->mutex)) {
2760 		atomic_inc(&head->node.refs);
2761 		spin_unlock(&delayed_refs->lock);
2762 
2763 		btrfs_release_path(path);
2764 
2765 		/*
2766 		 * Mutex was contended, block until it's released and let
2767 		 * caller try again
2768 		 */
2769 		mutex_lock(&head->mutex);
2770 		mutex_unlock(&head->mutex);
2771 		btrfs_put_delayed_ref(&head->node);
2772 		return -EAGAIN;
2773 	}
2774 
2775 	node = rb_prev(&head->node.rb_node);
2776 	if (!node)
2777 		goto out_unlock;
2778 
2779 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2780 
2781 	if (ref->bytenr != bytenr)
2782 		goto out_unlock;
2783 
2784 	ret = 1;
2785 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2786 		goto out_unlock;
2787 
2788 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2789 
2790 	node = rb_prev(node);
2791 	if (node) {
2792 		int seq = ref->seq;
2793 
2794 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2795 		if (ref->bytenr == bytenr && ref->seq == seq)
2796 			goto out_unlock;
2797 	}
2798 
2799 	if (data_ref->root != root->root_key.objectid ||
2800 	    data_ref->objectid != objectid || data_ref->offset != offset)
2801 		goto out_unlock;
2802 
2803 	ret = 0;
2804 out_unlock:
2805 	mutex_unlock(&head->mutex);
2806 out:
2807 	spin_unlock(&delayed_refs->lock);
2808 	return ret;
2809 }
2810 
2811 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2812 					struct btrfs_root *root,
2813 					struct btrfs_path *path,
2814 					u64 objectid, u64 offset, u64 bytenr)
2815 {
2816 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2817 	struct extent_buffer *leaf;
2818 	struct btrfs_extent_data_ref *ref;
2819 	struct btrfs_extent_inline_ref *iref;
2820 	struct btrfs_extent_item *ei;
2821 	struct btrfs_key key;
2822 	u32 item_size;
2823 	int ret;
2824 
2825 	key.objectid = bytenr;
2826 	key.offset = (u64)-1;
2827 	key.type = BTRFS_EXTENT_ITEM_KEY;
2828 
2829 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2830 	if (ret < 0)
2831 		goto out;
2832 	BUG_ON(ret == 0); /* Corruption */
2833 
2834 	ret = -ENOENT;
2835 	if (path->slots[0] == 0)
2836 		goto out;
2837 
2838 	path->slots[0]--;
2839 	leaf = path->nodes[0];
2840 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2841 
2842 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2843 		goto out;
2844 
2845 	ret = 1;
2846 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2847 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2848 	if (item_size < sizeof(*ei)) {
2849 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2850 		goto out;
2851 	}
2852 #endif
2853 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2854 
2855 	if (item_size != sizeof(*ei) +
2856 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2857 		goto out;
2858 
2859 	if (btrfs_extent_generation(leaf, ei) <=
2860 	    btrfs_root_last_snapshot(&root->root_item))
2861 		goto out;
2862 
2863 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2864 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2865 	    BTRFS_EXTENT_DATA_REF_KEY)
2866 		goto out;
2867 
2868 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2869 	if (btrfs_extent_refs(leaf, ei) !=
2870 	    btrfs_extent_data_ref_count(leaf, ref) ||
2871 	    btrfs_extent_data_ref_root(leaf, ref) !=
2872 	    root->root_key.objectid ||
2873 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2874 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2875 		goto out;
2876 
2877 	ret = 0;
2878 out:
2879 	return ret;
2880 }
2881 
2882 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2883 			  struct btrfs_root *root,
2884 			  u64 objectid, u64 offset, u64 bytenr)
2885 {
2886 	struct btrfs_path *path;
2887 	int ret;
2888 	int ret2;
2889 
2890 	path = btrfs_alloc_path();
2891 	if (!path)
2892 		return -ENOENT;
2893 
2894 	do {
2895 		ret = check_committed_ref(trans, root, path, objectid,
2896 					  offset, bytenr);
2897 		if (ret && ret != -ENOENT)
2898 			goto out;
2899 
2900 		ret2 = check_delayed_ref(trans, root, path, objectid,
2901 					 offset, bytenr);
2902 	} while (ret2 == -EAGAIN);
2903 
2904 	if (ret2 && ret2 != -ENOENT) {
2905 		ret = ret2;
2906 		goto out;
2907 	}
2908 
2909 	if (ret != -ENOENT || ret2 != -ENOENT)
2910 		ret = 0;
2911 out:
2912 	btrfs_free_path(path);
2913 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2914 		WARN_ON(ret > 0);
2915 	return ret;
2916 }
2917 
2918 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2919 			   struct btrfs_root *root,
2920 			   struct extent_buffer *buf,
2921 			   int full_backref, int inc, int for_cow)
2922 {
2923 	u64 bytenr;
2924 	u64 num_bytes;
2925 	u64 parent;
2926 	u64 ref_root;
2927 	u32 nritems;
2928 	struct btrfs_key key;
2929 	struct btrfs_file_extent_item *fi;
2930 	int i;
2931 	int level;
2932 	int ret = 0;
2933 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2934 			    u64, u64, u64, u64, u64, u64, int);
2935 
2936 	ref_root = btrfs_header_owner(buf);
2937 	nritems = btrfs_header_nritems(buf);
2938 	level = btrfs_header_level(buf);
2939 
2940 	if (!root->ref_cows && level == 0)
2941 		return 0;
2942 
2943 	if (inc)
2944 		process_func = btrfs_inc_extent_ref;
2945 	else
2946 		process_func = btrfs_free_extent;
2947 
2948 	if (full_backref)
2949 		parent = buf->start;
2950 	else
2951 		parent = 0;
2952 
2953 	for (i = 0; i < nritems; i++) {
2954 		if (level == 0) {
2955 			btrfs_item_key_to_cpu(buf, &key, i);
2956 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2957 				continue;
2958 			fi = btrfs_item_ptr(buf, i,
2959 					    struct btrfs_file_extent_item);
2960 			if (btrfs_file_extent_type(buf, fi) ==
2961 			    BTRFS_FILE_EXTENT_INLINE)
2962 				continue;
2963 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2964 			if (bytenr == 0)
2965 				continue;
2966 
2967 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2968 			key.offset -= btrfs_file_extent_offset(buf, fi);
2969 			ret = process_func(trans, root, bytenr, num_bytes,
2970 					   parent, ref_root, key.objectid,
2971 					   key.offset, for_cow);
2972 			if (ret)
2973 				goto fail;
2974 		} else {
2975 			bytenr = btrfs_node_blockptr(buf, i);
2976 			num_bytes = btrfs_level_size(root, level - 1);
2977 			ret = process_func(trans, root, bytenr, num_bytes,
2978 					   parent, ref_root, level - 1, 0,
2979 					   for_cow);
2980 			if (ret)
2981 				goto fail;
2982 		}
2983 	}
2984 	return 0;
2985 fail:
2986 	return ret;
2987 }
2988 
2989 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2990 		  struct extent_buffer *buf, int full_backref, int for_cow)
2991 {
2992 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2993 }
2994 
2995 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2996 		  struct extent_buffer *buf, int full_backref, int for_cow)
2997 {
2998 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2999 }
3000 
3001 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3002 				 struct btrfs_root *root,
3003 				 struct btrfs_path *path,
3004 				 struct btrfs_block_group_cache *cache)
3005 {
3006 	int ret;
3007 	struct btrfs_root *extent_root = root->fs_info->extent_root;
3008 	unsigned long bi;
3009 	struct extent_buffer *leaf;
3010 
3011 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3012 	if (ret < 0)
3013 		goto fail;
3014 	BUG_ON(ret); /* Corruption */
3015 
3016 	leaf = path->nodes[0];
3017 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3018 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3019 	btrfs_mark_buffer_dirty(leaf);
3020 	btrfs_release_path(path);
3021 fail:
3022 	if (ret) {
3023 		btrfs_abort_transaction(trans, root, ret);
3024 		return ret;
3025 	}
3026 	return 0;
3027 
3028 }
3029 
3030 static struct btrfs_block_group_cache *
3031 next_block_group(struct btrfs_root *root,
3032 		 struct btrfs_block_group_cache *cache)
3033 {
3034 	struct rb_node *node;
3035 	spin_lock(&root->fs_info->block_group_cache_lock);
3036 	node = rb_next(&cache->cache_node);
3037 	btrfs_put_block_group(cache);
3038 	if (node) {
3039 		cache = rb_entry(node, struct btrfs_block_group_cache,
3040 				 cache_node);
3041 		btrfs_get_block_group(cache);
3042 	} else
3043 		cache = NULL;
3044 	spin_unlock(&root->fs_info->block_group_cache_lock);
3045 	return cache;
3046 }
3047 
3048 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3049 			    struct btrfs_trans_handle *trans,
3050 			    struct btrfs_path *path)
3051 {
3052 	struct btrfs_root *root = block_group->fs_info->tree_root;
3053 	struct inode *inode = NULL;
3054 	u64 alloc_hint = 0;
3055 	int dcs = BTRFS_DC_ERROR;
3056 	int num_pages = 0;
3057 	int retries = 0;
3058 	int ret = 0;
3059 
3060 	/*
3061 	 * If this block group is smaller than 100 megs don't bother caching the
3062 	 * block group.
3063 	 */
3064 	if (block_group->key.offset < (100 * 1024 * 1024)) {
3065 		spin_lock(&block_group->lock);
3066 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3067 		spin_unlock(&block_group->lock);
3068 		return 0;
3069 	}
3070 
3071 again:
3072 	inode = lookup_free_space_inode(root, block_group, path);
3073 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3074 		ret = PTR_ERR(inode);
3075 		btrfs_release_path(path);
3076 		goto out;
3077 	}
3078 
3079 	if (IS_ERR(inode)) {
3080 		BUG_ON(retries);
3081 		retries++;
3082 
3083 		if (block_group->ro)
3084 			goto out_free;
3085 
3086 		ret = create_free_space_inode(root, trans, block_group, path);
3087 		if (ret)
3088 			goto out_free;
3089 		goto again;
3090 	}
3091 
3092 	/* We've already setup this transaction, go ahead and exit */
3093 	if (block_group->cache_generation == trans->transid &&
3094 	    i_size_read(inode)) {
3095 		dcs = BTRFS_DC_SETUP;
3096 		goto out_put;
3097 	}
3098 
3099 	/*
3100 	 * We want to set the generation to 0, that way if anything goes wrong
3101 	 * from here on out we know not to trust this cache when we load up next
3102 	 * time.
3103 	 */
3104 	BTRFS_I(inode)->generation = 0;
3105 	ret = btrfs_update_inode(trans, root, inode);
3106 	WARN_ON(ret);
3107 
3108 	if (i_size_read(inode) > 0) {
3109 		ret = btrfs_check_trunc_cache_free_space(root,
3110 					&root->fs_info->global_block_rsv);
3111 		if (ret)
3112 			goto out_put;
3113 
3114 		ret = btrfs_truncate_free_space_cache(root, trans, path,
3115 						      inode);
3116 		if (ret)
3117 			goto out_put;
3118 	}
3119 
3120 	spin_lock(&block_group->lock);
3121 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3122 	    !btrfs_test_opt(root, SPACE_CACHE)) {
3123 		/*
3124 		 * don't bother trying to write stuff out _if_
3125 		 * a) we're not cached,
3126 		 * b) we're with nospace_cache mount option.
3127 		 */
3128 		dcs = BTRFS_DC_WRITTEN;
3129 		spin_unlock(&block_group->lock);
3130 		goto out_put;
3131 	}
3132 	spin_unlock(&block_group->lock);
3133 
3134 	/*
3135 	 * Try to preallocate enough space based on how big the block group is.
3136 	 * Keep in mind this has to include any pinned space which could end up
3137 	 * taking up quite a bit since it's not folded into the other space
3138 	 * cache.
3139 	 */
3140 	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3141 	if (!num_pages)
3142 		num_pages = 1;
3143 
3144 	num_pages *= 16;
3145 	num_pages *= PAGE_CACHE_SIZE;
3146 
3147 	ret = btrfs_check_data_free_space(inode, num_pages);
3148 	if (ret)
3149 		goto out_put;
3150 
3151 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3152 					      num_pages, num_pages,
3153 					      &alloc_hint);
3154 	if (!ret)
3155 		dcs = BTRFS_DC_SETUP;
3156 	btrfs_free_reserved_data_space(inode, num_pages);
3157 
3158 out_put:
3159 	iput(inode);
3160 out_free:
3161 	btrfs_release_path(path);
3162 out:
3163 	spin_lock(&block_group->lock);
3164 	if (!ret && dcs == BTRFS_DC_SETUP)
3165 		block_group->cache_generation = trans->transid;
3166 	block_group->disk_cache_state = dcs;
3167 	spin_unlock(&block_group->lock);
3168 
3169 	return ret;
3170 }
3171 
3172 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3173 				   struct btrfs_root *root)
3174 {
3175 	struct btrfs_block_group_cache *cache;
3176 	int err = 0;
3177 	struct btrfs_path *path;
3178 	u64 last = 0;
3179 
3180 	path = btrfs_alloc_path();
3181 	if (!path)
3182 		return -ENOMEM;
3183 
3184 again:
3185 	while (1) {
3186 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3187 		while (cache) {
3188 			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3189 				break;
3190 			cache = next_block_group(root, cache);
3191 		}
3192 		if (!cache) {
3193 			if (last == 0)
3194 				break;
3195 			last = 0;
3196 			continue;
3197 		}
3198 		err = cache_save_setup(cache, trans, path);
3199 		last = cache->key.objectid + cache->key.offset;
3200 		btrfs_put_block_group(cache);
3201 	}
3202 
3203 	while (1) {
3204 		if (last == 0) {
3205 			err = btrfs_run_delayed_refs(trans, root,
3206 						     (unsigned long)-1);
3207 			if (err) /* File system offline */
3208 				goto out;
3209 		}
3210 
3211 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3212 		while (cache) {
3213 			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3214 				btrfs_put_block_group(cache);
3215 				goto again;
3216 			}
3217 
3218 			if (cache->dirty)
3219 				break;
3220 			cache = next_block_group(root, cache);
3221 		}
3222 		if (!cache) {
3223 			if (last == 0)
3224 				break;
3225 			last = 0;
3226 			continue;
3227 		}
3228 
3229 		if (cache->disk_cache_state == BTRFS_DC_SETUP)
3230 			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3231 		cache->dirty = 0;
3232 		last = cache->key.objectid + cache->key.offset;
3233 
3234 		err = write_one_cache_group(trans, root, path, cache);
3235 		if (err) /* File system offline */
3236 			goto out;
3237 
3238 		btrfs_put_block_group(cache);
3239 	}
3240 
3241 	while (1) {
3242 		/*
3243 		 * I don't think this is needed since we're just marking our
3244 		 * preallocated extent as written, but just in case it can't
3245 		 * hurt.
3246 		 */
3247 		if (last == 0) {
3248 			err = btrfs_run_delayed_refs(trans, root,
3249 						     (unsigned long)-1);
3250 			if (err) /* File system offline */
3251 				goto out;
3252 		}
3253 
3254 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3255 		while (cache) {
3256 			/*
3257 			 * Really this shouldn't happen, but it could if we
3258 			 * couldn't write the entire preallocated extent and
3259 			 * splitting the extent resulted in a new block.
3260 			 */
3261 			if (cache->dirty) {
3262 				btrfs_put_block_group(cache);
3263 				goto again;
3264 			}
3265 			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3266 				break;
3267 			cache = next_block_group(root, cache);
3268 		}
3269 		if (!cache) {
3270 			if (last == 0)
3271 				break;
3272 			last = 0;
3273 			continue;
3274 		}
3275 
3276 		err = btrfs_write_out_cache(root, trans, cache, path);
3277 
3278 		/*
3279 		 * If we didn't have an error then the cache state is still
3280 		 * NEED_WRITE, so we can set it to WRITTEN.
3281 		 */
3282 		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3283 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
3284 		last = cache->key.objectid + cache->key.offset;
3285 		btrfs_put_block_group(cache);
3286 	}
3287 out:
3288 
3289 	btrfs_free_path(path);
3290 	return err;
3291 }
3292 
3293 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3294 {
3295 	struct btrfs_block_group_cache *block_group;
3296 	int readonly = 0;
3297 
3298 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3299 	if (!block_group || block_group->ro)
3300 		readonly = 1;
3301 	if (block_group)
3302 		btrfs_put_block_group(block_group);
3303 	return readonly;
3304 }
3305 
3306 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3307 			     u64 total_bytes, u64 bytes_used,
3308 			     struct btrfs_space_info **space_info)
3309 {
3310 	struct btrfs_space_info *found;
3311 	int i;
3312 	int factor;
3313 
3314 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3315 		     BTRFS_BLOCK_GROUP_RAID10))
3316 		factor = 2;
3317 	else
3318 		factor = 1;
3319 
3320 	found = __find_space_info(info, flags);
3321 	if (found) {
3322 		spin_lock(&found->lock);
3323 		found->total_bytes += total_bytes;
3324 		found->disk_total += total_bytes * factor;
3325 		found->bytes_used += bytes_used;
3326 		found->disk_used += bytes_used * factor;
3327 		found->full = 0;
3328 		spin_unlock(&found->lock);
3329 		*space_info = found;
3330 		return 0;
3331 	}
3332 	found = kzalloc(sizeof(*found), GFP_NOFS);
3333 	if (!found)
3334 		return -ENOMEM;
3335 
3336 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3337 		INIT_LIST_HEAD(&found->block_groups[i]);
3338 	init_rwsem(&found->groups_sem);
3339 	spin_lock_init(&found->lock);
3340 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3341 	found->total_bytes = total_bytes;
3342 	found->disk_total = total_bytes * factor;
3343 	found->bytes_used = bytes_used;
3344 	found->disk_used = bytes_used * factor;
3345 	found->bytes_pinned = 0;
3346 	found->bytes_reserved = 0;
3347 	found->bytes_readonly = 0;
3348 	found->bytes_may_use = 0;
3349 	found->full = 0;
3350 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3351 	found->chunk_alloc = 0;
3352 	found->flush = 0;
3353 	init_waitqueue_head(&found->wait);
3354 	*space_info = found;
3355 	list_add_rcu(&found->list, &info->space_info);
3356 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3357 		info->data_sinfo = found;
3358 	return 0;
3359 }
3360 
3361 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3362 {
3363 	u64 extra_flags = chunk_to_extended(flags) &
3364 				BTRFS_EXTENDED_PROFILE_MASK;
3365 
3366 	write_seqlock(&fs_info->profiles_lock);
3367 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3368 		fs_info->avail_data_alloc_bits |= extra_flags;
3369 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
3370 		fs_info->avail_metadata_alloc_bits |= extra_flags;
3371 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3372 		fs_info->avail_system_alloc_bits |= extra_flags;
3373 	write_sequnlock(&fs_info->profiles_lock);
3374 }
3375 
3376 /*
3377  * returns target flags in extended format or 0 if restripe for this
3378  * chunk_type is not in progress
3379  *
3380  * should be called with either volume_mutex or balance_lock held
3381  */
3382 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3383 {
3384 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3385 	u64 target = 0;
3386 
3387 	if (!bctl)
3388 		return 0;
3389 
3390 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
3391 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3392 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3393 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3394 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3395 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3396 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3397 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3398 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3399 	}
3400 
3401 	return target;
3402 }
3403 
3404 /*
3405  * @flags: available profiles in extended format (see ctree.h)
3406  *
3407  * Returns reduced profile in chunk format.  If profile changing is in
3408  * progress (either running or paused) picks the target profile (if it's
3409  * already available), otherwise falls back to plain reducing.
3410  */
3411 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3412 {
3413 	/*
3414 	 * we add in the count of missing devices because we want
3415 	 * to make sure that any RAID levels on a degraded FS
3416 	 * continue to be honored.
3417 	 */
3418 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
3419 		root->fs_info->fs_devices->missing_devices;
3420 	u64 target;
3421 	u64 tmp;
3422 
3423 	/*
3424 	 * see if restripe for this chunk_type is in progress, if so
3425 	 * try to reduce to the target profile
3426 	 */
3427 	spin_lock(&root->fs_info->balance_lock);
3428 	target = get_restripe_target(root->fs_info, flags);
3429 	if (target) {
3430 		/* pick target profile only if it's already available */
3431 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3432 			spin_unlock(&root->fs_info->balance_lock);
3433 			return extended_to_chunk(target);
3434 		}
3435 	}
3436 	spin_unlock(&root->fs_info->balance_lock);
3437 
3438 	/* First, mask out the RAID levels which aren't possible */
3439 	if (num_devices == 1)
3440 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3441 			   BTRFS_BLOCK_GROUP_RAID5);
3442 	if (num_devices < 3)
3443 		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3444 	if (num_devices < 4)
3445 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3446 
3447 	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3448 		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3449 		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3450 	flags &= ~tmp;
3451 
3452 	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3453 		tmp = BTRFS_BLOCK_GROUP_RAID6;
3454 	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3455 		tmp = BTRFS_BLOCK_GROUP_RAID5;
3456 	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3457 		tmp = BTRFS_BLOCK_GROUP_RAID10;
3458 	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3459 		tmp = BTRFS_BLOCK_GROUP_RAID1;
3460 	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3461 		tmp = BTRFS_BLOCK_GROUP_RAID0;
3462 
3463 	return extended_to_chunk(flags | tmp);
3464 }
3465 
3466 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3467 {
3468 	unsigned seq;
3469 
3470 	do {
3471 		seq = read_seqbegin(&root->fs_info->profiles_lock);
3472 
3473 		if (flags & BTRFS_BLOCK_GROUP_DATA)
3474 			flags |= root->fs_info->avail_data_alloc_bits;
3475 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3476 			flags |= root->fs_info->avail_system_alloc_bits;
3477 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3478 			flags |= root->fs_info->avail_metadata_alloc_bits;
3479 	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
3480 
3481 	return btrfs_reduce_alloc_profile(root, flags);
3482 }
3483 
3484 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3485 {
3486 	u64 flags;
3487 	u64 ret;
3488 
3489 	if (data)
3490 		flags = BTRFS_BLOCK_GROUP_DATA;
3491 	else if (root == root->fs_info->chunk_root)
3492 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
3493 	else
3494 		flags = BTRFS_BLOCK_GROUP_METADATA;
3495 
3496 	ret = get_alloc_profile(root, flags);
3497 	return ret;
3498 }
3499 
3500 /*
3501  * This will check the space that the inode allocates from to make sure we have
3502  * enough space for bytes.
3503  */
3504 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3505 {
3506 	struct btrfs_space_info *data_sinfo;
3507 	struct btrfs_root *root = BTRFS_I(inode)->root;
3508 	struct btrfs_fs_info *fs_info = root->fs_info;
3509 	u64 used;
3510 	int ret = 0, committed = 0, alloc_chunk = 1;
3511 
3512 	/* make sure bytes are sectorsize aligned */
3513 	bytes = ALIGN(bytes, root->sectorsize);
3514 
3515 	if (root == root->fs_info->tree_root ||
3516 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3517 		alloc_chunk = 0;
3518 		committed = 1;
3519 	}
3520 
3521 	data_sinfo = fs_info->data_sinfo;
3522 	if (!data_sinfo)
3523 		goto alloc;
3524 
3525 again:
3526 	/* make sure we have enough space to handle the data first */
3527 	spin_lock(&data_sinfo->lock);
3528 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3529 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3530 		data_sinfo->bytes_may_use;
3531 
3532 	if (used + bytes > data_sinfo->total_bytes) {
3533 		struct btrfs_trans_handle *trans;
3534 
3535 		/*
3536 		 * if we don't have enough free bytes in this space then we need
3537 		 * to alloc a new chunk.
3538 		 */
3539 		if (!data_sinfo->full && alloc_chunk) {
3540 			u64 alloc_target;
3541 
3542 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3543 			spin_unlock(&data_sinfo->lock);
3544 alloc:
3545 			alloc_target = btrfs_get_alloc_profile(root, 1);
3546 			trans = btrfs_join_transaction(root);
3547 			if (IS_ERR(trans))
3548 				return PTR_ERR(trans);
3549 
3550 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3551 					     alloc_target,
3552 					     CHUNK_ALLOC_NO_FORCE);
3553 			btrfs_end_transaction(trans, root);
3554 			if (ret < 0) {
3555 				if (ret != -ENOSPC)
3556 					return ret;
3557 				else
3558 					goto commit_trans;
3559 			}
3560 
3561 			if (!data_sinfo)
3562 				data_sinfo = fs_info->data_sinfo;
3563 
3564 			goto again;
3565 		}
3566 
3567 		/*
3568 		 * If we have less pinned bytes than we want to allocate then
3569 		 * don't bother committing the transaction, it won't help us.
3570 		 */
3571 		if (data_sinfo->bytes_pinned < bytes)
3572 			committed = 1;
3573 		spin_unlock(&data_sinfo->lock);
3574 
3575 		/* commit the current transaction and try again */
3576 commit_trans:
3577 		if (!committed &&
3578 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3579 			committed = 1;
3580 			trans = btrfs_join_transaction(root);
3581 			if (IS_ERR(trans))
3582 				return PTR_ERR(trans);
3583 			ret = btrfs_commit_transaction(trans, root);
3584 			if (ret)
3585 				return ret;
3586 			goto again;
3587 		}
3588 
3589 		return -ENOSPC;
3590 	}
3591 	data_sinfo->bytes_may_use += bytes;
3592 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3593 				      data_sinfo->flags, bytes, 1);
3594 	spin_unlock(&data_sinfo->lock);
3595 
3596 	return 0;
3597 }
3598 
3599 /*
3600  * Called if we need to clear a data reservation for this inode.
3601  */
3602 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3603 {
3604 	struct btrfs_root *root = BTRFS_I(inode)->root;
3605 	struct btrfs_space_info *data_sinfo;
3606 
3607 	/* make sure bytes are sectorsize aligned */
3608 	bytes = ALIGN(bytes, root->sectorsize);
3609 
3610 	data_sinfo = root->fs_info->data_sinfo;
3611 	spin_lock(&data_sinfo->lock);
3612 	data_sinfo->bytes_may_use -= bytes;
3613 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3614 				      data_sinfo->flags, bytes, 0);
3615 	spin_unlock(&data_sinfo->lock);
3616 }
3617 
3618 static void force_metadata_allocation(struct btrfs_fs_info *info)
3619 {
3620 	struct list_head *head = &info->space_info;
3621 	struct btrfs_space_info *found;
3622 
3623 	rcu_read_lock();
3624 	list_for_each_entry_rcu(found, head, list) {
3625 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3626 			found->force_alloc = CHUNK_ALLOC_FORCE;
3627 	}
3628 	rcu_read_unlock();
3629 }
3630 
3631 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
3632 {
3633 	return (global->size << 1);
3634 }
3635 
3636 static int should_alloc_chunk(struct btrfs_root *root,
3637 			      struct btrfs_space_info *sinfo, int force)
3638 {
3639 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3640 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3641 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3642 	u64 thresh;
3643 
3644 	if (force == CHUNK_ALLOC_FORCE)
3645 		return 1;
3646 
3647 	/*
3648 	 * We need to take into account the global rsv because for all intents
3649 	 * and purposes it's used space.  Don't worry about locking the
3650 	 * global_rsv, it doesn't change except when the transaction commits.
3651 	 */
3652 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3653 		num_allocated += calc_global_rsv_need_space(global_rsv);
3654 
3655 	/*
3656 	 * in limited mode, we want to have some free space up to
3657 	 * about 1% of the FS size.
3658 	 */
3659 	if (force == CHUNK_ALLOC_LIMITED) {
3660 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3661 		thresh = max_t(u64, 64 * 1024 * 1024,
3662 			       div_factor_fine(thresh, 1));
3663 
3664 		if (num_bytes - num_allocated < thresh)
3665 			return 1;
3666 	}
3667 
3668 	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3669 		return 0;
3670 	return 1;
3671 }
3672 
3673 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3674 {
3675 	u64 num_dev;
3676 
3677 	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3678 		    BTRFS_BLOCK_GROUP_RAID0 |
3679 		    BTRFS_BLOCK_GROUP_RAID5 |
3680 		    BTRFS_BLOCK_GROUP_RAID6))
3681 		num_dev = root->fs_info->fs_devices->rw_devices;
3682 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
3683 		num_dev = 2;
3684 	else
3685 		num_dev = 1;	/* DUP or single */
3686 
3687 	/* metadata for updaing devices and chunk tree */
3688 	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3689 }
3690 
3691 static void check_system_chunk(struct btrfs_trans_handle *trans,
3692 			       struct btrfs_root *root, u64 type)
3693 {
3694 	struct btrfs_space_info *info;
3695 	u64 left;
3696 	u64 thresh;
3697 
3698 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3699 	spin_lock(&info->lock);
3700 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3701 		info->bytes_reserved - info->bytes_readonly;
3702 	spin_unlock(&info->lock);
3703 
3704 	thresh = get_system_chunk_thresh(root, type);
3705 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3706 		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
3707 			left, thresh, type);
3708 		dump_space_info(info, 0, 0);
3709 	}
3710 
3711 	if (left < thresh) {
3712 		u64 flags;
3713 
3714 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3715 		btrfs_alloc_chunk(trans, root, flags);
3716 	}
3717 }
3718 
3719 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3720 			  struct btrfs_root *extent_root, u64 flags, int force)
3721 {
3722 	struct btrfs_space_info *space_info;
3723 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3724 	int wait_for_alloc = 0;
3725 	int ret = 0;
3726 
3727 	/* Don't re-enter if we're already allocating a chunk */
3728 	if (trans->allocating_chunk)
3729 		return -ENOSPC;
3730 
3731 	space_info = __find_space_info(extent_root->fs_info, flags);
3732 	if (!space_info) {
3733 		ret = update_space_info(extent_root->fs_info, flags,
3734 					0, 0, &space_info);
3735 		BUG_ON(ret); /* -ENOMEM */
3736 	}
3737 	BUG_ON(!space_info); /* Logic error */
3738 
3739 again:
3740 	spin_lock(&space_info->lock);
3741 	if (force < space_info->force_alloc)
3742 		force = space_info->force_alloc;
3743 	if (space_info->full) {
3744 		spin_unlock(&space_info->lock);
3745 		return 0;
3746 	}
3747 
3748 	if (!should_alloc_chunk(extent_root, space_info, force)) {
3749 		spin_unlock(&space_info->lock);
3750 		return 0;
3751 	} else if (space_info->chunk_alloc) {
3752 		wait_for_alloc = 1;
3753 	} else {
3754 		space_info->chunk_alloc = 1;
3755 	}
3756 
3757 	spin_unlock(&space_info->lock);
3758 
3759 	mutex_lock(&fs_info->chunk_mutex);
3760 
3761 	/*
3762 	 * The chunk_mutex is held throughout the entirety of a chunk
3763 	 * allocation, so once we've acquired the chunk_mutex we know that the
3764 	 * other guy is done and we need to recheck and see if we should
3765 	 * allocate.
3766 	 */
3767 	if (wait_for_alloc) {
3768 		mutex_unlock(&fs_info->chunk_mutex);
3769 		wait_for_alloc = 0;
3770 		goto again;
3771 	}
3772 
3773 	trans->allocating_chunk = true;
3774 
3775 	/*
3776 	 * If we have mixed data/metadata chunks we want to make sure we keep
3777 	 * allocating mixed chunks instead of individual chunks.
3778 	 */
3779 	if (btrfs_mixed_space_info(space_info))
3780 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3781 
3782 	/*
3783 	 * if we're doing a data chunk, go ahead and make sure that
3784 	 * we keep a reasonable number of metadata chunks allocated in the
3785 	 * FS as well.
3786 	 */
3787 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3788 		fs_info->data_chunk_allocations++;
3789 		if (!(fs_info->data_chunk_allocations %
3790 		      fs_info->metadata_ratio))
3791 			force_metadata_allocation(fs_info);
3792 	}
3793 
3794 	/*
3795 	 * Check if we have enough space in SYSTEM chunk because we may need
3796 	 * to update devices.
3797 	 */
3798 	check_system_chunk(trans, extent_root, flags);
3799 
3800 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3801 	trans->allocating_chunk = false;
3802 
3803 	spin_lock(&space_info->lock);
3804 	if (ret < 0 && ret != -ENOSPC)
3805 		goto out;
3806 	if (ret)
3807 		space_info->full = 1;
3808 	else
3809 		ret = 1;
3810 
3811 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3812 out:
3813 	space_info->chunk_alloc = 0;
3814 	spin_unlock(&space_info->lock);
3815 	mutex_unlock(&fs_info->chunk_mutex);
3816 	return ret;
3817 }
3818 
3819 static int can_overcommit(struct btrfs_root *root,
3820 			  struct btrfs_space_info *space_info, u64 bytes,
3821 			  enum btrfs_reserve_flush_enum flush)
3822 {
3823 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3824 	u64 profile = btrfs_get_alloc_profile(root, 0);
3825 	u64 space_size;
3826 	u64 avail;
3827 	u64 used;
3828 	u64 to_add;
3829 
3830 	used = space_info->bytes_used + space_info->bytes_reserved +
3831 		space_info->bytes_pinned + space_info->bytes_readonly;
3832 
3833 	/*
3834 	 * We only want to allow over committing if we have lots of actual space
3835 	 * free, but if we don't have enough space to handle the global reserve
3836 	 * space then we could end up having a real enospc problem when trying
3837 	 * to allocate a chunk or some other such important allocation.
3838 	 */
3839 	spin_lock(&global_rsv->lock);
3840 	space_size = calc_global_rsv_need_space(global_rsv);
3841 	spin_unlock(&global_rsv->lock);
3842 	if (used + space_size >= space_info->total_bytes)
3843 		return 0;
3844 
3845 	used += space_info->bytes_may_use;
3846 
3847 	spin_lock(&root->fs_info->free_chunk_lock);
3848 	avail = root->fs_info->free_chunk_space;
3849 	spin_unlock(&root->fs_info->free_chunk_lock);
3850 
3851 	/*
3852 	 * If we have dup, raid1 or raid10 then only half of the free
3853 	 * space is actually useable.  For raid56, the space info used
3854 	 * doesn't include the parity drive, so we don't have to
3855 	 * change the math
3856 	 */
3857 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
3858 		       BTRFS_BLOCK_GROUP_RAID1 |
3859 		       BTRFS_BLOCK_GROUP_RAID10))
3860 		avail >>= 1;
3861 
3862 	to_add = space_info->total_bytes;
3863 
3864 	/*
3865 	 * If we aren't flushing all things, let us overcommit up to
3866 	 * 1/2th of the space. If we can flush, don't let us overcommit
3867 	 * too much, let it overcommit up to 1/8 of the space.
3868 	 */
3869 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
3870 		to_add >>= 3;
3871 	else
3872 		to_add >>= 1;
3873 
3874 	/*
3875 	 * Limit the overcommit to the amount of free space we could possibly
3876 	 * allocate for chunks.
3877 	 */
3878 	to_add = min(avail, to_add);
3879 
3880 	if (used + bytes < space_info->total_bytes + to_add)
3881 		return 1;
3882 	return 0;
3883 }
3884 
3885 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3886 					 unsigned long nr_pages)
3887 {
3888 	struct super_block *sb = root->fs_info->sb;
3889 	int started;
3890 
3891 	/* If we can not start writeback, just sync all the delalloc file. */
3892 	started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
3893 						      WB_REASON_FS_FREE_SPACE);
3894 	if (!started) {
3895 		/*
3896 		 * We needn't worry the filesystem going from r/w to r/o though
3897 		 * we don't acquire ->s_umount mutex, because the filesystem
3898 		 * should guarantee the delalloc inodes list be empty after
3899 		 * the filesystem is readonly(all dirty pages are written to
3900 		 * the disk).
3901 		 */
3902 		btrfs_start_delalloc_inodes(root, 0);
3903 		if (!current->journal_info)
3904 			btrfs_wait_ordered_extents(root, 0);
3905 	}
3906 }
3907 
3908 /*
3909  * shrink metadata reservation for delalloc
3910  */
3911 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3912 			    bool wait_ordered)
3913 {
3914 	struct btrfs_block_rsv *block_rsv;
3915 	struct btrfs_space_info *space_info;
3916 	struct btrfs_trans_handle *trans;
3917 	u64 delalloc_bytes;
3918 	u64 max_reclaim;
3919 	long time_left;
3920 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3921 	int loops = 0;
3922 	enum btrfs_reserve_flush_enum flush;
3923 
3924 	trans = (struct btrfs_trans_handle *)current->journal_info;
3925 	block_rsv = &root->fs_info->delalloc_block_rsv;
3926 	space_info = block_rsv->space_info;
3927 
3928 	smp_mb();
3929 	delalloc_bytes = percpu_counter_sum_positive(
3930 						&root->fs_info->delalloc_bytes);
3931 	if (delalloc_bytes == 0) {
3932 		if (trans)
3933 			return;
3934 		btrfs_wait_ordered_extents(root, 0);
3935 		return;
3936 	}
3937 
3938 	while (delalloc_bytes && loops < 3) {
3939 		max_reclaim = min(delalloc_bytes, to_reclaim);
3940 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3941 		btrfs_writeback_inodes_sb_nr(root, nr_pages);
3942 		/*
3943 		 * We need to wait for the async pages to actually start before
3944 		 * we do anything.
3945 		 */
3946 		wait_event(root->fs_info->async_submit_wait,
3947 			   !atomic_read(&root->fs_info->async_delalloc_pages));
3948 
3949 		if (!trans)
3950 			flush = BTRFS_RESERVE_FLUSH_ALL;
3951 		else
3952 			flush = BTRFS_RESERVE_NO_FLUSH;
3953 		spin_lock(&space_info->lock);
3954 		if (can_overcommit(root, space_info, orig, flush)) {
3955 			spin_unlock(&space_info->lock);
3956 			break;
3957 		}
3958 		spin_unlock(&space_info->lock);
3959 
3960 		loops++;
3961 		if (wait_ordered && !trans) {
3962 			btrfs_wait_ordered_extents(root, 0);
3963 		} else {
3964 			time_left = schedule_timeout_killable(1);
3965 			if (time_left)
3966 				break;
3967 		}
3968 		smp_mb();
3969 		delalloc_bytes = percpu_counter_sum_positive(
3970 						&root->fs_info->delalloc_bytes);
3971 	}
3972 }
3973 
3974 /**
3975  * maybe_commit_transaction - possibly commit the transaction if its ok to
3976  * @root - the root we're allocating for
3977  * @bytes - the number of bytes we want to reserve
3978  * @force - force the commit
3979  *
3980  * This will check to make sure that committing the transaction will actually
3981  * get us somewhere and then commit the transaction if it does.  Otherwise it
3982  * will return -ENOSPC.
3983  */
3984 static int may_commit_transaction(struct btrfs_root *root,
3985 				  struct btrfs_space_info *space_info,
3986 				  u64 bytes, int force)
3987 {
3988 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3989 	struct btrfs_trans_handle *trans;
3990 
3991 	trans = (struct btrfs_trans_handle *)current->journal_info;
3992 	if (trans)
3993 		return -EAGAIN;
3994 
3995 	if (force)
3996 		goto commit;
3997 
3998 	/* See if there is enough pinned space to make this reservation */
3999 	spin_lock(&space_info->lock);
4000 	if (space_info->bytes_pinned >= bytes) {
4001 		spin_unlock(&space_info->lock);
4002 		goto commit;
4003 	}
4004 	spin_unlock(&space_info->lock);
4005 
4006 	/*
4007 	 * See if there is some space in the delayed insertion reservation for
4008 	 * this reservation.
4009 	 */
4010 	if (space_info != delayed_rsv->space_info)
4011 		return -ENOSPC;
4012 
4013 	spin_lock(&space_info->lock);
4014 	spin_lock(&delayed_rsv->lock);
4015 	if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
4016 		spin_unlock(&delayed_rsv->lock);
4017 		spin_unlock(&space_info->lock);
4018 		return -ENOSPC;
4019 	}
4020 	spin_unlock(&delayed_rsv->lock);
4021 	spin_unlock(&space_info->lock);
4022 
4023 commit:
4024 	trans = btrfs_join_transaction(root);
4025 	if (IS_ERR(trans))
4026 		return -ENOSPC;
4027 
4028 	return btrfs_commit_transaction(trans, root);
4029 }
4030 
4031 enum flush_state {
4032 	FLUSH_DELAYED_ITEMS_NR	=	1,
4033 	FLUSH_DELAYED_ITEMS	=	2,
4034 	FLUSH_DELALLOC		=	3,
4035 	FLUSH_DELALLOC_WAIT	=	4,
4036 	ALLOC_CHUNK		=	5,
4037 	COMMIT_TRANS		=	6,
4038 };
4039 
4040 static int flush_space(struct btrfs_root *root,
4041 		       struct btrfs_space_info *space_info, u64 num_bytes,
4042 		       u64 orig_bytes, int state)
4043 {
4044 	struct btrfs_trans_handle *trans;
4045 	int nr;
4046 	int ret = 0;
4047 
4048 	switch (state) {
4049 	case FLUSH_DELAYED_ITEMS_NR:
4050 	case FLUSH_DELAYED_ITEMS:
4051 		if (state == FLUSH_DELAYED_ITEMS_NR) {
4052 			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
4053 
4054 			nr = (int)div64_u64(num_bytes, bytes);
4055 			if (!nr)
4056 				nr = 1;
4057 			nr *= 2;
4058 		} else {
4059 			nr = -1;
4060 		}
4061 		trans = btrfs_join_transaction(root);
4062 		if (IS_ERR(trans)) {
4063 			ret = PTR_ERR(trans);
4064 			break;
4065 		}
4066 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
4067 		btrfs_end_transaction(trans, root);
4068 		break;
4069 	case FLUSH_DELALLOC:
4070 	case FLUSH_DELALLOC_WAIT:
4071 		shrink_delalloc(root, num_bytes, orig_bytes,
4072 				state == FLUSH_DELALLOC_WAIT);
4073 		break;
4074 	case ALLOC_CHUNK:
4075 		trans = btrfs_join_transaction(root);
4076 		if (IS_ERR(trans)) {
4077 			ret = PTR_ERR(trans);
4078 			break;
4079 		}
4080 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4081 				     btrfs_get_alloc_profile(root, 0),
4082 				     CHUNK_ALLOC_NO_FORCE);
4083 		btrfs_end_transaction(trans, root);
4084 		if (ret == -ENOSPC)
4085 			ret = 0;
4086 		break;
4087 	case COMMIT_TRANS:
4088 		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4089 		break;
4090 	default:
4091 		ret = -ENOSPC;
4092 		break;
4093 	}
4094 
4095 	return ret;
4096 }
4097 /**
4098  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4099  * @root - the root we're allocating for
4100  * @block_rsv - the block_rsv we're allocating for
4101  * @orig_bytes - the number of bytes we want
4102  * @flush - whether or not we can flush to make our reservation
4103  *
4104  * This will reserve orgi_bytes number of bytes from the space info associated
4105  * with the block_rsv.  If there is not enough space it will make an attempt to
4106  * flush out space to make room.  It will do this by flushing delalloc if
4107  * possible or committing the transaction.  If flush is 0 then no attempts to
4108  * regain reservations will be made and this will fail if there is not enough
4109  * space already.
4110  */
4111 static int reserve_metadata_bytes(struct btrfs_root *root,
4112 				  struct btrfs_block_rsv *block_rsv,
4113 				  u64 orig_bytes,
4114 				  enum btrfs_reserve_flush_enum flush)
4115 {
4116 	struct btrfs_space_info *space_info = block_rsv->space_info;
4117 	u64 used;
4118 	u64 num_bytes = orig_bytes;
4119 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
4120 	int ret = 0;
4121 	bool flushing = false;
4122 
4123 again:
4124 	ret = 0;
4125 	spin_lock(&space_info->lock);
4126 	/*
4127 	 * We only want to wait if somebody other than us is flushing and we
4128 	 * are actually allowed to flush all things.
4129 	 */
4130 	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4131 	       space_info->flush) {
4132 		spin_unlock(&space_info->lock);
4133 		/*
4134 		 * If we have a trans handle we can't wait because the flusher
4135 		 * may have to commit the transaction, which would mean we would
4136 		 * deadlock since we are waiting for the flusher to finish, but
4137 		 * hold the current transaction open.
4138 		 */
4139 		if (current->journal_info)
4140 			return -EAGAIN;
4141 		ret = wait_event_killable(space_info->wait, !space_info->flush);
4142 		/* Must have been killed, return */
4143 		if (ret)
4144 			return -EINTR;
4145 
4146 		spin_lock(&space_info->lock);
4147 	}
4148 
4149 	ret = -ENOSPC;
4150 	used = space_info->bytes_used + space_info->bytes_reserved +
4151 		space_info->bytes_pinned + space_info->bytes_readonly +
4152 		space_info->bytes_may_use;
4153 
4154 	/*
4155 	 * The idea here is that we've not already over-reserved the block group
4156 	 * then we can go ahead and save our reservation first and then start
4157 	 * flushing if we need to.  Otherwise if we've already overcommitted
4158 	 * lets start flushing stuff first and then come back and try to make
4159 	 * our reservation.
4160 	 */
4161 	if (used <= space_info->total_bytes) {
4162 		if (used + orig_bytes <= space_info->total_bytes) {
4163 			space_info->bytes_may_use += orig_bytes;
4164 			trace_btrfs_space_reservation(root->fs_info,
4165 				"space_info", space_info->flags, orig_bytes, 1);
4166 			ret = 0;
4167 		} else {
4168 			/*
4169 			 * Ok set num_bytes to orig_bytes since we aren't
4170 			 * overocmmitted, this way we only try and reclaim what
4171 			 * we need.
4172 			 */
4173 			num_bytes = orig_bytes;
4174 		}
4175 	} else {
4176 		/*
4177 		 * Ok we're over committed, set num_bytes to the overcommitted
4178 		 * amount plus the amount of bytes that we need for this
4179 		 * reservation.
4180 		 */
4181 		num_bytes = used - space_info->total_bytes +
4182 			(orig_bytes * 2);
4183 	}
4184 
4185 	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4186 		space_info->bytes_may_use += orig_bytes;
4187 		trace_btrfs_space_reservation(root->fs_info, "space_info",
4188 					      space_info->flags, orig_bytes,
4189 					      1);
4190 		ret = 0;
4191 	}
4192 
4193 	/*
4194 	 * Couldn't make our reservation, save our place so while we're trying
4195 	 * to reclaim space we can actually use it instead of somebody else
4196 	 * stealing it from us.
4197 	 *
4198 	 * We make the other tasks wait for the flush only when we can flush
4199 	 * all things.
4200 	 */
4201 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4202 		flushing = true;
4203 		space_info->flush = 1;
4204 	}
4205 
4206 	spin_unlock(&space_info->lock);
4207 
4208 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4209 		goto out;
4210 
4211 	ret = flush_space(root, space_info, num_bytes, orig_bytes,
4212 			  flush_state);
4213 	flush_state++;
4214 
4215 	/*
4216 	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4217 	 * would happen. So skip delalloc flush.
4218 	 */
4219 	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4220 	    (flush_state == FLUSH_DELALLOC ||
4221 	     flush_state == FLUSH_DELALLOC_WAIT))
4222 		flush_state = ALLOC_CHUNK;
4223 
4224 	if (!ret)
4225 		goto again;
4226 	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4227 		 flush_state < COMMIT_TRANS)
4228 		goto again;
4229 	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4230 		 flush_state <= COMMIT_TRANS)
4231 		goto again;
4232 
4233 out:
4234 	if (ret == -ENOSPC &&
4235 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4236 		struct btrfs_block_rsv *global_rsv =
4237 			&root->fs_info->global_block_rsv;
4238 
4239 		if (block_rsv != global_rsv &&
4240 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
4241 			ret = 0;
4242 	}
4243 	if (flushing) {
4244 		spin_lock(&space_info->lock);
4245 		space_info->flush = 0;
4246 		wake_up_all(&space_info->wait);
4247 		spin_unlock(&space_info->lock);
4248 	}
4249 	return ret;
4250 }
4251 
4252 static struct btrfs_block_rsv *get_block_rsv(
4253 					const struct btrfs_trans_handle *trans,
4254 					const struct btrfs_root *root)
4255 {
4256 	struct btrfs_block_rsv *block_rsv = NULL;
4257 
4258 	if (root->ref_cows)
4259 		block_rsv = trans->block_rsv;
4260 
4261 	if (root == root->fs_info->csum_root && trans->adding_csums)
4262 		block_rsv = trans->block_rsv;
4263 
4264 	if (!block_rsv)
4265 		block_rsv = root->block_rsv;
4266 
4267 	if (!block_rsv)
4268 		block_rsv = &root->fs_info->empty_block_rsv;
4269 
4270 	return block_rsv;
4271 }
4272 
4273 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4274 			       u64 num_bytes)
4275 {
4276 	int ret = -ENOSPC;
4277 	spin_lock(&block_rsv->lock);
4278 	if (block_rsv->reserved >= num_bytes) {
4279 		block_rsv->reserved -= num_bytes;
4280 		if (block_rsv->reserved < block_rsv->size)
4281 			block_rsv->full = 0;
4282 		ret = 0;
4283 	}
4284 	spin_unlock(&block_rsv->lock);
4285 	return ret;
4286 }
4287 
4288 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4289 				u64 num_bytes, int update_size)
4290 {
4291 	spin_lock(&block_rsv->lock);
4292 	block_rsv->reserved += num_bytes;
4293 	if (update_size)
4294 		block_rsv->size += num_bytes;
4295 	else if (block_rsv->reserved >= block_rsv->size)
4296 		block_rsv->full = 1;
4297 	spin_unlock(&block_rsv->lock);
4298 }
4299 
4300 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4301 				    struct btrfs_block_rsv *block_rsv,
4302 				    struct btrfs_block_rsv *dest, u64 num_bytes)
4303 {
4304 	struct btrfs_space_info *space_info = block_rsv->space_info;
4305 
4306 	spin_lock(&block_rsv->lock);
4307 	if (num_bytes == (u64)-1)
4308 		num_bytes = block_rsv->size;
4309 	block_rsv->size -= num_bytes;
4310 	if (block_rsv->reserved >= block_rsv->size) {
4311 		num_bytes = block_rsv->reserved - block_rsv->size;
4312 		block_rsv->reserved = block_rsv->size;
4313 		block_rsv->full = 1;
4314 	} else {
4315 		num_bytes = 0;
4316 	}
4317 	spin_unlock(&block_rsv->lock);
4318 
4319 	if (num_bytes > 0) {
4320 		if (dest) {
4321 			spin_lock(&dest->lock);
4322 			if (!dest->full) {
4323 				u64 bytes_to_add;
4324 
4325 				bytes_to_add = dest->size - dest->reserved;
4326 				bytes_to_add = min(num_bytes, bytes_to_add);
4327 				dest->reserved += bytes_to_add;
4328 				if (dest->reserved >= dest->size)
4329 					dest->full = 1;
4330 				num_bytes -= bytes_to_add;
4331 			}
4332 			spin_unlock(&dest->lock);
4333 		}
4334 		if (num_bytes) {
4335 			spin_lock(&space_info->lock);
4336 			space_info->bytes_may_use -= num_bytes;
4337 			trace_btrfs_space_reservation(fs_info, "space_info",
4338 					space_info->flags, num_bytes, 0);
4339 			space_info->reservation_progress++;
4340 			spin_unlock(&space_info->lock);
4341 		}
4342 	}
4343 }
4344 
4345 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4346 				   struct btrfs_block_rsv *dst, u64 num_bytes)
4347 {
4348 	int ret;
4349 
4350 	ret = block_rsv_use_bytes(src, num_bytes);
4351 	if (ret)
4352 		return ret;
4353 
4354 	block_rsv_add_bytes(dst, num_bytes, 1);
4355 	return 0;
4356 }
4357 
4358 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4359 {
4360 	memset(rsv, 0, sizeof(*rsv));
4361 	spin_lock_init(&rsv->lock);
4362 	rsv->type = type;
4363 }
4364 
4365 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4366 					      unsigned short type)
4367 {
4368 	struct btrfs_block_rsv *block_rsv;
4369 	struct btrfs_fs_info *fs_info = root->fs_info;
4370 
4371 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4372 	if (!block_rsv)
4373 		return NULL;
4374 
4375 	btrfs_init_block_rsv(block_rsv, type);
4376 	block_rsv->space_info = __find_space_info(fs_info,
4377 						  BTRFS_BLOCK_GROUP_METADATA);
4378 	return block_rsv;
4379 }
4380 
4381 void btrfs_free_block_rsv(struct btrfs_root *root,
4382 			  struct btrfs_block_rsv *rsv)
4383 {
4384 	if (!rsv)
4385 		return;
4386 	btrfs_block_rsv_release(root, rsv, (u64)-1);
4387 	kfree(rsv);
4388 }
4389 
4390 int btrfs_block_rsv_add(struct btrfs_root *root,
4391 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4392 			enum btrfs_reserve_flush_enum flush)
4393 {
4394 	int ret;
4395 
4396 	if (num_bytes == 0)
4397 		return 0;
4398 
4399 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4400 	if (!ret) {
4401 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
4402 		return 0;
4403 	}
4404 
4405 	return ret;
4406 }
4407 
4408 int btrfs_block_rsv_check(struct btrfs_root *root,
4409 			  struct btrfs_block_rsv *block_rsv, int min_factor)
4410 {
4411 	u64 num_bytes = 0;
4412 	int ret = -ENOSPC;
4413 
4414 	if (!block_rsv)
4415 		return 0;
4416 
4417 	spin_lock(&block_rsv->lock);
4418 	num_bytes = div_factor(block_rsv->size, min_factor);
4419 	if (block_rsv->reserved >= num_bytes)
4420 		ret = 0;
4421 	spin_unlock(&block_rsv->lock);
4422 
4423 	return ret;
4424 }
4425 
4426 int btrfs_block_rsv_refill(struct btrfs_root *root,
4427 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4428 			   enum btrfs_reserve_flush_enum flush)
4429 {
4430 	u64 num_bytes = 0;
4431 	int ret = -ENOSPC;
4432 
4433 	if (!block_rsv)
4434 		return 0;
4435 
4436 	spin_lock(&block_rsv->lock);
4437 	num_bytes = min_reserved;
4438 	if (block_rsv->reserved >= num_bytes)
4439 		ret = 0;
4440 	else
4441 		num_bytes -= block_rsv->reserved;
4442 	spin_unlock(&block_rsv->lock);
4443 
4444 	if (!ret)
4445 		return 0;
4446 
4447 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4448 	if (!ret) {
4449 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
4450 		return 0;
4451 	}
4452 
4453 	return ret;
4454 }
4455 
4456 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4457 			    struct btrfs_block_rsv *dst_rsv,
4458 			    u64 num_bytes)
4459 {
4460 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4461 }
4462 
4463 void btrfs_block_rsv_release(struct btrfs_root *root,
4464 			     struct btrfs_block_rsv *block_rsv,
4465 			     u64 num_bytes)
4466 {
4467 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4468 	if (global_rsv->full || global_rsv == block_rsv ||
4469 	    block_rsv->space_info != global_rsv->space_info)
4470 		global_rsv = NULL;
4471 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4472 				num_bytes);
4473 }
4474 
4475 /*
4476  * helper to calculate size of global block reservation.
4477  * the desired value is sum of space used by extent tree,
4478  * checksum tree and root tree
4479  */
4480 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4481 {
4482 	struct btrfs_space_info *sinfo;
4483 	u64 num_bytes;
4484 	u64 meta_used;
4485 	u64 data_used;
4486 	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4487 
4488 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4489 	spin_lock(&sinfo->lock);
4490 	data_used = sinfo->bytes_used;
4491 	spin_unlock(&sinfo->lock);
4492 
4493 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4494 	spin_lock(&sinfo->lock);
4495 	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4496 		data_used = 0;
4497 	meta_used = sinfo->bytes_used;
4498 	spin_unlock(&sinfo->lock);
4499 
4500 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4501 		    csum_size * 2;
4502 	num_bytes += div64_u64(data_used + meta_used, 50);
4503 
4504 	if (num_bytes * 3 > meta_used)
4505 		num_bytes = div64_u64(meta_used, 3);
4506 
4507 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4508 }
4509 
4510 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4511 {
4512 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4513 	struct btrfs_space_info *sinfo = block_rsv->space_info;
4514 	u64 num_bytes;
4515 
4516 	num_bytes = calc_global_metadata_size(fs_info);
4517 
4518 	spin_lock(&sinfo->lock);
4519 	spin_lock(&block_rsv->lock);
4520 
4521 	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
4522 
4523 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4524 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
4525 		    sinfo->bytes_may_use;
4526 
4527 	if (sinfo->total_bytes > num_bytes) {
4528 		num_bytes = sinfo->total_bytes - num_bytes;
4529 		block_rsv->reserved += num_bytes;
4530 		sinfo->bytes_may_use += num_bytes;
4531 		trace_btrfs_space_reservation(fs_info, "space_info",
4532 				      sinfo->flags, num_bytes, 1);
4533 	}
4534 
4535 	if (block_rsv->reserved >= block_rsv->size) {
4536 		num_bytes = block_rsv->reserved - block_rsv->size;
4537 		sinfo->bytes_may_use -= num_bytes;
4538 		trace_btrfs_space_reservation(fs_info, "space_info",
4539 				      sinfo->flags, num_bytes, 0);
4540 		sinfo->reservation_progress++;
4541 		block_rsv->reserved = block_rsv->size;
4542 		block_rsv->full = 1;
4543 	}
4544 
4545 	spin_unlock(&block_rsv->lock);
4546 	spin_unlock(&sinfo->lock);
4547 }
4548 
4549 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4550 {
4551 	struct btrfs_space_info *space_info;
4552 
4553 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4554 	fs_info->chunk_block_rsv.space_info = space_info;
4555 
4556 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4557 	fs_info->global_block_rsv.space_info = space_info;
4558 	fs_info->delalloc_block_rsv.space_info = space_info;
4559 	fs_info->trans_block_rsv.space_info = space_info;
4560 	fs_info->empty_block_rsv.space_info = space_info;
4561 	fs_info->delayed_block_rsv.space_info = space_info;
4562 
4563 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4564 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4565 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4566 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4567 	if (fs_info->quota_root)
4568 		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
4569 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4570 
4571 	update_global_block_rsv(fs_info);
4572 }
4573 
4574 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4575 {
4576 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4577 				(u64)-1);
4578 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4579 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4580 	WARN_ON(fs_info->trans_block_rsv.size > 0);
4581 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4582 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
4583 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4584 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
4585 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4586 }
4587 
4588 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4589 				  struct btrfs_root *root)
4590 {
4591 	if (!trans->block_rsv)
4592 		return;
4593 
4594 	if (!trans->bytes_reserved)
4595 		return;
4596 
4597 	trace_btrfs_space_reservation(root->fs_info, "transaction",
4598 				      trans->transid, trans->bytes_reserved, 0);
4599 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4600 	trans->bytes_reserved = 0;
4601 }
4602 
4603 /* Can only return 0 or -ENOSPC */
4604 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4605 				  struct inode *inode)
4606 {
4607 	struct btrfs_root *root = BTRFS_I(inode)->root;
4608 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4609 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4610 
4611 	/*
4612 	 * We need to hold space in order to delete our orphan item once we've
4613 	 * added it, so this takes the reservation so we can release it later
4614 	 * when we are truly done with the orphan item.
4615 	 */
4616 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4617 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4618 				      btrfs_ino(inode), num_bytes, 1);
4619 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4620 }
4621 
4622 void btrfs_orphan_release_metadata(struct inode *inode)
4623 {
4624 	struct btrfs_root *root = BTRFS_I(inode)->root;
4625 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4626 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4627 				      btrfs_ino(inode), num_bytes, 0);
4628 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4629 }
4630 
4631 /*
4632  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4633  * root: the root of the parent directory
4634  * rsv: block reservation
4635  * items: the number of items that we need do reservation
4636  * qgroup_reserved: used to return the reserved size in qgroup
4637  *
4638  * This function is used to reserve the space for snapshot/subvolume
4639  * creation and deletion. Those operations are different with the
4640  * common file/directory operations, they change two fs/file trees
4641  * and root tree, the number of items that the qgroup reserves is
4642  * different with the free space reservation. So we can not use
4643  * the space reseravtion mechanism in start_transaction().
4644  */
4645 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4646 				     struct btrfs_block_rsv *rsv,
4647 				     int items,
4648 				     u64 *qgroup_reserved)
4649 {
4650 	u64 num_bytes;
4651 	int ret;
4652 
4653 	if (root->fs_info->quota_enabled) {
4654 		/* One for parent inode, two for dir entries */
4655 		num_bytes = 3 * root->leafsize;
4656 		ret = btrfs_qgroup_reserve(root, num_bytes);
4657 		if (ret)
4658 			return ret;
4659 	} else {
4660 		num_bytes = 0;
4661 	}
4662 
4663 	*qgroup_reserved = num_bytes;
4664 
4665 	num_bytes = btrfs_calc_trans_metadata_size(root, items);
4666 	rsv->space_info = __find_space_info(root->fs_info,
4667 					    BTRFS_BLOCK_GROUP_METADATA);
4668 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4669 				  BTRFS_RESERVE_FLUSH_ALL);
4670 	if (ret) {
4671 		if (*qgroup_reserved)
4672 			btrfs_qgroup_free(root, *qgroup_reserved);
4673 	}
4674 
4675 	return ret;
4676 }
4677 
4678 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4679 				      struct btrfs_block_rsv *rsv,
4680 				      u64 qgroup_reserved)
4681 {
4682 	btrfs_block_rsv_release(root, rsv, (u64)-1);
4683 	if (qgroup_reserved)
4684 		btrfs_qgroup_free(root, qgroup_reserved);
4685 }
4686 
4687 /**
4688  * drop_outstanding_extent - drop an outstanding extent
4689  * @inode: the inode we're dropping the extent for
4690  *
4691  * This is called when we are freeing up an outstanding extent, either called
4692  * after an error or after an extent is written.  This will return the number of
4693  * reserved extents that need to be freed.  This must be called with
4694  * BTRFS_I(inode)->lock held.
4695  */
4696 static unsigned drop_outstanding_extent(struct inode *inode)
4697 {
4698 	unsigned drop_inode_space = 0;
4699 	unsigned dropped_extents = 0;
4700 
4701 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4702 	BTRFS_I(inode)->outstanding_extents--;
4703 
4704 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
4705 	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4706 			       &BTRFS_I(inode)->runtime_flags))
4707 		drop_inode_space = 1;
4708 
4709 	/*
4710 	 * If we have more or the same amount of outsanding extents than we have
4711 	 * reserved then we need to leave the reserved extents count alone.
4712 	 */
4713 	if (BTRFS_I(inode)->outstanding_extents >=
4714 	    BTRFS_I(inode)->reserved_extents)
4715 		return drop_inode_space;
4716 
4717 	dropped_extents = BTRFS_I(inode)->reserved_extents -
4718 		BTRFS_I(inode)->outstanding_extents;
4719 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
4720 	return dropped_extents + drop_inode_space;
4721 }
4722 
4723 /**
4724  * calc_csum_metadata_size - return the amount of metada space that must be
4725  *	reserved/free'd for the given bytes.
4726  * @inode: the inode we're manipulating
4727  * @num_bytes: the number of bytes in question
4728  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4729  *
4730  * This adjusts the number of csum_bytes in the inode and then returns the
4731  * correct amount of metadata that must either be reserved or freed.  We
4732  * calculate how many checksums we can fit into one leaf and then divide the
4733  * number of bytes that will need to be checksumed by this value to figure out
4734  * how many checksums will be required.  If we are adding bytes then the number
4735  * may go up and we will return the number of additional bytes that must be
4736  * reserved.  If it is going down we will return the number of bytes that must
4737  * be freed.
4738  *
4739  * This must be called with BTRFS_I(inode)->lock held.
4740  */
4741 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4742 				   int reserve)
4743 {
4744 	struct btrfs_root *root = BTRFS_I(inode)->root;
4745 	u64 csum_size;
4746 	int num_csums_per_leaf;
4747 	int num_csums;
4748 	int old_csums;
4749 
4750 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4751 	    BTRFS_I(inode)->csum_bytes == 0)
4752 		return 0;
4753 
4754 	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4755 	if (reserve)
4756 		BTRFS_I(inode)->csum_bytes += num_bytes;
4757 	else
4758 		BTRFS_I(inode)->csum_bytes -= num_bytes;
4759 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4760 	num_csums_per_leaf = (int)div64_u64(csum_size,
4761 					    sizeof(struct btrfs_csum_item) +
4762 					    sizeof(struct btrfs_disk_key));
4763 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4764 	num_csums = num_csums + num_csums_per_leaf - 1;
4765 	num_csums = num_csums / num_csums_per_leaf;
4766 
4767 	old_csums = old_csums + num_csums_per_leaf - 1;
4768 	old_csums = old_csums / num_csums_per_leaf;
4769 
4770 	/* No change, no need to reserve more */
4771 	if (old_csums == num_csums)
4772 		return 0;
4773 
4774 	if (reserve)
4775 		return btrfs_calc_trans_metadata_size(root,
4776 						      num_csums - old_csums);
4777 
4778 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4779 }
4780 
4781 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4782 {
4783 	struct btrfs_root *root = BTRFS_I(inode)->root;
4784 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4785 	u64 to_reserve = 0;
4786 	u64 csum_bytes;
4787 	unsigned nr_extents = 0;
4788 	int extra_reserve = 0;
4789 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4790 	int ret = 0;
4791 	bool delalloc_lock = true;
4792 	u64 to_free = 0;
4793 	unsigned dropped;
4794 
4795 	/* If we are a free space inode we need to not flush since we will be in
4796 	 * the middle of a transaction commit.  We also don't need the delalloc
4797 	 * mutex since we won't race with anybody.  We need this mostly to make
4798 	 * lockdep shut its filthy mouth.
4799 	 */
4800 	if (btrfs_is_free_space_inode(inode)) {
4801 		flush = BTRFS_RESERVE_NO_FLUSH;
4802 		delalloc_lock = false;
4803 	}
4804 
4805 	if (flush != BTRFS_RESERVE_NO_FLUSH &&
4806 	    btrfs_transaction_in_commit(root->fs_info))
4807 		schedule_timeout(1);
4808 
4809 	if (delalloc_lock)
4810 		mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4811 
4812 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4813 
4814 	spin_lock(&BTRFS_I(inode)->lock);
4815 	BTRFS_I(inode)->outstanding_extents++;
4816 
4817 	if (BTRFS_I(inode)->outstanding_extents >
4818 	    BTRFS_I(inode)->reserved_extents)
4819 		nr_extents = BTRFS_I(inode)->outstanding_extents -
4820 			BTRFS_I(inode)->reserved_extents;
4821 
4822 	/*
4823 	 * Add an item to reserve for updating the inode when we complete the
4824 	 * delalloc io.
4825 	 */
4826 	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4827 		      &BTRFS_I(inode)->runtime_flags)) {
4828 		nr_extents++;
4829 		extra_reserve = 1;
4830 	}
4831 
4832 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4833 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4834 	csum_bytes = BTRFS_I(inode)->csum_bytes;
4835 	spin_unlock(&BTRFS_I(inode)->lock);
4836 
4837 	if (root->fs_info->quota_enabled) {
4838 		ret = btrfs_qgroup_reserve(root, num_bytes +
4839 					   nr_extents * root->leafsize);
4840 		if (ret)
4841 			goto out_fail;
4842 	}
4843 
4844 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4845 	if (unlikely(ret)) {
4846 		if (root->fs_info->quota_enabled)
4847 			btrfs_qgroup_free(root, num_bytes +
4848 						nr_extents * root->leafsize);
4849 		goto out_fail;
4850 	}
4851 
4852 	spin_lock(&BTRFS_I(inode)->lock);
4853 	if (extra_reserve) {
4854 		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4855 			&BTRFS_I(inode)->runtime_flags);
4856 		nr_extents--;
4857 	}
4858 	BTRFS_I(inode)->reserved_extents += nr_extents;
4859 	spin_unlock(&BTRFS_I(inode)->lock);
4860 
4861 	if (delalloc_lock)
4862 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4863 
4864 	if (to_reserve)
4865 		trace_btrfs_space_reservation(root->fs_info,"delalloc",
4866 					      btrfs_ino(inode), to_reserve, 1);
4867 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
4868 
4869 	return 0;
4870 
4871 out_fail:
4872 	spin_lock(&BTRFS_I(inode)->lock);
4873 	dropped = drop_outstanding_extent(inode);
4874 	/*
4875 	 * If the inodes csum_bytes is the same as the original
4876 	 * csum_bytes then we know we haven't raced with any free()ers
4877 	 * so we can just reduce our inodes csum bytes and carry on.
4878 	 */
4879 	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
4880 		calc_csum_metadata_size(inode, num_bytes, 0);
4881 	} else {
4882 		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
4883 		u64 bytes;
4884 
4885 		/*
4886 		 * This is tricky, but first we need to figure out how much we
4887 		 * free'd from any free-ers that occured during this
4888 		 * reservation, so we reset ->csum_bytes to the csum_bytes
4889 		 * before we dropped our lock, and then call the free for the
4890 		 * number of bytes that were freed while we were trying our
4891 		 * reservation.
4892 		 */
4893 		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
4894 		BTRFS_I(inode)->csum_bytes = csum_bytes;
4895 		to_free = calc_csum_metadata_size(inode, bytes, 0);
4896 
4897 
4898 		/*
4899 		 * Now we need to see how much we would have freed had we not
4900 		 * been making this reservation and our ->csum_bytes were not
4901 		 * artificially inflated.
4902 		 */
4903 		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
4904 		bytes = csum_bytes - orig_csum_bytes;
4905 		bytes = calc_csum_metadata_size(inode, bytes, 0);
4906 
4907 		/*
4908 		 * Now reset ->csum_bytes to what it should be.  If bytes is
4909 		 * more than to_free then we would have free'd more space had we
4910 		 * not had an artificially high ->csum_bytes, so we need to free
4911 		 * the remainder.  If bytes is the same or less then we don't
4912 		 * need to do anything, the other free-ers did the correct
4913 		 * thing.
4914 		 */
4915 		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
4916 		if (bytes > to_free)
4917 			to_free = bytes - to_free;
4918 		else
4919 			to_free = 0;
4920 	}
4921 	spin_unlock(&BTRFS_I(inode)->lock);
4922 	if (dropped)
4923 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4924 
4925 	if (to_free) {
4926 		btrfs_block_rsv_release(root, block_rsv, to_free);
4927 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
4928 					      btrfs_ino(inode), to_free, 0);
4929 	}
4930 	if (delalloc_lock)
4931 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4932 	return ret;
4933 }
4934 
4935 /**
4936  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4937  * @inode: the inode to release the reservation for
4938  * @num_bytes: the number of bytes we're releasing
4939  *
4940  * This will release the metadata reservation for an inode.  This can be called
4941  * once we complete IO for a given set of bytes to release their metadata
4942  * reservations.
4943  */
4944 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4945 {
4946 	struct btrfs_root *root = BTRFS_I(inode)->root;
4947 	u64 to_free = 0;
4948 	unsigned dropped;
4949 
4950 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4951 	spin_lock(&BTRFS_I(inode)->lock);
4952 	dropped = drop_outstanding_extent(inode);
4953 
4954 	if (num_bytes)
4955 		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4956 	spin_unlock(&BTRFS_I(inode)->lock);
4957 	if (dropped > 0)
4958 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
4959 
4960 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
4961 				      btrfs_ino(inode), to_free, 0);
4962 	if (root->fs_info->quota_enabled) {
4963 		btrfs_qgroup_free(root, num_bytes +
4964 					dropped * root->leafsize);
4965 	}
4966 
4967 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4968 				to_free);
4969 }
4970 
4971 /**
4972  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4973  * @inode: inode we're writing to
4974  * @num_bytes: the number of bytes we want to allocate
4975  *
4976  * This will do the following things
4977  *
4978  * o reserve space in the data space info for num_bytes
4979  * o reserve space in the metadata space info based on number of outstanding
4980  *   extents and how much csums will be needed
4981  * o add to the inodes ->delalloc_bytes
4982  * o add it to the fs_info's delalloc inodes list.
4983  *
4984  * This will return 0 for success and -ENOSPC if there is no space left.
4985  */
4986 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4987 {
4988 	int ret;
4989 
4990 	ret = btrfs_check_data_free_space(inode, num_bytes);
4991 	if (ret)
4992 		return ret;
4993 
4994 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4995 	if (ret) {
4996 		btrfs_free_reserved_data_space(inode, num_bytes);
4997 		return ret;
4998 	}
4999 
5000 	return 0;
5001 }
5002 
5003 /**
5004  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5005  * @inode: inode we're releasing space for
5006  * @num_bytes: the number of bytes we want to free up
5007  *
5008  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5009  * called in the case that we don't need the metadata AND data reservations
5010  * anymore.  So if there is an error or we insert an inline extent.
5011  *
5012  * This function will release the metadata space that was not used and will
5013  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5014  * list if there are no delalloc bytes left.
5015  */
5016 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5017 {
5018 	btrfs_delalloc_release_metadata(inode, num_bytes);
5019 	btrfs_free_reserved_data_space(inode, num_bytes);
5020 }
5021 
5022 static int update_block_group(struct btrfs_root *root,
5023 			      u64 bytenr, u64 num_bytes, int alloc)
5024 {
5025 	struct btrfs_block_group_cache *cache = NULL;
5026 	struct btrfs_fs_info *info = root->fs_info;
5027 	u64 total = num_bytes;
5028 	u64 old_val;
5029 	u64 byte_in_group;
5030 	int factor;
5031 
5032 	/* block accounting for super block */
5033 	spin_lock(&info->delalloc_lock);
5034 	old_val = btrfs_super_bytes_used(info->super_copy);
5035 	if (alloc)
5036 		old_val += num_bytes;
5037 	else
5038 		old_val -= num_bytes;
5039 	btrfs_set_super_bytes_used(info->super_copy, old_val);
5040 	spin_unlock(&info->delalloc_lock);
5041 
5042 	while (total) {
5043 		cache = btrfs_lookup_block_group(info, bytenr);
5044 		if (!cache)
5045 			return -ENOENT;
5046 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5047 				    BTRFS_BLOCK_GROUP_RAID1 |
5048 				    BTRFS_BLOCK_GROUP_RAID10))
5049 			factor = 2;
5050 		else
5051 			factor = 1;
5052 		/*
5053 		 * If this block group has free space cache written out, we
5054 		 * need to make sure to load it if we are removing space.  This
5055 		 * is because we need the unpinning stage to actually add the
5056 		 * space back to the block group, otherwise we will leak space.
5057 		 */
5058 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
5059 			cache_block_group(cache, 1);
5060 
5061 		byte_in_group = bytenr - cache->key.objectid;
5062 		WARN_ON(byte_in_group > cache->key.offset);
5063 
5064 		spin_lock(&cache->space_info->lock);
5065 		spin_lock(&cache->lock);
5066 
5067 		if (btrfs_test_opt(root, SPACE_CACHE) &&
5068 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
5069 			cache->disk_cache_state = BTRFS_DC_CLEAR;
5070 
5071 		cache->dirty = 1;
5072 		old_val = btrfs_block_group_used(&cache->item);
5073 		num_bytes = min(total, cache->key.offset - byte_in_group);
5074 		if (alloc) {
5075 			old_val += num_bytes;
5076 			btrfs_set_block_group_used(&cache->item, old_val);
5077 			cache->reserved -= num_bytes;
5078 			cache->space_info->bytes_reserved -= num_bytes;
5079 			cache->space_info->bytes_used += num_bytes;
5080 			cache->space_info->disk_used += num_bytes * factor;
5081 			spin_unlock(&cache->lock);
5082 			spin_unlock(&cache->space_info->lock);
5083 		} else {
5084 			old_val -= num_bytes;
5085 			btrfs_set_block_group_used(&cache->item, old_val);
5086 			cache->pinned += num_bytes;
5087 			cache->space_info->bytes_pinned += num_bytes;
5088 			cache->space_info->bytes_used -= num_bytes;
5089 			cache->space_info->disk_used -= num_bytes * factor;
5090 			spin_unlock(&cache->lock);
5091 			spin_unlock(&cache->space_info->lock);
5092 
5093 			set_extent_dirty(info->pinned_extents,
5094 					 bytenr, bytenr + num_bytes - 1,
5095 					 GFP_NOFS | __GFP_NOFAIL);
5096 		}
5097 		btrfs_put_block_group(cache);
5098 		total -= num_bytes;
5099 		bytenr += num_bytes;
5100 	}
5101 	return 0;
5102 }
5103 
5104 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5105 {
5106 	struct btrfs_block_group_cache *cache;
5107 	u64 bytenr;
5108 
5109 	spin_lock(&root->fs_info->block_group_cache_lock);
5110 	bytenr = root->fs_info->first_logical_byte;
5111 	spin_unlock(&root->fs_info->block_group_cache_lock);
5112 
5113 	if (bytenr < (u64)-1)
5114 		return bytenr;
5115 
5116 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5117 	if (!cache)
5118 		return 0;
5119 
5120 	bytenr = cache->key.objectid;
5121 	btrfs_put_block_group(cache);
5122 
5123 	return bytenr;
5124 }
5125 
5126 static int pin_down_extent(struct btrfs_root *root,
5127 			   struct btrfs_block_group_cache *cache,
5128 			   u64 bytenr, u64 num_bytes, int reserved)
5129 {
5130 	spin_lock(&cache->space_info->lock);
5131 	spin_lock(&cache->lock);
5132 	cache->pinned += num_bytes;
5133 	cache->space_info->bytes_pinned += num_bytes;
5134 	if (reserved) {
5135 		cache->reserved -= num_bytes;
5136 		cache->space_info->bytes_reserved -= num_bytes;
5137 	}
5138 	spin_unlock(&cache->lock);
5139 	spin_unlock(&cache->space_info->lock);
5140 
5141 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5142 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5143 	return 0;
5144 }
5145 
5146 /*
5147  * this function must be called within transaction
5148  */
5149 int btrfs_pin_extent(struct btrfs_root *root,
5150 		     u64 bytenr, u64 num_bytes, int reserved)
5151 {
5152 	struct btrfs_block_group_cache *cache;
5153 
5154 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5155 	BUG_ON(!cache); /* Logic error */
5156 
5157 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5158 
5159 	btrfs_put_block_group(cache);
5160 	return 0;
5161 }
5162 
5163 /*
5164  * this function must be called within transaction
5165  */
5166 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5167 				    u64 bytenr, u64 num_bytes)
5168 {
5169 	struct btrfs_block_group_cache *cache;
5170 	int ret;
5171 
5172 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5173 	if (!cache)
5174 		return -EINVAL;
5175 
5176 	/*
5177 	 * pull in the free space cache (if any) so that our pin
5178 	 * removes the free space from the cache.  We have load_only set
5179 	 * to one because the slow code to read in the free extents does check
5180 	 * the pinned extents.
5181 	 */
5182 	cache_block_group(cache, 1);
5183 
5184 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
5185 
5186 	/* remove us from the free space cache (if we're there at all) */
5187 	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5188 	btrfs_put_block_group(cache);
5189 	return ret;
5190 }
5191 
5192 /**
5193  * btrfs_update_reserved_bytes - update the block_group and space info counters
5194  * @cache:	The cache we are manipulating
5195  * @num_bytes:	The number of bytes in question
5196  * @reserve:	One of the reservation enums
5197  *
5198  * This is called by the allocator when it reserves space, or by somebody who is
5199  * freeing space that was never actually used on disk.  For example if you
5200  * reserve some space for a new leaf in transaction A and before transaction A
5201  * commits you free that leaf, you call this with reserve set to 0 in order to
5202  * clear the reservation.
5203  *
5204  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
5205  * ENOSPC accounting.  For data we handle the reservation through clearing the
5206  * delalloc bits in the io_tree.  We have to do this since we could end up
5207  * allocating less disk space for the amount of data we have reserved in the
5208  * case of compression.
5209  *
5210  * If this is a reservation and the block group has become read only we cannot
5211  * make the reservation and return -EAGAIN, otherwise this function always
5212  * succeeds.
5213  */
5214 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5215 				       u64 num_bytes, int reserve)
5216 {
5217 	struct btrfs_space_info *space_info = cache->space_info;
5218 	int ret = 0;
5219 
5220 	spin_lock(&space_info->lock);
5221 	spin_lock(&cache->lock);
5222 	if (reserve != RESERVE_FREE) {
5223 		if (cache->ro) {
5224 			ret = -EAGAIN;
5225 		} else {
5226 			cache->reserved += num_bytes;
5227 			space_info->bytes_reserved += num_bytes;
5228 			if (reserve == RESERVE_ALLOC) {
5229 				trace_btrfs_space_reservation(cache->fs_info,
5230 						"space_info", space_info->flags,
5231 						num_bytes, 0);
5232 				space_info->bytes_may_use -= num_bytes;
5233 			}
5234 		}
5235 	} else {
5236 		if (cache->ro)
5237 			space_info->bytes_readonly += num_bytes;
5238 		cache->reserved -= num_bytes;
5239 		space_info->bytes_reserved -= num_bytes;
5240 		space_info->reservation_progress++;
5241 	}
5242 	spin_unlock(&cache->lock);
5243 	spin_unlock(&space_info->lock);
5244 	return ret;
5245 }
5246 
5247 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5248 				struct btrfs_root *root)
5249 {
5250 	struct btrfs_fs_info *fs_info = root->fs_info;
5251 	struct btrfs_caching_control *next;
5252 	struct btrfs_caching_control *caching_ctl;
5253 	struct btrfs_block_group_cache *cache;
5254 
5255 	down_write(&fs_info->extent_commit_sem);
5256 
5257 	list_for_each_entry_safe(caching_ctl, next,
5258 				 &fs_info->caching_block_groups, list) {
5259 		cache = caching_ctl->block_group;
5260 		if (block_group_cache_done(cache)) {
5261 			cache->last_byte_to_unpin = (u64)-1;
5262 			list_del_init(&caching_ctl->list);
5263 			put_caching_control(caching_ctl);
5264 		} else {
5265 			cache->last_byte_to_unpin = caching_ctl->progress;
5266 		}
5267 	}
5268 
5269 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5270 		fs_info->pinned_extents = &fs_info->freed_extents[1];
5271 	else
5272 		fs_info->pinned_extents = &fs_info->freed_extents[0];
5273 
5274 	up_write(&fs_info->extent_commit_sem);
5275 
5276 	update_global_block_rsv(fs_info);
5277 }
5278 
5279 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5280 {
5281 	struct btrfs_fs_info *fs_info = root->fs_info;
5282 	struct btrfs_block_group_cache *cache = NULL;
5283 	struct btrfs_space_info *space_info;
5284 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5285 	u64 len;
5286 	bool readonly;
5287 
5288 	while (start <= end) {
5289 		readonly = false;
5290 		if (!cache ||
5291 		    start >= cache->key.objectid + cache->key.offset) {
5292 			if (cache)
5293 				btrfs_put_block_group(cache);
5294 			cache = btrfs_lookup_block_group(fs_info, start);
5295 			BUG_ON(!cache); /* Logic error */
5296 		}
5297 
5298 		len = cache->key.objectid + cache->key.offset - start;
5299 		len = min(len, end + 1 - start);
5300 
5301 		if (start < cache->last_byte_to_unpin) {
5302 			len = min(len, cache->last_byte_to_unpin - start);
5303 			btrfs_add_free_space(cache, start, len);
5304 		}
5305 
5306 		start += len;
5307 		space_info = cache->space_info;
5308 
5309 		spin_lock(&space_info->lock);
5310 		spin_lock(&cache->lock);
5311 		cache->pinned -= len;
5312 		space_info->bytes_pinned -= len;
5313 		if (cache->ro) {
5314 			space_info->bytes_readonly += len;
5315 			readonly = true;
5316 		}
5317 		spin_unlock(&cache->lock);
5318 		if (!readonly && global_rsv->space_info == space_info) {
5319 			spin_lock(&global_rsv->lock);
5320 			if (!global_rsv->full) {
5321 				len = min(len, global_rsv->size -
5322 					  global_rsv->reserved);
5323 				global_rsv->reserved += len;
5324 				space_info->bytes_may_use += len;
5325 				if (global_rsv->reserved >= global_rsv->size)
5326 					global_rsv->full = 1;
5327 			}
5328 			spin_unlock(&global_rsv->lock);
5329 		}
5330 		spin_unlock(&space_info->lock);
5331 	}
5332 
5333 	if (cache)
5334 		btrfs_put_block_group(cache);
5335 	return 0;
5336 }
5337 
5338 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5339 			       struct btrfs_root *root)
5340 {
5341 	struct btrfs_fs_info *fs_info = root->fs_info;
5342 	struct extent_io_tree *unpin;
5343 	u64 start;
5344 	u64 end;
5345 	int ret;
5346 
5347 	if (trans->aborted)
5348 		return 0;
5349 
5350 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5351 		unpin = &fs_info->freed_extents[1];
5352 	else
5353 		unpin = &fs_info->freed_extents[0];
5354 
5355 	while (1) {
5356 		ret = find_first_extent_bit(unpin, 0, &start, &end,
5357 					    EXTENT_DIRTY, NULL);
5358 		if (ret)
5359 			break;
5360 
5361 		if (btrfs_test_opt(root, DISCARD))
5362 			ret = btrfs_discard_extent(root, start,
5363 						   end + 1 - start, NULL);
5364 
5365 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
5366 		unpin_extent_range(root, start, end);
5367 		cond_resched();
5368 	}
5369 
5370 	return 0;
5371 }
5372 
5373 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5374 				struct btrfs_root *root,
5375 				u64 bytenr, u64 num_bytes, u64 parent,
5376 				u64 root_objectid, u64 owner_objectid,
5377 				u64 owner_offset, int refs_to_drop,
5378 				struct btrfs_delayed_extent_op *extent_op)
5379 {
5380 	struct btrfs_key key;
5381 	struct btrfs_path *path;
5382 	struct btrfs_fs_info *info = root->fs_info;
5383 	struct btrfs_root *extent_root = info->extent_root;
5384 	struct extent_buffer *leaf;
5385 	struct btrfs_extent_item *ei;
5386 	struct btrfs_extent_inline_ref *iref;
5387 	int ret;
5388 	int is_data;
5389 	int extent_slot = 0;
5390 	int found_extent = 0;
5391 	int num_to_del = 1;
5392 	u32 item_size;
5393 	u64 refs;
5394 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5395 						 SKINNY_METADATA);
5396 
5397 	path = btrfs_alloc_path();
5398 	if (!path)
5399 		return -ENOMEM;
5400 
5401 	path->reada = 1;
5402 	path->leave_spinning = 1;
5403 
5404 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5405 	BUG_ON(!is_data && refs_to_drop != 1);
5406 
5407 	if (is_data)
5408 		skinny_metadata = 0;
5409 
5410 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
5411 				    bytenr, num_bytes, parent,
5412 				    root_objectid, owner_objectid,
5413 				    owner_offset);
5414 	if (ret == 0) {
5415 		extent_slot = path->slots[0];
5416 		while (extent_slot >= 0) {
5417 			btrfs_item_key_to_cpu(path->nodes[0], &key,
5418 					      extent_slot);
5419 			if (key.objectid != bytenr)
5420 				break;
5421 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5422 			    key.offset == num_bytes) {
5423 				found_extent = 1;
5424 				break;
5425 			}
5426 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
5427 			    key.offset == owner_objectid) {
5428 				found_extent = 1;
5429 				break;
5430 			}
5431 			if (path->slots[0] - extent_slot > 5)
5432 				break;
5433 			extent_slot--;
5434 		}
5435 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5436 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5437 		if (found_extent && item_size < sizeof(*ei))
5438 			found_extent = 0;
5439 #endif
5440 		if (!found_extent) {
5441 			BUG_ON(iref);
5442 			ret = remove_extent_backref(trans, extent_root, path,
5443 						    NULL, refs_to_drop,
5444 						    is_data);
5445 			if (ret) {
5446 				btrfs_abort_transaction(trans, extent_root, ret);
5447 				goto out;
5448 			}
5449 			btrfs_release_path(path);
5450 			path->leave_spinning = 1;
5451 
5452 			key.objectid = bytenr;
5453 			key.type = BTRFS_EXTENT_ITEM_KEY;
5454 			key.offset = num_bytes;
5455 
5456 			if (!is_data && skinny_metadata) {
5457 				key.type = BTRFS_METADATA_ITEM_KEY;
5458 				key.offset = owner_objectid;
5459 			}
5460 
5461 			ret = btrfs_search_slot(trans, extent_root,
5462 						&key, path, -1, 1);
5463 			if (ret > 0 && skinny_metadata && path->slots[0]) {
5464 				/*
5465 				 * Couldn't find our skinny metadata item,
5466 				 * see if we have ye olde extent item.
5467 				 */
5468 				path->slots[0]--;
5469 				btrfs_item_key_to_cpu(path->nodes[0], &key,
5470 						      path->slots[0]);
5471 				if (key.objectid == bytenr &&
5472 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
5473 				    key.offset == num_bytes)
5474 					ret = 0;
5475 			}
5476 
5477 			if (ret > 0 && skinny_metadata) {
5478 				skinny_metadata = false;
5479 				key.type = BTRFS_EXTENT_ITEM_KEY;
5480 				key.offset = num_bytes;
5481 				btrfs_release_path(path);
5482 				ret = btrfs_search_slot(trans, extent_root,
5483 							&key, path, -1, 1);
5484 			}
5485 
5486 			if (ret) {
5487 				btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5488 					ret, (unsigned long long)bytenr);
5489 				if (ret > 0)
5490 					btrfs_print_leaf(extent_root,
5491 							 path->nodes[0]);
5492 			}
5493 			if (ret < 0) {
5494 				btrfs_abort_transaction(trans, extent_root, ret);
5495 				goto out;
5496 			}
5497 			extent_slot = path->slots[0];
5498 		}
5499 	} else if (ret == -ENOENT) {
5500 		btrfs_print_leaf(extent_root, path->nodes[0]);
5501 		WARN_ON(1);
5502 		btrfs_err(info,
5503 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
5504 			(unsigned long long)bytenr,
5505 			(unsigned long long)parent,
5506 			(unsigned long long)root_objectid,
5507 			(unsigned long long)owner_objectid,
5508 			(unsigned long long)owner_offset);
5509 	} else {
5510 		btrfs_abort_transaction(trans, extent_root, ret);
5511 		goto out;
5512 	}
5513 
5514 	leaf = path->nodes[0];
5515 	item_size = btrfs_item_size_nr(leaf, extent_slot);
5516 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5517 	if (item_size < sizeof(*ei)) {
5518 		BUG_ON(found_extent || extent_slot != path->slots[0]);
5519 		ret = convert_extent_item_v0(trans, extent_root, path,
5520 					     owner_objectid, 0);
5521 		if (ret < 0) {
5522 			btrfs_abort_transaction(trans, extent_root, ret);
5523 			goto out;
5524 		}
5525 
5526 		btrfs_release_path(path);
5527 		path->leave_spinning = 1;
5528 
5529 		key.objectid = bytenr;
5530 		key.type = BTRFS_EXTENT_ITEM_KEY;
5531 		key.offset = num_bytes;
5532 
5533 		ret = btrfs_search_slot(trans, extent_root, &key, path,
5534 					-1, 1);
5535 		if (ret) {
5536 			btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5537 				ret, (unsigned long long)bytenr);
5538 			btrfs_print_leaf(extent_root, path->nodes[0]);
5539 		}
5540 		if (ret < 0) {
5541 			btrfs_abort_transaction(trans, extent_root, ret);
5542 			goto out;
5543 		}
5544 
5545 		extent_slot = path->slots[0];
5546 		leaf = path->nodes[0];
5547 		item_size = btrfs_item_size_nr(leaf, extent_slot);
5548 	}
5549 #endif
5550 	BUG_ON(item_size < sizeof(*ei));
5551 	ei = btrfs_item_ptr(leaf, extent_slot,
5552 			    struct btrfs_extent_item);
5553 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
5554 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
5555 		struct btrfs_tree_block_info *bi;
5556 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5557 		bi = (struct btrfs_tree_block_info *)(ei + 1);
5558 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5559 	}
5560 
5561 	refs = btrfs_extent_refs(leaf, ei);
5562 	if (refs < refs_to_drop) {
5563 		btrfs_err(info, "trying to drop %d refs but we only have %Lu "
5564 			  "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
5565 		ret = -EINVAL;
5566 		btrfs_abort_transaction(trans, extent_root, ret);
5567 		goto out;
5568 	}
5569 	refs -= refs_to_drop;
5570 
5571 	if (refs > 0) {
5572 		if (extent_op)
5573 			__run_delayed_extent_op(extent_op, leaf, ei);
5574 		/*
5575 		 * In the case of inline back ref, reference count will
5576 		 * be updated by remove_extent_backref
5577 		 */
5578 		if (iref) {
5579 			BUG_ON(!found_extent);
5580 		} else {
5581 			btrfs_set_extent_refs(leaf, ei, refs);
5582 			btrfs_mark_buffer_dirty(leaf);
5583 		}
5584 		if (found_extent) {
5585 			ret = remove_extent_backref(trans, extent_root, path,
5586 						    iref, refs_to_drop,
5587 						    is_data);
5588 			if (ret) {
5589 				btrfs_abort_transaction(trans, extent_root, ret);
5590 				goto out;
5591 			}
5592 		}
5593 	} else {
5594 		if (found_extent) {
5595 			BUG_ON(is_data && refs_to_drop !=
5596 			       extent_data_ref_count(root, path, iref));
5597 			if (iref) {
5598 				BUG_ON(path->slots[0] != extent_slot);
5599 			} else {
5600 				BUG_ON(path->slots[0] != extent_slot + 1);
5601 				path->slots[0] = extent_slot;
5602 				num_to_del = 2;
5603 			}
5604 		}
5605 
5606 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5607 				      num_to_del);
5608 		if (ret) {
5609 			btrfs_abort_transaction(trans, extent_root, ret);
5610 			goto out;
5611 		}
5612 		btrfs_release_path(path);
5613 
5614 		if (is_data) {
5615 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5616 			if (ret) {
5617 				btrfs_abort_transaction(trans, extent_root, ret);
5618 				goto out;
5619 			}
5620 		}
5621 
5622 		ret = update_block_group(root, bytenr, num_bytes, 0);
5623 		if (ret) {
5624 			btrfs_abort_transaction(trans, extent_root, ret);
5625 			goto out;
5626 		}
5627 	}
5628 out:
5629 	btrfs_free_path(path);
5630 	return ret;
5631 }
5632 
5633 /*
5634  * when we free an block, it is possible (and likely) that we free the last
5635  * delayed ref for that extent as well.  This searches the delayed ref tree for
5636  * a given extent, and if there are no other delayed refs to be processed, it
5637  * removes it from the tree.
5638  */
5639 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5640 				      struct btrfs_root *root, u64 bytenr)
5641 {
5642 	struct btrfs_delayed_ref_head *head;
5643 	struct btrfs_delayed_ref_root *delayed_refs;
5644 	struct btrfs_delayed_ref_node *ref;
5645 	struct rb_node *node;
5646 	int ret = 0;
5647 
5648 	delayed_refs = &trans->transaction->delayed_refs;
5649 	spin_lock(&delayed_refs->lock);
5650 	head = btrfs_find_delayed_ref_head(trans, bytenr);
5651 	if (!head)
5652 		goto out;
5653 
5654 	node = rb_prev(&head->node.rb_node);
5655 	if (!node)
5656 		goto out;
5657 
5658 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5659 
5660 	/* there are still entries for this ref, we can't drop it */
5661 	if (ref->bytenr == bytenr)
5662 		goto out;
5663 
5664 	if (head->extent_op) {
5665 		if (!head->must_insert_reserved)
5666 			goto out;
5667 		btrfs_free_delayed_extent_op(head->extent_op);
5668 		head->extent_op = NULL;
5669 	}
5670 
5671 	/*
5672 	 * waiting for the lock here would deadlock.  If someone else has it
5673 	 * locked they are already in the process of dropping it anyway
5674 	 */
5675 	if (!mutex_trylock(&head->mutex))
5676 		goto out;
5677 
5678 	/*
5679 	 * at this point we have a head with no other entries.  Go
5680 	 * ahead and process it.
5681 	 */
5682 	head->node.in_tree = 0;
5683 	rb_erase(&head->node.rb_node, &delayed_refs->root);
5684 
5685 	delayed_refs->num_entries--;
5686 
5687 	/*
5688 	 * we don't take a ref on the node because we're removing it from the
5689 	 * tree, so we just steal the ref the tree was holding.
5690 	 */
5691 	delayed_refs->num_heads--;
5692 	if (list_empty(&head->cluster))
5693 		delayed_refs->num_heads_ready--;
5694 
5695 	list_del_init(&head->cluster);
5696 	spin_unlock(&delayed_refs->lock);
5697 
5698 	BUG_ON(head->extent_op);
5699 	if (head->must_insert_reserved)
5700 		ret = 1;
5701 
5702 	mutex_unlock(&head->mutex);
5703 	btrfs_put_delayed_ref(&head->node);
5704 	return ret;
5705 out:
5706 	spin_unlock(&delayed_refs->lock);
5707 	return 0;
5708 }
5709 
5710 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5711 			   struct btrfs_root *root,
5712 			   struct extent_buffer *buf,
5713 			   u64 parent, int last_ref)
5714 {
5715 	struct btrfs_block_group_cache *cache = NULL;
5716 	int ret;
5717 
5718 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5719 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5720 					buf->start, buf->len,
5721 					parent, root->root_key.objectid,
5722 					btrfs_header_level(buf),
5723 					BTRFS_DROP_DELAYED_REF, NULL, 0);
5724 		BUG_ON(ret); /* -ENOMEM */
5725 	}
5726 
5727 	if (!last_ref)
5728 		return;
5729 
5730 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5731 
5732 	if (btrfs_header_generation(buf) == trans->transid) {
5733 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5734 			ret = check_ref_cleanup(trans, root, buf->start);
5735 			if (!ret)
5736 				goto out;
5737 		}
5738 
5739 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5740 			pin_down_extent(root, cache, buf->start, buf->len, 1);
5741 			goto out;
5742 		}
5743 
5744 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5745 
5746 		btrfs_add_free_space(cache, buf->start, buf->len);
5747 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5748 	}
5749 out:
5750 	/*
5751 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5752 	 * anymore.
5753 	 */
5754 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5755 	btrfs_put_block_group(cache);
5756 }
5757 
5758 /* Can return -ENOMEM */
5759 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5760 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5761 		      u64 owner, u64 offset, int for_cow)
5762 {
5763 	int ret;
5764 	struct btrfs_fs_info *fs_info = root->fs_info;
5765 
5766 	/*
5767 	 * tree log blocks never actually go into the extent allocation
5768 	 * tree, just update pinning info and exit early.
5769 	 */
5770 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5771 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5772 		/* unlocks the pinned mutex */
5773 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
5774 		ret = 0;
5775 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5776 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5777 					num_bytes,
5778 					parent, root_objectid, (int)owner,
5779 					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5780 	} else {
5781 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5782 						num_bytes,
5783 						parent, root_objectid, owner,
5784 						offset, BTRFS_DROP_DELAYED_REF,
5785 						NULL, for_cow);
5786 	}
5787 	return ret;
5788 }
5789 
5790 static u64 stripe_align(struct btrfs_root *root,
5791 			struct btrfs_block_group_cache *cache,
5792 			u64 val, u64 num_bytes)
5793 {
5794 	u64 ret = ALIGN(val, root->stripesize);
5795 	return ret;
5796 }
5797 
5798 /*
5799  * when we wait for progress in the block group caching, its because
5800  * our allocation attempt failed at least once.  So, we must sleep
5801  * and let some progress happen before we try again.
5802  *
5803  * This function will sleep at least once waiting for new free space to
5804  * show up, and then it will check the block group free space numbers
5805  * for our min num_bytes.  Another option is to have it go ahead
5806  * and look in the rbtree for a free extent of a given size, but this
5807  * is a good start.
5808  */
5809 static noinline int
5810 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5811 				u64 num_bytes)
5812 {
5813 	struct btrfs_caching_control *caching_ctl;
5814 
5815 	caching_ctl = get_caching_control(cache);
5816 	if (!caching_ctl)
5817 		return 0;
5818 
5819 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5820 		   (cache->free_space_ctl->free_space >= num_bytes));
5821 
5822 	put_caching_control(caching_ctl);
5823 	return 0;
5824 }
5825 
5826 static noinline int
5827 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5828 {
5829 	struct btrfs_caching_control *caching_ctl;
5830 
5831 	caching_ctl = get_caching_control(cache);
5832 	if (!caching_ctl)
5833 		return 0;
5834 
5835 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
5836 
5837 	put_caching_control(caching_ctl);
5838 	return 0;
5839 }
5840 
5841 int __get_raid_index(u64 flags)
5842 {
5843 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
5844 		return BTRFS_RAID_RAID10;
5845 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5846 		return BTRFS_RAID_RAID1;
5847 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
5848 		return BTRFS_RAID_DUP;
5849 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5850 		return BTRFS_RAID_RAID0;
5851 	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5852 		return BTRFS_RAID_RAID5;
5853 	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5854 		return BTRFS_RAID_RAID6;
5855 
5856 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5857 }
5858 
5859 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5860 {
5861 	return __get_raid_index(cache->flags);
5862 }
5863 
5864 enum btrfs_loop_type {
5865 	LOOP_CACHING_NOWAIT = 0,
5866 	LOOP_CACHING_WAIT = 1,
5867 	LOOP_ALLOC_CHUNK = 2,
5868 	LOOP_NO_EMPTY_SIZE = 3,
5869 };
5870 
5871 /*
5872  * walks the btree of allocated extents and find a hole of a given size.
5873  * The key ins is changed to record the hole:
5874  * ins->objectid == block start
5875  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5876  * ins->offset == number of blocks
5877  * Any available blocks before search_start are skipped.
5878  */
5879 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5880 				     struct btrfs_root *orig_root,
5881 				     u64 num_bytes, u64 empty_size,
5882 				     u64 hint_byte, struct btrfs_key *ins,
5883 				     u64 flags)
5884 {
5885 	int ret = 0;
5886 	struct btrfs_root *root = orig_root->fs_info->extent_root;
5887 	struct btrfs_free_cluster *last_ptr = NULL;
5888 	struct btrfs_block_group_cache *block_group = NULL;
5889 	struct btrfs_block_group_cache *used_block_group;
5890 	u64 search_start = 0;
5891 	int empty_cluster = 2 * 1024 * 1024;
5892 	struct btrfs_space_info *space_info;
5893 	int loop = 0;
5894 	int index = __get_raid_index(flags);
5895 	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
5896 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5897 	bool found_uncached_bg = false;
5898 	bool failed_cluster_refill = false;
5899 	bool failed_alloc = false;
5900 	bool use_cluster = true;
5901 	bool have_caching_bg = false;
5902 
5903 	WARN_ON(num_bytes < root->sectorsize);
5904 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5905 	ins->objectid = 0;
5906 	ins->offset = 0;
5907 
5908 	trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
5909 
5910 	space_info = __find_space_info(root->fs_info, flags);
5911 	if (!space_info) {
5912 		btrfs_err(root->fs_info, "No space info for %llu", flags);
5913 		return -ENOSPC;
5914 	}
5915 
5916 	/*
5917 	 * If the space info is for both data and metadata it means we have a
5918 	 * small filesystem and we can't use the clustering stuff.
5919 	 */
5920 	if (btrfs_mixed_space_info(space_info))
5921 		use_cluster = false;
5922 
5923 	if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5924 		last_ptr = &root->fs_info->meta_alloc_cluster;
5925 		if (!btrfs_test_opt(root, SSD))
5926 			empty_cluster = 64 * 1024;
5927 	}
5928 
5929 	if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5930 	    btrfs_test_opt(root, SSD)) {
5931 		last_ptr = &root->fs_info->data_alloc_cluster;
5932 	}
5933 
5934 	if (last_ptr) {
5935 		spin_lock(&last_ptr->lock);
5936 		if (last_ptr->block_group)
5937 			hint_byte = last_ptr->window_start;
5938 		spin_unlock(&last_ptr->lock);
5939 	}
5940 
5941 	search_start = max(search_start, first_logical_byte(root, 0));
5942 	search_start = max(search_start, hint_byte);
5943 
5944 	if (!last_ptr)
5945 		empty_cluster = 0;
5946 
5947 	if (search_start == hint_byte) {
5948 		block_group = btrfs_lookup_block_group(root->fs_info,
5949 						       search_start);
5950 		used_block_group = block_group;
5951 		/*
5952 		 * we don't want to use the block group if it doesn't match our
5953 		 * allocation bits, or if its not cached.
5954 		 *
5955 		 * However if we are re-searching with an ideal block group
5956 		 * picked out then we don't care that the block group is cached.
5957 		 */
5958 		if (block_group && block_group_bits(block_group, flags) &&
5959 		    block_group->cached != BTRFS_CACHE_NO) {
5960 			down_read(&space_info->groups_sem);
5961 			if (list_empty(&block_group->list) ||
5962 			    block_group->ro) {
5963 				/*
5964 				 * someone is removing this block group,
5965 				 * we can't jump into the have_block_group
5966 				 * target because our list pointers are not
5967 				 * valid
5968 				 */
5969 				btrfs_put_block_group(block_group);
5970 				up_read(&space_info->groups_sem);
5971 			} else {
5972 				index = get_block_group_index(block_group);
5973 				goto have_block_group;
5974 			}
5975 		} else if (block_group) {
5976 			btrfs_put_block_group(block_group);
5977 		}
5978 	}
5979 search:
5980 	have_caching_bg = false;
5981 	down_read(&space_info->groups_sem);
5982 	list_for_each_entry(block_group, &space_info->block_groups[index],
5983 			    list) {
5984 		u64 offset;
5985 		int cached;
5986 
5987 		used_block_group = block_group;
5988 		btrfs_get_block_group(block_group);
5989 		search_start = block_group->key.objectid;
5990 
5991 		/*
5992 		 * this can happen if we end up cycling through all the
5993 		 * raid types, but we want to make sure we only allocate
5994 		 * for the proper type.
5995 		 */
5996 		if (!block_group_bits(block_group, flags)) {
5997 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
5998 				BTRFS_BLOCK_GROUP_RAID1 |
5999 				BTRFS_BLOCK_GROUP_RAID5 |
6000 				BTRFS_BLOCK_GROUP_RAID6 |
6001 				BTRFS_BLOCK_GROUP_RAID10;
6002 
6003 			/*
6004 			 * if they asked for extra copies and this block group
6005 			 * doesn't provide them, bail.  This does allow us to
6006 			 * fill raid0 from raid1.
6007 			 */
6008 			if ((flags & extra) && !(block_group->flags & extra))
6009 				goto loop;
6010 		}
6011 
6012 have_block_group:
6013 		cached = block_group_cache_done(block_group);
6014 		if (unlikely(!cached)) {
6015 			found_uncached_bg = true;
6016 			ret = cache_block_group(block_group, 0);
6017 			BUG_ON(ret < 0);
6018 			ret = 0;
6019 		}
6020 
6021 		if (unlikely(block_group->ro))
6022 			goto loop;
6023 
6024 		/*
6025 		 * Ok we want to try and use the cluster allocator, so
6026 		 * lets look there
6027 		 */
6028 		if (last_ptr) {
6029 			unsigned long aligned_cluster;
6030 			/*
6031 			 * the refill lock keeps out other
6032 			 * people trying to start a new cluster
6033 			 */
6034 			spin_lock(&last_ptr->refill_lock);
6035 			used_block_group = last_ptr->block_group;
6036 			if (used_block_group != block_group &&
6037 			    (!used_block_group ||
6038 			     used_block_group->ro ||
6039 			     !block_group_bits(used_block_group, flags))) {
6040 				used_block_group = block_group;
6041 				goto refill_cluster;
6042 			}
6043 
6044 			if (used_block_group != block_group)
6045 				btrfs_get_block_group(used_block_group);
6046 
6047 			offset = btrfs_alloc_from_cluster(used_block_group,
6048 			  last_ptr, num_bytes, used_block_group->key.objectid);
6049 			if (offset) {
6050 				/* we have a block, we're done */
6051 				spin_unlock(&last_ptr->refill_lock);
6052 				trace_btrfs_reserve_extent_cluster(root,
6053 					block_group, search_start, num_bytes);
6054 				goto checks;
6055 			}
6056 
6057 			WARN_ON(last_ptr->block_group != used_block_group);
6058 			if (used_block_group != block_group) {
6059 				btrfs_put_block_group(used_block_group);
6060 				used_block_group = block_group;
6061 			}
6062 refill_cluster:
6063 			BUG_ON(used_block_group != block_group);
6064 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
6065 			 * set up a new clusters, so lets just skip it
6066 			 * and let the allocator find whatever block
6067 			 * it can find.  If we reach this point, we
6068 			 * will have tried the cluster allocator
6069 			 * plenty of times and not have found
6070 			 * anything, so we are likely way too
6071 			 * fragmented for the clustering stuff to find
6072 			 * anything.
6073 			 *
6074 			 * However, if the cluster is taken from the
6075 			 * current block group, release the cluster
6076 			 * first, so that we stand a better chance of
6077 			 * succeeding in the unclustered
6078 			 * allocation.  */
6079 			if (loop >= LOOP_NO_EMPTY_SIZE &&
6080 			    last_ptr->block_group != block_group) {
6081 				spin_unlock(&last_ptr->refill_lock);
6082 				goto unclustered_alloc;
6083 			}
6084 
6085 			/*
6086 			 * this cluster didn't work out, free it and
6087 			 * start over
6088 			 */
6089 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
6090 
6091 			if (loop >= LOOP_NO_EMPTY_SIZE) {
6092 				spin_unlock(&last_ptr->refill_lock);
6093 				goto unclustered_alloc;
6094 			}
6095 
6096 			aligned_cluster = max_t(unsigned long,
6097 						empty_cluster + empty_size,
6098 					      block_group->full_stripe_len);
6099 
6100 			/* allocate a cluster in this block group */
6101 			ret = btrfs_find_space_cluster(trans, root,
6102 					       block_group, last_ptr,
6103 					       search_start, num_bytes,
6104 					       aligned_cluster);
6105 			if (ret == 0) {
6106 				/*
6107 				 * now pull our allocation out of this
6108 				 * cluster
6109 				 */
6110 				offset = btrfs_alloc_from_cluster(block_group,
6111 						  last_ptr, num_bytes,
6112 						  search_start);
6113 				if (offset) {
6114 					/* we found one, proceed */
6115 					spin_unlock(&last_ptr->refill_lock);
6116 					trace_btrfs_reserve_extent_cluster(root,
6117 						block_group, search_start,
6118 						num_bytes);
6119 					goto checks;
6120 				}
6121 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
6122 				   && !failed_cluster_refill) {
6123 				spin_unlock(&last_ptr->refill_lock);
6124 
6125 				failed_cluster_refill = true;
6126 				wait_block_group_cache_progress(block_group,
6127 				       num_bytes + empty_cluster + empty_size);
6128 				goto have_block_group;
6129 			}
6130 
6131 			/*
6132 			 * at this point we either didn't find a cluster
6133 			 * or we weren't able to allocate a block from our
6134 			 * cluster.  Free the cluster we've been trying
6135 			 * to use, and go to the next block group
6136 			 */
6137 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
6138 			spin_unlock(&last_ptr->refill_lock);
6139 			goto loop;
6140 		}
6141 
6142 unclustered_alloc:
6143 		spin_lock(&block_group->free_space_ctl->tree_lock);
6144 		if (cached &&
6145 		    block_group->free_space_ctl->free_space <
6146 		    num_bytes + empty_cluster + empty_size) {
6147 			spin_unlock(&block_group->free_space_ctl->tree_lock);
6148 			goto loop;
6149 		}
6150 		spin_unlock(&block_group->free_space_ctl->tree_lock);
6151 
6152 		offset = btrfs_find_space_for_alloc(block_group, search_start,
6153 						    num_bytes, empty_size);
6154 		/*
6155 		 * If we didn't find a chunk, and we haven't failed on this
6156 		 * block group before, and this block group is in the middle of
6157 		 * caching and we are ok with waiting, then go ahead and wait
6158 		 * for progress to be made, and set failed_alloc to true.
6159 		 *
6160 		 * If failed_alloc is true then we've already waited on this
6161 		 * block group once and should move on to the next block group.
6162 		 */
6163 		if (!offset && !failed_alloc && !cached &&
6164 		    loop > LOOP_CACHING_NOWAIT) {
6165 			wait_block_group_cache_progress(block_group,
6166 						num_bytes + empty_size);
6167 			failed_alloc = true;
6168 			goto have_block_group;
6169 		} else if (!offset) {
6170 			if (!cached)
6171 				have_caching_bg = true;
6172 			goto loop;
6173 		}
6174 checks:
6175 		search_start = stripe_align(root, used_block_group,
6176 					    offset, num_bytes);
6177 
6178 		/* move on to the next group */
6179 		if (search_start + num_bytes >
6180 		    used_block_group->key.objectid + used_block_group->key.offset) {
6181 			btrfs_add_free_space(used_block_group, offset, num_bytes);
6182 			goto loop;
6183 		}
6184 
6185 		if (offset < search_start)
6186 			btrfs_add_free_space(used_block_group, offset,
6187 					     search_start - offset);
6188 		BUG_ON(offset > search_start);
6189 
6190 		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
6191 						  alloc_type);
6192 		if (ret == -EAGAIN) {
6193 			btrfs_add_free_space(used_block_group, offset, num_bytes);
6194 			goto loop;
6195 		}
6196 
6197 		/* we are all good, lets return */
6198 		ins->objectid = search_start;
6199 		ins->offset = num_bytes;
6200 
6201 		trace_btrfs_reserve_extent(orig_root, block_group,
6202 					   search_start, num_bytes);
6203 		if (used_block_group != block_group)
6204 			btrfs_put_block_group(used_block_group);
6205 		btrfs_put_block_group(block_group);
6206 		break;
6207 loop:
6208 		failed_cluster_refill = false;
6209 		failed_alloc = false;
6210 		BUG_ON(index != get_block_group_index(block_group));
6211 		if (used_block_group != block_group)
6212 			btrfs_put_block_group(used_block_group);
6213 		btrfs_put_block_group(block_group);
6214 	}
6215 	up_read(&space_info->groups_sem);
6216 
6217 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
6218 		goto search;
6219 
6220 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
6221 		goto search;
6222 
6223 	/*
6224 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6225 	 *			caching kthreads as we move along
6226 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6227 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6228 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6229 	 *			again
6230 	 */
6231 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
6232 		index = 0;
6233 		loop++;
6234 		if (loop == LOOP_ALLOC_CHUNK) {
6235 			ret = do_chunk_alloc(trans, root, flags,
6236 					     CHUNK_ALLOC_FORCE);
6237 			/*
6238 			 * Do not bail out on ENOSPC since we
6239 			 * can do more things.
6240 			 */
6241 			if (ret < 0 && ret != -ENOSPC) {
6242 				btrfs_abort_transaction(trans,
6243 							root, ret);
6244 				goto out;
6245 			}
6246 		}
6247 
6248 		if (loop == LOOP_NO_EMPTY_SIZE) {
6249 			empty_size = 0;
6250 			empty_cluster = 0;
6251 		}
6252 
6253 		goto search;
6254 	} else if (!ins->objectid) {
6255 		ret = -ENOSPC;
6256 	} else if (ins->objectid) {
6257 		ret = 0;
6258 	}
6259 out:
6260 
6261 	return ret;
6262 }
6263 
6264 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6265 			    int dump_block_groups)
6266 {
6267 	struct btrfs_block_group_cache *cache;
6268 	int index = 0;
6269 
6270 	spin_lock(&info->lock);
6271 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
6272 	       (unsigned long long)info->flags,
6273 	       (unsigned long long)(info->total_bytes - info->bytes_used -
6274 				    info->bytes_pinned - info->bytes_reserved -
6275 				    info->bytes_readonly),
6276 	       (info->full) ? "" : "not ");
6277 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
6278 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
6279 	       (unsigned long long)info->total_bytes,
6280 	       (unsigned long long)info->bytes_used,
6281 	       (unsigned long long)info->bytes_pinned,
6282 	       (unsigned long long)info->bytes_reserved,
6283 	       (unsigned long long)info->bytes_may_use,
6284 	       (unsigned long long)info->bytes_readonly);
6285 	spin_unlock(&info->lock);
6286 
6287 	if (!dump_block_groups)
6288 		return;
6289 
6290 	down_read(&info->groups_sem);
6291 again:
6292 	list_for_each_entry(cache, &info->block_groups[index], list) {
6293 		spin_lock(&cache->lock);
6294 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
6295 		       (unsigned long long)cache->key.objectid,
6296 		       (unsigned long long)cache->key.offset,
6297 		       (unsigned long long)btrfs_block_group_used(&cache->item),
6298 		       (unsigned long long)cache->pinned,
6299 		       (unsigned long long)cache->reserved,
6300 		       cache->ro ? "[readonly]" : "");
6301 		btrfs_dump_free_space(cache, bytes);
6302 		spin_unlock(&cache->lock);
6303 	}
6304 	if (++index < BTRFS_NR_RAID_TYPES)
6305 		goto again;
6306 	up_read(&info->groups_sem);
6307 }
6308 
6309 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
6310 			 struct btrfs_root *root,
6311 			 u64 num_bytes, u64 min_alloc_size,
6312 			 u64 empty_size, u64 hint_byte,
6313 			 struct btrfs_key *ins, int is_data)
6314 {
6315 	bool final_tried = false;
6316 	u64 flags;
6317 	int ret;
6318 
6319 	flags = btrfs_get_alloc_profile(root, is_data);
6320 again:
6321 	WARN_ON(num_bytes < root->sectorsize);
6322 	ret = find_free_extent(trans, root, num_bytes, empty_size,
6323 			       hint_byte, ins, flags);
6324 
6325 	if (ret == -ENOSPC) {
6326 		if (!final_tried) {
6327 			num_bytes = num_bytes >> 1;
6328 			num_bytes = round_down(num_bytes, root->sectorsize);
6329 			num_bytes = max(num_bytes, min_alloc_size);
6330 			if (num_bytes == min_alloc_size)
6331 				final_tried = true;
6332 			goto again;
6333 		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6334 			struct btrfs_space_info *sinfo;
6335 
6336 			sinfo = __find_space_info(root->fs_info, flags);
6337 			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
6338 				(unsigned long long)flags,
6339 				(unsigned long long)num_bytes);
6340 			if (sinfo)
6341 				dump_space_info(sinfo, num_bytes, 1);
6342 		}
6343 	}
6344 
6345 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6346 
6347 	return ret;
6348 }
6349 
6350 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6351 					u64 start, u64 len, int pin)
6352 {
6353 	struct btrfs_block_group_cache *cache;
6354 	int ret = 0;
6355 
6356 	cache = btrfs_lookup_block_group(root->fs_info, start);
6357 	if (!cache) {
6358 		btrfs_err(root->fs_info, "Unable to find block group for %llu",
6359 			(unsigned long long)start);
6360 		return -ENOSPC;
6361 	}
6362 
6363 	if (btrfs_test_opt(root, DISCARD))
6364 		ret = btrfs_discard_extent(root, start, len, NULL);
6365 
6366 	if (pin)
6367 		pin_down_extent(root, cache, start, len, 1);
6368 	else {
6369 		btrfs_add_free_space(cache, start, len);
6370 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6371 	}
6372 	btrfs_put_block_group(cache);
6373 
6374 	trace_btrfs_reserved_extent_free(root, start, len);
6375 
6376 	return ret;
6377 }
6378 
6379 int btrfs_free_reserved_extent(struct btrfs_root *root,
6380 					u64 start, u64 len)
6381 {
6382 	return __btrfs_free_reserved_extent(root, start, len, 0);
6383 }
6384 
6385 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6386 				       u64 start, u64 len)
6387 {
6388 	return __btrfs_free_reserved_extent(root, start, len, 1);
6389 }
6390 
6391 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6392 				      struct btrfs_root *root,
6393 				      u64 parent, u64 root_objectid,
6394 				      u64 flags, u64 owner, u64 offset,
6395 				      struct btrfs_key *ins, int ref_mod)
6396 {
6397 	int ret;
6398 	struct btrfs_fs_info *fs_info = root->fs_info;
6399 	struct btrfs_extent_item *extent_item;
6400 	struct btrfs_extent_inline_ref *iref;
6401 	struct btrfs_path *path;
6402 	struct extent_buffer *leaf;
6403 	int type;
6404 	u32 size;
6405 
6406 	if (parent > 0)
6407 		type = BTRFS_SHARED_DATA_REF_KEY;
6408 	else
6409 		type = BTRFS_EXTENT_DATA_REF_KEY;
6410 
6411 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6412 
6413 	path = btrfs_alloc_path();
6414 	if (!path)
6415 		return -ENOMEM;
6416 
6417 	path->leave_spinning = 1;
6418 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6419 				      ins, size);
6420 	if (ret) {
6421 		btrfs_free_path(path);
6422 		return ret;
6423 	}
6424 
6425 	leaf = path->nodes[0];
6426 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6427 				     struct btrfs_extent_item);
6428 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6429 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6430 	btrfs_set_extent_flags(leaf, extent_item,
6431 			       flags | BTRFS_EXTENT_FLAG_DATA);
6432 
6433 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6434 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
6435 	if (parent > 0) {
6436 		struct btrfs_shared_data_ref *ref;
6437 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
6438 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6439 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6440 	} else {
6441 		struct btrfs_extent_data_ref *ref;
6442 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6443 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6444 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6445 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6446 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6447 	}
6448 
6449 	btrfs_mark_buffer_dirty(path->nodes[0]);
6450 	btrfs_free_path(path);
6451 
6452 	ret = update_block_group(root, ins->objectid, ins->offset, 1);
6453 	if (ret) { /* -ENOENT, logic error */
6454 		btrfs_err(fs_info, "update block group failed for %llu %llu",
6455 			(unsigned long long)ins->objectid,
6456 			(unsigned long long)ins->offset);
6457 		BUG();
6458 	}
6459 	return ret;
6460 }
6461 
6462 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6463 				     struct btrfs_root *root,
6464 				     u64 parent, u64 root_objectid,
6465 				     u64 flags, struct btrfs_disk_key *key,
6466 				     int level, struct btrfs_key *ins)
6467 {
6468 	int ret;
6469 	struct btrfs_fs_info *fs_info = root->fs_info;
6470 	struct btrfs_extent_item *extent_item;
6471 	struct btrfs_tree_block_info *block_info;
6472 	struct btrfs_extent_inline_ref *iref;
6473 	struct btrfs_path *path;
6474 	struct extent_buffer *leaf;
6475 	u32 size = sizeof(*extent_item) + sizeof(*iref);
6476 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6477 						 SKINNY_METADATA);
6478 
6479 	if (!skinny_metadata)
6480 		size += sizeof(*block_info);
6481 
6482 	path = btrfs_alloc_path();
6483 	if (!path)
6484 		return -ENOMEM;
6485 
6486 	path->leave_spinning = 1;
6487 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6488 				      ins, size);
6489 	if (ret) {
6490 		btrfs_free_path(path);
6491 		return ret;
6492 	}
6493 
6494 	leaf = path->nodes[0];
6495 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6496 				     struct btrfs_extent_item);
6497 	btrfs_set_extent_refs(leaf, extent_item, 1);
6498 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6499 	btrfs_set_extent_flags(leaf, extent_item,
6500 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6501 
6502 	if (skinny_metadata) {
6503 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6504 	} else {
6505 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6506 		btrfs_set_tree_block_key(leaf, block_info, key);
6507 		btrfs_set_tree_block_level(leaf, block_info, level);
6508 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6509 	}
6510 
6511 	if (parent > 0) {
6512 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6513 		btrfs_set_extent_inline_ref_type(leaf, iref,
6514 						 BTRFS_SHARED_BLOCK_REF_KEY);
6515 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6516 	} else {
6517 		btrfs_set_extent_inline_ref_type(leaf, iref,
6518 						 BTRFS_TREE_BLOCK_REF_KEY);
6519 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6520 	}
6521 
6522 	btrfs_mark_buffer_dirty(leaf);
6523 	btrfs_free_path(path);
6524 
6525 	ret = update_block_group(root, ins->objectid, root->leafsize, 1);
6526 	if (ret) { /* -ENOENT, logic error */
6527 		btrfs_err(fs_info, "update block group failed for %llu %llu",
6528 			(unsigned long long)ins->objectid,
6529 			(unsigned long long)ins->offset);
6530 		BUG();
6531 	}
6532 	return ret;
6533 }
6534 
6535 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6536 				     struct btrfs_root *root,
6537 				     u64 root_objectid, u64 owner,
6538 				     u64 offset, struct btrfs_key *ins)
6539 {
6540 	int ret;
6541 
6542 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6543 
6544 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6545 					 ins->offset, 0,
6546 					 root_objectid, owner, offset,
6547 					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6548 	return ret;
6549 }
6550 
6551 /*
6552  * this is used by the tree logging recovery code.  It records that
6553  * an extent has been allocated and makes sure to clear the free
6554  * space cache bits as well
6555  */
6556 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6557 				   struct btrfs_root *root,
6558 				   u64 root_objectid, u64 owner, u64 offset,
6559 				   struct btrfs_key *ins)
6560 {
6561 	int ret;
6562 	struct btrfs_block_group_cache *block_group;
6563 	struct btrfs_caching_control *caching_ctl;
6564 	u64 start = ins->objectid;
6565 	u64 num_bytes = ins->offset;
6566 
6567 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6568 	cache_block_group(block_group, 0);
6569 	caching_ctl = get_caching_control(block_group);
6570 
6571 	if (!caching_ctl) {
6572 		BUG_ON(!block_group_cache_done(block_group));
6573 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6574 		if (ret)
6575 			goto out;
6576 	} else {
6577 		mutex_lock(&caching_ctl->mutex);
6578 
6579 		if (start >= caching_ctl->progress) {
6580 			ret = add_excluded_extent(root, start, num_bytes);
6581 		} else if (start + num_bytes <= caching_ctl->progress) {
6582 			ret = btrfs_remove_free_space(block_group,
6583 						      start, num_bytes);
6584 		} else {
6585 			num_bytes = caching_ctl->progress - start;
6586 			ret = btrfs_remove_free_space(block_group,
6587 						      start, num_bytes);
6588 			if (ret)
6589 				goto out_lock;
6590 
6591 			start = caching_ctl->progress;
6592 			num_bytes = ins->objectid + ins->offset -
6593 				    caching_ctl->progress;
6594 			ret = add_excluded_extent(root, start, num_bytes);
6595 		}
6596 out_lock:
6597 		mutex_unlock(&caching_ctl->mutex);
6598 		put_caching_control(caching_ctl);
6599 		if (ret)
6600 			goto out;
6601 	}
6602 
6603 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6604 					  RESERVE_ALLOC_NO_ACCOUNT);
6605 	BUG_ON(ret); /* logic error */
6606 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6607 					 0, owner, offset, ins, 1);
6608 out:
6609 	btrfs_put_block_group(block_group);
6610 	return ret;
6611 }
6612 
6613 static struct extent_buffer *
6614 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6615 		      u64 bytenr, u32 blocksize, int level)
6616 {
6617 	struct extent_buffer *buf;
6618 
6619 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6620 	if (!buf)
6621 		return ERR_PTR(-ENOMEM);
6622 	btrfs_set_header_generation(buf, trans->transid);
6623 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6624 	btrfs_tree_lock(buf);
6625 	clean_tree_block(trans, root, buf);
6626 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6627 
6628 	btrfs_set_lock_blocking(buf);
6629 	btrfs_set_buffer_uptodate(buf);
6630 
6631 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6632 		/*
6633 		 * we allow two log transactions at a time, use different
6634 		 * EXENT bit to differentiate dirty pages.
6635 		 */
6636 		if (root->log_transid % 2 == 0)
6637 			set_extent_dirty(&root->dirty_log_pages, buf->start,
6638 					buf->start + buf->len - 1, GFP_NOFS);
6639 		else
6640 			set_extent_new(&root->dirty_log_pages, buf->start,
6641 					buf->start + buf->len - 1, GFP_NOFS);
6642 	} else {
6643 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6644 			 buf->start + buf->len - 1, GFP_NOFS);
6645 	}
6646 	trans->blocks_used++;
6647 	/* this returns a buffer locked for blocking */
6648 	return buf;
6649 }
6650 
6651 static struct btrfs_block_rsv *
6652 use_block_rsv(struct btrfs_trans_handle *trans,
6653 	      struct btrfs_root *root, u32 blocksize)
6654 {
6655 	struct btrfs_block_rsv *block_rsv;
6656 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6657 	int ret;
6658 	bool global_updated = false;
6659 
6660 	block_rsv = get_block_rsv(trans, root);
6661 
6662 	if (unlikely(block_rsv->size == 0))
6663 		goto try_reserve;
6664 again:
6665 	ret = block_rsv_use_bytes(block_rsv, blocksize);
6666 	if (!ret)
6667 		return block_rsv;
6668 
6669 	if (block_rsv->failfast)
6670 		return ERR_PTR(ret);
6671 
6672 	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
6673 		global_updated = true;
6674 		update_global_block_rsv(root->fs_info);
6675 		goto again;
6676 	}
6677 
6678 	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6679 		static DEFINE_RATELIMIT_STATE(_rs,
6680 				DEFAULT_RATELIMIT_INTERVAL * 10,
6681 				/*DEFAULT_RATELIMIT_BURST*/ 1);
6682 		if (__ratelimit(&_rs))
6683 			WARN(1, KERN_DEBUG
6684 				"btrfs: block rsv returned %d\n", ret);
6685 	}
6686 try_reserve:
6687 	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6688 				     BTRFS_RESERVE_NO_FLUSH);
6689 	if (!ret)
6690 		return block_rsv;
6691 	/*
6692 	 * If we couldn't reserve metadata bytes try and use some from
6693 	 * the global reserve if its space type is the same as the global
6694 	 * reservation.
6695 	 */
6696 	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
6697 	    block_rsv->space_info == global_rsv->space_info) {
6698 		ret = block_rsv_use_bytes(global_rsv, blocksize);
6699 		if (!ret)
6700 			return global_rsv;
6701 	}
6702 	return ERR_PTR(ret);
6703 }
6704 
6705 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6706 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
6707 {
6708 	block_rsv_add_bytes(block_rsv, blocksize, 0);
6709 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6710 }
6711 
6712 /*
6713  * finds a free extent and does all the dirty work required for allocation
6714  * returns the key for the extent through ins, and a tree buffer for
6715  * the first block of the extent through buf.
6716  *
6717  * returns the tree buffer or NULL.
6718  */
6719 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6720 					struct btrfs_root *root, u32 blocksize,
6721 					u64 parent, u64 root_objectid,
6722 					struct btrfs_disk_key *key, int level,
6723 					u64 hint, u64 empty_size)
6724 {
6725 	struct btrfs_key ins;
6726 	struct btrfs_block_rsv *block_rsv;
6727 	struct extent_buffer *buf;
6728 	u64 flags = 0;
6729 	int ret;
6730 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6731 						 SKINNY_METADATA);
6732 
6733 	block_rsv = use_block_rsv(trans, root, blocksize);
6734 	if (IS_ERR(block_rsv))
6735 		return ERR_CAST(block_rsv);
6736 
6737 	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6738 				   empty_size, hint, &ins, 0);
6739 	if (ret) {
6740 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6741 		return ERR_PTR(ret);
6742 	}
6743 
6744 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6745 				    blocksize, level);
6746 	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6747 
6748 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6749 		if (parent == 0)
6750 			parent = ins.objectid;
6751 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6752 	} else
6753 		BUG_ON(parent > 0);
6754 
6755 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6756 		struct btrfs_delayed_extent_op *extent_op;
6757 		extent_op = btrfs_alloc_delayed_extent_op();
6758 		BUG_ON(!extent_op); /* -ENOMEM */
6759 		if (key)
6760 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
6761 		else
6762 			memset(&extent_op->key, 0, sizeof(extent_op->key));
6763 		extent_op->flags_to_set = flags;
6764 		if (skinny_metadata)
6765 			extent_op->update_key = 0;
6766 		else
6767 			extent_op->update_key = 1;
6768 		extent_op->update_flags = 1;
6769 		extent_op->is_data = 0;
6770 		extent_op->level = level;
6771 
6772 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6773 					ins.objectid,
6774 					ins.offset, parent, root_objectid,
6775 					level, BTRFS_ADD_DELAYED_EXTENT,
6776 					extent_op, 0);
6777 		BUG_ON(ret); /* -ENOMEM */
6778 	}
6779 	return buf;
6780 }
6781 
6782 struct walk_control {
6783 	u64 refs[BTRFS_MAX_LEVEL];
6784 	u64 flags[BTRFS_MAX_LEVEL];
6785 	struct btrfs_key update_progress;
6786 	int stage;
6787 	int level;
6788 	int shared_level;
6789 	int update_ref;
6790 	int keep_locks;
6791 	int reada_slot;
6792 	int reada_count;
6793 	int for_reloc;
6794 };
6795 
6796 #define DROP_REFERENCE	1
6797 #define UPDATE_BACKREF	2
6798 
6799 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6800 				     struct btrfs_root *root,
6801 				     struct walk_control *wc,
6802 				     struct btrfs_path *path)
6803 {
6804 	u64 bytenr;
6805 	u64 generation;
6806 	u64 refs;
6807 	u64 flags;
6808 	u32 nritems;
6809 	u32 blocksize;
6810 	struct btrfs_key key;
6811 	struct extent_buffer *eb;
6812 	int ret;
6813 	int slot;
6814 	int nread = 0;
6815 
6816 	if (path->slots[wc->level] < wc->reada_slot) {
6817 		wc->reada_count = wc->reada_count * 2 / 3;
6818 		wc->reada_count = max(wc->reada_count, 2);
6819 	} else {
6820 		wc->reada_count = wc->reada_count * 3 / 2;
6821 		wc->reada_count = min_t(int, wc->reada_count,
6822 					BTRFS_NODEPTRS_PER_BLOCK(root));
6823 	}
6824 
6825 	eb = path->nodes[wc->level];
6826 	nritems = btrfs_header_nritems(eb);
6827 	blocksize = btrfs_level_size(root, wc->level - 1);
6828 
6829 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6830 		if (nread >= wc->reada_count)
6831 			break;
6832 
6833 		cond_resched();
6834 		bytenr = btrfs_node_blockptr(eb, slot);
6835 		generation = btrfs_node_ptr_generation(eb, slot);
6836 
6837 		if (slot == path->slots[wc->level])
6838 			goto reada;
6839 
6840 		if (wc->stage == UPDATE_BACKREF &&
6841 		    generation <= root->root_key.offset)
6842 			continue;
6843 
6844 		/* We don't lock the tree block, it's OK to be racy here */
6845 		ret = btrfs_lookup_extent_info(trans, root, bytenr,
6846 					       wc->level - 1, 1, &refs,
6847 					       &flags);
6848 		/* We don't care about errors in readahead. */
6849 		if (ret < 0)
6850 			continue;
6851 		BUG_ON(refs == 0);
6852 
6853 		if (wc->stage == DROP_REFERENCE) {
6854 			if (refs == 1)
6855 				goto reada;
6856 
6857 			if (wc->level == 1 &&
6858 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6859 				continue;
6860 			if (!wc->update_ref ||
6861 			    generation <= root->root_key.offset)
6862 				continue;
6863 			btrfs_node_key_to_cpu(eb, &key, slot);
6864 			ret = btrfs_comp_cpu_keys(&key,
6865 						  &wc->update_progress);
6866 			if (ret < 0)
6867 				continue;
6868 		} else {
6869 			if (wc->level == 1 &&
6870 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6871 				continue;
6872 		}
6873 reada:
6874 		ret = readahead_tree_block(root, bytenr, blocksize,
6875 					   generation);
6876 		if (ret)
6877 			break;
6878 		nread++;
6879 	}
6880 	wc->reada_slot = slot;
6881 }
6882 
6883 /*
6884  * helper to process tree block while walking down the tree.
6885  *
6886  * when wc->stage == UPDATE_BACKREF, this function updates
6887  * back refs for pointers in the block.
6888  *
6889  * NOTE: return value 1 means we should stop walking down.
6890  */
6891 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6892 				   struct btrfs_root *root,
6893 				   struct btrfs_path *path,
6894 				   struct walk_control *wc, int lookup_info)
6895 {
6896 	int level = wc->level;
6897 	struct extent_buffer *eb = path->nodes[level];
6898 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6899 	int ret;
6900 
6901 	if (wc->stage == UPDATE_BACKREF &&
6902 	    btrfs_header_owner(eb) != root->root_key.objectid)
6903 		return 1;
6904 
6905 	/*
6906 	 * when reference count of tree block is 1, it won't increase
6907 	 * again. once full backref flag is set, we never clear it.
6908 	 */
6909 	if (lookup_info &&
6910 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6911 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6912 		BUG_ON(!path->locks[level]);
6913 		ret = btrfs_lookup_extent_info(trans, root,
6914 					       eb->start, level, 1,
6915 					       &wc->refs[level],
6916 					       &wc->flags[level]);
6917 		BUG_ON(ret == -ENOMEM);
6918 		if (ret)
6919 			return ret;
6920 		BUG_ON(wc->refs[level] == 0);
6921 	}
6922 
6923 	if (wc->stage == DROP_REFERENCE) {
6924 		if (wc->refs[level] > 1)
6925 			return 1;
6926 
6927 		if (path->locks[level] && !wc->keep_locks) {
6928 			btrfs_tree_unlock_rw(eb, path->locks[level]);
6929 			path->locks[level] = 0;
6930 		}
6931 		return 0;
6932 	}
6933 
6934 	/* wc->stage == UPDATE_BACKREF */
6935 	if (!(wc->flags[level] & flag)) {
6936 		BUG_ON(!path->locks[level]);
6937 		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6938 		BUG_ON(ret); /* -ENOMEM */
6939 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6940 		BUG_ON(ret); /* -ENOMEM */
6941 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6942 						  eb->len, flag,
6943 						  btrfs_header_level(eb), 0);
6944 		BUG_ON(ret); /* -ENOMEM */
6945 		wc->flags[level] |= flag;
6946 	}
6947 
6948 	/*
6949 	 * the block is shared by multiple trees, so it's not good to
6950 	 * keep the tree lock
6951 	 */
6952 	if (path->locks[level] && level > 0) {
6953 		btrfs_tree_unlock_rw(eb, path->locks[level]);
6954 		path->locks[level] = 0;
6955 	}
6956 	return 0;
6957 }
6958 
6959 /*
6960  * helper to process tree block pointer.
6961  *
6962  * when wc->stage == DROP_REFERENCE, this function checks
6963  * reference count of the block pointed to. if the block
6964  * is shared and we need update back refs for the subtree
6965  * rooted at the block, this function changes wc->stage to
6966  * UPDATE_BACKREF. if the block is shared and there is no
6967  * need to update back, this function drops the reference
6968  * to the block.
6969  *
6970  * NOTE: return value 1 means we should stop walking down.
6971  */
6972 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6973 				 struct btrfs_root *root,
6974 				 struct btrfs_path *path,
6975 				 struct walk_control *wc, int *lookup_info)
6976 {
6977 	u64 bytenr;
6978 	u64 generation;
6979 	u64 parent;
6980 	u32 blocksize;
6981 	struct btrfs_key key;
6982 	struct extent_buffer *next;
6983 	int level = wc->level;
6984 	int reada = 0;
6985 	int ret = 0;
6986 
6987 	generation = btrfs_node_ptr_generation(path->nodes[level],
6988 					       path->slots[level]);
6989 	/*
6990 	 * if the lower level block was created before the snapshot
6991 	 * was created, we know there is no need to update back refs
6992 	 * for the subtree
6993 	 */
6994 	if (wc->stage == UPDATE_BACKREF &&
6995 	    generation <= root->root_key.offset) {
6996 		*lookup_info = 1;
6997 		return 1;
6998 	}
6999 
7000 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7001 	blocksize = btrfs_level_size(root, level - 1);
7002 
7003 	next = btrfs_find_tree_block(root, bytenr, blocksize);
7004 	if (!next) {
7005 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
7006 		if (!next)
7007 			return -ENOMEM;
7008 		reada = 1;
7009 	}
7010 	btrfs_tree_lock(next);
7011 	btrfs_set_lock_blocking(next);
7012 
7013 	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
7014 				       &wc->refs[level - 1],
7015 				       &wc->flags[level - 1]);
7016 	if (ret < 0) {
7017 		btrfs_tree_unlock(next);
7018 		return ret;
7019 	}
7020 
7021 	if (unlikely(wc->refs[level - 1] == 0)) {
7022 		btrfs_err(root->fs_info, "Missing references.");
7023 		BUG();
7024 	}
7025 	*lookup_info = 0;
7026 
7027 	if (wc->stage == DROP_REFERENCE) {
7028 		if (wc->refs[level - 1] > 1) {
7029 			if (level == 1 &&
7030 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7031 				goto skip;
7032 
7033 			if (!wc->update_ref ||
7034 			    generation <= root->root_key.offset)
7035 				goto skip;
7036 
7037 			btrfs_node_key_to_cpu(path->nodes[level], &key,
7038 					      path->slots[level]);
7039 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7040 			if (ret < 0)
7041 				goto skip;
7042 
7043 			wc->stage = UPDATE_BACKREF;
7044 			wc->shared_level = level - 1;
7045 		}
7046 	} else {
7047 		if (level == 1 &&
7048 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7049 			goto skip;
7050 	}
7051 
7052 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
7053 		btrfs_tree_unlock(next);
7054 		free_extent_buffer(next);
7055 		next = NULL;
7056 		*lookup_info = 1;
7057 	}
7058 
7059 	if (!next) {
7060 		if (reada && level == 1)
7061 			reada_walk_down(trans, root, wc, path);
7062 		next = read_tree_block(root, bytenr, blocksize, generation);
7063 		if (!next || !extent_buffer_uptodate(next)) {
7064 			free_extent_buffer(next);
7065 			return -EIO;
7066 		}
7067 		btrfs_tree_lock(next);
7068 		btrfs_set_lock_blocking(next);
7069 	}
7070 
7071 	level--;
7072 	BUG_ON(level != btrfs_header_level(next));
7073 	path->nodes[level] = next;
7074 	path->slots[level] = 0;
7075 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7076 	wc->level = level;
7077 	if (wc->level == 1)
7078 		wc->reada_slot = 0;
7079 	return 0;
7080 skip:
7081 	wc->refs[level - 1] = 0;
7082 	wc->flags[level - 1] = 0;
7083 	if (wc->stage == DROP_REFERENCE) {
7084 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7085 			parent = path->nodes[level]->start;
7086 		} else {
7087 			BUG_ON(root->root_key.objectid !=
7088 			       btrfs_header_owner(path->nodes[level]));
7089 			parent = 0;
7090 		}
7091 
7092 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7093 				root->root_key.objectid, level - 1, 0, 0);
7094 		BUG_ON(ret); /* -ENOMEM */
7095 	}
7096 	btrfs_tree_unlock(next);
7097 	free_extent_buffer(next);
7098 	*lookup_info = 1;
7099 	return 1;
7100 }
7101 
7102 /*
7103  * helper to process tree block while walking up the tree.
7104  *
7105  * when wc->stage == DROP_REFERENCE, this function drops
7106  * reference count on the block.
7107  *
7108  * when wc->stage == UPDATE_BACKREF, this function changes
7109  * wc->stage back to DROP_REFERENCE if we changed wc->stage
7110  * to UPDATE_BACKREF previously while processing the block.
7111  *
7112  * NOTE: return value 1 means we should stop walking up.
7113  */
7114 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7115 				 struct btrfs_root *root,
7116 				 struct btrfs_path *path,
7117 				 struct walk_control *wc)
7118 {
7119 	int ret;
7120 	int level = wc->level;
7121 	struct extent_buffer *eb = path->nodes[level];
7122 	u64 parent = 0;
7123 
7124 	if (wc->stage == UPDATE_BACKREF) {
7125 		BUG_ON(wc->shared_level < level);
7126 		if (level < wc->shared_level)
7127 			goto out;
7128 
7129 		ret = find_next_key(path, level + 1, &wc->update_progress);
7130 		if (ret > 0)
7131 			wc->update_ref = 0;
7132 
7133 		wc->stage = DROP_REFERENCE;
7134 		wc->shared_level = -1;
7135 		path->slots[level] = 0;
7136 
7137 		/*
7138 		 * check reference count again if the block isn't locked.
7139 		 * we should start walking down the tree again if reference
7140 		 * count is one.
7141 		 */
7142 		if (!path->locks[level]) {
7143 			BUG_ON(level == 0);
7144 			btrfs_tree_lock(eb);
7145 			btrfs_set_lock_blocking(eb);
7146 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7147 
7148 			ret = btrfs_lookup_extent_info(trans, root,
7149 						       eb->start, level, 1,
7150 						       &wc->refs[level],
7151 						       &wc->flags[level]);
7152 			if (ret < 0) {
7153 				btrfs_tree_unlock_rw(eb, path->locks[level]);
7154 				path->locks[level] = 0;
7155 				return ret;
7156 			}
7157 			BUG_ON(wc->refs[level] == 0);
7158 			if (wc->refs[level] == 1) {
7159 				btrfs_tree_unlock_rw(eb, path->locks[level]);
7160 				path->locks[level] = 0;
7161 				return 1;
7162 			}
7163 		}
7164 	}
7165 
7166 	/* wc->stage == DROP_REFERENCE */
7167 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
7168 
7169 	if (wc->refs[level] == 1) {
7170 		if (level == 0) {
7171 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7172 				ret = btrfs_dec_ref(trans, root, eb, 1,
7173 						    wc->for_reloc);
7174 			else
7175 				ret = btrfs_dec_ref(trans, root, eb, 0,
7176 						    wc->for_reloc);
7177 			BUG_ON(ret); /* -ENOMEM */
7178 		}
7179 		/* make block locked assertion in clean_tree_block happy */
7180 		if (!path->locks[level] &&
7181 		    btrfs_header_generation(eb) == trans->transid) {
7182 			btrfs_tree_lock(eb);
7183 			btrfs_set_lock_blocking(eb);
7184 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7185 		}
7186 		clean_tree_block(trans, root, eb);
7187 	}
7188 
7189 	if (eb == root->node) {
7190 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7191 			parent = eb->start;
7192 		else
7193 			BUG_ON(root->root_key.objectid !=
7194 			       btrfs_header_owner(eb));
7195 	} else {
7196 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7197 			parent = path->nodes[level + 1]->start;
7198 		else
7199 			BUG_ON(root->root_key.objectid !=
7200 			       btrfs_header_owner(path->nodes[level + 1]));
7201 	}
7202 
7203 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
7204 out:
7205 	wc->refs[level] = 0;
7206 	wc->flags[level] = 0;
7207 	return 0;
7208 }
7209 
7210 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7211 				   struct btrfs_root *root,
7212 				   struct btrfs_path *path,
7213 				   struct walk_control *wc)
7214 {
7215 	int level = wc->level;
7216 	int lookup_info = 1;
7217 	int ret;
7218 
7219 	while (level >= 0) {
7220 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
7221 		if (ret > 0)
7222 			break;
7223 
7224 		if (level == 0)
7225 			break;
7226 
7227 		if (path->slots[level] >=
7228 		    btrfs_header_nritems(path->nodes[level]))
7229 			break;
7230 
7231 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
7232 		if (ret > 0) {
7233 			path->slots[level]++;
7234 			continue;
7235 		} else if (ret < 0)
7236 			return ret;
7237 		level = wc->level;
7238 	}
7239 	return 0;
7240 }
7241 
7242 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
7243 				 struct btrfs_root *root,
7244 				 struct btrfs_path *path,
7245 				 struct walk_control *wc, int max_level)
7246 {
7247 	int level = wc->level;
7248 	int ret;
7249 
7250 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7251 	while (level < max_level && path->nodes[level]) {
7252 		wc->level = level;
7253 		if (path->slots[level] + 1 <
7254 		    btrfs_header_nritems(path->nodes[level])) {
7255 			path->slots[level]++;
7256 			return 0;
7257 		} else {
7258 			ret = walk_up_proc(trans, root, path, wc);
7259 			if (ret > 0)
7260 				return 0;
7261 
7262 			if (path->locks[level]) {
7263 				btrfs_tree_unlock_rw(path->nodes[level],
7264 						     path->locks[level]);
7265 				path->locks[level] = 0;
7266 			}
7267 			free_extent_buffer(path->nodes[level]);
7268 			path->nodes[level] = NULL;
7269 			level++;
7270 		}
7271 	}
7272 	return 1;
7273 }
7274 
7275 /*
7276  * drop a subvolume tree.
7277  *
7278  * this function traverses the tree freeing any blocks that only
7279  * referenced by the tree.
7280  *
7281  * when a shared tree block is found. this function decreases its
7282  * reference count by one. if update_ref is true, this function
7283  * also make sure backrefs for the shared block and all lower level
7284  * blocks are properly updated.
7285  *
7286  * If called with for_reloc == 0, may exit early with -EAGAIN
7287  */
7288 int btrfs_drop_snapshot(struct btrfs_root *root,
7289 			 struct btrfs_block_rsv *block_rsv, int update_ref,
7290 			 int for_reloc)
7291 {
7292 	struct btrfs_path *path;
7293 	struct btrfs_trans_handle *trans;
7294 	struct btrfs_root *tree_root = root->fs_info->tree_root;
7295 	struct btrfs_root_item *root_item = &root->root_item;
7296 	struct walk_control *wc;
7297 	struct btrfs_key key;
7298 	int err = 0;
7299 	int ret;
7300 	int level;
7301 
7302 	path = btrfs_alloc_path();
7303 	if (!path) {
7304 		err = -ENOMEM;
7305 		goto out;
7306 	}
7307 
7308 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
7309 	if (!wc) {
7310 		btrfs_free_path(path);
7311 		err = -ENOMEM;
7312 		goto out;
7313 	}
7314 
7315 	trans = btrfs_start_transaction(tree_root, 0);
7316 	if (IS_ERR(trans)) {
7317 		err = PTR_ERR(trans);
7318 		goto out_free;
7319 	}
7320 
7321 	if (block_rsv)
7322 		trans->block_rsv = block_rsv;
7323 
7324 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7325 		level = btrfs_header_level(root->node);
7326 		path->nodes[level] = btrfs_lock_root_node(root);
7327 		btrfs_set_lock_blocking(path->nodes[level]);
7328 		path->slots[level] = 0;
7329 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7330 		memset(&wc->update_progress, 0,
7331 		       sizeof(wc->update_progress));
7332 	} else {
7333 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7334 		memcpy(&wc->update_progress, &key,
7335 		       sizeof(wc->update_progress));
7336 
7337 		level = root_item->drop_level;
7338 		BUG_ON(level == 0);
7339 		path->lowest_level = level;
7340 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7341 		path->lowest_level = 0;
7342 		if (ret < 0) {
7343 			err = ret;
7344 			goto out_end_trans;
7345 		}
7346 		WARN_ON(ret > 0);
7347 
7348 		/*
7349 		 * unlock our path, this is safe because only this
7350 		 * function is allowed to delete this snapshot
7351 		 */
7352 		btrfs_unlock_up_safe(path, 0);
7353 
7354 		level = btrfs_header_level(root->node);
7355 		while (1) {
7356 			btrfs_tree_lock(path->nodes[level]);
7357 			btrfs_set_lock_blocking(path->nodes[level]);
7358 
7359 			ret = btrfs_lookup_extent_info(trans, root,
7360 						path->nodes[level]->start,
7361 						level, 1, &wc->refs[level],
7362 						&wc->flags[level]);
7363 			if (ret < 0) {
7364 				err = ret;
7365 				goto out_end_trans;
7366 			}
7367 			BUG_ON(wc->refs[level] == 0);
7368 
7369 			if (level == root_item->drop_level)
7370 				break;
7371 
7372 			btrfs_tree_unlock(path->nodes[level]);
7373 			WARN_ON(wc->refs[level] != 1);
7374 			level--;
7375 		}
7376 	}
7377 
7378 	wc->level = level;
7379 	wc->shared_level = -1;
7380 	wc->stage = DROP_REFERENCE;
7381 	wc->update_ref = update_ref;
7382 	wc->keep_locks = 0;
7383 	wc->for_reloc = for_reloc;
7384 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7385 
7386 	while (1) {
7387 		if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
7388 			pr_debug("btrfs: drop snapshot early exit\n");
7389 			err = -EAGAIN;
7390 			goto out_end_trans;
7391 		}
7392 
7393 		ret = walk_down_tree(trans, root, path, wc);
7394 		if (ret < 0) {
7395 			err = ret;
7396 			break;
7397 		}
7398 
7399 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7400 		if (ret < 0) {
7401 			err = ret;
7402 			break;
7403 		}
7404 
7405 		if (ret > 0) {
7406 			BUG_ON(wc->stage != DROP_REFERENCE);
7407 			break;
7408 		}
7409 
7410 		if (wc->stage == DROP_REFERENCE) {
7411 			level = wc->level;
7412 			btrfs_node_key(path->nodes[level],
7413 				       &root_item->drop_progress,
7414 				       path->slots[level]);
7415 			root_item->drop_level = level;
7416 		}
7417 
7418 		BUG_ON(wc->level == 0);
7419 		if (btrfs_should_end_transaction(trans, tree_root)) {
7420 			ret = btrfs_update_root(trans, tree_root,
7421 						&root->root_key,
7422 						root_item);
7423 			if (ret) {
7424 				btrfs_abort_transaction(trans, tree_root, ret);
7425 				err = ret;
7426 				goto out_end_trans;
7427 			}
7428 
7429 			btrfs_end_transaction_throttle(trans, tree_root);
7430 			trans = btrfs_start_transaction(tree_root, 0);
7431 			if (IS_ERR(trans)) {
7432 				err = PTR_ERR(trans);
7433 				goto out_free;
7434 			}
7435 			if (block_rsv)
7436 				trans->block_rsv = block_rsv;
7437 		}
7438 	}
7439 	btrfs_release_path(path);
7440 	if (err)
7441 		goto out_end_trans;
7442 
7443 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
7444 	if (ret) {
7445 		btrfs_abort_transaction(trans, tree_root, ret);
7446 		goto out_end_trans;
7447 	}
7448 
7449 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7450 		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7451 					   NULL, NULL);
7452 		if (ret < 0) {
7453 			btrfs_abort_transaction(trans, tree_root, ret);
7454 			err = ret;
7455 			goto out_end_trans;
7456 		} else if (ret > 0) {
7457 			/* if we fail to delete the orphan item this time
7458 			 * around, it'll get picked up the next time.
7459 			 *
7460 			 * The most common failure here is just -ENOENT.
7461 			 */
7462 			btrfs_del_orphan_item(trans, tree_root,
7463 					      root->root_key.objectid);
7464 		}
7465 	}
7466 
7467 	if (root->in_radix) {
7468 		btrfs_free_fs_root(tree_root->fs_info, root);
7469 	} else {
7470 		free_extent_buffer(root->node);
7471 		free_extent_buffer(root->commit_root);
7472 		kfree(root);
7473 	}
7474 out_end_trans:
7475 	btrfs_end_transaction_throttle(trans, tree_root);
7476 out_free:
7477 	kfree(wc);
7478 	btrfs_free_path(path);
7479 out:
7480 	if (err)
7481 		btrfs_std_error(root->fs_info, err);
7482 	return err;
7483 }
7484 
7485 /*
7486  * drop subtree rooted at tree block 'node'.
7487  *
7488  * NOTE: this function will unlock and release tree block 'node'
7489  * only used by relocation code
7490  */
7491 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7492 			struct btrfs_root *root,
7493 			struct extent_buffer *node,
7494 			struct extent_buffer *parent)
7495 {
7496 	struct btrfs_path *path;
7497 	struct walk_control *wc;
7498 	int level;
7499 	int parent_level;
7500 	int ret = 0;
7501 	int wret;
7502 
7503 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7504 
7505 	path = btrfs_alloc_path();
7506 	if (!path)
7507 		return -ENOMEM;
7508 
7509 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
7510 	if (!wc) {
7511 		btrfs_free_path(path);
7512 		return -ENOMEM;
7513 	}
7514 
7515 	btrfs_assert_tree_locked(parent);
7516 	parent_level = btrfs_header_level(parent);
7517 	extent_buffer_get(parent);
7518 	path->nodes[parent_level] = parent;
7519 	path->slots[parent_level] = btrfs_header_nritems(parent);
7520 
7521 	btrfs_assert_tree_locked(node);
7522 	level = btrfs_header_level(node);
7523 	path->nodes[level] = node;
7524 	path->slots[level] = 0;
7525 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7526 
7527 	wc->refs[parent_level] = 1;
7528 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7529 	wc->level = level;
7530 	wc->shared_level = -1;
7531 	wc->stage = DROP_REFERENCE;
7532 	wc->update_ref = 0;
7533 	wc->keep_locks = 1;
7534 	wc->for_reloc = 1;
7535 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7536 
7537 	while (1) {
7538 		wret = walk_down_tree(trans, root, path, wc);
7539 		if (wret < 0) {
7540 			ret = wret;
7541 			break;
7542 		}
7543 
7544 		wret = walk_up_tree(trans, root, path, wc, parent_level);
7545 		if (wret < 0)
7546 			ret = wret;
7547 		if (wret != 0)
7548 			break;
7549 	}
7550 
7551 	kfree(wc);
7552 	btrfs_free_path(path);
7553 	return ret;
7554 }
7555 
7556 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7557 {
7558 	u64 num_devices;
7559 	u64 stripped;
7560 
7561 	/*
7562 	 * if restripe for this chunk_type is on pick target profile and
7563 	 * return, otherwise do the usual balance
7564 	 */
7565 	stripped = get_restripe_target(root->fs_info, flags);
7566 	if (stripped)
7567 		return extended_to_chunk(stripped);
7568 
7569 	/*
7570 	 * we add in the count of missing devices because we want
7571 	 * to make sure that any RAID levels on a degraded FS
7572 	 * continue to be honored.
7573 	 */
7574 	num_devices = root->fs_info->fs_devices->rw_devices +
7575 		root->fs_info->fs_devices->missing_devices;
7576 
7577 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
7578 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7579 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7580 
7581 	if (num_devices == 1) {
7582 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7583 		stripped = flags & ~stripped;
7584 
7585 		/* turn raid0 into single device chunks */
7586 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
7587 			return stripped;
7588 
7589 		/* turn mirroring into duplication */
7590 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7591 			     BTRFS_BLOCK_GROUP_RAID10))
7592 			return stripped | BTRFS_BLOCK_GROUP_DUP;
7593 	} else {
7594 		/* they already had raid on here, just return */
7595 		if (flags & stripped)
7596 			return flags;
7597 
7598 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7599 		stripped = flags & ~stripped;
7600 
7601 		/* switch duplicated blocks with raid1 */
7602 		if (flags & BTRFS_BLOCK_GROUP_DUP)
7603 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
7604 
7605 		/* this is drive concat, leave it alone */
7606 	}
7607 
7608 	return flags;
7609 }
7610 
7611 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7612 {
7613 	struct btrfs_space_info *sinfo = cache->space_info;
7614 	u64 num_bytes;
7615 	u64 min_allocable_bytes;
7616 	int ret = -ENOSPC;
7617 
7618 
7619 	/*
7620 	 * We need some metadata space and system metadata space for
7621 	 * allocating chunks in some corner cases until we force to set
7622 	 * it to be readonly.
7623 	 */
7624 	if ((sinfo->flags &
7625 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7626 	    !force)
7627 		min_allocable_bytes = 1 * 1024 * 1024;
7628 	else
7629 		min_allocable_bytes = 0;
7630 
7631 	spin_lock(&sinfo->lock);
7632 	spin_lock(&cache->lock);
7633 
7634 	if (cache->ro) {
7635 		ret = 0;
7636 		goto out;
7637 	}
7638 
7639 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7640 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7641 
7642 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7643 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7644 	    min_allocable_bytes <= sinfo->total_bytes) {
7645 		sinfo->bytes_readonly += num_bytes;
7646 		cache->ro = 1;
7647 		ret = 0;
7648 	}
7649 out:
7650 	spin_unlock(&cache->lock);
7651 	spin_unlock(&sinfo->lock);
7652 	return ret;
7653 }
7654 
7655 int btrfs_set_block_group_ro(struct btrfs_root *root,
7656 			     struct btrfs_block_group_cache *cache)
7657 
7658 {
7659 	struct btrfs_trans_handle *trans;
7660 	u64 alloc_flags;
7661 	int ret;
7662 
7663 	BUG_ON(cache->ro);
7664 
7665 	trans = btrfs_join_transaction(root);
7666 	if (IS_ERR(trans))
7667 		return PTR_ERR(trans);
7668 
7669 	alloc_flags = update_block_group_flags(root, cache->flags);
7670 	if (alloc_flags != cache->flags) {
7671 		ret = do_chunk_alloc(trans, root, alloc_flags,
7672 				     CHUNK_ALLOC_FORCE);
7673 		if (ret < 0)
7674 			goto out;
7675 	}
7676 
7677 	ret = set_block_group_ro(cache, 0);
7678 	if (!ret)
7679 		goto out;
7680 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7681 	ret = do_chunk_alloc(trans, root, alloc_flags,
7682 			     CHUNK_ALLOC_FORCE);
7683 	if (ret < 0)
7684 		goto out;
7685 	ret = set_block_group_ro(cache, 0);
7686 out:
7687 	btrfs_end_transaction(trans, root);
7688 	return ret;
7689 }
7690 
7691 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7692 			    struct btrfs_root *root, u64 type)
7693 {
7694 	u64 alloc_flags = get_alloc_profile(root, type);
7695 	return do_chunk_alloc(trans, root, alloc_flags,
7696 			      CHUNK_ALLOC_FORCE);
7697 }
7698 
7699 /*
7700  * helper to account the unused space of all the readonly block group in the
7701  * list. takes mirrors into account.
7702  */
7703 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7704 {
7705 	struct btrfs_block_group_cache *block_group;
7706 	u64 free_bytes = 0;
7707 	int factor;
7708 
7709 	list_for_each_entry(block_group, groups_list, list) {
7710 		spin_lock(&block_group->lock);
7711 
7712 		if (!block_group->ro) {
7713 			spin_unlock(&block_group->lock);
7714 			continue;
7715 		}
7716 
7717 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7718 					  BTRFS_BLOCK_GROUP_RAID10 |
7719 					  BTRFS_BLOCK_GROUP_DUP))
7720 			factor = 2;
7721 		else
7722 			factor = 1;
7723 
7724 		free_bytes += (block_group->key.offset -
7725 			       btrfs_block_group_used(&block_group->item)) *
7726 			       factor;
7727 
7728 		spin_unlock(&block_group->lock);
7729 	}
7730 
7731 	return free_bytes;
7732 }
7733 
7734 /*
7735  * helper to account the unused space of all the readonly block group in the
7736  * space_info. takes mirrors into account.
7737  */
7738 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7739 {
7740 	int i;
7741 	u64 free_bytes = 0;
7742 
7743 	spin_lock(&sinfo->lock);
7744 
7745 	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7746 		if (!list_empty(&sinfo->block_groups[i]))
7747 			free_bytes += __btrfs_get_ro_block_group_free_space(
7748 						&sinfo->block_groups[i]);
7749 
7750 	spin_unlock(&sinfo->lock);
7751 
7752 	return free_bytes;
7753 }
7754 
7755 void btrfs_set_block_group_rw(struct btrfs_root *root,
7756 			      struct btrfs_block_group_cache *cache)
7757 {
7758 	struct btrfs_space_info *sinfo = cache->space_info;
7759 	u64 num_bytes;
7760 
7761 	BUG_ON(!cache->ro);
7762 
7763 	spin_lock(&sinfo->lock);
7764 	spin_lock(&cache->lock);
7765 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7766 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7767 	sinfo->bytes_readonly -= num_bytes;
7768 	cache->ro = 0;
7769 	spin_unlock(&cache->lock);
7770 	spin_unlock(&sinfo->lock);
7771 }
7772 
7773 /*
7774  * checks to see if its even possible to relocate this block group.
7775  *
7776  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
7777  * ok to go ahead and try.
7778  */
7779 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7780 {
7781 	struct btrfs_block_group_cache *block_group;
7782 	struct btrfs_space_info *space_info;
7783 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7784 	struct btrfs_device *device;
7785 	u64 min_free;
7786 	u64 dev_min = 1;
7787 	u64 dev_nr = 0;
7788 	u64 target;
7789 	int index;
7790 	int full = 0;
7791 	int ret = 0;
7792 
7793 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7794 
7795 	/* odd, couldn't find the block group, leave it alone */
7796 	if (!block_group)
7797 		return -1;
7798 
7799 	min_free = btrfs_block_group_used(&block_group->item);
7800 
7801 	/* no bytes used, we're good */
7802 	if (!min_free)
7803 		goto out;
7804 
7805 	space_info = block_group->space_info;
7806 	spin_lock(&space_info->lock);
7807 
7808 	full = space_info->full;
7809 
7810 	/*
7811 	 * if this is the last block group we have in this space, we can't
7812 	 * relocate it unless we're able to allocate a new chunk below.
7813 	 *
7814 	 * Otherwise, we need to make sure we have room in the space to handle
7815 	 * all of the extents from this block group.  If we can, we're good
7816 	 */
7817 	if ((space_info->total_bytes != block_group->key.offset) &&
7818 	    (space_info->bytes_used + space_info->bytes_reserved +
7819 	     space_info->bytes_pinned + space_info->bytes_readonly +
7820 	     min_free < space_info->total_bytes)) {
7821 		spin_unlock(&space_info->lock);
7822 		goto out;
7823 	}
7824 	spin_unlock(&space_info->lock);
7825 
7826 	/*
7827 	 * ok we don't have enough space, but maybe we have free space on our
7828 	 * devices to allocate new chunks for relocation, so loop through our
7829 	 * alloc devices and guess if we have enough space.  if this block
7830 	 * group is going to be restriped, run checks against the target
7831 	 * profile instead of the current one.
7832 	 */
7833 	ret = -1;
7834 
7835 	/*
7836 	 * index:
7837 	 *      0: raid10
7838 	 *      1: raid1
7839 	 *      2: dup
7840 	 *      3: raid0
7841 	 *      4: single
7842 	 */
7843 	target = get_restripe_target(root->fs_info, block_group->flags);
7844 	if (target) {
7845 		index = __get_raid_index(extended_to_chunk(target));
7846 	} else {
7847 		/*
7848 		 * this is just a balance, so if we were marked as full
7849 		 * we know there is no space for a new chunk
7850 		 */
7851 		if (full)
7852 			goto out;
7853 
7854 		index = get_block_group_index(block_group);
7855 	}
7856 
7857 	if (index == BTRFS_RAID_RAID10) {
7858 		dev_min = 4;
7859 		/* Divide by 2 */
7860 		min_free >>= 1;
7861 	} else if (index == BTRFS_RAID_RAID1) {
7862 		dev_min = 2;
7863 	} else if (index == BTRFS_RAID_DUP) {
7864 		/* Multiply by 2 */
7865 		min_free <<= 1;
7866 	} else if (index == BTRFS_RAID_RAID0) {
7867 		dev_min = fs_devices->rw_devices;
7868 		do_div(min_free, dev_min);
7869 	}
7870 
7871 	mutex_lock(&root->fs_info->chunk_mutex);
7872 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7873 		u64 dev_offset;
7874 
7875 		/*
7876 		 * check to make sure we can actually find a chunk with enough
7877 		 * space to fit our block group in.
7878 		 */
7879 		if (device->total_bytes > device->bytes_used + min_free &&
7880 		    !device->is_tgtdev_for_dev_replace) {
7881 			ret = find_free_dev_extent(device, min_free,
7882 						   &dev_offset, NULL);
7883 			if (!ret)
7884 				dev_nr++;
7885 
7886 			if (dev_nr >= dev_min)
7887 				break;
7888 
7889 			ret = -1;
7890 		}
7891 	}
7892 	mutex_unlock(&root->fs_info->chunk_mutex);
7893 out:
7894 	btrfs_put_block_group(block_group);
7895 	return ret;
7896 }
7897 
7898 static int find_first_block_group(struct btrfs_root *root,
7899 		struct btrfs_path *path, struct btrfs_key *key)
7900 {
7901 	int ret = 0;
7902 	struct btrfs_key found_key;
7903 	struct extent_buffer *leaf;
7904 	int slot;
7905 
7906 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7907 	if (ret < 0)
7908 		goto out;
7909 
7910 	while (1) {
7911 		slot = path->slots[0];
7912 		leaf = path->nodes[0];
7913 		if (slot >= btrfs_header_nritems(leaf)) {
7914 			ret = btrfs_next_leaf(root, path);
7915 			if (ret == 0)
7916 				continue;
7917 			if (ret < 0)
7918 				goto out;
7919 			break;
7920 		}
7921 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
7922 
7923 		if (found_key.objectid >= key->objectid &&
7924 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7925 			ret = 0;
7926 			goto out;
7927 		}
7928 		path->slots[0]++;
7929 	}
7930 out:
7931 	return ret;
7932 }
7933 
7934 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7935 {
7936 	struct btrfs_block_group_cache *block_group;
7937 	u64 last = 0;
7938 
7939 	while (1) {
7940 		struct inode *inode;
7941 
7942 		block_group = btrfs_lookup_first_block_group(info, last);
7943 		while (block_group) {
7944 			spin_lock(&block_group->lock);
7945 			if (block_group->iref)
7946 				break;
7947 			spin_unlock(&block_group->lock);
7948 			block_group = next_block_group(info->tree_root,
7949 						       block_group);
7950 		}
7951 		if (!block_group) {
7952 			if (last == 0)
7953 				break;
7954 			last = 0;
7955 			continue;
7956 		}
7957 
7958 		inode = block_group->inode;
7959 		block_group->iref = 0;
7960 		block_group->inode = NULL;
7961 		spin_unlock(&block_group->lock);
7962 		iput(inode);
7963 		last = block_group->key.objectid + block_group->key.offset;
7964 		btrfs_put_block_group(block_group);
7965 	}
7966 }
7967 
7968 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7969 {
7970 	struct btrfs_block_group_cache *block_group;
7971 	struct btrfs_space_info *space_info;
7972 	struct btrfs_caching_control *caching_ctl;
7973 	struct rb_node *n;
7974 
7975 	down_write(&info->extent_commit_sem);
7976 	while (!list_empty(&info->caching_block_groups)) {
7977 		caching_ctl = list_entry(info->caching_block_groups.next,
7978 					 struct btrfs_caching_control, list);
7979 		list_del(&caching_ctl->list);
7980 		put_caching_control(caching_ctl);
7981 	}
7982 	up_write(&info->extent_commit_sem);
7983 
7984 	spin_lock(&info->block_group_cache_lock);
7985 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7986 		block_group = rb_entry(n, struct btrfs_block_group_cache,
7987 				       cache_node);
7988 		rb_erase(&block_group->cache_node,
7989 			 &info->block_group_cache_tree);
7990 		spin_unlock(&info->block_group_cache_lock);
7991 
7992 		down_write(&block_group->space_info->groups_sem);
7993 		list_del(&block_group->list);
7994 		up_write(&block_group->space_info->groups_sem);
7995 
7996 		if (block_group->cached == BTRFS_CACHE_STARTED)
7997 			wait_block_group_cache_done(block_group);
7998 
7999 		/*
8000 		 * We haven't cached this block group, which means we could
8001 		 * possibly have excluded extents on this block group.
8002 		 */
8003 		if (block_group->cached == BTRFS_CACHE_NO)
8004 			free_excluded_extents(info->extent_root, block_group);
8005 
8006 		btrfs_remove_free_space_cache(block_group);
8007 		btrfs_put_block_group(block_group);
8008 
8009 		spin_lock(&info->block_group_cache_lock);
8010 	}
8011 	spin_unlock(&info->block_group_cache_lock);
8012 
8013 	/* now that all the block groups are freed, go through and
8014 	 * free all the space_info structs.  This is only called during
8015 	 * the final stages of unmount, and so we know nobody is
8016 	 * using them.  We call synchronize_rcu() once before we start,
8017 	 * just to be on the safe side.
8018 	 */
8019 	synchronize_rcu();
8020 
8021 	release_global_block_rsv(info);
8022 
8023 	while(!list_empty(&info->space_info)) {
8024 		space_info = list_entry(info->space_info.next,
8025 					struct btrfs_space_info,
8026 					list);
8027 		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
8028 			if (space_info->bytes_pinned > 0 ||
8029 			    space_info->bytes_reserved > 0 ||
8030 			    space_info->bytes_may_use > 0) {
8031 				WARN_ON(1);
8032 				dump_space_info(space_info, 0, 0);
8033 			}
8034 		}
8035 		list_del(&space_info->list);
8036 		kfree(space_info);
8037 	}
8038 	return 0;
8039 }
8040 
8041 static void __link_block_group(struct btrfs_space_info *space_info,
8042 			       struct btrfs_block_group_cache *cache)
8043 {
8044 	int index = get_block_group_index(cache);
8045 
8046 	down_write(&space_info->groups_sem);
8047 	list_add_tail(&cache->list, &space_info->block_groups[index]);
8048 	up_write(&space_info->groups_sem);
8049 }
8050 
8051 int btrfs_read_block_groups(struct btrfs_root *root)
8052 {
8053 	struct btrfs_path *path;
8054 	int ret;
8055 	struct btrfs_block_group_cache *cache;
8056 	struct btrfs_fs_info *info = root->fs_info;
8057 	struct btrfs_space_info *space_info;
8058 	struct btrfs_key key;
8059 	struct btrfs_key found_key;
8060 	struct extent_buffer *leaf;
8061 	int need_clear = 0;
8062 	u64 cache_gen;
8063 
8064 	root = info->extent_root;
8065 	key.objectid = 0;
8066 	key.offset = 0;
8067 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
8068 	path = btrfs_alloc_path();
8069 	if (!path)
8070 		return -ENOMEM;
8071 	path->reada = 1;
8072 
8073 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
8074 	if (btrfs_test_opt(root, SPACE_CACHE) &&
8075 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
8076 		need_clear = 1;
8077 	if (btrfs_test_opt(root, CLEAR_CACHE))
8078 		need_clear = 1;
8079 
8080 	while (1) {
8081 		ret = find_first_block_group(root, path, &key);
8082 		if (ret > 0)
8083 			break;
8084 		if (ret != 0)
8085 			goto error;
8086 		leaf = path->nodes[0];
8087 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8088 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
8089 		if (!cache) {
8090 			ret = -ENOMEM;
8091 			goto error;
8092 		}
8093 		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8094 						GFP_NOFS);
8095 		if (!cache->free_space_ctl) {
8096 			kfree(cache);
8097 			ret = -ENOMEM;
8098 			goto error;
8099 		}
8100 
8101 		atomic_set(&cache->count, 1);
8102 		spin_lock_init(&cache->lock);
8103 		cache->fs_info = info;
8104 		INIT_LIST_HEAD(&cache->list);
8105 		INIT_LIST_HEAD(&cache->cluster_list);
8106 
8107 		if (need_clear) {
8108 			/*
8109 			 * When we mount with old space cache, we need to
8110 			 * set BTRFS_DC_CLEAR and set dirty flag.
8111 			 *
8112 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
8113 			 *    truncate the old free space cache inode and
8114 			 *    setup a new one.
8115 			 * b) Setting 'dirty flag' makes sure that we flush
8116 			 *    the new space cache info onto disk.
8117 			 */
8118 			cache->disk_cache_state = BTRFS_DC_CLEAR;
8119 			if (btrfs_test_opt(root, SPACE_CACHE))
8120 				cache->dirty = 1;
8121 		}
8122 
8123 		read_extent_buffer(leaf, &cache->item,
8124 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
8125 				   sizeof(cache->item));
8126 		memcpy(&cache->key, &found_key, sizeof(found_key));
8127 
8128 		key.objectid = found_key.objectid + found_key.offset;
8129 		btrfs_release_path(path);
8130 		cache->flags = btrfs_block_group_flags(&cache->item);
8131 		cache->sectorsize = root->sectorsize;
8132 		cache->full_stripe_len = btrfs_full_stripe_len(root,
8133 					       &root->fs_info->mapping_tree,
8134 					       found_key.objectid);
8135 		btrfs_init_free_space_ctl(cache);
8136 
8137 		/*
8138 		 * We need to exclude the super stripes now so that the space
8139 		 * info has super bytes accounted for, otherwise we'll think
8140 		 * we have more space than we actually do.
8141 		 */
8142 		ret = exclude_super_stripes(root, cache);
8143 		if (ret) {
8144 			/*
8145 			 * We may have excluded something, so call this just in
8146 			 * case.
8147 			 */
8148 			free_excluded_extents(root, cache);
8149 			kfree(cache->free_space_ctl);
8150 			kfree(cache);
8151 			goto error;
8152 		}
8153 
8154 		/*
8155 		 * check for two cases, either we are full, and therefore
8156 		 * don't need to bother with the caching work since we won't
8157 		 * find any space, or we are empty, and we can just add all
8158 		 * the space in and be done with it.  This saves us _alot_ of
8159 		 * time, particularly in the full case.
8160 		 */
8161 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8162 			cache->last_byte_to_unpin = (u64)-1;
8163 			cache->cached = BTRFS_CACHE_FINISHED;
8164 			free_excluded_extents(root, cache);
8165 		} else if (btrfs_block_group_used(&cache->item) == 0) {
8166 			cache->last_byte_to_unpin = (u64)-1;
8167 			cache->cached = BTRFS_CACHE_FINISHED;
8168 			add_new_free_space(cache, root->fs_info,
8169 					   found_key.objectid,
8170 					   found_key.objectid +
8171 					   found_key.offset);
8172 			free_excluded_extents(root, cache);
8173 		}
8174 
8175 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
8176 		if (ret) {
8177 			btrfs_remove_free_space_cache(cache);
8178 			btrfs_put_block_group(cache);
8179 			goto error;
8180 		}
8181 
8182 		ret = update_space_info(info, cache->flags, found_key.offset,
8183 					btrfs_block_group_used(&cache->item),
8184 					&space_info);
8185 		if (ret) {
8186 			btrfs_remove_free_space_cache(cache);
8187 			spin_lock(&info->block_group_cache_lock);
8188 			rb_erase(&cache->cache_node,
8189 				 &info->block_group_cache_tree);
8190 			spin_unlock(&info->block_group_cache_lock);
8191 			btrfs_put_block_group(cache);
8192 			goto error;
8193 		}
8194 
8195 		cache->space_info = space_info;
8196 		spin_lock(&cache->space_info->lock);
8197 		cache->space_info->bytes_readonly += cache->bytes_super;
8198 		spin_unlock(&cache->space_info->lock);
8199 
8200 		__link_block_group(space_info, cache);
8201 
8202 		set_avail_alloc_bits(root->fs_info, cache->flags);
8203 		if (btrfs_chunk_readonly(root, cache->key.objectid))
8204 			set_block_group_ro(cache, 1);
8205 	}
8206 
8207 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8208 		if (!(get_alloc_profile(root, space_info->flags) &
8209 		      (BTRFS_BLOCK_GROUP_RAID10 |
8210 		       BTRFS_BLOCK_GROUP_RAID1 |
8211 		       BTRFS_BLOCK_GROUP_RAID5 |
8212 		       BTRFS_BLOCK_GROUP_RAID6 |
8213 		       BTRFS_BLOCK_GROUP_DUP)))
8214 			continue;
8215 		/*
8216 		 * avoid allocating from un-mirrored block group if there are
8217 		 * mirrored block groups.
8218 		 */
8219 		list_for_each_entry(cache, &space_info->block_groups[3], list)
8220 			set_block_group_ro(cache, 1);
8221 		list_for_each_entry(cache, &space_info->block_groups[4], list)
8222 			set_block_group_ro(cache, 1);
8223 	}
8224 
8225 	init_global_block_rsv(info);
8226 	ret = 0;
8227 error:
8228 	btrfs_free_path(path);
8229 	return ret;
8230 }
8231 
8232 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8233 				       struct btrfs_root *root)
8234 {
8235 	struct btrfs_block_group_cache *block_group, *tmp;
8236 	struct btrfs_root *extent_root = root->fs_info->extent_root;
8237 	struct btrfs_block_group_item item;
8238 	struct btrfs_key key;
8239 	int ret = 0;
8240 
8241 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
8242 				 new_bg_list) {
8243 		list_del_init(&block_group->new_bg_list);
8244 
8245 		if (ret)
8246 			continue;
8247 
8248 		spin_lock(&block_group->lock);
8249 		memcpy(&item, &block_group->item, sizeof(item));
8250 		memcpy(&key, &block_group->key, sizeof(key));
8251 		spin_unlock(&block_group->lock);
8252 
8253 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
8254 					sizeof(item));
8255 		if (ret)
8256 			btrfs_abort_transaction(trans, extent_root, ret);
8257 	}
8258 }
8259 
8260 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8261 			   struct btrfs_root *root, u64 bytes_used,
8262 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
8263 			   u64 size)
8264 {
8265 	int ret;
8266 	struct btrfs_root *extent_root;
8267 	struct btrfs_block_group_cache *cache;
8268 
8269 	extent_root = root->fs_info->extent_root;
8270 
8271 	root->fs_info->last_trans_log_full_commit = trans->transid;
8272 
8273 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
8274 	if (!cache)
8275 		return -ENOMEM;
8276 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8277 					GFP_NOFS);
8278 	if (!cache->free_space_ctl) {
8279 		kfree(cache);
8280 		return -ENOMEM;
8281 	}
8282 
8283 	cache->key.objectid = chunk_offset;
8284 	cache->key.offset = size;
8285 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8286 	cache->sectorsize = root->sectorsize;
8287 	cache->fs_info = root->fs_info;
8288 	cache->full_stripe_len = btrfs_full_stripe_len(root,
8289 					       &root->fs_info->mapping_tree,
8290 					       chunk_offset);
8291 
8292 	atomic_set(&cache->count, 1);
8293 	spin_lock_init(&cache->lock);
8294 	INIT_LIST_HEAD(&cache->list);
8295 	INIT_LIST_HEAD(&cache->cluster_list);
8296 	INIT_LIST_HEAD(&cache->new_bg_list);
8297 
8298 	btrfs_init_free_space_ctl(cache);
8299 
8300 	btrfs_set_block_group_used(&cache->item, bytes_used);
8301 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8302 	cache->flags = type;
8303 	btrfs_set_block_group_flags(&cache->item, type);
8304 
8305 	cache->last_byte_to_unpin = (u64)-1;
8306 	cache->cached = BTRFS_CACHE_FINISHED;
8307 	ret = exclude_super_stripes(root, cache);
8308 	if (ret) {
8309 		/*
8310 		 * We may have excluded something, so call this just in
8311 		 * case.
8312 		 */
8313 		free_excluded_extents(root, cache);
8314 		kfree(cache->free_space_ctl);
8315 		kfree(cache);
8316 		return ret;
8317 	}
8318 
8319 	add_new_free_space(cache, root->fs_info, chunk_offset,
8320 			   chunk_offset + size);
8321 
8322 	free_excluded_extents(root, cache);
8323 
8324 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
8325 	if (ret) {
8326 		btrfs_remove_free_space_cache(cache);
8327 		btrfs_put_block_group(cache);
8328 		return ret;
8329 	}
8330 
8331 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8332 				&cache->space_info);
8333 	if (ret) {
8334 		btrfs_remove_free_space_cache(cache);
8335 		spin_lock(&root->fs_info->block_group_cache_lock);
8336 		rb_erase(&cache->cache_node,
8337 			 &root->fs_info->block_group_cache_tree);
8338 		spin_unlock(&root->fs_info->block_group_cache_lock);
8339 		btrfs_put_block_group(cache);
8340 		return ret;
8341 	}
8342 	update_global_block_rsv(root->fs_info);
8343 
8344 	spin_lock(&cache->space_info->lock);
8345 	cache->space_info->bytes_readonly += cache->bytes_super;
8346 	spin_unlock(&cache->space_info->lock);
8347 
8348 	__link_block_group(cache->space_info, cache);
8349 
8350 	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
8351 
8352 	set_avail_alloc_bits(extent_root->fs_info, type);
8353 
8354 	return 0;
8355 }
8356 
8357 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
8358 {
8359 	u64 extra_flags = chunk_to_extended(flags) &
8360 				BTRFS_EXTENDED_PROFILE_MASK;
8361 
8362 	write_seqlock(&fs_info->profiles_lock);
8363 	if (flags & BTRFS_BLOCK_GROUP_DATA)
8364 		fs_info->avail_data_alloc_bits &= ~extra_flags;
8365 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
8366 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8367 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8368 		fs_info->avail_system_alloc_bits &= ~extra_flags;
8369 	write_sequnlock(&fs_info->profiles_lock);
8370 }
8371 
8372 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8373 			     struct btrfs_root *root, u64 group_start)
8374 {
8375 	struct btrfs_path *path;
8376 	struct btrfs_block_group_cache *block_group;
8377 	struct btrfs_free_cluster *cluster;
8378 	struct btrfs_root *tree_root = root->fs_info->tree_root;
8379 	struct btrfs_key key;
8380 	struct inode *inode;
8381 	int ret;
8382 	int index;
8383 	int factor;
8384 
8385 	root = root->fs_info->extent_root;
8386 
8387 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8388 	BUG_ON(!block_group);
8389 	BUG_ON(!block_group->ro);
8390 
8391 	/*
8392 	 * Free the reserved super bytes from this block group before
8393 	 * remove it.
8394 	 */
8395 	free_excluded_extents(root, block_group);
8396 
8397 	memcpy(&key, &block_group->key, sizeof(key));
8398 	index = get_block_group_index(block_group);
8399 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8400 				  BTRFS_BLOCK_GROUP_RAID1 |
8401 				  BTRFS_BLOCK_GROUP_RAID10))
8402 		factor = 2;
8403 	else
8404 		factor = 1;
8405 
8406 	/* make sure this block group isn't part of an allocation cluster */
8407 	cluster = &root->fs_info->data_alloc_cluster;
8408 	spin_lock(&cluster->refill_lock);
8409 	btrfs_return_cluster_to_free_space(block_group, cluster);
8410 	spin_unlock(&cluster->refill_lock);
8411 
8412 	/*
8413 	 * make sure this block group isn't part of a metadata
8414 	 * allocation cluster
8415 	 */
8416 	cluster = &root->fs_info->meta_alloc_cluster;
8417 	spin_lock(&cluster->refill_lock);
8418 	btrfs_return_cluster_to_free_space(block_group, cluster);
8419 	spin_unlock(&cluster->refill_lock);
8420 
8421 	path = btrfs_alloc_path();
8422 	if (!path) {
8423 		ret = -ENOMEM;
8424 		goto out;
8425 	}
8426 
8427 	inode = lookup_free_space_inode(tree_root, block_group, path);
8428 	if (!IS_ERR(inode)) {
8429 		ret = btrfs_orphan_add(trans, inode);
8430 		if (ret) {
8431 			btrfs_add_delayed_iput(inode);
8432 			goto out;
8433 		}
8434 		clear_nlink(inode);
8435 		/* One for the block groups ref */
8436 		spin_lock(&block_group->lock);
8437 		if (block_group->iref) {
8438 			block_group->iref = 0;
8439 			block_group->inode = NULL;
8440 			spin_unlock(&block_group->lock);
8441 			iput(inode);
8442 		} else {
8443 			spin_unlock(&block_group->lock);
8444 		}
8445 		/* One for our lookup ref */
8446 		btrfs_add_delayed_iput(inode);
8447 	}
8448 
8449 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8450 	key.offset = block_group->key.objectid;
8451 	key.type = 0;
8452 
8453 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8454 	if (ret < 0)
8455 		goto out;
8456 	if (ret > 0)
8457 		btrfs_release_path(path);
8458 	if (ret == 0) {
8459 		ret = btrfs_del_item(trans, tree_root, path);
8460 		if (ret)
8461 			goto out;
8462 		btrfs_release_path(path);
8463 	}
8464 
8465 	spin_lock(&root->fs_info->block_group_cache_lock);
8466 	rb_erase(&block_group->cache_node,
8467 		 &root->fs_info->block_group_cache_tree);
8468 
8469 	if (root->fs_info->first_logical_byte == block_group->key.objectid)
8470 		root->fs_info->first_logical_byte = (u64)-1;
8471 	spin_unlock(&root->fs_info->block_group_cache_lock);
8472 
8473 	down_write(&block_group->space_info->groups_sem);
8474 	/*
8475 	 * we must use list_del_init so people can check to see if they
8476 	 * are still on the list after taking the semaphore
8477 	 */
8478 	list_del_init(&block_group->list);
8479 	if (list_empty(&block_group->space_info->block_groups[index]))
8480 		clear_avail_alloc_bits(root->fs_info, block_group->flags);
8481 	up_write(&block_group->space_info->groups_sem);
8482 
8483 	if (block_group->cached == BTRFS_CACHE_STARTED)
8484 		wait_block_group_cache_done(block_group);
8485 
8486 	btrfs_remove_free_space_cache(block_group);
8487 
8488 	spin_lock(&block_group->space_info->lock);
8489 	block_group->space_info->total_bytes -= block_group->key.offset;
8490 	block_group->space_info->bytes_readonly -= block_group->key.offset;
8491 	block_group->space_info->disk_total -= block_group->key.offset * factor;
8492 	spin_unlock(&block_group->space_info->lock);
8493 
8494 	memcpy(&key, &block_group->key, sizeof(key));
8495 
8496 	btrfs_clear_space_info_full(root->fs_info);
8497 
8498 	btrfs_put_block_group(block_group);
8499 	btrfs_put_block_group(block_group);
8500 
8501 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8502 	if (ret > 0)
8503 		ret = -EIO;
8504 	if (ret < 0)
8505 		goto out;
8506 
8507 	ret = btrfs_del_item(trans, root, path);
8508 out:
8509 	btrfs_free_path(path);
8510 	return ret;
8511 }
8512 
8513 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8514 {
8515 	struct btrfs_space_info *space_info;
8516 	struct btrfs_super_block *disk_super;
8517 	u64 features;
8518 	u64 flags;
8519 	int mixed = 0;
8520 	int ret;
8521 
8522 	disk_super = fs_info->super_copy;
8523 	if (!btrfs_super_root(disk_super))
8524 		return 1;
8525 
8526 	features = btrfs_super_incompat_flags(disk_super);
8527 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8528 		mixed = 1;
8529 
8530 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
8531 	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8532 	if (ret)
8533 		goto out;
8534 
8535 	if (mixed) {
8536 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8537 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8538 	} else {
8539 		flags = BTRFS_BLOCK_GROUP_METADATA;
8540 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8541 		if (ret)
8542 			goto out;
8543 
8544 		flags = BTRFS_BLOCK_GROUP_DATA;
8545 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8546 	}
8547 out:
8548 	return ret;
8549 }
8550 
8551 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8552 {
8553 	return unpin_extent_range(root, start, end);
8554 }
8555 
8556 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8557 			       u64 num_bytes, u64 *actual_bytes)
8558 {
8559 	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8560 }
8561 
8562 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8563 {
8564 	struct btrfs_fs_info *fs_info = root->fs_info;
8565 	struct btrfs_block_group_cache *cache = NULL;
8566 	u64 group_trimmed;
8567 	u64 start;
8568 	u64 end;
8569 	u64 trimmed = 0;
8570 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8571 	int ret = 0;
8572 
8573 	/*
8574 	 * try to trim all FS space, our block group may start from non-zero.
8575 	 */
8576 	if (range->len == total_bytes)
8577 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
8578 	else
8579 		cache = btrfs_lookup_block_group(fs_info, range->start);
8580 
8581 	while (cache) {
8582 		if (cache->key.objectid >= (range->start + range->len)) {
8583 			btrfs_put_block_group(cache);
8584 			break;
8585 		}
8586 
8587 		start = max(range->start, cache->key.objectid);
8588 		end = min(range->start + range->len,
8589 				cache->key.objectid + cache->key.offset);
8590 
8591 		if (end - start >= range->minlen) {
8592 			if (!block_group_cache_done(cache)) {
8593 				ret = cache_block_group(cache, 0);
8594 				if (!ret)
8595 					wait_block_group_cache_done(cache);
8596 			}
8597 			ret = btrfs_trim_block_group(cache,
8598 						     &group_trimmed,
8599 						     start,
8600 						     end,
8601 						     range->minlen);
8602 
8603 			trimmed += group_trimmed;
8604 			if (ret) {
8605 				btrfs_put_block_group(cache);
8606 				break;
8607 			}
8608 		}
8609 
8610 		cache = next_block_group(fs_info->tree_root, cache);
8611 	}
8612 
8613 	range->len = trimmed;
8614 	return ret;
8615 }
8616