xref: /openbmc/linux/fs/btrfs/extent-tree.c (revision b24413180f5600bcb3bb70fbed5cf186b60864bd)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/sched/signal.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/sort.h>
24 #include <linux/rcupdate.h>
25 #include <linux/kthread.h>
26 #include <linux/slab.h>
27 #include <linux/ratelimit.h>
28 #include <linux/percpu_counter.h>
29 #include "hash.h"
30 #include "tree-log.h"
31 #include "disk-io.h"
32 #include "print-tree.h"
33 #include "volumes.h"
34 #include "raid56.h"
35 #include "locking.h"
36 #include "free-space-cache.h"
37 #include "free-space-tree.h"
38 #include "math.h"
39 #include "sysfs.h"
40 #include "qgroup.h"
41 
42 #undef SCRAMBLE_DELAYED_REFS
43 
/*
 * Values accepted by the 'force' argument of do_chunk_alloc():
 *
 * CHUNK_ALLOC_NO_FORCE - only allocate a chunk if we really need one.
 *
 * CHUNK_ALLOC_LIMITED  - only try to allocate one if we have very few
 *                        chunks already allocated.  This is used as part
 *                        of the clustering code to help make sure we have
 *                        a good pool of storage to cluster in, without
 *                        filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE    - it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE,	/* 0 */
	CHUNK_ALLOC_LIMITED,	/* 1 */
	CHUNK_ALLOC_FORCE,	/* 2 */
};
63 
64 static int update_block_group(struct btrfs_trans_handle *trans,
65 			      struct btrfs_fs_info *fs_info, u64 bytenr,
66 			      u64 num_bytes, int alloc);
67 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
68 			       struct btrfs_fs_info *fs_info,
69 				struct btrfs_delayed_ref_node *node, u64 parent,
70 				u64 root_objectid, u64 owner_objectid,
71 				u64 owner_offset, int refs_to_drop,
72 				struct btrfs_delayed_extent_op *extra_op);
73 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
74 				    struct extent_buffer *leaf,
75 				    struct btrfs_extent_item *ei);
76 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
77 				      struct btrfs_fs_info *fs_info,
78 				      u64 parent, u64 root_objectid,
79 				      u64 flags, u64 owner, u64 offset,
80 				      struct btrfs_key *ins, int ref_mod);
81 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
82 				     struct btrfs_fs_info *fs_info,
83 				     u64 parent, u64 root_objectid,
84 				     u64 flags, struct btrfs_disk_key *key,
85 				     int level, struct btrfs_key *ins);
86 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
87 			  struct btrfs_fs_info *fs_info, u64 flags,
88 			  int force);
89 static int find_next_key(struct btrfs_path *path, int level,
90 			 struct btrfs_key *key);
91 static void dump_space_info(struct btrfs_fs_info *fs_info,
92 			    struct btrfs_space_info *info, u64 bytes,
93 			    int dump_block_groups);
94 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
95 				    u64 ram_bytes, u64 num_bytes, int delalloc);
96 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
97 				     u64 num_bytes, int delalloc);
98 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
99 			       u64 num_bytes);
100 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
101 				    struct btrfs_space_info *space_info,
102 				    u64 orig_bytes,
103 				    enum btrfs_reserve_flush_enum flush,
104 				    bool system_chunk);
105 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
106 				     struct btrfs_space_info *space_info,
107 				     u64 num_bytes);
108 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
109 				     struct btrfs_space_info *space_info,
110 				     u64 num_bytes);
111 
112 static noinline int
113 block_group_cache_done(struct btrfs_block_group_cache *cache)
114 {
115 	smp_mb();
116 	return cache->cached == BTRFS_CACHE_FINISHED ||
117 		cache->cached == BTRFS_CACHE_ERROR;
118 }
119 
120 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
121 {
122 	return (cache->flags & bits) == bits;
123 }
124 
125 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
126 {
127 	atomic_inc(&cache->count);
128 }
129 
130 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
131 {
132 	if (atomic_dec_and_test(&cache->count)) {
133 		WARN_ON(cache->pinned > 0);
134 		WARN_ON(cache->reserved > 0);
135 
136 		/*
137 		 * If not empty, someone is still holding mutex of
138 		 * full_stripe_lock, which can only be released by caller.
139 		 * And it will definitely cause use-after-free when caller
140 		 * tries to release full stripe lock.
141 		 *
142 		 * No better way to resolve, but only to warn.
143 		 */
144 		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
145 		kfree(cache->free_space_ctl);
146 		kfree(cache);
147 	}
148 }
149 
150 /*
151  * this adds the block group to the fs_info rb tree for the block group
152  * cache
153  */
154 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
155 				struct btrfs_block_group_cache *block_group)
156 {
157 	struct rb_node **p;
158 	struct rb_node *parent = NULL;
159 	struct btrfs_block_group_cache *cache;
160 
161 	spin_lock(&info->block_group_cache_lock);
162 	p = &info->block_group_cache_tree.rb_node;
163 
164 	while (*p) {
165 		parent = *p;
166 		cache = rb_entry(parent, struct btrfs_block_group_cache,
167 				 cache_node);
168 		if (block_group->key.objectid < cache->key.objectid) {
169 			p = &(*p)->rb_left;
170 		} else if (block_group->key.objectid > cache->key.objectid) {
171 			p = &(*p)->rb_right;
172 		} else {
173 			spin_unlock(&info->block_group_cache_lock);
174 			return -EEXIST;
175 		}
176 	}
177 
178 	rb_link_node(&block_group->cache_node, parent, p);
179 	rb_insert_color(&block_group->cache_node,
180 			&info->block_group_cache_tree);
181 
182 	if (info->first_logical_byte > block_group->key.objectid)
183 		info->first_logical_byte = block_group->key.objectid;
184 
185 	spin_unlock(&info->block_group_cache_lock);
186 
187 	return 0;
188 }
189 
190 /*
191  * This will return the block group at or after bytenr if contains is 0, else
192  * it will return the block group that contains the bytenr
193  */
194 static struct btrfs_block_group_cache *
195 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
196 			      int contains)
197 {
198 	struct btrfs_block_group_cache *cache, *ret = NULL;
199 	struct rb_node *n;
200 	u64 end, start;
201 
202 	spin_lock(&info->block_group_cache_lock);
203 	n = info->block_group_cache_tree.rb_node;
204 
205 	while (n) {
206 		cache = rb_entry(n, struct btrfs_block_group_cache,
207 				 cache_node);
208 		end = cache->key.objectid + cache->key.offset - 1;
209 		start = cache->key.objectid;
210 
211 		if (bytenr < start) {
212 			if (!contains && (!ret || start < ret->key.objectid))
213 				ret = cache;
214 			n = n->rb_left;
215 		} else if (bytenr > start) {
216 			if (contains && bytenr <= end) {
217 				ret = cache;
218 				break;
219 			}
220 			n = n->rb_right;
221 		} else {
222 			ret = cache;
223 			break;
224 		}
225 	}
226 	if (ret) {
227 		btrfs_get_block_group(ret);
228 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
229 			info->first_logical_byte = ret->key.objectid;
230 	}
231 	spin_unlock(&info->block_group_cache_lock);
232 
233 	return ret;
234 }
235 
236 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
237 			       u64 start, u64 num_bytes)
238 {
239 	u64 end = start + num_bytes - 1;
240 	set_extent_bits(&fs_info->freed_extents[0],
241 			start, end, EXTENT_UPTODATE);
242 	set_extent_bits(&fs_info->freed_extents[1],
243 			start, end, EXTENT_UPTODATE);
244 	return 0;
245 }
246 
247 static void free_excluded_extents(struct btrfs_fs_info *fs_info,
248 				  struct btrfs_block_group_cache *cache)
249 {
250 	u64 start, end;
251 
252 	start = cache->key.objectid;
253 	end = start + cache->key.offset - 1;
254 
255 	clear_extent_bits(&fs_info->freed_extents[0],
256 			  start, end, EXTENT_UPTODATE);
257 	clear_extent_bits(&fs_info->freed_extents[1],
258 			  start, end, EXTENT_UPTODATE);
259 }
260 
261 static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
262 				 struct btrfs_block_group_cache *cache)
263 {
264 	u64 bytenr;
265 	u64 *logical;
266 	int stripe_len;
267 	int i, nr, ret;
268 
269 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
270 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
271 		cache->bytes_super += stripe_len;
272 		ret = add_excluded_extent(fs_info, cache->key.objectid,
273 					  stripe_len);
274 		if (ret)
275 			return ret;
276 	}
277 
278 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
279 		bytenr = btrfs_sb_offset(i);
280 		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
281 				       bytenr, 0, &logical, &nr, &stripe_len);
282 		if (ret)
283 			return ret;
284 
285 		while (nr--) {
286 			u64 start, len;
287 
288 			if (logical[nr] > cache->key.objectid +
289 			    cache->key.offset)
290 				continue;
291 
292 			if (logical[nr] + stripe_len <= cache->key.objectid)
293 				continue;
294 
295 			start = logical[nr];
296 			if (start < cache->key.objectid) {
297 				start = cache->key.objectid;
298 				len = (logical[nr] + stripe_len) - start;
299 			} else {
300 				len = min_t(u64, stripe_len,
301 					    cache->key.objectid +
302 					    cache->key.offset - start);
303 			}
304 
305 			cache->bytes_super += len;
306 			ret = add_excluded_extent(fs_info, start, len);
307 			if (ret) {
308 				kfree(logical);
309 				return ret;
310 			}
311 		}
312 
313 		kfree(logical);
314 	}
315 	return 0;
316 }
317 
318 static struct btrfs_caching_control *
319 get_caching_control(struct btrfs_block_group_cache *cache)
320 {
321 	struct btrfs_caching_control *ctl;
322 
323 	spin_lock(&cache->lock);
324 	if (!cache->caching_ctl) {
325 		spin_unlock(&cache->lock);
326 		return NULL;
327 	}
328 
329 	ctl = cache->caching_ctl;
330 	refcount_inc(&ctl->count);
331 	spin_unlock(&cache->lock);
332 	return ctl;
333 }
334 
335 static void put_caching_control(struct btrfs_caching_control *ctl)
336 {
337 	if (refcount_dec_and_test(&ctl->count))
338 		kfree(ctl);
339 }
340 
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug helper: walk @block_group from its start and remove one unit of
 * free space out of every two.  The unit is the node size for metadata
 * block groups and the sector size otherwise.
 */
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 cur = block_group->key.objectid;
	u64 remaining = block_group->key.offset;
	u64 unit;
	u64 stride;

	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA)
		unit = fs_info->nodesize;
	else
		unit = fs_info->sectorsize;
	stride = unit << 1;

	while (remaining > unit) {
		btrfs_remove_free_space(block_group, cur, unit);
		cur += stride;
		/* Do not let the remaining length wrap below zero. */
		remaining = (remaining < stride) ? 0 : remaining - stride;
	}
}
#endif
361 
362 /*
363  * this is only called by cache_block_group, since we could have freed extents
364  * we need to check the pinned_extents for any extents that can't be used yet
365  * since their free space will be released as soon as the transaction commits.
366  */
367 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
368 		       struct btrfs_fs_info *info, u64 start, u64 end)
369 {
370 	u64 extent_start, extent_end, size, total_added = 0;
371 	int ret;
372 
373 	while (start < end) {
374 		ret = find_first_extent_bit(info->pinned_extents, start,
375 					    &extent_start, &extent_end,
376 					    EXTENT_DIRTY | EXTENT_UPTODATE,
377 					    NULL);
378 		if (ret)
379 			break;
380 
381 		if (extent_start <= start) {
382 			start = extent_end + 1;
383 		} else if (extent_start > start && extent_start < end) {
384 			size = extent_start - start;
385 			total_added += size;
386 			ret = btrfs_add_free_space(block_group, start,
387 						   size);
388 			BUG_ON(ret); /* -ENOMEM or logic error */
389 			start = extent_end + 1;
390 		} else {
391 			break;
392 		}
393 	}
394 
395 	if (start < end) {
396 		size = end - start;
397 		total_added += size;
398 		ret = btrfs_add_free_space(block_group, start, size);
399 		BUG_ON(ret); /* -ENOMEM or logic error */
400 	}
401 
402 	return total_added;
403 }
404 
405 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
406 {
407 	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
408 	struct btrfs_fs_info *fs_info = block_group->fs_info;
409 	struct btrfs_root *extent_root = fs_info->extent_root;
410 	struct btrfs_path *path;
411 	struct extent_buffer *leaf;
412 	struct btrfs_key key;
413 	u64 total_found = 0;
414 	u64 last = 0;
415 	u32 nritems;
416 	int ret;
417 	bool wakeup = true;
418 
419 	path = btrfs_alloc_path();
420 	if (!path)
421 		return -ENOMEM;
422 
423 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
424 
425 #ifdef CONFIG_BTRFS_DEBUG
426 	/*
427 	 * If we're fragmenting we don't want to make anybody think we can
428 	 * allocate from this block group until we've had a chance to fragment
429 	 * the free space.
430 	 */
431 	if (btrfs_should_fragment_free_space(block_group))
432 		wakeup = false;
433 #endif
434 	/*
435 	 * We don't want to deadlock with somebody trying to allocate a new
436 	 * extent for the extent root while also trying to search the extent
437 	 * root to add free space.  So we skip locking and search the commit
438 	 * root, since its read-only
439 	 */
440 	path->skip_locking = 1;
441 	path->search_commit_root = 1;
442 	path->reada = READA_FORWARD;
443 
444 	key.objectid = last;
445 	key.offset = 0;
446 	key.type = BTRFS_EXTENT_ITEM_KEY;
447 
448 next:
449 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
450 	if (ret < 0)
451 		goto out;
452 
453 	leaf = path->nodes[0];
454 	nritems = btrfs_header_nritems(leaf);
455 
456 	while (1) {
457 		if (btrfs_fs_closing(fs_info) > 1) {
458 			last = (u64)-1;
459 			break;
460 		}
461 
462 		if (path->slots[0] < nritems) {
463 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
464 		} else {
465 			ret = find_next_key(path, 0, &key);
466 			if (ret)
467 				break;
468 
469 			if (need_resched() ||
470 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
471 				if (wakeup)
472 					caching_ctl->progress = last;
473 				btrfs_release_path(path);
474 				up_read(&fs_info->commit_root_sem);
475 				mutex_unlock(&caching_ctl->mutex);
476 				cond_resched();
477 				mutex_lock(&caching_ctl->mutex);
478 				down_read(&fs_info->commit_root_sem);
479 				goto next;
480 			}
481 
482 			ret = btrfs_next_leaf(extent_root, path);
483 			if (ret < 0)
484 				goto out;
485 			if (ret)
486 				break;
487 			leaf = path->nodes[0];
488 			nritems = btrfs_header_nritems(leaf);
489 			continue;
490 		}
491 
492 		if (key.objectid < last) {
493 			key.objectid = last;
494 			key.offset = 0;
495 			key.type = BTRFS_EXTENT_ITEM_KEY;
496 
497 			if (wakeup)
498 				caching_ctl->progress = last;
499 			btrfs_release_path(path);
500 			goto next;
501 		}
502 
503 		if (key.objectid < block_group->key.objectid) {
504 			path->slots[0]++;
505 			continue;
506 		}
507 
508 		if (key.objectid >= block_group->key.objectid +
509 		    block_group->key.offset)
510 			break;
511 
512 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
513 		    key.type == BTRFS_METADATA_ITEM_KEY) {
514 			total_found += add_new_free_space(block_group,
515 							  fs_info, last,
516 							  key.objectid);
517 			if (key.type == BTRFS_METADATA_ITEM_KEY)
518 				last = key.objectid +
519 					fs_info->nodesize;
520 			else
521 				last = key.objectid + key.offset;
522 
523 			if (total_found > CACHING_CTL_WAKE_UP) {
524 				total_found = 0;
525 				if (wakeup)
526 					wake_up(&caching_ctl->wait);
527 			}
528 		}
529 		path->slots[0]++;
530 	}
531 	ret = 0;
532 
533 	total_found += add_new_free_space(block_group, fs_info, last,
534 					  block_group->key.objectid +
535 					  block_group->key.offset);
536 	caching_ctl->progress = (u64)-1;
537 
538 out:
539 	btrfs_free_path(path);
540 	return ret;
541 }
542 
543 static noinline void caching_thread(struct btrfs_work *work)
544 {
545 	struct btrfs_block_group_cache *block_group;
546 	struct btrfs_fs_info *fs_info;
547 	struct btrfs_caching_control *caching_ctl;
548 	struct btrfs_root *extent_root;
549 	int ret;
550 
551 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
552 	block_group = caching_ctl->block_group;
553 	fs_info = block_group->fs_info;
554 	extent_root = fs_info->extent_root;
555 
556 	mutex_lock(&caching_ctl->mutex);
557 	down_read(&fs_info->commit_root_sem);
558 
559 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
560 		ret = load_free_space_tree(caching_ctl);
561 	else
562 		ret = load_extent_tree_free(caching_ctl);
563 
564 	spin_lock(&block_group->lock);
565 	block_group->caching_ctl = NULL;
566 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
567 	spin_unlock(&block_group->lock);
568 
569 #ifdef CONFIG_BTRFS_DEBUG
570 	if (btrfs_should_fragment_free_space(block_group)) {
571 		u64 bytes_used;
572 
573 		spin_lock(&block_group->space_info->lock);
574 		spin_lock(&block_group->lock);
575 		bytes_used = block_group->key.offset -
576 			btrfs_block_group_used(&block_group->item);
577 		block_group->space_info->bytes_used += bytes_used >> 1;
578 		spin_unlock(&block_group->lock);
579 		spin_unlock(&block_group->space_info->lock);
580 		fragment_free_space(block_group);
581 	}
582 #endif
583 
584 	caching_ctl->progress = (u64)-1;
585 
586 	up_read(&fs_info->commit_root_sem);
587 	free_excluded_extents(fs_info, block_group);
588 	mutex_unlock(&caching_ctl->mutex);
589 
590 	wake_up(&caching_ctl->wait);
591 
592 	put_caching_control(caching_ctl);
593 	btrfs_put_block_group(block_group);
594 }
595 
596 static int cache_block_group(struct btrfs_block_group_cache *cache,
597 			     int load_cache_only)
598 {
599 	DEFINE_WAIT(wait);
600 	struct btrfs_fs_info *fs_info = cache->fs_info;
601 	struct btrfs_caching_control *caching_ctl;
602 	int ret = 0;
603 
604 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
605 	if (!caching_ctl)
606 		return -ENOMEM;
607 
608 	INIT_LIST_HEAD(&caching_ctl->list);
609 	mutex_init(&caching_ctl->mutex);
610 	init_waitqueue_head(&caching_ctl->wait);
611 	caching_ctl->block_group = cache;
612 	caching_ctl->progress = cache->key.objectid;
613 	refcount_set(&caching_ctl->count, 1);
614 	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
615 			caching_thread, NULL, NULL);
616 
617 	spin_lock(&cache->lock);
618 	/*
619 	 * This should be a rare occasion, but this could happen I think in the
620 	 * case where one thread starts to load the space cache info, and then
621 	 * some other thread starts a transaction commit which tries to do an
622 	 * allocation while the other thread is still loading the space cache
623 	 * info.  The previous loop should have kept us from choosing this block
624 	 * group, but if we've moved to the state where we will wait on caching
625 	 * block groups we need to first check if we're doing a fast load here,
626 	 * so we can wait for it to finish, otherwise we could end up allocating
627 	 * from a block group who's cache gets evicted for one reason or
628 	 * another.
629 	 */
630 	while (cache->cached == BTRFS_CACHE_FAST) {
631 		struct btrfs_caching_control *ctl;
632 
633 		ctl = cache->caching_ctl;
634 		refcount_inc(&ctl->count);
635 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
636 		spin_unlock(&cache->lock);
637 
638 		schedule();
639 
640 		finish_wait(&ctl->wait, &wait);
641 		put_caching_control(ctl);
642 		spin_lock(&cache->lock);
643 	}
644 
645 	if (cache->cached != BTRFS_CACHE_NO) {
646 		spin_unlock(&cache->lock);
647 		kfree(caching_ctl);
648 		return 0;
649 	}
650 	WARN_ON(cache->caching_ctl);
651 	cache->caching_ctl = caching_ctl;
652 	cache->cached = BTRFS_CACHE_FAST;
653 	spin_unlock(&cache->lock);
654 
655 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
656 		mutex_lock(&caching_ctl->mutex);
657 		ret = load_free_space_cache(fs_info, cache);
658 
659 		spin_lock(&cache->lock);
660 		if (ret == 1) {
661 			cache->caching_ctl = NULL;
662 			cache->cached = BTRFS_CACHE_FINISHED;
663 			cache->last_byte_to_unpin = (u64)-1;
664 			caching_ctl->progress = (u64)-1;
665 		} else {
666 			if (load_cache_only) {
667 				cache->caching_ctl = NULL;
668 				cache->cached = BTRFS_CACHE_NO;
669 			} else {
670 				cache->cached = BTRFS_CACHE_STARTED;
671 				cache->has_caching_ctl = 1;
672 			}
673 		}
674 		spin_unlock(&cache->lock);
675 #ifdef CONFIG_BTRFS_DEBUG
676 		if (ret == 1 &&
677 		    btrfs_should_fragment_free_space(cache)) {
678 			u64 bytes_used;
679 
680 			spin_lock(&cache->space_info->lock);
681 			spin_lock(&cache->lock);
682 			bytes_used = cache->key.offset -
683 				btrfs_block_group_used(&cache->item);
684 			cache->space_info->bytes_used += bytes_used >> 1;
685 			spin_unlock(&cache->lock);
686 			spin_unlock(&cache->space_info->lock);
687 			fragment_free_space(cache);
688 		}
689 #endif
690 		mutex_unlock(&caching_ctl->mutex);
691 
692 		wake_up(&caching_ctl->wait);
693 		if (ret == 1) {
694 			put_caching_control(caching_ctl);
695 			free_excluded_extents(fs_info, cache);
696 			return 0;
697 		}
698 	} else {
699 		/*
700 		 * We're either using the free space tree or no caching at all.
701 		 * Set cached to the appropriate value and wakeup any waiters.
702 		 */
703 		spin_lock(&cache->lock);
704 		if (load_cache_only) {
705 			cache->caching_ctl = NULL;
706 			cache->cached = BTRFS_CACHE_NO;
707 		} else {
708 			cache->cached = BTRFS_CACHE_STARTED;
709 			cache->has_caching_ctl = 1;
710 		}
711 		spin_unlock(&cache->lock);
712 		wake_up(&caching_ctl->wait);
713 	}
714 
715 	if (load_cache_only) {
716 		put_caching_control(caching_ctl);
717 		return 0;
718 	}
719 
720 	down_write(&fs_info->commit_root_sem);
721 	refcount_inc(&caching_ctl->count);
722 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
723 	up_write(&fs_info->commit_root_sem);
724 
725 	btrfs_get_block_group(cache);
726 
727 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
728 
729 	return ret;
730 }
731 
732 /*
733  * return the block group that starts at or after bytenr
734  */
735 static struct btrfs_block_group_cache *
736 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
737 {
738 	return block_group_cache_tree_search(info, bytenr, 0);
739 }
740 
741 /*
742  * return the block group that contains the given bytenr
743  */
744 struct btrfs_block_group_cache *btrfs_lookup_block_group(
745 						 struct btrfs_fs_info *info,
746 						 u64 bytenr)
747 {
748 	return block_group_cache_tree_search(info, bytenr, 1);
749 }
750 
751 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
752 						  u64 flags)
753 {
754 	struct list_head *head = &info->space_info;
755 	struct btrfs_space_info *found;
756 
757 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
758 
759 	rcu_read_lock();
760 	list_for_each_entry_rcu(found, head, list) {
761 		if (found->flags & flags) {
762 			rcu_read_unlock();
763 			return found;
764 		}
765 	}
766 	rcu_read_unlock();
767 	return NULL;
768 }
769 
770 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
771 			     u64 owner, u64 root_objectid)
772 {
773 	struct btrfs_space_info *space_info;
774 	u64 flags;
775 
776 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
777 		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
778 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
779 		else
780 			flags = BTRFS_BLOCK_GROUP_METADATA;
781 	} else {
782 		flags = BTRFS_BLOCK_GROUP_DATA;
783 	}
784 
785 	space_info = __find_space_info(fs_info, flags);
786 	ASSERT(space_info);
787 	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
788 }
789 
790 /*
791  * after adding space to the filesystem, we need to clear the full flags
792  * on all the space infos.
793  */
794 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
795 {
796 	struct list_head *head = &info->space_info;
797 	struct btrfs_space_info *found;
798 
799 	rcu_read_lock();
800 	list_for_each_entry_rcu(found, head, list)
801 		found->full = 0;
802 	rcu_read_unlock();
803 }
804 
805 /* simple helper to search for an existing data extent at a given offset */
806 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
807 {
808 	int ret;
809 	struct btrfs_key key;
810 	struct btrfs_path *path;
811 
812 	path = btrfs_alloc_path();
813 	if (!path)
814 		return -ENOMEM;
815 
816 	key.objectid = start;
817 	key.offset = len;
818 	key.type = BTRFS_EXTENT_ITEM_KEY;
819 	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
820 	btrfs_free_path(path);
821 	return ret;
822 }
823 
824 /*
825  * helper function to lookup reference count and flags of a tree block.
826  *
827  * the head node for delayed ref is used to store the sum of all the
828  * reference count modifications queued up in the rbtree. the head
829  * node may also store the extent flags to set. This way you can check
830  * to see what the reference count and extent flags would be if all of
831  * the delayed refs are not processed.
832  */
833 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
834 			     struct btrfs_fs_info *fs_info, u64 bytenr,
835 			     u64 offset, int metadata, u64 *refs, u64 *flags)
836 {
837 	struct btrfs_delayed_ref_head *head;
838 	struct btrfs_delayed_ref_root *delayed_refs;
839 	struct btrfs_path *path;
840 	struct btrfs_extent_item *ei;
841 	struct extent_buffer *leaf;
842 	struct btrfs_key key;
843 	u32 item_size;
844 	u64 num_refs;
845 	u64 extent_flags;
846 	int ret;
847 
848 	/*
849 	 * If we don't have skinny metadata, don't bother doing anything
850 	 * different
851 	 */
852 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
853 		offset = fs_info->nodesize;
854 		metadata = 0;
855 	}
856 
857 	path = btrfs_alloc_path();
858 	if (!path)
859 		return -ENOMEM;
860 
861 	if (!trans) {
862 		path->skip_locking = 1;
863 		path->search_commit_root = 1;
864 	}
865 
866 search_again:
867 	key.objectid = bytenr;
868 	key.offset = offset;
869 	if (metadata)
870 		key.type = BTRFS_METADATA_ITEM_KEY;
871 	else
872 		key.type = BTRFS_EXTENT_ITEM_KEY;
873 
874 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
875 	if (ret < 0)
876 		goto out_free;
877 
878 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
879 		if (path->slots[0]) {
880 			path->slots[0]--;
881 			btrfs_item_key_to_cpu(path->nodes[0], &key,
882 					      path->slots[0]);
883 			if (key.objectid == bytenr &&
884 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
885 			    key.offset == fs_info->nodesize)
886 				ret = 0;
887 		}
888 	}
889 
890 	if (ret == 0) {
891 		leaf = path->nodes[0];
892 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
893 		if (item_size >= sizeof(*ei)) {
894 			ei = btrfs_item_ptr(leaf, path->slots[0],
895 					    struct btrfs_extent_item);
896 			num_refs = btrfs_extent_refs(leaf, ei);
897 			extent_flags = btrfs_extent_flags(leaf, ei);
898 		} else {
899 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
900 			struct btrfs_extent_item_v0 *ei0;
901 			BUG_ON(item_size != sizeof(*ei0));
902 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
903 					     struct btrfs_extent_item_v0);
904 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
905 			/* FIXME: this isn't correct for data */
906 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
907 #else
908 			BUG();
909 #endif
910 		}
911 		BUG_ON(num_refs == 0);
912 	} else {
913 		num_refs = 0;
914 		extent_flags = 0;
915 		ret = 0;
916 	}
917 
918 	if (!trans)
919 		goto out;
920 
921 	delayed_refs = &trans->transaction->delayed_refs;
922 	spin_lock(&delayed_refs->lock);
923 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
924 	if (head) {
925 		if (!mutex_trylock(&head->mutex)) {
926 			refcount_inc(&head->node.refs);
927 			spin_unlock(&delayed_refs->lock);
928 
929 			btrfs_release_path(path);
930 
931 			/*
932 			 * Mutex was contended, block until it's released and try
933 			 * again
934 			 */
935 			mutex_lock(&head->mutex);
936 			mutex_unlock(&head->mutex);
937 			btrfs_put_delayed_ref(&head->node);
938 			goto search_again;
939 		}
940 		spin_lock(&head->lock);
941 		if (head->extent_op && head->extent_op->update_flags)
942 			extent_flags |= head->extent_op->flags_to_set;
943 		else
944 			BUG_ON(num_refs == 0);
945 
946 		num_refs += head->node.ref_mod;
947 		spin_unlock(&head->lock);
948 		mutex_unlock(&head->mutex);
949 	}
950 	spin_unlock(&delayed_refs->lock);
951 out:
952 	WARN_ON(num_refs == 0);
953 	if (refs)
954 		*refs = num_refs;
955 	if (flags)
956 		*flags = extent_flags;
957 out_free:
958 	btrfs_free_path(path);
959 	return ret;
960 }
961 
962 /*
963  * Back reference rules.  Back refs have three main goals:
964  *
965  * 1) differentiate between all holders of references to an extent so that
966  *    when a reference is dropped we can make sure it was a valid reference
967  *    before freeing the extent.
968  *
969  * 2) Provide enough information to quickly find the holders of an extent
970  *    if we notice a given block is corrupted or bad.
971  *
972  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
973  *    maintenance.  This is actually the same as #2, but with a slightly
974  *    different use case.
975  *
976  * There are two kinds of back refs. The implicit back refs is optimized
977  * for pointers in non-shared tree blocks. For a given pointer in a block,
978  * back refs of this kind provide information about the block's owner tree
979  * and the pointer's key. These information allow us to find the block by
980  * b-tree searching. The full back refs is for pointers in tree blocks not
981  * referenced by their owner trees. The location of tree block is recorded
982  * in the back refs. Actually the full back refs is generic, and can be
983  * used in all cases the implicit back refs is used. The major shortcoming
984  * of the full back refs is its overhead. Every time a tree block gets
985  * COWed, we have to update back refs entry for all pointers in it.
986  *
987  * For a newly allocated tree block, we use implicit back refs for
988  * pointers in it. This means most tree related operations only involve
989  * implicit back refs. For a tree block created in old transaction, the
990  * only way to drop a reference to it is COW it. So we can detect the
991  * event that tree block loses its owner tree's reference and do the
992  * back refs conversion.
993  *
994  * When a tree block is COWed through a tree, there are four cases:
995  *
996  * The reference count of the block is one and the tree is the block's
997  * owner tree. Nothing to do in this case.
998  *
999  * The reference count of the block is one and the tree is not the
1000  * block's owner tree. In this case, full back refs is used for pointers
1001  * in the block. Remove these full back refs, add implicit back refs for
1002  * every pointers in the new block.
1003  *
1004  * The reference count of the block is greater than one and the tree is
1005  * the block's owner tree. In this case, implicit back refs is used for
1006  * pointers in the block. Add full back refs for every pointer in the
1007  * block, increase lower level extents' reference counts. The original
1008  * implicit back refs are entailed to the new block.
1009  *
1010  * The reference count of the block is greater than one and the tree is
1011  * not the block's owner tree. Add implicit back refs for every pointer in
1012  * the new block, increase lower level extents' reference count.
1013  *
1014  * Back Reference Key composing:
1015  *
1016  * The key objectid corresponds to the first byte in the extent,
1017  * The key type is used to differentiate between types of back refs.
1018  * There are different meanings of the key offset for different types
1019  * of back refs.
1020  *
1021  * File extents can be referenced by:
1022  *
1023  * - multiple snapshots, subvolumes, or different generations in one subvol
1024  * - different files inside a single subvolume
1025  * - different offsets inside a file (bookend extents in file.c)
1026  *
1027  * The extent ref structure for the implicit back refs has fields for:
1028  *
1029  * - Objectid of the subvolume root
1030  * - objectid of the file holding the reference
1031  * - original offset in the file
1032  * - how many bookend extents
1033  *
1034  * The key offset for the implicit back refs is hash of the first
1035  * three fields.
1036  *
1037  * The extent ref structure for the full back refs has field for:
1038  *
1039  * - number of pointers in the tree leaf
1040  *
1041  * The key offset for the full back refs is the first byte of
1042  * the tree leaf
1043  *
1044  * When a file extent is allocated, the implicit back refs is used.
1045  * The fields are filled in:
1046  *
1047  *     (root_key.objectid, inode objectid, offset in file, 1)
1048  *
1049  * When a file extent is removed by file truncation, we find the
1050  * corresponding implicit back refs and check the following fields:
1051  *
1052  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1053  *
1054  * Btree extents can be referenced by:
1055  *
1056  * - Different subvolumes
1057  *
1058  * Both the implicit back refs and the full back refs for tree blocks
1059  * only consist of key. The key offset for the implicit back refs is
1060  * objectid of block's owner tree. The key offset for the full back refs
1061  * is the first byte of parent block.
1062  *
1063  * When implicit back refs is used, information about the lowest key and
1064  * level of the tree block is required. This information is stored in
1065  * the tree block info structure.
1066  */
1067 
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Rewrite the v0 extent item that @path points at into the current
 * struct btrfs_extent_item layout.
 *
 * @owner:      owner objectid of the extent.  (u64)-1 means "not known"; in
 *              that case it is read from the first BTRFS_EXTENT_REF_V0_KEY
 *              item found at or after the current slot.
 * @extra_size: bytes the caller wants available in the leaf in addition to
 *              what the conversion itself needs.
 *
 * The path is released and the item is searched for again with enough room
 * to grow it, so on success @path points at the converted item.
 *
 * Returns 0 on success or a negative errno from the tree search helpers.
 * Unexpected tree contents hit BUG_ON().
 */
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item_v0 *old_item;
	struct btrfs_extent_item *new_item;
	struct btrfs_tree_block_info *info;
	struct btrfs_extent_ref_v0 *old_ref;
	struct btrfs_key key;
	struct btrfs_key found_key;
	bool tree_block;
	u32 grow_by;
	u64 refs;
	int ret;

	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*old_item));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	old_item = btrfs_item_ptr(leaf, path->slots[0],
				  struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, old_item);

	if (owner == (u64)-1) {
		/* Walk forward until a v0 ref item tells us the owner. */
		for (;;) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(extent_root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type == BTRFS_EXTENT_REF_V0_KEY) {
				old_ref = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_extent_ref_v0);
				owner = btrfs_ref_objectid_v0(leaf, old_ref);
				break;
			}
			path->slots[0]++;
		}
	}
	btrfs_release_path(path);

	/* Owners below the first free objectid denote tree blocks. */
	tree_block = owner < BTRFS_FIRST_FREE_OBJECTID;

	/* How much larger the new item is than the v0 one. */
	grow_by = sizeof(*new_item);
	if (tree_block)
		grow_by += sizeof(*info);
	grow_by -= sizeof(*old_item);

	ret = btrfs_search_slot(trans, extent_root, &key, path,
				grow_by + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(fs_info, path, grow_by);

	leaf = path->nodes[0];
	new_item = btrfs_item_ptr(leaf, path->slots[0],
				  struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, new_item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, new_item, 0);
	if (tree_block) {
		btrfs_set_extent_flags(leaf, new_item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		/* The tree block info sits right behind the extent item. */
		info = (struct btrfs_tree_block_info *)(new_item + 1);
		/* FIXME: get first key of the block */
		memzero_extent_buffer(leaf, (unsigned long)info,
				      sizeof(*info));
		btrfs_set_tree_block_level(leaf, info, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, new_item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif
1150 
1151 /*
1152  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1153  * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
1154  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1155  */
1156 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1157 				     struct btrfs_extent_inline_ref *iref,
1158 				     enum btrfs_inline_ref_type is_data)
1159 {
1160 	int type = btrfs_extent_inline_ref_type(eb, iref);
1161 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1162 
1163 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1164 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
1165 	    type == BTRFS_SHARED_DATA_REF_KEY ||
1166 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
1167 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
1168 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
1169 				return type;
1170 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1171 				ASSERT(eb->fs_info);
1172 				/*
1173 				 * Every shared one has parent tree
1174 				 * block, which must be aligned to
1175 				 * nodesize.
1176 				 */
1177 				if (offset &&
1178 				    IS_ALIGNED(offset, eb->fs_info->nodesize))
1179 					return type;
1180 			}
1181 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
1182 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
1183 				return type;
1184 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
1185 				ASSERT(eb->fs_info);
1186 				/*
1187 				 * Every shared one has parent tree
1188 				 * block, which must be aligned to
1189 				 * nodesize.
1190 				 */
1191 				if (offset &&
1192 				    IS_ALIGNED(offset, eb->fs_info->nodesize))
1193 					return type;
1194 			}
1195 		} else {
1196 			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1197 			return type;
1198 		}
1199 	}
1200 
1201 	btrfs_print_leaf((struct extent_buffer *)eb);
1202 	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1203 		  eb->start, type);
1204 	WARN_ON(1);
1205 
1206 	return BTRFS_REF_TYPE_INVALID;
1207 }
1208 
1209 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1210 {
1211 	u32 high_crc = ~(u32)0;
1212 	u32 low_crc = ~(u32)0;
1213 	__le64 lenum;
1214 
1215 	lenum = cpu_to_le64(root_objectid);
1216 	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1217 	lenum = cpu_to_le64(owner);
1218 	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1219 	lenum = cpu_to_le64(offset);
1220 	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1221 
1222 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1223 }
1224 
1225 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1226 				     struct btrfs_extent_data_ref *ref)
1227 {
1228 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1229 				    btrfs_extent_data_ref_objectid(leaf, ref),
1230 				    btrfs_extent_data_ref_offset(leaf, ref));
1231 }
1232 
1233 static int match_extent_data_ref(struct extent_buffer *leaf,
1234 				 struct btrfs_extent_data_ref *ref,
1235 				 u64 root_objectid, u64 owner, u64 offset)
1236 {
1237 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1238 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1239 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1240 		return 0;
1241 	return 1;
1242 }
1243 
1244 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1245 					   struct btrfs_fs_info *fs_info,
1246 					   struct btrfs_path *path,
1247 					   u64 bytenr, u64 parent,
1248 					   u64 root_objectid,
1249 					   u64 owner, u64 offset)
1250 {
1251 	struct btrfs_root *root = fs_info->extent_root;
1252 	struct btrfs_key key;
1253 	struct btrfs_extent_data_ref *ref;
1254 	struct extent_buffer *leaf;
1255 	u32 nritems;
1256 	int ret;
1257 	int recow;
1258 	int err = -ENOENT;
1259 
1260 	key.objectid = bytenr;
1261 	if (parent) {
1262 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1263 		key.offset = parent;
1264 	} else {
1265 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1266 		key.offset = hash_extent_data_ref(root_objectid,
1267 						  owner, offset);
1268 	}
1269 again:
1270 	recow = 0;
1271 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1272 	if (ret < 0) {
1273 		err = ret;
1274 		goto fail;
1275 	}
1276 
1277 	if (parent) {
1278 		if (!ret)
1279 			return 0;
1280 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1281 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1282 		btrfs_release_path(path);
1283 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1284 		if (ret < 0) {
1285 			err = ret;
1286 			goto fail;
1287 		}
1288 		if (!ret)
1289 			return 0;
1290 #endif
1291 		goto fail;
1292 	}
1293 
1294 	leaf = path->nodes[0];
1295 	nritems = btrfs_header_nritems(leaf);
1296 	while (1) {
1297 		if (path->slots[0] >= nritems) {
1298 			ret = btrfs_next_leaf(root, path);
1299 			if (ret < 0)
1300 				err = ret;
1301 			if (ret)
1302 				goto fail;
1303 
1304 			leaf = path->nodes[0];
1305 			nritems = btrfs_header_nritems(leaf);
1306 			recow = 1;
1307 		}
1308 
1309 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1310 		if (key.objectid != bytenr ||
1311 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1312 			goto fail;
1313 
1314 		ref = btrfs_item_ptr(leaf, path->slots[0],
1315 				     struct btrfs_extent_data_ref);
1316 
1317 		if (match_extent_data_ref(leaf, ref, root_objectid,
1318 					  owner, offset)) {
1319 			if (recow) {
1320 				btrfs_release_path(path);
1321 				goto again;
1322 			}
1323 			err = 0;
1324 			break;
1325 		}
1326 		path->slots[0]++;
1327 	}
1328 fail:
1329 	return err;
1330 }
1331 
1332 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1333 					   struct btrfs_fs_info *fs_info,
1334 					   struct btrfs_path *path,
1335 					   u64 bytenr, u64 parent,
1336 					   u64 root_objectid, u64 owner,
1337 					   u64 offset, int refs_to_add)
1338 {
1339 	struct btrfs_root *root = fs_info->extent_root;
1340 	struct btrfs_key key;
1341 	struct extent_buffer *leaf;
1342 	u32 size;
1343 	u32 num_refs;
1344 	int ret;
1345 
1346 	key.objectid = bytenr;
1347 	if (parent) {
1348 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1349 		key.offset = parent;
1350 		size = sizeof(struct btrfs_shared_data_ref);
1351 	} else {
1352 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1353 		key.offset = hash_extent_data_ref(root_objectid,
1354 						  owner, offset);
1355 		size = sizeof(struct btrfs_extent_data_ref);
1356 	}
1357 
1358 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1359 	if (ret && ret != -EEXIST)
1360 		goto fail;
1361 
1362 	leaf = path->nodes[0];
1363 	if (parent) {
1364 		struct btrfs_shared_data_ref *ref;
1365 		ref = btrfs_item_ptr(leaf, path->slots[0],
1366 				     struct btrfs_shared_data_ref);
1367 		if (ret == 0) {
1368 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1369 		} else {
1370 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1371 			num_refs += refs_to_add;
1372 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1373 		}
1374 	} else {
1375 		struct btrfs_extent_data_ref *ref;
1376 		while (ret == -EEXIST) {
1377 			ref = btrfs_item_ptr(leaf, path->slots[0],
1378 					     struct btrfs_extent_data_ref);
1379 			if (match_extent_data_ref(leaf, ref, root_objectid,
1380 						  owner, offset))
1381 				break;
1382 			btrfs_release_path(path);
1383 			key.offset++;
1384 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1385 						      size);
1386 			if (ret && ret != -EEXIST)
1387 				goto fail;
1388 
1389 			leaf = path->nodes[0];
1390 		}
1391 		ref = btrfs_item_ptr(leaf, path->slots[0],
1392 				     struct btrfs_extent_data_ref);
1393 		if (ret == 0) {
1394 			btrfs_set_extent_data_ref_root(leaf, ref,
1395 						       root_objectid);
1396 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1397 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1398 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1399 		} else {
1400 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1401 			num_refs += refs_to_add;
1402 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1403 		}
1404 	}
1405 	btrfs_mark_buffer_dirty(leaf);
1406 	ret = 0;
1407 fail:
1408 	btrfs_release_path(path);
1409 	return ret;
1410 }
1411 
1412 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1413 					   struct btrfs_fs_info *fs_info,
1414 					   struct btrfs_path *path,
1415 					   int refs_to_drop, int *last_ref)
1416 {
1417 	struct btrfs_key key;
1418 	struct btrfs_extent_data_ref *ref1 = NULL;
1419 	struct btrfs_shared_data_ref *ref2 = NULL;
1420 	struct extent_buffer *leaf;
1421 	u32 num_refs = 0;
1422 	int ret = 0;
1423 
1424 	leaf = path->nodes[0];
1425 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1426 
1427 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1428 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1429 				      struct btrfs_extent_data_ref);
1430 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1431 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1432 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1433 				      struct btrfs_shared_data_ref);
1434 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1435 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1436 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1437 		struct btrfs_extent_ref_v0 *ref0;
1438 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1439 				      struct btrfs_extent_ref_v0);
1440 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1441 #endif
1442 	} else {
1443 		BUG();
1444 	}
1445 
1446 	BUG_ON(num_refs < refs_to_drop);
1447 	num_refs -= refs_to_drop;
1448 
1449 	if (num_refs == 0) {
1450 		ret = btrfs_del_item(trans, fs_info->extent_root, path);
1451 		*last_ref = 1;
1452 	} else {
1453 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1454 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1455 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1456 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1457 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1458 		else {
1459 			struct btrfs_extent_ref_v0 *ref0;
1460 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1461 					struct btrfs_extent_ref_v0);
1462 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1463 		}
1464 #endif
1465 		btrfs_mark_buffer_dirty(leaf);
1466 	}
1467 	return ret;
1468 }
1469 
1470 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1471 					  struct btrfs_extent_inline_ref *iref)
1472 {
1473 	struct btrfs_key key;
1474 	struct extent_buffer *leaf;
1475 	struct btrfs_extent_data_ref *ref1;
1476 	struct btrfs_shared_data_ref *ref2;
1477 	u32 num_refs = 0;
1478 	int type;
1479 
1480 	leaf = path->nodes[0];
1481 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1482 	if (iref) {
1483 		/*
1484 		 * If type is invalid, we should have bailed out earlier than
1485 		 * this call.
1486 		 */
1487 		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1488 		ASSERT(type != BTRFS_REF_TYPE_INVALID);
1489 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1490 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1491 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1492 		} else {
1493 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1494 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1495 		}
1496 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1497 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1498 				      struct btrfs_extent_data_ref);
1499 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1500 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1501 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1502 				      struct btrfs_shared_data_ref);
1503 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1504 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1505 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1506 		struct btrfs_extent_ref_v0 *ref0;
1507 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1508 				      struct btrfs_extent_ref_v0);
1509 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1510 #endif
1511 	} else {
1512 		WARN_ON(1);
1513 	}
1514 	return num_refs;
1515 }
1516 
1517 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1518 					  struct btrfs_fs_info *fs_info,
1519 					  struct btrfs_path *path,
1520 					  u64 bytenr, u64 parent,
1521 					  u64 root_objectid)
1522 {
1523 	struct btrfs_root *root = fs_info->extent_root;
1524 	struct btrfs_key key;
1525 	int ret;
1526 
1527 	key.objectid = bytenr;
1528 	if (parent) {
1529 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1530 		key.offset = parent;
1531 	} else {
1532 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1533 		key.offset = root_objectid;
1534 	}
1535 
1536 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1537 	if (ret > 0)
1538 		ret = -ENOENT;
1539 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1540 	if (ret == -ENOENT && parent) {
1541 		btrfs_release_path(path);
1542 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1543 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1544 		if (ret > 0)
1545 			ret = -ENOENT;
1546 	}
1547 #endif
1548 	return ret;
1549 }
1550 
1551 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1552 					  struct btrfs_fs_info *fs_info,
1553 					  struct btrfs_path *path,
1554 					  u64 bytenr, u64 parent,
1555 					  u64 root_objectid)
1556 {
1557 	struct btrfs_key key;
1558 	int ret;
1559 
1560 	key.objectid = bytenr;
1561 	if (parent) {
1562 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1563 		key.offset = parent;
1564 	} else {
1565 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1566 		key.offset = root_objectid;
1567 	}
1568 
1569 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
1570 				      path, &key, 0);
1571 	btrfs_release_path(path);
1572 	return ret;
1573 }
1574 
1575 static inline int extent_ref_type(u64 parent, u64 owner)
1576 {
1577 	int type;
1578 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1579 		if (parent > 0)
1580 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1581 		else
1582 			type = BTRFS_TREE_BLOCK_REF_KEY;
1583 	} else {
1584 		if (parent > 0)
1585 			type = BTRFS_SHARED_DATA_REF_KEY;
1586 		else
1587 			type = BTRFS_EXTENT_DATA_REF_KEY;
1588 	}
1589 	return type;
1590 }
1591 
1592 static int find_next_key(struct btrfs_path *path, int level,
1593 			 struct btrfs_key *key)
1594 
1595 {
1596 	for (; level < BTRFS_MAX_LEVEL; level++) {
1597 		if (!path->nodes[level])
1598 			break;
1599 		if (path->slots[level] + 1 >=
1600 		    btrfs_header_nritems(path->nodes[level]))
1601 			continue;
1602 		if (level == 0)
1603 			btrfs_item_key_to_cpu(path->nodes[level], key,
1604 					      path->slots[level] + 1);
1605 		else
1606 			btrfs_node_key_to_cpu(path->nodes[level], key,
1607 					      path->slots[level] + 1);
1608 		return 0;
1609 	}
1610 	return 1;
1611 }
1612 
1613 /*
1614  * look for inline back ref. if back ref is found, *ref_ret is set
1615  * to the address of inline back ref, and 0 is returned.
1616  *
1617  * if back ref isn't found, *ref_ret is set to the address where it
1618  * should be inserted, and -ENOENT is returned.
1619  *
1620  * if insert is true and there are too many inline back refs, the path
1621  * points to the extent item, and -EAGAIN is returned.
1622  *
1623  * NOTE: inline back refs are ordered in the same way that back ref
1624  *	 items in the tree are ordered.
1625  */
1626 static noinline_for_stack
1627 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1628 				 struct btrfs_fs_info *fs_info,
1629 				 struct btrfs_path *path,
1630 				 struct btrfs_extent_inline_ref **ref_ret,
1631 				 u64 bytenr, u64 num_bytes,
1632 				 u64 parent, u64 root_objectid,
1633 				 u64 owner, u64 offset, int insert)
1634 {
1635 	struct btrfs_root *root = fs_info->extent_root;
1636 	struct btrfs_key key;
1637 	struct extent_buffer *leaf;
1638 	struct btrfs_extent_item *ei;
1639 	struct btrfs_extent_inline_ref *iref;
1640 	u64 flags;
1641 	u64 item_size;
1642 	unsigned long ptr;
1643 	unsigned long end;
1644 	int extra_size;
1645 	int type;
1646 	int want;
1647 	int ret;
1648 	int err = 0;
1649 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1650 	int needed;
1651 
1652 	key.objectid = bytenr;
1653 	key.type = BTRFS_EXTENT_ITEM_KEY;
1654 	key.offset = num_bytes;
1655 
1656 	want = extent_ref_type(parent, owner);
1657 	if (insert) {
1658 		extra_size = btrfs_extent_inline_ref_size(want);
1659 		path->keep_locks = 1;
1660 	} else
1661 		extra_size = -1;
1662 
1663 	/*
1664 	 * Owner is our parent level, so we can just add one to get the level
1665 	 * for the block we are interested in.
1666 	 */
1667 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1668 		key.type = BTRFS_METADATA_ITEM_KEY;
1669 		key.offset = owner;
1670 	}
1671 
1672 again:
1673 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1674 	if (ret < 0) {
1675 		err = ret;
1676 		goto out;
1677 	}
1678 
1679 	/*
1680 	 * We may be a newly converted file system which still has the old fat
1681 	 * extent entries for metadata, so try and see if we have one of those.
1682 	 */
1683 	if (ret > 0 && skinny_metadata) {
1684 		skinny_metadata = false;
1685 		if (path->slots[0]) {
1686 			path->slots[0]--;
1687 			btrfs_item_key_to_cpu(path->nodes[0], &key,
1688 					      path->slots[0]);
1689 			if (key.objectid == bytenr &&
1690 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
1691 			    key.offset == num_bytes)
1692 				ret = 0;
1693 		}
1694 		if (ret) {
1695 			key.objectid = bytenr;
1696 			key.type = BTRFS_EXTENT_ITEM_KEY;
1697 			key.offset = num_bytes;
1698 			btrfs_release_path(path);
1699 			goto again;
1700 		}
1701 	}
1702 
1703 	if (ret && !insert) {
1704 		err = -ENOENT;
1705 		goto out;
1706 	} else if (WARN_ON(ret)) {
1707 		err = -EIO;
1708 		goto out;
1709 	}
1710 
1711 	leaf = path->nodes[0];
1712 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1713 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1714 	if (item_size < sizeof(*ei)) {
1715 		if (!insert) {
1716 			err = -ENOENT;
1717 			goto out;
1718 		}
1719 		ret = convert_extent_item_v0(trans, fs_info, path, owner,
1720 					     extra_size);
1721 		if (ret < 0) {
1722 			err = ret;
1723 			goto out;
1724 		}
1725 		leaf = path->nodes[0];
1726 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1727 	}
1728 #endif
1729 	BUG_ON(item_size < sizeof(*ei));
1730 
1731 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1732 	flags = btrfs_extent_flags(leaf, ei);
1733 
1734 	ptr = (unsigned long)(ei + 1);
1735 	end = (unsigned long)ei + item_size;
1736 
1737 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1738 		ptr += sizeof(struct btrfs_tree_block_info);
1739 		BUG_ON(ptr > end);
1740 	}
1741 
1742 	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1743 		needed = BTRFS_REF_TYPE_DATA;
1744 	else
1745 		needed = BTRFS_REF_TYPE_BLOCK;
1746 
1747 	err = -ENOENT;
1748 	while (1) {
1749 		if (ptr >= end) {
1750 			WARN_ON(ptr > end);
1751 			break;
1752 		}
1753 		iref = (struct btrfs_extent_inline_ref *)ptr;
1754 		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1755 		if (type == BTRFS_REF_TYPE_INVALID) {
1756 			err = -EINVAL;
1757 			goto out;
1758 		}
1759 
1760 		if (want < type)
1761 			break;
1762 		if (want > type) {
1763 			ptr += btrfs_extent_inline_ref_size(type);
1764 			continue;
1765 		}
1766 
1767 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1768 			struct btrfs_extent_data_ref *dref;
1769 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1770 			if (match_extent_data_ref(leaf, dref, root_objectid,
1771 						  owner, offset)) {
1772 				err = 0;
1773 				break;
1774 			}
1775 			if (hash_extent_data_ref_item(leaf, dref) <
1776 			    hash_extent_data_ref(root_objectid, owner, offset))
1777 				break;
1778 		} else {
1779 			u64 ref_offset;
1780 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1781 			if (parent > 0) {
1782 				if (parent == ref_offset) {
1783 					err = 0;
1784 					break;
1785 				}
1786 				if (ref_offset < parent)
1787 					break;
1788 			} else {
1789 				if (root_objectid == ref_offset) {
1790 					err = 0;
1791 					break;
1792 				}
1793 				if (ref_offset < root_objectid)
1794 					break;
1795 			}
1796 		}
1797 		ptr += btrfs_extent_inline_ref_size(type);
1798 	}
1799 	if (err == -ENOENT && insert) {
1800 		if (item_size + extra_size >=
1801 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1802 			err = -EAGAIN;
1803 			goto out;
1804 		}
1805 		/*
1806 		 * To add new inline back ref, we have to make sure
1807 		 * there is no corresponding back ref item.
1808 		 * For simplicity, we just do not add new inline back
1809 		 * ref if there is any kind of item for this block
1810 		 */
1811 		if (find_next_key(path, 0, &key) == 0 &&
1812 		    key.objectid == bytenr &&
1813 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1814 			err = -EAGAIN;
1815 			goto out;
1816 		}
1817 	}
1818 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1819 out:
1820 	if (insert) {
1821 		path->keep_locks = 0;
1822 		btrfs_unlock_up_safe(path, 1);
1823 	}
1824 	return err;
1825 }
1826 
1827 /*
1828  * helper to add new inline back ref
1829  */
1830 static noinline_for_stack
1831 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1832 				 struct btrfs_path *path,
1833 				 struct btrfs_extent_inline_ref *iref,
1834 				 u64 parent, u64 root_objectid,
1835 				 u64 owner, u64 offset, int refs_to_add,
1836 				 struct btrfs_delayed_extent_op *extent_op)
1837 {
1838 	struct extent_buffer *leaf;
1839 	struct btrfs_extent_item *ei;
1840 	unsigned long ptr;
1841 	unsigned long end;
1842 	unsigned long item_offset;
1843 	u64 refs;
1844 	int size;
1845 	int type;
1846 
1847 	leaf = path->nodes[0];
1848 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1849 	item_offset = (unsigned long)iref - (unsigned long)ei;
1850 
1851 	type = extent_ref_type(parent, owner);
1852 	size = btrfs_extent_inline_ref_size(type);
1853 
1854 	btrfs_extend_item(fs_info, path, size);
1855 
1856 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1857 	refs = btrfs_extent_refs(leaf, ei);
1858 	refs += refs_to_add;
1859 	btrfs_set_extent_refs(leaf, ei, refs);
1860 	if (extent_op)
1861 		__run_delayed_extent_op(extent_op, leaf, ei);
1862 
1863 	ptr = (unsigned long)ei + item_offset;
1864 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1865 	if (ptr < end - size)
1866 		memmove_extent_buffer(leaf, ptr + size, ptr,
1867 				      end - size - ptr);
1868 
1869 	iref = (struct btrfs_extent_inline_ref *)ptr;
1870 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1871 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1872 		struct btrfs_extent_data_ref *dref;
1873 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1874 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1875 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1876 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1877 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1878 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1879 		struct btrfs_shared_data_ref *sref;
1880 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1881 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1882 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1883 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1884 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1885 	} else {
1886 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1887 	}
1888 	btrfs_mark_buffer_dirty(leaf);
1889 }
1890 
1891 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1892 				 struct btrfs_fs_info *fs_info,
1893 				 struct btrfs_path *path,
1894 				 struct btrfs_extent_inline_ref **ref_ret,
1895 				 u64 bytenr, u64 num_bytes, u64 parent,
1896 				 u64 root_objectid, u64 owner, u64 offset)
1897 {
1898 	int ret;
1899 
1900 	ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
1901 					   bytenr, num_bytes, parent,
1902 					   root_objectid, owner, offset, 0);
1903 	if (ret != -ENOENT)
1904 		return ret;
1905 
1906 	btrfs_release_path(path);
1907 	*ref_ret = NULL;
1908 
1909 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1910 		ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
1911 					    parent, root_objectid);
1912 	} else {
1913 		ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
1914 					     parent, root_objectid, owner,
1915 					     offset);
1916 	}
1917 	return ret;
1918 }
1919 
1920 /*
1921  * helper to update/remove inline back ref
1922  */
1923 static noinline_for_stack
1924 void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
1925 				  struct btrfs_path *path,
1926 				  struct btrfs_extent_inline_ref *iref,
1927 				  int refs_to_mod,
1928 				  struct btrfs_delayed_extent_op *extent_op,
1929 				  int *last_ref)
1930 {
1931 	struct extent_buffer *leaf;
1932 	struct btrfs_extent_item *ei;
1933 	struct btrfs_extent_data_ref *dref = NULL;
1934 	struct btrfs_shared_data_ref *sref = NULL;
1935 	unsigned long ptr;
1936 	unsigned long end;
1937 	u32 item_size;
1938 	int size;
1939 	int type;
1940 	u64 refs;
1941 
1942 	leaf = path->nodes[0];
1943 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1944 	refs = btrfs_extent_refs(leaf, ei);
1945 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1946 	refs += refs_to_mod;
1947 	btrfs_set_extent_refs(leaf, ei, refs);
1948 	if (extent_op)
1949 		__run_delayed_extent_op(extent_op, leaf, ei);
1950 
1951 	/*
1952 	 * If type is invalid, we should have bailed out after
1953 	 * lookup_inline_extent_backref().
1954 	 */
1955 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1956 	ASSERT(type != BTRFS_REF_TYPE_INVALID);
1957 
1958 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1959 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1960 		refs = btrfs_extent_data_ref_count(leaf, dref);
1961 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1962 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1963 		refs = btrfs_shared_data_ref_count(leaf, sref);
1964 	} else {
1965 		refs = 1;
1966 		BUG_ON(refs_to_mod != -1);
1967 	}
1968 
1969 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1970 	refs += refs_to_mod;
1971 
1972 	if (refs > 0) {
1973 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1974 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1975 		else
1976 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1977 	} else {
1978 		*last_ref = 1;
1979 		size =  btrfs_extent_inline_ref_size(type);
1980 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1981 		ptr = (unsigned long)iref;
1982 		end = (unsigned long)ei + item_size;
1983 		if (ptr + size < end)
1984 			memmove_extent_buffer(leaf, ptr, ptr + size,
1985 					      end - ptr - size);
1986 		item_size -= size;
1987 		btrfs_truncate_item(fs_info, path, item_size, 1);
1988 	}
1989 	btrfs_mark_buffer_dirty(leaf);
1990 }
1991 
1992 static noinline_for_stack
1993 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1994 				 struct btrfs_fs_info *fs_info,
1995 				 struct btrfs_path *path,
1996 				 u64 bytenr, u64 num_bytes, u64 parent,
1997 				 u64 root_objectid, u64 owner,
1998 				 u64 offset, int refs_to_add,
1999 				 struct btrfs_delayed_extent_op *extent_op)
2000 {
2001 	struct btrfs_extent_inline_ref *iref;
2002 	int ret;
2003 
2004 	ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
2005 					   bytenr, num_bytes, parent,
2006 					   root_objectid, owner, offset, 1);
2007 	if (ret == 0) {
2008 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
2009 		update_inline_extent_backref(fs_info, path, iref,
2010 					     refs_to_add, extent_op, NULL);
2011 	} else if (ret == -ENOENT) {
2012 		setup_inline_extent_backref(fs_info, path, iref, parent,
2013 					    root_objectid, owner, offset,
2014 					    refs_to_add, extent_op);
2015 		ret = 0;
2016 	}
2017 	return ret;
2018 }
2019 
2020 static int insert_extent_backref(struct btrfs_trans_handle *trans,
2021 				 struct btrfs_fs_info *fs_info,
2022 				 struct btrfs_path *path,
2023 				 u64 bytenr, u64 parent, u64 root_objectid,
2024 				 u64 owner, u64 offset, int refs_to_add)
2025 {
2026 	int ret;
2027 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2028 		BUG_ON(refs_to_add != 1);
2029 		ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
2030 					    parent, root_objectid);
2031 	} else {
2032 		ret = insert_extent_data_ref(trans, fs_info, path, bytenr,
2033 					     parent, root_objectid,
2034 					     owner, offset, refs_to_add);
2035 	}
2036 	return ret;
2037 }
2038 
2039 static int remove_extent_backref(struct btrfs_trans_handle *trans,
2040 				 struct btrfs_fs_info *fs_info,
2041 				 struct btrfs_path *path,
2042 				 struct btrfs_extent_inline_ref *iref,
2043 				 int refs_to_drop, int is_data, int *last_ref)
2044 {
2045 	int ret = 0;
2046 
2047 	BUG_ON(!is_data && refs_to_drop != 1);
2048 	if (iref) {
2049 		update_inline_extent_backref(fs_info, path, iref,
2050 					     -refs_to_drop, NULL, last_ref);
2051 	} else if (is_data) {
2052 		ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop,
2053 					     last_ref);
2054 	} else {
2055 		*last_ref = 1;
2056 		ret = btrfs_del_item(trans, fs_info->extent_root, path);
2057 	}
2058 	return ret;
2059 }
2060 
2061 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
2062 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
2063 			       u64 *discarded_bytes)
2064 {
2065 	int j, ret = 0;
2066 	u64 bytes_left, end;
2067 	u64 aligned_start = ALIGN(start, 1 << 9);
2068 
2069 	if (WARN_ON(start != aligned_start)) {
2070 		len -= aligned_start - start;
2071 		len = round_down(len, 1 << 9);
2072 		start = aligned_start;
2073 	}
2074 
2075 	*discarded_bytes = 0;
2076 
2077 	if (!len)
2078 		return 0;
2079 
2080 	end = start + len;
2081 	bytes_left = len;
2082 
2083 	/* Skip any superblocks on this device. */
2084 	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
2085 		u64 sb_start = btrfs_sb_offset(j);
2086 		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
2087 		u64 size = sb_start - start;
2088 
2089 		if (!in_range(sb_start, start, bytes_left) &&
2090 		    !in_range(sb_end, start, bytes_left) &&
2091 		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
2092 			continue;
2093 
2094 		/*
2095 		 * Superblock spans beginning of range.  Adjust start and
2096 		 * try again.
2097 		 */
2098 		if (sb_start <= start) {
2099 			start += sb_end - start;
2100 			if (start > end) {
2101 				bytes_left = 0;
2102 				break;
2103 			}
2104 			bytes_left = end - start;
2105 			continue;
2106 		}
2107 
2108 		if (size) {
2109 			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2110 						   GFP_NOFS, 0);
2111 			if (!ret)
2112 				*discarded_bytes += size;
2113 			else if (ret != -EOPNOTSUPP)
2114 				return ret;
2115 		}
2116 
2117 		start = sb_end;
2118 		if (start > end) {
2119 			bytes_left = 0;
2120 			break;
2121 		}
2122 		bytes_left = end - start;
2123 	}
2124 
2125 	if (bytes_left) {
2126 		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2127 					   GFP_NOFS, 0);
2128 		if (!ret)
2129 			*discarded_bytes += bytes_left;
2130 	}
2131 	return ret;
2132 }
2133 
2134 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
2135 			 u64 num_bytes, u64 *actual_bytes)
2136 {
2137 	int ret;
2138 	u64 discarded_bytes = 0;
2139 	struct btrfs_bio *bbio = NULL;
2140 
2141 
2142 	/*
2143 	 * Avoid races with device replace and make sure our bbio has devices
2144 	 * associated to its stripes that don't go away while we are discarding.
2145 	 */
2146 	btrfs_bio_counter_inc_blocked(fs_info);
2147 	/* Tell the block device(s) that the sectors can be discarded */
2148 	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
2149 			      &bbio, 0);
2150 	/* Error condition is -ENOMEM */
2151 	if (!ret) {
2152 		struct btrfs_bio_stripe *stripe = bbio->stripes;
2153 		int i;
2154 
2155 
2156 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2157 			u64 bytes;
2158 			if (!stripe->dev->can_discard)
2159 				continue;
2160 
2161 			ret = btrfs_issue_discard(stripe->dev->bdev,
2162 						  stripe->physical,
2163 						  stripe->length,
2164 						  &bytes);
2165 			if (!ret)
2166 				discarded_bytes += bytes;
2167 			else if (ret != -EOPNOTSUPP)
2168 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2169 
2170 			/*
2171 			 * Just in case we get back EOPNOTSUPP for some reason,
2172 			 * just ignore the return value so we don't screw up
2173 			 * people calling discard_extent.
2174 			 */
2175 			ret = 0;
2176 		}
2177 		btrfs_put_bbio(bbio);
2178 	}
2179 	btrfs_bio_counter_dec(fs_info);
2180 
2181 	if (actual_bytes)
2182 		*actual_bytes = discarded_bytes;
2183 
2184 
2185 	if (ret == -EOPNOTSUPP)
2186 		ret = 0;
2187 	return ret;
2188 }
2189 
2190 /* Can return -ENOMEM */
2191 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2192 			 struct btrfs_fs_info *fs_info,
2193 			 u64 bytenr, u64 num_bytes, u64 parent,
2194 			 u64 root_objectid, u64 owner, u64 offset)
2195 {
2196 	int old_ref_mod, new_ref_mod;
2197 	int ret;
2198 
2199 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2200 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
2201 
2202 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2203 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2204 						 num_bytes, parent,
2205 						 root_objectid, (int)owner,
2206 						 BTRFS_ADD_DELAYED_REF, NULL,
2207 						 &old_ref_mod, &new_ref_mod);
2208 	} else {
2209 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2210 						 num_bytes, parent,
2211 						 root_objectid, owner, offset,
2212 						 0, BTRFS_ADD_DELAYED_REF,
2213 						 &old_ref_mod, &new_ref_mod);
2214 	}
2215 
2216 	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2217 		add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
2218 
2219 	return ret;
2220 }
2221 
2222 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2223 				  struct btrfs_fs_info *fs_info,
2224 				  struct btrfs_delayed_ref_node *node,
2225 				  u64 parent, u64 root_objectid,
2226 				  u64 owner, u64 offset, int refs_to_add,
2227 				  struct btrfs_delayed_extent_op *extent_op)
2228 {
2229 	struct btrfs_path *path;
2230 	struct extent_buffer *leaf;
2231 	struct btrfs_extent_item *item;
2232 	struct btrfs_key key;
2233 	u64 bytenr = node->bytenr;
2234 	u64 num_bytes = node->num_bytes;
2235 	u64 refs;
2236 	int ret;
2237 
2238 	path = btrfs_alloc_path();
2239 	if (!path)
2240 		return -ENOMEM;
2241 
2242 	path->reada = READA_FORWARD;
2243 	path->leave_spinning = 1;
2244 	/* this will setup the path even if it fails to insert the back ref */
2245 	ret = insert_inline_extent_backref(trans, fs_info, path, bytenr,
2246 					   num_bytes, parent, root_objectid,
2247 					   owner, offset,
2248 					   refs_to_add, extent_op);
2249 	if ((ret < 0 && ret != -EAGAIN) || !ret)
2250 		goto out;
2251 
2252 	/*
2253 	 * Ok we had -EAGAIN which means we didn't have space to insert and
2254 	 * inline extent ref, so just update the reference count and add a
2255 	 * normal backref.
2256 	 */
2257 	leaf = path->nodes[0];
2258 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2259 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2260 	refs = btrfs_extent_refs(leaf, item);
2261 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2262 	if (extent_op)
2263 		__run_delayed_extent_op(extent_op, leaf, item);
2264 
2265 	btrfs_mark_buffer_dirty(leaf);
2266 	btrfs_release_path(path);
2267 
2268 	path->reada = READA_FORWARD;
2269 	path->leave_spinning = 1;
2270 	/* now insert the actual backref */
2271 	ret = insert_extent_backref(trans, fs_info, path, bytenr, parent,
2272 				    root_objectid, owner, offset, refs_to_add);
2273 	if (ret)
2274 		btrfs_abort_transaction(trans, ret);
2275 out:
2276 	btrfs_free_path(path);
2277 	return ret;
2278 }
2279 
2280 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2281 				struct btrfs_fs_info *fs_info,
2282 				struct btrfs_delayed_ref_node *node,
2283 				struct btrfs_delayed_extent_op *extent_op,
2284 				int insert_reserved)
2285 {
2286 	int ret = 0;
2287 	struct btrfs_delayed_data_ref *ref;
2288 	struct btrfs_key ins;
2289 	u64 parent = 0;
2290 	u64 ref_root = 0;
2291 	u64 flags = 0;
2292 
2293 	ins.objectid = node->bytenr;
2294 	ins.offset = node->num_bytes;
2295 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2296 
2297 	ref = btrfs_delayed_node_to_data_ref(node);
2298 	trace_run_delayed_data_ref(fs_info, node, ref, node->action);
2299 
2300 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2301 		parent = ref->parent;
2302 	ref_root = ref->root;
2303 
2304 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2305 		if (extent_op)
2306 			flags |= extent_op->flags_to_set;
2307 		ret = alloc_reserved_file_extent(trans, fs_info,
2308 						 parent, ref_root, flags,
2309 						 ref->objectid, ref->offset,
2310 						 &ins, node->ref_mod);
2311 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2312 		ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
2313 					     ref_root, ref->objectid,
2314 					     ref->offset, node->ref_mod,
2315 					     extent_op);
2316 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2317 		ret = __btrfs_free_extent(trans, fs_info, node, parent,
2318 					  ref_root, ref->objectid,
2319 					  ref->offset, node->ref_mod,
2320 					  extent_op);
2321 	} else {
2322 		BUG();
2323 	}
2324 	return ret;
2325 }
2326 
2327 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2328 				    struct extent_buffer *leaf,
2329 				    struct btrfs_extent_item *ei)
2330 {
2331 	u64 flags = btrfs_extent_flags(leaf, ei);
2332 	if (extent_op->update_flags) {
2333 		flags |= extent_op->flags_to_set;
2334 		btrfs_set_extent_flags(leaf, ei, flags);
2335 	}
2336 
2337 	if (extent_op->update_key) {
2338 		struct btrfs_tree_block_info *bi;
2339 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2340 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2341 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2342 	}
2343 }
2344 
2345 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2346 				 struct btrfs_fs_info *fs_info,
2347 				 struct btrfs_delayed_ref_node *node,
2348 				 struct btrfs_delayed_extent_op *extent_op)
2349 {
2350 	struct btrfs_key key;
2351 	struct btrfs_path *path;
2352 	struct btrfs_extent_item *ei;
2353 	struct extent_buffer *leaf;
2354 	u32 item_size;
2355 	int ret;
2356 	int err = 0;
2357 	int metadata = !extent_op->is_data;
2358 
2359 	if (trans->aborted)
2360 		return 0;
2361 
2362 	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2363 		metadata = 0;
2364 
2365 	path = btrfs_alloc_path();
2366 	if (!path)
2367 		return -ENOMEM;
2368 
2369 	key.objectid = node->bytenr;
2370 
2371 	if (metadata) {
2372 		key.type = BTRFS_METADATA_ITEM_KEY;
2373 		key.offset = extent_op->level;
2374 	} else {
2375 		key.type = BTRFS_EXTENT_ITEM_KEY;
2376 		key.offset = node->num_bytes;
2377 	}
2378 
2379 again:
2380 	path->reada = READA_FORWARD;
2381 	path->leave_spinning = 1;
2382 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2383 	if (ret < 0) {
2384 		err = ret;
2385 		goto out;
2386 	}
2387 	if (ret > 0) {
2388 		if (metadata) {
2389 			if (path->slots[0] > 0) {
2390 				path->slots[0]--;
2391 				btrfs_item_key_to_cpu(path->nodes[0], &key,
2392 						      path->slots[0]);
2393 				if (key.objectid == node->bytenr &&
2394 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
2395 				    key.offset == node->num_bytes)
2396 					ret = 0;
2397 			}
2398 			if (ret > 0) {
2399 				btrfs_release_path(path);
2400 				metadata = 0;
2401 
2402 				key.objectid = node->bytenr;
2403 				key.offset = node->num_bytes;
2404 				key.type = BTRFS_EXTENT_ITEM_KEY;
2405 				goto again;
2406 			}
2407 		} else {
2408 			err = -EIO;
2409 			goto out;
2410 		}
2411 	}
2412 
2413 	leaf = path->nodes[0];
2414 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2415 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2416 	if (item_size < sizeof(*ei)) {
2417 		ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0);
2418 		if (ret < 0) {
2419 			err = ret;
2420 			goto out;
2421 		}
2422 		leaf = path->nodes[0];
2423 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2424 	}
2425 #endif
2426 	BUG_ON(item_size < sizeof(*ei));
2427 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2428 	__run_delayed_extent_op(extent_op, leaf, ei);
2429 
2430 	btrfs_mark_buffer_dirty(leaf);
2431 out:
2432 	btrfs_free_path(path);
2433 	return err;
2434 }
2435 
2436 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2437 				struct btrfs_fs_info *fs_info,
2438 				struct btrfs_delayed_ref_node *node,
2439 				struct btrfs_delayed_extent_op *extent_op,
2440 				int insert_reserved)
2441 {
2442 	int ret = 0;
2443 	struct btrfs_delayed_tree_ref *ref;
2444 	struct btrfs_key ins;
2445 	u64 parent = 0;
2446 	u64 ref_root = 0;
2447 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
2448 
2449 	ref = btrfs_delayed_node_to_tree_ref(node);
2450 	trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
2451 
2452 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2453 		parent = ref->parent;
2454 	ref_root = ref->root;
2455 
2456 	ins.objectid = node->bytenr;
2457 	if (skinny_metadata) {
2458 		ins.offset = ref->level;
2459 		ins.type = BTRFS_METADATA_ITEM_KEY;
2460 	} else {
2461 		ins.offset = node->num_bytes;
2462 		ins.type = BTRFS_EXTENT_ITEM_KEY;
2463 	}
2464 
2465 	if (node->ref_mod != 1) {
2466 		btrfs_err(fs_info,
2467 	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2468 			  node->bytenr, node->ref_mod, node->action, ref_root,
2469 			  parent);
2470 		return -EIO;
2471 	}
2472 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2473 		BUG_ON(!extent_op || !extent_op->update_flags);
2474 		ret = alloc_reserved_tree_block(trans, fs_info,
2475 						parent, ref_root,
2476 						extent_op->flags_to_set,
2477 						&extent_op->key,
2478 						ref->level, &ins);
2479 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2480 		ret = __btrfs_inc_extent_ref(trans, fs_info, node,
2481 					     parent, ref_root,
2482 					     ref->level, 0, 1,
2483 					     extent_op);
2484 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2485 		ret = __btrfs_free_extent(trans, fs_info, node,
2486 					  parent, ref_root,
2487 					  ref->level, 0, 1, extent_op);
2488 	} else {
2489 		BUG();
2490 	}
2491 	return ret;
2492 }
2493 
2494 /* helper function to actually process a single delayed ref entry */
2495 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2496 			       struct btrfs_fs_info *fs_info,
2497 			       struct btrfs_delayed_ref_node *node,
2498 			       struct btrfs_delayed_extent_op *extent_op,
2499 			       int insert_reserved)
2500 {
2501 	int ret = 0;
2502 
2503 	if (trans->aborted) {
2504 		if (insert_reserved)
2505 			btrfs_pin_extent(fs_info, node->bytenr,
2506 					 node->num_bytes, 1);
2507 		return 0;
2508 	}
2509 
2510 	if (btrfs_delayed_ref_is_head(node)) {
2511 		struct btrfs_delayed_ref_head *head;
2512 		/*
2513 		 * we've hit the end of the chain and we were supposed
2514 		 * to insert this extent into the tree.  But, it got
2515 		 * deleted before we ever needed to insert it, so all
2516 		 * we have to do is clean up the accounting
2517 		 */
2518 		BUG_ON(extent_op);
2519 		head = btrfs_delayed_node_to_head(node);
2520 		trace_run_delayed_ref_head(fs_info, node, head, node->action);
2521 
2522 		if (head->total_ref_mod < 0) {
2523 			struct btrfs_block_group_cache *cache;
2524 
2525 			cache = btrfs_lookup_block_group(fs_info, node->bytenr);
2526 			ASSERT(cache);
2527 			percpu_counter_add(&cache->space_info->total_bytes_pinned,
2528 					   -node->num_bytes);
2529 			btrfs_put_block_group(cache);
2530 		}
2531 
2532 		if (insert_reserved) {
2533 			btrfs_pin_extent(fs_info, node->bytenr,
2534 					 node->num_bytes, 1);
2535 			if (head->is_data) {
2536 				ret = btrfs_del_csums(trans, fs_info,
2537 						      node->bytenr,
2538 						      node->num_bytes);
2539 			}
2540 		}
2541 
2542 		/* Also free its reserved qgroup space */
2543 		btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2544 					      head->qgroup_reserved);
2545 		return ret;
2546 	}
2547 
2548 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2549 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2550 		ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
2551 					   insert_reserved);
2552 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2553 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2554 		ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
2555 					   insert_reserved);
2556 	else
2557 		BUG();
2558 	return ret;
2559 }
2560 
2561 static inline struct btrfs_delayed_ref_node *
2562 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2563 {
2564 	struct btrfs_delayed_ref_node *ref;
2565 
2566 	if (list_empty(&head->ref_list))
2567 		return NULL;
2568 
2569 	/*
2570 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2571 	 * This is to prevent a ref count from going down to zero, which deletes
2572 	 * the extent item from the extent tree, when there still are references
2573 	 * to add, which would fail because they would not find the extent item.
2574 	 */
2575 	if (!list_empty(&head->ref_add_list))
2576 		return list_first_entry(&head->ref_add_list,
2577 				struct btrfs_delayed_ref_node, add_list);
2578 
2579 	ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
2580 			       list);
2581 	ASSERT(list_empty(&ref->add_list));
2582 	return ref;
2583 }
2584 
/*
 * Run delayed refs of the transaction behind @trans, one ref head at a time.
 *
 * @nr bounds the work: a new head is only selected while the running count
 * is below @nr.  The count goes up for every ref that is run and for every
 * head that is skipped (lost lock race or blocked by a tree mod seq).
 *
 * For each locked head the refs on it are merged, then run one by one (adds
 * first, see select_delayed_ref()).  Once the head has no refs left, a
 * pending extent_op is run, and finally the head itself is removed from the
 * rbtree and passed to run_one_delayed_ref() for accounting cleanup.
 *
 * On exit the time spent is folded into fs_info->avg_delayed_ref_runtime,
 * provided at least one real (non-head) ref was run.
 *
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 * NOTE(review): the error value is whatever run_delayed_extent_op() or
 * run_one_delayed_ref() returned; the abort itself is not issued in this
 * function - confirm against the callers/callees.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info,
					     unsigned long nr)
{
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	struct btrfs_delayed_extent_op *extent_op;
	ktime_t start = ktime_get();
	int ret;
	/* refs run plus heads skipped; compared against nr */
	unsigned long count = 0;
	/* non-head refs actually run; gates the runtime average update */
	unsigned long actual_count = 0;
	int must_insert_reserved = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	while (1) {
		if (!locked_ref) {
			if (count >= nr)
				break;

			spin_lock(&delayed_refs->lock);
			locked_ref = btrfs_select_ref_head(trans);
			if (!locked_ref) {
				spin_unlock(&delayed_refs->lock);
				break;
			}

			/* grab the lock that says we are going to process
			 * all the refs for this head */
			ret = btrfs_delayed_ref_lock(trans, locked_ref);
			spin_unlock(&delayed_refs->lock);
			/*
			 * we may have dropped the spin lock to get the head
			 * mutex lock, and that might have given someone else
			 * time to free the head.  If that's true, it has been
			 * removed from our list and we can move on.
			 */
			if (ret == -EAGAIN) {
				locked_ref = NULL;
				count++;
				continue;
			}
		}

		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
					 locked_ref);

		/*
		 * Pick the next ref queued on this head; NULL means the head
		 * has no refs left and only the head itself remains to be
		 * handled.
		 */
		ref = select_delayed_ref(locked_ref);

		/*
		 * The ref carries a sequence number that
		 * btrfs_check_delayed_seq() says cannot be run yet: hand the
		 * head back as ready, count it as skipped and pick another.
		 */
		if (ref && ref->seq &&
		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			spin_lock(&delayed_refs->lock);
			locked_ref->processing = 0;
			delayed_refs->num_heads_ready++;
			spin_unlock(&delayed_refs->lock);
			btrfs_delayed_ref_unlock(locked_ref);
			locked_ref = NULL;
			cond_resched();
			count++;
			continue;
		}

		/*
		 * record the must insert reserved flag before we
		 * drop the spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		/* Take ownership of the pending extent_op as well. */
		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;

		if (!ref) {


			/* All delayed refs have been processed, Go ahead
			 * and send the head node to run_one_delayed_ref,
			 * so that any accounting fixes can happen
			 */
			ref = &locked_ref->node;

			/*
			 * The extent is still to be inserted, so the
			 * extent_op is not run separately; just free it.
			 */
			if (extent_op && must_insert_reserved) {
				btrfs_free_delayed_extent_op(extent_op);
				extent_op = NULL;
			}

			if (extent_op) {
				spin_unlock(&locked_ref->lock);
				ret = run_delayed_extent_op(trans, fs_info,
							    ref, extent_op);
				btrfs_free_delayed_extent_op(extent_op);

				if (ret) {
					/*
					 * Need to reset must_insert_reserved if
					 * there was an error so the abort stuff
					 * can cleanup the reserved space
					 * properly.
					 */
					if (must_insert_reserved)
						locked_ref->must_insert_reserved = 1;
					spin_lock(&delayed_refs->lock);
					locked_ref->processing = 0;
					delayed_refs->num_heads_ready++;
					spin_unlock(&delayed_refs->lock);
					btrfs_debug(fs_info,
						    "run_delayed_extent_op returned %d",
						    ret);
					btrfs_delayed_ref_unlock(locked_ref);
					return ret;
				}
				/* Head stays locked; loop to re-examine it. */
				continue;
			}

			/*
			 * Need to drop our head ref lock and re-acquire the
			 * delayed ref lock and then re-check to make sure
			 * nobody got added.
			 */
			spin_unlock(&locked_ref->lock);
			spin_lock(&delayed_refs->lock);
			spin_lock(&locked_ref->lock);
			if (!list_empty(&locked_ref->ref_list) ||
			    locked_ref->extent_op) {
				spin_unlock(&locked_ref->lock);
				spin_unlock(&delayed_refs->lock);
				continue;
			}
			/* Nothing new showed up: unlink the head for good. */
			ref->in_tree = 0;
			delayed_refs->num_heads--;
			rb_erase(&locked_ref->href_node,
				 &delayed_refs->href_root);
			spin_unlock(&delayed_refs->lock);
		} else {
			/* A regular ref: take it off the head's lists. */
			actual_count++;
			ref->in_tree = 0;
			list_del(&ref->list);
			if (!list_empty(&ref->add_list))
				list_del(&ref->add_list);
		}
		atomic_dec(&delayed_refs->num_entries);

		if (!btrfs_delayed_ref_is_head(ref)) {
			/*
			 * when we play the delayed ref, also correct the
			 * ref_mod on head
			 */
			switch (ref->action) {
			case BTRFS_ADD_DELAYED_REF:
			case BTRFS_ADD_DELAYED_EXTENT:
				locked_ref->node.ref_mod -= ref->ref_mod;
				break;
			case BTRFS_DROP_DELAYED_REF:
				locked_ref->node.ref_mod += ref->ref_mod;
				break;
			default:
				WARN_ON(1);
			}
		}
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			/* Give the head back as ready before bailing out. */
			spin_lock(&delayed_refs->lock);
			locked_ref->processing = 0;
			delayed_refs->num_heads_ready++;
			spin_unlock(&delayed_refs->lock);
			btrfs_delayed_ref_unlock(locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		/*
		 * If this node is a head, that means all the refs in this head
		 * have been dealt with, and we will pick the next head to deal
		 * with, so we must unlock the head and drop it from the cluster
		 * list before we release it.
		 */
		if (btrfs_delayed_ref_is_head(ref)) {
			if (locked_ref->is_data &&
			    locked_ref->total_ref_mod < 0) {
				spin_lock(&delayed_refs->lock);
				delayed_refs->pending_csums -= ref->num_bytes;
				spin_unlock(&delayed_refs->lock);
			}
			btrfs_delayed_ref_unlock(locked_ref);
			locked_ref = NULL;
		}
		btrfs_put_delayed_ref(ref);
		count++;
		cond_resched();
	}

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}
2825 
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 *
 * The "middle" is found by descending from the root and alternating between
 * the left and the right child until a leaf is passed; the bytenr of the
 * last node visited is returned.
 *
 * Returns 0 for an empty tree, so the result is always defined.
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	/* Stays 0 when the loop below never runs (empty tree). */
	u64 middle = 0;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		/* Zig-zag: left, right, left, ... */
		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif
2868 
2869 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2870 {
2871 	u64 num_bytes;
2872 
2873 	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2874 			     sizeof(struct btrfs_extent_inline_ref));
2875 	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2876 		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2877 
2878 	/*
2879 	 * We don't ever fill up leaves all the way so multiply by 2 just to be
2880 	 * closer to what we're really going to want to use.
2881 	 */
2882 	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2883 }
2884 
2885 /*
2886  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2887  * would require to store the csums for that many bytes.
2888  */
2889 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2890 {
2891 	u64 csum_size;
2892 	u64 num_csums_per_leaf;
2893 	u64 num_csums;
2894 
2895 	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2896 	num_csums_per_leaf = div64_u64(csum_size,
2897 			(u64)btrfs_super_csum_size(fs_info->super_copy));
2898 	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2899 	num_csums += num_csums_per_leaf - 1;
2900 	num_csums = div64_u64(num_csums, num_csums_per_leaf);
2901 	return num_csums;
2902 }
2903 
2904 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2905 				       struct btrfs_fs_info *fs_info)
2906 {
2907 	struct btrfs_block_rsv *global_rsv;
2908 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2909 	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2910 	u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2911 	u64 num_bytes, num_dirty_bgs_bytes;
2912 	int ret = 0;
2913 
2914 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2915 	num_heads = heads_to_leaves(fs_info, num_heads);
2916 	if (num_heads > 1)
2917 		num_bytes += (num_heads - 1) * fs_info->nodesize;
2918 	num_bytes <<= 1;
2919 	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2920 							fs_info->nodesize;
2921 	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2922 							     num_dirty_bgs);
2923 	global_rsv = &fs_info->global_block_rsv;
2924 
2925 	/*
2926 	 * If we can't allocate any more chunks lets make sure we have _lots_ of
2927 	 * wiggle room since running delayed refs can create more delayed refs.
2928 	 */
2929 	if (global_rsv->space_info->full) {
2930 		num_dirty_bgs_bytes <<= 1;
2931 		num_bytes <<= 1;
2932 	}
2933 
2934 	spin_lock(&global_rsv->lock);
2935 	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2936 		ret = 1;
2937 	spin_unlock(&global_rsv->lock);
2938 	return ret;
2939 }
2940 
2941 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2942 				       struct btrfs_fs_info *fs_info)
2943 {
2944 	u64 num_entries =
2945 		atomic_read(&trans->transaction->delayed_refs.num_entries);
2946 	u64 avg_runtime;
2947 	u64 val;
2948 
2949 	smp_mb();
2950 	avg_runtime = fs_info->avg_delayed_ref_runtime;
2951 	val = num_entries * avg_runtime;
2952 	if (val >= NSEC_PER_SEC)
2953 		return 1;
2954 	if (val >= NSEC_PER_SEC / 2)
2955 		return 2;
2956 
2957 	return btrfs_check_space_for_delayed_refs(trans, fs_info);
2958 }
2959 
/*
 * Context of one asynchronous delayed-ref run, handed to
 * delayed_ref_async_start() through the embedded work item.
 */
struct async_delayed_refs {
	/* root used to join the transaction and to reach fs_info */
	struct btrfs_root *root;
	/* the run is skipped if the joined transaction's id is newer */
	u64 transid;
	/* passed to btrfs_run_delayed_refs() as the count */
	int count;
	/* first error hit by the worker, 0 if none */
	int error;
	/* non-zero: worker signals @wait; zero: worker frees this struct */
	int sync;
	/* completed by the worker when @sync is set */
	struct completion wait;
	/* embedded work item; see to_async_delayed_refs() */
	struct btrfs_work work;
};
2969 
2970 static inline struct async_delayed_refs *
2971 to_async_delayed_refs(struct btrfs_work *work)
2972 {
2973 	return container_of(work, struct async_delayed_refs, work);
2974 }
2975 
2976 static void delayed_ref_async_start(struct btrfs_work *work)
2977 {
2978 	struct async_delayed_refs *async = to_async_delayed_refs(work);
2979 	struct btrfs_trans_handle *trans;
2980 	struct btrfs_fs_info *fs_info = async->root->fs_info;
2981 	int ret;
2982 
2983 	/* if the commit is already started, we don't need to wait here */
2984 	if (btrfs_transaction_blocked(fs_info))
2985 		goto done;
2986 
2987 	trans = btrfs_join_transaction(async->root);
2988 	if (IS_ERR(trans)) {
2989 		async->error = PTR_ERR(trans);
2990 		goto done;
2991 	}
2992 
2993 	/*
2994 	 * trans->sync means that when we call end_transaction, we won't
2995 	 * wait on delayed refs
2996 	 */
2997 	trans->sync = true;
2998 
2999 	/* Don't bother flushing if we got into a different transaction */
3000 	if (trans->transid > async->transid)
3001 		goto end;
3002 
3003 	ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
3004 	if (ret)
3005 		async->error = ret;
3006 end:
3007 	ret = btrfs_end_transaction(trans);
3008 	if (ret && !async->error)
3009 		async->error = ret;
3010 done:
3011 	if (async->sync)
3012 		complete(&async->wait);
3013 	else
3014 		kfree(async);
3015 }
3016 
3017 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
3018 				 unsigned long count, u64 transid, int wait)
3019 {
3020 	struct async_delayed_refs *async;
3021 	int ret;
3022 
3023 	async = kmalloc(sizeof(*async), GFP_NOFS);
3024 	if (!async)
3025 		return -ENOMEM;
3026 
3027 	async->root = fs_info->tree_root;
3028 	async->count = count;
3029 	async->error = 0;
3030 	async->transid = transid;
3031 	if (wait)
3032 		async->sync = 1;
3033 	else
3034 		async->sync = 0;
3035 	init_completion(&async->wait);
3036 
3037 	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
3038 			delayed_ref_async_start, NULL, NULL);
3039 
3040 	btrfs_queue_work(fs_info->extent_workers, &async->work);
3041 
3042 	if (wait) {
3043 		wait_for_completion(&async->wait);
3044 		ret = async->error;
3045 		kfree(async);
3046 		return ret;
3047 	}
3048 	return 0;
3049 }
3050 
3051 /*
3052  * this starts processing the delayed reference count updates and
3053  * extent insertions we have queued up so far.  count can be
3054  * 0, which means to process everything in the tree at the start
3055  * of the run (but not newly added entries), or it can be some target
3056  * number you'd like to process.
3057  *
3058  * Returns 0 on success or if called with an aborted transaction
3059  * Returns <0 on error and aborts the transaction
3060  */
3061 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3062 			   struct btrfs_fs_info *fs_info, unsigned long count)
3063 {
3064 	struct rb_node *node;
3065 	struct btrfs_delayed_ref_root *delayed_refs;
3066 	struct btrfs_delayed_ref_head *head;
3067 	int ret;
3068 	int run_all = count == (unsigned long)-1;
3069 	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
3070 
3071 	/* We'll clean this up in btrfs_cleanup_transaction */
3072 	if (trans->aborted)
3073 		return 0;
3074 
3075 	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
3076 		return 0;
3077 
3078 	delayed_refs = &trans->transaction->delayed_refs;
3079 	if (count == 0)
3080 		count = atomic_read(&delayed_refs->num_entries) * 2;
3081 
3082 again:
3083 #ifdef SCRAMBLE_DELAYED_REFS
3084 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
3085 #endif
3086 	trans->can_flush_pending_bgs = false;
3087 	ret = __btrfs_run_delayed_refs(trans, fs_info, count);
3088 	if (ret < 0) {
3089 		btrfs_abort_transaction(trans, ret);
3090 		return ret;
3091 	}
3092 
3093 	if (run_all) {
3094 		if (!list_empty(&trans->new_bgs))
3095 			btrfs_create_pending_block_groups(trans, fs_info);
3096 
3097 		spin_lock(&delayed_refs->lock);
3098 		node = rb_first(&delayed_refs->href_root);
3099 		if (!node) {
3100 			spin_unlock(&delayed_refs->lock);
3101 			goto out;
3102 		}
3103 
3104 		while (node) {
3105 			head = rb_entry(node, struct btrfs_delayed_ref_head,
3106 					href_node);
3107 			if (btrfs_delayed_ref_is_head(&head->node)) {
3108 				struct btrfs_delayed_ref_node *ref;
3109 
3110 				ref = &head->node;
3111 				refcount_inc(&ref->refs);
3112 
3113 				spin_unlock(&delayed_refs->lock);
3114 				/*
3115 				 * Mutex was contended, block until it's
3116 				 * released and try again
3117 				 */
3118 				mutex_lock(&head->mutex);
3119 				mutex_unlock(&head->mutex);
3120 
3121 				btrfs_put_delayed_ref(ref);
3122 				cond_resched();
3123 				goto again;
3124 			} else {
3125 				WARN_ON(1);
3126 			}
3127 			node = rb_next(node);
3128 		}
3129 		spin_unlock(&delayed_refs->lock);
3130 		cond_resched();
3131 		goto again;
3132 	}
3133 out:
3134 	trans->can_flush_pending_bgs = can_flush_pending_bgs;
3135 	return 0;
3136 }
3137 
3138 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3139 				struct btrfs_fs_info *fs_info,
3140 				u64 bytenr, u64 num_bytes, u64 flags,
3141 				int level, int is_data)
3142 {
3143 	struct btrfs_delayed_extent_op *extent_op;
3144 	int ret;
3145 
3146 	extent_op = btrfs_alloc_delayed_extent_op();
3147 	if (!extent_op)
3148 		return -ENOMEM;
3149 
3150 	extent_op->flags_to_set = flags;
3151 	extent_op->update_flags = true;
3152 	extent_op->update_key = false;
3153 	extent_op->is_data = is_data ? true : false;
3154 	extent_op->level = level;
3155 
3156 	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3157 					  num_bytes, extent_op);
3158 	if (ret)
3159 		btrfs_free_delayed_extent_op(extent_op);
3160 	return ret;
3161 }
3162 
3163 static noinline int check_delayed_ref(struct btrfs_root *root,
3164 				      struct btrfs_path *path,
3165 				      u64 objectid, u64 offset, u64 bytenr)
3166 {
3167 	struct btrfs_delayed_ref_head *head;
3168 	struct btrfs_delayed_ref_node *ref;
3169 	struct btrfs_delayed_data_ref *data_ref;
3170 	struct btrfs_delayed_ref_root *delayed_refs;
3171 	struct btrfs_transaction *cur_trans;
3172 	int ret = 0;
3173 
3174 	cur_trans = root->fs_info->running_transaction;
3175 	if (!cur_trans)
3176 		return 0;
3177 
3178 	delayed_refs = &cur_trans->delayed_refs;
3179 	spin_lock(&delayed_refs->lock);
3180 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3181 	if (!head) {
3182 		spin_unlock(&delayed_refs->lock);
3183 		return 0;
3184 	}
3185 
3186 	if (!mutex_trylock(&head->mutex)) {
3187 		refcount_inc(&head->node.refs);
3188 		spin_unlock(&delayed_refs->lock);
3189 
3190 		btrfs_release_path(path);
3191 
3192 		/*
3193 		 * Mutex was contended, block until it's released and let
3194 		 * caller try again
3195 		 */
3196 		mutex_lock(&head->mutex);
3197 		mutex_unlock(&head->mutex);
3198 		btrfs_put_delayed_ref(&head->node);
3199 		return -EAGAIN;
3200 	}
3201 	spin_unlock(&delayed_refs->lock);
3202 
3203 	spin_lock(&head->lock);
3204 	list_for_each_entry(ref, &head->ref_list, list) {
3205 		/* If it's a shared ref we know a cross reference exists */
3206 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3207 			ret = 1;
3208 			break;
3209 		}
3210 
3211 		data_ref = btrfs_delayed_node_to_data_ref(ref);
3212 
3213 		/*
3214 		 * If our ref doesn't match the one we're currently looking at
3215 		 * then we have a cross reference.
3216 		 */
3217 		if (data_ref->root != root->root_key.objectid ||
3218 		    data_ref->objectid != objectid ||
3219 		    data_ref->offset != offset) {
3220 			ret = 1;
3221 			break;
3222 		}
3223 	}
3224 	spin_unlock(&head->lock);
3225 	mutex_unlock(&head->mutex);
3226 	return ret;
3227 }
3228 
3229 static noinline int check_committed_ref(struct btrfs_root *root,
3230 					struct btrfs_path *path,
3231 					u64 objectid, u64 offset, u64 bytenr)
3232 {
3233 	struct btrfs_fs_info *fs_info = root->fs_info;
3234 	struct btrfs_root *extent_root = fs_info->extent_root;
3235 	struct extent_buffer *leaf;
3236 	struct btrfs_extent_data_ref *ref;
3237 	struct btrfs_extent_inline_ref *iref;
3238 	struct btrfs_extent_item *ei;
3239 	struct btrfs_key key;
3240 	u32 item_size;
3241 	int type;
3242 	int ret;
3243 
3244 	key.objectid = bytenr;
3245 	key.offset = (u64)-1;
3246 	key.type = BTRFS_EXTENT_ITEM_KEY;
3247 
3248 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3249 	if (ret < 0)
3250 		goto out;
3251 	BUG_ON(ret == 0); /* Corruption */
3252 
3253 	ret = -ENOENT;
3254 	if (path->slots[0] == 0)
3255 		goto out;
3256 
3257 	path->slots[0]--;
3258 	leaf = path->nodes[0];
3259 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3260 
3261 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3262 		goto out;
3263 
3264 	ret = 1;
3265 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3266 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3267 	if (item_size < sizeof(*ei)) {
3268 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3269 		goto out;
3270 	}
3271 #endif
3272 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3273 
3274 	if (item_size != sizeof(*ei) +
3275 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3276 		goto out;
3277 
3278 	if (btrfs_extent_generation(leaf, ei) <=
3279 	    btrfs_root_last_snapshot(&root->root_item))
3280 		goto out;
3281 
3282 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3283 
3284 	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3285 	if (type != BTRFS_EXTENT_DATA_REF_KEY)
3286 		goto out;
3287 
3288 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3289 	if (btrfs_extent_refs(leaf, ei) !=
3290 	    btrfs_extent_data_ref_count(leaf, ref) ||
3291 	    btrfs_extent_data_ref_root(leaf, ref) !=
3292 	    root->root_key.objectid ||
3293 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3294 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
3295 		goto out;
3296 
3297 	ret = 0;
3298 out:
3299 	return ret;
3300 }
3301 
3302 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3303 			  u64 bytenr)
3304 {
3305 	struct btrfs_path *path;
3306 	int ret;
3307 	int ret2;
3308 
3309 	path = btrfs_alloc_path();
3310 	if (!path)
3311 		return -ENOENT;
3312 
3313 	do {
3314 		ret = check_committed_ref(root, path, objectid,
3315 					  offset, bytenr);
3316 		if (ret && ret != -ENOENT)
3317 			goto out;
3318 
3319 		ret2 = check_delayed_ref(root, path, objectid,
3320 					 offset, bytenr);
3321 	} while (ret2 == -EAGAIN);
3322 
3323 	if (ret2 && ret2 != -ENOENT) {
3324 		ret = ret2;
3325 		goto out;
3326 	}
3327 
3328 	if (ret != -ENOENT || ret2 != -ENOENT)
3329 		ret = 0;
3330 out:
3331 	btrfs_free_path(path);
3332 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3333 		WARN_ON(ret > 0);
3334 	return ret;
3335 }
3336 
3337 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3338 			   struct btrfs_root *root,
3339 			   struct extent_buffer *buf,
3340 			   int full_backref, int inc)
3341 {
3342 	struct btrfs_fs_info *fs_info = root->fs_info;
3343 	u64 bytenr;
3344 	u64 num_bytes;
3345 	u64 parent;
3346 	u64 ref_root;
3347 	u32 nritems;
3348 	struct btrfs_key key;
3349 	struct btrfs_file_extent_item *fi;
3350 	int i;
3351 	int level;
3352 	int ret = 0;
3353 	int (*process_func)(struct btrfs_trans_handle *,
3354 			    struct btrfs_fs_info *,
3355 			    u64, u64, u64, u64, u64, u64);
3356 
3357 
3358 	if (btrfs_is_testing(fs_info))
3359 		return 0;
3360 
3361 	ref_root = btrfs_header_owner(buf);
3362 	nritems = btrfs_header_nritems(buf);
3363 	level = btrfs_header_level(buf);
3364 
3365 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3366 		return 0;
3367 
3368 	if (inc)
3369 		process_func = btrfs_inc_extent_ref;
3370 	else
3371 		process_func = btrfs_free_extent;
3372 
3373 	if (full_backref)
3374 		parent = buf->start;
3375 	else
3376 		parent = 0;
3377 
3378 	for (i = 0; i < nritems; i++) {
3379 		if (level == 0) {
3380 			btrfs_item_key_to_cpu(buf, &key, i);
3381 			if (key.type != BTRFS_EXTENT_DATA_KEY)
3382 				continue;
3383 			fi = btrfs_item_ptr(buf, i,
3384 					    struct btrfs_file_extent_item);
3385 			if (btrfs_file_extent_type(buf, fi) ==
3386 			    BTRFS_FILE_EXTENT_INLINE)
3387 				continue;
3388 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3389 			if (bytenr == 0)
3390 				continue;
3391 
3392 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3393 			key.offset -= btrfs_file_extent_offset(buf, fi);
3394 			ret = process_func(trans, fs_info, bytenr, num_bytes,
3395 					   parent, ref_root, key.objectid,
3396 					   key.offset);
3397 			if (ret)
3398 				goto fail;
3399 		} else {
3400 			bytenr = btrfs_node_blockptr(buf, i);
3401 			num_bytes = fs_info->nodesize;
3402 			ret = process_func(trans, fs_info, bytenr, num_bytes,
3403 					   parent, ref_root, level - 1, 0);
3404 			if (ret)
3405 				goto fail;
3406 		}
3407 	}
3408 	return 0;
3409 fail:
3410 	return ret;
3411 }
3412 
/* Add one reference to everything @buf points to, see __btrfs_mod_ref(). */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 1;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3418 
/* Drop one reference from everything @buf points to, see __btrfs_mod_ref(). */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 0;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3424 
3425 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3426 				 struct btrfs_fs_info *fs_info,
3427 				 struct btrfs_path *path,
3428 				 struct btrfs_block_group_cache *cache)
3429 {
3430 	int ret;
3431 	struct btrfs_root *extent_root = fs_info->extent_root;
3432 	unsigned long bi;
3433 	struct extent_buffer *leaf;
3434 
3435 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3436 	if (ret) {
3437 		if (ret > 0)
3438 			ret = -ENOENT;
3439 		goto fail;
3440 	}
3441 
3442 	leaf = path->nodes[0];
3443 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3444 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3445 	btrfs_mark_buffer_dirty(leaf);
3446 fail:
3447 	btrfs_release_path(path);
3448 	return ret;
3449 
3450 }
3451 
3452 static struct btrfs_block_group_cache *
3453 next_block_group(struct btrfs_fs_info *fs_info,
3454 		 struct btrfs_block_group_cache *cache)
3455 {
3456 	struct rb_node *node;
3457 
3458 	spin_lock(&fs_info->block_group_cache_lock);
3459 
3460 	/* If our block group was removed, we need a full search. */
3461 	if (RB_EMPTY_NODE(&cache->cache_node)) {
3462 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3463 
3464 		spin_unlock(&fs_info->block_group_cache_lock);
3465 		btrfs_put_block_group(cache);
3466 		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3467 	}
3468 	node = rb_next(&cache->cache_node);
3469 	btrfs_put_block_group(cache);
3470 	if (node) {
3471 		cache = rb_entry(node, struct btrfs_block_group_cache,
3472 				 cache_node);
3473 		btrfs_get_block_group(cache);
3474 	} else
3475 		cache = NULL;
3476 	spin_unlock(&fs_info->block_group_cache_lock);
3477 	return cache;
3478 }
3479 
3480 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3481 			    struct btrfs_trans_handle *trans,
3482 			    struct btrfs_path *path)
3483 {
3484 	struct btrfs_fs_info *fs_info = block_group->fs_info;
3485 	struct btrfs_root *root = fs_info->tree_root;
3486 	struct inode *inode = NULL;
3487 	struct extent_changeset *data_reserved = NULL;
3488 	u64 alloc_hint = 0;
3489 	int dcs = BTRFS_DC_ERROR;
3490 	u64 num_pages = 0;
3491 	int retries = 0;
3492 	int ret = 0;
3493 
3494 	/*
3495 	 * If this block group is smaller than 100 megs don't bother caching the
3496 	 * block group.
3497 	 */
3498 	if (block_group->key.offset < (100 * SZ_1M)) {
3499 		spin_lock(&block_group->lock);
3500 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3501 		spin_unlock(&block_group->lock);
3502 		return 0;
3503 	}
3504 
3505 	if (trans->aborted)
3506 		return 0;
3507 again:
3508 	inode = lookup_free_space_inode(fs_info, block_group, path);
3509 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3510 		ret = PTR_ERR(inode);
3511 		btrfs_release_path(path);
3512 		goto out;
3513 	}
3514 
3515 	if (IS_ERR(inode)) {
3516 		BUG_ON(retries);
3517 		retries++;
3518 
3519 		if (block_group->ro)
3520 			goto out_free;
3521 
3522 		ret = create_free_space_inode(fs_info, trans, block_group,
3523 					      path);
3524 		if (ret)
3525 			goto out_free;
3526 		goto again;
3527 	}
3528 
3529 	/* We've already setup this transaction, go ahead and exit */
3530 	if (block_group->cache_generation == trans->transid &&
3531 	    i_size_read(inode)) {
3532 		dcs = BTRFS_DC_SETUP;
3533 		goto out_put;
3534 	}
3535 
3536 	/*
3537 	 * We want to set the generation to 0, that way if anything goes wrong
3538 	 * from here on out we know not to trust this cache when we load up next
3539 	 * time.
3540 	 */
3541 	BTRFS_I(inode)->generation = 0;
3542 	ret = btrfs_update_inode(trans, root, inode);
3543 	if (ret) {
3544 		/*
3545 		 * So theoretically we could recover from this, simply set the
3546 		 * super cache generation to 0 so we know to invalidate the
3547 		 * cache, but then we'd have to keep track of the block groups
3548 		 * that fail this way so we know we _have_ to reset this cache
3549 		 * before the next commit or risk reading stale cache.  So to
3550 		 * limit our exposure to horrible edge cases lets just abort the
3551 		 * transaction, this only happens in really bad situations
3552 		 * anyway.
3553 		 */
3554 		btrfs_abort_transaction(trans, ret);
3555 		goto out_put;
3556 	}
3557 	WARN_ON(ret);
3558 
3559 	if (i_size_read(inode) > 0) {
3560 		ret = btrfs_check_trunc_cache_free_space(fs_info,
3561 					&fs_info->global_block_rsv);
3562 		if (ret)
3563 			goto out_put;
3564 
3565 		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3566 		if (ret)
3567 			goto out_put;
3568 	}
3569 
3570 	spin_lock(&block_group->lock);
3571 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3572 	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3573 		/*
3574 		 * don't bother trying to write stuff out _if_
3575 		 * a) we're not cached,
3576 		 * b) we're with nospace_cache mount option,
3577 		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
3578 		 */
3579 		dcs = BTRFS_DC_WRITTEN;
3580 		spin_unlock(&block_group->lock);
3581 		goto out_put;
3582 	}
3583 	spin_unlock(&block_group->lock);
3584 
3585 	/*
3586 	 * We hit an ENOSPC when setting up the cache in this transaction, just
3587 	 * skip doing the setup, we've already cleared the cache so we're safe.
3588 	 */
3589 	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3590 		ret = -ENOSPC;
3591 		goto out_put;
3592 	}
3593 
3594 	/*
3595 	 * Try to preallocate enough space based on how big the block group is.
3596 	 * Keep in mind this has to include any pinned space which could end up
3597 	 * taking up quite a bit since it's not folded into the other space
3598 	 * cache.
3599 	 */
3600 	num_pages = div_u64(block_group->key.offset, SZ_256M);
3601 	if (!num_pages)
3602 		num_pages = 1;
3603 
3604 	num_pages *= 16;
3605 	num_pages *= PAGE_SIZE;
3606 
3607 	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3608 	if (ret)
3609 		goto out_put;
3610 
3611 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3612 					      num_pages, num_pages,
3613 					      &alloc_hint);
3614 	/*
3615 	 * Our cache requires contiguous chunks so that we don't modify a bunch
3616 	 * of metadata or split extents when writing the cache out, which means
3617 	 * we can enospc if we are heavily fragmented in addition to just normal
3618 	 * out of space conditions.  So if we hit this just skip setting up any
3619 	 * other block groups for this transaction, maybe we'll unpin enough
3620 	 * space the next time around.
3621 	 */
3622 	if (!ret)
3623 		dcs = BTRFS_DC_SETUP;
3624 	else if (ret == -ENOSPC)
3625 		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3626 
3627 out_put:
3628 	iput(inode);
3629 out_free:
3630 	btrfs_release_path(path);
3631 out:
3632 	spin_lock(&block_group->lock);
3633 	if (!ret && dcs == BTRFS_DC_SETUP)
3634 		block_group->cache_generation = trans->transid;
3635 	block_group->disk_cache_state = dcs;
3636 	spin_unlock(&block_group->lock);
3637 
3638 	extent_changeset_free(data_reserved);
3639 	return ret;
3640 }
3641 
3642 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3643 			    struct btrfs_fs_info *fs_info)
3644 {
3645 	struct btrfs_block_group_cache *cache, *tmp;
3646 	struct btrfs_transaction *cur_trans = trans->transaction;
3647 	struct btrfs_path *path;
3648 
3649 	if (list_empty(&cur_trans->dirty_bgs) ||
3650 	    !btrfs_test_opt(fs_info, SPACE_CACHE))
3651 		return 0;
3652 
3653 	path = btrfs_alloc_path();
3654 	if (!path)
3655 		return -ENOMEM;
3656 
3657 	/* Could add new block groups, use _safe just in case */
3658 	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3659 				 dirty_list) {
3660 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3661 			cache_save_setup(cache, trans, path);
3662 	}
3663 
3664 	btrfs_free_path(path);
3665 	return 0;
3666 }
3667 
3668 /*
3669  * transaction commit does final block group cache writeback during a
3670  * critical section where nothing is allowed to change the FS.  This is
3671  * required in order for the cache to actually match the block group,
3672  * but can introduce a lot of latency into the commit.
3673  *
3674  * So, btrfs_start_dirty_block_groups is here to kick off block group
3675  * cache IO.  There's a chance we'll have to redo some of it if the
3676  * block group changes again during the commit, but it greatly reduces
3677  * the commit latency by getting rid of the easy block groups while
3678  * we're still allowing others to join the commit.
3679  */
3680 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3681 				   struct btrfs_fs_info *fs_info)
3682 {
3683 	struct btrfs_block_group_cache *cache;
3684 	struct btrfs_transaction *cur_trans = trans->transaction;
3685 	int ret = 0;
3686 	int should_put;
3687 	struct btrfs_path *path = NULL;
3688 	LIST_HEAD(dirty);
3689 	struct list_head *io = &cur_trans->io_bgs;
3690 	int num_started = 0;
3691 	int loops = 0;
3692 
3693 	spin_lock(&cur_trans->dirty_bgs_lock);
3694 	if (list_empty(&cur_trans->dirty_bgs)) {
3695 		spin_unlock(&cur_trans->dirty_bgs_lock);
3696 		return 0;
3697 	}
3698 	list_splice_init(&cur_trans->dirty_bgs, &dirty);
3699 	spin_unlock(&cur_trans->dirty_bgs_lock);
3700 
3701 again:
3702 	/*
3703 	 * make sure all the block groups on our dirty list actually
3704 	 * exist
3705 	 */
3706 	btrfs_create_pending_block_groups(trans, fs_info);
3707 
3708 	if (!path) {
3709 		path = btrfs_alloc_path();
3710 		if (!path)
3711 			return -ENOMEM;
3712 	}
3713 
3714 	/*
3715 	 * cache_write_mutex is here only to save us from balance or automatic
3716 	 * removal of empty block groups deleting this block group while we are
3717 	 * writing out the cache
3718 	 */
3719 	mutex_lock(&trans->transaction->cache_write_mutex);
3720 	while (!list_empty(&dirty)) {
3721 		cache = list_first_entry(&dirty,
3722 					 struct btrfs_block_group_cache,
3723 					 dirty_list);
3724 		/*
3725 		 * this can happen if something re-dirties a block
3726 		 * group that is already under IO.  Just wait for it to
3727 		 * finish and then do it all again
3728 		 */
3729 		if (!list_empty(&cache->io_list)) {
3730 			list_del_init(&cache->io_list);
3731 			btrfs_wait_cache_io(trans, cache, path);
3732 			btrfs_put_block_group(cache);
3733 		}
3734 
3735 
3736 		/*
3737 		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3738 		 * if it should update the cache_state.  Don't delete
3739 		 * until after we wait.
3740 		 *
3741 		 * Since we're not running in the commit critical section
3742 		 * we need the dirty_bgs_lock to protect from update_block_group
3743 		 */
3744 		spin_lock(&cur_trans->dirty_bgs_lock);
3745 		list_del_init(&cache->dirty_list);
3746 		spin_unlock(&cur_trans->dirty_bgs_lock);
3747 
3748 		should_put = 1;
3749 
3750 		cache_save_setup(cache, trans, path);
3751 
3752 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3753 			cache->io_ctl.inode = NULL;
3754 			ret = btrfs_write_out_cache(fs_info, trans,
3755 						    cache, path);
3756 			if (ret == 0 && cache->io_ctl.inode) {
3757 				num_started++;
3758 				should_put = 0;
3759 
3760 				/*
3761 				 * the cache_write_mutex is protecting
3762 				 * the io_list
3763 				 */
3764 				list_add_tail(&cache->io_list, io);
3765 			} else {
3766 				/*
3767 				 * if we failed to write the cache, the
3768 				 * generation will be bad and life goes on
3769 				 */
3770 				ret = 0;
3771 			}
3772 		}
3773 		if (!ret) {
3774 			ret = write_one_cache_group(trans, fs_info,
3775 						    path, cache);
3776 			/*
3777 			 * Our block group might still be attached to the list
3778 			 * of new block groups in the transaction handle of some
3779 			 * other task (struct btrfs_trans_handle->new_bgs). This
3780 			 * means its block group item isn't yet in the extent
3781 			 * tree. If this happens ignore the error, as we will
3782 			 * try again later in the critical section of the
3783 			 * transaction commit.
3784 			 */
3785 			if (ret == -ENOENT) {
3786 				ret = 0;
3787 				spin_lock(&cur_trans->dirty_bgs_lock);
3788 				if (list_empty(&cache->dirty_list)) {
3789 					list_add_tail(&cache->dirty_list,
3790 						      &cur_trans->dirty_bgs);
3791 					btrfs_get_block_group(cache);
3792 				}
3793 				spin_unlock(&cur_trans->dirty_bgs_lock);
3794 			} else if (ret) {
3795 				btrfs_abort_transaction(trans, ret);
3796 			}
3797 		}
3798 
3799 		/* if its not on the io list, we need to put the block group */
3800 		if (should_put)
3801 			btrfs_put_block_group(cache);
3802 
3803 		if (ret)
3804 			break;
3805 
3806 		/*
3807 		 * Avoid blocking other tasks for too long. It might even save
3808 		 * us from writing caches for block groups that are going to be
3809 		 * removed.
3810 		 */
3811 		mutex_unlock(&trans->transaction->cache_write_mutex);
3812 		mutex_lock(&trans->transaction->cache_write_mutex);
3813 	}
3814 	mutex_unlock(&trans->transaction->cache_write_mutex);
3815 
3816 	/*
3817 	 * go through delayed refs for all the stuff we've just kicked off
3818 	 * and then loop back (just once)
3819 	 */
3820 	ret = btrfs_run_delayed_refs(trans, fs_info, 0);
3821 	if (!ret && loops == 0) {
3822 		loops++;
3823 		spin_lock(&cur_trans->dirty_bgs_lock);
3824 		list_splice_init(&cur_trans->dirty_bgs, &dirty);
3825 		/*
3826 		 * dirty_bgs_lock protects us from concurrent block group
3827 		 * deletes too (not just cache_write_mutex).
3828 		 */
3829 		if (!list_empty(&dirty)) {
3830 			spin_unlock(&cur_trans->dirty_bgs_lock);
3831 			goto again;
3832 		}
3833 		spin_unlock(&cur_trans->dirty_bgs_lock);
3834 	} else if (ret < 0) {
3835 		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3836 	}
3837 
3838 	btrfs_free_path(path);
3839 	return ret;
3840 }
3841 
3842 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3843 				   struct btrfs_fs_info *fs_info)
3844 {
3845 	struct btrfs_block_group_cache *cache;
3846 	struct btrfs_transaction *cur_trans = trans->transaction;
3847 	int ret = 0;
3848 	int should_put;
3849 	struct btrfs_path *path;
3850 	struct list_head *io = &cur_trans->io_bgs;
3851 	int num_started = 0;
3852 
3853 	path = btrfs_alloc_path();
3854 	if (!path)
3855 		return -ENOMEM;
3856 
3857 	/*
3858 	 * Even though we are in the critical section of the transaction commit,
3859 	 * we can still have concurrent tasks adding elements to this
3860 	 * transaction's list of dirty block groups. These tasks correspond to
3861 	 * endio free space workers started when writeback finishes for a
3862 	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3863 	 * allocate new block groups as a result of COWing nodes of the root
3864 	 * tree when updating the free space inode. The writeback for the space
3865 	 * caches is triggered by an earlier call to
3866 	 * btrfs_start_dirty_block_groups() and iterations of the following
3867 	 * loop.
3868 	 * Also we want to do the cache_save_setup first and then run the
3869 	 * delayed refs to make sure we have the best chance at doing this all
3870 	 * in one shot.
3871 	 */
3872 	spin_lock(&cur_trans->dirty_bgs_lock);
3873 	while (!list_empty(&cur_trans->dirty_bgs)) {
3874 		cache = list_first_entry(&cur_trans->dirty_bgs,
3875 					 struct btrfs_block_group_cache,
3876 					 dirty_list);
3877 
3878 		/*
3879 		 * this can happen if cache_save_setup re-dirties a block
3880 		 * group that is already under IO.  Just wait for it to
3881 		 * finish and then do it all again
3882 		 */
3883 		if (!list_empty(&cache->io_list)) {
3884 			spin_unlock(&cur_trans->dirty_bgs_lock);
3885 			list_del_init(&cache->io_list);
3886 			btrfs_wait_cache_io(trans, cache, path);
3887 			btrfs_put_block_group(cache);
3888 			spin_lock(&cur_trans->dirty_bgs_lock);
3889 		}
3890 
3891 		/*
3892 		 * don't remove from the dirty list until after we've waited
3893 		 * on any pending IO
3894 		 */
3895 		list_del_init(&cache->dirty_list);
3896 		spin_unlock(&cur_trans->dirty_bgs_lock);
3897 		should_put = 1;
3898 
3899 		cache_save_setup(cache, trans, path);
3900 
3901 		if (!ret)
3902 			ret = btrfs_run_delayed_refs(trans, fs_info,
3903 						     (unsigned long) -1);
3904 
3905 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3906 			cache->io_ctl.inode = NULL;
3907 			ret = btrfs_write_out_cache(fs_info, trans,
3908 						    cache, path);
3909 			if (ret == 0 && cache->io_ctl.inode) {
3910 				num_started++;
3911 				should_put = 0;
3912 				list_add_tail(&cache->io_list, io);
3913 			} else {
3914 				/*
3915 				 * if we failed to write the cache, the
3916 				 * generation will be bad and life goes on
3917 				 */
3918 				ret = 0;
3919 			}
3920 		}
3921 		if (!ret) {
3922 			ret = write_one_cache_group(trans, fs_info,
3923 						    path, cache);
3924 			/*
3925 			 * One of the free space endio workers might have
3926 			 * created a new block group while updating a free space
3927 			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3928 			 * and hasn't released its transaction handle yet, in
3929 			 * which case the new block group is still attached to
3930 			 * its transaction handle and its creation has not
3931 			 * finished yet (no block group item in the extent tree
3932 			 * yet, etc). If this is the case, wait for all free
3933 			 * space endio workers to finish and retry. This is a
3934 			 * a very rare case so no need for a more efficient and
3935 			 * complex approach.
3936 			 */
3937 			if (ret == -ENOENT) {
3938 				wait_event(cur_trans->writer_wait,
3939 				   atomic_read(&cur_trans->num_writers) == 1);
3940 				ret = write_one_cache_group(trans, fs_info,
3941 							    path, cache);
3942 			}
3943 			if (ret)
3944 				btrfs_abort_transaction(trans, ret);
3945 		}
3946 
3947 		/* if its not on the io list, we need to put the block group */
3948 		if (should_put)
3949 			btrfs_put_block_group(cache);
3950 		spin_lock(&cur_trans->dirty_bgs_lock);
3951 	}
3952 	spin_unlock(&cur_trans->dirty_bgs_lock);
3953 
3954 	while (!list_empty(io)) {
3955 		cache = list_first_entry(io, struct btrfs_block_group_cache,
3956 					 io_list);
3957 		list_del_init(&cache->io_list);
3958 		btrfs_wait_cache_io(trans, cache, path);
3959 		btrfs_put_block_group(cache);
3960 	}
3961 
3962 	btrfs_free_path(path);
3963 	return ret;
3964 }
3965 
3966 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3967 {
3968 	struct btrfs_block_group_cache *block_group;
3969 	int readonly = 0;
3970 
3971 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
3972 	if (!block_group || block_group->ro)
3973 		readonly = 1;
3974 	if (block_group)
3975 		btrfs_put_block_group(block_group);
3976 	return readonly;
3977 }
3978 
3979 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3980 {
3981 	struct btrfs_block_group_cache *bg;
3982 	bool ret = true;
3983 
3984 	bg = btrfs_lookup_block_group(fs_info, bytenr);
3985 	if (!bg)
3986 		return false;
3987 
3988 	spin_lock(&bg->lock);
3989 	if (bg->ro)
3990 		ret = false;
3991 	else
3992 		atomic_inc(&bg->nocow_writers);
3993 	spin_unlock(&bg->lock);
3994 
3995 	/* no put on block group, done by btrfs_dec_nocow_writers */
3996 	if (!ret)
3997 		btrfs_put_block_group(bg);
3998 
3999 	return ret;
4000 
4001 }
4002 
4003 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
4004 {
4005 	struct btrfs_block_group_cache *bg;
4006 
4007 	bg = btrfs_lookup_block_group(fs_info, bytenr);
4008 	ASSERT(bg);
4009 	if (atomic_dec_and_test(&bg->nocow_writers))
4010 		wake_up_atomic_t(&bg->nocow_writers);
4011 	/*
4012 	 * Once for our lookup and once for the lookup done by a previous call
4013 	 * to btrfs_inc_nocow_writers()
4014 	 */
4015 	btrfs_put_block_group(bg);
4016 	btrfs_put_block_group(bg);
4017 }
4018 
4019 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
4020 {
4021 	schedule();
4022 	return 0;
4023 }
4024 
4025 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
4026 {
4027 	wait_on_atomic_t(&bg->nocow_writers,
4028 			 btrfs_wait_nocow_writers_atomic_t,
4029 			 TASK_UNINTERRUPTIBLE);
4030 }
4031 
4032 static const char *alloc_name(u64 flags)
4033 {
4034 	switch (flags) {
4035 	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
4036 		return "mixed";
4037 	case BTRFS_BLOCK_GROUP_METADATA:
4038 		return "metadata";
4039 	case BTRFS_BLOCK_GROUP_DATA:
4040 		return "data";
4041 	case BTRFS_BLOCK_GROUP_SYSTEM:
4042 		return "system";
4043 	default:
4044 		WARN_ON(1);
4045 		return "invalid-combination";
4046 	};
4047 }
4048 
4049 static int create_space_info(struct btrfs_fs_info *info, u64 flags,
4050 			     struct btrfs_space_info **new)
4051 {
4052 
4053 	struct btrfs_space_info *space_info;
4054 	int i;
4055 	int ret;
4056 
4057 	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
4058 	if (!space_info)
4059 		return -ENOMEM;
4060 
4061 	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
4062 				 GFP_KERNEL);
4063 	if (ret) {
4064 		kfree(space_info);
4065 		return ret;
4066 	}
4067 
4068 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
4069 		INIT_LIST_HEAD(&space_info->block_groups[i]);
4070 	init_rwsem(&space_info->groups_sem);
4071 	spin_lock_init(&space_info->lock);
4072 	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
4073 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4074 	init_waitqueue_head(&space_info->wait);
4075 	INIT_LIST_HEAD(&space_info->ro_bgs);
4076 	INIT_LIST_HEAD(&space_info->tickets);
4077 	INIT_LIST_HEAD(&space_info->priority_tickets);
4078 
4079 	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
4080 				    info->space_info_kobj, "%s",
4081 				    alloc_name(space_info->flags));
4082 	if (ret) {
4083 		percpu_counter_destroy(&space_info->total_bytes_pinned);
4084 		kfree(space_info);
4085 		return ret;
4086 	}
4087 
4088 	*new = space_info;
4089 	list_add_rcu(&space_info->list, &info->space_info);
4090 	if (flags & BTRFS_BLOCK_GROUP_DATA)
4091 		info->data_sinfo = space_info;
4092 
4093 	return ret;
4094 }
4095 
4096 static void update_space_info(struct btrfs_fs_info *info, u64 flags,
4097 			     u64 total_bytes, u64 bytes_used,
4098 			     u64 bytes_readonly,
4099 			     struct btrfs_space_info **space_info)
4100 {
4101 	struct btrfs_space_info *found;
4102 	int factor;
4103 
4104 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
4105 		     BTRFS_BLOCK_GROUP_RAID10))
4106 		factor = 2;
4107 	else
4108 		factor = 1;
4109 
4110 	found = __find_space_info(info, flags);
4111 	ASSERT(found);
4112 	spin_lock(&found->lock);
4113 	found->total_bytes += total_bytes;
4114 	found->disk_total += total_bytes * factor;
4115 	found->bytes_used += bytes_used;
4116 	found->disk_used += bytes_used * factor;
4117 	found->bytes_readonly += bytes_readonly;
4118 	if (total_bytes > 0)
4119 		found->full = 0;
4120 	space_info_add_new_bytes(info, found, total_bytes -
4121 				 bytes_used - bytes_readonly);
4122 	spin_unlock(&found->lock);
4123 	*space_info = found;
4124 }
4125 
4126 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4127 {
4128 	u64 extra_flags = chunk_to_extended(flags) &
4129 				BTRFS_EXTENDED_PROFILE_MASK;
4130 
4131 	write_seqlock(&fs_info->profiles_lock);
4132 	if (flags & BTRFS_BLOCK_GROUP_DATA)
4133 		fs_info->avail_data_alloc_bits |= extra_flags;
4134 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
4135 		fs_info->avail_metadata_alloc_bits |= extra_flags;
4136 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4137 		fs_info->avail_system_alloc_bits |= extra_flags;
4138 	write_sequnlock(&fs_info->profiles_lock);
4139 }
4140 
4141 /*
4142  * returns target flags in extended format or 0 if restripe for this
4143  * chunk_type is not in progress
4144  *
4145  * should be called with either volume_mutex or balance_lock held
4146  */
4147 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4148 {
4149 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4150 	u64 target = 0;
4151 
4152 	if (!bctl)
4153 		return 0;
4154 
4155 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
4156 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4157 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4158 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4159 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4160 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4161 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4162 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4163 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4164 	}
4165 
4166 	return target;
4167 }
4168 
4169 /*
4170  * @flags: available profiles in extended format (see ctree.h)
4171  *
4172  * Returns reduced profile in chunk format.  If profile changing is in
4173  * progress (either running or paused) picks the target profile (if it's
4174  * already available), otherwise falls back to plain reducing.
4175  */
4176 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4177 {
4178 	u64 num_devices = fs_info->fs_devices->rw_devices;
4179 	u64 target;
4180 	u64 raid_type;
4181 	u64 allowed = 0;
4182 
4183 	/*
4184 	 * see if restripe for this chunk_type is in progress, if so
4185 	 * try to reduce to the target profile
4186 	 */
4187 	spin_lock(&fs_info->balance_lock);
4188 	target = get_restripe_target(fs_info, flags);
4189 	if (target) {
4190 		/* pick target profile only if it's already available */
4191 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4192 			spin_unlock(&fs_info->balance_lock);
4193 			return extended_to_chunk(target);
4194 		}
4195 	}
4196 	spin_unlock(&fs_info->balance_lock);
4197 
4198 	/* First, mask out the RAID levels which aren't possible */
4199 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4200 		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4201 			allowed |= btrfs_raid_group[raid_type];
4202 	}
4203 	allowed &= flags;
4204 
4205 	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4206 		allowed = BTRFS_BLOCK_GROUP_RAID6;
4207 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4208 		allowed = BTRFS_BLOCK_GROUP_RAID5;
4209 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4210 		allowed = BTRFS_BLOCK_GROUP_RAID10;
4211 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4212 		allowed = BTRFS_BLOCK_GROUP_RAID1;
4213 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4214 		allowed = BTRFS_BLOCK_GROUP_RAID0;
4215 
4216 	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4217 
4218 	return extended_to_chunk(flags | allowed);
4219 }
4220 
4221 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4222 {
4223 	unsigned seq;
4224 	u64 flags;
4225 
4226 	do {
4227 		flags = orig_flags;
4228 		seq = read_seqbegin(&fs_info->profiles_lock);
4229 
4230 		if (flags & BTRFS_BLOCK_GROUP_DATA)
4231 			flags |= fs_info->avail_data_alloc_bits;
4232 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4233 			flags |= fs_info->avail_system_alloc_bits;
4234 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4235 			flags |= fs_info->avail_metadata_alloc_bits;
4236 	} while (read_seqretry(&fs_info->profiles_lock, seq));
4237 
4238 	return btrfs_reduce_alloc_profile(fs_info, flags);
4239 }
4240 
4241 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4242 {
4243 	struct btrfs_fs_info *fs_info = root->fs_info;
4244 	u64 flags;
4245 	u64 ret;
4246 
4247 	if (data)
4248 		flags = BTRFS_BLOCK_GROUP_DATA;
4249 	else if (root == fs_info->chunk_root)
4250 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
4251 	else
4252 		flags = BTRFS_BLOCK_GROUP_METADATA;
4253 
4254 	ret = get_alloc_profile(fs_info, flags);
4255 	return ret;
4256 }
4257 
4258 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4259 {
4260 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4261 }
4262 
4263 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4264 {
4265 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4266 }
4267 
4268 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4269 {
4270 	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4271 }
4272 
4273 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4274 				 bool may_use_included)
4275 {
4276 	ASSERT(s_info);
4277 	return s_info->bytes_used + s_info->bytes_reserved +
4278 		s_info->bytes_pinned + s_info->bytes_readonly +
4279 		(may_use_included ? s_info->bytes_may_use : 0);
4280 }
4281 
/*
 * Reserve @bytes (rounded up to the sector size) of data space for @inode by
 * adding them to the data space_info's bytes_may_use.
 *
 * If the reservation does not fit into the data space_info, the function
 * tries, in this order:
 *   - to allocate a new data chunk, as long as the space_info is not marked
 *     full;
 *   - to commit the current transaction (at most need_commit times), on the
 *     first attempt after having flushed delalloc and waited for ordered
 *     extents.
 * After each successful step the check is redone from the "again" label.
 *
 * For a free space inode no commit is ever attempted (need_commit == 0) and
 * the caller is expected to be inside a transaction.
 *
 * Returns 0 on success, -ENOSPC if the space could not be found, or another
 * negative errno coming from joining/committing a transaction or from the
 * chunk allocation.
 */
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	u64 used;
	int ret = 0;
	/* number of transaction commit attempts we are still allowed */
	int need_commit = 2;
	int have_pinned_space;

	/* make sure bytes are sectorsize aligned */
	bytes = ALIGN(bytes, fs_info->sectorsize);

	if (btrfs_is_free_space_inode(inode)) {
		need_commit = 0;
		ASSERT(current->journal_info);
	}

again:
	/* make sure we have enough space to handle the data first */
	spin_lock(&data_sinfo->lock);
	used = btrfs_space_info_used(data_sinfo, true);

	if (used + bytes > data_sinfo->total_bytes) {
		struct btrfs_trans_handle *trans;

		/*
		 * if we don't have enough free bytes in this space then we need
		 * to alloc a new chunk.
		 */
		if (!data_sinfo->full) {
			u64 alloc_target;

			/*
			 * do_chunk_alloc() raises its force argument to the
			 * space_info's force_alloc, so this forces the
			 * allocation even though NO_FORCE is passed below.
			 */
			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
			spin_unlock(&data_sinfo->lock);

			alloc_target = btrfs_data_alloc_profile(fs_info);
			/*
			 * It is ugly that we don't call nolock join
			 * transaction for the free space inode case here.
			 * But it is safe because we only do the data space
			 * reservation for the free space cache in the
			 * transaction context, the common join transaction
			 * just increase the counter of the current transaction
			 * handler, doesn't try to acquire the trans_lock of
			 * the fs.
			 */
			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = do_chunk_alloc(trans, fs_info, alloc_target,
					     CHUNK_ALLOC_NO_FORCE);
			btrfs_end_transaction(trans);
			if (ret < 0) {
				if (ret != -ENOSPC)
					return ret;
				else {
					/*
					 * No room for a new chunk.  A positive
					 * value makes the commit below be
					 * attempted regardless of the pinned
					 * bytes counter.
					 */
					have_pinned_space = 1;
					goto commit_trans;
				}
			}

			goto again;
		}

		/*
		 * If we don't have enough pinned space to deal with this
		 * allocation, and no removed chunk in current transaction,
		 * don't bother committing the transaction.
		 */
		have_pinned_space = percpu_counter_compare(
			&data_sinfo->total_bytes_pinned,
			used + bytes - data_sinfo->total_bytes);
		spin_unlock(&data_sinfo->lock);

		/* commit the current transaction and try again */
commit_trans:
		/* data_sinfo->lock is not held on any path reaching here */
		if (need_commit &&
		    !atomic_read(&fs_info->open_ioctl_trans)) {
			need_commit--;

			/* first attempt only: flush delalloc and wait for it */
			if (need_commit > 0) {
				btrfs_start_delalloc_roots(fs_info, 0, -1);
				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
							 (u64)-1);
			}

			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);
			if (have_pinned_space >= 0 ||
			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
				     &trans->transaction->flags) ||
			    need_commit > 0) {
				ret = btrfs_commit_transaction(trans);
				if (ret)
					return ret;
				/*
				 * The cleaner kthread might still be doing iput
				 * operations. Wait for it to finish so that
				 * more space is released.
				 */
				mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
				mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
				goto again;
			} else {
				btrfs_end_transaction(trans);
			}
		}

		trace_btrfs_space_reservation(fs_info,
					      "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
		return -ENOSPC;
	}
	/* enough room: account the reservation, still under data_sinfo->lock */
	data_sinfo->bytes_may_use += bytes;
	trace_btrfs_space_reservation(fs_info, "space_info",
				      data_sinfo->flags, bytes, 1);
	spin_unlock(&data_sinfo->lock);

	return ret;
}
4405 
4406 int btrfs_check_data_free_space(struct inode *inode,
4407 			struct extent_changeset **reserved, u64 start, u64 len)
4408 {
4409 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4410 	int ret;
4411 
4412 	/* align the range */
4413 	len = round_up(start + len, fs_info->sectorsize) -
4414 	      round_down(start, fs_info->sectorsize);
4415 	start = round_down(start, fs_info->sectorsize);
4416 
4417 	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4418 	if (ret < 0)
4419 		return ret;
4420 
4421 	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4422 	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4423 	if (ret < 0)
4424 		btrfs_free_reserved_data_space_noquota(inode, start, len);
4425 	else
4426 		ret = 0;
4427 	return ret;
4428 }
4429 
4430 /*
4431  * Called if we need to clear a data reservation for this inode
4432  * Normally in a error case.
4433  *
4434  * This one will *NOT* use accurate qgroup reserved space API, just for case
4435  * which we can't sleep and is sure it won't affect qgroup reserved space.
4436  * Like clear_bit_hook().
4437  */
4438 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4439 					    u64 len)
4440 {
4441 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4442 	struct btrfs_space_info *data_sinfo;
4443 
4444 	/* Make sure the range is aligned to sectorsize */
4445 	len = round_up(start + len, fs_info->sectorsize) -
4446 	      round_down(start, fs_info->sectorsize);
4447 	start = round_down(start, fs_info->sectorsize);
4448 
4449 	data_sinfo = fs_info->data_sinfo;
4450 	spin_lock(&data_sinfo->lock);
4451 	if (WARN_ON(data_sinfo->bytes_may_use < len))
4452 		data_sinfo->bytes_may_use = 0;
4453 	else
4454 		data_sinfo->bytes_may_use -= len;
4455 	trace_btrfs_space_reservation(fs_info, "space_info",
4456 				      data_sinfo->flags, len, 0);
4457 	spin_unlock(&data_sinfo->lock);
4458 }
4459 
4460 /*
4461  * Called if we need to clear a data reservation for this inode
4462  * Normally in a error case.
4463  *
4464  * This one will handle the per-inode data rsv map for accurate reserved
4465  * space framework.
4466  */
4467 void btrfs_free_reserved_data_space(struct inode *inode,
4468 			struct extent_changeset *reserved, u64 start, u64 len)
4469 {
4470 	struct btrfs_root *root = BTRFS_I(inode)->root;
4471 
4472 	/* Make sure the range is aligned to sectorsize */
4473 	len = round_up(start + len, root->fs_info->sectorsize) -
4474 	      round_down(start, root->fs_info->sectorsize);
4475 	start = round_down(start, root->fs_info->sectorsize);
4476 
4477 	btrfs_free_reserved_data_space_noquota(inode, start, len);
4478 	btrfs_qgroup_free_data(inode, reserved, start, len);
4479 }
4480 
4481 static void force_metadata_allocation(struct btrfs_fs_info *info)
4482 {
4483 	struct list_head *head = &info->space_info;
4484 	struct btrfs_space_info *found;
4485 
4486 	rcu_read_lock();
4487 	list_for_each_entry_rcu(found, head, list) {
4488 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4489 			found->force_alloc = CHUNK_ALLOC_FORCE;
4490 	}
4491 	rcu_read_unlock();
4492 }
4493 
4494 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4495 {
4496 	return (global->size << 1);
4497 }
4498 
4499 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4500 			      struct btrfs_space_info *sinfo, int force)
4501 {
4502 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4503 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
4504 	u64 thresh;
4505 
4506 	if (force == CHUNK_ALLOC_FORCE)
4507 		return 1;
4508 
4509 	/*
4510 	 * We need to take into account the global rsv because for all intents
4511 	 * and purposes it's used space.  Don't worry about locking the
4512 	 * global_rsv, it doesn't change except when the transaction commits.
4513 	 */
4514 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4515 		bytes_used += calc_global_rsv_need_space(global_rsv);
4516 
4517 	/*
4518 	 * in limited mode, we want to have some free space up to
4519 	 * about 1% of the FS size.
4520 	 */
4521 	if (force == CHUNK_ALLOC_LIMITED) {
4522 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
4523 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4524 
4525 		if (sinfo->total_bytes - bytes_used < thresh)
4526 			return 1;
4527 	}
4528 
4529 	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4530 		return 0;
4531 	return 1;
4532 }
4533 
4534 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4535 {
4536 	u64 num_dev;
4537 
4538 	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4539 		    BTRFS_BLOCK_GROUP_RAID0 |
4540 		    BTRFS_BLOCK_GROUP_RAID5 |
4541 		    BTRFS_BLOCK_GROUP_RAID6))
4542 		num_dev = fs_info->fs_devices->rw_devices;
4543 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
4544 		num_dev = 2;
4545 	else
4546 		num_dev = 1;	/* DUP or single */
4547 
4548 	return num_dev;
4549 }
4550 
4551 /*
4552  * If @is_allocation is true, reserve space in the system space info necessary
4553  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4554  * removing a chunk.
4555  */
4556 void check_system_chunk(struct btrfs_trans_handle *trans,
4557 			struct btrfs_fs_info *fs_info, u64 type)
4558 {
4559 	struct btrfs_space_info *info;
4560 	u64 left;
4561 	u64 thresh;
4562 	int ret = 0;
4563 	u64 num_devs;
4564 
4565 	/*
4566 	 * Needed because we can end up allocating a system chunk and for an
4567 	 * atomic and race free space reservation in the chunk block reserve.
4568 	 */
4569 	ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
4570 
4571 	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4572 	spin_lock(&info->lock);
4573 	left = info->total_bytes - btrfs_space_info_used(info, true);
4574 	spin_unlock(&info->lock);
4575 
4576 	num_devs = get_profile_num_devs(fs_info, type);
4577 
4578 	/* num_devs device items to update and 1 chunk item to add or remove */
4579 	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4580 		btrfs_calc_trans_metadata_size(fs_info, 1);
4581 
4582 	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4583 		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4584 			   left, thresh, type);
4585 		dump_space_info(fs_info, info, 0, 0);
4586 	}
4587 
4588 	if (left < thresh) {
4589 		u64 flags = btrfs_system_alloc_profile(fs_info);
4590 
4591 		/*
4592 		 * Ignore failure to create system chunk. We might end up not
4593 		 * needing it, as we might not need to COW all nodes/leafs from
4594 		 * the paths we visit in the chunk tree (they were already COWed
4595 		 * or created in the current transaction for example).
4596 		 */
4597 		ret = btrfs_alloc_chunk(trans, fs_info, flags);
4598 	}
4599 
4600 	if (!ret) {
4601 		ret = btrfs_block_rsv_add(fs_info->chunk_root,
4602 					  &fs_info->chunk_block_rsv,
4603 					  thresh, BTRFS_RESERVE_NO_FLUSH);
4604 		if (!ret)
4605 			trans->chunk_bytes_reserved += thresh;
4606 	}
4607 }
4608 
/*
 * Allocate a new chunk for the space_info matching @flags, if one is needed.
 *
 * @force is one of the CHUNK_ALLOC_* values; it is raised to the
 * space_info's force_alloc when that one is higher.
 *
 * If force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 *
 * -ENOSPC is also returned, without trying anything, when @trans is already
 * in the middle of a chunk allocation.
 */
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info, u64 flags, int force)
{
	struct btrfs_space_info *space_info;
	int wait_for_alloc = 0;
	int ret = 0;

	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;

	space_info = __find_space_info(fs_info, flags);
	if (!space_info) {
		ret = create_space_info(fs_info, flags, &space_info);
		if (ret)
			return ret;
	}

again:
	spin_lock(&space_info->lock);
	if (force < space_info->force_alloc)
		force = space_info->force_alloc;
	/*
	 * Nothing can be allocated for a full space_info: report -ENOSPC if
	 * a chunk would have been wanted, 0 otherwise.
	 */
	if (space_info->full) {
		if (should_alloc_chunk(fs_info, space_info, force))
			ret = -ENOSPC;
		else
			ret = 0;
		spin_unlock(&space_info->lock);
		return ret;
	}

	if (!should_alloc_chunk(fs_info, space_info, force)) {
		spin_unlock(&space_info->lock);
		return 0;
	} else if (space_info->chunk_alloc) {
		/* somebody else is allocating for this space_info */
		wait_for_alloc = 1;
	} else {
		/* we are the allocator; cleared again at the "out" label */
		space_info->chunk_alloc = 1;
	}

	spin_unlock(&space_info->lock);

	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * The chunk_mutex is held throughout the entirety of a chunk
	 * allocation, so once we've acquired the chunk_mutex we know that the
	 * other guy is done and we need to recheck and see if we should
	 * allocate.
	 */
	if (wait_for_alloc) {
		mutex_unlock(&fs_info->chunk_mutex);
		wait_for_alloc = 0;
		goto again;
	}

	/* guards against re-entering this function, see the check on entry */
	trans->allocating_chunk = true;

	/*
	 * If we have mixed data/metadata chunks we want to make sure we keep
	 * allocating mixed chunks instead of individual chunks.
	 */
	if (btrfs_mixed_space_info(space_info))
		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

	/*
	 * if we're doing a data chunk, go ahead and make sure that
	 * we keep a reasonable number of metadata chunks allocated in the
	 * FS as well.
	 */
	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
		fs_info->data_chunk_allocations++;
		if (!(fs_info->data_chunk_allocations %
		      fs_info->metadata_ratio))
			force_metadata_allocation(fs_info);
	}

	/*
	 * Check if we have enough space in SYSTEM chunk because we may need
	 * to update devices.
	 */
	check_system_chunk(trans, fs_info, flags);

	ret = btrfs_alloc_chunk(trans, fs_info, flags);
	trans->allocating_chunk = false;

	spin_lock(&space_info->lock);
	/*
	 * Errors other than -ENOSPC leave ->full and ->force_alloc untouched
	 * and are returned as they are.
	 */
	if (ret < 0 && ret != -ENOSPC)
		goto out;
	/* -ENOSPC: mark the space_info full; success: report 1 */
	if (ret)
		space_info->full = 1;
	else
		ret = 1;

	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
	space_info->chunk_alloc = 0;
	spin_unlock(&space_info->lock);
	mutex_unlock(&fs_info->chunk_mutex);
	/*
	 * When we allocate a new chunk we reserve space in the chunk block
	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
	 * add new nodes/leafs to it if we end up needing to do it when
	 * inserting the chunk item and updating device items as part of the
	 * second phase of chunk allocation, performed by
	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
	 * large number of new block groups to create in our transaction
	 * handle's new_bgs list to avoid exhausting the chunk block reserve
	 * in extreme cases - like having a single transaction create many new
	 * block groups when starting to write out the free space caches of all
	 * the block groups that were made dirty during the lifetime of the
	 * transaction.
	 */
	if (trans->can_flush_pending_bgs &&
	    trans->chunk_bytes_reserved >= (u64)SZ_2M) {
		btrfs_create_pending_block_groups(trans, fs_info);
		btrfs_trans_release_chunk_metadata(trans);
	}
	return ret;
}
4738 
4739 static int can_overcommit(struct btrfs_fs_info *fs_info,
4740 			  struct btrfs_space_info *space_info, u64 bytes,
4741 			  enum btrfs_reserve_flush_enum flush,
4742 			  bool system_chunk)
4743 {
4744 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4745 	u64 profile;
4746 	u64 space_size;
4747 	u64 avail;
4748 	u64 used;
4749 
4750 	/* Don't overcommit when in mixed mode. */
4751 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4752 		return 0;
4753 
4754 	if (system_chunk)
4755 		profile = btrfs_system_alloc_profile(fs_info);
4756 	else
4757 		profile = btrfs_metadata_alloc_profile(fs_info);
4758 
4759 	used = btrfs_space_info_used(space_info, false);
4760 
4761 	/*
4762 	 * We only want to allow over committing if we have lots of actual space
4763 	 * free, but if we don't have enough space to handle the global reserve
4764 	 * space then we could end up having a real enospc problem when trying
4765 	 * to allocate a chunk or some other such important allocation.
4766 	 */
4767 	spin_lock(&global_rsv->lock);
4768 	space_size = calc_global_rsv_need_space(global_rsv);
4769 	spin_unlock(&global_rsv->lock);
4770 	if (used + space_size >= space_info->total_bytes)
4771 		return 0;
4772 
4773 	used += space_info->bytes_may_use;
4774 
4775 	avail = atomic64_read(&fs_info->free_chunk_space);
4776 
4777 	/*
4778 	 * If we have dup, raid1 or raid10 then only half of the free
4779 	 * space is actually useable.  For raid56, the space info used
4780 	 * doesn't include the parity drive, so we don't have to
4781 	 * change the math
4782 	 */
4783 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
4784 		       BTRFS_BLOCK_GROUP_RAID1 |
4785 		       BTRFS_BLOCK_GROUP_RAID10))
4786 		avail >>= 1;
4787 
4788 	/*
4789 	 * If we aren't flushing all things, let us overcommit up to
4790 	 * 1/2th of the space. If we can flush, don't let us overcommit
4791 	 * too much, let it overcommit up to 1/8 of the space.
4792 	 */
4793 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
4794 		avail >>= 3;
4795 	else
4796 		avail >>= 1;
4797 
4798 	if (used + bytes < space_info->total_bytes + avail)
4799 		return 1;
4800 	return 0;
4801 }
4802 
4803 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4804 					 unsigned long nr_pages, int nr_items)
4805 {
4806 	struct super_block *sb = fs_info->sb;
4807 
4808 	if (down_read_trylock(&sb->s_umount)) {
4809 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4810 		up_read(&sb->s_umount);
4811 	} else {
4812 		/*
4813 		 * We needn't worry the filesystem going from r/w to r/o though
4814 		 * we don't acquire ->s_umount mutex, because the filesystem
4815 		 * should guarantee the delalloc inodes list be empty after
4816 		 * the filesystem is readonly(all dirty pages are written to
4817 		 * the disk).
4818 		 */
4819 		btrfs_start_delalloc_roots(fs_info, 0, nr_items);
4820 		if (!current->journal_info)
4821 			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4822 	}
4823 }
4824 
4825 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4826 					u64 to_reclaim)
4827 {
4828 	u64 bytes;
4829 	u64 nr;
4830 
4831 	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4832 	nr = div64_u64(to_reclaim, bytes);
4833 	if (!nr)
4834 		nr = 1;
4835 	return nr;
4836 }
4837 
4838 #define EXTENT_SIZE_PER_ITEM	SZ_256K
4839 
4840 /*
4841  * shrink metadata reservation for delalloc
4842  */
4843 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4844 			    u64 orig, bool wait_ordered)
4845 {
4846 	struct btrfs_block_rsv *block_rsv;
4847 	struct btrfs_space_info *space_info;
4848 	struct btrfs_trans_handle *trans;
4849 	u64 delalloc_bytes;
4850 	u64 max_reclaim;
4851 	u64 items;
4852 	long time_left;
4853 	unsigned long nr_pages;
4854 	int loops;
4855 	enum btrfs_reserve_flush_enum flush;
4856 
4857 	/* Calc the number of the pages we need flush for space reservation */
4858 	items = calc_reclaim_items_nr(fs_info, to_reclaim);
4859 	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4860 
4861 	trans = (struct btrfs_trans_handle *)current->journal_info;
4862 	block_rsv = &fs_info->delalloc_block_rsv;
4863 	space_info = block_rsv->space_info;
4864 
4865 	delalloc_bytes = percpu_counter_sum_positive(
4866 						&fs_info->delalloc_bytes);
4867 	if (delalloc_bytes == 0) {
4868 		if (trans)
4869 			return;
4870 		if (wait_ordered)
4871 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4872 		return;
4873 	}
4874 
4875 	loops = 0;
4876 	while (delalloc_bytes && loops < 3) {
4877 		max_reclaim = min(delalloc_bytes, to_reclaim);
4878 		nr_pages = max_reclaim >> PAGE_SHIFT;
4879 		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4880 		/*
4881 		 * We need to wait for the async pages to actually start before
4882 		 * we do anything.
4883 		 */
4884 		max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4885 		if (!max_reclaim)
4886 			goto skip_async;
4887 
4888 		if (max_reclaim <= nr_pages)
4889 			max_reclaim = 0;
4890 		else
4891 			max_reclaim -= nr_pages;
4892 
4893 		wait_event(fs_info->async_submit_wait,
4894 			   atomic_read(&fs_info->async_delalloc_pages) <=
4895 			   (int)max_reclaim);
4896 skip_async:
4897 		if (!trans)
4898 			flush = BTRFS_RESERVE_FLUSH_ALL;
4899 		else
4900 			flush = BTRFS_RESERVE_NO_FLUSH;
4901 		spin_lock(&space_info->lock);
4902 		if (list_empty(&space_info->tickets) &&
4903 		    list_empty(&space_info->priority_tickets)) {
4904 			spin_unlock(&space_info->lock);
4905 			break;
4906 		}
4907 		spin_unlock(&space_info->lock);
4908 
4909 		loops++;
4910 		if (wait_ordered && !trans) {
4911 			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4912 		} else {
4913 			time_left = schedule_timeout_killable(1);
4914 			if (time_left)
4915 				break;
4916 		}
4917 		delalloc_bytes = percpu_counter_sum_positive(
4918 						&fs_info->delalloc_bytes);
4919 	}
4920 }
4921 
4922 /**
4923  * maybe_commit_transaction - possibly commit the transaction if its ok to
4924  * @root - the root we're allocating for
4925  * @bytes - the number of bytes we want to reserve
4926  * @force - force the commit
4927  *
4928  * This will check to make sure that committing the transaction will actually
4929  * get us somewhere and then commit the transaction if it does.  Otherwise it
4930  * will return -ENOSPC.
4931  */
4932 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4933 				  struct btrfs_space_info *space_info,
4934 				  u64 bytes, int force)
4935 {
4936 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4937 	struct btrfs_trans_handle *trans;
4938 
4939 	trans = (struct btrfs_trans_handle *)current->journal_info;
4940 	if (trans)
4941 		return -EAGAIN;
4942 
4943 	if (force)
4944 		goto commit;
4945 
4946 	/* See if there is enough pinned space to make this reservation */
4947 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4948 				   bytes) >= 0)
4949 		goto commit;
4950 
4951 	/*
4952 	 * See if there is some space in the delayed insertion reservation for
4953 	 * this reservation.
4954 	 */
4955 	if (space_info != delayed_rsv->space_info)
4956 		return -ENOSPC;
4957 
4958 	spin_lock(&delayed_rsv->lock);
4959 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4960 				   bytes - delayed_rsv->size) < 0) {
4961 		spin_unlock(&delayed_rsv->lock);
4962 		return -ENOSPC;
4963 	}
4964 	spin_unlock(&delayed_rsv->lock);
4965 
4966 commit:
4967 	trans = btrfs_join_transaction(fs_info->extent_root);
4968 	if (IS_ERR(trans))
4969 		return -ENOSPC;
4970 
4971 	return btrfs_commit_transaction(trans);
4972 }
4973 
/*
 * A pending metadata reservation that could not be satisfied immediately.
 * Tickets are queued on space_info->tickets or space_info->priority_tickets.
 */
struct reserve_ticket {
	/* Bytes still missing; 0 once the reservation has been satisfied. */
	u64 bytes;
	/* Set to -ENOSPC by wake_all_tickets() when flushing gives up. */
	int error;
	/* Link in one of the space_info ticket lists. */
	struct list_head list;
	/* Woken when the ticket is satisfied or failed. */
	wait_queue_head_t wait;
};
4980 
4981 /*
4982  * Try to flush some data based on policy set by @state. This is only advisory
4983  * and may fail for various reasons. The caller is supposed to examine the
4984  * state of @space_info to detect the outcome.
4985  */
4986 static void flush_space(struct btrfs_fs_info *fs_info,
4987 		       struct btrfs_space_info *space_info, u64 num_bytes,
4988 		       int state)
4989 {
4990 	struct btrfs_root *root = fs_info->extent_root;
4991 	struct btrfs_trans_handle *trans;
4992 	int nr;
4993 	int ret = 0;
4994 
4995 	switch (state) {
4996 	case FLUSH_DELAYED_ITEMS_NR:
4997 	case FLUSH_DELAYED_ITEMS:
4998 		if (state == FLUSH_DELAYED_ITEMS_NR)
4999 			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
5000 		else
5001 			nr = -1;
5002 
5003 		trans = btrfs_join_transaction(root);
5004 		if (IS_ERR(trans)) {
5005 			ret = PTR_ERR(trans);
5006 			break;
5007 		}
5008 		ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
5009 		btrfs_end_transaction(trans);
5010 		break;
5011 	case FLUSH_DELALLOC:
5012 	case FLUSH_DELALLOC_WAIT:
5013 		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
5014 				state == FLUSH_DELALLOC_WAIT);
5015 		break;
5016 	case ALLOC_CHUNK:
5017 		trans = btrfs_join_transaction(root);
5018 		if (IS_ERR(trans)) {
5019 			ret = PTR_ERR(trans);
5020 			break;
5021 		}
5022 		ret = do_chunk_alloc(trans, fs_info,
5023 				     btrfs_metadata_alloc_profile(fs_info),
5024 				     CHUNK_ALLOC_NO_FORCE);
5025 		btrfs_end_transaction(trans);
5026 		if (ret > 0 || ret == -ENOSPC)
5027 			ret = 0;
5028 		break;
5029 	case COMMIT_TRANS:
5030 		ret = may_commit_transaction(fs_info, space_info,
5031 					     num_bytes, 0);
5032 		break;
5033 	default:
5034 		ret = -ENOSPC;
5035 		break;
5036 	}
5037 
5038 	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
5039 				ret);
5040 	return;
5041 }
5042 
5043 static inline u64
5044 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
5045 				 struct btrfs_space_info *space_info,
5046 				 bool system_chunk)
5047 {
5048 	struct reserve_ticket *ticket;
5049 	u64 used;
5050 	u64 expected;
5051 	u64 to_reclaim = 0;
5052 
5053 	list_for_each_entry(ticket, &space_info->tickets, list)
5054 		to_reclaim += ticket->bytes;
5055 	list_for_each_entry(ticket, &space_info->priority_tickets, list)
5056 		to_reclaim += ticket->bytes;
5057 	if (to_reclaim)
5058 		return to_reclaim;
5059 
5060 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
5061 	if (can_overcommit(fs_info, space_info, to_reclaim,
5062 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5063 		return 0;
5064 
5065 	used = btrfs_space_info_used(space_info, true);
5066 
5067 	if (can_overcommit(fs_info, space_info, SZ_1M,
5068 			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5069 		expected = div_factor_fine(space_info->total_bytes, 95);
5070 	else
5071 		expected = div_factor_fine(space_info->total_bytes, 90);
5072 
5073 	if (used > expected)
5074 		to_reclaim = used - expected;
5075 	else
5076 		to_reclaim = 0;
5077 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5078 				     space_info->bytes_reserved);
5079 	return to_reclaim;
5080 }
5081 
5082 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5083 					struct btrfs_space_info *space_info,
5084 					u64 used, bool system_chunk)
5085 {
5086 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5087 
5088 	/* If we're just plain full then async reclaim just slows us down. */
5089 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5090 		return 0;
5091 
5092 	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5093 					      system_chunk))
5094 		return 0;
5095 
5096 	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5097 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5098 }
5099 
5100 static void wake_all_tickets(struct list_head *head)
5101 {
5102 	struct reserve_ticket *ticket;
5103 
5104 	while (!list_empty(head)) {
5105 		ticket = list_first_entry(head, struct reserve_ticket, list);
5106 		list_del_init(&ticket->list);
5107 		ticket->error = -ENOSPC;
5108 		wake_up(&ticket->wait);
5109 	}
5110 }
5111 
5112 /*
5113  * This is for normal flushers, we can wait all goddamned day if we want to.  We
5114  * will loop and continuously try to flush as long as we are making progress.
5115  * We count progress as clearing off tickets each time we have to loop.
5116  */
5117 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5118 {
5119 	struct btrfs_fs_info *fs_info;
5120 	struct btrfs_space_info *space_info;
5121 	u64 to_reclaim;
5122 	int flush_state;
5123 	int commit_cycles = 0;
5124 	u64 last_tickets_id;
5125 
5126 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5127 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5128 
5129 	spin_lock(&space_info->lock);
5130 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5131 						      false);
5132 	if (!to_reclaim) {
5133 		space_info->flush = 0;
5134 		spin_unlock(&space_info->lock);
5135 		return;
5136 	}
5137 	last_tickets_id = space_info->tickets_id;
5138 	spin_unlock(&space_info->lock);
5139 
5140 	flush_state = FLUSH_DELAYED_ITEMS_NR;
5141 	do {
5142 		flush_space(fs_info, space_info, to_reclaim, flush_state);
5143 		spin_lock(&space_info->lock);
5144 		if (list_empty(&space_info->tickets)) {
5145 			space_info->flush = 0;
5146 			spin_unlock(&space_info->lock);
5147 			return;
5148 		}
5149 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5150 							      space_info,
5151 							      false);
5152 		if (last_tickets_id == space_info->tickets_id) {
5153 			flush_state++;
5154 		} else {
5155 			last_tickets_id = space_info->tickets_id;
5156 			flush_state = FLUSH_DELAYED_ITEMS_NR;
5157 			if (commit_cycles)
5158 				commit_cycles--;
5159 		}
5160 
5161 		if (flush_state > COMMIT_TRANS) {
5162 			commit_cycles++;
5163 			if (commit_cycles > 2) {
5164 				wake_all_tickets(&space_info->tickets);
5165 				space_info->flush = 0;
5166 			} else {
5167 				flush_state = FLUSH_DELAYED_ITEMS_NR;
5168 			}
5169 		}
5170 		spin_unlock(&space_info->lock);
5171 	} while (flush_state <= COMMIT_TRANS);
5172 }
5173 
/*
 * Bind @work to btrfs_async_reclaim_metadata_space() so that it can later be
 * queued as the async metadata reclaim worker.
 */
void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
5178 
5179 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5180 					    struct btrfs_space_info *space_info,
5181 					    struct reserve_ticket *ticket)
5182 {
5183 	u64 to_reclaim;
5184 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
5185 
5186 	spin_lock(&space_info->lock);
5187 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5188 						      false);
5189 	if (!to_reclaim) {
5190 		spin_unlock(&space_info->lock);
5191 		return;
5192 	}
5193 	spin_unlock(&space_info->lock);
5194 
5195 	do {
5196 		flush_space(fs_info, space_info, to_reclaim, flush_state);
5197 		flush_state++;
5198 		spin_lock(&space_info->lock);
5199 		if (ticket->bytes == 0) {
5200 			spin_unlock(&space_info->lock);
5201 			return;
5202 		}
5203 		spin_unlock(&space_info->lock);
5204 
5205 		/*
5206 		 * Priority flushers can't wait on delalloc without
5207 		 * deadlocking.
5208 		 */
5209 		if (flush_state == FLUSH_DELALLOC ||
5210 		    flush_state == FLUSH_DELALLOC_WAIT)
5211 			flush_state = ALLOC_CHUNK;
5212 	} while (flush_state < COMMIT_TRANS);
5213 }
5214 
5215 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5216 			       struct btrfs_space_info *space_info,
5217 			       struct reserve_ticket *ticket, u64 orig_bytes)
5218 
5219 {
5220 	DEFINE_WAIT(wait);
5221 	int ret = 0;
5222 
5223 	spin_lock(&space_info->lock);
5224 	while (ticket->bytes > 0 && ticket->error == 0) {
5225 		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5226 		if (ret) {
5227 			ret = -EINTR;
5228 			break;
5229 		}
5230 		spin_unlock(&space_info->lock);
5231 
5232 		schedule();
5233 
5234 		finish_wait(&ticket->wait, &wait);
5235 		spin_lock(&space_info->lock);
5236 	}
5237 	if (!ret)
5238 		ret = ticket->error;
5239 	if (!list_empty(&ticket->list))
5240 		list_del_init(&ticket->list);
5241 	if (ticket->bytes && ticket->bytes < orig_bytes) {
5242 		u64 num_bytes = orig_bytes - ticket->bytes;
5243 		space_info->bytes_may_use -= num_bytes;
5244 		trace_btrfs_space_reservation(fs_info, "space_info",
5245 					      space_info->flags, num_bytes, 0);
5246 	}
5247 	spin_unlock(&space_info->lock);
5248 
5249 	return ret;
5250 }
5251 
5252 /**
5253  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5254  * @root - the root we're allocating for
5255  * @space_info - the space info we want to allocate from
5256  * @orig_bytes - the number of bytes we want
5257  * @flush - whether or not we can flush to make our reservation
5258  *
5259  * This will reserve orig_bytes number of bytes from the space info associated
5260  * with the block_rsv.  If there is not enough space it will make an attempt to
5261  * flush out space to make room.  It will do this by flushing delalloc if
5262  * possible or committing the transaction.  If flush is 0 then no attempts to
5263  * regain reservations will be made and this will fail if there is not enough
5264  * space already.
5265  */
5266 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5267 				    struct btrfs_space_info *space_info,
5268 				    u64 orig_bytes,
5269 				    enum btrfs_reserve_flush_enum flush,
5270 				    bool system_chunk)
5271 {
5272 	struct reserve_ticket ticket;
5273 	u64 used;
5274 	int ret = 0;
5275 
5276 	ASSERT(orig_bytes);
5277 	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5278 
5279 	spin_lock(&space_info->lock);
5280 	ret = -ENOSPC;
5281 	used = btrfs_space_info_used(space_info, true);
5282 
5283 	/*
5284 	 * If we have enough space then hooray, make our reservation and carry
5285 	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
5286 	 * If not things get more complicated.
5287 	 */
5288 	if (used + orig_bytes <= space_info->total_bytes) {
5289 		space_info->bytes_may_use += orig_bytes;
5290 		trace_btrfs_space_reservation(fs_info, "space_info",
5291 					      space_info->flags, orig_bytes, 1);
5292 		ret = 0;
5293 	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5294 				  system_chunk)) {
5295 		space_info->bytes_may_use += orig_bytes;
5296 		trace_btrfs_space_reservation(fs_info, "space_info",
5297 					      space_info->flags, orig_bytes, 1);
5298 		ret = 0;
5299 	}
5300 
5301 	/*
5302 	 * If we couldn't make a reservation then setup our reservation ticket
5303 	 * and kick the async worker if it's not already running.
5304 	 *
5305 	 * If we are a priority flusher then we just need to add our ticket to
5306 	 * the list and we will do our own flushing further down.
5307 	 */
5308 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5309 		ticket.bytes = orig_bytes;
5310 		ticket.error = 0;
5311 		init_waitqueue_head(&ticket.wait);
5312 		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5313 			list_add_tail(&ticket.list, &space_info->tickets);
5314 			if (!space_info->flush) {
5315 				space_info->flush = 1;
5316 				trace_btrfs_trigger_flush(fs_info,
5317 							  space_info->flags,
5318 							  orig_bytes, flush,
5319 							  "enospc");
5320 				queue_work(system_unbound_wq,
5321 					   &fs_info->async_reclaim_work);
5322 			}
5323 		} else {
5324 			list_add_tail(&ticket.list,
5325 				      &space_info->priority_tickets);
5326 		}
5327 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5328 		used += orig_bytes;
5329 		/*
5330 		 * We will do the space reservation dance during log replay,
5331 		 * which means we won't have fs_info->fs_root set, so don't do
5332 		 * the async reclaim as we will panic.
5333 		 */
5334 		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5335 		    need_do_async_reclaim(fs_info, space_info,
5336 					  used, system_chunk) &&
5337 		    !work_busy(&fs_info->async_reclaim_work)) {
5338 			trace_btrfs_trigger_flush(fs_info, space_info->flags,
5339 						  orig_bytes, flush, "preempt");
5340 			queue_work(system_unbound_wq,
5341 				   &fs_info->async_reclaim_work);
5342 		}
5343 	}
5344 	spin_unlock(&space_info->lock);
5345 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5346 		return ret;
5347 
5348 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
5349 		return wait_reserve_ticket(fs_info, space_info, &ticket,
5350 					   orig_bytes);
5351 
5352 	ret = 0;
5353 	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5354 	spin_lock(&space_info->lock);
5355 	if (ticket.bytes) {
5356 		if (ticket.bytes < orig_bytes) {
5357 			u64 num_bytes = orig_bytes - ticket.bytes;
5358 			space_info->bytes_may_use -= num_bytes;
5359 			trace_btrfs_space_reservation(fs_info, "space_info",
5360 						      space_info->flags,
5361 						      num_bytes, 0);
5362 
5363 		}
5364 		list_del_init(&ticket.list);
5365 		ret = -ENOSPC;
5366 	}
5367 	spin_unlock(&space_info->lock);
5368 	ASSERT(list_empty(&ticket.list));
5369 	return ret;
5370 }
5371 
5372 /**
5373  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5374  * @root - the root we're allocating for
5375  * @block_rsv - the block_rsv we're allocating for
5376  * @orig_bytes - the number of bytes we want
5377  * @flush - whether or not we can flush to make our reservation
5378  *
5379  * This will reserve orgi_bytes number of bytes from the space info associated
5380  * with the block_rsv.  If there is not enough space it will make an attempt to
5381  * flush out space to make room.  It will do this by flushing delalloc if
5382  * possible or committing the transaction.  If flush is 0 then no attempts to
5383  * regain reservations will be made and this will fail if there is not enough
5384  * space already.
5385  */
5386 static int reserve_metadata_bytes(struct btrfs_root *root,
5387 				  struct btrfs_block_rsv *block_rsv,
5388 				  u64 orig_bytes,
5389 				  enum btrfs_reserve_flush_enum flush)
5390 {
5391 	struct btrfs_fs_info *fs_info = root->fs_info;
5392 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5393 	int ret;
5394 	bool system_chunk = (root == fs_info->chunk_root);
5395 
5396 	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5397 				       orig_bytes, flush, system_chunk);
5398 	if (ret == -ENOSPC &&
5399 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5400 		if (block_rsv != global_rsv &&
5401 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
5402 			ret = 0;
5403 	}
5404 	if (ret == -ENOSPC)
5405 		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5406 					      block_rsv->space_info->flags,
5407 					      orig_bytes, 1);
5408 	return ret;
5409 }
5410 
5411 static struct btrfs_block_rsv *get_block_rsv(
5412 					const struct btrfs_trans_handle *trans,
5413 					const struct btrfs_root *root)
5414 {
5415 	struct btrfs_fs_info *fs_info = root->fs_info;
5416 	struct btrfs_block_rsv *block_rsv = NULL;
5417 
5418 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5419 	    (root == fs_info->csum_root && trans->adding_csums) ||
5420 	    (root == fs_info->uuid_root))
5421 		block_rsv = trans->block_rsv;
5422 
5423 	if (!block_rsv)
5424 		block_rsv = root->block_rsv;
5425 
5426 	if (!block_rsv)
5427 		block_rsv = &fs_info->empty_block_rsv;
5428 
5429 	return block_rsv;
5430 }
5431 
5432 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5433 			       u64 num_bytes)
5434 {
5435 	int ret = -ENOSPC;
5436 	spin_lock(&block_rsv->lock);
5437 	if (block_rsv->reserved >= num_bytes) {
5438 		block_rsv->reserved -= num_bytes;
5439 		if (block_rsv->reserved < block_rsv->size)
5440 			block_rsv->full = 0;
5441 		ret = 0;
5442 	}
5443 	spin_unlock(&block_rsv->lock);
5444 	return ret;
5445 }
5446 
5447 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5448 				u64 num_bytes, int update_size)
5449 {
5450 	spin_lock(&block_rsv->lock);
5451 	block_rsv->reserved += num_bytes;
5452 	if (update_size)
5453 		block_rsv->size += num_bytes;
5454 	else if (block_rsv->reserved >= block_rsv->size)
5455 		block_rsv->full = 1;
5456 	spin_unlock(&block_rsv->lock);
5457 }
5458 
5459 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5460 			     struct btrfs_block_rsv *dest, u64 num_bytes,
5461 			     int min_factor)
5462 {
5463 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5464 	u64 min_bytes;
5465 
5466 	if (global_rsv->space_info != dest->space_info)
5467 		return -ENOSPC;
5468 
5469 	spin_lock(&global_rsv->lock);
5470 	min_bytes = div_factor(global_rsv->size, min_factor);
5471 	if (global_rsv->reserved < min_bytes + num_bytes) {
5472 		spin_unlock(&global_rsv->lock);
5473 		return -ENOSPC;
5474 	}
5475 	global_rsv->reserved -= num_bytes;
5476 	if (global_rsv->reserved < global_rsv->size)
5477 		global_rsv->full = 0;
5478 	spin_unlock(&global_rsv->lock);
5479 
5480 	block_rsv_add_bytes(dest, num_bytes, 1);
5481 	return 0;
5482 }
5483 
5484 /*
5485  * This is for space we already have accounted in space_info->bytes_may_use, so
5486  * basically when we're returning space from block_rsv's.
5487  */
5488 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5489 				     struct btrfs_space_info *space_info,
5490 				     u64 num_bytes)
5491 {
5492 	struct reserve_ticket *ticket;
5493 	struct list_head *head;
5494 	u64 used;
5495 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5496 	bool check_overcommit = false;
5497 
5498 	spin_lock(&space_info->lock);
5499 	head = &space_info->priority_tickets;
5500 
5501 	/*
5502 	 * If we are over our limit then we need to check and see if we can
5503 	 * overcommit, and if we can't then we just need to free up our space
5504 	 * and not satisfy any requests.
5505 	 */
5506 	used = btrfs_space_info_used(space_info, true);
5507 	if (used - num_bytes >= space_info->total_bytes)
5508 		check_overcommit = true;
5509 again:
5510 	while (!list_empty(head) && num_bytes) {
5511 		ticket = list_first_entry(head, struct reserve_ticket,
5512 					  list);
5513 		/*
5514 		 * We use 0 bytes because this space is already reserved, so
5515 		 * adding the ticket space would be a double count.
5516 		 */
5517 		if (check_overcommit &&
5518 		    !can_overcommit(fs_info, space_info, 0, flush, false))
5519 			break;
5520 		if (num_bytes >= ticket->bytes) {
5521 			list_del_init(&ticket->list);
5522 			num_bytes -= ticket->bytes;
5523 			ticket->bytes = 0;
5524 			space_info->tickets_id++;
5525 			wake_up(&ticket->wait);
5526 		} else {
5527 			ticket->bytes -= num_bytes;
5528 			num_bytes = 0;
5529 		}
5530 	}
5531 
5532 	if (num_bytes && head == &space_info->priority_tickets) {
5533 		head = &space_info->tickets;
5534 		flush = BTRFS_RESERVE_FLUSH_ALL;
5535 		goto again;
5536 	}
5537 	space_info->bytes_may_use -= num_bytes;
5538 	trace_btrfs_space_reservation(fs_info, "space_info",
5539 				      space_info->flags, num_bytes, 0);
5540 	spin_unlock(&space_info->lock);
5541 }
5542 
5543 /*
5544  * This is for newly allocated space that isn't accounted in
5545  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5546  * we use this helper.
5547  */
5548 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5549 				     struct btrfs_space_info *space_info,
5550 				     u64 num_bytes)
5551 {
5552 	struct reserve_ticket *ticket;
5553 	struct list_head *head = &space_info->priority_tickets;
5554 
5555 again:
5556 	while (!list_empty(head) && num_bytes) {
5557 		ticket = list_first_entry(head, struct reserve_ticket,
5558 					  list);
5559 		if (num_bytes >= ticket->bytes) {
5560 			trace_btrfs_space_reservation(fs_info, "space_info",
5561 						      space_info->flags,
5562 						      ticket->bytes, 1);
5563 			list_del_init(&ticket->list);
5564 			num_bytes -= ticket->bytes;
5565 			space_info->bytes_may_use += ticket->bytes;
5566 			ticket->bytes = 0;
5567 			space_info->tickets_id++;
5568 			wake_up(&ticket->wait);
5569 		} else {
5570 			trace_btrfs_space_reservation(fs_info, "space_info",
5571 						      space_info->flags,
5572 						      num_bytes, 1);
5573 			space_info->bytes_may_use += num_bytes;
5574 			ticket->bytes -= num_bytes;
5575 			num_bytes = 0;
5576 		}
5577 	}
5578 
5579 	if (num_bytes && head == &space_info->priority_tickets) {
5580 		head = &space_info->tickets;
5581 		goto again;
5582 	}
5583 }
5584 
5585 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5586 				    struct btrfs_block_rsv *block_rsv,
5587 				    struct btrfs_block_rsv *dest, u64 num_bytes)
5588 {
5589 	struct btrfs_space_info *space_info = block_rsv->space_info;
5590 
5591 	spin_lock(&block_rsv->lock);
5592 	if (num_bytes == (u64)-1)
5593 		num_bytes = block_rsv->size;
5594 	block_rsv->size -= num_bytes;
5595 	if (block_rsv->reserved >= block_rsv->size) {
5596 		num_bytes = block_rsv->reserved - block_rsv->size;
5597 		block_rsv->reserved = block_rsv->size;
5598 		block_rsv->full = 1;
5599 	} else {
5600 		num_bytes = 0;
5601 	}
5602 	spin_unlock(&block_rsv->lock);
5603 
5604 	if (num_bytes > 0) {
5605 		if (dest) {
5606 			spin_lock(&dest->lock);
5607 			if (!dest->full) {
5608 				u64 bytes_to_add;
5609 
5610 				bytes_to_add = dest->size - dest->reserved;
5611 				bytes_to_add = min(num_bytes, bytes_to_add);
5612 				dest->reserved += bytes_to_add;
5613 				if (dest->reserved >= dest->size)
5614 					dest->full = 1;
5615 				num_bytes -= bytes_to_add;
5616 			}
5617 			spin_unlock(&dest->lock);
5618 		}
5619 		if (num_bytes)
5620 			space_info_add_old_bytes(fs_info, space_info,
5621 						 num_bytes);
5622 	}
5623 }
5624 
5625 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5626 			    struct btrfs_block_rsv *dst, u64 num_bytes,
5627 			    int update_size)
5628 {
5629 	int ret;
5630 
5631 	ret = block_rsv_use_bytes(src, num_bytes);
5632 	if (ret)
5633 		return ret;
5634 
5635 	block_rsv_add_bytes(dst, num_bytes, update_size);
5636 	return 0;
5637 }
5638 
5639 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5640 {
5641 	memset(rsv, 0, sizeof(*rsv));
5642 	spin_lock_init(&rsv->lock);
5643 	rsv->type = type;
5644 }
5645 
5646 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5647 					      unsigned short type)
5648 {
5649 	struct btrfs_block_rsv *block_rsv;
5650 
5651 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5652 	if (!block_rsv)
5653 		return NULL;
5654 
5655 	btrfs_init_block_rsv(block_rsv, type);
5656 	block_rsv->space_info = __find_space_info(fs_info,
5657 						  BTRFS_BLOCK_GROUP_METADATA);
5658 	return block_rsv;
5659 }
5660 
5661 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5662 			  struct btrfs_block_rsv *rsv)
5663 {
5664 	if (!rsv)
5665 		return;
5666 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5667 	kfree(rsv);
5668 }
5669 
/*
 * Free @rsv without releasing any bytes it may still hold (contrast with
 * btrfs_free_block_rsv(), which releases the reservation first).
 */
void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
{
	kfree(rsv);
}
5674 
5675 int btrfs_block_rsv_add(struct btrfs_root *root,
5676 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5677 			enum btrfs_reserve_flush_enum flush)
5678 {
5679 	int ret;
5680 
5681 	if (num_bytes == 0)
5682 		return 0;
5683 
5684 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5685 	if (!ret) {
5686 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
5687 		return 0;
5688 	}
5689 
5690 	return ret;
5691 }
5692 
5693 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5694 {
5695 	u64 num_bytes = 0;
5696 	int ret = -ENOSPC;
5697 
5698 	if (!block_rsv)
5699 		return 0;
5700 
5701 	spin_lock(&block_rsv->lock);
5702 	num_bytes = div_factor(block_rsv->size, min_factor);
5703 	if (block_rsv->reserved >= num_bytes)
5704 		ret = 0;
5705 	spin_unlock(&block_rsv->lock);
5706 
5707 	return ret;
5708 }
5709 
5710 int btrfs_block_rsv_refill(struct btrfs_root *root,
5711 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5712 			   enum btrfs_reserve_flush_enum flush)
5713 {
5714 	u64 num_bytes = 0;
5715 	int ret = -ENOSPC;
5716 
5717 	if (!block_rsv)
5718 		return 0;
5719 
5720 	spin_lock(&block_rsv->lock);
5721 	num_bytes = min_reserved;
5722 	if (block_rsv->reserved >= num_bytes)
5723 		ret = 0;
5724 	else
5725 		num_bytes -= block_rsv->reserved;
5726 	spin_unlock(&block_rsv->lock);
5727 
5728 	if (!ret)
5729 		return 0;
5730 
5731 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5732 	if (!ret) {
5733 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
5734 		return 0;
5735 	}
5736 
5737 	return ret;
5738 }
5739 
5740 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5741 			     struct btrfs_block_rsv *block_rsv,
5742 			     u64 num_bytes)
5743 {
5744 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5745 
5746 	if (global_rsv == block_rsv ||
5747 	    block_rsv->space_info != global_rsv->space_info)
5748 		global_rsv = NULL;
5749 	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
5750 }
5751 
5752 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5753 {
5754 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5755 	struct btrfs_space_info *sinfo = block_rsv->space_info;
5756 	u64 num_bytes;
5757 
5758 	/*
5759 	 * The global block rsv is based on the size of the extent tree, the
5760 	 * checksum tree and the root tree.  If the fs is empty we want to set
5761 	 * it to a minimal amount for safety.
5762 	 */
5763 	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5764 		btrfs_root_used(&fs_info->csum_root->root_item) +
5765 		btrfs_root_used(&fs_info->tree_root->root_item);
5766 	num_bytes = max_t(u64, num_bytes, SZ_16M);
5767 
5768 	spin_lock(&sinfo->lock);
5769 	spin_lock(&block_rsv->lock);
5770 
5771 	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5772 
5773 	if (block_rsv->reserved < block_rsv->size) {
5774 		num_bytes = btrfs_space_info_used(sinfo, true);
5775 		if (sinfo->total_bytes > num_bytes) {
5776 			num_bytes = sinfo->total_bytes - num_bytes;
5777 			num_bytes = min(num_bytes,
5778 					block_rsv->size - block_rsv->reserved);
5779 			block_rsv->reserved += num_bytes;
5780 			sinfo->bytes_may_use += num_bytes;
5781 			trace_btrfs_space_reservation(fs_info, "space_info",
5782 						      sinfo->flags, num_bytes,
5783 						      1);
5784 		}
5785 	} else if (block_rsv->reserved > block_rsv->size) {
5786 		num_bytes = block_rsv->reserved - block_rsv->size;
5787 		sinfo->bytes_may_use -= num_bytes;
5788 		trace_btrfs_space_reservation(fs_info, "space_info",
5789 				      sinfo->flags, num_bytes, 0);
5790 		block_rsv->reserved = block_rsv->size;
5791 	}
5792 
5793 	if (block_rsv->reserved == block_rsv->size)
5794 		block_rsv->full = 1;
5795 	else
5796 		block_rsv->full = 0;
5797 
5798 	spin_unlock(&block_rsv->lock);
5799 	spin_unlock(&sinfo->lock);
5800 }
5801 
5802 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5803 {
5804 	struct btrfs_space_info *space_info;
5805 
5806 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5807 	fs_info->chunk_block_rsv.space_info = space_info;
5808 
5809 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5810 	fs_info->global_block_rsv.space_info = space_info;
5811 	fs_info->delalloc_block_rsv.space_info = space_info;
5812 	fs_info->trans_block_rsv.space_info = space_info;
5813 	fs_info->empty_block_rsv.space_info = space_info;
5814 	fs_info->delayed_block_rsv.space_info = space_info;
5815 
5816 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5817 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5818 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5819 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5820 	if (fs_info->quota_root)
5821 		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5822 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5823 
5824 	update_global_block_rsv(fs_info);
5825 }
5826 
5827 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5828 {
5829 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5830 				(u64)-1);
5831 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5832 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5833 	WARN_ON(fs_info->trans_block_rsv.size > 0);
5834 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5835 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
5836 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5837 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
5838 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5839 }
5840 
5841 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5842 				  struct btrfs_fs_info *fs_info)
5843 {
5844 	if (!trans->block_rsv)
5845 		return;
5846 
5847 	if (!trans->bytes_reserved)
5848 		return;
5849 
5850 	trace_btrfs_space_reservation(fs_info, "transaction",
5851 				      trans->transid, trans->bytes_reserved, 0);
5852 	btrfs_block_rsv_release(fs_info, trans->block_rsv,
5853 				trans->bytes_reserved);
5854 	trans->bytes_reserved = 0;
5855 }
5856 
5857 /*
5858  * To be called after all the new block groups attached to the transaction
5859  * handle have been created (btrfs_create_pending_block_groups()).
5860  */
5861 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5862 {
5863 	struct btrfs_fs_info *fs_info = trans->fs_info;
5864 
5865 	if (!trans->chunk_bytes_reserved)
5866 		return;
5867 
5868 	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5869 
5870 	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5871 				trans->chunk_bytes_reserved);
5872 	trans->chunk_bytes_reserved = 0;
5873 }
5874 
5875 /* Can only return 0 or -ENOSPC */
5876 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5877 				  struct btrfs_inode *inode)
5878 {
5879 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5880 	struct btrfs_root *root = inode->root;
5881 	/*
5882 	 * We always use trans->block_rsv here as we will have reserved space
5883 	 * for our orphan when starting the transaction, using get_block_rsv()
5884 	 * here will sometimes make us choose the wrong block rsv as we could be
5885 	 * doing a reloc inode for a non refcounted root.
5886 	 */
5887 	struct btrfs_block_rsv *src_rsv = trans->block_rsv;
5888 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5889 
5890 	/*
5891 	 * We need to hold space in order to delete our orphan item once we've
5892 	 * added it, so this takes the reservation so we can release it later
5893 	 * when we are truly done with the orphan item.
5894 	 */
5895 	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5896 
5897 	trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5898 			num_bytes, 1);
5899 	return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
5900 }
5901 
5902 void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
5903 {
5904 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5905 	struct btrfs_root *root = inode->root;
5906 	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5907 
5908 	trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5909 			num_bytes, 0);
5910 	btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
5911 }
5912 
5913 /*
5914  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5915  * root: the root of the parent directory
5916  * rsv: block reservation
5917  * items: the number of items that we need do reservation
5918  * qgroup_reserved: used to return the reserved size in qgroup
5919  *
5920  * This function is used to reserve the space for snapshot/subvolume
5921  * creation and deletion. Those operations are different with the
5922  * common file/directory operations, they change two fs/file trees
5923  * and root tree, the number of items that the qgroup reserves is
5924  * different with the free space reservation. So we can not use
5925  * the space reservation mechanism in start_transaction().
5926  */
5927 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5928 				     struct btrfs_block_rsv *rsv,
5929 				     int items,
5930 				     u64 *qgroup_reserved,
5931 				     bool use_global_rsv)
5932 {
5933 	u64 num_bytes;
5934 	int ret;
5935 	struct btrfs_fs_info *fs_info = root->fs_info;
5936 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5937 
5938 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5939 		/* One for parent inode, two for dir entries */
5940 		num_bytes = 3 * fs_info->nodesize;
5941 		ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
5942 		if (ret)
5943 			return ret;
5944 	} else {
5945 		num_bytes = 0;
5946 	}
5947 
5948 	*qgroup_reserved = num_bytes;
5949 
5950 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5951 	rsv->space_info = __find_space_info(fs_info,
5952 					    BTRFS_BLOCK_GROUP_METADATA);
5953 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5954 				  BTRFS_RESERVE_FLUSH_ALL);
5955 
5956 	if (ret == -ENOSPC && use_global_rsv)
5957 		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5958 
5959 	if (ret && *qgroup_reserved)
5960 		btrfs_qgroup_free_meta(root, *qgroup_reserved);
5961 
5962 	return ret;
5963 }
5964 
5965 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5966 				      struct btrfs_block_rsv *rsv)
5967 {
5968 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5969 }
5970 
5971 /**
5972  * drop_outstanding_extent - drop an outstanding extent
5973  * @inode: the inode we're dropping the extent for
5974  * @num_bytes: the number of bytes we're releasing.
5975  *
5976  * This is called when we are freeing up an outstanding extent, either called
5977  * after an error or after an extent is written.  This will return the number of
5978  * reserved extents that need to be freed.  This must be called with
5979  * BTRFS_I(inode)->lock held.
5980  */
5981 static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
5982 		u64 num_bytes)
5983 {
5984 	unsigned drop_inode_space = 0;
5985 	unsigned dropped_extents = 0;
5986 	unsigned num_extents;
5987 
5988 	num_extents = count_max_extents(num_bytes);
5989 	ASSERT(num_extents);
5990 	ASSERT(inode->outstanding_extents >= num_extents);
5991 	inode->outstanding_extents -= num_extents;
5992 
5993 	if (inode->outstanding_extents == 0 &&
5994 	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5995 			       &inode->runtime_flags))
5996 		drop_inode_space = 1;
5997 
5998 	/*
5999 	 * If we have more or the same amount of outstanding extents than we have
6000 	 * reserved then we need to leave the reserved extents count alone.
6001 	 */
6002 	if (inode->outstanding_extents >= inode->reserved_extents)
6003 		return drop_inode_space;
6004 
6005 	dropped_extents = inode->reserved_extents - inode->outstanding_extents;
6006 	inode->reserved_extents -= dropped_extents;
6007 	return dropped_extents + drop_inode_space;
6008 }
6009 
6010 /**
6011  * calc_csum_metadata_size - return the amount of metadata space that must be
6012  *	reserved/freed for the given bytes.
6013  * @inode: the inode we're manipulating
6014  * @num_bytes: the number of bytes in question
6015  * @reserve: 1 if we are reserving space, 0 if we are freeing space
6016  *
6017  * This adjusts the number of csum_bytes in the inode and then returns the
6018  * correct amount of metadata that must either be reserved or freed.  We
6019  * calculate how many checksums we can fit into one leaf and then divide the
6020  * number of bytes that will need to be checksumed by this value to figure out
6021  * how many checksums will be required.  If we are adding bytes then the number
6022  * may go up and we will return the number of additional bytes that must be
6023  * reserved.  If it is going down we will return the number of bytes that must
6024  * be freed.
6025  *
6026  * This must be called with BTRFS_I(inode)->lock held.
6027  */
6028 static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
6029 				   int reserve)
6030 {
6031 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6032 	u64 old_csums, num_csums;
6033 
6034 	if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
6035 		return 0;
6036 
6037 	old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
6038 	if (reserve)
6039 		inode->csum_bytes += num_bytes;
6040 	else
6041 		inode->csum_bytes -= num_bytes;
6042 	num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
6043 
6044 	/* No change, no need to reserve more */
6045 	if (old_csums == num_csums)
6046 		return 0;
6047 
6048 	if (reserve)
6049 		return btrfs_calc_trans_metadata_size(fs_info,
6050 						      num_csums - old_csums);
6051 
6052 	return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
6053 }
6054 
/*
 * Reserve metadata space in fs_info->delalloc_block_rsv for @num_bytes of
 * delalloc data on @inode.
 *
 * @num_bytes is rounded up to the sector size.  The reservation covers the
 * extents that are outstanding but not yet reserved, one extra item for
 * updating the inode, and whatever calc_csum_metadata_size() asks for.  When
 * quotas are enabled, one nodesize per newly reserved extent is also
 * reserved in the qgroup.
 *
 * Returns 0 on success.  On failure the error from
 * btrfs_qgroup_reserve_meta() or btrfs_block_rsv_add() is returned after the
 * inode's outstanding_extents and csum_bytes accounting has been rolled
 * back.
 */
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	struct btrfs_root *root = inode->root;
	struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
	u64 to_reserve = 0;
	u64 csum_bytes;
	unsigned nr_extents;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
	int ret = 0;
	bool delalloc_lock = true;
	u64 to_free = 0;
	unsigned dropped;
	bool release_extra = false;

	/* If we are a free space inode we need to not flush since we will be in
	 * the middle of a transaction commit.  We also don't need the delalloc
	 * mutex since we won't race with anybody.  We need this mostly to make
	 * lockdep shut its filthy mouth.
	 *
	 * If we have a transaction open (can happen if we call truncate_block
	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		flush = BTRFS_RESERVE_NO_FLUSH;
		delalloc_lock = false;
	} else if (current->journal_info) {
		flush = BTRFS_RESERVE_FLUSH_LIMIT;
	}

	if (flush != BTRFS_RESERVE_NO_FLUSH &&
	    btrfs_transaction_in_commit(fs_info))
		schedule_timeout(1);

	if (delalloc_lock)
		mutex_lock(&inode->delalloc_mutex);

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);

	spin_lock(&inode->lock);
	/* Account the new extents as outstanding right away. */
	nr_extents = count_max_extents(num_bytes);
	inode->outstanding_extents += nr_extents;

	/*
	 * From here on nr_extents is the number of outstanding extents that
	 * do not have a reservation yet.
	 */
	nr_extents = 0;
	if (inode->outstanding_extents > inode->reserved_extents)
		nr_extents += inode->outstanding_extents -
			inode->reserved_extents;

	/* We always want to reserve a slot for updating the inode. */
	to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
	/* Remembered so the failure path can tell whether anybody freed. */
	csum_bytes = inode->csum_bytes;
	spin_unlock(&inode->lock);

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
		ret = btrfs_qgroup_reserve_meta(root,
				nr_extents * fs_info->nodesize, true);
		if (ret)
			goto out_fail;
	}

	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
	if (unlikely(ret)) {
		btrfs_qgroup_free_meta(root,
				       nr_extents * fs_info->nodesize);
		goto out_fail;
	}

	spin_lock(&inode->lock);
	/*
	 * If the flag was already set, the inode update slot had been
	 * reserved before, so the one included in to_reserve above is
	 * surplus and gets released below once the locks are dropped.
	 */
	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
			     &inode->runtime_flags)) {
		to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
		release_extra = true;
	}
	inode->reserved_extents += nr_extents;
	spin_unlock(&inode->lock);

	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);

	if (to_reserve)
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), to_reserve, 1);
	if (release_extra)
		btrfs_block_rsv_release(fs_info, block_rsv,
				btrfs_calc_trans_metadata_size(fs_info, 1));
	return 0;

out_fail:
	spin_lock(&inode->lock);
	/* Undo the outstanding_extents bump done above. */
	dropped = drop_outstanding_extent(inode, num_bytes);
	/*
	 * If the inodes csum_bytes is the same as the original
	 * csum_bytes then we know we haven't raced with any free()ers
	 * so we can just reduce our inodes csum bytes and carry on.
	 */
	if (inode->csum_bytes == csum_bytes) {
		calc_csum_metadata_size(inode, num_bytes, 0);
	} else {
		u64 orig_csum_bytes = inode->csum_bytes;
		u64 bytes;

		/*
		 * This is tricky, but first we need to figure out how much we
		 * freed from any free-ers that occurred during this
		 * reservation, so we reset ->csum_bytes to the csum_bytes
		 * before we dropped our lock, and then call the free for the
		 * number of bytes that were freed while we were trying our
		 * reservation.
		 */
		bytes = csum_bytes - inode->csum_bytes;
		inode->csum_bytes = csum_bytes;
		to_free = calc_csum_metadata_size(inode, bytes, 0);


		/*
		 * Now we need to see how much we would have freed had we not
		 * been making this reservation and our ->csum_bytes were not
		 * artificially inflated.
		 */
		inode->csum_bytes = csum_bytes - num_bytes;
		bytes = csum_bytes - orig_csum_bytes;
		bytes = calc_csum_metadata_size(inode, bytes, 0);

		/*
		 * Now reset ->csum_bytes to what it should be.  If bytes is
		 * more than to_free then we would have freed more space had we
		 * not had an artificially high ->csum_bytes, so we need to free
		 * the remainder.  If bytes is the same or less then we don't
		 * need to do anything, the other free-ers did the correct
		 * thing.
		 */
		inode->csum_bytes = orig_csum_bytes - num_bytes;
		if (bytes > to_free)
			to_free = bytes - to_free;
		else
			to_free = 0;
	}
	spin_unlock(&inode->lock);
	/* Extents that became surplus also have their item space returned. */
	if (dropped)
		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);

	if (to_free) {
		btrfs_block_rsv_release(fs_info, block_rsv, to_free);
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), to_free, 0);
	}
	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);
	return ret;
}
6206 
6207 /**
6208  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6209  * @inode: the inode to release the reservation for
6210  * @num_bytes: the number of bytes we're releasing
6211  *
6212  * This will release the metadata reservation for an inode.  This can be called
6213  * once we complete IO for a given set of bytes to release their metadata
6214  * reservations.
6215  */
6216 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
6217 {
6218 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6219 	u64 to_free = 0;
6220 	unsigned dropped;
6221 
6222 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6223 	spin_lock(&inode->lock);
6224 	dropped = drop_outstanding_extent(inode, num_bytes);
6225 
6226 	if (num_bytes)
6227 		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6228 	spin_unlock(&inode->lock);
6229 	if (dropped > 0)
6230 		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
6231 
6232 	if (btrfs_is_testing(fs_info))
6233 		return;
6234 
6235 	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
6236 				      to_free, 0);
6237 
6238 	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
6239 }
6240 
6241 /**
6242  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6243  * delalloc
6244  * @inode: inode we're writing to
6245  * @start: start range we are writing to
6246  * @len: how long the range we are writing to
6247  * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6248  * 	      current reservation.
6249  *
6250  * This will do the following things
6251  *
6252  * o reserve space in data space info for num bytes
6253  *   and reserve precious corresponding qgroup space
6254  *   (Done in check_data_free_space)
6255  *
6256  * o reserve space for metadata space, based on the number of outstanding
6257  *   extents and how much csums will be needed
6258  *   also reserve metadata space in a per root over-reserve method.
6259  * o add to the inodes->delalloc_bytes
6260  * o add it to the fs_info's delalloc inodes list.
6261  *   (Above 3 all done in delalloc_reserve_metadata)
6262  *
6263  * Return 0 for success
6264  * Return <0 for error(-ENOSPC or -EQUOT)
6265  */
6266 int btrfs_delalloc_reserve_space(struct inode *inode,
6267 			struct extent_changeset **reserved, u64 start, u64 len)
6268 {
6269 	int ret;
6270 
6271 	ret = btrfs_check_data_free_space(inode, reserved, start, len);
6272 	if (ret < 0)
6273 		return ret;
6274 	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6275 	if (ret < 0)
6276 		btrfs_free_reserved_data_space(inode, *reserved, start, len);
6277 	return ret;
6278 }
6279 
6280 /**
6281  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6282  * @inode: inode we're releasing space for
6283  * @start: start position of the space already reserved
6284  * @len: the len of the space already reserved
6285  *
6286  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
6287  * called in the case that we don't need the metadata AND data reservations
6288  * anymore.  So if there is an error or we insert an inline extent.
6289  *
6290  * This function will release the metadata space that was not used and will
6291  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6292  * list if there are no delalloc bytes left.
6293  * Also it will handle the qgroup reserved space.
6294  */
6295 void btrfs_delalloc_release_space(struct inode *inode,
6296 			struct extent_changeset *reserved, u64 start, u64 len)
6297 {
6298 	btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
6299 	btrfs_free_reserved_data_space(inode, reserved, start, len);
6300 }
6301 
6302 static int update_block_group(struct btrfs_trans_handle *trans,
6303 			      struct btrfs_fs_info *info, u64 bytenr,
6304 			      u64 num_bytes, int alloc)
6305 {
6306 	struct btrfs_block_group_cache *cache = NULL;
6307 	u64 total = num_bytes;
6308 	u64 old_val;
6309 	u64 byte_in_group;
6310 	int factor;
6311 
6312 	/* block accounting for super block */
6313 	spin_lock(&info->delalloc_root_lock);
6314 	old_val = btrfs_super_bytes_used(info->super_copy);
6315 	if (alloc)
6316 		old_val += num_bytes;
6317 	else
6318 		old_val -= num_bytes;
6319 	btrfs_set_super_bytes_used(info->super_copy, old_val);
6320 	spin_unlock(&info->delalloc_root_lock);
6321 
6322 	while (total) {
6323 		cache = btrfs_lookup_block_group(info, bytenr);
6324 		if (!cache)
6325 			return -ENOENT;
6326 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6327 				    BTRFS_BLOCK_GROUP_RAID1 |
6328 				    BTRFS_BLOCK_GROUP_RAID10))
6329 			factor = 2;
6330 		else
6331 			factor = 1;
6332 		/*
6333 		 * If this block group has free space cache written out, we
6334 		 * need to make sure to load it if we are removing space.  This
6335 		 * is because we need the unpinning stage to actually add the
6336 		 * space back to the block group, otherwise we will leak space.
6337 		 */
6338 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
6339 			cache_block_group(cache, 1);
6340 
6341 		byte_in_group = bytenr - cache->key.objectid;
6342 		WARN_ON(byte_in_group > cache->key.offset);
6343 
6344 		spin_lock(&cache->space_info->lock);
6345 		spin_lock(&cache->lock);
6346 
6347 		if (btrfs_test_opt(info, SPACE_CACHE) &&
6348 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
6349 			cache->disk_cache_state = BTRFS_DC_CLEAR;
6350 
6351 		old_val = btrfs_block_group_used(&cache->item);
6352 		num_bytes = min(total, cache->key.offset - byte_in_group);
6353 		if (alloc) {
6354 			old_val += num_bytes;
6355 			btrfs_set_block_group_used(&cache->item, old_val);
6356 			cache->reserved -= num_bytes;
6357 			cache->space_info->bytes_reserved -= num_bytes;
6358 			cache->space_info->bytes_used += num_bytes;
6359 			cache->space_info->disk_used += num_bytes * factor;
6360 			spin_unlock(&cache->lock);
6361 			spin_unlock(&cache->space_info->lock);
6362 		} else {
6363 			old_val -= num_bytes;
6364 			btrfs_set_block_group_used(&cache->item, old_val);
6365 			cache->pinned += num_bytes;
6366 			cache->space_info->bytes_pinned += num_bytes;
6367 			cache->space_info->bytes_used -= num_bytes;
6368 			cache->space_info->disk_used -= num_bytes * factor;
6369 			spin_unlock(&cache->lock);
6370 			spin_unlock(&cache->space_info->lock);
6371 
6372 			trace_btrfs_space_reservation(info, "pinned",
6373 						      cache->space_info->flags,
6374 						      num_bytes, 1);
6375 			percpu_counter_add(&cache->space_info->total_bytes_pinned,
6376 					   num_bytes);
6377 			set_extent_dirty(info->pinned_extents,
6378 					 bytenr, bytenr + num_bytes - 1,
6379 					 GFP_NOFS | __GFP_NOFAIL);
6380 		}
6381 
6382 		spin_lock(&trans->transaction->dirty_bgs_lock);
6383 		if (list_empty(&cache->dirty_list)) {
6384 			list_add_tail(&cache->dirty_list,
6385 				      &trans->transaction->dirty_bgs);
6386 				trans->transaction->num_dirty_bgs++;
6387 			btrfs_get_block_group(cache);
6388 		}
6389 		spin_unlock(&trans->transaction->dirty_bgs_lock);
6390 
6391 		/*
6392 		 * No longer have used bytes in this block group, queue it for
6393 		 * deletion. We do this after adding the block group to the
6394 		 * dirty list to avoid races between cleaner kthread and space
6395 		 * cache writeout.
6396 		 */
6397 		if (!alloc && old_val == 0) {
6398 			spin_lock(&info->unused_bgs_lock);
6399 			if (list_empty(&cache->bg_list)) {
6400 				btrfs_get_block_group(cache);
6401 				list_add_tail(&cache->bg_list,
6402 					      &info->unused_bgs);
6403 			}
6404 			spin_unlock(&info->unused_bgs_lock);
6405 		}
6406 
6407 		btrfs_put_block_group(cache);
6408 		total -= num_bytes;
6409 		bytenr += num_bytes;
6410 	}
6411 	return 0;
6412 }
6413 
6414 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6415 {
6416 	struct btrfs_block_group_cache *cache;
6417 	u64 bytenr;
6418 
6419 	spin_lock(&fs_info->block_group_cache_lock);
6420 	bytenr = fs_info->first_logical_byte;
6421 	spin_unlock(&fs_info->block_group_cache_lock);
6422 
6423 	if (bytenr < (u64)-1)
6424 		return bytenr;
6425 
6426 	cache = btrfs_lookup_first_block_group(fs_info, search_start);
6427 	if (!cache)
6428 		return 0;
6429 
6430 	bytenr = cache->key.objectid;
6431 	btrfs_put_block_group(cache);
6432 
6433 	return bytenr;
6434 }
6435 
6436 static int pin_down_extent(struct btrfs_fs_info *fs_info,
6437 			   struct btrfs_block_group_cache *cache,
6438 			   u64 bytenr, u64 num_bytes, int reserved)
6439 {
6440 	spin_lock(&cache->space_info->lock);
6441 	spin_lock(&cache->lock);
6442 	cache->pinned += num_bytes;
6443 	cache->space_info->bytes_pinned += num_bytes;
6444 	if (reserved) {
6445 		cache->reserved -= num_bytes;
6446 		cache->space_info->bytes_reserved -= num_bytes;
6447 	}
6448 	spin_unlock(&cache->lock);
6449 	spin_unlock(&cache->space_info->lock);
6450 
6451 	trace_btrfs_space_reservation(fs_info, "pinned",
6452 				      cache->space_info->flags, num_bytes, 1);
6453 	percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
6454 	set_extent_dirty(fs_info->pinned_extents, bytenr,
6455 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6456 	return 0;
6457 }
6458 
6459 /*
6460  * this function must be called within transaction
6461  */
6462 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6463 		     u64 bytenr, u64 num_bytes, int reserved)
6464 {
6465 	struct btrfs_block_group_cache *cache;
6466 
6467 	cache = btrfs_lookup_block_group(fs_info, bytenr);
6468 	BUG_ON(!cache); /* Logic error */
6469 
6470 	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6471 
6472 	btrfs_put_block_group(cache);
6473 	return 0;
6474 }
6475 
6476 /*
6477  * this function must be called within transaction
6478  */
6479 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6480 				    u64 bytenr, u64 num_bytes)
6481 {
6482 	struct btrfs_block_group_cache *cache;
6483 	int ret;
6484 
6485 	cache = btrfs_lookup_block_group(fs_info, bytenr);
6486 	if (!cache)
6487 		return -EINVAL;
6488 
6489 	/*
6490 	 * pull in the free space cache (if any) so that our pin
6491 	 * removes the free space from the cache.  We have load_only set
6492 	 * to one because the slow code to read in the free extents does check
6493 	 * the pinned extents.
6494 	 */
6495 	cache_block_group(cache, 1);
6496 
6497 	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6498 
6499 	/* remove us from the free space cache (if we're there at all) */
6500 	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6501 	btrfs_put_block_group(cache);
6502 	return ret;
6503 }
6504 
6505 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6506 				   u64 start, u64 num_bytes)
6507 {
6508 	int ret;
6509 	struct btrfs_block_group_cache *block_group;
6510 	struct btrfs_caching_control *caching_ctl;
6511 
6512 	block_group = btrfs_lookup_block_group(fs_info, start);
6513 	if (!block_group)
6514 		return -EINVAL;
6515 
6516 	cache_block_group(block_group, 0);
6517 	caching_ctl = get_caching_control(block_group);
6518 
6519 	if (!caching_ctl) {
6520 		/* Logic error */
6521 		BUG_ON(!block_group_cache_done(block_group));
6522 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6523 	} else {
6524 		mutex_lock(&caching_ctl->mutex);
6525 
6526 		if (start >= caching_ctl->progress) {
6527 			ret = add_excluded_extent(fs_info, start, num_bytes);
6528 		} else if (start + num_bytes <= caching_ctl->progress) {
6529 			ret = btrfs_remove_free_space(block_group,
6530 						      start, num_bytes);
6531 		} else {
6532 			num_bytes = caching_ctl->progress - start;
6533 			ret = btrfs_remove_free_space(block_group,
6534 						      start, num_bytes);
6535 			if (ret)
6536 				goto out_lock;
6537 
6538 			num_bytes = (start + num_bytes) -
6539 				caching_ctl->progress;
6540 			start = caching_ctl->progress;
6541 			ret = add_excluded_extent(fs_info, start, num_bytes);
6542 		}
6543 out_lock:
6544 		mutex_unlock(&caching_ctl->mutex);
6545 		put_caching_control(caching_ctl);
6546 	}
6547 	btrfs_put_block_group(block_group);
6548 	return ret;
6549 }
6550 
6551 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6552 				 struct extent_buffer *eb)
6553 {
6554 	struct btrfs_file_extent_item *item;
6555 	struct btrfs_key key;
6556 	int found_type;
6557 	int i;
6558 
6559 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6560 		return 0;
6561 
6562 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
6563 		btrfs_item_key_to_cpu(eb, &key, i);
6564 		if (key.type != BTRFS_EXTENT_DATA_KEY)
6565 			continue;
6566 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6567 		found_type = btrfs_file_extent_type(eb, item);
6568 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
6569 			continue;
6570 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6571 			continue;
6572 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6573 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6574 		__exclude_logged_extent(fs_info, key.objectid, key.offset);
6575 	}
6576 
6577 	return 0;
6578 }
6579 
6580 static void
6581 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6582 {
6583 	atomic_inc(&bg->reservations);
6584 }
6585 
6586 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6587 					const u64 start)
6588 {
6589 	struct btrfs_block_group_cache *bg;
6590 
6591 	bg = btrfs_lookup_block_group(fs_info, start);
6592 	ASSERT(bg);
6593 	if (atomic_dec_and_test(&bg->reservations))
6594 		wake_up_atomic_t(&bg->reservations);
6595 	btrfs_put_block_group(bg);
6596 }
6597 
6598 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6599 {
6600 	schedule();
6601 	return 0;
6602 }
6603 
6604 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6605 {
6606 	struct btrfs_space_info *space_info = bg->space_info;
6607 
6608 	ASSERT(bg->ro);
6609 
6610 	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6611 		return;
6612 
6613 	/*
6614 	 * Our block group is read only but before we set it to read only,
6615 	 * some task might have had allocated an extent from it already, but it
6616 	 * has not yet created a respective ordered extent (and added it to a
6617 	 * root's list of ordered extents).
6618 	 * Therefore wait for any task currently allocating extents, since the
6619 	 * block group's reservations counter is incremented while a read lock
6620 	 * on the groups' semaphore is held and decremented after releasing
6621 	 * the read access on that semaphore and creating the ordered extent.
6622 	 */
6623 	down_write(&space_info->groups_sem);
6624 	up_write(&space_info->groups_sem);
6625 
6626 	wait_on_atomic_t(&bg->reservations,
6627 			 btrfs_wait_bg_reservations_atomic_t,
6628 			 TASK_UNINTERRUPTIBLE);
6629 }
6630 
6631 /**
6632  * btrfs_add_reserved_bytes - update the block_group and space info counters
6633  * @cache:	The cache we are manipulating
6634  * @ram_bytes:  The number of bytes of file content, and will be same to
6635  *              @num_bytes except for the compress path.
6636  * @num_bytes:	The number of bytes in question
6637  * @delalloc:   The blocks are allocated for the delalloc write
6638  *
6639  * This is called by the allocator when it reserves space. If this is a
6640  * reservation and the block group has become read only we cannot make the
6641  * reservation and return -EAGAIN, otherwise this function always succeeds.
6642  */
6643 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6644 				    u64 ram_bytes, u64 num_bytes, int delalloc)
6645 {
6646 	struct btrfs_space_info *space_info = cache->space_info;
6647 	int ret = 0;
6648 
6649 	spin_lock(&space_info->lock);
6650 	spin_lock(&cache->lock);
6651 	if (cache->ro) {
6652 		ret = -EAGAIN;
6653 	} else {
6654 		cache->reserved += num_bytes;
6655 		space_info->bytes_reserved += num_bytes;
6656 
6657 		trace_btrfs_space_reservation(cache->fs_info,
6658 				"space_info", space_info->flags,
6659 				ram_bytes, 0);
6660 		space_info->bytes_may_use -= ram_bytes;
6661 		if (delalloc)
6662 			cache->delalloc_bytes += num_bytes;
6663 	}
6664 	spin_unlock(&cache->lock);
6665 	spin_unlock(&space_info->lock);
6666 	return ret;
6667 }
6668 
6669 /**
6670  * btrfs_free_reserved_bytes - update the block_group and space info counters
6671  * @cache:      The cache we are manipulating
6672  * @num_bytes:  The number of bytes in question
6673  * @delalloc:   The blocks are allocated for the delalloc write
6674  *
6675  * This is called by somebody who is freeing space that was never actually used
6676  * on disk.  For example if you reserve some space for a new leaf in transaction
6677  * A and before transaction A commits you free that leaf, you call this with
6678  * reserve set to 0 in order to clear the reservation.
6679  */
6680 
6681 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6682 				     u64 num_bytes, int delalloc)
6683 {
6684 	struct btrfs_space_info *space_info = cache->space_info;
6685 	int ret = 0;
6686 
6687 	spin_lock(&space_info->lock);
6688 	spin_lock(&cache->lock);
6689 	if (cache->ro)
6690 		space_info->bytes_readonly += num_bytes;
6691 	cache->reserved -= num_bytes;
6692 	space_info->bytes_reserved -= num_bytes;
6693 
6694 	if (delalloc)
6695 		cache->delalloc_bytes -= num_bytes;
6696 	spin_unlock(&cache->lock);
6697 	spin_unlock(&space_info->lock);
6698 	return ret;
6699 }
6700 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6701 {
6702 	struct btrfs_caching_control *next;
6703 	struct btrfs_caching_control *caching_ctl;
6704 	struct btrfs_block_group_cache *cache;
6705 
6706 	down_write(&fs_info->commit_root_sem);
6707 
6708 	list_for_each_entry_safe(caching_ctl, next,
6709 				 &fs_info->caching_block_groups, list) {
6710 		cache = caching_ctl->block_group;
6711 		if (block_group_cache_done(cache)) {
6712 			cache->last_byte_to_unpin = (u64)-1;
6713 			list_del_init(&caching_ctl->list);
6714 			put_caching_control(caching_ctl);
6715 		} else {
6716 			cache->last_byte_to_unpin = caching_ctl->progress;
6717 		}
6718 	}
6719 
6720 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6721 		fs_info->pinned_extents = &fs_info->freed_extents[1];
6722 	else
6723 		fs_info->pinned_extents = &fs_info->freed_extents[0];
6724 
6725 	up_write(&fs_info->commit_root_sem);
6726 
6727 	update_global_block_rsv(fs_info);
6728 }
6729 
6730 /*
6731  * Returns the free cluster for the given space info and sets empty_cluster to
6732  * what it should be based on the mount options.
6733  */
6734 static struct btrfs_free_cluster *
6735 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6736 		   struct btrfs_space_info *space_info, u64 *empty_cluster)
6737 {
6738 	struct btrfs_free_cluster *ret = NULL;
6739 
6740 	*empty_cluster = 0;
6741 	if (btrfs_mixed_space_info(space_info))
6742 		return ret;
6743 
6744 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6745 		ret = &fs_info->meta_alloc_cluster;
6746 		if (btrfs_test_opt(fs_info, SSD))
6747 			*empty_cluster = SZ_2M;
6748 		else
6749 			*empty_cluster = SZ_64K;
6750 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6751 		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
6752 		*empty_cluster = SZ_2M;
6753 		ret = &fs_info->data_alloc_cluster;
6754 	}
6755 
6756 	return ret;
6757 }
6758 
6759 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6760 			      u64 start, u64 end,
6761 			      const bool return_free_space)
6762 {
6763 	struct btrfs_block_group_cache *cache = NULL;
6764 	struct btrfs_space_info *space_info;
6765 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6766 	struct btrfs_free_cluster *cluster = NULL;
6767 	u64 len;
6768 	u64 total_unpinned = 0;
6769 	u64 empty_cluster = 0;
6770 	bool readonly;
6771 
6772 	while (start <= end) {
6773 		readonly = false;
6774 		if (!cache ||
6775 		    start >= cache->key.objectid + cache->key.offset) {
6776 			if (cache)
6777 				btrfs_put_block_group(cache);
6778 			total_unpinned = 0;
6779 			cache = btrfs_lookup_block_group(fs_info, start);
6780 			BUG_ON(!cache); /* Logic error */
6781 
6782 			cluster = fetch_cluster_info(fs_info,
6783 						     cache->space_info,
6784 						     &empty_cluster);
6785 			empty_cluster <<= 1;
6786 		}
6787 
6788 		len = cache->key.objectid + cache->key.offset - start;
6789 		len = min(len, end + 1 - start);
6790 
6791 		if (start < cache->last_byte_to_unpin) {
6792 			len = min(len, cache->last_byte_to_unpin - start);
6793 			if (return_free_space)
6794 				btrfs_add_free_space(cache, start, len);
6795 		}
6796 
6797 		start += len;
6798 		total_unpinned += len;
6799 		space_info = cache->space_info;
6800 
6801 		/*
6802 		 * If this space cluster has been marked as fragmented and we've
6803 		 * unpinned enough in this block group to potentially allow a
6804 		 * cluster to be created inside of it go ahead and clear the
6805 		 * fragmented check.
6806 		 */
6807 		if (cluster && cluster->fragmented &&
6808 		    total_unpinned > empty_cluster) {
6809 			spin_lock(&cluster->lock);
6810 			cluster->fragmented = 0;
6811 			spin_unlock(&cluster->lock);
6812 		}
6813 
6814 		spin_lock(&space_info->lock);
6815 		spin_lock(&cache->lock);
6816 		cache->pinned -= len;
6817 		space_info->bytes_pinned -= len;
6818 
6819 		trace_btrfs_space_reservation(fs_info, "pinned",
6820 					      space_info->flags, len, 0);
6821 		space_info->max_extent_size = 0;
6822 		percpu_counter_add(&space_info->total_bytes_pinned, -len);
6823 		if (cache->ro) {
6824 			space_info->bytes_readonly += len;
6825 			readonly = true;
6826 		}
6827 		spin_unlock(&cache->lock);
6828 		if (!readonly && return_free_space &&
6829 		    global_rsv->space_info == space_info) {
6830 			u64 to_add = len;
6831 
6832 			spin_lock(&global_rsv->lock);
6833 			if (!global_rsv->full) {
6834 				to_add = min(len, global_rsv->size -
6835 					     global_rsv->reserved);
6836 				global_rsv->reserved += to_add;
6837 				space_info->bytes_may_use += to_add;
6838 				if (global_rsv->reserved >= global_rsv->size)
6839 					global_rsv->full = 1;
6840 				trace_btrfs_space_reservation(fs_info,
6841 							      "space_info",
6842 							      space_info->flags,
6843 							      to_add, 1);
6844 				len -= to_add;
6845 			}
6846 			spin_unlock(&global_rsv->lock);
6847 			/* Add to any tickets we may have */
6848 			if (len)
6849 				space_info_add_new_bytes(fs_info, space_info,
6850 							 len);
6851 		}
6852 		spin_unlock(&space_info->lock);
6853 	}
6854 
6855 	if (cache)
6856 		btrfs_put_block_group(cache);
6857 	return 0;
6858 }
6859 
6860 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6861 			       struct btrfs_fs_info *fs_info)
6862 {
6863 	struct btrfs_block_group_cache *block_group, *tmp;
6864 	struct list_head *deleted_bgs;
6865 	struct extent_io_tree *unpin;
6866 	u64 start;
6867 	u64 end;
6868 	int ret;
6869 
6870 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6871 		unpin = &fs_info->freed_extents[1];
6872 	else
6873 		unpin = &fs_info->freed_extents[0];
6874 
6875 	while (!trans->aborted) {
6876 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
6877 		ret = find_first_extent_bit(unpin, 0, &start, &end,
6878 					    EXTENT_DIRTY, NULL);
6879 		if (ret) {
6880 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6881 			break;
6882 		}
6883 
6884 		if (btrfs_test_opt(fs_info, DISCARD))
6885 			ret = btrfs_discard_extent(fs_info, start,
6886 						   end + 1 - start, NULL);
6887 
6888 		clear_extent_dirty(unpin, start, end);
6889 		unpin_extent_range(fs_info, start, end, true);
6890 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6891 		cond_resched();
6892 	}
6893 
6894 	/*
6895 	 * Transaction is finished.  We don't need the lock anymore.  We
6896 	 * do need to clean up the block groups in case of a transaction
6897 	 * abort.
6898 	 */
6899 	deleted_bgs = &trans->transaction->deleted_bgs;
6900 	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6901 		u64 trimmed = 0;
6902 
6903 		ret = -EROFS;
6904 		if (!trans->aborted)
6905 			ret = btrfs_discard_extent(fs_info,
6906 						   block_group->key.objectid,
6907 						   block_group->key.offset,
6908 						   &trimmed);
6909 
6910 		list_del_init(&block_group->bg_list);
6911 		btrfs_put_block_group_trimming(block_group);
6912 		btrfs_put_block_group(block_group);
6913 
6914 		if (ret) {
6915 			const char *errstr = btrfs_decode_error(ret);
6916 			btrfs_warn(fs_info,
6917 			   "discard failed while removing blockgroup: errno=%d %s",
6918 				   ret, errstr);
6919 		}
6920 	}
6921 
6922 	return 0;
6923 }
6924 
/*
 * Drop @refs_to_drop references from the extent described by @node
 * (node->bytenr, node->num_bytes) in the extent tree.
 *
 * The backref identified by @parent, @root_objectid, @owner_objectid and
 * @owner_offset is looked up first, then the extent item itself is located
 * (either a few slots before the backref in the same leaf, or through a
 * fresh search).  If references remain afterwards, the stored count is
 * updated and @extent_op, when given, is applied to the extent item.  If the
 * count drops to zero the extent item is deleted (together with the item the
 * backref lookup landed on, when that is a separate item), checksums are
 * removed for data extents, and the range is passed to
 * add_to_free_space_tree() and update_block_group().
 *
 * @owner_objectid >= BTRFS_FIRST_FREE_OBJECTID marks a data extent; for
 * anything else @refs_to_drop must be 1 (BUG_ON otherwise).
 *
 * Returns -ENOMEM if the path cannot be allocated.  Every later failure
 * aborts the transaction before the error is returned.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *info,
				struct btrfs_delayed_ref_node *node, u64 parent,
				u64 root_objectid, u64 owner_objectid,
				u64 owner_offset, int refs_to_drop,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_root *extent_root = info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	int is_data;
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
	u32 item_size;
	u64 refs;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	int last_ref = 0;
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

	/* Skinny metadata items are only considered for non-data extents. */
	if (is_data)
		skinny_metadata = 0;

	ret = lookup_extent_backref(trans, info, path, &iref,
				    bytenr, num_bytes, parent,
				    root_objectid, owner_objectid,
				    owner_offset);
	if (ret == 0) {
		/*
		 * Walk backwards from the backref's slot looking for the
		 * extent (or metadata) item of this bytenr.  Give up once the
		 * bytenr changes or after a handful of slots.
		 */
		extent_slot = path->slots[0];
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      extent_slot);
			if (key.objectid != bytenr)
				break;
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
				found_extent = 1;
				break;
			}
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
			if (path->slots[0] - extent_slot > 5)
				break;
			extent_slot--;
		}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
		if (found_extent && item_size < sizeof(*ei))
			found_extent = 0;
#endif
		if (!found_extent) {
			/*
			 * The extent item is not nearby: remove the backref
			 * item now, then search for the extent item itself.
			 */
			BUG_ON(iref);
			ret = remove_extent_backref(trans, info, path, NULL,
						    refs_to_drop,
						    is_data, &last_ref);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_release_path(path);
			path->leave_spinning = 1;

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			/* Still nothing: retry with a plain extent item key. */
			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
				key.objectid = bytenr;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

			if (ret) {
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
				if (ret > 0)
					btrfs_print_leaf(path->nodes[0]);
			}
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			extent_slot = path->slots[0];
		}
	} else if (WARN_ON(ret == -ENOENT)) {
		btrfs_print_leaf(path->nodes[0]);
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, extent_slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* Old-format (v0) item: convert it, then look the item up again. */
	if (item_size < sizeof(*ei)) {
		BUG_ON(found_extent || extent_slot != path->slots[0]);
		ret = convert_extent_item_v0(trans, info, path, owner_objectid,
					     0);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		btrfs_release_path(path);
		path->leave_spinning = 1;

		key.objectid = bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = num_bytes;

		ret = btrfs_search_slot(trans, extent_root, &key, path,
					-1, 1);
		if (ret) {
			btrfs_err(info,
				  "umm, got %d back from search, was looking for %llu",
				ret, bytenr);
			btrfs_print_leaf(path->nodes[0]);
		}
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		extent_slot = path->slots[0];
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, extent_slot);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
	/*
	 * A non-data extent stored as a regular extent item carries a
	 * btrfs_tree_block_info right after the extent item; its level is
	 * expected to equal owner_objectid.
	 */
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}

	refs = btrfs_extent_refs(leaf, ei);
	if (refs < refs_to_drop) {
		btrfs_err(info,
			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
			  refs_to_drop, refs, bytenr);
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	refs -= refs_to_drop;

	if (refs > 0) {
		/* The extent stays alive: only adjust counts and backrefs. */
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
		 */
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
			ret = remove_extent_backref(trans, info, path,
						    iref, refs_to_drop,
						    is_data, &last_ref);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	} else {
		/* Last reference gone: delete the item(s) and free the space. */
		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
			       extent_data_ref_count(path, iref));
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				/*
				 * The path still points one slot past the
				 * extent item; delete both items in one go.
				 */
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}

		last_ref = 1;
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_release_path(path);

		if (is_data) {
			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}

		ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = update_block_group(trans, info, bytenr, num_bytes, 0);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	btrfs_release_path(path);

out:
	btrfs_free_path(path);
	return ret;
}
7195 
/*
 * When we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 *
 * Returns 1 only when the head for @bytenr was removed from the tree and its
 * must_insert_reserved flag was set.  Returns 0 in every other case: no head
 * found, the head still has refs queued, it carries an extent op without
 * must_insert_reserved, its mutex is held by someone else, or it was removed
 * but must_insert_reserved was clear.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				      u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head)
		goto out_delayed_unlock;

	spin_lock(&head->lock);
	/* Other delayed refs are still queued on this head: leave it alone. */
	if (!list_empty(&head->ref_list))
		goto out;

	/*
	 * A pending extent op is only thrown away when the head has
	 * must_insert_reserved set; otherwise the head is left in place.
	 */
	if (head->extent_op) {
		if (!head->must_insert_reserved)
			goto out;
		btrfs_free_delayed_extent_op(head->extent_op);
		head->extent_op = NULL;
	}

	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

	/*
	 * at this point we have a head with no other entries.  Go
	 * ahead and process it.
	 */
	head->node.in_tree = 0;
	rb_erase(&head->href_node, &delayed_refs->href_root);

	atomic_dec(&delayed_refs->num_entries);

	/*
	 * we don't take a ref on the node because we're removing it from the
	 * tree, so we just steal the ref the tree was holding.
	 */
	delayed_refs->num_heads--;
	if (head->processing == 0)
		delayed_refs->num_heads_ready--;
	head->processing = 0;
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

	mutex_unlock(&head->mutex);
	/* Drop the reference stolen from the tree above. */
	btrfs_put_delayed_ref(&head->node);
	return ret;
out:
	spin_unlock(&head->lock);

out_delayed_unlock:
	spin_unlock(&delayed_refs->lock);
	return 0;
}
7267 
7268 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7269 			   struct btrfs_root *root,
7270 			   struct extent_buffer *buf,
7271 			   u64 parent, int last_ref)
7272 {
7273 	struct btrfs_fs_info *fs_info = root->fs_info;
7274 	int pin = 1;
7275 	int ret;
7276 
7277 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7278 		int old_ref_mod, new_ref_mod;
7279 
7280 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
7281 						 buf->len, parent,
7282 						 root->root_key.objectid,
7283 						 btrfs_header_level(buf),
7284 						 BTRFS_DROP_DELAYED_REF, NULL,
7285 						 &old_ref_mod, &new_ref_mod);
7286 		BUG_ON(ret); /* -ENOMEM */
7287 		pin = old_ref_mod >= 0 && new_ref_mod < 0;
7288 	}
7289 
7290 	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7291 		struct btrfs_block_group_cache *cache;
7292 
7293 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7294 			ret = check_ref_cleanup(trans, buf->start);
7295 			if (!ret)
7296 				goto out;
7297 		}
7298 
7299 		pin = 0;
7300 		cache = btrfs_lookup_block_group(fs_info, buf->start);
7301 
7302 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7303 			pin_down_extent(fs_info, cache, buf->start,
7304 					buf->len, 1);
7305 			btrfs_put_block_group(cache);
7306 			goto out;
7307 		}
7308 
7309 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7310 
7311 		btrfs_add_free_space(cache, buf->start, buf->len);
7312 		btrfs_free_reserved_bytes(cache, buf->len, 0);
7313 		btrfs_put_block_group(cache);
7314 		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7315 	}
7316 out:
7317 	if (pin)
7318 		add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
7319 				 root->root_key.objectid);
7320 
7321 	if (last_ref) {
7322 		/*
7323 		 * Deleting the buffer, clear the corrupt flag since it doesn't
7324 		 * matter anymore.
7325 		 */
7326 		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7327 	}
7328 }
7329 
7330 /* Can return -ENOMEM */
7331 int btrfs_free_extent(struct btrfs_trans_handle *trans,
7332 		      struct btrfs_fs_info *fs_info,
7333 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7334 		      u64 owner, u64 offset)
7335 {
7336 	int old_ref_mod, new_ref_mod;
7337 	int ret;
7338 
7339 	if (btrfs_is_testing(fs_info))
7340 		return 0;
7341 
7342 
7343 	/*
7344 	 * tree log blocks never actually go into the extent allocation
7345 	 * tree, just update pinning info and exit early.
7346 	 */
7347 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7348 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7349 		/* unlocks the pinned mutex */
7350 		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7351 		old_ref_mod = new_ref_mod = 0;
7352 		ret = 0;
7353 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7354 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7355 						 num_bytes, parent,
7356 						 root_objectid, (int)owner,
7357 						 BTRFS_DROP_DELAYED_REF, NULL,
7358 						 &old_ref_mod, &new_ref_mod);
7359 	} else {
7360 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7361 						 num_bytes, parent,
7362 						 root_objectid, owner, offset,
7363 						 0, BTRFS_DROP_DELAYED_REF,
7364 						 &old_ref_mod, &new_ref_mod);
7365 	}
7366 
7367 	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
7368 		add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
7369 
7370 	return ret;
7371 }
7372 
7373 /*
7374  * when we wait for progress in the block group caching, its because
7375  * our allocation attempt failed at least once.  So, we must sleep
7376  * and let some progress happen before we try again.
7377  *
7378  * This function will sleep at least once waiting for new free space to
7379  * show up, and then it will check the block group free space numbers
7380  * for our min num_bytes.  Another option is to have it go ahead
7381  * and look in the rbtree for a free extent of a given size, but this
7382  * is a good start.
7383  *
7384  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7385  * any of the information in this block group.
7386  */
7387 static noinline void
7388 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7389 				u64 num_bytes)
7390 {
7391 	struct btrfs_caching_control *caching_ctl;
7392 
7393 	caching_ctl = get_caching_control(cache);
7394 	if (!caching_ctl)
7395 		return;
7396 
7397 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7398 		   (cache->free_space_ctl->free_space >= num_bytes));
7399 
7400 	put_caching_control(caching_ctl);
7401 }
7402 
7403 static noinline int
7404 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7405 {
7406 	struct btrfs_caching_control *caching_ctl;
7407 	int ret = 0;
7408 
7409 	caching_ctl = get_caching_control(cache);
7410 	if (!caching_ctl)
7411 		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7412 
7413 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
7414 	if (cache->cached == BTRFS_CACHE_ERROR)
7415 		ret = -EIO;
7416 	put_caching_control(caching_ctl);
7417 	return ret;
7418 }
7419 
7420 int __get_raid_index(u64 flags)
7421 {
7422 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
7423 		return BTRFS_RAID_RAID10;
7424 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
7425 		return BTRFS_RAID_RAID1;
7426 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
7427 		return BTRFS_RAID_DUP;
7428 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
7429 		return BTRFS_RAID_RAID0;
7430 	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
7431 		return BTRFS_RAID_RAID5;
7432 	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
7433 		return BTRFS_RAID_RAID6;
7434 
7435 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
7436 }
7437 
/* Return the BTRFS_RAID_* index that matches the flags of @cache. */
int get_block_group_index(struct btrfs_block_group_cache *cache)
{
	return __get_raid_index(cache->flags);
}
7442 
/* Lower-case profile names, indexed by BTRFS_RAID_* (see get_raid_name()). */
static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10]	= "raid10",
	[BTRFS_RAID_RAID1]	= "raid1",
	[BTRFS_RAID_DUP]	= "dup",
	[BTRFS_RAID_RAID0]	= "raid0",
	[BTRFS_RAID_SINGLE]	= "single",
	[BTRFS_RAID_RAID5]	= "raid5",
	[BTRFS_RAID_RAID6]	= "raid6",
};
7452 
7453 static const char *get_raid_name(enum btrfs_raid_types type)
7454 {
7455 	if (type >= BTRFS_NR_RAID_TYPES)
7456 		return NULL;
7457 
7458 	return btrfs_raid_type_names[type];
7459 }
7460 
/*
 * Stages of the free extent search, in increasing order of effort.  The
 * numeric order matters: find_free_extent() compares its loop counter
 * against these with > and >=.
 */
enum btrfs_loop_type {
	LOOP_CACHING_NOWAIT,	/* 0: first pass */
	LOOP_CACHING_WAIT,	/* 1 */
	LOOP_ALLOC_CHUNK,	/* 2 */
	LOOP_NO_EMPTY_SIZE,	/* 3: from here on no new cluster is set up */
};
7467 
7468 static inline void
7469 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7470 		       int delalloc)
7471 {
7472 	if (delalloc)
7473 		down_read(&cache->data_rwsem);
7474 }
7475 
/*
 * Take a reference on the block group and, for a delalloc allocation, also
 * its data_rwsem for reading.
 */
static inline void
btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	btrfs_get_block_group(cache);
	btrfs_lock_block_group(cache, delalloc);
}
7484 
/*
 * Lock @cluster for refilling and return the block group it currently
 * belongs to.
 *
 * cluster->refill_lock is taken here and is still held on every return path;
 * dropping it is left to the caller.
 *
 * Returns:
 *  - NULL when the cluster is not attached to any block group;
 *  - @block_group itself when the cluster already lives there (no extra
 *    reference or rwsem is taken in that case);
 *  - otherwise the cluster's block group, with a reference taken on it and,
 *    when @delalloc is set, its data_rwsem held for reading.
 */
static struct btrfs_block_group_cache *
btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
{
	struct btrfs_block_group_cache *used_bg = NULL;

	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
			return used_bg;

		btrfs_get_block_group(used_bg);

		if (!delalloc)
			return used_bg;

		/* Fast path: the rwsem is free, no need to drop the spinlock. */
		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;

		/* Taking the rwsem may sleep, so let go of the spinlock first. */
		spin_unlock(&cluster->refill_lock);

		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		/*
		 * The cluster may have moved to another block group while the
		 * spinlock was dropped; if so, undo and start over.
		 */
		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}
7522 
/*
 * Counterpart of btrfs_grab_block_group(): release the data_rwsem read lock
 * (only when @delalloc is set) and then drop the block group reference.
 */
static inline void
btrfs_release_block_group(struct btrfs_block_group_cache *cache,
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}
7531 
/*
 * Walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->type == BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
 */
7543 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7544 				u64 ram_bytes, u64 num_bytes, u64 empty_size,
7545 				u64 hint_byte, struct btrfs_key *ins,
7546 				u64 flags, int delalloc)
7547 {
7548 	int ret = 0;
7549 	struct btrfs_root *root = fs_info->extent_root;
7550 	struct btrfs_free_cluster *last_ptr = NULL;
7551 	struct btrfs_block_group_cache *block_group = NULL;
7552 	u64 search_start = 0;
7553 	u64 max_extent_size = 0;
7554 	u64 empty_cluster = 0;
7555 	struct btrfs_space_info *space_info;
7556 	int loop = 0;
7557 	int index = __get_raid_index(flags);
7558 	bool failed_cluster_refill = false;
7559 	bool failed_alloc = false;
7560 	bool use_cluster = true;
7561 	bool have_caching_bg = false;
7562 	bool orig_have_caching_bg = false;
7563 	bool full_search = false;
7564 
7565 	WARN_ON(num_bytes < fs_info->sectorsize);
7566 	ins->type = BTRFS_EXTENT_ITEM_KEY;
7567 	ins->objectid = 0;
7568 	ins->offset = 0;
7569 
7570 	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7571 
7572 	space_info = __find_space_info(fs_info, flags);
7573 	if (!space_info) {
7574 		btrfs_err(fs_info, "No space info for %llu", flags);
7575 		return -ENOSPC;
7576 	}
7577 
7578 	/*
7579 	 * If our free space is heavily fragmented we may not be able to make
7580 	 * big contiguous allocations, so instead of doing the expensive search
7581 	 * for free space, simply return ENOSPC with our max_extent_size so we
7582 	 * can go ahead and search for a more manageable chunk.
7583 	 *
7584 	 * If our max_extent_size is large enough for our allocation simply
7585 	 * disable clustering since we will likely not be able to find enough
7586 	 * space to create a cluster and induce latency trying.
7587 	 */
7588 	if (unlikely(space_info->max_extent_size)) {
7589 		spin_lock(&space_info->lock);
7590 		if (space_info->max_extent_size &&
7591 		    num_bytes > space_info->max_extent_size) {
7592 			ins->offset = space_info->max_extent_size;
7593 			spin_unlock(&space_info->lock);
7594 			return -ENOSPC;
7595 		} else if (space_info->max_extent_size) {
7596 			use_cluster = false;
7597 		}
7598 		spin_unlock(&space_info->lock);
7599 	}
7600 
7601 	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7602 	if (last_ptr) {
7603 		spin_lock(&last_ptr->lock);
7604 		if (last_ptr->block_group)
7605 			hint_byte = last_ptr->window_start;
7606 		if (last_ptr->fragmented) {
7607 			/*
7608 			 * We still set window_start so we can keep track of the
7609 			 * last place we found an allocation to try and save
7610 			 * some time.
7611 			 */
7612 			hint_byte = last_ptr->window_start;
7613 			use_cluster = false;
7614 		}
7615 		spin_unlock(&last_ptr->lock);
7616 	}
7617 
7618 	search_start = max(search_start, first_logical_byte(fs_info, 0));
7619 	search_start = max(search_start, hint_byte);
7620 	if (search_start == hint_byte) {
7621 		block_group = btrfs_lookup_block_group(fs_info, search_start);
7622 		/*
7623 		 * we don't want to use the block group if it doesn't match our
7624 		 * allocation bits, or if its not cached.
7625 		 *
7626 		 * However if we are re-searching with an ideal block group
7627 		 * picked out then we don't care that the block group is cached.
7628 		 */
7629 		if (block_group && block_group_bits(block_group, flags) &&
7630 		    block_group->cached != BTRFS_CACHE_NO) {
7631 			down_read(&space_info->groups_sem);
7632 			if (list_empty(&block_group->list) ||
7633 			    block_group->ro) {
7634 				/*
7635 				 * someone is removing this block group,
7636 				 * we can't jump into the have_block_group
7637 				 * target because our list pointers are not
7638 				 * valid
7639 				 */
7640 				btrfs_put_block_group(block_group);
7641 				up_read(&space_info->groups_sem);
7642 			} else {
7643 				index = get_block_group_index(block_group);
7644 				btrfs_lock_block_group(block_group, delalloc);
7645 				goto have_block_group;
7646 			}
7647 		} else if (block_group) {
7648 			btrfs_put_block_group(block_group);
7649 		}
7650 	}
7651 search:
7652 	have_caching_bg = false;
7653 	if (index == 0 || index == __get_raid_index(flags))
7654 		full_search = true;
7655 	down_read(&space_info->groups_sem);
7656 	list_for_each_entry(block_group, &space_info->block_groups[index],
7657 			    list) {
7658 		u64 offset;
7659 		int cached;
7660 
7661 		/* If the block group is read-only, we can skip it entirely. */
7662 		if (unlikely(block_group->ro))
7663 			continue;
7664 
7665 		btrfs_grab_block_group(block_group, delalloc);
7666 		search_start = block_group->key.objectid;
7667 
7668 		/*
7669 		 * this can happen if we end up cycling through all the
7670 		 * raid types, but we want to make sure we only allocate
7671 		 * for the proper type.
7672 		 */
7673 		if (!block_group_bits(block_group, flags)) {
7674 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
7675 				BTRFS_BLOCK_GROUP_RAID1 |
7676 				BTRFS_BLOCK_GROUP_RAID5 |
7677 				BTRFS_BLOCK_GROUP_RAID6 |
7678 				BTRFS_BLOCK_GROUP_RAID10;
7679 
7680 			/*
7681 			 * if they asked for extra copies and this block group
7682 			 * doesn't provide them, bail.  This does allow us to
7683 			 * fill raid0 from raid1.
7684 			 */
7685 			if ((flags & extra) && !(block_group->flags & extra))
7686 				goto loop;
7687 		}
7688 
7689 have_block_group:
7690 		cached = block_group_cache_done(block_group);
7691 		if (unlikely(!cached)) {
7692 			have_caching_bg = true;
7693 			ret = cache_block_group(block_group, 0);
7694 			BUG_ON(ret < 0);
7695 			ret = 0;
7696 		}
7697 
7698 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7699 			goto loop;
7700 
7701 		/*
7702 		 * Ok we want to try and use the cluster allocator, so
7703 		 * lets look there
7704 		 */
7705 		if (last_ptr && use_cluster) {
7706 			struct btrfs_block_group_cache *used_block_group;
7707 			unsigned long aligned_cluster;
7708 			/*
7709 			 * the refill lock keeps out other
7710 			 * people trying to start a new cluster
7711 			 */
7712 			used_block_group = btrfs_lock_cluster(block_group,
7713 							      last_ptr,
7714 							      delalloc);
7715 			if (!used_block_group)
7716 				goto refill_cluster;
7717 
7718 			if (used_block_group != block_group &&
7719 			    (used_block_group->ro ||
7720 			     !block_group_bits(used_block_group, flags)))
7721 				goto release_cluster;
7722 
7723 			offset = btrfs_alloc_from_cluster(used_block_group,
7724 						last_ptr,
7725 						num_bytes,
7726 						used_block_group->key.objectid,
7727 						&max_extent_size);
7728 			if (offset) {
7729 				/* we have a block, we're done */
7730 				spin_unlock(&last_ptr->refill_lock);
7731 				trace_btrfs_reserve_extent_cluster(fs_info,
7732 						used_block_group,
7733 						search_start, num_bytes);
7734 				if (used_block_group != block_group) {
7735 					btrfs_release_block_group(block_group,
7736 								  delalloc);
7737 					block_group = used_block_group;
7738 				}
7739 				goto checks;
7740 			}
7741 
7742 			WARN_ON(last_ptr->block_group != used_block_group);
7743 release_cluster:
7744 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
7745 			 * set up a new clusters, so lets just skip it
7746 			 * and let the allocator find whatever block
7747 			 * it can find.  If we reach this point, we
7748 			 * will have tried the cluster allocator
7749 			 * plenty of times and not have found
7750 			 * anything, so we are likely way too
7751 			 * fragmented for the clustering stuff to find
7752 			 * anything.
7753 			 *
7754 			 * However, if the cluster is taken from the
7755 			 * current block group, release the cluster
7756 			 * first, so that we stand a better chance of
7757 			 * succeeding in the unclustered
7758 			 * allocation.  */
7759 			if (loop >= LOOP_NO_EMPTY_SIZE &&
7760 			    used_block_group != block_group) {
7761 				spin_unlock(&last_ptr->refill_lock);
7762 				btrfs_release_block_group(used_block_group,
7763 							  delalloc);
7764 				goto unclustered_alloc;
7765 			}
7766 
7767 			/*
7768 			 * this cluster didn't work out, free it and
7769 			 * start over
7770 			 */
7771 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
7772 
7773 			if (used_block_group != block_group)
7774 				btrfs_release_block_group(used_block_group,
7775 							  delalloc);
7776 refill_cluster:
7777 			if (loop >= LOOP_NO_EMPTY_SIZE) {
7778 				spin_unlock(&last_ptr->refill_lock);
7779 				goto unclustered_alloc;
7780 			}
7781 
7782 			aligned_cluster = max_t(unsigned long,
7783 						empty_cluster + empty_size,
7784 					      block_group->full_stripe_len);
7785 
7786 			/* allocate a cluster in this block group */
7787 			ret = btrfs_find_space_cluster(fs_info, block_group,
7788 						       last_ptr, search_start,
7789 						       num_bytes,
7790 						       aligned_cluster);
7791 			if (ret == 0) {
7792 				/*
7793 				 * now pull our allocation out of this
7794 				 * cluster
7795 				 */
7796 				offset = btrfs_alloc_from_cluster(block_group,
7797 							last_ptr,
7798 							num_bytes,
7799 							search_start,
7800 							&max_extent_size);
7801 				if (offset) {
7802 					/* we found one, proceed */
7803 					spin_unlock(&last_ptr->refill_lock);
7804 					trace_btrfs_reserve_extent_cluster(fs_info,
7805 						block_group, search_start,
7806 						num_bytes);
7807 					goto checks;
7808 				}
7809 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
7810 				   && !failed_cluster_refill) {
7811 				spin_unlock(&last_ptr->refill_lock);
7812 
7813 				failed_cluster_refill = true;
7814 				wait_block_group_cache_progress(block_group,
7815 				       num_bytes + empty_cluster + empty_size);
7816 				goto have_block_group;
7817 			}
7818 
7819 			/*
7820 			 * at this point we either didn't find a cluster
7821 			 * or we weren't able to allocate a block from our
7822 			 * cluster.  Free the cluster we've been trying
7823 			 * to use, and go to the next block group
7824 			 */
7825 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
7826 			spin_unlock(&last_ptr->refill_lock);
7827 			goto loop;
7828 		}
7829 
7830 unclustered_alloc:
7831 		/*
7832 		 * We are doing an unclustered alloc, set the fragmented flag so
7833 		 * we don't bother trying to setup a cluster again until we get
7834 		 * more space.
7835 		 */
7836 		if (unlikely(last_ptr)) {
7837 			spin_lock(&last_ptr->lock);
7838 			last_ptr->fragmented = 1;
7839 			spin_unlock(&last_ptr->lock);
7840 		}
7841 		if (cached) {
7842 			struct btrfs_free_space_ctl *ctl =
7843 				block_group->free_space_ctl;
7844 
7845 			spin_lock(&ctl->tree_lock);
7846 			if (ctl->free_space <
7847 			    num_bytes + empty_cluster + empty_size) {
7848 				if (ctl->free_space > max_extent_size)
7849 					max_extent_size = ctl->free_space;
7850 				spin_unlock(&ctl->tree_lock);
7851 				goto loop;
7852 			}
7853 			spin_unlock(&ctl->tree_lock);
7854 		}
7855 
7856 		offset = btrfs_find_space_for_alloc(block_group, search_start,
7857 						    num_bytes, empty_size,
7858 						    &max_extent_size);
7859 		/*
7860 		 * If we didn't find a chunk, and we haven't failed on this
7861 		 * block group before, and this block group is in the middle of
7862 		 * caching and we are ok with waiting, then go ahead and wait
7863 		 * for progress to be made, and set failed_alloc to true.
7864 		 *
7865 		 * If failed_alloc is true then we've already waited on this
7866 		 * block group once and should move on to the next block group.
7867 		 */
7868 		if (!offset && !failed_alloc && !cached &&
7869 		    loop > LOOP_CACHING_NOWAIT) {
7870 			wait_block_group_cache_progress(block_group,
7871 						num_bytes + empty_size);
7872 			failed_alloc = true;
7873 			goto have_block_group;
7874 		} else if (!offset) {
7875 			goto loop;
7876 		}
7877 checks:
7878 		search_start = ALIGN(offset, fs_info->stripesize);
7879 
7880 		/* move on to the next group */
7881 		if (search_start + num_bytes >
7882 		    block_group->key.objectid + block_group->key.offset) {
7883 			btrfs_add_free_space(block_group, offset, num_bytes);
7884 			goto loop;
7885 		}
7886 
7887 		if (offset < search_start)
7888 			btrfs_add_free_space(block_group, offset,
7889 					     search_start - offset);
7890 		BUG_ON(offset > search_start);
7891 
7892 		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7893 				num_bytes, delalloc);
7894 		if (ret == -EAGAIN) {
7895 			btrfs_add_free_space(block_group, offset, num_bytes);
7896 			goto loop;
7897 		}
7898 		btrfs_inc_block_group_reservations(block_group);
7899 
7900 		/* we are all good, lets return */
7901 		ins->objectid = search_start;
7902 		ins->offset = num_bytes;
7903 
7904 		trace_btrfs_reserve_extent(fs_info, block_group,
7905 					   search_start, num_bytes);
7906 		btrfs_release_block_group(block_group, delalloc);
7907 		break;
7908 loop:
7909 		failed_cluster_refill = false;
7910 		failed_alloc = false;
7911 		BUG_ON(index != get_block_group_index(block_group));
7912 		btrfs_release_block_group(block_group, delalloc);
7913 		cond_resched();
7914 	}
7915 	up_read(&space_info->groups_sem);
7916 
7917 	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7918 		&& !orig_have_caching_bg)
7919 		orig_have_caching_bg = true;
7920 
7921 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7922 		goto search;
7923 
7924 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7925 		goto search;
7926 
7927 	/*
7928 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7929 	 *			caching kthreads as we move along
7930 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7931 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7932 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7933 	 *			again
7934 	 */
7935 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7936 		index = 0;
7937 		if (loop == LOOP_CACHING_NOWAIT) {
7938 			/*
7939 			 * We want to skip the LOOP_CACHING_WAIT step if we
7940 			 * don't have any uncached bgs and we've already done a
7941 			 * full search through.
7942 			 */
7943 			if (orig_have_caching_bg || !full_search)
7944 				loop = LOOP_CACHING_WAIT;
7945 			else
7946 				loop = LOOP_ALLOC_CHUNK;
7947 		} else {
7948 			loop++;
7949 		}
7950 
7951 		if (loop == LOOP_ALLOC_CHUNK) {
7952 			struct btrfs_trans_handle *trans;
7953 			int exist = 0;
7954 
7955 			trans = current->journal_info;
7956 			if (trans)
7957 				exist = 1;
7958 			else
7959 				trans = btrfs_join_transaction(root);
7960 
7961 			if (IS_ERR(trans)) {
7962 				ret = PTR_ERR(trans);
7963 				goto out;
7964 			}
7965 
7966 			ret = do_chunk_alloc(trans, fs_info, flags,
7967 					     CHUNK_ALLOC_FORCE);
7968 
7969 			/*
7970 			 * If we can't allocate a new chunk we've already looped
7971 			 * through at least once, move on to the NO_EMPTY_SIZE
7972 			 * case.
7973 			 */
7974 			if (ret == -ENOSPC)
7975 				loop = LOOP_NO_EMPTY_SIZE;
7976 
7977 			/*
7978 			 * Do not bail out on ENOSPC since we
7979 			 * can do more things.
7980 			 */
7981 			if (ret < 0 && ret != -ENOSPC)
7982 				btrfs_abort_transaction(trans, ret);
7983 			else
7984 				ret = 0;
7985 			if (!exist)
7986 				btrfs_end_transaction(trans);
7987 			if (ret)
7988 				goto out;
7989 		}
7990 
7991 		if (loop == LOOP_NO_EMPTY_SIZE) {
7992 			/*
7993 			 * Don't loop again if we already have no empty_size and
7994 			 * no empty_cluster.
7995 			 */
7996 			if (empty_size == 0 &&
7997 			    empty_cluster == 0) {
7998 				ret = -ENOSPC;
7999 				goto out;
8000 			}
8001 			empty_size = 0;
8002 			empty_cluster = 0;
8003 		}
8004 
8005 		goto search;
8006 	} else if (!ins->objectid) {
8007 		ret = -ENOSPC;
8008 	} else if (ins->objectid) {
8009 		if (!use_cluster && last_ptr) {
8010 			spin_lock(&last_ptr->lock);
8011 			last_ptr->window_start = ins->objectid;
8012 			spin_unlock(&last_ptr->lock);
8013 		}
8014 		ret = 0;
8015 	}
8016 out:
8017 	if (ret == -ENOSPC) {
8018 		spin_lock(&space_info->lock);
8019 		space_info->max_extent_size = max_extent_size;
8020 		spin_unlock(&space_info->lock);
8021 		ins->offset = max_extent_size;
8022 	}
8023 	return ret;
8024 }
8025 
8026 static void dump_space_info(struct btrfs_fs_info *fs_info,
8027 			    struct btrfs_space_info *info, u64 bytes,
8028 			    int dump_block_groups)
8029 {
8030 	struct btrfs_block_group_cache *cache;
8031 	int index = 0;
8032 
8033 	spin_lock(&info->lock);
8034 	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
8035 		   info->flags,
8036 		   info->total_bytes - btrfs_space_info_used(info, true),
8037 		   info->full ? "" : "not ");
8038 	btrfs_info(fs_info,
8039 		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
8040 		info->total_bytes, info->bytes_used, info->bytes_pinned,
8041 		info->bytes_reserved, info->bytes_may_use,
8042 		info->bytes_readonly);
8043 	spin_unlock(&info->lock);
8044 
8045 	if (!dump_block_groups)
8046 		return;
8047 
8048 	down_read(&info->groups_sem);
8049 again:
8050 	list_for_each_entry(cache, &info->block_groups[index], list) {
8051 		spin_lock(&cache->lock);
8052 		btrfs_info(fs_info,
8053 			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
8054 			cache->key.objectid, cache->key.offset,
8055 			btrfs_block_group_used(&cache->item), cache->pinned,
8056 			cache->reserved, cache->ro ? "[readonly]" : "");
8057 		btrfs_dump_free_space(cache, bytes);
8058 		spin_unlock(&cache->lock);
8059 	}
8060 	if (++index < BTRFS_NR_RAID_TYPES)
8061 		goto again;
8062 	up_read(&info->groups_sem);
8063 }
8064 
8065 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
8066 			 u64 num_bytes, u64 min_alloc_size,
8067 			 u64 empty_size, u64 hint_byte,
8068 			 struct btrfs_key *ins, int is_data, int delalloc)
8069 {
8070 	struct btrfs_fs_info *fs_info = root->fs_info;
8071 	bool final_tried = num_bytes == min_alloc_size;
8072 	u64 flags;
8073 	int ret;
8074 
8075 	flags = get_alloc_profile_by_root(root, is_data);
8076 again:
8077 	WARN_ON(num_bytes < fs_info->sectorsize);
8078 	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
8079 			       hint_byte, ins, flags, delalloc);
8080 	if (!ret && !is_data) {
8081 		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
8082 	} else if (ret == -ENOSPC) {
8083 		if (!final_tried && ins->offset) {
8084 			num_bytes = min(num_bytes >> 1, ins->offset);
8085 			num_bytes = round_down(num_bytes,
8086 					       fs_info->sectorsize);
8087 			num_bytes = max(num_bytes, min_alloc_size);
8088 			ram_bytes = num_bytes;
8089 			if (num_bytes == min_alloc_size)
8090 				final_tried = true;
8091 			goto again;
8092 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8093 			struct btrfs_space_info *sinfo;
8094 
8095 			sinfo = __find_space_info(fs_info, flags);
8096 			btrfs_err(fs_info,
8097 				  "allocation failed flags %llu, wanted %llu",
8098 				  flags, num_bytes);
8099 			if (sinfo)
8100 				dump_space_info(fs_info, sinfo, num_bytes, 1);
8101 		}
8102 	}
8103 
8104 	return ret;
8105 }
8106 
8107 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8108 					u64 start, u64 len,
8109 					int pin, int delalloc)
8110 {
8111 	struct btrfs_block_group_cache *cache;
8112 	int ret = 0;
8113 
8114 	cache = btrfs_lookup_block_group(fs_info, start);
8115 	if (!cache) {
8116 		btrfs_err(fs_info, "Unable to find block group for %llu",
8117 			  start);
8118 		return -ENOSPC;
8119 	}
8120 
8121 	if (pin)
8122 		pin_down_extent(fs_info, cache, start, len, 1);
8123 	else {
8124 		if (btrfs_test_opt(fs_info, DISCARD))
8125 			ret = btrfs_discard_extent(fs_info, start, len, NULL);
8126 		btrfs_add_free_space(cache, start, len);
8127 		btrfs_free_reserved_bytes(cache, len, delalloc);
8128 		trace_btrfs_reserved_extent_free(fs_info, start, len);
8129 	}
8130 
8131 	btrfs_put_block_group(cache);
8132 	return ret;
8133 }
8134 
8135 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8136 			       u64 start, u64 len, int delalloc)
8137 {
8138 	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8139 }
8140 
8141 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8142 				       u64 start, u64 len)
8143 {
8144 	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8145 }
8146 
8147 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8148 				      struct btrfs_fs_info *fs_info,
8149 				      u64 parent, u64 root_objectid,
8150 				      u64 flags, u64 owner, u64 offset,
8151 				      struct btrfs_key *ins, int ref_mod)
8152 {
8153 	int ret;
8154 	struct btrfs_extent_item *extent_item;
8155 	struct btrfs_extent_inline_ref *iref;
8156 	struct btrfs_path *path;
8157 	struct extent_buffer *leaf;
8158 	int type;
8159 	u32 size;
8160 
8161 	if (parent > 0)
8162 		type = BTRFS_SHARED_DATA_REF_KEY;
8163 	else
8164 		type = BTRFS_EXTENT_DATA_REF_KEY;
8165 
8166 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8167 
8168 	path = btrfs_alloc_path();
8169 	if (!path)
8170 		return -ENOMEM;
8171 
8172 	path->leave_spinning = 1;
8173 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8174 				      ins, size);
8175 	if (ret) {
8176 		btrfs_free_path(path);
8177 		return ret;
8178 	}
8179 
8180 	leaf = path->nodes[0];
8181 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8182 				     struct btrfs_extent_item);
8183 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8184 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8185 	btrfs_set_extent_flags(leaf, extent_item,
8186 			       flags | BTRFS_EXTENT_FLAG_DATA);
8187 
8188 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8189 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
8190 	if (parent > 0) {
8191 		struct btrfs_shared_data_ref *ref;
8192 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
8193 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8194 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8195 	} else {
8196 		struct btrfs_extent_data_ref *ref;
8197 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8198 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8199 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8200 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8201 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8202 	}
8203 
8204 	btrfs_mark_buffer_dirty(path->nodes[0]);
8205 	btrfs_free_path(path);
8206 
8207 	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8208 					  ins->offset);
8209 	if (ret)
8210 		return ret;
8211 
8212 	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8213 	if (ret) { /* -ENOENT, logic error */
8214 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8215 			ins->objectid, ins->offset);
8216 		BUG();
8217 	}
8218 	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8219 	return ret;
8220 }
8221 
8222 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8223 				     struct btrfs_fs_info *fs_info,
8224 				     u64 parent, u64 root_objectid,
8225 				     u64 flags, struct btrfs_disk_key *key,
8226 				     int level, struct btrfs_key *ins)
8227 {
8228 	int ret;
8229 	struct btrfs_extent_item *extent_item;
8230 	struct btrfs_tree_block_info *block_info;
8231 	struct btrfs_extent_inline_ref *iref;
8232 	struct btrfs_path *path;
8233 	struct extent_buffer *leaf;
8234 	u32 size = sizeof(*extent_item) + sizeof(*iref);
8235 	u64 num_bytes = ins->offset;
8236 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8237 
8238 	if (!skinny_metadata)
8239 		size += sizeof(*block_info);
8240 
8241 	path = btrfs_alloc_path();
8242 	if (!path) {
8243 		btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
8244 						   fs_info->nodesize);
8245 		return -ENOMEM;
8246 	}
8247 
8248 	path->leave_spinning = 1;
8249 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8250 				      ins, size);
8251 	if (ret) {
8252 		btrfs_free_path(path);
8253 		btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
8254 						   fs_info->nodesize);
8255 		return ret;
8256 	}
8257 
8258 	leaf = path->nodes[0];
8259 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8260 				     struct btrfs_extent_item);
8261 	btrfs_set_extent_refs(leaf, extent_item, 1);
8262 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8263 	btrfs_set_extent_flags(leaf, extent_item,
8264 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8265 
8266 	if (skinny_metadata) {
8267 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8268 		num_bytes = fs_info->nodesize;
8269 	} else {
8270 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8271 		btrfs_set_tree_block_key(leaf, block_info, key);
8272 		btrfs_set_tree_block_level(leaf, block_info, level);
8273 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8274 	}
8275 
8276 	if (parent > 0) {
8277 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8278 		btrfs_set_extent_inline_ref_type(leaf, iref,
8279 						 BTRFS_SHARED_BLOCK_REF_KEY);
8280 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8281 	} else {
8282 		btrfs_set_extent_inline_ref_type(leaf, iref,
8283 						 BTRFS_TREE_BLOCK_REF_KEY);
8284 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8285 	}
8286 
8287 	btrfs_mark_buffer_dirty(leaf);
8288 	btrfs_free_path(path);
8289 
8290 	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8291 					  num_bytes);
8292 	if (ret)
8293 		return ret;
8294 
8295 	ret = update_block_group(trans, fs_info, ins->objectid,
8296 				 fs_info->nodesize, 1);
8297 	if (ret) { /* -ENOENT, logic error */
8298 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8299 			ins->objectid, ins->offset);
8300 		BUG();
8301 	}
8302 
8303 	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
8304 					  fs_info->nodesize);
8305 	return ret;
8306 }
8307 
8308 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8309 				     u64 root_objectid, u64 owner,
8310 				     u64 offset, u64 ram_bytes,
8311 				     struct btrfs_key *ins)
8312 {
8313 	struct btrfs_fs_info *fs_info = trans->fs_info;
8314 	int ret;
8315 
8316 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
8317 
8318 	ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
8319 					 ins->offset, 0, root_objectid, owner,
8320 					 offset, ram_bytes,
8321 					 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8322 	return ret;
8323 }
8324 
8325 /*
8326  * this is used by the tree logging recovery code.  It records that
8327  * an extent has been allocated and makes sure to clear the free
8328  * space cache bits as well
8329  */
8330 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8331 				   struct btrfs_fs_info *fs_info,
8332 				   u64 root_objectid, u64 owner, u64 offset,
8333 				   struct btrfs_key *ins)
8334 {
8335 	int ret;
8336 	struct btrfs_block_group_cache *block_group;
8337 	struct btrfs_space_info *space_info;
8338 
8339 	/*
8340 	 * Mixed block groups will exclude before processing the log so we only
8341 	 * need to do the exclude dance if this fs isn't mixed.
8342 	 */
8343 	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8344 		ret = __exclude_logged_extent(fs_info, ins->objectid,
8345 					      ins->offset);
8346 		if (ret)
8347 			return ret;
8348 	}
8349 
8350 	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8351 	if (!block_group)
8352 		return -EINVAL;
8353 
8354 	space_info = block_group->space_info;
8355 	spin_lock(&space_info->lock);
8356 	spin_lock(&block_group->lock);
8357 	space_info->bytes_reserved += ins->offset;
8358 	block_group->reserved += ins->offset;
8359 	spin_unlock(&block_group->lock);
8360 	spin_unlock(&space_info->lock);
8361 
8362 	ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
8363 					 0, owner, offset, ins, 1);
8364 	btrfs_put_block_group(block_group);
8365 	return ret;
8366 }
8367 
8368 static struct extent_buffer *
8369 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8370 		      u64 bytenr, int level)
8371 {
8372 	struct btrfs_fs_info *fs_info = root->fs_info;
8373 	struct extent_buffer *buf;
8374 
8375 	buf = btrfs_find_create_tree_block(fs_info, bytenr);
8376 	if (IS_ERR(buf))
8377 		return buf;
8378 
8379 	btrfs_set_header_generation(buf, trans->transid);
8380 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8381 	btrfs_tree_lock(buf);
8382 	clean_tree_block(fs_info, buf);
8383 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8384 
8385 	btrfs_set_lock_blocking(buf);
8386 	set_extent_buffer_uptodate(buf);
8387 
8388 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8389 		buf->log_index = root->log_transid % 2;
8390 		/*
8391 		 * we allow two log transactions at a time, use different
8392 		 * EXENT bit to differentiate dirty pages.
8393 		 */
8394 		if (buf->log_index == 0)
8395 			set_extent_dirty(&root->dirty_log_pages, buf->start,
8396 					buf->start + buf->len - 1, GFP_NOFS);
8397 		else
8398 			set_extent_new(&root->dirty_log_pages, buf->start,
8399 					buf->start + buf->len - 1);
8400 	} else {
8401 		buf->log_index = -1;
8402 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8403 			 buf->start + buf->len - 1, GFP_NOFS);
8404 	}
8405 	trans->dirty = true;
8406 	/* this returns a buffer locked for blocking */
8407 	return buf;
8408 }
8409 
8410 static struct btrfs_block_rsv *
8411 use_block_rsv(struct btrfs_trans_handle *trans,
8412 	      struct btrfs_root *root, u32 blocksize)
8413 {
8414 	struct btrfs_fs_info *fs_info = root->fs_info;
8415 	struct btrfs_block_rsv *block_rsv;
8416 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8417 	int ret;
8418 	bool global_updated = false;
8419 
8420 	block_rsv = get_block_rsv(trans, root);
8421 
8422 	if (unlikely(block_rsv->size == 0))
8423 		goto try_reserve;
8424 again:
8425 	ret = block_rsv_use_bytes(block_rsv, blocksize);
8426 	if (!ret)
8427 		return block_rsv;
8428 
8429 	if (block_rsv->failfast)
8430 		return ERR_PTR(ret);
8431 
8432 	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8433 		global_updated = true;
8434 		update_global_block_rsv(fs_info);
8435 		goto again;
8436 	}
8437 
8438 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8439 		static DEFINE_RATELIMIT_STATE(_rs,
8440 				DEFAULT_RATELIMIT_INTERVAL * 10,
8441 				/*DEFAULT_RATELIMIT_BURST*/ 1);
8442 		if (__ratelimit(&_rs))
8443 			WARN(1, KERN_DEBUG
8444 				"BTRFS: block rsv returned %d\n", ret);
8445 	}
8446 try_reserve:
8447 	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8448 				     BTRFS_RESERVE_NO_FLUSH);
8449 	if (!ret)
8450 		return block_rsv;
8451 	/*
8452 	 * If we couldn't reserve metadata bytes try and use some from
8453 	 * the global reserve if its space type is the same as the global
8454 	 * reservation.
8455 	 */
8456 	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8457 	    block_rsv->space_info == global_rsv->space_info) {
8458 		ret = block_rsv_use_bytes(global_rsv, blocksize);
8459 		if (!ret)
8460 			return global_rsv;
8461 	}
8462 	return ERR_PTR(ret);
8463 }
8464 
8465 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8466 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
8467 {
8468 	block_rsv_add_bytes(block_rsv, blocksize, 0);
8469 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
8470 }
8471 
8472 /*
8473  * finds a free extent and does all the dirty work required for allocation
8474  * returns the tree buffer or an ERR_PTR on error.
8475  */
8476 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8477 					     struct btrfs_root *root,
8478 					     u64 parent, u64 root_objectid,
8479 					     const struct btrfs_disk_key *key,
8480 					     int level, u64 hint,
8481 					     u64 empty_size)
8482 {
8483 	struct btrfs_fs_info *fs_info = root->fs_info;
8484 	struct btrfs_key ins;
8485 	struct btrfs_block_rsv *block_rsv;
8486 	struct extent_buffer *buf;
8487 	struct btrfs_delayed_extent_op *extent_op;
8488 	u64 flags = 0;
8489 	int ret;
8490 	u32 blocksize = fs_info->nodesize;
8491 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8492 
8493 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8494 	if (btrfs_is_testing(fs_info)) {
8495 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8496 					    level);
8497 		if (!IS_ERR(buf))
8498 			root->alloc_bytenr += blocksize;
8499 		return buf;
8500 	}
8501 #endif
8502 
8503 	block_rsv = use_block_rsv(trans, root, blocksize);
8504 	if (IS_ERR(block_rsv))
8505 		return ERR_CAST(block_rsv);
8506 
8507 	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8508 				   empty_size, hint, &ins, 0, 0);
8509 	if (ret)
8510 		goto out_unuse;
8511 
8512 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8513 	if (IS_ERR(buf)) {
8514 		ret = PTR_ERR(buf);
8515 		goto out_free_reserved;
8516 	}
8517 
8518 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8519 		if (parent == 0)
8520 			parent = ins.objectid;
8521 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8522 	} else
8523 		BUG_ON(parent > 0);
8524 
8525 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8526 		extent_op = btrfs_alloc_delayed_extent_op();
8527 		if (!extent_op) {
8528 			ret = -ENOMEM;
8529 			goto out_free_buf;
8530 		}
8531 		if (key)
8532 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
8533 		else
8534 			memset(&extent_op->key, 0, sizeof(extent_op->key));
8535 		extent_op->flags_to_set = flags;
8536 		extent_op->update_key = skinny_metadata ? false : true;
8537 		extent_op->update_flags = true;
8538 		extent_op->is_data = false;
8539 		extent_op->level = level;
8540 
8541 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
8542 						 ins.offset, parent,
8543 						 root_objectid, level,
8544 						 BTRFS_ADD_DELAYED_EXTENT,
8545 						 extent_op, NULL, NULL);
8546 		if (ret)
8547 			goto out_free_delayed;
8548 	}
8549 	return buf;
8550 
8551 out_free_delayed:
8552 	btrfs_free_delayed_extent_op(extent_op);
8553 out_free_buf:
8554 	free_extent_buffer(buf);
8555 out_free_reserved:
8556 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8557 out_unuse:
8558 	unuse_block_rsv(fs_info, block_rsv, blocksize);
8559 	return ERR_PTR(ret);
8560 }
8561 
/*
 * State carried through the tree walk done by reada_walk_down(),
 * walk_down_proc() and do_walk_down().
 */
struct walk_control {
	/* per-level reference count, filled by btrfs_lookup_extent_info() */
	u64 refs[BTRFS_MAX_LEVEL];
	/* per-level extent flags, filled by btrfs_lookup_extent_info() */
	u64 flags[BTRFS_MAX_LEVEL];
	/* node keys comparing below this are skipped for backref updates */
	struct btrfs_key update_progress;
	/* current stage: DROP_REFERENCE or UPDATE_BACKREF */
	int stage;
	/* tree level currently being processed */
	int level;
	/* level recorded when the walk switches to UPDATE_BACKREF */
	int shared_level;
	/* non-zero: shared subtrees may need their back refs updated */
	int update_ref;
	/* non-zero: walk_down_proc() keeps the tree lock in DROP_REFERENCE */
	int keep_locks;
	/* slot at which the previous readahead pass stopped */
	int reada_slot;
	/* number of blocks to read ahead; adapted on every pass */
	int reada_count;
	/* NOTE(review): presumably set when walking for relocation -- confirm */
	int for_reloc;
};
8575 
/* Values of walk_control::stage. */
#define DROP_REFERENCE	1	/* dropping references to tree blocks */
#define UPDATE_BACKREF	2	/* updating back refs of a shared subtree */
8578 
8579 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8580 				     struct btrfs_root *root,
8581 				     struct walk_control *wc,
8582 				     struct btrfs_path *path)
8583 {
8584 	struct btrfs_fs_info *fs_info = root->fs_info;
8585 	u64 bytenr;
8586 	u64 generation;
8587 	u64 refs;
8588 	u64 flags;
8589 	u32 nritems;
8590 	struct btrfs_key key;
8591 	struct extent_buffer *eb;
8592 	int ret;
8593 	int slot;
8594 	int nread = 0;
8595 
8596 	if (path->slots[wc->level] < wc->reada_slot) {
8597 		wc->reada_count = wc->reada_count * 2 / 3;
8598 		wc->reada_count = max(wc->reada_count, 2);
8599 	} else {
8600 		wc->reada_count = wc->reada_count * 3 / 2;
8601 		wc->reada_count = min_t(int, wc->reada_count,
8602 					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8603 	}
8604 
8605 	eb = path->nodes[wc->level];
8606 	nritems = btrfs_header_nritems(eb);
8607 
8608 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8609 		if (nread >= wc->reada_count)
8610 			break;
8611 
8612 		cond_resched();
8613 		bytenr = btrfs_node_blockptr(eb, slot);
8614 		generation = btrfs_node_ptr_generation(eb, slot);
8615 
8616 		if (slot == path->slots[wc->level])
8617 			goto reada;
8618 
8619 		if (wc->stage == UPDATE_BACKREF &&
8620 		    generation <= root->root_key.offset)
8621 			continue;
8622 
8623 		/* We don't lock the tree block, it's OK to be racy here */
8624 		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8625 					       wc->level - 1, 1, &refs,
8626 					       &flags);
8627 		/* We don't care about errors in readahead. */
8628 		if (ret < 0)
8629 			continue;
8630 		BUG_ON(refs == 0);
8631 
8632 		if (wc->stage == DROP_REFERENCE) {
8633 			if (refs == 1)
8634 				goto reada;
8635 
8636 			if (wc->level == 1 &&
8637 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8638 				continue;
8639 			if (!wc->update_ref ||
8640 			    generation <= root->root_key.offset)
8641 				continue;
8642 			btrfs_node_key_to_cpu(eb, &key, slot);
8643 			ret = btrfs_comp_cpu_keys(&key,
8644 						  &wc->update_progress);
8645 			if (ret < 0)
8646 				continue;
8647 		} else {
8648 			if (wc->level == 1 &&
8649 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8650 				continue;
8651 		}
8652 reada:
8653 		readahead_tree_block(fs_info, bytenr);
8654 		nread++;
8655 	}
8656 	wc->reada_slot = slot;
8657 }
8658 
8659 /*
8660  * helper to process tree block while walking down the tree.
8661  *
8662  * when wc->stage == UPDATE_BACKREF, this function updates
8663  * back refs for pointers in the block.
8664  *
8665  * NOTE: return value 1 means we should stop walking down.
8666  */
8667 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8668 				   struct btrfs_root *root,
8669 				   struct btrfs_path *path,
8670 				   struct walk_control *wc, int lookup_info)
8671 {
8672 	struct btrfs_fs_info *fs_info = root->fs_info;
8673 	int level = wc->level;
8674 	struct extent_buffer *eb = path->nodes[level];
8675 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8676 	int ret;
8677 
8678 	if (wc->stage == UPDATE_BACKREF &&
8679 	    btrfs_header_owner(eb) != root->root_key.objectid)
8680 		return 1;
8681 
8682 	/*
8683 	 * when reference count of tree block is 1, it won't increase
8684 	 * again. once full backref flag is set, we never clear it.
8685 	 */
8686 	if (lookup_info &&
8687 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8688 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8689 		BUG_ON(!path->locks[level]);
8690 		ret = btrfs_lookup_extent_info(trans, fs_info,
8691 					       eb->start, level, 1,
8692 					       &wc->refs[level],
8693 					       &wc->flags[level]);
8694 		BUG_ON(ret == -ENOMEM);
8695 		if (ret)
8696 			return ret;
8697 		BUG_ON(wc->refs[level] == 0);
8698 	}
8699 
8700 	if (wc->stage == DROP_REFERENCE) {
8701 		if (wc->refs[level] > 1)
8702 			return 1;
8703 
8704 		if (path->locks[level] && !wc->keep_locks) {
8705 			btrfs_tree_unlock_rw(eb, path->locks[level]);
8706 			path->locks[level] = 0;
8707 		}
8708 		return 0;
8709 	}
8710 
8711 	/* wc->stage == UPDATE_BACKREF */
8712 	if (!(wc->flags[level] & flag)) {
8713 		BUG_ON(!path->locks[level]);
8714 		ret = btrfs_inc_ref(trans, root, eb, 1);
8715 		BUG_ON(ret); /* -ENOMEM */
8716 		ret = btrfs_dec_ref(trans, root, eb, 0);
8717 		BUG_ON(ret); /* -ENOMEM */
8718 		ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8719 						  eb->len, flag,
8720 						  btrfs_header_level(eb), 0);
8721 		BUG_ON(ret); /* -ENOMEM */
8722 		wc->flags[level] |= flag;
8723 	}
8724 
8725 	/*
8726 	 * the block is shared by multiple trees, so it's not good to
8727 	 * keep the tree lock
8728 	 */
8729 	if (path->locks[level] && level > 0) {
8730 		btrfs_tree_unlock_rw(eb, path->locks[level]);
8731 		path->locks[level] = 0;
8732 	}
8733 	return 0;
8734 }
8735 
/*
 * helper to process tree block pointer.
 *
 * The pointer examined is the one at path->slots[wc->level] inside
 * path->nodes[wc->level]; the block it points to is referred to as the
 * "child" below.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * On a successful descent (return 0) the child is stored in
 * path->nodes[level - 1] with slot 0, recorded as write-locked in
 * path->locks[level - 1], and wc->level is decremented.
 *
 * *lookup_info is set to 0 once the child's refs/flags have been looked
 * up here, and to 1 whenever the child is skipped or had to be re-read.
 *
 * NOTE: return value 1 means we should stop walking down.
 * A negative return value is an error code.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	u32 blocksize;
	struct btrfs_key key;
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	blocksize = fs_info->nodesize;

	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr);
		if (IS_ERR(next))
			return PTR_ERR(next);

		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
					       level - 1);
		/* buffer was not cached; allows readahead below at level 1 */
		reada = 1;
	}
	btrfs_tree_lock(next);
	btrfs_set_lock_blocking(next);

	/* from here on 'next' is locked; error exits go through out_unlock */
	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			/* shared child: qgroup tracing is done if we skip it */
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			/* keys ordered before update_progress are skipped */
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	/* drop a stale buffer so that it is read again below */
	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, generation);
		/* 'next' is not locked on these two error returns */
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
		btrfs_set_lock_blocking(next);
	}

	level--;
	ASSERT(level == btrfs_header_level(next));
	/* same check again as a runtime error path next to the ASSERT */
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	/* the child is not descended into; forget its cached refs/flags */
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		if (need_account) {
			/*
			 * a tracing failure is only logged; ret is
			 * overwritten by btrfs_free_extent() below
			 */
			ret = btrfs_qgroup_trace_subtree(trans, root, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}
		ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
					parent, root->root_key.objectid,
					level - 1, 0);
		if (ret)
			goto out_unlock;
	}

	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}
8913 
/*
 * helper to process tree block while walking up the tree.
 *
 * The block processed is path->nodes[wc->level].
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * Returns 0 when the block was handled (wc->refs/flags for this level
 * are cleared), a negative errno if looking up the extent info failed.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		/* still below the shared block: nothing to do at this level */
		if (level < wc->shared_level)
			goto out;

		/* record the next key; a positive result turns update_ref off */
		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			/* last argument selects full-backref handling */
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			/* a qgroup tracing failure is logged, not returned */
			ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
					     ret);
			}
		}
		/* make block locked assertion in clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		}
		clean_tree_block(fs_info, eb);
	}

	/*
	 * pick the parent bytenr passed to btrfs_free_tree_block(): it stays
	 * 0 unless the relevant block carries the FULL_BACKREF flag
	 */
	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(eb));
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(path->nodes[level + 1]));
	}

	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;
}
9026 
9027 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9028 				   struct btrfs_root *root,
9029 				   struct btrfs_path *path,
9030 				   struct walk_control *wc)
9031 {
9032 	int level = wc->level;
9033 	int lookup_info = 1;
9034 	int ret;
9035 
9036 	while (level >= 0) {
9037 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
9038 		if (ret > 0)
9039 			break;
9040 
9041 		if (level == 0)
9042 			break;
9043 
9044 		if (path->slots[level] >=
9045 		    btrfs_header_nritems(path->nodes[level]))
9046 			break;
9047 
9048 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
9049 		if (ret > 0) {
9050 			path->slots[level]++;
9051 			continue;
9052 		} else if (ret < 0)
9053 			return ret;
9054 		level = wc->level;
9055 	}
9056 	return 0;
9057 }
9058 
9059 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9060 				 struct btrfs_root *root,
9061 				 struct btrfs_path *path,
9062 				 struct walk_control *wc, int max_level)
9063 {
9064 	int level = wc->level;
9065 	int ret;
9066 
9067 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9068 	while (level < max_level && path->nodes[level]) {
9069 		wc->level = level;
9070 		if (path->slots[level] + 1 <
9071 		    btrfs_header_nritems(path->nodes[level])) {
9072 			path->slots[level]++;
9073 			return 0;
9074 		} else {
9075 			ret = walk_up_proc(trans, root, path, wc);
9076 			if (ret > 0)
9077 				return 0;
9078 
9079 			if (path->locks[level]) {
9080 				btrfs_tree_unlock_rw(path->nodes[level],
9081 						     path->locks[level]);
9082 				path->locks[level] = 0;
9083 			}
9084 			free_extent_buffer(path->nodes[level]);
9085 			path->nodes[level] = NULL;
9086 			level++;
9087 		}
9088 	}
9089 	return 1;
9090 }
9091 
9092 /*
9093  * drop a subvolume tree.
9094  *
9095  * this function traverses the tree freeing any blocks that only
9096  * referenced by the tree.
9097  *
9098  * when a shared tree block is found. this function decreases its
9099  * reference count by one. if update_ref is true, this function
9100  * also make sure backrefs for the shared block and all lower level
9101  * blocks are properly updated.
9102  *
9103  * If called with for_reloc == 0, may exit early with -EAGAIN
9104  */
9105 int btrfs_drop_snapshot(struct btrfs_root *root,
9106 			 struct btrfs_block_rsv *block_rsv, int update_ref,
9107 			 int for_reloc)
9108 {
9109 	struct btrfs_fs_info *fs_info = root->fs_info;
9110 	struct btrfs_path *path;
9111 	struct btrfs_trans_handle *trans;
9112 	struct btrfs_root *tree_root = fs_info->tree_root;
9113 	struct btrfs_root_item *root_item = &root->root_item;
9114 	struct walk_control *wc;
9115 	struct btrfs_key key;
9116 	int err = 0;
9117 	int ret;
9118 	int level;
9119 	bool root_dropped = false;
9120 
9121 	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
9122 
9123 	path = btrfs_alloc_path();
9124 	if (!path) {
9125 		err = -ENOMEM;
9126 		goto out;
9127 	}
9128 
9129 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9130 	if (!wc) {
9131 		btrfs_free_path(path);
9132 		err = -ENOMEM;
9133 		goto out;
9134 	}
9135 
9136 	trans = btrfs_start_transaction(tree_root, 0);
9137 	if (IS_ERR(trans)) {
9138 		err = PTR_ERR(trans);
9139 		goto out_free;
9140 	}
9141 
9142 	if (block_rsv)
9143 		trans->block_rsv = block_rsv;
9144 
9145 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9146 		level = btrfs_header_level(root->node);
9147 		path->nodes[level] = btrfs_lock_root_node(root);
9148 		btrfs_set_lock_blocking(path->nodes[level]);
9149 		path->slots[level] = 0;
9150 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9151 		memset(&wc->update_progress, 0,
9152 		       sizeof(wc->update_progress));
9153 	} else {
9154 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9155 		memcpy(&wc->update_progress, &key,
9156 		       sizeof(wc->update_progress));
9157 
9158 		level = root_item->drop_level;
9159 		BUG_ON(level == 0);
9160 		path->lowest_level = level;
9161 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9162 		path->lowest_level = 0;
9163 		if (ret < 0) {
9164 			err = ret;
9165 			goto out_end_trans;
9166 		}
9167 		WARN_ON(ret > 0);
9168 
9169 		/*
9170 		 * unlock our path, this is safe because only this
9171 		 * function is allowed to delete this snapshot
9172 		 */
9173 		btrfs_unlock_up_safe(path, 0);
9174 
9175 		level = btrfs_header_level(root->node);
9176 		while (1) {
9177 			btrfs_tree_lock(path->nodes[level]);
9178 			btrfs_set_lock_blocking(path->nodes[level]);
9179 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9180 
9181 			ret = btrfs_lookup_extent_info(trans, fs_info,
9182 						path->nodes[level]->start,
9183 						level, 1, &wc->refs[level],
9184 						&wc->flags[level]);
9185 			if (ret < 0) {
9186 				err = ret;
9187 				goto out_end_trans;
9188 			}
9189 			BUG_ON(wc->refs[level] == 0);
9190 
9191 			if (level == root_item->drop_level)
9192 				break;
9193 
9194 			btrfs_tree_unlock(path->nodes[level]);
9195 			path->locks[level] = 0;
9196 			WARN_ON(wc->refs[level] != 1);
9197 			level--;
9198 		}
9199 	}
9200 
9201 	wc->level = level;
9202 	wc->shared_level = -1;
9203 	wc->stage = DROP_REFERENCE;
9204 	wc->update_ref = update_ref;
9205 	wc->keep_locks = 0;
9206 	wc->for_reloc = for_reloc;
9207 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9208 
9209 	while (1) {
9210 
9211 		ret = walk_down_tree(trans, root, path, wc);
9212 		if (ret < 0) {
9213 			err = ret;
9214 			break;
9215 		}
9216 
9217 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9218 		if (ret < 0) {
9219 			err = ret;
9220 			break;
9221 		}
9222 
9223 		if (ret > 0) {
9224 			BUG_ON(wc->stage != DROP_REFERENCE);
9225 			break;
9226 		}
9227 
9228 		if (wc->stage == DROP_REFERENCE) {
9229 			level = wc->level;
9230 			btrfs_node_key(path->nodes[level],
9231 				       &root_item->drop_progress,
9232 				       path->slots[level]);
9233 			root_item->drop_level = level;
9234 		}
9235 
9236 		BUG_ON(wc->level == 0);
9237 		if (btrfs_should_end_transaction(trans) ||
9238 		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9239 			ret = btrfs_update_root(trans, tree_root,
9240 						&root->root_key,
9241 						root_item);
9242 			if (ret) {
9243 				btrfs_abort_transaction(trans, ret);
9244 				err = ret;
9245 				goto out_end_trans;
9246 			}
9247 
9248 			btrfs_end_transaction_throttle(trans);
9249 			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9250 				btrfs_debug(fs_info,
9251 					    "drop snapshot early exit");
9252 				err = -EAGAIN;
9253 				goto out_free;
9254 			}
9255 
9256 			trans = btrfs_start_transaction(tree_root, 0);
9257 			if (IS_ERR(trans)) {
9258 				err = PTR_ERR(trans);
9259 				goto out_free;
9260 			}
9261 			if (block_rsv)
9262 				trans->block_rsv = block_rsv;
9263 		}
9264 	}
9265 	btrfs_release_path(path);
9266 	if (err)
9267 		goto out_end_trans;
9268 
9269 	ret = btrfs_del_root(trans, fs_info, &root->root_key);
9270 	if (ret) {
9271 		btrfs_abort_transaction(trans, ret);
9272 		goto out_end_trans;
9273 	}
9274 
9275 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9276 		ret = btrfs_find_root(tree_root, &root->root_key, path,
9277 				      NULL, NULL);
9278 		if (ret < 0) {
9279 			btrfs_abort_transaction(trans, ret);
9280 			err = ret;
9281 			goto out_end_trans;
9282 		} else if (ret > 0) {
9283 			/* if we fail to delete the orphan item this time
9284 			 * around, it'll get picked up the next time.
9285 			 *
9286 			 * The most common failure here is just -ENOENT.
9287 			 */
9288 			btrfs_del_orphan_item(trans, tree_root,
9289 					      root->root_key.objectid);
9290 		}
9291 	}
9292 
9293 	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9294 		btrfs_add_dropped_root(trans, root);
9295 	} else {
9296 		free_extent_buffer(root->node);
9297 		free_extent_buffer(root->commit_root);
9298 		btrfs_put_fs_root(root);
9299 	}
9300 	root_dropped = true;
9301 out_end_trans:
9302 	btrfs_end_transaction_throttle(trans);
9303 out_free:
9304 	kfree(wc);
9305 	btrfs_free_path(path);
9306 out:
9307 	/*
9308 	 * So if we need to stop dropping the snapshot for whatever reason we
9309 	 * need to make sure to add it back to the dead root list so that we
9310 	 * keep trying to do the work later.  This also cleans up roots if we
9311 	 * don't have it in the radix (like when we recover after a power fail
9312 	 * or unmount) so we don't leak memory.
9313 	 */
9314 	if (!for_reloc && root_dropped == false)
9315 		btrfs_add_dead_root(root);
9316 	if (err && err != -EAGAIN)
9317 		btrfs_handle_fs_error(fs_info, err, NULL);
9318 	return err;
9319 }
9320 
9321 /*
9322  * drop subtree rooted at tree block 'node'.
9323  *
9324  * NOTE: this function will unlock and release tree block 'node'
9325  * only used by relocation code
9326  */
9327 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9328 			struct btrfs_root *root,
9329 			struct extent_buffer *node,
9330 			struct extent_buffer *parent)
9331 {
9332 	struct btrfs_fs_info *fs_info = root->fs_info;
9333 	struct btrfs_path *path;
9334 	struct walk_control *wc;
9335 	int level;
9336 	int parent_level;
9337 	int ret = 0;
9338 	int wret;
9339 
9340 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9341 
9342 	path = btrfs_alloc_path();
9343 	if (!path)
9344 		return -ENOMEM;
9345 
9346 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9347 	if (!wc) {
9348 		btrfs_free_path(path);
9349 		return -ENOMEM;
9350 	}
9351 
9352 	btrfs_assert_tree_locked(parent);
9353 	parent_level = btrfs_header_level(parent);
9354 	extent_buffer_get(parent);
9355 	path->nodes[parent_level] = parent;
9356 	path->slots[parent_level] = btrfs_header_nritems(parent);
9357 
9358 	btrfs_assert_tree_locked(node);
9359 	level = btrfs_header_level(node);
9360 	path->nodes[level] = node;
9361 	path->slots[level] = 0;
9362 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9363 
9364 	wc->refs[parent_level] = 1;
9365 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9366 	wc->level = level;
9367 	wc->shared_level = -1;
9368 	wc->stage = DROP_REFERENCE;
9369 	wc->update_ref = 0;
9370 	wc->keep_locks = 1;
9371 	wc->for_reloc = 1;
9372 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9373 
9374 	while (1) {
9375 		wret = walk_down_tree(trans, root, path, wc);
9376 		if (wret < 0) {
9377 			ret = wret;
9378 			break;
9379 		}
9380 
9381 		wret = walk_up_tree(trans, root, path, wc, parent_level);
9382 		if (wret < 0)
9383 			ret = wret;
9384 		if (wret != 0)
9385 			break;
9386 	}
9387 
9388 	kfree(wc);
9389 	btrfs_free_path(path);
9390 	return ret;
9391 }
9392 
9393 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9394 {
9395 	u64 num_devices;
9396 	u64 stripped;
9397 
9398 	/*
9399 	 * if restripe for this chunk_type is on pick target profile and
9400 	 * return, otherwise do the usual balance
9401 	 */
9402 	stripped = get_restripe_target(fs_info, flags);
9403 	if (stripped)
9404 		return extended_to_chunk(stripped);
9405 
9406 	num_devices = fs_info->fs_devices->rw_devices;
9407 
9408 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
9409 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9410 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9411 
9412 	if (num_devices == 1) {
9413 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9414 		stripped = flags & ~stripped;
9415 
9416 		/* turn raid0 into single device chunks */
9417 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
9418 			return stripped;
9419 
9420 		/* turn mirroring into duplication */
9421 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9422 			     BTRFS_BLOCK_GROUP_RAID10))
9423 			return stripped | BTRFS_BLOCK_GROUP_DUP;
9424 	} else {
9425 		/* they already had raid on here, just return */
9426 		if (flags & stripped)
9427 			return flags;
9428 
9429 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9430 		stripped = flags & ~stripped;
9431 
9432 		/* switch duplicated blocks with raid1 */
9433 		if (flags & BTRFS_BLOCK_GROUP_DUP)
9434 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
9435 
9436 		/* this is drive concat, leave it alone */
9437 	}
9438 
9439 	return flags;
9440 }
9441 
9442 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9443 {
9444 	struct btrfs_space_info *sinfo = cache->space_info;
9445 	u64 num_bytes;
9446 	u64 min_allocable_bytes;
9447 	int ret = -ENOSPC;
9448 
9449 	/*
9450 	 * We need some metadata space and system metadata space for
9451 	 * allocating chunks in some corner cases until we force to set
9452 	 * it to be readonly.
9453 	 */
9454 	if ((sinfo->flags &
9455 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9456 	    !force)
9457 		min_allocable_bytes = SZ_1M;
9458 	else
9459 		min_allocable_bytes = 0;
9460 
9461 	spin_lock(&sinfo->lock);
9462 	spin_lock(&cache->lock);
9463 
9464 	if (cache->ro) {
9465 		cache->ro++;
9466 		ret = 0;
9467 		goto out;
9468 	}
9469 
9470 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9471 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
9472 
9473 	if (btrfs_space_info_used(sinfo, true) + num_bytes +
9474 	    min_allocable_bytes <= sinfo->total_bytes) {
9475 		sinfo->bytes_readonly += num_bytes;
9476 		cache->ro++;
9477 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9478 		ret = 0;
9479 	}
9480 out:
9481 	spin_unlock(&cache->lock);
9482 	spin_unlock(&sinfo->lock);
9483 	return ret;
9484 }
9485 
9486 int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
9487 			     struct btrfs_block_group_cache *cache)
9488 
9489 {
9490 	struct btrfs_trans_handle *trans;
9491 	u64 alloc_flags;
9492 	int ret;
9493 
9494 again:
9495 	trans = btrfs_join_transaction(fs_info->extent_root);
9496 	if (IS_ERR(trans))
9497 		return PTR_ERR(trans);
9498 
9499 	/*
9500 	 * we're not allowed to set block groups readonly after the dirty
9501 	 * block groups cache has started writing.  If it already started,
9502 	 * back off and let this transaction commit
9503 	 */
9504 	mutex_lock(&fs_info->ro_block_group_mutex);
9505 	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9506 		u64 transid = trans->transid;
9507 
9508 		mutex_unlock(&fs_info->ro_block_group_mutex);
9509 		btrfs_end_transaction(trans);
9510 
9511 		ret = btrfs_wait_for_commit(fs_info, transid);
9512 		if (ret)
9513 			return ret;
9514 		goto again;
9515 	}
9516 
9517 	/*
9518 	 * if we are changing raid levels, try to allocate a corresponding
9519 	 * block group with the new raid level.
9520 	 */
9521 	alloc_flags = update_block_group_flags(fs_info, cache->flags);
9522 	if (alloc_flags != cache->flags) {
9523 		ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9524 				     CHUNK_ALLOC_FORCE);
9525 		/*
9526 		 * ENOSPC is allowed here, we may have enough space
9527 		 * already allocated at the new raid level to
9528 		 * carry on
9529 		 */
9530 		if (ret == -ENOSPC)
9531 			ret = 0;
9532 		if (ret < 0)
9533 			goto out;
9534 	}
9535 
9536 	ret = inc_block_group_ro(cache, 0);
9537 	if (!ret)
9538 		goto out;
9539 	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9540 	ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9541 			     CHUNK_ALLOC_FORCE);
9542 	if (ret < 0)
9543 		goto out;
9544 	ret = inc_block_group_ro(cache, 0);
9545 out:
9546 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9547 		alloc_flags = update_block_group_flags(fs_info, cache->flags);
9548 		mutex_lock(&fs_info->chunk_mutex);
9549 		check_system_chunk(trans, fs_info, alloc_flags);
9550 		mutex_unlock(&fs_info->chunk_mutex);
9551 	}
9552 	mutex_unlock(&fs_info->ro_block_group_mutex);
9553 
9554 	btrfs_end_transaction(trans);
9555 	return ret;
9556 }
9557 
9558 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9559 			    struct btrfs_fs_info *fs_info, u64 type)
9560 {
9561 	u64 alloc_flags = get_alloc_profile(fs_info, type);
9562 
9563 	return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
9564 }
9565 
9566 /*
9567  * helper to account the unused space of all the readonly block group in the
9568  * space_info. takes mirrors into account.
9569  */
9570 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9571 {
9572 	struct btrfs_block_group_cache *block_group;
9573 	u64 free_bytes = 0;
9574 	int factor;
9575 
9576 	/* It's df, we don't care if it's racy */
9577 	if (list_empty(&sinfo->ro_bgs))
9578 		return 0;
9579 
9580 	spin_lock(&sinfo->lock);
9581 	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9582 		spin_lock(&block_group->lock);
9583 
9584 		if (!block_group->ro) {
9585 			spin_unlock(&block_group->lock);
9586 			continue;
9587 		}
9588 
9589 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9590 					  BTRFS_BLOCK_GROUP_RAID10 |
9591 					  BTRFS_BLOCK_GROUP_DUP))
9592 			factor = 2;
9593 		else
9594 			factor = 1;
9595 
9596 		free_bytes += (block_group->key.offset -
9597 			       btrfs_block_group_used(&block_group->item)) *
9598 			       factor;
9599 
9600 		spin_unlock(&block_group->lock);
9601 	}
9602 	spin_unlock(&sinfo->lock);
9603 
9604 	return free_bytes;
9605 }
9606 
9607 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9608 {
9609 	struct btrfs_space_info *sinfo = cache->space_info;
9610 	u64 num_bytes;
9611 
9612 	BUG_ON(!cache->ro);
9613 
9614 	spin_lock(&sinfo->lock);
9615 	spin_lock(&cache->lock);
9616 	if (!--cache->ro) {
9617 		num_bytes = cache->key.offset - cache->reserved -
9618 			    cache->pinned - cache->bytes_super -
9619 			    btrfs_block_group_used(&cache->item);
9620 		sinfo->bytes_readonly -= num_bytes;
9621 		list_del_init(&cache->ro_list);
9622 	}
9623 	spin_unlock(&cache->lock);
9624 	spin_unlock(&sinfo->lock);
9625 }
9626 
9627 /*
9628  * checks to see if its even possible to relocate this block group.
9629  *
9630  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9631  * ok to go ahead and try.
9632  */
9633 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9634 {
9635 	struct btrfs_root *root = fs_info->extent_root;
9636 	struct btrfs_block_group_cache *block_group;
9637 	struct btrfs_space_info *space_info;
9638 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9639 	struct btrfs_device *device;
9640 	struct btrfs_trans_handle *trans;
9641 	u64 min_free;
9642 	u64 dev_min = 1;
9643 	u64 dev_nr = 0;
9644 	u64 target;
9645 	int debug;
9646 	int index;
9647 	int full = 0;
9648 	int ret = 0;
9649 
9650 	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9651 
9652 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
9653 
9654 	/* odd, couldn't find the block group, leave it alone */
9655 	if (!block_group) {
9656 		if (debug)
9657 			btrfs_warn(fs_info,
9658 				   "can't find block group for bytenr %llu",
9659 				   bytenr);
9660 		return -1;
9661 	}
9662 
9663 	min_free = btrfs_block_group_used(&block_group->item);
9664 
9665 	/* no bytes used, we're good */
9666 	if (!min_free)
9667 		goto out;
9668 
9669 	space_info = block_group->space_info;
9670 	spin_lock(&space_info->lock);
9671 
9672 	full = space_info->full;
9673 
9674 	/*
9675 	 * if this is the last block group we have in this space, we can't
9676 	 * relocate it unless we're able to allocate a new chunk below.
9677 	 *
9678 	 * Otherwise, we need to make sure we have room in the space to handle
9679 	 * all of the extents from this block group.  If we can, we're good
9680 	 */
9681 	if ((space_info->total_bytes != block_group->key.offset) &&
9682 	    (btrfs_space_info_used(space_info, false) + min_free <
9683 	     space_info->total_bytes)) {
9684 		spin_unlock(&space_info->lock);
9685 		goto out;
9686 	}
9687 	spin_unlock(&space_info->lock);
9688 
9689 	/*
9690 	 * ok we don't have enough space, but maybe we have free space on our
9691 	 * devices to allocate new chunks for relocation, so loop through our
9692 	 * alloc devices and guess if we have enough space.  if this block
9693 	 * group is going to be restriped, run checks against the target
9694 	 * profile instead of the current one.
9695 	 */
9696 	ret = -1;
9697 
9698 	/*
9699 	 * index:
9700 	 *      0: raid10
9701 	 *      1: raid1
9702 	 *      2: dup
9703 	 *      3: raid0
9704 	 *      4: single
9705 	 */
9706 	target = get_restripe_target(fs_info, block_group->flags);
9707 	if (target) {
9708 		index = __get_raid_index(extended_to_chunk(target));
9709 	} else {
9710 		/*
9711 		 * this is just a balance, so if we were marked as full
9712 		 * we know there is no space for a new chunk
9713 		 */
9714 		if (full) {
9715 			if (debug)
9716 				btrfs_warn(fs_info,
9717 					   "no space to alloc new chunk for block group %llu",
9718 					   block_group->key.objectid);
9719 			goto out;
9720 		}
9721 
9722 		index = get_block_group_index(block_group);
9723 	}
9724 
9725 	if (index == BTRFS_RAID_RAID10) {
9726 		dev_min = 4;
9727 		/* Divide by 2 */
9728 		min_free >>= 1;
9729 	} else if (index == BTRFS_RAID_RAID1) {
9730 		dev_min = 2;
9731 	} else if (index == BTRFS_RAID_DUP) {
9732 		/* Multiply by 2 */
9733 		min_free <<= 1;
9734 	} else if (index == BTRFS_RAID_RAID0) {
9735 		dev_min = fs_devices->rw_devices;
9736 		min_free = div64_u64(min_free, dev_min);
9737 	}
9738 
9739 	/* We need to do this so that we can look at pending chunks */
9740 	trans = btrfs_join_transaction(root);
9741 	if (IS_ERR(trans)) {
9742 		ret = PTR_ERR(trans);
9743 		goto out;
9744 	}
9745 
9746 	mutex_lock(&fs_info->chunk_mutex);
9747 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9748 		u64 dev_offset;
9749 
9750 		/*
9751 		 * check to make sure we can actually find a chunk with enough
9752 		 * space to fit our block group in.
9753 		 */
9754 		if (device->total_bytes > device->bytes_used + min_free &&
9755 		    !device->is_tgtdev_for_dev_replace) {
9756 			ret = find_free_dev_extent(trans, device, min_free,
9757 						   &dev_offset, NULL);
9758 			if (!ret)
9759 				dev_nr++;
9760 
9761 			if (dev_nr >= dev_min)
9762 				break;
9763 
9764 			ret = -1;
9765 		}
9766 	}
9767 	if (debug && ret == -1)
9768 		btrfs_warn(fs_info,
9769 			   "no space to allocate a new chunk for block group %llu",
9770 			   block_group->key.objectid);
9771 	mutex_unlock(&fs_info->chunk_mutex);
9772 	btrfs_end_transaction(trans);
9773 out:
9774 	btrfs_put_block_group(block_group);
9775 	return ret;
9776 }
9777 
9778 static int find_first_block_group(struct btrfs_fs_info *fs_info,
9779 				  struct btrfs_path *path,
9780 				  struct btrfs_key *key)
9781 {
9782 	struct btrfs_root *root = fs_info->extent_root;
9783 	int ret = 0;
9784 	struct btrfs_key found_key;
9785 	struct extent_buffer *leaf;
9786 	int slot;
9787 
9788 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9789 	if (ret < 0)
9790 		goto out;
9791 
9792 	while (1) {
9793 		slot = path->slots[0];
9794 		leaf = path->nodes[0];
9795 		if (slot >= btrfs_header_nritems(leaf)) {
9796 			ret = btrfs_next_leaf(root, path);
9797 			if (ret == 0)
9798 				continue;
9799 			if (ret < 0)
9800 				goto out;
9801 			break;
9802 		}
9803 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
9804 
9805 		if (found_key.objectid >= key->objectid &&
9806 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9807 			struct extent_map_tree *em_tree;
9808 			struct extent_map *em;
9809 
9810 			em_tree = &root->fs_info->mapping_tree.map_tree;
9811 			read_lock(&em_tree->lock);
9812 			em = lookup_extent_mapping(em_tree, found_key.objectid,
9813 						   found_key.offset);
9814 			read_unlock(&em_tree->lock);
9815 			if (!em) {
9816 				btrfs_err(fs_info,
9817 			"logical %llu len %llu found bg but no related chunk",
9818 					  found_key.objectid, found_key.offset);
9819 				ret = -ENOENT;
9820 			} else {
9821 				ret = 0;
9822 			}
9823 			free_extent_map(em);
9824 			goto out;
9825 		}
9826 		path->slots[0]++;
9827 	}
9828 out:
9829 	return ret;
9830 }
9831 
9832 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9833 {
9834 	struct btrfs_block_group_cache *block_group;
9835 	u64 last = 0;
9836 
9837 	while (1) {
9838 		struct inode *inode;
9839 
9840 		block_group = btrfs_lookup_first_block_group(info, last);
9841 		while (block_group) {
9842 			spin_lock(&block_group->lock);
9843 			if (block_group->iref)
9844 				break;
9845 			spin_unlock(&block_group->lock);
9846 			block_group = next_block_group(info, block_group);
9847 		}
9848 		if (!block_group) {
9849 			if (last == 0)
9850 				break;
9851 			last = 0;
9852 			continue;
9853 		}
9854 
9855 		inode = block_group->inode;
9856 		block_group->iref = 0;
9857 		block_group->inode = NULL;
9858 		spin_unlock(&block_group->lock);
9859 		ASSERT(block_group->io_ctl.inode == NULL);
9860 		iput(inode);
9861 		last = block_group->key.objectid + block_group->key.offset;
9862 		btrfs_put_block_group(block_group);
9863 	}
9864 }
9865 
9866 /*
9867  * Must be called only after stopping all workers, since we could have block
9868  * group caching kthreads running, and therefore they could race with us if we
9869  * freed the block groups before stopping them.
9870  */
9871 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9872 {
9873 	struct btrfs_block_group_cache *block_group;
9874 	struct btrfs_space_info *space_info;
9875 	struct btrfs_caching_control *caching_ctl;
9876 	struct rb_node *n;
9877 
9878 	down_write(&info->commit_root_sem);
9879 	while (!list_empty(&info->caching_block_groups)) {
9880 		caching_ctl = list_entry(info->caching_block_groups.next,
9881 					 struct btrfs_caching_control, list);
9882 		list_del(&caching_ctl->list);
9883 		put_caching_control(caching_ctl);
9884 	}
9885 	up_write(&info->commit_root_sem);
9886 
9887 	spin_lock(&info->unused_bgs_lock);
9888 	while (!list_empty(&info->unused_bgs)) {
9889 		block_group = list_first_entry(&info->unused_bgs,
9890 					       struct btrfs_block_group_cache,
9891 					       bg_list);
9892 		list_del_init(&block_group->bg_list);
9893 		btrfs_put_block_group(block_group);
9894 	}
9895 	spin_unlock(&info->unused_bgs_lock);
9896 
9897 	spin_lock(&info->block_group_cache_lock);
9898 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9899 		block_group = rb_entry(n, struct btrfs_block_group_cache,
9900 				       cache_node);
9901 		rb_erase(&block_group->cache_node,
9902 			 &info->block_group_cache_tree);
9903 		RB_CLEAR_NODE(&block_group->cache_node);
9904 		spin_unlock(&info->block_group_cache_lock);
9905 
9906 		down_write(&block_group->space_info->groups_sem);
9907 		list_del(&block_group->list);
9908 		up_write(&block_group->space_info->groups_sem);
9909 
9910 		/*
9911 		 * We haven't cached this block group, which means we could
9912 		 * possibly have excluded extents on this block group.
9913 		 */
9914 		if (block_group->cached == BTRFS_CACHE_NO ||
9915 		    block_group->cached == BTRFS_CACHE_ERROR)
9916 			free_excluded_extents(info, block_group);
9917 
9918 		btrfs_remove_free_space_cache(block_group);
9919 		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9920 		ASSERT(list_empty(&block_group->dirty_list));
9921 		ASSERT(list_empty(&block_group->io_list));
9922 		ASSERT(list_empty(&block_group->bg_list));
9923 		ASSERT(atomic_read(&block_group->count) == 1);
9924 		btrfs_put_block_group(block_group);
9925 
9926 		spin_lock(&info->block_group_cache_lock);
9927 	}
9928 	spin_unlock(&info->block_group_cache_lock);
9929 
9930 	/* now that all the block groups are freed, go through and
9931 	 * free all the space_info structs.  This is only called during
9932 	 * the final stages of unmount, and so we know nobody is
9933 	 * using them.  We call synchronize_rcu() once before we start,
9934 	 * just to be on the safe side.
9935 	 */
9936 	synchronize_rcu();
9937 
9938 	release_global_block_rsv(info);
9939 
9940 	while (!list_empty(&info->space_info)) {
9941 		int i;
9942 
9943 		space_info = list_entry(info->space_info.next,
9944 					struct btrfs_space_info,
9945 					list);
9946 
9947 		/*
9948 		 * Do not hide this behind enospc_debug, this is actually
9949 		 * important and indicates a real bug if this happens.
9950 		 */
9951 		if (WARN_ON(space_info->bytes_pinned > 0 ||
9952 			    space_info->bytes_reserved > 0 ||
9953 			    space_info->bytes_may_use > 0))
9954 			dump_space_info(info, space_info, 0, 0);
9955 		list_del(&space_info->list);
9956 		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9957 			struct kobject *kobj;
9958 			kobj = space_info->block_group_kobjs[i];
9959 			space_info->block_group_kobjs[i] = NULL;
9960 			if (kobj) {
9961 				kobject_del(kobj);
9962 				kobject_put(kobj);
9963 			}
9964 		}
9965 		kobject_del(&space_info->kobj);
9966 		kobject_put(&space_info->kobj);
9967 	}
9968 	return 0;
9969 }
9970 
9971 static void __link_block_group(struct btrfs_space_info *space_info,
9972 			       struct btrfs_block_group_cache *cache)
9973 {
9974 	int index = get_block_group_index(cache);
9975 	bool first = false;
9976 
9977 	down_write(&space_info->groups_sem);
9978 	if (list_empty(&space_info->block_groups[index]))
9979 		first = true;
9980 	list_add_tail(&cache->list, &space_info->block_groups[index]);
9981 	up_write(&space_info->groups_sem);
9982 
9983 	if (first) {
9984 		struct raid_kobject *rkobj;
9985 		int ret;
9986 
9987 		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9988 		if (!rkobj)
9989 			goto out_err;
9990 		rkobj->raid_type = index;
9991 		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9992 		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9993 				  "%s", get_raid_name(index));
9994 		if (ret) {
9995 			kobject_put(&rkobj->kobj);
9996 			goto out_err;
9997 		}
9998 		space_info->block_group_kobjs[index] = &rkobj->kobj;
9999 	}
10000 
10001 	return;
10002 out_err:
10003 	btrfs_warn(cache->fs_info,
10004 		   "failed to add kobject for block cache, ignoring");
10005 }
10006 
10007 static struct btrfs_block_group_cache *
10008 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10009 			       u64 start, u64 size)
10010 {
10011 	struct btrfs_block_group_cache *cache;
10012 
10013 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
10014 	if (!cache)
10015 		return NULL;
10016 
10017 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10018 					GFP_NOFS);
10019 	if (!cache->free_space_ctl) {
10020 		kfree(cache);
10021 		return NULL;
10022 	}
10023 
10024 	cache->key.objectid = start;
10025 	cache->key.offset = size;
10026 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10027 
10028 	cache->fs_info = fs_info;
10029 	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10030 	set_free_space_tree_thresholds(cache);
10031 
10032 	atomic_set(&cache->count, 1);
10033 	spin_lock_init(&cache->lock);
10034 	init_rwsem(&cache->data_rwsem);
10035 	INIT_LIST_HEAD(&cache->list);
10036 	INIT_LIST_HEAD(&cache->cluster_list);
10037 	INIT_LIST_HEAD(&cache->bg_list);
10038 	INIT_LIST_HEAD(&cache->ro_list);
10039 	INIT_LIST_HEAD(&cache->dirty_list);
10040 	INIT_LIST_HEAD(&cache->io_list);
10041 	btrfs_init_free_space_ctl(cache);
10042 	atomic_set(&cache->trimming, 0);
10043 	mutex_init(&cache->free_space_lock);
10044 	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10045 
10046 	return cache;
10047 }
10048 
10049 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10050 {
10051 	struct btrfs_path *path;
10052 	int ret;
10053 	struct btrfs_block_group_cache *cache;
10054 	struct btrfs_space_info *space_info;
10055 	struct btrfs_key key;
10056 	struct btrfs_key found_key;
10057 	struct extent_buffer *leaf;
10058 	int need_clear = 0;
10059 	u64 cache_gen;
10060 	u64 feature;
10061 	int mixed;
10062 
10063 	feature = btrfs_super_incompat_flags(info->super_copy);
10064 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10065 
10066 	key.objectid = 0;
10067 	key.offset = 0;
10068 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10069 	path = btrfs_alloc_path();
10070 	if (!path)
10071 		return -ENOMEM;
10072 	path->reada = READA_FORWARD;
10073 
10074 	cache_gen = btrfs_super_cache_generation(info->super_copy);
10075 	if (btrfs_test_opt(info, SPACE_CACHE) &&
10076 	    btrfs_super_generation(info->super_copy) != cache_gen)
10077 		need_clear = 1;
10078 	if (btrfs_test_opt(info, CLEAR_CACHE))
10079 		need_clear = 1;
10080 
10081 	while (1) {
10082 		ret = find_first_block_group(info, path, &key);
10083 		if (ret > 0)
10084 			break;
10085 		if (ret != 0)
10086 			goto error;
10087 
10088 		leaf = path->nodes[0];
10089 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10090 
10091 		cache = btrfs_create_block_group_cache(info, found_key.objectid,
10092 						       found_key.offset);
10093 		if (!cache) {
10094 			ret = -ENOMEM;
10095 			goto error;
10096 		}
10097 
10098 		if (need_clear) {
10099 			/*
10100 			 * When we mount with old space cache, we need to
10101 			 * set BTRFS_DC_CLEAR and set dirty flag.
10102 			 *
10103 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10104 			 *    truncate the old free space cache inode and
10105 			 *    setup a new one.
10106 			 * b) Setting 'dirty flag' makes sure that we flush
10107 			 *    the new space cache info onto disk.
10108 			 */
10109 			if (btrfs_test_opt(info, SPACE_CACHE))
10110 				cache->disk_cache_state = BTRFS_DC_CLEAR;
10111 		}
10112 
10113 		read_extent_buffer(leaf, &cache->item,
10114 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
10115 				   sizeof(cache->item));
10116 		cache->flags = btrfs_block_group_flags(&cache->item);
10117 		if (!mixed &&
10118 		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10119 		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10120 			btrfs_err(info,
10121 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10122 				  cache->key.objectid);
10123 			ret = -EINVAL;
10124 			goto error;
10125 		}
10126 
10127 		key.objectid = found_key.objectid + found_key.offset;
10128 		btrfs_release_path(path);
10129 
10130 		/*
10131 		 * We need to exclude the super stripes now so that the space
10132 		 * info has super bytes accounted for, otherwise we'll think
10133 		 * we have more space than we actually do.
10134 		 */
10135 		ret = exclude_super_stripes(info, cache);
10136 		if (ret) {
10137 			/*
10138 			 * We may have excluded something, so call this just in
10139 			 * case.
10140 			 */
10141 			free_excluded_extents(info, cache);
10142 			btrfs_put_block_group(cache);
10143 			goto error;
10144 		}
10145 
10146 		/*
10147 		 * check for two cases, either we are full, and therefore
10148 		 * don't need to bother with the caching work since we won't
10149 		 * find any space, or we are empty, and we can just add all
10150 		 * the space in and be done with it.  This saves us _alot_ of
10151 		 * time, particularly in the full case.
10152 		 */
10153 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10154 			cache->last_byte_to_unpin = (u64)-1;
10155 			cache->cached = BTRFS_CACHE_FINISHED;
10156 			free_excluded_extents(info, cache);
10157 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10158 			cache->last_byte_to_unpin = (u64)-1;
10159 			cache->cached = BTRFS_CACHE_FINISHED;
10160 			add_new_free_space(cache, info,
10161 					   found_key.objectid,
10162 					   found_key.objectid +
10163 					   found_key.offset);
10164 			free_excluded_extents(info, cache);
10165 		}
10166 
10167 		ret = btrfs_add_block_group_cache(info, cache);
10168 		if (ret) {
10169 			btrfs_remove_free_space_cache(cache);
10170 			btrfs_put_block_group(cache);
10171 			goto error;
10172 		}
10173 
10174 		trace_btrfs_add_block_group(info, cache, 0);
10175 		update_space_info(info, cache->flags, found_key.offset,
10176 				  btrfs_block_group_used(&cache->item),
10177 				  cache->bytes_super, &space_info);
10178 
10179 		cache->space_info = space_info;
10180 
10181 		__link_block_group(space_info, cache);
10182 
10183 		set_avail_alloc_bits(info, cache->flags);
10184 		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10185 			inc_block_group_ro(cache, 1);
10186 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10187 			spin_lock(&info->unused_bgs_lock);
10188 			/* Should always be true but just in case. */
10189 			if (list_empty(&cache->bg_list)) {
10190 				btrfs_get_block_group(cache);
10191 				list_add_tail(&cache->bg_list,
10192 					      &info->unused_bgs);
10193 			}
10194 			spin_unlock(&info->unused_bgs_lock);
10195 		}
10196 	}
10197 
10198 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
10199 		if (!(get_alloc_profile(info, space_info->flags) &
10200 		      (BTRFS_BLOCK_GROUP_RAID10 |
10201 		       BTRFS_BLOCK_GROUP_RAID1 |
10202 		       BTRFS_BLOCK_GROUP_RAID5 |
10203 		       BTRFS_BLOCK_GROUP_RAID6 |
10204 		       BTRFS_BLOCK_GROUP_DUP)))
10205 			continue;
10206 		/*
10207 		 * avoid allocating from un-mirrored block group if there are
10208 		 * mirrored block groups.
10209 		 */
10210 		list_for_each_entry(cache,
10211 				&space_info->block_groups[BTRFS_RAID_RAID0],
10212 				list)
10213 			inc_block_group_ro(cache, 1);
10214 		list_for_each_entry(cache,
10215 				&space_info->block_groups[BTRFS_RAID_SINGLE],
10216 				list)
10217 			inc_block_group_ro(cache, 1);
10218 	}
10219 
10220 	init_global_block_rsv(info);
10221 	ret = 0;
10222 error:
10223 	btrfs_free_path(path);
10224 	return ret;
10225 }
10226 
10227 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
10228 				       struct btrfs_fs_info *fs_info)
10229 {
10230 	struct btrfs_block_group_cache *block_group, *tmp;
10231 	struct btrfs_root *extent_root = fs_info->extent_root;
10232 	struct btrfs_block_group_item item;
10233 	struct btrfs_key key;
10234 	int ret = 0;
10235 	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10236 
10237 	trans->can_flush_pending_bgs = false;
10238 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10239 		if (ret)
10240 			goto next;
10241 
10242 		spin_lock(&block_group->lock);
10243 		memcpy(&item, &block_group->item, sizeof(item));
10244 		memcpy(&key, &block_group->key, sizeof(key));
10245 		spin_unlock(&block_group->lock);
10246 
10247 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
10248 					sizeof(item));
10249 		if (ret)
10250 			btrfs_abort_transaction(trans, ret);
10251 		ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
10252 					       key.offset);
10253 		if (ret)
10254 			btrfs_abort_transaction(trans, ret);
10255 		add_block_group_free_space(trans, fs_info, block_group);
10256 		/* already aborted the transaction if it failed. */
10257 next:
10258 		list_del_init(&block_group->bg_list);
10259 	}
10260 	trans->can_flush_pending_bgs = can_flush_pending_bgs;
10261 }
10262 
10263 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10264 			   struct btrfs_fs_info *fs_info, u64 bytes_used,
10265 			   u64 type, u64 chunk_offset, u64 size)
10266 {
10267 	struct btrfs_block_group_cache *cache;
10268 	int ret;
10269 
10270 	btrfs_set_log_full_commit(fs_info, trans);
10271 
10272 	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10273 	if (!cache)
10274 		return -ENOMEM;
10275 
10276 	btrfs_set_block_group_used(&cache->item, bytes_used);
10277 	btrfs_set_block_group_chunk_objectid(&cache->item,
10278 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10279 	btrfs_set_block_group_flags(&cache->item, type);
10280 
10281 	cache->flags = type;
10282 	cache->last_byte_to_unpin = (u64)-1;
10283 	cache->cached = BTRFS_CACHE_FINISHED;
10284 	cache->needs_free_space = 1;
10285 	ret = exclude_super_stripes(fs_info, cache);
10286 	if (ret) {
10287 		/*
10288 		 * We may have excluded something, so call this just in
10289 		 * case.
10290 		 */
10291 		free_excluded_extents(fs_info, cache);
10292 		btrfs_put_block_group(cache);
10293 		return ret;
10294 	}
10295 
10296 	add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
10297 
10298 	free_excluded_extents(fs_info, cache);
10299 
10300 #ifdef CONFIG_BTRFS_DEBUG
10301 	if (btrfs_should_fragment_free_space(cache)) {
10302 		u64 new_bytes_used = size - bytes_used;
10303 
10304 		bytes_used += new_bytes_used >> 1;
10305 		fragment_free_space(cache);
10306 	}
10307 #endif
10308 	/*
10309 	 * Ensure the corresponding space_info object is created and
10310 	 * assigned to our block group. We want our bg to be added to the rbtree
10311 	 * with its ->space_info set.
10312 	 */
10313 	cache->space_info = __find_space_info(fs_info, cache->flags);
10314 	if (!cache->space_info) {
10315 		ret = create_space_info(fs_info, cache->flags,
10316 				       &cache->space_info);
10317 		if (ret) {
10318 			btrfs_remove_free_space_cache(cache);
10319 			btrfs_put_block_group(cache);
10320 			return ret;
10321 		}
10322 	}
10323 
10324 	ret = btrfs_add_block_group_cache(fs_info, cache);
10325 	if (ret) {
10326 		btrfs_remove_free_space_cache(cache);
10327 		btrfs_put_block_group(cache);
10328 		return ret;
10329 	}
10330 
10331 	/*
10332 	 * Now that our block group has its ->space_info set and is inserted in
10333 	 * the rbtree, update the space info's counters.
10334 	 */
10335 	trace_btrfs_add_block_group(fs_info, cache, 1);
10336 	update_space_info(fs_info, cache->flags, size, bytes_used,
10337 				cache->bytes_super, &cache->space_info);
10338 	update_global_block_rsv(fs_info);
10339 
10340 	__link_block_group(cache->space_info, cache);
10341 
10342 	list_add_tail(&cache->bg_list, &trans->new_bgs);
10343 
10344 	set_avail_alloc_bits(fs_info, type);
10345 	return 0;
10346 }
10347 
10348 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10349 {
10350 	u64 extra_flags = chunk_to_extended(flags) &
10351 				BTRFS_EXTENDED_PROFILE_MASK;
10352 
10353 	write_seqlock(&fs_info->profiles_lock);
10354 	if (flags & BTRFS_BLOCK_GROUP_DATA)
10355 		fs_info->avail_data_alloc_bits &= ~extra_flags;
10356 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
10357 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10358 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10359 		fs_info->avail_system_alloc_bits &= ~extra_flags;
10360 	write_sequnlock(&fs_info->profiles_lock);
10361 }
10362 
10363 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10364 			     struct btrfs_fs_info *fs_info, u64 group_start,
10365 			     struct extent_map *em)
10366 {
10367 	struct btrfs_root *root = fs_info->extent_root;
10368 	struct btrfs_path *path;
10369 	struct btrfs_block_group_cache *block_group;
10370 	struct btrfs_free_cluster *cluster;
10371 	struct btrfs_root *tree_root = fs_info->tree_root;
10372 	struct btrfs_key key;
10373 	struct inode *inode;
10374 	struct kobject *kobj = NULL;
10375 	int ret;
10376 	int index;
10377 	int factor;
10378 	struct btrfs_caching_control *caching_ctl = NULL;
10379 	bool remove_em;
10380 
10381 	block_group = btrfs_lookup_block_group(fs_info, group_start);
10382 	BUG_ON(!block_group);
10383 	BUG_ON(!block_group->ro);
10384 
10385 	/*
10386 	 * Free the reserved super bytes from this block group before
10387 	 * remove it.
10388 	 */
10389 	free_excluded_extents(fs_info, block_group);
10390 
10391 	memcpy(&key, &block_group->key, sizeof(key));
10392 	index = get_block_group_index(block_group);
10393 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10394 				  BTRFS_BLOCK_GROUP_RAID1 |
10395 				  BTRFS_BLOCK_GROUP_RAID10))
10396 		factor = 2;
10397 	else
10398 		factor = 1;
10399 
10400 	/* make sure this block group isn't part of an allocation cluster */
10401 	cluster = &fs_info->data_alloc_cluster;
10402 	spin_lock(&cluster->refill_lock);
10403 	btrfs_return_cluster_to_free_space(block_group, cluster);
10404 	spin_unlock(&cluster->refill_lock);
10405 
10406 	/*
10407 	 * make sure this block group isn't part of a metadata
10408 	 * allocation cluster
10409 	 */
10410 	cluster = &fs_info->meta_alloc_cluster;
10411 	spin_lock(&cluster->refill_lock);
10412 	btrfs_return_cluster_to_free_space(block_group, cluster);
10413 	spin_unlock(&cluster->refill_lock);
10414 
10415 	path = btrfs_alloc_path();
10416 	if (!path) {
10417 		ret = -ENOMEM;
10418 		goto out;
10419 	}
10420 
10421 	/*
10422 	 * get the inode first so any iput calls done for the io_list
10423 	 * aren't the final iput (no unlinks allowed now)
10424 	 */
10425 	inode = lookup_free_space_inode(fs_info, block_group, path);
10426 
10427 	mutex_lock(&trans->transaction->cache_write_mutex);
10428 	/*
10429 	 * make sure our free spache cache IO is done before remove the
10430 	 * free space inode
10431 	 */
10432 	spin_lock(&trans->transaction->dirty_bgs_lock);
10433 	if (!list_empty(&block_group->io_list)) {
10434 		list_del_init(&block_group->io_list);
10435 
10436 		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10437 
10438 		spin_unlock(&trans->transaction->dirty_bgs_lock);
10439 		btrfs_wait_cache_io(trans, block_group, path);
10440 		btrfs_put_block_group(block_group);
10441 		spin_lock(&trans->transaction->dirty_bgs_lock);
10442 	}
10443 
10444 	if (!list_empty(&block_group->dirty_list)) {
10445 		list_del_init(&block_group->dirty_list);
10446 		btrfs_put_block_group(block_group);
10447 	}
10448 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10449 	mutex_unlock(&trans->transaction->cache_write_mutex);
10450 
10451 	if (!IS_ERR(inode)) {
10452 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10453 		if (ret) {
10454 			btrfs_add_delayed_iput(inode);
10455 			goto out;
10456 		}
10457 		clear_nlink(inode);
10458 		/* One for the block groups ref */
10459 		spin_lock(&block_group->lock);
10460 		if (block_group->iref) {
10461 			block_group->iref = 0;
10462 			block_group->inode = NULL;
10463 			spin_unlock(&block_group->lock);
10464 			iput(inode);
10465 		} else {
10466 			spin_unlock(&block_group->lock);
10467 		}
10468 		/* One for our lookup ref */
10469 		btrfs_add_delayed_iput(inode);
10470 	}
10471 
10472 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10473 	key.offset = block_group->key.objectid;
10474 	key.type = 0;
10475 
10476 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10477 	if (ret < 0)
10478 		goto out;
10479 	if (ret > 0)
10480 		btrfs_release_path(path);
10481 	if (ret == 0) {
10482 		ret = btrfs_del_item(trans, tree_root, path);
10483 		if (ret)
10484 			goto out;
10485 		btrfs_release_path(path);
10486 	}
10487 
10488 	spin_lock(&fs_info->block_group_cache_lock);
10489 	rb_erase(&block_group->cache_node,
10490 		 &fs_info->block_group_cache_tree);
10491 	RB_CLEAR_NODE(&block_group->cache_node);
10492 
10493 	if (fs_info->first_logical_byte == block_group->key.objectid)
10494 		fs_info->first_logical_byte = (u64)-1;
10495 	spin_unlock(&fs_info->block_group_cache_lock);
10496 
10497 	down_write(&block_group->space_info->groups_sem);
10498 	/*
10499 	 * we must use list_del_init so people can check to see if they
10500 	 * are still on the list after taking the semaphore
10501 	 */
10502 	list_del_init(&block_group->list);
10503 	if (list_empty(&block_group->space_info->block_groups[index])) {
10504 		kobj = block_group->space_info->block_group_kobjs[index];
10505 		block_group->space_info->block_group_kobjs[index] = NULL;
10506 		clear_avail_alloc_bits(fs_info, block_group->flags);
10507 	}
10508 	up_write(&block_group->space_info->groups_sem);
10509 	if (kobj) {
10510 		kobject_del(kobj);
10511 		kobject_put(kobj);
10512 	}
10513 
10514 	if (block_group->has_caching_ctl)
10515 		caching_ctl = get_caching_control(block_group);
10516 	if (block_group->cached == BTRFS_CACHE_STARTED)
10517 		wait_block_group_cache_done(block_group);
10518 	if (block_group->has_caching_ctl) {
10519 		down_write(&fs_info->commit_root_sem);
10520 		if (!caching_ctl) {
10521 			struct btrfs_caching_control *ctl;
10522 
10523 			list_for_each_entry(ctl,
10524 				    &fs_info->caching_block_groups, list)
10525 				if (ctl->block_group == block_group) {
10526 					caching_ctl = ctl;
10527 					refcount_inc(&caching_ctl->count);
10528 					break;
10529 				}
10530 		}
10531 		if (caching_ctl)
10532 			list_del_init(&caching_ctl->list);
10533 		up_write(&fs_info->commit_root_sem);
10534 		if (caching_ctl) {
10535 			/* Once for the caching bgs list and once for us. */
10536 			put_caching_control(caching_ctl);
10537 			put_caching_control(caching_ctl);
10538 		}
10539 	}
10540 
10541 	spin_lock(&trans->transaction->dirty_bgs_lock);
10542 	if (!list_empty(&block_group->dirty_list)) {
10543 		WARN_ON(1);
10544 	}
10545 	if (!list_empty(&block_group->io_list)) {
10546 		WARN_ON(1);
10547 	}
10548 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10549 	btrfs_remove_free_space_cache(block_group);
10550 
10551 	spin_lock(&block_group->space_info->lock);
10552 	list_del_init(&block_group->ro_list);
10553 
10554 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10555 		WARN_ON(block_group->space_info->total_bytes
10556 			< block_group->key.offset);
10557 		WARN_ON(block_group->space_info->bytes_readonly
10558 			< block_group->key.offset);
10559 		WARN_ON(block_group->space_info->disk_total
10560 			< block_group->key.offset * factor);
10561 	}
10562 	block_group->space_info->total_bytes -= block_group->key.offset;
10563 	block_group->space_info->bytes_readonly -= block_group->key.offset;
10564 	block_group->space_info->disk_total -= block_group->key.offset * factor;
10565 
10566 	spin_unlock(&block_group->space_info->lock);
10567 
10568 	memcpy(&key, &block_group->key, sizeof(key));
10569 
10570 	mutex_lock(&fs_info->chunk_mutex);
10571 	if (!list_empty(&em->list)) {
10572 		/* We're in the transaction->pending_chunks list. */
10573 		free_extent_map(em);
10574 	}
10575 	spin_lock(&block_group->lock);
10576 	block_group->removed = 1;
10577 	/*
10578 	 * At this point trimming can't start on this block group, because we
10579 	 * removed the block group from the tree fs_info->block_group_cache_tree
10580 	 * so no one can't find it anymore and even if someone already got this
10581 	 * block group before we removed it from the rbtree, they have already
10582 	 * incremented block_group->trimming - if they didn't, they won't find
10583 	 * any free space entries because we already removed them all when we
10584 	 * called btrfs_remove_free_space_cache().
10585 	 *
10586 	 * And we must not remove the extent map from the fs_info->mapping_tree
10587 	 * to prevent the same logical address range and physical device space
10588 	 * ranges from being reused for a new block group. This is because our
10589 	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10590 	 * completely transactionless, so while it is trimming a range the
10591 	 * currently running transaction might finish and a new one start,
10592 	 * allowing for new block groups to be created that can reuse the same
10593 	 * physical device locations unless we take this special care.
10594 	 *
10595 	 * There may also be an implicit trim operation if the file system
10596 	 * is mounted with -odiscard. The same protections must remain
10597 	 * in place until the extents have been discarded completely when
10598 	 * the transaction commit has completed.
10599 	 */
10600 	remove_em = (atomic_read(&block_group->trimming) == 0);
10601 	/*
10602 	 * Make sure a trimmer task always sees the em in the pinned_chunks list
10603 	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10604 	 * before checking block_group->removed).
10605 	 */
10606 	if (!remove_em) {
10607 		/*
10608 		 * Our em might be in trans->transaction->pending_chunks which
10609 		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10610 		 * and so is the fs_info->pinned_chunks list.
10611 		 *
10612 		 * So at this point we must be holding the chunk_mutex to avoid
10613 		 * any races with chunk allocation (more specifically at
10614 		 * volumes.c:contains_pending_extent()), to ensure it always
10615 		 * sees the em, either in the pending_chunks list or in the
10616 		 * pinned_chunks list.
10617 		 */
10618 		list_move_tail(&em->list, &fs_info->pinned_chunks);
10619 	}
10620 	spin_unlock(&block_group->lock);
10621 
10622 	if (remove_em) {
10623 		struct extent_map_tree *em_tree;
10624 
10625 		em_tree = &fs_info->mapping_tree.map_tree;
10626 		write_lock(&em_tree->lock);
10627 		/*
10628 		 * The em might be in the pending_chunks list, so make sure the
10629 		 * chunk mutex is locked, since remove_extent_mapping() will
10630 		 * delete us from that list.
10631 		 */
10632 		remove_extent_mapping(em_tree, em);
10633 		write_unlock(&em_tree->lock);
10634 		/* once for the tree */
10635 		free_extent_map(em);
10636 	}
10637 
10638 	mutex_unlock(&fs_info->chunk_mutex);
10639 
10640 	ret = remove_block_group_free_space(trans, fs_info, block_group);
10641 	if (ret)
10642 		goto out;
10643 
10644 	btrfs_put_block_group(block_group);
10645 	btrfs_put_block_group(block_group);
10646 
10647 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10648 	if (ret > 0)
10649 		ret = -EIO;
10650 	if (ret < 0)
10651 		goto out;
10652 
10653 	ret = btrfs_del_item(trans, root, path);
10654 out:
10655 	btrfs_free_path(path);
10656 	return ret;
10657 }
10658 
10659 struct btrfs_trans_handle *
10660 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10661 				     const u64 chunk_offset)
10662 {
10663 	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10664 	struct extent_map *em;
10665 	struct map_lookup *map;
10666 	unsigned int num_items;
10667 
10668 	read_lock(&em_tree->lock);
10669 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10670 	read_unlock(&em_tree->lock);
10671 	ASSERT(em && em->start == chunk_offset);
10672 
10673 	/*
10674 	 * We need to reserve 3 + N units from the metadata space info in order
10675 	 * to remove a block group (done at btrfs_remove_chunk() and at
10676 	 * btrfs_remove_block_group()), which are used for:
10677 	 *
10678 	 * 1 unit for adding the free space inode's orphan (located in the tree
10679 	 * of tree roots).
10680 	 * 1 unit for deleting the block group item (located in the extent
10681 	 * tree).
10682 	 * 1 unit for deleting the free space item (located in tree of tree
10683 	 * roots).
10684 	 * N units for deleting N device extent items corresponding to each
10685 	 * stripe (located in the device tree).
10686 	 *
10687 	 * In order to remove a block group we also need to reserve units in the
10688 	 * system space info in order to update the chunk tree (update one or
10689 	 * more device items and remove one chunk item), but this is done at
10690 	 * btrfs_remove_chunk() through a call to check_system_chunk().
10691 	 */
10692 	map = em->map_lookup;
10693 	num_items = 3 + map->num_stripes;
10694 	free_extent_map(em);
10695 
10696 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10697 							   num_items, 1);
10698 }
10699 
10700 /*
10701  * Process the unused_bgs list and remove any that don't have any allocated
10702  * space inside of them.
10703  */
10704 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10705 {
10706 	struct btrfs_block_group_cache *block_group;
10707 	struct btrfs_space_info *space_info;
10708 	struct btrfs_trans_handle *trans;
10709 	int ret = 0;
10710 
10711 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10712 		return;
10713 
10714 	spin_lock(&fs_info->unused_bgs_lock);
10715 	while (!list_empty(&fs_info->unused_bgs)) {
10716 		u64 start, end;
10717 		int trimming;
10718 
10719 		block_group = list_first_entry(&fs_info->unused_bgs,
10720 					       struct btrfs_block_group_cache,
10721 					       bg_list);
10722 		list_del_init(&block_group->bg_list);
10723 
10724 		space_info = block_group->space_info;
10725 
10726 		if (ret || btrfs_mixed_space_info(space_info)) {
10727 			btrfs_put_block_group(block_group);
10728 			continue;
10729 		}
10730 		spin_unlock(&fs_info->unused_bgs_lock);
10731 
10732 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
10733 
10734 		/* Don't want to race with allocators so take the groups_sem */
10735 		down_write(&space_info->groups_sem);
10736 		spin_lock(&block_group->lock);
10737 		if (block_group->reserved ||
10738 		    btrfs_block_group_used(&block_group->item) ||
10739 		    block_group->ro ||
10740 		    list_is_singular(&block_group->list)) {
10741 			/*
10742 			 * We want to bail if we made new allocations or have
10743 			 * outstanding allocations in this block group.  We do
10744 			 * the ro check in case balance is currently acting on
10745 			 * this block group.
10746 			 */
10747 			spin_unlock(&block_group->lock);
10748 			up_write(&space_info->groups_sem);
10749 			goto next;
10750 		}
10751 		spin_unlock(&block_group->lock);
10752 
10753 		/* We don't want to force the issue, only flip if it's ok. */
10754 		ret = inc_block_group_ro(block_group, 0);
10755 		up_write(&space_info->groups_sem);
10756 		if (ret < 0) {
10757 			ret = 0;
10758 			goto next;
10759 		}
10760 
10761 		/*
10762 		 * Want to do this before we do anything else so we can recover
10763 		 * properly if we fail to join the transaction.
10764 		 */
10765 		trans = btrfs_start_trans_remove_block_group(fs_info,
10766 						     block_group->key.objectid);
10767 		if (IS_ERR(trans)) {
10768 			btrfs_dec_block_group_ro(block_group);
10769 			ret = PTR_ERR(trans);
10770 			goto next;
10771 		}
10772 
10773 		/*
10774 		 * We could have pending pinned extents for this block group,
10775 		 * just delete them, we don't care about them anymore.
10776 		 */
10777 		start = block_group->key.objectid;
10778 		end = start + block_group->key.offset - 1;
10779 		/*
10780 		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10781 		 * btrfs_finish_extent_commit(). If we are at transaction N,
10782 		 * another task might be running finish_extent_commit() for the
10783 		 * previous transaction N - 1, and have seen a range belonging
10784 		 * to the block group in freed_extents[] before we were able to
10785 		 * clear the whole block group range from freed_extents[]. This
10786 		 * means that task can lookup for the block group after we
10787 		 * unpinned it from freed_extents[] and removed it, leading to
10788 		 * a BUG_ON() at btrfs_unpin_extent_range().
10789 		 */
10790 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
10791 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10792 				  EXTENT_DIRTY);
10793 		if (ret) {
10794 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10795 			btrfs_dec_block_group_ro(block_group);
10796 			goto end_trans;
10797 		}
10798 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10799 				  EXTENT_DIRTY);
10800 		if (ret) {
10801 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10802 			btrfs_dec_block_group_ro(block_group);
10803 			goto end_trans;
10804 		}
10805 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10806 
10807 		/* Reset pinned so btrfs_put_block_group doesn't complain */
10808 		spin_lock(&space_info->lock);
10809 		spin_lock(&block_group->lock);
10810 
10811 		space_info->bytes_pinned -= block_group->pinned;
10812 		space_info->bytes_readonly += block_group->pinned;
10813 		percpu_counter_add(&space_info->total_bytes_pinned,
10814 				   -block_group->pinned);
10815 		block_group->pinned = 0;
10816 
10817 		spin_unlock(&block_group->lock);
10818 		spin_unlock(&space_info->lock);
10819 
10820 		/* DISCARD can flip during remount */
10821 		trimming = btrfs_test_opt(fs_info, DISCARD);
10822 
10823 		/* Implicit trim during transaction commit. */
10824 		if (trimming)
10825 			btrfs_get_block_group_trimming(block_group);
10826 
10827 		/*
10828 		 * Btrfs_remove_chunk will abort the transaction if things go
10829 		 * horribly wrong.
10830 		 */
10831 		ret = btrfs_remove_chunk(trans, fs_info,
10832 					 block_group->key.objectid);
10833 
10834 		if (ret) {
10835 			if (trimming)
10836 				btrfs_put_block_group_trimming(block_group);
10837 			goto end_trans;
10838 		}
10839 
10840 		/*
10841 		 * If we're not mounted with -odiscard, we can just forget
10842 		 * about this block group. Otherwise we'll need to wait
10843 		 * until transaction commit to do the actual discard.
10844 		 */
10845 		if (trimming) {
10846 			spin_lock(&fs_info->unused_bgs_lock);
10847 			/*
10848 			 * A concurrent scrub might have added us to the list
10849 			 * fs_info->unused_bgs, so use a list_move operation
10850 			 * to add the block group to the deleted_bgs list.
10851 			 */
10852 			list_move(&block_group->bg_list,
10853 				  &trans->transaction->deleted_bgs);
10854 			spin_unlock(&fs_info->unused_bgs_lock);
10855 			btrfs_get_block_group(block_group);
10856 		}
10857 end_trans:
10858 		btrfs_end_transaction(trans);
10859 next:
10860 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10861 		btrfs_put_block_group(block_group);
10862 		spin_lock(&fs_info->unused_bgs_lock);
10863 	}
10864 	spin_unlock(&fs_info->unused_bgs_lock);
10865 }
10866 
10867 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10868 {
10869 	struct btrfs_space_info *space_info;
10870 	struct btrfs_super_block *disk_super;
10871 	u64 features;
10872 	u64 flags;
10873 	int mixed = 0;
10874 	int ret;
10875 
10876 	disk_super = fs_info->super_copy;
10877 	if (!btrfs_super_root(disk_super))
10878 		return -EINVAL;
10879 
10880 	features = btrfs_super_incompat_flags(disk_super);
10881 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10882 		mixed = 1;
10883 
10884 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
10885 	ret = create_space_info(fs_info, flags, &space_info);
10886 	if (ret)
10887 		goto out;
10888 
10889 	if (mixed) {
10890 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10891 		ret = create_space_info(fs_info, flags, &space_info);
10892 	} else {
10893 		flags = BTRFS_BLOCK_GROUP_METADATA;
10894 		ret = create_space_info(fs_info, flags, &space_info);
10895 		if (ret)
10896 			goto out;
10897 
10898 		flags = BTRFS_BLOCK_GROUP_DATA;
10899 		ret = create_space_info(fs_info, flags, &space_info);
10900 	}
10901 out:
10902 	return ret;
10903 }
10904 
10905 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10906 				   u64 start, u64 end)
10907 {
10908 	return unpin_extent_range(fs_info, start, end, false);
10909 }
10910 
10911 /*
10912  * It used to be that old block groups would be left around forever.
10913  * Iterating over them would be enough to trim unused space.  Since we
10914  * now automatically remove them, we also need to iterate over unallocated
10915  * space.
10916  *
10917  * We don't want a transaction for this since the discard may take a
10918  * substantial amount of time.  We don't require that a transaction be
10919  * running, but we do need to take a running transaction into account
10920  * to ensure that we're not discarding chunks that were released in
10921  * the current transaction.
10922  *
10923  * Holding the chunks lock will prevent other threads from allocating
10924  * or releasing chunks, but it won't prevent a running transaction
10925  * from committing and releasing the memory that the pending chunks
10926  * list head uses.  For that, we need to take a reference to the
10927  * transaction.
10928  */
10929 static int btrfs_trim_free_extents(struct btrfs_device *device,
10930 				   u64 minlen, u64 *trimmed)
10931 {
10932 	u64 start = 0, len = 0;
10933 	int ret;
10934 
10935 	*trimmed = 0;
10936 
10937 	/* Not writeable = nothing to do. */
10938 	if (!device->writeable)
10939 		return 0;
10940 
10941 	/* No free space = nothing to do. */
10942 	if (device->total_bytes <= device->bytes_used)
10943 		return 0;
10944 
10945 	ret = 0;
10946 
10947 	while (1) {
10948 		struct btrfs_fs_info *fs_info = device->fs_info;
10949 		struct btrfs_transaction *trans;
10950 		u64 bytes;
10951 
10952 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10953 		if (ret)
10954 			return ret;
10955 
10956 		down_read(&fs_info->commit_root_sem);
10957 
10958 		spin_lock(&fs_info->trans_lock);
10959 		trans = fs_info->running_transaction;
10960 		if (trans)
10961 			refcount_inc(&trans->use_count);
10962 		spin_unlock(&fs_info->trans_lock);
10963 
10964 		ret = find_free_dev_extent_start(trans, device, minlen, start,
10965 						 &start, &len);
10966 		if (trans)
10967 			btrfs_put_transaction(trans);
10968 
10969 		if (ret) {
10970 			up_read(&fs_info->commit_root_sem);
10971 			mutex_unlock(&fs_info->chunk_mutex);
10972 			if (ret == -ENOSPC)
10973 				ret = 0;
10974 			break;
10975 		}
10976 
10977 		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10978 		up_read(&fs_info->commit_root_sem);
10979 		mutex_unlock(&fs_info->chunk_mutex);
10980 
10981 		if (ret)
10982 			break;
10983 
10984 		start += len;
10985 		*trimmed += bytes;
10986 
10987 		if (fatal_signal_pending(current)) {
10988 			ret = -ERESTARTSYS;
10989 			break;
10990 		}
10991 
10992 		cond_resched();
10993 	}
10994 
10995 	return ret;
10996 }
10997 
10998 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
10999 {
11000 	struct btrfs_block_group_cache *cache = NULL;
11001 	struct btrfs_device *device;
11002 	struct list_head *devices;
11003 	u64 group_trimmed;
11004 	u64 start;
11005 	u64 end;
11006 	u64 trimmed = 0;
11007 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
11008 	int ret = 0;
11009 
11010 	/*
11011 	 * try to trim all FS space, our block group may start from non-zero.
11012 	 */
11013 	if (range->len == total_bytes)
11014 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
11015 	else
11016 		cache = btrfs_lookup_block_group(fs_info, range->start);
11017 
11018 	while (cache) {
11019 		if (cache->key.objectid >= (range->start + range->len)) {
11020 			btrfs_put_block_group(cache);
11021 			break;
11022 		}
11023 
11024 		start = max(range->start, cache->key.objectid);
11025 		end = min(range->start + range->len,
11026 				cache->key.objectid + cache->key.offset);
11027 
11028 		if (end - start >= range->minlen) {
11029 			if (!block_group_cache_done(cache)) {
11030 				ret = cache_block_group(cache, 0);
11031 				if (ret) {
11032 					btrfs_put_block_group(cache);
11033 					break;
11034 				}
11035 				ret = wait_block_group_cache_done(cache);
11036 				if (ret) {
11037 					btrfs_put_block_group(cache);
11038 					break;
11039 				}
11040 			}
11041 			ret = btrfs_trim_block_group(cache,
11042 						     &group_trimmed,
11043 						     start,
11044 						     end,
11045 						     range->minlen);
11046 
11047 			trimmed += group_trimmed;
11048 			if (ret) {
11049 				btrfs_put_block_group(cache);
11050 				break;
11051 			}
11052 		}
11053 
11054 		cache = next_block_group(fs_info, cache);
11055 	}
11056 
11057 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
11058 	devices = &fs_info->fs_devices->alloc_list;
11059 	list_for_each_entry(device, devices, dev_alloc_list) {
11060 		ret = btrfs_trim_free_extents(device, range->minlen,
11061 					      &group_trimmed);
11062 		if (ret)
11063 			break;
11064 
11065 		trimmed += group_trimmed;
11066 	}
11067 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11068 
11069 	range->len = trimmed;
11070 	return ret;
11071 }
11072 
11073 /*
11074  * btrfs_{start,end}_write_no_snapshotting() are similar to
11075  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11076  * data into the page cache through nocow before the subvolume is snapshoted,
11077  * but flush the data into disk after the snapshot creation, or to prevent
11078  * operations while snapshotting is ongoing and that cause the snapshot to be
11079  * inconsistent (writes followed by expanding truncates for example).
11080  */
11081 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11082 {
11083 	percpu_counter_dec(&root->subv_writers->counter);
11084 	/*
11085 	 * Make sure counter is updated before we wake up waiters.
11086 	 */
11087 	smp_mb();
11088 	if (waitqueue_active(&root->subv_writers->wait))
11089 		wake_up(&root->subv_writers->wait);
11090 }
11091 
11092 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11093 {
11094 	if (atomic_read(&root->will_be_snapshotted))
11095 		return 0;
11096 
11097 	percpu_counter_inc(&root->subv_writers->counter);
11098 	/*
11099 	 * Make sure counter is updated before we check for snapshot creation.
11100 	 */
11101 	smp_mb();
11102 	if (atomic_read(&root->will_be_snapshotted)) {
11103 		btrfs_end_write_no_snapshotting(root);
11104 		return 0;
11105 	}
11106 	return 1;
11107 }
11108 
/*
 * Wait action handed to wait_on_atomic_t() by
 * btrfs_wait_for_snapshot_creation(): give up the CPU through schedule() and
 * return 0.  The atomic_t argument is not examined here.
 */
static int wait_snapshotting_atomic_t(atomic_t *a)
{
	schedule();
	return 0;
}
11114 
11115 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11116 {
11117 	while (true) {
11118 		int ret;
11119 
11120 		ret = btrfs_start_write_no_snapshotting(root);
11121 		if (ret)
11122 			break;
11123 		wait_on_atomic_t(&root->will_be_snapshotted,
11124 				 wait_snapshotting_atomic_t,
11125 				 TASK_UNINTERRUPTIBLE);
11126 	}
11127 }
11128