xref: /openbmc/linux/fs/btrfs/extent-tree.c (revision e983940270f10fe8551baf0098be76ea478294a3)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include <linux/percpu_counter.h>
28 #include "hash.h"
29 #include "tree-log.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "volumes.h"
33 #include "raid56.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 #include "free-space-tree.h"
37 #include "math.h"
38 #include "sysfs.h"
39 #include "qgroup.h"
40 
41 #undef SCRAMBLE_DELAYED_REFS
42 
43 /*
44  * control flags for do_chunk_alloc's force field
45  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
46  * if we really need one.
47  *
48  * CHUNK_ALLOC_LIMITED means to only try to allocate one
49  * if we have very few chunks already allocated.  This is
50  * used as part of the clustering code to help make sure
51  * we have a good pool of storage to cluster in, without
52  * filling the FS with empty chunks.
53  *
54  * CHUNK_ALLOC_FORCE means it must try to allocate one.
55  *
56  */
57 enum {
58 	CHUNK_ALLOC_NO_FORCE = 0,
59 	CHUNK_ALLOC_LIMITED = 1,
60 	CHUNK_ALLOC_FORCE = 2,
61 };
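/*
 * A rough usage sketch (illustrative only): a caller that absolutely needs
 * a new chunk passes the strongest level to do_chunk_alloc() (declared
 * below), e.g.
 *
 *	do_chunk_alloc(trans, extent_root, BTRFS_BLOCK_GROUP_METADATA,
 *		       CHUNK_ALLOC_FORCE);
 *
 * while opportunistic callers pass CHUNK_ALLOC_NO_FORCE or
 * CHUNK_ALLOC_LIMITED and leave the decision to do_chunk_alloc().
 */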
62 
63 static int update_block_group(struct btrfs_trans_handle *trans,
64 			      struct btrfs_root *root, u64 bytenr,
65 			      u64 num_bytes, int alloc);
66 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
67 				struct btrfs_root *root,
68 				struct btrfs_delayed_ref_node *node, u64 parent,
69 				u64 root_objectid, u64 owner_objectid,
70 				u64 owner_offset, int refs_to_drop,
71 				struct btrfs_delayed_extent_op *extra_op);
72 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
73 				    struct extent_buffer *leaf,
74 				    struct btrfs_extent_item *ei);
75 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
76 				      struct btrfs_root *root,
77 				      u64 parent, u64 root_objectid,
78 				      u64 flags, u64 owner, u64 offset,
79 				      struct btrfs_key *ins, int ref_mod);
80 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
81 				     struct btrfs_root *root,
82 				     u64 parent, u64 root_objectid,
83 				     u64 flags, struct btrfs_disk_key *key,
84 				     int level, struct btrfs_key *ins);
85 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
86 			  struct btrfs_root *extent_root, u64 flags,
87 			  int force);
88 static int find_next_key(struct btrfs_path *path, int level,
89 			 struct btrfs_key *key);
90 static void dump_space_info(struct btrfs_fs_info *fs_info,
91 			    struct btrfs_space_info *info, u64 bytes,
92 			    int dump_block_groups);
93 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
94 				    u64 ram_bytes, u64 num_bytes, int delalloc);
95 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
96 				     u64 num_bytes, int delalloc);
97 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
98 			       u64 num_bytes);
99 int btrfs_pin_extent(struct btrfs_root *root,
100 		     u64 bytenr, u64 num_bytes, int reserved);
101 static int __reserve_metadata_bytes(struct btrfs_root *root,
102 				    struct btrfs_space_info *space_info,
103 				    u64 orig_bytes,
104 				    enum btrfs_reserve_flush_enum flush);
105 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
106 				     struct btrfs_space_info *space_info,
107 				     u64 num_bytes);
108 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
109 				     struct btrfs_space_info *space_info,
110 				     u64 num_bytes);
111 
112 static noinline int
113 block_group_cache_done(struct btrfs_block_group_cache *cache)
114 {
115 	smp_mb();
116 	return cache->cached == BTRFS_CACHE_FINISHED ||
117 		cache->cached == BTRFS_CACHE_ERROR;
118 }
119 
120 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
121 {
122 	return (cache->flags & bits) == bits;
123 }
124 
125 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
126 {
127 	atomic_inc(&cache->count);
128 }
129 
130 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
131 {
132 	if (atomic_dec_and_test(&cache->count)) {
133 		WARN_ON(cache->pinned > 0);
134 		WARN_ON(cache->reserved > 0);
135 		kfree(cache->free_space_ctl);
136 		kfree(cache);
137 	}
138 }
139 
140 /*
141  * this adds the block group to the fs_info rb tree for the block group
142  * cache
143  */
144 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
145 				struct btrfs_block_group_cache *block_group)
146 {
147 	struct rb_node **p;
148 	struct rb_node *parent = NULL;
149 	struct btrfs_block_group_cache *cache;
150 
151 	spin_lock(&info->block_group_cache_lock);
152 	p = &info->block_group_cache_tree.rb_node;
153 
154 	while (*p) {
155 		parent = *p;
156 		cache = rb_entry(parent, struct btrfs_block_group_cache,
157 				 cache_node);
158 		if (block_group->key.objectid < cache->key.objectid) {
159 			p = &(*p)->rb_left;
160 		} else if (block_group->key.objectid > cache->key.objectid) {
161 			p = &(*p)->rb_right;
162 		} else {
163 			spin_unlock(&info->block_group_cache_lock);
164 			return -EEXIST;
165 		}
166 	}
167 
168 	rb_link_node(&block_group->cache_node, parent, p);
169 	rb_insert_color(&block_group->cache_node,
170 			&info->block_group_cache_tree);
171 
172 	if (info->first_logical_byte > block_group->key.objectid)
173 		info->first_logical_byte = block_group->key.objectid;
174 
175 	spin_unlock(&info->block_group_cache_lock);
176 
177 	return 0;
178 }
179 
180 /*
181  * This will return the block group at or after bytenr if contains is 0, else
182  * it will return the block group that contains the bytenr
183  */
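/*
 * For example, with block groups covering [0, 1G) and [2G, 3G), searching
 * for a bytenr of 1.5G with contains == 0 returns the [2G, 3G) block group
 * (the first one at or after the address), while the same search with
 * contains != 0 returns NULL because no block group contains that byte.
 */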
184 static struct btrfs_block_group_cache *
185 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
186 			      int contains)
187 {
188 	struct btrfs_block_group_cache *cache, *ret = NULL;
189 	struct rb_node *n;
190 	u64 end, start;
191 
192 	spin_lock(&info->block_group_cache_lock);
193 	n = info->block_group_cache_tree.rb_node;
194 
195 	while (n) {
196 		cache = rb_entry(n, struct btrfs_block_group_cache,
197 				 cache_node);
198 		end = cache->key.objectid + cache->key.offset - 1;
199 		start = cache->key.objectid;
200 
201 		if (bytenr < start) {
202 			if (!contains && (!ret || start < ret->key.objectid))
203 				ret = cache;
204 			n = n->rb_left;
205 		} else if (bytenr > start) {
206 			if (contains && bytenr <= end) {
207 				ret = cache;
208 				break;
209 			}
210 			n = n->rb_right;
211 		} else {
212 			ret = cache;
213 			break;
214 		}
215 	}
216 	if (ret) {
217 		btrfs_get_block_group(ret);
218 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
219 			info->first_logical_byte = ret->key.objectid;
220 	}
221 	spin_unlock(&info->block_group_cache_lock);
222 
223 	return ret;
224 }
225 
226 static int add_excluded_extent(struct btrfs_root *root,
227 			       u64 start, u64 num_bytes)
228 {
229 	u64 end = start + num_bytes - 1;
230 	set_extent_bits(&root->fs_info->freed_extents[0],
231 			start, end, EXTENT_UPTODATE);
232 	set_extent_bits(&root->fs_info->freed_extents[1],
233 			start, end, EXTENT_UPTODATE);
234 	return 0;
235 }
236 
237 static void free_excluded_extents(struct btrfs_root *root,
238 				  struct btrfs_block_group_cache *cache)
239 {
240 	u64 start, end;
241 
242 	start = cache->key.objectid;
243 	end = start + cache->key.offset - 1;
244 
245 	clear_extent_bits(&root->fs_info->freed_extents[0],
246 			  start, end, EXTENT_UPTODATE);
247 	clear_extent_bits(&root->fs_info->freed_extents[1],
248 			  start, end, EXTENT_UPTODATE);
249 }
250 
251 static int exclude_super_stripes(struct btrfs_root *root,
252 				 struct btrfs_block_group_cache *cache)
253 {
254 	u64 bytenr;
255 	u64 *logical;
256 	int stripe_len;
257 	int i, nr, ret;
258 
259 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
260 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
261 		cache->bytes_super += stripe_len;
262 		ret = add_excluded_extent(root, cache->key.objectid,
263 					  stripe_len);
264 		if (ret)
265 			return ret;
266 	}
267 
268 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
269 		bytenr = btrfs_sb_offset(i);
270 		ret = btrfs_rmap_block(root->fs_info, cache->key.objectid,
271 				       bytenr, 0, &logical, &nr, &stripe_len);
272 		if (ret)
273 			return ret;
274 
275 		while (nr--) {
276 			u64 start, len;
277 
278 			if (logical[nr] > cache->key.objectid +
279 			    cache->key.offset)
280 				continue;
281 
282 			if (logical[nr] + stripe_len <= cache->key.objectid)
283 				continue;
284 
285 			start = logical[nr];
286 			if (start < cache->key.objectid) {
287 				start = cache->key.objectid;
288 				len = (logical[nr] + stripe_len) - start;
289 			} else {
290 				len = min_t(u64, stripe_len,
291 					    cache->key.objectid +
292 					    cache->key.offset - start);
293 			}
294 
295 			cache->bytes_super += len;
296 			ret = add_excluded_extent(root, start, len);
297 			if (ret) {
298 				kfree(logical);
299 				return ret;
300 			}
301 		}
302 
303 		kfree(logical);
304 	}
305 	return 0;
306 }
307 
308 static struct btrfs_caching_control *
309 get_caching_control(struct btrfs_block_group_cache *cache)
310 {
311 	struct btrfs_caching_control *ctl;
312 
313 	spin_lock(&cache->lock);
314 	if (!cache->caching_ctl) {
315 		spin_unlock(&cache->lock);
316 		return NULL;
317 	}
318 
319 	ctl = cache->caching_ctl;
320 	atomic_inc(&ctl->count);
321 	spin_unlock(&cache->lock);
322 	return ctl;
323 }
324 
325 static void put_caching_control(struct btrfs_caching_control *ctl)
326 {
327 	if (atomic_dec_and_test(&ctl->count))
328 		kfree(ctl);
329 }
330 
331 #ifdef CONFIG_BTRFS_DEBUG
332 static void fragment_free_space(struct btrfs_root *root,
333 				struct btrfs_block_group_cache *block_group)
334 {
335 	u64 start = block_group->key.objectid;
336 	u64 len = block_group->key.offset;
337 	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
338 		root->nodesize : root->sectorsize;
339 	u64 step = chunk << 1;
340 
341 	while (len > chunk) {
342 		btrfs_remove_free_space(block_group, start, chunk);
343 		start += step;
344 		if (len < step)
345 			len = 0;
346 		else
347 			len -= step;
348 	}
349 }
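/*
 * Illustration of the loop above: with a 4K chunk it removes
 * [start, start + 4K) from the free space cache, keeps the following 4K,
 * removes the next 4K, and so on, so roughly half of the block group's
 * free space disappears.  This is what the "bytes_used >> 1" adjustment in
 * caching_thread() and cache_block_group() below accounts for.
 */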
350 #endif
351 
352 /*
353  * This is only called by cache_block_group.  Since we could have freed extents,
354  * we need to check the pinned_extents for any extents that can't be used yet,
355  * because their free space will be released as soon as the transaction commits.
356  */
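/*
 * For example, if [start, end) spans 1G of the block group and a single 1M
 * extent in the middle of it is still pinned, the loop below adds the two
 * ranges on either side of the pinned extent to the free space cache and
 * roughly 1G - 1M is returned as the amount of free space added.
 */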
357 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
358 		       struct btrfs_fs_info *info, u64 start, u64 end)
359 {
360 	u64 extent_start, extent_end, size, total_added = 0;
361 	int ret;
362 
363 	while (start < end) {
364 		ret = find_first_extent_bit(info->pinned_extents, start,
365 					    &extent_start, &extent_end,
366 					    EXTENT_DIRTY | EXTENT_UPTODATE,
367 					    NULL);
368 		if (ret)
369 			break;
370 
371 		if (extent_start <= start) {
372 			start = extent_end + 1;
373 		} else if (extent_start > start && extent_start < end) {
374 			size = extent_start - start;
375 			total_added += size;
376 			ret = btrfs_add_free_space(block_group, start,
377 						   size);
378 			BUG_ON(ret); /* -ENOMEM or logic error */
379 			start = extent_end + 1;
380 		} else {
381 			break;
382 		}
383 	}
384 
385 	if (start < end) {
386 		size = end - start;
387 		total_added += size;
388 		ret = btrfs_add_free_space(block_group, start, size);
389 		BUG_ON(ret); /* -ENOMEM or logic error */
390 	}
391 
392 	return total_added;
393 }
394 
395 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
396 {
397 	struct btrfs_block_group_cache *block_group;
398 	struct btrfs_fs_info *fs_info;
399 	struct btrfs_root *extent_root;
400 	struct btrfs_path *path;
401 	struct extent_buffer *leaf;
402 	struct btrfs_key key;
403 	u64 total_found = 0;
404 	u64 last = 0;
405 	u32 nritems;
406 	int ret;
407 	bool wakeup = true;
408 
409 	block_group = caching_ctl->block_group;
410 	fs_info = block_group->fs_info;
411 	extent_root = fs_info->extent_root;
412 
413 	path = btrfs_alloc_path();
414 	if (!path)
415 		return -ENOMEM;
416 
417 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
418 
419 #ifdef CONFIG_BTRFS_DEBUG
420 	/*
421 	 * If we're fragmenting we don't want to make anybody think we can
422 	 * allocate from this block group until we've had a chance to fragment
423 	 * the free space.
424 	 */
425 	if (btrfs_should_fragment_free_space(extent_root, block_group))
426 		wakeup = false;
427 #endif
428 	/*
429 	 * We don't want to deadlock with somebody trying to allocate a new
430 	 * extent for the extent root while also trying to search the extent
431 	 * root to add free space.  So we skip locking and search the commit
432 	 * root, since its read-only
433 	 */
434 	path->skip_locking = 1;
435 	path->search_commit_root = 1;
436 	path->reada = READA_FORWARD;
437 
438 	key.objectid = last;
439 	key.offset = 0;
440 	key.type = BTRFS_EXTENT_ITEM_KEY;
441 
442 next:
443 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
444 	if (ret < 0)
445 		goto out;
446 
447 	leaf = path->nodes[0];
448 	nritems = btrfs_header_nritems(leaf);
449 
450 	while (1) {
451 		if (btrfs_fs_closing(fs_info) > 1) {
452 			last = (u64)-1;
453 			break;
454 		}
455 
456 		if (path->slots[0] < nritems) {
457 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
458 		} else {
459 			ret = find_next_key(path, 0, &key);
460 			if (ret)
461 				break;
462 
463 			if (need_resched() ||
464 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
465 				if (wakeup)
466 					caching_ctl->progress = last;
467 				btrfs_release_path(path);
468 				up_read(&fs_info->commit_root_sem);
469 				mutex_unlock(&caching_ctl->mutex);
470 				cond_resched();
471 				mutex_lock(&caching_ctl->mutex);
472 				down_read(&fs_info->commit_root_sem);
473 				goto next;
474 			}
475 
476 			ret = btrfs_next_leaf(extent_root, path);
477 			if (ret < 0)
478 				goto out;
479 			if (ret)
480 				break;
481 			leaf = path->nodes[0];
482 			nritems = btrfs_header_nritems(leaf);
483 			continue;
484 		}
485 
486 		if (key.objectid < last) {
487 			key.objectid = last;
488 			key.offset = 0;
489 			key.type = BTRFS_EXTENT_ITEM_KEY;
490 
491 			if (wakeup)
492 				caching_ctl->progress = last;
493 			btrfs_release_path(path);
494 			goto next;
495 		}
496 
497 		if (key.objectid < block_group->key.objectid) {
498 			path->slots[0]++;
499 			continue;
500 		}
501 
502 		if (key.objectid >= block_group->key.objectid +
503 		    block_group->key.offset)
504 			break;
505 
506 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
507 		    key.type == BTRFS_METADATA_ITEM_KEY) {
508 			total_found += add_new_free_space(block_group,
509 							  fs_info, last,
510 							  key.objectid);
511 			if (key.type == BTRFS_METADATA_ITEM_KEY)
512 				last = key.objectid +
513 					fs_info->tree_root->nodesize;
514 			else
515 				last = key.objectid + key.offset;
516 
517 			if (total_found > CACHING_CTL_WAKE_UP) {
518 				total_found = 0;
519 				if (wakeup)
520 					wake_up(&caching_ctl->wait);
521 			}
522 		}
523 		path->slots[0]++;
524 	}
525 	ret = 0;
526 
527 	total_found += add_new_free_space(block_group, fs_info, last,
528 					  block_group->key.objectid +
529 					  block_group->key.offset);
530 	caching_ctl->progress = (u64)-1;
531 
532 out:
533 	btrfs_free_path(path);
534 	return ret;
535 }
536 
537 static noinline void caching_thread(struct btrfs_work *work)
538 {
539 	struct btrfs_block_group_cache *block_group;
540 	struct btrfs_fs_info *fs_info;
541 	struct btrfs_caching_control *caching_ctl;
542 	struct btrfs_root *extent_root;
543 	int ret;
544 
545 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
546 	block_group = caching_ctl->block_group;
547 	fs_info = block_group->fs_info;
548 	extent_root = fs_info->extent_root;
549 
550 	mutex_lock(&caching_ctl->mutex);
551 	down_read(&fs_info->commit_root_sem);
552 
553 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
554 		ret = load_free_space_tree(caching_ctl);
555 	else
556 		ret = load_extent_tree_free(caching_ctl);
557 
558 	spin_lock(&block_group->lock);
559 	block_group->caching_ctl = NULL;
560 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
561 	spin_unlock(&block_group->lock);
562 
563 #ifdef CONFIG_BTRFS_DEBUG
564 	if (btrfs_should_fragment_free_space(extent_root, block_group)) {
565 		u64 bytes_used;
566 
567 		spin_lock(&block_group->space_info->lock);
568 		spin_lock(&block_group->lock);
569 		bytes_used = block_group->key.offset -
570 			btrfs_block_group_used(&block_group->item);
571 		block_group->space_info->bytes_used += bytes_used >> 1;
572 		spin_unlock(&block_group->lock);
573 		spin_unlock(&block_group->space_info->lock);
574 		fragment_free_space(extent_root, block_group);
575 	}
576 #endif
577 
578 	caching_ctl->progress = (u64)-1;
579 
580 	up_read(&fs_info->commit_root_sem);
581 	free_excluded_extents(fs_info->extent_root, block_group);
582 	mutex_unlock(&caching_ctl->mutex);
583 
584 	wake_up(&caching_ctl->wait);
585 
586 	put_caching_control(caching_ctl);
587 	btrfs_put_block_group(block_group);
588 }
589 
590 static int cache_block_group(struct btrfs_block_group_cache *cache,
591 			     int load_cache_only)
592 {
593 	DEFINE_WAIT(wait);
594 	struct btrfs_fs_info *fs_info = cache->fs_info;
595 	struct btrfs_caching_control *caching_ctl;
596 	int ret = 0;
597 
598 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
599 	if (!caching_ctl)
600 		return -ENOMEM;
601 
602 	INIT_LIST_HEAD(&caching_ctl->list);
603 	mutex_init(&caching_ctl->mutex);
604 	init_waitqueue_head(&caching_ctl->wait);
605 	caching_ctl->block_group = cache;
606 	caching_ctl->progress = cache->key.objectid;
607 	atomic_set(&caching_ctl->count, 1);
608 	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
609 			caching_thread, NULL, NULL);
610 
611 	spin_lock(&cache->lock);
612 	/*
613 	 * This should be a rare occasion, but this could happen I think in the
614 	 * case where one thread starts to load the space cache info, and then
615 	 * some other thread starts a transaction commit which tries to do an
616 	 * allocation while the other thread is still loading the space cache
617 	 * info.  The previous loop should have kept us from choosing this block
618 	 * group, but if we've moved to the state where we will wait on caching
619 	 * block groups we need to first check if we're doing a fast load here,
620 	 * so we can wait for it to finish, otherwise we could end up allocating
621  * from a block group whose cache gets evicted for one reason or
622 	 * another.
623 	 */
624 	while (cache->cached == BTRFS_CACHE_FAST) {
625 		struct btrfs_caching_control *ctl;
626 
627 		ctl = cache->caching_ctl;
628 		atomic_inc(&ctl->count);
629 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
630 		spin_unlock(&cache->lock);
631 
632 		schedule();
633 
634 		finish_wait(&ctl->wait, &wait);
635 		put_caching_control(ctl);
636 		spin_lock(&cache->lock);
637 	}
638 
639 	if (cache->cached != BTRFS_CACHE_NO) {
640 		spin_unlock(&cache->lock);
641 		kfree(caching_ctl);
642 		return 0;
643 	}
644 	WARN_ON(cache->caching_ctl);
645 	cache->caching_ctl = caching_ctl;
646 	cache->cached = BTRFS_CACHE_FAST;
647 	spin_unlock(&cache->lock);
648 
649 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
650 		mutex_lock(&caching_ctl->mutex);
651 		ret = load_free_space_cache(fs_info, cache);
652 
653 		spin_lock(&cache->lock);
654 		if (ret == 1) {
655 			cache->caching_ctl = NULL;
656 			cache->cached = BTRFS_CACHE_FINISHED;
657 			cache->last_byte_to_unpin = (u64)-1;
658 			caching_ctl->progress = (u64)-1;
659 		} else {
660 			if (load_cache_only) {
661 				cache->caching_ctl = NULL;
662 				cache->cached = BTRFS_CACHE_NO;
663 			} else {
664 				cache->cached = BTRFS_CACHE_STARTED;
665 				cache->has_caching_ctl = 1;
666 			}
667 		}
668 		spin_unlock(&cache->lock);
669 #ifdef CONFIG_BTRFS_DEBUG
670 		if (ret == 1 &&
671 		    btrfs_should_fragment_free_space(fs_info->extent_root,
672 						     cache)) {
673 			u64 bytes_used;
674 
675 			spin_lock(&cache->space_info->lock);
676 			spin_lock(&cache->lock);
677 			bytes_used = cache->key.offset -
678 				btrfs_block_group_used(&cache->item);
679 			cache->space_info->bytes_used += bytes_used >> 1;
680 			spin_unlock(&cache->lock);
681 			spin_unlock(&cache->space_info->lock);
682 			fragment_free_space(fs_info->extent_root, cache);
683 		}
684 #endif
685 		mutex_unlock(&caching_ctl->mutex);
686 
687 		wake_up(&caching_ctl->wait);
688 		if (ret == 1) {
689 			put_caching_control(caching_ctl);
690 			free_excluded_extents(fs_info->extent_root, cache);
691 			return 0;
692 		}
693 	} else {
694 		/*
695 		 * We're either using the free space tree or no caching at all.
696 		 * Set cached to the appropriate value and wakeup any waiters.
697 		 */
698 		spin_lock(&cache->lock);
699 		if (load_cache_only) {
700 			cache->caching_ctl = NULL;
701 			cache->cached = BTRFS_CACHE_NO;
702 		} else {
703 			cache->cached = BTRFS_CACHE_STARTED;
704 			cache->has_caching_ctl = 1;
705 		}
706 		spin_unlock(&cache->lock);
707 		wake_up(&caching_ctl->wait);
708 	}
709 
710 	if (load_cache_only) {
711 		put_caching_control(caching_ctl);
712 		return 0;
713 	}
714 
715 	down_write(&fs_info->commit_root_sem);
716 	atomic_inc(&caching_ctl->count);
717 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
718 	up_write(&fs_info->commit_root_sem);
719 
720 	btrfs_get_block_group(cache);
721 
722 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
723 
724 	return ret;
725 }
726 
727 /*
728  * return the block group that starts at or after bytenr
729  */
730 static struct btrfs_block_group_cache *
731 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
732 {
733 	return block_group_cache_tree_search(info, bytenr, 0);
734 }
735 
736 /*
737  * return the block group that contains the given bytenr
738  */
739 struct btrfs_block_group_cache *btrfs_lookup_block_group(
740 						 struct btrfs_fs_info *info,
741 						 u64 bytenr)
742 {
743 	return block_group_cache_tree_search(info, bytenr, 1);
744 }
745 
746 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
747 						  u64 flags)
748 {
749 	struct list_head *head = &info->space_info;
750 	struct btrfs_space_info *found;
751 
752 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
753 
754 	rcu_read_lock();
755 	list_for_each_entry_rcu(found, head, list) {
756 		if (found->flags & flags) {
757 			rcu_read_unlock();
758 			return found;
759 		}
760 	}
761 	rcu_read_unlock();
762 	return NULL;
763 }
764 
765 /*
766  * after adding space to the filesystem, we need to clear the full flags
767  * on all the space infos.
768  */
769 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
770 {
771 	struct list_head *head = &info->space_info;
772 	struct btrfs_space_info *found;
773 
774 	rcu_read_lock();
775 	list_for_each_entry_rcu(found, head, list)
776 		found->full = 0;
777 	rcu_read_unlock();
778 }
779 
780 /* simple helper to search for an existing data extent at a given offset */
781 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
782 {
783 	int ret;
784 	struct btrfs_key key;
785 	struct btrfs_path *path;
786 
787 	path = btrfs_alloc_path();
788 	if (!path)
789 		return -ENOMEM;
790 
791 	key.objectid = start;
792 	key.offset = len;
793 	key.type = BTRFS_EXTENT_ITEM_KEY;
794 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
795 				0, 0);
796 	btrfs_free_path(path);
797 	return ret;
798 }
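/*
 * The return value above comes straight from btrfs_search_slot(): 0 means
 * an extent item keyed exactly (start, BTRFS_EXTENT_ITEM_KEY, len) exists,
 * a positive value means no such item was found, and a negative value is
 * an error.
 */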
799 
800 /*
801  * helper function to look up the reference count and flags of a tree block.
802  *
803  * The head node for a delayed ref is used to store the sum of all the
804  * reference count modifications queued up in the rbtree. The head
805  * node may also store the extent flags to set. This way you can check
806  * what the reference count and extent flags will be once all of the
807  * queued delayed refs have been processed.
808  */
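/*
 * Concrete example (made-up numbers): if the on-disk extent item records 3
 * references and the delayed ref head for this bytenr has accumulated a
 * not-yet-run ref_mod of -1, *refs is reported as 2, i.e. the count the
 * extent will have once the queued delayed refs are processed.
 */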
809 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
810 			     struct btrfs_root *root, u64 bytenr,
811 			     u64 offset, int metadata, u64 *refs, u64 *flags)
812 {
813 	struct btrfs_delayed_ref_head *head;
814 	struct btrfs_delayed_ref_root *delayed_refs;
815 	struct btrfs_path *path;
816 	struct btrfs_extent_item *ei;
817 	struct extent_buffer *leaf;
818 	struct btrfs_key key;
819 	u32 item_size;
820 	u64 num_refs;
821 	u64 extent_flags;
822 	int ret;
823 
824 	/*
825 	 * If we don't have skinny metadata, don't bother doing anything
826 	 * different
827 	 */
828 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
829 		offset = root->nodesize;
830 		metadata = 0;
831 	}
832 
833 	path = btrfs_alloc_path();
834 	if (!path)
835 		return -ENOMEM;
836 
837 	if (!trans) {
838 		path->skip_locking = 1;
839 		path->search_commit_root = 1;
840 	}
841 
842 search_again:
843 	key.objectid = bytenr;
844 	key.offset = offset;
845 	if (metadata)
846 		key.type = BTRFS_METADATA_ITEM_KEY;
847 	else
848 		key.type = BTRFS_EXTENT_ITEM_KEY;
849 
850 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
851 				&key, path, 0, 0);
852 	if (ret < 0)
853 		goto out_free;
854 
855 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
856 		if (path->slots[0]) {
857 			path->slots[0]--;
858 			btrfs_item_key_to_cpu(path->nodes[0], &key,
859 					      path->slots[0]);
860 			if (key.objectid == bytenr &&
861 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
862 			    key.offset == root->nodesize)
863 				ret = 0;
864 		}
865 	}
866 
867 	if (ret == 0) {
868 		leaf = path->nodes[0];
869 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
870 		if (item_size >= sizeof(*ei)) {
871 			ei = btrfs_item_ptr(leaf, path->slots[0],
872 					    struct btrfs_extent_item);
873 			num_refs = btrfs_extent_refs(leaf, ei);
874 			extent_flags = btrfs_extent_flags(leaf, ei);
875 		} else {
876 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
877 			struct btrfs_extent_item_v0 *ei0;
878 			BUG_ON(item_size != sizeof(*ei0));
879 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
880 					     struct btrfs_extent_item_v0);
881 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
882 			/* FIXME: this isn't correct for data */
883 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
884 #else
885 			BUG();
886 #endif
887 		}
888 		BUG_ON(num_refs == 0);
889 	} else {
890 		num_refs = 0;
891 		extent_flags = 0;
892 		ret = 0;
893 	}
894 
895 	if (!trans)
896 		goto out;
897 
898 	delayed_refs = &trans->transaction->delayed_refs;
899 	spin_lock(&delayed_refs->lock);
900 	head = btrfs_find_delayed_ref_head(trans, bytenr);
901 	if (head) {
902 		if (!mutex_trylock(&head->mutex)) {
903 			atomic_inc(&head->node.refs);
904 			spin_unlock(&delayed_refs->lock);
905 
906 			btrfs_release_path(path);
907 
908 			/*
909 			 * Mutex was contended, block until it's released and try
910 			 * again
911 			 */
912 			mutex_lock(&head->mutex);
913 			mutex_unlock(&head->mutex);
914 			btrfs_put_delayed_ref(&head->node);
915 			goto search_again;
916 		}
917 		spin_lock(&head->lock);
918 		if (head->extent_op && head->extent_op->update_flags)
919 			extent_flags |= head->extent_op->flags_to_set;
920 		else
921 			BUG_ON(num_refs == 0);
922 
923 		num_refs += head->node.ref_mod;
924 		spin_unlock(&head->lock);
925 		mutex_unlock(&head->mutex);
926 	}
927 	spin_unlock(&delayed_refs->lock);
928 out:
929 	WARN_ON(num_refs == 0);
930 	if (refs)
931 		*refs = num_refs;
932 	if (flags)
933 		*flags = extent_flags;
934 out_free:
935 	btrfs_free_path(path);
936 	return ret;
937 }
938 
939 /*
940  * Back reference rules.  Back refs have three main goals:
941  *
942  * 1) differentiate between all holders of references to an extent so that
943  *    when a reference is dropped we can make sure it was a valid reference
944  *    before freeing the extent.
945  *
946  * 2) Provide enough information to quickly find the holders of an extent
947  *    if we notice a given block is corrupted or bad.
948  *
949  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
950  *    maintenance.  This is actually the same as #2, but with a slightly
951  *    different use case.
952  *
953  * There are two kinds of back refs. Implicit back refs are optimized
954  * for pointers in non-shared tree blocks. For a given pointer in a block,
955  * back refs of this kind provide information about the block's owner tree
956  * and the pointer's key. This information allows us to find the block by
957  * b-tree searching. Full back refs are for pointers in tree blocks not
958  * referenced by their owner trees. The location of the tree block is recorded
959  * in the back refs. Actually the full back refs are generic, and can be
960  * used in all cases where implicit back refs are used. The major shortcoming
961  * of the full back refs is their overhead. Every time a tree block gets
962  * COWed, we have to update the back ref entries for all pointers in it.
963  *
964  * For a newly allocated tree block, we use implicit back refs for
965  * pointers in it. This means most tree related operations only involve
966  * implicit back refs. For a tree block created in an old transaction, the
967  * only way to drop a reference to it is to COW it. So we can detect the
968  * event that a tree block loses its owner tree's reference and do the
969  * back refs conversion.
970  *
971  * When a tree block is COWed through a tree, there are four cases:
972  *
973  * The reference count of the block is one and the tree is the block's
974  * owner tree. Nothing to do in this case.
975  *
976  * The reference count of the block is one and the tree is not the
977  * block's owner tree. In this case, full back refs are used for pointers
978  * in the block. Remove these full back refs, add implicit back refs for
979  * every pointer in the new block.
980  *
981  * The reference count of the block is greater than one and the tree is
982  * the block's owner tree. In this case, implicit back refs are used for
983  * pointers in the block. Add full back refs for every pointer in the
984  * block, increase lower level extents' reference counts. The original
985  * implicit back refs are inherited by the new block.
986  *
987  * The reference count of the block is greater than one and the tree is
988  * not the block's owner tree. Add implicit back refs for every pointer in
989  * the new block, increase lower level extents' reference counts.
990  *
991  * Back Reference Key composing:
992  *
993  * The key objectid corresponds to the first byte in the extent,
994  * The key type is used to differentiate between types of back refs.
995  * There are different meanings of the key offset for different types
996  * of back refs.
997  *
998  * File extents can be referenced by:
999  *
1000  * - multiple snapshots, subvolumes, or different generations in one subvol
1001  * - different files inside a single subvolume
1002  * - different offsets inside a file (bookend extents in file.c)
1003  *
1004  * The extent ref structure for the implicit back refs has fields for:
1005  *
1006  * - Objectid of the subvolume root
1007  * - objectid of the file holding the reference
1008  * - original offset in the file
1009  * - how many bookend extents
1010  *
1011  * The key offset for the implicit back refs is a hash of the first
1012  * three fields.
1013  *
1014  * The extent ref structure for the full back refs has a field for:
1015  *
1016  * - number of pointers in the tree leaf
1017  *
1018  * The key offset for the full back refs is the first byte of
1019  * the tree leaf.
1020  *
1021  * When a file extent is allocated, the implicit back refs are used
1022  * and the fields are filled in:
1023  *
1024  *     (root_key.objectid, inode objectid, offset in file, 1)
1025  *
1026  * When a file extent is removed by file truncation, we find the
1027  * corresponding implicit back refs and check the following fields:
1028  *
1029  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1030  *
1031  * Btree extents can be referenced by:
1032  *
1033  * - Different subvolumes
1034  *
1035  * Both the implicit back refs and the full back refs for tree blocks
1036  * only consist of a key. The key offset for the implicit back refs is
1037  * the objectid of the block's owner tree. The key offset for the full
1038  * back refs is the first byte of the parent block.
1039  *
1040  * When implicit back refs are used, information about the lowest key and
1041  * level of the tree block is required. This information is stored in
1042  * the tree block info structure.
1043  */
1044 
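/*
 * Worked illustration of the data back ref layout described above (the
 * numbers are made up): a 128K data extent at logical byte 136314880,
 * referenced once by inode 257 at file offset 0 in subvolume 5, has an
 * extent item keyed
 *
 *	(136314880, BTRFS_EXTENT_ITEM_KEY, 131072)
 *
 * and an implicit back ref recording root = 5, objectid = 257, offset = 0
 * and count = 1.  When that ref is stored as a separate item rather than
 * inline, its key is
 *
 *	(136314880, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * (see hash_extent_data_ref() and insert_extent_data_ref() below).
 */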
1045 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1046 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1047 				  struct btrfs_root *root,
1048 				  struct btrfs_path *path,
1049 				  u64 owner, u32 extra_size)
1050 {
1051 	struct btrfs_extent_item *item;
1052 	struct btrfs_extent_item_v0 *ei0;
1053 	struct btrfs_extent_ref_v0 *ref0;
1054 	struct btrfs_tree_block_info *bi;
1055 	struct extent_buffer *leaf;
1056 	struct btrfs_key key;
1057 	struct btrfs_key found_key;
1058 	u32 new_size = sizeof(*item);
1059 	u64 refs;
1060 	int ret;
1061 
1062 	leaf = path->nodes[0];
1063 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1064 
1065 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1066 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
1067 			     struct btrfs_extent_item_v0);
1068 	refs = btrfs_extent_refs_v0(leaf, ei0);
1069 
1070 	if (owner == (u64)-1) {
1071 		while (1) {
1072 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1073 				ret = btrfs_next_leaf(root, path);
1074 				if (ret < 0)
1075 					return ret;
1076 				BUG_ON(ret > 0); /* Corruption */
1077 				leaf = path->nodes[0];
1078 			}
1079 			btrfs_item_key_to_cpu(leaf, &found_key,
1080 					      path->slots[0]);
1081 			BUG_ON(key.objectid != found_key.objectid);
1082 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1083 				path->slots[0]++;
1084 				continue;
1085 			}
1086 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1087 					      struct btrfs_extent_ref_v0);
1088 			owner = btrfs_ref_objectid_v0(leaf, ref0);
1089 			break;
1090 		}
1091 	}
1092 	btrfs_release_path(path);
1093 
1094 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1095 		new_size += sizeof(*bi);
1096 
1097 	new_size -= sizeof(*ei0);
1098 	ret = btrfs_search_slot(trans, root, &key, path,
1099 				new_size + extra_size, 1);
1100 	if (ret < 0)
1101 		return ret;
1102 	BUG_ON(ret); /* Corruption */
1103 
1104 	btrfs_extend_item(root, path, new_size);
1105 
1106 	leaf = path->nodes[0];
1107 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1108 	btrfs_set_extent_refs(leaf, item, refs);
1109 	/* FIXME: get real generation */
1110 	btrfs_set_extent_generation(leaf, item, 0);
1111 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1112 		btrfs_set_extent_flags(leaf, item,
1113 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1114 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1115 		bi = (struct btrfs_tree_block_info *)(item + 1);
1116 		/* FIXME: get first key of the block */
1117 		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1118 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
1119 	} else {
1120 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1121 	}
1122 	btrfs_mark_buffer_dirty(leaf);
1123 	return 0;
1124 }
1125 #endif
1126 
1127 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1128 {
1129 	u32 high_crc = ~(u32)0;
1130 	u32 low_crc = ~(u32)0;
1131 	__le64 lenum;
1132 
1133 	lenum = cpu_to_le64(root_objectid);
1134 	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1135 	lenum = cpu_to_le64(owner);
1136 	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1137 	lenum = cpu_to_le64(offset);
1138 	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1139 
1140 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1141 }
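/*
 * The value computed above is used as the key offset of EXTENT_DATA_REF
 * items.  Different (root, owner, offset) triples can hash to the same
 * value, which is why insert_extent_data_ref() below bumps key.offset and
 * retries on -EEXIST, and why lookup_extent_data_ref() keeps walking
 * forward until match_extent_data_ref() succeeds.
 */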
1142 
1143 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1144 				     struct btrfs_extent_data_ref *ref)
1145 {
1146 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1147 				    btrfs_extent_data_ref_objectid(leaf, ref),
1148 				    btrfs_extent_data_ref_offset(leaf, ref));
1149 }
1150 
1151 static int match_extent_data_ref(struct extent_buffer *leaf,
1152 				 struct btrfs_extent_data_ref *ref,
1153 				 u64 root_objectid, u64 owner, u64 offset)
1154 {
1155 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1156 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1157 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1158 		return 0;
1159 	return 1;
1160 }
1161 
1162 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1163 					   struct btrfs_root *root,
1164 					   struct btrfs_path *path,
1165 					   u64 bytenr, u64 parent,
1166 					   u64 root_objectid,
1167 					   u64 owner, u64 offset)
1168 {
1169 	struct btrfs_key key;
1170 	struct btrfs_extent_data_ref *ref;
1171 	struct extent_buffer *leaf;
1172 	u32 nritems;
1173 	int ret;
1174 	int recow;
1175 	int err = -ENOENT;
1176 
1177 	key.objectid = bytenr;
1178 	if (parent) {
1179 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1180 		key.offset = parent;
1181 	} else {
1182 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1183 		key.offset = hash_extent_data_ref(root_objectid,
1184 						  owner, offset);
1185 	}
1186 again:
1187 	recow = 0;
1188 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1189 	if (ret < 0) {
1190 		err = ret;
1191 		goto fail;
1192 	}
1193 
1194 	if (parent) {
1195 		if (!ret)
1196 			return 0;
1197 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1198 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1199 		btrfs_release_path(path);
1200 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1201 		if (ret < 0) {
1202 			err = ret;
1203 			goto fail;
1204 		}
1205 		if (!ret)
1206 			return 0;
1207 #endif
1208 		goto fail;
1209 	}
1210 
1211 	leaf = path->nodes[0];
1212 	nritems = btrfs_header_nritems(leaf);
1213 	while (1) {
1214 		if (path->slots[0] >= nritems) {
1215 			ret = btrfs_next_leaf(root, path);
1216 			if (ret < 0)
1217 				err = ret;
1218 			if (ret)
1219 				goto fail;
1220 
1221 			leaf = path->nodes[0];
1222 			nritems = btrfs_header_nritems(leaf);
1223 			recow = 1;
1224 		}
1225 
1226 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1227 		if (key.objectid != bytenr ||
1228 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1229 			goto fail;
1230 
1231 		ref = btrfs_item_ptr(leaf, path->slots[0],
1232 				     struct btrfs_extent_data_ref);
1233 
1234 		if (match_extent_data_ref(leaf, ref, root_objectid,
1235 					  owner, offset)) {
1236 			if (recow) {
1237 				btrfs_release_path(path);
1238 				goto again;
1239 			}
1240 			err = 0;
1241 			break;
1242 		}
1243 		path->slots[0]++;
1244 	}
1245 fail:
1246 	return err;
1247 }
1248 
1249 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1250 					   struct btrfs_root *root,
1251 					   struct btrfs_path *path,
1252 					   u64 bytenr, u64 parent,
1253 					   u64 root_objectid, u64 owner,
1254 					   u64 offset, int refs_to_add)
1255 {
1256 	struct btrfs_key key;
1257 	struct extent_buffer *leaf;
1258 	u32 size;
1259 	u32 num_refs;
1260 	int ret;
1261 
1262 	key.objectid = bytenr;
1263 	if (parent) {
1264 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1265 		key.offset = parent;
1266 		size = sizeof(struct btrfs_shared_data_ref);
1267 	} else {
1268 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1269 		key.offset = hash_extent_data_ref(root_objectid,
1270 						  owner, offset);
1271 		size = sizeof(struct btrfs_extent_data_ref);
1272 	}
1273 
1274 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1275 	if (ret && ret != -EEXIST)
1276 		goto fail;
1277 
1278 	leaf = path->nodes[0];
1279 	if (parent) {
1280 		struct btrfs_shared_data_ref *ref;
1281 		ref = btrfs_item_ptr(leaf, path->slots[0],
1282 				     struct btrfs_shared_data_ref);
1283 		if (ret == 0) {
1284 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1285 		} else {
1286 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1287 			num_refs += refs_to_add;
1288 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1289 		}
1290 	} else {
1291 		struct btrfs_extent_data_ref *ref;
1292 		while (ret == -EEXIST) {
1293 			ref = btrfs_item_ptr(leaf, path->slots[0],
1294 					     struct btrfs_extent_data_ref);
1295 			if (match_extent_data_ref(leaf, ref, root_objectid,
1296 						  owner, offset))
1297 				break;
1298 			btrfs_release_path(path);
1299 			key.offset++;
1300 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1301 						      size);
1302 			if (ret && ret != -EEXIST)
1303 				goto fail;
1304 
1305 			leaf = path->nodes[0];
1306 		}
1307 		ref = btrfs_item_ptr(leaf, path->slots[0],
1308 				     struct btrfs_extent_data_ref);
1309 		if (ret == 0) {
1310 			btrfs_set_extent_data_ref_root(leaf, ref,
1311 						       root_objectid);
1312 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1313 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1314 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1315 		} else {
1316 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1317 			num_refs += refs_to_add;
1318 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1319 		}
1320 	}
1321 	btrfs_mark_buffer_dirty(leaf);
1322 	ret = 0;
1323 fail:
1324 	btrfs_release_path(path);
1325 	return ret;
1326 }
1327 
1328 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1329 					   struct btrfs_root *root,
1330 					   struct btrfs_path *path,
1331 					   int refs_to_drop, int *last_ref)
1332 {
1333 	struct btrfs_key key;
1334 	struct btrfs_extent_data_ref *ref1 = NULL;
1335 	struct btrfs_shared_data_ref *ref2 = NULL;
1336 	struct extent_buffer *leaf;
1337 	u32 num_refs = 0;
1338 	int ret = 0;
1339 
1340 	leaf = path->nodes[0];
1341 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1342 
1343 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1344 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1345 				      struct btrfs_extent_data_ref);
1346 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1347 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1348 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1349 				      struct btrfs_shared_data_ref);
1350 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1351 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1352 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1353 		struct btrfs_extent_ref_v0 *ref0;
1354 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1355 				      struct btrfs_extent_ref_v0);
1356 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1357 #endif
1358 	} else {
1359 		BUG();
1360 	}
1361 
1362 	BUG_ON(num_refs < refs_to_drop);
1363 	num_refs -= refs_to_drop;
1364 
1365 	if (num_refs == 0) {
1366 		ret = btrfs_del_item(trans, root, path);
1367 		*last_ref = 1;
1368 	} else {
1369 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1370 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1371 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1372 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1373 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1374 		else {
1375 			struct btrfs_extent_ref_v0 *ref0;
1376 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1377 					struct btrfs_extent_ref_v0);
1378 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1379 		}
1380 #endif
1381 		btrfs_mark_buffer_dirty(leaf);
1382 	}
1383 	return ret;
1384 }
1385 
1386 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1387 					  struct btrfs_extent_inline_ref *iref)
1388 {
1389 	struct btrfs_key key;
1390 	struct extent_buffer *leaf;
1391 	struct btrfs_extent_data_ref *ref1;
1392 	struct btrfs_shared_data_ref *ref2;
1393 	u32 num_refs = 0;
1394 
1395 	leaf = path->nodes[0];
1396 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1397 	if (iref) {
1398 		if (btrfs_extent_inline_ref_type(leaf, iref) ==
1399 		    BTRFS_EXTENT_DATA_REF_KEY) {
1400 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1401 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1402 		} else {
1403 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1404 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1405 		}
1406 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1407 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1408 				      struct btrfs_extent_data_ref);
1409 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1410 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1411 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1412 				      struct btrfs_shared_data_ref);
1413 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1414 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1415 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1416 		struct btrfs_extent_ref_v0 *ref0;
1417 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1418 				      struct btrfs_extent_ref_v0);
1419 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1420 #endif
1421 	} else {
1422 		WARN_ON(1);
1423 	}
1424 	return num_refs;
1425 }
1426 
1427 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1428 					  struct btrfs_root *root,
1429 					  struct btrfs_path *path,
1430 					  u64 bytenr, u64 parent,
1431 					  u64 root_objectid)
1432 {
1433 	struct btrfs_key key;
1434 	int ret;
1435 
1436 	key.objectid = bytenr;
1437 	if (parent) {
1438 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1439 		key.offset = parent;
1440 	} else {
1441 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1442 		key.offset = root_objectid;
1443 	}
1444 
1445 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1446 	if (ret > 0)
1447 		ret = -ENOENT;
1448 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1449 	if (ret == -ENOENT && parent) {
1450 		btrfs_release_path(path);
1451 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1452 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1453 		if (ret > 0)
1454 			ret = -ENOENT;
1455 	}
1456 #endif
1457 	return ret;
1458 }
1459 
1460 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1461 					  struct btrfs_root *root,
1462 					  struct btrfs_path *path,
1463 					  u64 bytenr, u64 parent,
1464 					  u64 root_objectid)
1465 {
1466 	struct btrfs_key key;
1467 	int ret;
1468 
1469 	key.objectid = bytenr;
1470 	if (parent) {
1471 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1472 		key.offset = parent;
1473 	} else {
1474 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1475 		key.offset = root_objectid;
1476 	}
1477 
1478 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1479 	btrfs_release_path(path);
1480 	return ret;
1481 }
1482 
1483 static inline int extent_ref_type(u64 parent, u64 owner)
1484 {
1485 	int type;
1486 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1487 		if (parent > 0)
1488 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1489 		else
1490 			type = BTRFS_TREE_BLOCK_REF_KEY;
1491 	} else {
1492 		if (parent > 0)
1493 			type = BTRFS_SHARED_DATA_REF_KEY;
1494 		else
1495 			type = BTRFS_EXTENT_DATA_REF_KEY;
1496 	}
1497 	return type;
1498 }
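/*
 * Summary of the mapping implemented above, where "tree block" means
 * owner < BTRFS_FIRST_FREE_OBJECTID:
 *
 *	tree block, parent set	-> BTRFS_SHARED_BLOCK_REF_KEY (full back ref)
 *	tree block, no parent	-> BTRFS_TREE_BLOCK_REF_KEY   (implicit)
 *	data extent, parent set	-> BTRFS_SHARED_DATA_REF_KEY  (full back ref)
 *	data extent, no parent	-> BTRFS_EXTENT_DATA_REF_KEY  (implicit)
 */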
1499 
1500 static int find_next_key(struct btrfs_path *path, int level,
1501 			 struct btrfs_key *key)
1502 
1503 {
1504 	for (; level < BTRFS_MAX_LEVEL; level++) {
1505 		if (!path->nodes[level])
1506 			break;
1507 		if (path->slots[level] + 1 >=
1508 		    btrfs_header_nritems(path->nodes[level]))
1509 			continue;
1510 		if (level == 0)
1511 			btrfs_item_key_to_cpu(path->nodes[level], key,
1512 					      path->slots[level] + 1);
1513 		else
1514 			btrfs_node_key_to_cpu(path->nodes[level], key,
1515 					      path->slots[level] + 1);
1516 		return 0;
1517 	}
1518 	return 1;
1519 }
1520 
1521 /*
1522  * Look for an inline back ref. If the back ref is found, *ref_ret is set
1523  * to the address of the inline back ref, and 0 is returned.
1524  *
1525  * If the back ref isn't found, *ref_ret is set to the address where it
1526  * should be inserted, and -ENOENT is returned.
1527  *
1528  * If insert is true and there are too many inline back refs, the path
1529  * points to the extent item, and -EAGAIN is returned.
1530  *
1531  * NOTE: inline back refs are ordered in the same way that back ref
1532  *	 items in the tree are ordered.
1533  */
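/*
 * A rough sketch of how these return values are typically consumed
 * (compare insert_inline_extent_backref(), whose definition starts below):
 *
 *	ret = lookup_inline_extent_backref(trans, root, path, &iref, ...);
 *	if (ret == 0)
 *		update_inline_extent_backref(...);	// found: adjust refcount
 *	else if (ret == -ENOENT && insert)
 *		setup_inline_extent_backref(...);	// not found: add inline ref
 *	else if (ret == -EAGAIN)
 *		;	// no room inline: fall back to a separate back ref item
 */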
1534 static noinline_for_stack
1535 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1536 				 struct btrfs_root *root,
1537 				 struct btrfs_path *path,
1538 				 struct btrfs_extent_inline_ref **ref_ret,
1539 				 u64 bytenr, u64 num_bytes,
1540 				 u64 parent, u64 root_objectid,
1541 				 u64 owner, u64 offset, int insert)
1542 {
1543 	struct btrfs_key key;
1544 	struct extent_buffer *leaf;
1545 	struct btrfs_extent_item *ei;
1546 	struct btrfs_extent_inline_ref *iref;
1547 	u64 flags;
1548 	u64 item_size;
1549 	unsigned long ptr;
1550 	unsigned long end;
1551 	int extra_size;
1552 	int type;
1553 	int want;
1554 	int ret;
1555 	int err = 0;
1556 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1557 						 SKINNY_METADATA);
1558 
1559 	key.objectid = bytenr;
1560 	key.type = BTRFS_EXTENT_ITEM_KEY;
1561 	key.offset = num_bytes;
1562 
1563 	want = extent_ref_type(parent, owner);
1564 	if (insert) {
1565 		extra_size = btrfs_extent_inline_ref_size(want);
1566 		path->keep_locks = 1;
1567 	} else
1568 		extra_size = -1;
1569 
1570 	/*
1571 	 * Owner is our parent level, so we can just add one to get the level
1572 	 * for the block we are interested in.
1573 	 */
1574 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1575 		key.type = BTRFS_METADATA_ITEM_KEY;
1576 		key.offset = owner;
1577 	}
1578 
1579 again:
1580 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1581 	if (ret < 0) {
1582 		err = ret;
1583 		goto out;
1584 	}
1585 
1586 	/*
1587 	 * We may be a newly converted file system which still has the old fat
1588 	 * extent entries for metadata, so try and see if we have one of those.
1589 	 */
1590 	if (ret > 0 && skinny_metadata) {
1591 		skinny_metadata = false;
1592 		if (path->slots[0]) {
1593 			path->slots[0]--;
1594 			btrfs_item_key_to_cpu(path->nodes[0], &key,
1595 					      path->slots[0]);
1596 			if (key.objectid == bytenr &&
1597 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
1598 			    key.offset == num_bytes)
1599 				ret = 0;
1600 		}
1601 		if (ret) {
1602 			key.objectid = bytenr;
1603 			key.type = BTRFS_EXTENT_ITEM_KEY;
1604 			key.offset = num_bytes;
1605 			btrfs_release_path(path);
1606 			goto again;
1607 		}
1608 	}
1609 
1610 	if (ret && !insert) {
1611 		err = -ENOENT;
1612 		goto out;
1613 	} else if (WARN_ON(ret)) {
1614 		err = -EIO;
1615 		goto out;
1616 	}
1617 
1618 	leaf = path->nodes[0];
1619 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1620 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1621 	if (item_size < sizeof(*ei)) {
1622 		if (!insert) {
1623 			err = -ENOENT;
1624 			goto out;
1625 		}
1626 		ret = convert_extent_item_v0(trans, root, path, owner,
1627 					     extra_size);
1628 		if (ret < 0) {
1629 			err = ret;
1630 			goto out;
1631 		}
1632 		leaf = path->nodes[0];
1633 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1634 	}
1635 #endif
1636 	BUG_ON(item_size < sizeof(*ei));
1637 
1638 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1639 	flags = btrfs_extent_flags(leaf, ei);
1640 
1641 	ptr = (unsigned long)(ei + 1);
1642 	end = (unsigned long)ei + item_size;
1643 
1644 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1645 		ptr += sizeof(struct btrfs_tree_block_info);
1646 		BUG_ON(ptr > end);
1647 	}
1648 
1649 	err = -ENOENT;
1650 	while (1) {
1651 		if (ptr >= end) {
1652 			WARN_ON(ptr > end);
1653 			break;
1654 		}
1655 		iref = (struct btrfs_extent_inline_ref *)ptr;
1656 		type = btrfs_extent_inline_ref_type(leaf, iref);
1657 		if (want < type)
1658 			break;
1659 		if (want > type) {
1660 			ptr += btrfs_extent_inline_ref_size(type);
1661 			continue;
1662 		}
1663 
1664 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1665 			struct btrfs_extent_data_ref *dref;
1666 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1667 			if (match_extent_data_ref(leaf, dref, root_objectid,
1668 						  owner, offset)) {
1669 				err = 0;
1670 				break;
1671 			}
1672 			if (hash_extent_data_ref_item(leaf, dref) <
1673 			    hash_extent_data_ref(root_objectid, owner, offset))
1674 				break;
1675 		} else {
1676 			u64 ref_offset;
1677 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1678 			if (parent > 0) {
1679 				if (parent == ref_offset) {
1680 					err = 0;
1681 					break;
1682 				}
1683 				if (ref_offset < parent)
1684 					break;
1685 			} else {
1686 				if (root_objectid == ref_offset) {
1687 					err = 0;
1688 					break;
1689 				}
1690 				if (ref_offset < root_objectid)
1691 					break;
1692 			}
1693 		}
1694 		ptr += btrfs_extent_inline_ref_size(type);
1695 	}
1696 	if (err == -ENOENT && insert) {
1697 		if (item_size + extra_size >=
1698 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1699 			err = -EAGAIN;
1700 			goto out;
1701 		}
1702 		/*
1703 		 * To add a new inline back ref, we have to make sure
1704 		 * there is no corresponding back ref item.
1705 		 * For simplicity, we just do not add a new inline back
1706 		 * ref if there is any kind of item for this block.
1707 		 */
1708 		if (find_next_key(path, 0, &key) == 0 &&
1709 		    key.objectid == bytenr &&
1710 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1711 			err = -EAGAIN;
1712 			goto out;
1713 		}
1714 	}
1715 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1716 out:
1717 	if (insert) {
1718 		path->keep_locks = 0;
1719 		btrfs_unlock_up_safe(path, 1);
1720 	}
1721 	return err;
1722 }
1723 
1724 /*
1725  * helper to add new inline back ref
1726  */
1727 static noinline_for_stack
1728 void setup_inline_extent_backref(struct btrfs_root *root,
1729 				 struct btrfs_path *path,
1730 				 struct btrfs_extent_inline_ref *iref,
1731 				 u64 parent, u64 root_objectid,
1732 				 u64 owner, u64 offset, int refs_to_add,
1733 				 struct btrfs_delayed_extent_op *extent_op)
1734 {
1735 	struct extent_buffer *leaf;
1736 	struct btrfs_extent_item *ei;
1737 	unsigned long ptr;
1738 	unsigned long end;
1739 	unsigned long item_offset;
1740 	u64 refs;
1741 	int size;
1742 	int type;
1743 
1744 	leaf = path->nodes[0];
1745 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1746 	item_offset = (unsigned long)iref - (unsigned long)ei;
1747 
1748 	type = extent_ref_type(parent, owner);
1749 	size = btrfs_extent_inline_ref_size(type);
1750 
1751 	btrfs_extend_item(root, path, size);
1752 
1753 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1754 	refs = btrfs_extent_refs(leaf, ei);
1755 	refs += refs_to_add;
1756 	btrfs_set_extent_refs(leaf, ei, refs);
1757 	if (extent_op)
1758 		__run_delayed_extent_op(extent_op, leaf, ei);
1759 
1760 	ptr = (unsigned long)ei + item_offset;
1761 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1762 	if (ptr < end - size)
1763 		memmove_extent_buffer(leaf, ptr + size, ptr,
1764 				      end - size - ptr);
1765 
1766 	iref = (struct btrfs_extent_inline_ref *)ptr;
1767 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1768 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1769 		struct btrfs_extent_data_ref *dref;
1770 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1771 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1772 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1773 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1774 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1775 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1776 		struct btrfs_shared_data_ref *sref;
1777 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1778 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1779 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1780 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1781 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1782 	} else {
1783 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1784 	}
1785 	btrfs_mark_buffer_dirty(leaf);
1786 }
1787 
1788 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1789 				 struct btrfs_root *root,
1790 				 struct btrfs_path *path,
1791 				 struct btrfs_extent_inline_ref **ref_ret,
1792 				 u64 bytenr, u64 num_bytes, u64 parent,
1793 				 u64 root_objectid, u64 owner, u64 offset)
1794 {
1795 	int ret;
1796 
1797 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1798 					   bytenr, num_bytes, parent,
1799 					   root_objectid, owner, offset, 0);
1800 	if (ret != -ENOENT)
1801 		return ret;
1802 
1803 	btrfs_release_path(path);
1804 	*ref_ret = NULL;
1805 
1806 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1807 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1808 					    root_objectid);
1809 	} else {
1810 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1811 					     root_objectid, owner, offset);
1812 	}
1813 	return ret;
1814 }
1815 
1816 /*
1817  * helper to update/remove an inline back ref
1818  */
1819 static noinline_for_stack
1820 void update_inline_extent_backref(struct btrfs_root *root,
1821 				  struct btrfs_path *path,
1822 				  struct btrfs_extent_inline_ref *iref,
1823 				  int refs_to_mod,
1824 				  struct btrfs_delayed_extent_op *extent_op,
1825 				  int *last_ref)
1826 {
1827 	struct extent_buffer *leaf;
1828 	struct btrfs_extent_item *ei;
1829 	struct btrfs_extent_data_ref *dref = NULL;
1830 	struct btrfs_shared_data_ref *sref = NULL;
1831 	unsigned long ptr;
1832 	unsigned long end;
1833 	u32 item_size;
1834 	int size;
1835 	int type;
1836 	u64 refs;
1837 
1838 	leaf = path->nodes[0];
1839 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1840 	refs = btrfs_extent_refs(leaf, ei);
1841 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1842 	refs += refs_to_mod;
1843 	btrfs_set_extent_refs(leaf, ei, refs);
1844 	if (extent_op)
1845 		__run_delayed_extent_op(extent_op, leaf, ei);
1846 
1847 	type = btrfs_extent_inline_ref_type(leaf, iref);
1848 
1849 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1850 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1851 		refs = btrfs_extent_data_ref_count(leaf, dref);
1852 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1853 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1854 		refs = btrfs_shared_data_ref_count(leaf, sref);
1855 	} else {
1856 		refs = 1;
1857 		BUG_ON(refs_to_mod != -1);
1858 	}
1859 
1860 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1861 	refs += refs_to_mod;
1862 
1863 	if (refs > 0) {
1864 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1865 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1866 		else
1867 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1868 	} else {
1869 		*last_ref = 1;
1870 		size =  btrfs_extent_inline_ref_size(type);
1871 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1872 		ptr = (unsigned long)iref;
1873 		end = (unsigned long)ei + item_size;
1874 		if (ptr + size < end)
1875 			memmove_extent_buffer(leaf, ptr, ptr + size,
1876 					      end - ptr - size);
1877 		item_size -= size;
1878 		btrfs_truncate_item(root, path, item_size, 1);
1879 	}
1880 	btrfs_mark_buffer_dirty(leaf);
1881 }
1882 
1883 static noinline_for_stack
1884 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1885 				 struct btrfs_root *root,
1886 				 struct btrfs_path *path,
1887 				 u64 bytenr, u64 num_bytes, u64 parent,
1888 				 u64 root_objectid, u64 owner,
1889 				 u64 offset, int refs_to_add,
1890 				 struct btrfs_delayed_extent_op *extent_op)
1891 {
1892 	struct btrfs_extent_inline_ref *iref;
1893 	int ret;
1894 
1895 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1896 					   bytenr, num_bytes, parent,
1897 					   root_objectid, owner, offset, 1);
1898 	if (ret == 0) {
1899 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1900 		update_inline_extent_backref(root, path, iref,
1901 					     refs_to_add, extent_op, NULL);
1902 	} else if (ret == -ENOENT) {
1903 		setup_inline_extent_backref(root, path, iref, parent,
1904 					    root_objectid, owner, offset,
1905 					    refs_to_add, extent_op);
1906 		ret = 0;
1907 	}
1908 	return ret;
1909 }
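/*
 * Outcome summary (illustrative, derived from the lookup above): a 0 return
 * from lookup_inline_extent_backref() means an inline ref of the same type
 * already existed and only its count was bumped; -ENOENT means there was
 * still room, so a brand new inline ref was packed into the extent item;
 * -EAGAIN is passed straight through so __btrfs_inc_extent_ref() below can
 * fall back to a separate keyed backref item.
 */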
1910 
1911 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1912 				 struct btrfs_root *root,
1913 				 struct btrfs_path *path,
1914 				 u64 bytenr, u64 parent, u64 root_objectid,
1915 				 u64 owner, u64 offset, int refs_to_add)
1916 {
1917 	int ret;
1918 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1919 		BUG_ON(refs_to_add != 1);
1920 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1921 					    parent, root_objectid);
1922 	} else {
1923 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1924 					     parent, root_objectid,
1925 					     owner, offset, refs_to_add);
1926 	}
1927 	return ret;
1928 }
1929 
1930 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1931 				 struct btrfs_root *root,
1932 				 struct btrfs_path *path,
1933 				 struct btrfs_extent_inline_ref *iref,
1934 				 int refs_to_drop, int is_data, int *last_ref)
1935 {
1936 	int ret = 0;
1937 
1938 	BUG_ON(!is_data && refs_to_drop != 1);
1939 	if (iref) {
1940 		update_inline_extent_backref(root, path, iref,
1941 					     -refs_to_drop, NULL, last_ref);
1942 	} else if (is_data) {
1943 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1944 					     last_ref);
1945 	} else {
1946 		*last_ref = 1;
1947 		ret = btrfs_del_item(trans, root, path);
1948 	}
1949 	return ret;
1950 }
1951 
1952 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1953 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1954 			       u64 *discarded_bytes)
1955 {
1956 	int j, ret = 0;
1957 	u64 bytes_left, end;
1958 	u64 aligned_start = ALIGN(start, 1 << 9);
1959 
1960 	if (WARN_ON(start != aligned_start)) {
1961 		len -= aligned_start - start;
1962 		len = round_down(len, 1 << 9);
1963 		start = aligned_start;
1964 	}
1965 
1966 	*discarded_bytes = 0;
1967 
1968 	if (!len)
1969 		return 0;
1970 
1971 	end = start + len;
1972 	bytes_left = len;
1973 
1974 	/* Skip any superblocks on this device. */
1975 	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1976 		u64 sb_start = btrfs_sb_offset(j);
1977 		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1978 		u64 size = sb_start - start;
1979 
1980 		if (!in_range(sb_start, start, bytes_left) &&
1981 		    !in_range(sb_end, start, bytes_left) &&
1982 		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1983 			continue;
1984 
1985 		/*
1986 		 * Superblock spans beginning of range.  Adjust start and
1987 		 * try again.
1988 		 */
1989 		if (sb_start <= start) {
1990 			start += sb_end - start;
1991 			if (start > end) {
1992 				bytes_left = 0;
1993 				break;
1994 			}
1995 			bytes_left = end - start;
1996 			continue;
1997 		}
1998 
1999 		if (size) {
2000 			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2001 						   GFP_NOFS, 0);
2002 			if (!ret)
2003 				*discarded_bytes += size;
2004 			else if (ret != -EOPNOTSUPP)
2005 				return ret;
2006 		}
2007 
2008 		start = sb_end;
2009 		if (start > end) {
2010 			bytes_left = 0;
2011 			break;
2012 		}
2013 		bytes_left = end - start;
2014 	}
2015 
2016 	if (bytes_left) {
2017 		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2018 					   GFP_NOFS, 0);
2019 		if (!ret)
2020 			*discarded_bytes += bytes_left;
2021 	}
2022 	return ret;
2023 }
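/*
 * Worked example for the superblock skipping above (illustrative offsets;
 * on current layouts the primary superblock sits at 64K and is
 * BTRFS_SUPER_INFO_SIZE (4K) long): a request to discard [60K, 76K)
 * overlaps that superblock, so the loop first discards [60K, 64K), advances
 * start past the superblock to 68K, and the catch-all call after the loop
 * discards the remaining [68K, 76K).  Ranges that touch no superblock
 * mirror fall straight through to that final call.
 */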
2024 
2025 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2026 			 u64 num_bytes, u64 *actual_bytes)
2027 {
2028 	int ret;
2029 	u64 discarded_bytes = 0;
2030 	struct btrfs_bio *bbio = NULL;
2031 
2032 
2033 	/*
2034 	 * Avoid races with device replace and make sure our bbio has devices
2035 	 * associated with its stripes that don't go away while we are discarding.
2036 	 */
2037 	btrfs_bio_counter_inc_blocked(root->fs_info);
2038 	/* Tell the block device(s) that the sectors can be discarded */
2039 	ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
2040 			      bytenr, &num_bytes, &bbio, 0);
2041 	/* Error condition is -ENOMEM */
2042 	if (!ret) {
2043 		struct btrfs_bio_stripe *stripe = bbio->stripes;
2044 		int i;
2045 
2046 
2047 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2048 			u64 bytes;
2049 			if (!stripe->dev->can_discard)
2050 				continue;
2051 
2052 			ret = btrfs_issue_discard(stripe->dev->bdev,
2053 						  stripe->physical,
2054 						  stripe->length,
2055 						  &bytes);
2056 			if (!ret)
2057 				discarded_bytes += bytes;
2058 			else if (ret != -EOPNOTSUPP)
2059 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2060 
2061 			/*
2062 			 * Just in case we get back EOPNOTSUPP for some reason,
2063 			 * just ignore the return value so we don't screw up
2064 			 * people calling discard_extent.
2065 			 */
2066 			ret = 0;
2067 		}
2068 		btrfs_put_bbio(bbio);
2069 	}
2070 	btrfs_bio_counter_dec(root->fs_info);
2071 
2072 	if (actual_bytes)
2073 		*actual_bytes = discarded_bytes;
2074 
2075 
2076 	if (ret == -EOPNOTSUPP)
2077 		ret = 0;
2078 	return ret;
2079 }
2080 
2081 /* Can return -ENOMEM */
2082 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2083 			 struct btrfs_root *root,
2084 			 u64 bytenr, u64 num_bytes, u64 parent,
2085 			 u64 root_objectid, u64 owner, u64 offset)
2086 {
2087 	int ret;
2088 	struct btrfs_fs_info *fs_info = root->fs_info;
2089 
2090 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2091 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
2092 
2093 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2094 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2095 					num_bytes,
2096 					parent, root_objectid, (int)owner,
2097 					BTRFS_ADD_DELAYED_REF, NULL);
2098 	} else {
2099 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2100 					num_bytes, parent, root_objectid,
2101 					owner, offset, 0,
2102 					BTRFS_ADD_DELAYED_REF, NULL);
2103 	}
2104 	return ret;
2105 }
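/*
 * Illustrative call pattern (hypothetical caller, not taken from this file):
 * when a data extent gains an additional owner, e.g. during a reflink style
 * clone into @root, the new reference is queued roughly as:
 *
 *	ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, disk_num_bytes,
 *				   0, root->root_key.objectid,
 *				   btrfs_ino(inode), dest_off - extent_offset);
 *
 * parent is 0 because the ref is keyed by (root, inode, offset) rather than
 * by the owning tree block; the extent tree itself is only touched later,
 * when the delayed refs queued here are run.
 */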
2106 
2107 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2108 				  struct btrfs_root *root,
2109 				  struct btrfs_delayed_ref_node *node,
2110 				  u64 parent, u64 root_objectid,
2111 				  u64 owner, u64 offset, int refs_to_add,
2112 				  struct btrfs_delayed_extent_op *extent_op)
2113 {
2114 	struct btrfs_fs_info *fs_info = root->fs_info;
2115 	struct btrfs_path *path;
2116 	struct extent_buffer *leaf;
2117 	struct btrfs_extent_item *item;
2118 	struct btrfs_key key;
2119 	u64 bytenr = node->bytenr;
2120 	u64 num_bytes = node->num_bytes;
2121 	u64 refs;
2122 	int ret;
2123 
2124 	path = btrfs_alloc_path();
2125 	if (!path)
2126 		return -ENOMEM;
2127 
2128 	path->reada = READA_FORWARD;
2129 	path->leave_spinning = 1;
2130 	/* this will setup the path even if it fails to insert the back ref */
2131 	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2132 					   bytenr, num_bytes, parent,
2133 					   root_objectid, owner, offset,
2134 					   refs_to_add, extent_op);
2135 	if ((ret < 0 && ret != -EAGAIN) || !ret)
2136 		goto out;
2137 
2138 	/*
2139 	 * Ok we had -EAGAIN which means we didn't have space to insert an
2140 	 * inline extent ref, so just update the reference count and add a
2141 	 * normal backref.
2142 	 */
2143 	leaf = path->nodes[0];
2144 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2145 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2146 	refs = btrfs_extent_refs(leaf, item);
2147 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2148 	if (extent_op)
2149 		__run_delayed_extent_op(extent_op, leaf, item);
2150 
2151 	btrfs_mark_buffer_dirty(leaf);
2152 	btrfs_release_path(path);
2153 
2154 	path->reada = READA_FORWARD;
2155 	path->leave_spinning = 1;
2156 	/* now insert the actual backref */
2157 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
2158 				    path, bytenr, parent, root_objectid,
2159 				    owner, offset, refs_to_add);
2160 	if (ret)
2161 		btrfs_abort_transaction(trans, ret);
2162 out:
2163 	btrfs_free_path(path);
2164 	return ret;
2165 }
2166 
2167 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2168 				struct btrfs_root *root,
2169 				struct btrfs_delayed_ref_node *node,
2170 				struct btrfs_delayed_extent_op *extent_op,
2171 				int insert_reserved)
2172 {
2173 	int ret = 0;
2174 	struct btrfs_delayed_data_ref *ref;
2175 	struct btrfs_key ins;
2176 	u64 parent = 0;
2177 	u64 ref_root = 0;
2178 	u64 flags = 0;
2179 
2180 	ins.objectid = node->bytenr;
2181 	ins.offset = node->num_bytes;
2182 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2183 
2184 	ref = btrfs_delayed_node_to_data_ref(node);
2185 	trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
2186 
2187 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2188 		parent = ref->parent;
2189 	ref_root = ref->root;
2190 
2191 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2192 		if (extent_op)
2193 			flags |= extent_op->flags_to_set;
2194 		ret = alloc_reserved_file_extent(trans, root,
2195 						 parent, ref_root, flags,
2196 						 ref->objectid, ref->offset,
2197 						 &ins, node->ref_mod);
2198 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2199 		ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2200 					     ref_root, ref->objectid,
2201 					     ref->offset, node->ref_mod,
2202 					     extent_op);
2203 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2204 		ret = __btrfs_free_extent(trans, root, node, parent,
2205 					  ref_root, ref->objectid,
2206 					  ref->offset, node->ref_mod,
2207 					  extent_op);
2208 	} else {
2209 		BUG();
2210 	}
2211 	return ret;
2212 }
2213 
2214 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2215 				    struct extent_buffer *leaf,
2216 				    struct btrfs_extent_item *ei)
2217 {
2218 	u64 flags = btrfs_extent_flags(leaf, ei);
2219 	if (extent_op->update_flags) {
2220 		flags |= extent_op->flags_to_set;
2221 		btrfs_set_extent_flags(leaf, ei, flags);
2222 	}
2223 
2224 	if (extent_op->update_key) {
2225 		struct btrfs_tree_block_info *bi;
2226 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2227 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2228 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2229 	}
2230 }
2231 
2232 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2233 				 struct btrfs_root *root,
2234 				 struct btrfs_delayed_ref_node *node,
2235 				 struct btrfs_delayed_extent_op *extent_op)
2236 {
2237 	struct btrfs_key key;
2238 	struct btrfs_path *path;
2239 	struct btrfs_extent_item *ei;
2240 	struct extent_buffer *leaf;
2241 	u32 item_size;
2242 	int ret;
2243 	int err = 0;
2244 	int metadata = !extent_op->is_data;
2245 
2246 	if (trans->aborted)
2247 		return 0;
2248 
2249 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2250 		metadata = 0;
2251 
2252 	path = btrfs_alloc_path();
2253 	if (!path)
2254 		return -ENOMEM;
2255 
2256 	key.objectid = node->bytenr;
2257 
2258 	if (metadata) {
2259 		key.type = BTRFS_METADATA_ITEM_KEY;
2260 		key.offset = extent_op->level;
2261 	} else {
2262 		key.type = BTRFS_EXTENT_ITEM_KEY;
2263 		key.offset = node->num_bytes;
2264 	}
2265 
2266 again:
2267 	path->reada = READA_FORWARD;
2268 	path->leave_spinning = 1;
2269 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2270 				path, 0, 1);
2271 	if (ret < 0) {
2272 		err = ret;
2273 		goto out;
2274 	}
2275 	if (ret > 0) {
2276 		if (metadata) {
2277 			if (path->slots[0] > 0) {
2278 				path->slots[0]--;
2279 				btrfs_item_key_to_cpu(path->nodes[0], &key,
2280 						      path->slots[0]);
2281 				if (key.objectid == node->bytenr &&
2282 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
2283 				    key.offset == node->num_bytes)
2284 					ret = 0;
2285 			}
2286 			if (ret > 0) {
2287 				btrfs_release_path(path);
2288 				metadata = 0;
2289 
2290 				key.objectid = node->bytenr;
2291 				key.offset = node->num_bytes;
2292 				key.type = BTRFS_EXTENT_ITEM_KEY;
2293 				goto again;
2294 			}
2295 		} else {
2296 			err = -EIO;
2297 			goto out;
2298 		}
2299 	}
2300 
2301 	leaf = path->nodes[0];
2302 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2303 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2304 	if (item_size < sizeof(*ei)) {
2305 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2306 					     path, (u64)-1, 0);
2307 		if (ret < 0) {
2308 			err = ret;
2309 			goto out;
2310 		}
2311 		leaf = path->nodes[0];
2312 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2313 	}
2314 #endif
2315 	BUG_ON(item_size < sizeof(*ei));
2316 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2317 	__run_delayed_extent_op(extent_op, leaf, ei);
2318 
2319 	btrfs_mark_buffer_dirty(leaf);
2320 out:
2321 	btrfs_free_path(path);
2322 	return err;
2323 }
2324 
2325 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2326 				struct btrfs_root *root,
2327 				struct btrfs_delayed_ref_node *node,
2328 				struct btrfs_delayed_extent_op *extent_op,
2329 				int insert_reserved)
2330 {
2331 	int ret = 0;
2332 	struct btrfs_delayed_tree_ref *ref;
2333 	struct btrfs_key ins;
2334 	u64 parent = 0;
2335 	u64 ref_root = 0;
2336 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2337 						 SKINNY_METADATA);
2338 
2339 	ref = btrfs_delayed_node_to_tree_ref(node);
2340 	trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
2341 
2342 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2343 		parent = ref->parent;
2344 	ref_root = ref->root;
2345 
2346 	ins.objectid = node->bytenr;
2347 	if (skinny_metadata) {
2348 		ins.offset = ref->level;
2349 		ins.type = BTRFS_METADATA_ITEM_KEY;
2350 	} else {
2351 		ins.offset = node->num_bytes;
2352 		ins.type = BTRFS_EXTENT_ITEM_KEY;
2353 	}
2354 
2355 	if (node->ref_mod != 1) {
2356 		btrfs_err(root->fs_info,
2357 	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2358 			  node->bytenr, node->ref_mod, node->action, ref_root,
2359 			  parent);
2360 		return -EIO;
2361 	}
2362 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2363 		BUG_ON(!extent_op || !extent_op->update_flags);
2364 		ret = alloc_reserved_tree_block(trans, root,
2365 						parent, ref_root,
2366 						extent_op->flags_to_set,
2367 						&extent_op->key,
2368 						ref->level, &ins);
2369 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2370 		ret = __btrfs_inc_extent_ref(trans, root, node,
2371 					     parent, ref_root,
2372 					     ref->level, 0, 1,
2373 					     extent_op);
2374 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2375 		ret = __btrfs_free_extent(trans, root, node,
2376 					  parent, ref_root,
2377 					  ref->level, 0, 1, extent_op);
2378 	} else {
2379 		BUG();
2380 	}
2381 	return ret;
2382 }
2383 
2384 /* helper function to actually process a single delayed ref entry */
2385 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2386 			       struct btrfs_root *root,
2387 			       struct btrfs_delayed_ref_node *node,
2388 			       struct btrfs_delayed_extent_op *extent_op,
2389 			       int insert_reserved)
2390 {
2391 	int ret = 0;
2392 
2393 	if (trans->aborted) {
2394 		if (insert_reserved)
2395 			btrfs_pin_extent(root, node->bytenr,
2396 					 node->num_bytes, 1);
2397 		return 0;
2398 	}
2399 
2400 	if (btrfs_delayed_ref_is_head(node)) {
2401 		struct btrfs_delayed_ref_head *head;
2402 		/*
2403 		 * we've hit the end of the chain and we were supposed
2404 		 * to insert this extent into the tree.  But, it got
2405 		 * deleted before we ever needed to insert it, so all
2406 		 * we have to do is clean up the accounting
2407 		 */
2408 		BUG_ON(extent_op);
2409 		head = btrfs_delayed_node_to_head(node);
2410 		trace_run_delayed_ref_head(root->fs_info, node, head,
2411 					   node->action);
2412 
2413 		if (insert_reserved) {
2414 			btrfs_pin_extent(root, node->bytenr,
2415 					 node->num_bytes, 1);
2416 			if (head->is_data) {
2417 				ret = btrfs_del_csums(trans, root,
2418 						      node->bytenr,
2419 						      node->num_bytes);
2420 			}
2421 		}
2422 
2423 		/* Also free its reserved qgroup space */
2424 		btrfs_qgroup_free_delayed_ref(root->fs_info,
2425 					      head->qgroup_ref_root,
2426 					      head->qgroup_reserved);
2427 		return ret;
2428 	}
2429 
2430 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2431 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2432 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2433 					   insert_reserved);
2434 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2435 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2436 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2437 					   insert_reserved);
2438 	else
2439 		BUG();
2440 	return ret;
2441 }
2442 
2443 static inline struct btrfs_delayed_ref_node *
2444 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2445 {
2446 	struct btrfs_delayed_ref_node *ref;
2447 
2448 	if (list_empty(&head->ref_list))
2449 		return NULL;
2450 
2451 	/*
2452 	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2453 	 * This is to prevent a ref count from going down to zero, which deletes
2454 	 * the extent item from the extent tree, when there still are references
2455 	 * to add, which would fail because they would not find the extent item.
2456 	 */
2457 	list_for_each_entry(ref, &head->ref_list, list) {
2458 		if (ref->action == BTRFS_ADD_DELAYED_REF)
2459 			return ref;
2460 	}
2461 
2462 	return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2463 			  list);
2464 }
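/*
 * Example of why ADD refs are picked first (illustrative): if a head's
 * ref_list holds a DROP and an ADD for an extent whose item currently has a
 * single reference, running the DROP first would delete the extent item and
 * the subsequent ADD would fail to find it; running the ADD first keeps the
 * on-disk ref count from touching zero in between.
 */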
2465 
2466 /*
2467  * Returns 0 on success or if called with an already aborted transaction.
2468  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2469  */
2470 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2471 					     struct btrfs_root *root,
2472 					     unsigned long nr)
2473 {
2474 	struct btrfs_delayed_ref_root *delayed_refs;
2475 	struct btrfs_delayed_ref_node *ref;
2476 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2477 	struct btrfs_delayed_extent_op *extent_op;
2478 	struct btrfs_fs_info *fs_info = root->fs_info;
2479 	ktime_t start = ktime_get();
2480 	int ret;
2481 	unsigned long count = 0;
2482 	unsigned long actual_count = 0;
2483 	int must_insert_reserved = 0;
2484 
2485 	delayed_refs = &trans->transaction->delayed_refs;
2486 	while (1) {
2487 		if (!locked_ref) {
2488 			if (count >= nr)
2489 				break;
2490 
2491 			spin_lock(&delayed_refs->lock);
2492 			locked_ref = btrfs_select_ref_head(trans);
2493 			if (!locked_ref) {
2494 				spin_unlock(&delayed_refs->lock);
2495 				break;
2496 			}
2497 
2498 			/* grab the lock that says we are going to process
2499 			 * all the refs for this head */
2500 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2501 			spin_unlock(&delayed_refs->lock);
2502 			/*
2503 			 * we may have dropped the spin lock to get the head
2504 			 * mutex lock, and that might have given someone else
2505 			 * time to free the head.  If that's true, it has been
2506 			 * removed from our list and we can move on.
2507 			 */
2508 			if (ret == -EAGAIN) {
2509 				locked_ref = NULL;
2510 				count++;
2511 				continue;
2512 			}
2513 		}
2514 
2515 		/*
2516 		 * We need to try and merge add/drops of the same ref since we
2517 		 * can run into issues with relocate dropping the implicit ref
2518 		 * and then it being added back again before the drop can
2519 		 * finish.  If we merged anything we need to re-loop so we can
2520 		 * get a good ref.
2521 		 * Or we can get node references of the same type that weren't
2522 		 * merged when created due to bumps in the tree mod seq, and
2523 		 * we need to merge them to prevent adding an inline extent
2524 		 * backref before dropping it (triggering a BUG_ON at
2525 		 * insert_inline_extent_backref()).
2526 		 */
2527 		spin_lock(&locked_ref->lock);
2528 		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2529 					 locked_ref);
2530 
2531 		/*
2532 		 * locked_ref is the head node, so we have to go one
2533 		 * node back for any delayed ref updates
2534 		 */
2535 		ref = select_delayed_ref(locked_ref);
2536 
2537 		if (ref && ref->seq &&
2538 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2539 			spin_unlock(&locked_ref->lock);
2540 			btrfs_delayed_ref_unlock(locked_ref);
2541 			spin_lock(&delayed_refs->lock);
2542 			locked_ref->processing = 0;
2543 			delayed_refs->num_heads_ready++;
2544 			spin_unlock(&delayed_refs->lock);
2545 			locked_ref = NULL;
2546 			cond_resched();
2547 			count++;
2548 			continue;
2549 		}
2550 
2551 		/*
2552 		 * record the must insert reserved flag before we
2553 		 * drop the spin lock.
2554 		 */
2555 		must_insert_reserved = locked_ref->must_insert_reserved;
2556 		locked_ref->must_insert_reserved = 0;
2557 
2558 		extent_op = locked_ref->extent_op;
2559 		locked_ref->extent_op = NULL;
2560 
2561 		if (!ref) {
2562 
2563 
2564 			/* All delayed refs have been processed, go ahead
2565 			 * and send the head node to run_one_delayed_ref,
2566 			 * so that any accounting fixes can happen
2567 			 */
2568 			ref = &locked_ref->node;
2569 
2570 			if (extent_op && must_insert_reserved) {
2571 				btrfs_free_delayed_extent_op(extent_op);
2572 				extent_op = NULL;
2573 			}
2574 
2575 			if (extent_op) {
2576 				spin_unlock(&locked_ref->lock);
2577 				ret = run_delayed_extent_op(trans, root,
2578 							    ref, extent_op);
2579 				btrfs_free_delayed_extent_op(extent_op);
2580 
2581 				if (ret) {
2582 					/*
2583 					 * Need to reset must_insert_reserved if
2584 					 * there was an error so the abort stuff
2585 					 * can cleanup the reserved space
2586 					 * can clean up the reserved space
2587 					 */
2588 					if (must_insert_reserved)
2589 						locked_ref->must_insert_reserved = 1;
2590 					locked_ref->processing = 0;
2591 					btrfs_debug(fs_info,
2592 						    "run_delayed_extent_op returned %d",
2593 						    ret);
2594 					btrfs_delayed_ref_unlock(locked_ref);
2595 					return ret;
2596 				}
2597 				continue;
2598 			}
2599 
2600 			/*
2601 			 * Need to drop our head ref lock and re-acquire the
2602 			 * delayed ref lock and then re-check to make sure
2603 			 * nobody got added.
2604 			 */
2605 			spin_unlock(&locked_ref->lock);
2606 			spin_lock(&delayed_refs->lock);
2607 			spin_lock(&locked_ref->lock);
2608 			if (!list_empty(&locked_ref->ref_list) ||
2609 			    locked_ref->extent_op) {
2610 				spin_unlock(&locked_ref->lock);
2611 				spin_unlock(&delayed_refs->lock);
2612 				continue;
2613 			}
2614 			ref->in_tree = 0;
2615 			delayed_refs->num_heads--;
2616 			rb_erase(&locked_ref->href_node,
2617 				 &delayed_refs->href_root);
2618 			spin_unlock(&delayed_refs->lock);
2619 		} else {
2620 			actual_count++;
2621 			ref->in_tree = 0;
2622 			list_del(&ref->list);
2623 		}
2624 		atomic_dec(&delayed_refs->num_entries);
2625 
2626 		if (!btrfs_delayed_ref_is_head(ref)) {
2627 			/*
2628 			 * when we play the delayed ref, also correct the
2629 			 * ref_mod on head
2630 			 */
2631 			switch (ref->action) {
2632 			case BTRFS_ADD_DELAYED_REF:
2633 			case BTRFS_ADD_DELAYED_EXTENT:
2634 				locked_ref->node.ref_mod -= ref->ref_mod;
2635 				break;
2636 			case BTRFS_DROP_DELAYED_REF:
2637 				locked_ref->node.ref_mod += ref->ref_mod;
2638 				break;
2639 			default:
2640 				WARN_ON(1);
2641 			}
2642 		}
2643 		spin_unlock(&locked_ref->lock);
2644 
2645 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2646 					  must_insert_reserved);
2647 
2648 		btrfs_free_delayed_extent_op(extent_op);
2649 		if (ret) {
2650 			locked_ref->processing = 0;
2651 			btrfs_delayed_ref_unlock(locked_ref);
2652 			btrfs_put_delayed_ref(ref);
2653 			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2654 				    ret);
2655 			return ret;
2656 		}
2657 
2658 		/*
2659 		 * If this node is a head, that means all the refs in this head
2660 		 * have been dealt with, and we will pick the next head to deal
2661 		 * with, so we must unlock the head and drop it from the cluster
2662 		 * list before we release it.
2663 		 */
2664 		if (btrfs_delayed_ref_is_head(ref)) {
2665 			if (locked_ref->is_data &&
2666 			    locked_ref->total_ref_mod < 0) {
2667 				spin_lock(&delayed_refs->lock);
2668 				delayed_refs->pending_csums -= ref->num_bytes;
2669 				spin_unlock(&delayed_refs->lock);
2670 			}
2671 			btrfs_delayed_ref_unlock(locked_ref);
2672 			locked_ref = NULL;
2673 		}
2674 		btrfs_put_delayed_ref(ref);
2675 		count++;
2676 		cond_resched();
2677 	}
2678 
2679 	/*
2680 	 * We don't want to include ref heads since we can have empty ref heads
2681 	 * and those will drastically skew our runtime down since we just do
2682 	 * accounting, no actual extent tree updates.
2683 	 */
2684 	if (actual_count > 0) {
2685 		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2686 		u64 avg;
2687 
2688 		/*
2689 		 * We weigh the current average higher than our current runtime
2690 		 * to avoid large swings in the average.
2691 		 */
2692 		spin_lock(&delayed_refs->lock);
2693 		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2694 		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
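		/*
		 * e.g. (illustrative numbers) an old average of 100us and a
		 * 200us run give (3 * 100 + 200) / 4 = 125us as the new
		 * average, i.e. a 3/4-weighted moving average.
		 */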
2695 		spin_unlock(&delayed_refs->lock);
2696 	}
2697 	return 0;
2698 }
2699 
2700 #ifdef SCRAMBLE_DELAYED_REFS
2701 /*
2702  * Normally delayed refs get processed in ascending bytenr order. This
2703  * correlates in most cases to the order added. To expose dependencies on this
2704  * order, we start to process the tree in the middle instead of the beginning
2705  */
2706 static u64 find_middle(struct rb_root *root)
2707 {
2708 	struct rb_node *n = root->rb_node;
2709 	struct btrfs_delayed_ref_node *entry;
2710 	int alt = 1;
2711 	u64 middle;
2712 	u64 first = 0, last = 0;
2713 
2714 	n = rb_first(root);
2715 	if (n) {
2716 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2717 		first = entry->bytenr;
2718 	}
2719 	n = rb_last(root);
2720 	if (n) {
2721 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2722 		last = entry->bytenr;
2723 	}
2724 	n = root->rb_node;
2725 
2726 	while (n) {
2727 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2728 		WARN_ON(!entry->in_tree);
2729 
2730 		middle = entry->bytenr;
2731 
2732 		if (alt)
2733 			n = n->rb_left;
2734 		else
2735 			n = n->rb_right;
2736 
2737 		alt = 1 - alt;
2738 	}
2739 	return middle;
2740 }
2741 #endif
2742 
2743 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2744 {
2745 	u64 num_bytes;
2746 
2747 	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2748 			     sizeof(struct btrfs_extent_inline_ref));
2749 	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2750 		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2751 
2752 	/*
2753 	 * We don't ever fill up leaves all the way so multiply by 2 just to be
2754 	 * closer to what we're really going to want to use.
2755 	 */
2756 	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2757 }
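/*
 * Rough worked example (approximate sizes, assuming skinny metadata and 16K
 * nodes): each head is charged sizeof(btrfs_extent_item) +
 * sizeof(btrfs_extent_inline_ref), about 33 bytes, while a leaf holds
 * roughly 16K of item data, so 5000 heads come out to about 10 leaves.
 */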
2758 
2759 /*
2760  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2761  * would require to store the csums for that many bytes.
2762  */
2763 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2764 {
2765 	u64 csum_size;
2766 	u64 num_csums_per_leaf;
2767 	u64 num_csums;
2768 
2769 	csum_size = BTRFS_MAX_ITEM_SIZE(root);
2770 	num_csums_per_leaf = div64_u64(csum_size,
2771 			(u64)btrfs_super_csum_size(root->fs_info->super_copy));
2772 	num_csums = div64_u64(csum_bytes, root->sectorsize);
2773 	num_csums += num_csums_per_leaf - 1;
2774 	num_csums = div64_u64(num_csums, num_csums_per_leaf);
2775 	return num_csums;
2776 }
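/*
 * Worked example (approximate, assuming 4 byte crc32c csums, 4K sectors and
 * 16K nodes): one leaf-sized csum item holds about 16K / 4 = 4000 csums,
 * i.e. csums for roughly 16MB of data, so 1GiB of csum_bytes (262144
 * sectors) comes out to about 65 leaves.
 */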
2777 
2778 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2779 				       struct btrfs_root *root)
2780 {
2781 	struct btrfs_block_rsv *global_rsv;
2782 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2783 	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2784 	u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2785 	u64 num_bytes, num_dirty_bgs_bytes;
2786 	int ret = 0;
2787 
2788 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2789 	num_heads = heads_to_leaves(root, num_heads);
2790 	if (num_heads > 1)
2791 		num_bytes += (num_heads - 1) * root->nodesize;
2792 	num_bytes <<= 1;
2793 	num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2794 	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2795 							     num_dirty_bgs);
2796 	global_rsv = &root->fs_info->global_block_rsv;
2797 
2798 	/*
2799 	 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2800 	 * wiggle room since running delayed refs can create more delayed refs.
2801 	 */
2802 	if (global_rsv->space_info->full) {
2803 		num_dirty_bgs_bytes <<= 1;
2804 		num_bytes <<= 1;
2805 	}
2806 
2807 	spin_lock(&global_rsv->lock);
2808 	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2809 		ret = 1;
2810 	spin_unlock(&global_rsv->lock);
2811 	return ret;
2812 }
2813 
2814 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2815 				       struct btrfs_root *root)
2816 {
2817 	struct btrfs_fs_info *fs_info = root->fs_info;
2818 	u64 num_entries =
2819 		atomic_read(&trans->transaction->delayed_refs.num_entries);
2820 	u64 avg_runtime;
2821 	u64 val;
2822 
2823 	smp_mb();
2824 	avg_runtime = fs_info->avg_delayed_ref_runtime;
2825 	val = num_entries * avg_runtime;
2826 	if (num_entries * avg_runtime >= NSEC_PER_SEC)
2827 		return 1;
2828 	if (val >= NSEC_PER_SEC / 2)
2829 		return 2;
2830 
2831 	return btrfs_check_space_for_delayed_refs(trans, root);
2832 }
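/*
 * Worked example (illustrative numbers): with an average delayed ref
 * runtime of 50us, 20000 queued entries represent about 1s of work and we
 * return 1; 10000 entries (~0.5s) return 2; with fewer entries the decision
 * falls through to btrfs_check_space_for_delayed_refs() above.
 */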
2833 
2834 struct async_delayed_refs {
2835 	struct btrfs_root *root;
2836 	u64 transid;
2837 	int count;
2838 	int error;
2839 	int sync;
2840 	struct completion wait;
2841 	struct btrfs_work work;
2842 };
2843 
2844 static void delayed_ref_async_start(struct btrfs_work *work)
2845 {
2846 	struct async_delayed_refs *async;
2847 	struct btrfs_trans_handle *trans;
2848 	int ret;
2849 
2850 	async = container_of(work, struct async_delayed_refs, work);
2851 
2852 	/* if the commit is already started, we don't need to wait here */
2853 	if (btrfs_transaction_blocked(async->root->fs_info))
2854 		goto done;
2855 
2856 	trans = btrfs_join_transaction(async->root);
2857 	if (IS_ERR(trans)) {
2858 		async->error = PTR_ERR(trans);
2859 		goto done;
2860 	}
2861 
2862 	/*
2863 	 * trans->sync means that when we call end_transaction, we won't
2864 	 * wait on delayed refs
2865 	 */
2866 	trans->sync = true;
2867 
2868 	/* Don't bother flushing if we got into a different transaction */
2869 	if (trans->transid > async->transid)
2870 		goto end;
2871 
2872 	ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2873 	if (ret)
2874 		async->error = ret;
2875 end:
2876 	ret = btrfs_end_transaction(trans, async->root);
2877 	if (ret && !async->error)
2878 		async->error = ret;
2879 done:
2880 	if (async->sync)
2881 		complete(&async->wait);
2882 	else
2883 		kfree(async);
2884 }
2885 
2886 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2887 				 unsigned long count, u64 transid, int wait)
2888 {
2889 	struct async_delayed_refs *async;
2890 	int ret;
2891 
2892 	async = kmalloc(sizeof(*async), GFP_NOFS);
2893 	if (!async)
2894 		return -ENOMEM;
2895 
2896 	async->root = root->fs_info->tree_root;
2897 	async->count = count;
2898 	async->error = 0;
2899 	async->transid = transid;
2900 	if (wait)
2901 		async->sync = 1;
2902 	else
2903 		async->sync = 0;
2904 	init_completion(&async->wait);
2905 
2906 	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2907 			delayed_ref_async_start, NULL, NULL);
2908 
2909 	btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2910 
2911 	if (wait) {
2912 		wait_for_completion(&async->wait);
2913 		ret = async->error;
2914 		kfree(async);
2915 		return ret;
2916 	}
2917 	return 0;
2918 }
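/*
 * Illustrative usage (hypothetical caller): kick off background processing
 * of up to 64 delayed refs for the transaction identified by @transid and
 * return immediately:
 *
 *	btrfs_async_run_delayed_refs(root, 64, trans->transid, 0);
 *
 * Passing wait == 1 instead blocks on the completion and returns whatever
 * error the worker recorded in async->error.
 */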
2919 
2920 /*
2921  * this starts processing the delayed reference count updates and
2922  * extent insertions we have queued up so far.  count can be
2923  * 0, which means to process everything in the tree at the start
2924  * of the run (but not newly added entries), or it can be some target
2925  * number you'd like to process.
2926  *
2927  * Returns 0 on success or if called with an aborted transaction
2928  * Returns <0 on error and aborts the transaction
2929  */
2930 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2931 			   struct btrfs_root *root, unsigned long count)
2932 {
2933 	struct rb_node *node;
2934 	struct btrfs_delayed_ref_root *delayed_refs;
2935 	struct btrfs_delayed_ref_head *head;
2936 	int ret;
2937 	int run_all = count == (unsigned long)-1;
2938 	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2939 
2940 	/* We'll clean this up in btrfs_cleanup_transaction */
2941 	if (trans->aborted)
2942 		return 0;
2943 
2944 	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &root->fs_info->flags))
2945 		return 0;
2946 
2947 	if (root == root->fs_info->extent_root)
2948 		root = root->fs_info->tree_root;
2949 
2950 	delayed_refs = &trans->transaction->delayed_refs;
2951 	if (count == 0)
2952 		count = atomic_read(&delayed_refs->num_entries) * 2;
2953 
2954 again:
2955 #ifdef SCRAMBLE_DELAYED_REFS
2956 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2957 #endif
2958 	trans->can_flush_pending_bgs = false;
2959 	ret = __btrfs_run_delayed_refs(trans, root, count);
2960 	if (ret < 0) {
2961 		btrfs_abort_transaction(trans, ret);
2962 		return ret;
2963 	}
2964 
2965 	if (run_all) {
2966 		if (!list_empty(&trans->new_bgs))
2967 			btrfs_create_pending_block_groups(trans, root);
2968 
2969 		spin_lock(&delayed_refs->lock);
2970 		node = rb_first(&delayed_refs->href_root);
2971 		if (!node) {
2972 			spin_unlock(&delayed_refs->lock);
2973 			goto out;
2974 		}
2975 
2976 		while (node) {
2977 			head = rb_entry(node, struct btrfs_delayed_ref_head,
2978 					href_node);
2979 			if (btrfs_delayed_ref_is_head(&head->node)) {
2980 				struct btrfs_delayed_ref_node *ref;
2981 
2982 				ref = &head->node;
2983 				atomic_inc(&ref->refs);
2984 
2985 				spin_unlock(&delayed_refs->lock);
2986 				/*
2987 				 * Mutex was contended, block until it's
2988 				 * released and try again
2989 				 */
2990 				mutex_lock(&head->mutex);
2991 				mutex_unlock(&head->mutex);
2992 
2993 				btrfs_put_delayed_ref(ref);
2994 				cond_resched();
2995 				goto again;
2996 			} else {
2997 				WARN_ON(1);
2998 			}
2999 			node = rb_next(node);
3000 		}
3001 		spin_unlock(&delayed_refs->lock);
3002 		cond_resched();
3003 		goto again;
3004 	}
3005 out:
3006 	assert_qgroups_uptodate(trans);
3007 	trans->can_flush_pending_bgs = can_flush_pending_bgs;
3008 	return 0;
3009 }
3010 
3011 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3012 				struct btrfs_root *root,
3013 				u64 bytenr, u64 num_bytes, u64 flags,
3014 				int level, int is_data)
3015 {
3016 	struct btrfs_delayed_extent_op *extent_op;
3017 	int ret;
3018 
3019 	extent_op = btrfs_alloc_delayed_extent_op();
3020 	if (!extent_op)
3021 		return -ENOMEM;
3022 
3023 	extent_op->flags_to_set = flags;
3024 	extent_op->update_flags = true;
3025 	extent_op->update_key = false;
3026 	extent_op->is_data = is_data ? true : false;
3027 	extent_op->level = level;
3028 
3029 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
3030 					  num_bytes, extent_op);
3031 	if (ret)
3032 		btrfs_free_delayed_extent_op(extent_op);
3033 	return ret;
3034 }
3035 
3036 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
3037 				      struct btrfs_root *root,
3038 				      struct btrfs_path *path,
3039 				      u64 objectid, u64 offset, u64 bytenr)
3040 {
3041 	struct btrfs_delayed_ref_head *head;
3042 	struct btrfs_delayed_ref_node *ref;
3043 	struct btrfs_delayed_data_ref *data_ref;
3044 	struct btrfs_delayed_ref_root *delayed_refs;
3045 	int ret = 0;
3046 
3047 	delayed_refs = &trans->transaction->delayed_refs;
3048 	spin_lock(&delayed_refs->lock);
3049 	head = btrfs_find_delayed_ref_head(trans, bytenr);
3050 	if (!head) {
3051 		spin_unlock(&delayed_refs->lock);
3052 		return 0;
3053 	}
3054 
3055 	if (!mutex_trylock(&head->mutex)) {
3056 		atomic_inc(&head->node.refs);
3057 		spin_unlock(&delayed_refs->lock);
3058 
3059 		btrfs_release_path(path);
3060 
3061 		/*
3062 		 * Mutex was contended, block until it's released and let
3063 		 * caller try again
3064 		 */
3065 		mutex_lock(&head->mutex);
3066 		mutex_unlock(&head->mutex);
3067 		btrfs_put_delayed_ref(&head->node);
3068 		return -EAGAIN;
3069 	}
3070 	spin_unlock(&delayed_refs->lock);
3071 
3072 	spin_lock(&head->lock);
3073 	list_for_each_entry(ref, &head->ref_list, list) {
3074 		/* If it's a shared ref we know a cross reference exists */
3075 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3076 			ret = 1;
3077 			break;
3078 		}
3079 
3080 		data_ref = btrfs_delayed_node_to_data_ref(ref);
3081 
3082 		/*
3083 		 * If our ref doesn't match the one we're currently looking at
3084 		 * then we have a cross reference.
3085 		 */
3086 		if (data_ref->root != root->root_key.objectid ||
3087 		    data_ref->objectid != objectid ||
3088 		    data_ref->offset != offset) {
3089 			ret = 1;
3090 			break;
3091 		}
3092 	}
3093 	spin_unlock(&head->lock);
3094 	mutex_unlock(&head->mutex);
3095 	return ret;
3096 }
3097 
3098 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
3099 					struct btrfs_root *root,
3100 					struct btrfs_path *path,
3101 					u64 objectid, u64 offset, u64 bytenr)
3102 {
3103 	struct btrfs_root *extent_root = root->fs_info->extent_root;
3104 	struct extent_buffer *leaf;
3105 	struct btrfs_extent_data_ref *ref;
3106 	struct btrfs_extent_inline_ref *iref;
3107 	struct btrfs_extent_item *ei;
3108 	struct btrfs_key key;
3109 	u32 item_size;
3110 	int ret;
3111 
3112 	key.objectid = bytenr;
3113 	key.offset = (u64)-1;
3114 	key.type = BTRFS_EXTENT_ITEM_KEY;
3115 
3116 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3117 	if (ret < 0)
3118 		goto out;
3119 	BUG_ON(ret == 0); /* Corruption */
3120 
3121 	ret = -ENOENT;
3122 	if (path->slots[0] == 0)
3123 		goto out;
3124 
3125 	path->slots[0]--;
3126 	leaf = path->nodes[0];
3127 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3128 
3129 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3130 		goto out;
3131 
3132 	ret = 1;
3133 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3134 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3135 	if (item_size < sizeof(*ei)) {
3136 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3137 		goto out;
3138 	}
3139 #endif
3140 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3141 
3142 	if (item_size != sizeof(*ei) +
3143 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3144 		goto out;
3145 
3146 	if (btrfs_extent_generation(leaf, ei) <=
3147 	    btrfs_root_last_snapshot(&root->root_item))
3148 		goto out;
3149 
3150 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3151 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
3152 	    BTRFS_EXTENT_DATA_REF_KEY)
3153 		goto out;
3154 
3155 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3156 	if (btrfs_extent_refs(leaf, ei) !=
3157 	    btrfs_extent_data_ref_count(leaf, ref) ||
3158 	    btrfs_extent_data_ref_root(leaf, ref) !=
3159 	    root->root_key.objectid ||
3160 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3161 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
3162 		goto out;
3163 
3164 	ret = 0;
3165 out:
3166 	return ret;
3167 }
3168 
3169 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3170 			  struct btrfs_root *root,
3171 			  u64 objectid, u64 offset, u64 bytenr)
3172 {
3173 	struct btrfs_path *path;
3174 	int ret;
3175 	int ret2;
3176 
3177 	path = btrfs_alloc_path();
3178 	if (!path)
3179 		return -ENOMEM;
3180 
3181 	do {
3182 		ret = check_committed_ref(trans, root, path, objectid,
3183 					  offset, bytenr);
3184 		if (ret && ret != -ENOENT)
3185 			goto out;
3186 
3187 		ret2 = check_delayed_ref(trans, root, path, objectid,
3188 					 offset, bytenr);
3189 	} while (ret2 == -EAGAIN);
3190 
3191 	if (ret2 && ret2 != -ENOENT) {
3192 		ret = ret2;
3193 		goto out;
3194 	}
3195 
3196 	if (ret != -ENOENT || ret2 != -ENOENT)
3197 		ret = 0;
3198 out:
3199 	btrfs_free_path(path);
3200 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3201 		WARN_ON(ret > 0);
3202 	return ret;
3203 }
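/*
 * Return convention, spelled out: 0 means neither the committed extent tree
 * nor the pending delayed refs show another owner for this (objectid,
 * offset, bytenr); anything else (a found cross reference, an error, or
 * -ENOENT when the extent isn't visible at all) tells callers such as the
 * nocow write path to fall back to the safe COW behaviour.
 */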
3204 
3205 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3206 			   struct btrfs_root *root,
3207 			   struct extent_buffer *buf,
3208 			   int full_backref, int inc)
3209 {
3210 	u64 bytenr;
3211 	u64 num_bytes;
3212 	u64 parent;
3213 	u64 ref_root;
3214 	u32 nritems;
3215 	struct btrfs_key key;
3216 	struct btrfs_file_extent_item *fi;
3217 	int i;
3218 	int level;
3219 	int ret = 0;
3220 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3221 			    u64, u64, u64, u64, u64, u64);
3222 
3223 
3224 	if (btrfs_is_testing(root->fs_info))
3225 		return 0;
3226 
3227 	ref_root = btrfs_header_owner(buf);
3228 	nritems = btrfs_header_nritems(buf);
3229 	level = btrfs_header_level(buf);
3230 
3231 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3232 		return 0;
3233 
3234 	if (inc)
3235 		process_func = btrfs_inc_extent_ref;
3236 	else
3237 		process_func = btrfs_free_extent;
3238 
3239 	if (full_backref)
3240 		parent = buf->start;
3241 	else
3242 		parent = 0;
3243 
3244 	for (i = 0; i < nritems; i++) {
3245 		if (level == 0) {
3246 			btrfs_item_key_to_cpu(buf, &key, i);
3247 			if (key.type != BTRFS_EXTENT_DATA_KEY)
3248 				continue;
3249 			fi = btrfs_item_ptr(buf, i,
3250 					    struct btrfs_file_extent_item);
3251 			if (btrfs_file_extent_type(buf, fi) ==
3252 			    BTRFS_FILE_EXTENT_INLINE)
3253 				continue;
3254 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3255 			if (bytenr == 0)
3256 				continue;
3257 
3258 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3259 			key.offset -= btrfs_file_extent_offset(buf, fi);
3260 			ret = process_func(trans, root, bytenr, num_bytes,
3261 					   parent, ref_root, key.objectid,
3262 					   key.offset);
3263 			if (ret)
3264 				goto fail;
3265 		} else {
3266 			bytenr = btrfs_node_blockptr(buf, i);
3267 			num_bytes = root->nodesize;
3268 			ret = process_func(trans, root, bytenr, num_bytes,
3269 					   parent, ref_root, level - 1, 0);
3270 			if (ret)
3271 				goto fail;
3272 		}
3273 	}
3274 	return 0;
3275 fail:
3276 	return ret;
3277 }
3278 
3279 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3280 		  struct extent_buffer *buf, int full_backref)
3281 {
3282 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3283 }
3284 
3285 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3286 		  struct extent_buffer *buf, int full_backref)
3287 {
3288 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3289 }
3290 
3291 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3292 				 struct btrfs_root *root,
3293 				 struct btrfs_path *path,
3294 				 struct btrfs_block_group_cache *cache)
3295 {
3296 	int ret;
3297 	struct btrfs_root *extent_root = root->fs_info->extent_root;
3298 	unsigned long bi;
3299 	struct extent_buffer *leaf;
3300 
3301 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3302 	if (ret) {
3303 		if (ret > 0)
3304 			ret = -ENOENT;
3305 		goto fail;
3306 	}
3307 
3308 	leaf = path->nodes[0];
3309 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3310 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3311 	btrfs_mark_buffer_dirty(leaf);
3312 fail:
3313 	btrfs_release_path(path);
3314 	return ret;
3315 
3316 }
3317 
3318 static struct btrfs_block_group_cache *
3319 next_block_group(struct btrfs_root *root,
3320 		 struct btrfs_block_group_cache *cache)
3321 {
3322 	struct rb_node *node;
3323 
3324 	spin_lock(&root->fs_info->block_group_cache_lock);
3325 
3326 	/* If our block group was removed, we need a full search. */
3327 	if (RB_EMPTY_NODE(&cache->cache_node)) {
3328 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3329 
3330 		spin_unlock(&root->fs_info->block_group_cache_lock);
3331 		btrfs_put_block_group(cache);
3332 		cache = btrfs_lookup_first_block_group(root->fs_info,
3333 						       next_bytenr);
3334 		return cache;
3335 	}
3336 	node = rb_next(&cache->cache_node);
3337 	btrfs_put_block_group(cache);
3338 	if (node) {
3339 		cache = rb_entry(node, struct btrfs_block_group_cache,
3340 				 cache_node);
3341 		btrfs_get_block_group(cache);
3342 	} else
3343 		cache = NULL;
3344 	spin_unlock(&root->fs_info->block_group_cache_lock);
3345 	return cache;
3346 }
3347 
3348 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3349 			    struct btrfs_trans_handle *trans,
3350 			    struct btrfs_path *path)
3351 {
3352 	struct btrfs_root *root = block_group->fs_info->tree_root;
3353 	struct inode *inode = NULL;
3354 	u64 alloc_hint = 0;
3355 	int dcs = BTRFS_DC_ERROR;
3356 	u64 num_pages = 0;
3357 	int retries = 0;
3358 	int ret = 0;
3359 
3360 	/*
3361 	 * If this block group is smaller than 100 megs don't bother caching the
3362 	 * block group.
3363 	 */
3364 	if (block_group->key.offset < (100 * SZ_1M)) {
3365 		spin_lock(&block_group->lock);
3366 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3367 		spin_unlock(&block_group->lock);
3368 		return 0;
3369 	}
3370 
3371 	if (trans->aborted)
3372 		return 0;
3373 again:
3374 	inode = lookup_free_space_inode(root, block_group, path);
3375 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3376 		ret = PTR_ERR(inode);
3377 		btrfs_release_path(path);
3378 		goto out;
3379 	}
3380 
3381 	if (IS_ERR(inode)) {
3382 		BUG_ON(retries);
3383 		retries++;
3384 
3385 		if (block_group->ro)
3386 			goto out_free;
3387 
3388 		ret = create_free_space_inode(root, trans, block_group, path);
3389 		if (ret)
3390 			goto out_free;
3391 		goto again;
3392 	}
3393 
3394 	/* We've already setup this transaction, go ahead and exit */
3395 	if (block_group->cache_generation == trans->transid &&
3396 	    i_size_read(inode)) {
3397 		dcs = BTRFS_DC_SETUP;
3398 		goto out_put;
3399 	}
3400 
3401 	/*
3402 	 * We want to set the generation to 0, that way if anything goes wrong
3403 	 * from here on out we know not to trust this cache when we load up next
3404 	 * time.
3405 	 */
3406 	BTRFS_I(inode)->generation = 0;
3407 	ret = btrfs_update_inode(trans, root, inode);
3408 	if (ret) {
3409 		/*
3410 		 * So theoretically we could recover from this by simply setting
3411 		 * the super cache generation to 0 so we know to invalidate the
3412 		 * cache, but then we'd have to keep track of the block groups
3413 		 * that fail this way so we know we _have_ to reset this cache
3414 		 * before the next commit or risk reading stale cache.  So to
3415 		 * limit our exposure to horrible edge cases, let's just abort the
3416 		 * transaction; this only happens in really bad situations
3417 		 * anyway.
3418 		 */
3419 		btrfs_abort_transaction(trans, ret);
3420 		goto out_put;
3421 	}
3422 	WARN_ON(ret);
3423 
3424 	if (i_size_read(inode) > 0) {
3425 		ret = btrfs_check_trunc_cache_free_space(root,
3426 					&root->fs_info->global_block_rsv);
3427 		if (ret)
3428 			goto out_put;
3429 
3430 		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3431 		if (ret)
3432 			goto out_put;
3433 	}
3434 
3435 	spin_lock(&block_group->lock);
3436 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3437 	    !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
3438 		/*
3439 		 * don't bother trying to write stuff out _if_
3440 		 * a) we're not cached,
3441 		 * b) we're with nospace_cache mount option.
3442 		 * b) we were mounted with the nospace_cache option.
3443 		dcs = BTRFS_DC_WRITTEN;
3444 		spin_unlock(&block_group->lock);
3445 		goto out_put;
3446 	}
3447 	spin_unlock(&block_group->lock);
3448 
3449 	/*
3450 	 * We hit an ENOSPC when setting up the cache in this transaction, just
3451 	 * skip doing the setup, we've already cleared the cache so we're safe.
3452 	 */
3453 	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3454 		ret = -ENOSPC;
3455 		goto out_put;
3456 	}
3457 
3458 	/*
3459 	 * Try to preallocate enough space based on how big the block group is.
3460 	 * Keep in mind this has to include any pinned space which could end up
3461 	 * taking up quite a bit since it's not folded into the other space
3462 	 * cache.
3463 	 */
3464 	num_pages = div_u64(block_group->key.offset, SZ_256M);
3465 	if (!num_pages)
3466 		num_pages = 1;
3467 
3468 	num_pages *= 16;
3469 	num_pages *= PAGE_SIZE;
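	/*
	 * Worked example (assuming 4K pages): a 1GiB block group yields
	 * num_pages = 4 above, scaled to 64 pages here, i.e. a 256K
	 * preallocation for the free space cache file.
	 */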
3470 
3471 	ret = btrfs_check_data_free_space(inode, 0, num_pages);
3472 	if (ret)
3473 		goto out_put;
3474 
3475 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3476 					      num_pages, num_pages,
3477 					      &alloc_hint);
3478 	/*
3479 	 * Our cache requires contiguous chunks so that we don't modify a bunch
3480 	 * of metadata or split extents when writing the cache out, which means
3481 	 * we can enospc if we are heavily fragmented in addition to just normal
3482 	 * out of space conditions.  So if we hit this just skip setting up any
3483 	 * other block groups for this transaction, maybe we'll unpin enough
3484 	 * space the next time around.
3485 	 */
3486 	if (!ret)
3487 		dcs = BTRFS_DC_SETUP;
3488 	else if (ret == -ENOSPC)
3489 		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3490 
3491 out_put:
3492 	iput(inode);
3493 out_free:
3494 	btrfs_release_path(path);
3495 out:
3496 	spin_lock(&block_group->lock);
3497 	if (!ret && dcs == BTRFS_DC_SETUP)
3498 		block_group->cache_generation = trans->transid;
3499 	block_group->disk_cache_state = dcs;
3500 	spin_unlock(&block_group->lock);
3501 
3502 	return ret;
3503 }
3504 
3505 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3506 			    struct btrfs_root *root)
3507 {
3508 	struct btrfs_block_group_cache *cache, *tmp;
3509 	struct btrfs_transaction *cur_trans = trans->transaction;
3510 	struct btrfs_path *path;
3511 
3512 	if (list_empty(&cur_trans->dirty_bgs) ||
3513 	    !btrfs_test_opt(root->fs_info, SPACE_CACHE))
3514 		return 0;
3515 
3516 	path = btrfs_alloc_path();
3517 	if (!path)
3518 		return -ENOMEM;
3519 
3520 	/* Could add new block groups, use _safe just in case */
3521 	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3522 				 dirty_list) {
3523 		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3524 			cache_save_setup(cache, trans, path);
3525 	}
3526 
3527 	btrfs_free_path(path);
3528 	return 0;
3529 }
3530 
3531 /*
3532  * transaction commit does final block group cache writeback during a
3533  * critical section where nothing is allowed to change the FS.  This is
3534  * required in order for the cache to actually match the block group,
3535  * but can introduce a lot of latency into the commit.
3536  *
3537  * So, btrfs_start_dirty_block_groups is here to kick off block group
3538  * cache IO.  There's a chance we'll have to redo some of it if the
3539  * block group changes again during the commit, but it greatly reduces
3540  * the commit latency by getting rid of the easy block groups while
3541  * we're still allowing others to join the commit.
3542  */
3543 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3544 				   struct btrfs_root *root)
3545 {
3546 	struct btrfs_block_group_cache *cache;
3547 	struct btrfs_transaction *cur_trans = trans->transaction;
3548 	int ret = 0;
3549 	int should_put;
3550 	struct btrfs_path *path = NULL;
3551 	LIST_HEAD(dirty);
3552 	struct list_head *io = &cur_trans->io_bgs;
3553 	int num_started = 0;
3554 	int loops = 0;
3555 
3556 	spin_lock(&cur_trans->dirty_bgs_lock);
3557 	if (list_empty(&cur_trans->dirty_bgs)) {
3558 		spin_unlock(&cur_trans->dirty_bgs_lock);
3559 		return 0;
3560 	}
3561 	list_splice_init(&cur_trans->dirty_bgs, &dirty);
3562 	spin_unlock(&cur_trans->dirty_bgs_lock);
3563 
3564 again:
3565 	/*
3566 	 * make sure all the block groups on our dirty list actually
3567 	 * exist
3568 	 */
3569 	btrfs_create_pending_block_groups(trans, root);
3570 
3571 	if (!path) {
3572 		path = btrfs_alloc_path();
3573 		if (!path)
3574 			return -ENOMEM;
3575 	}
3576 
3577 	/*
3578 	 * cache_write_mutex is here only to keep balance or the automatic
3579 	 * removal of empty block groups from deleting this block group while
3580 	 * we are writing out the cache
3581 	 */
3582 	mutex_lock(&trans->transaction->cache_write_mutex);
3583 	while (!list_empty(&dirty)) {
3584 		cache = list_first_entry(&dirty,
3585 					 struct btrfs_block_group_cache,
3586 					 dirty_list);
3587 		/*
3588 		 * this can happen if something re-dirties a block
3589 		 * group that is already under IO.  Just wait for it to
3590 		 * finish and then do it all again
3591 		 */
3592 		if (!list_empty(&cache->io_list)) {
3593 			list_del_init(&cache->io_list);
3594 			btrfs_wait_cache_io(root, trans, cache,
3595 					    &cache->io_ctl, path,
3596 					    cache->key.objectid);
3597 			btrfs_put_block_group(cache);
3598 		}
3599 
3600 
3601 		/*
3602 		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3603 		 * if it should update the cache_state.  Don't delete
3604 		 * until after we wait.
3605 		 *
3606 		 * Since we're not running in the commit critical section
3607 		 * we need the dirty_bgs_lock to protect from update_block_group
3608 		 */
3609 		spin_lock(&cur_trans->dirty_bgs_lock);
3610 		list_del_init(&cache->dirty_list);
3611 		spin_unlock(&cur_trans->dirty_bgs_lock);
3612 
3613 		should_put = 1;
3614 
3615 		cache_save_setup(cache, trans, path);
3616 
3617 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3618 			cache->io_ctl.inode = NULL;
3619 			ret = btrfs_write_out_cache(root, trans, cache, path);
3620 			if (ret == 0 && cache->io_ctl.inode) {
3621 				num_started++;
3622 				should_put = 0;
3623 
3624 				/*
3625 				 * the cache_write_mutex is protecting
3626 				 * the io_list
3627 				 */
3628 				list_add_tail(&cache->io_list, io);
3629 			} else {
3630 				/*
3631 				 * if we failed to write the cache, the
3632 				 * generation will be bad and life goes on
3633 				 */
3634 				ret = 0;
3635 			}
3636 		}
3637 		if (!ret) {
3638 			ret = write_one_cache_group(trans, root, path, cache);
3639 			/*
3640 			 * Our block group might still be attached to the list
3641 			 * of new block groups in the transaction handle of some
3642 			 * other task (struct btrfs_trans_handle->new_bgs). This
3643 			 * means its block group item isn't yet in the extent
3644 			 * tree. If this happens ignore the error, as we will
3645 			 * try again later in the critical section of the
3646 			 * transaction commit.
3647 			 */
3648 			if (ret == -ENOENT) {
3649 				ret = 0;
3650 				spin_lock(&cur_trans->dirty_bgs_lock);
3651 				if (list_empty(&cache->dirty_list)) {
3652 					list_add_tail(&cache->dirty_list,
3653 						      &cur_trans->dirty_bgs);
3654 					btrfs_get_block_group(cache);
3655 				}
3656 				spin_unlock(&cur_trans->dirty_bgs_lock);
3657 			} else if (ret) {
3658 				btrfs_abort_transaction(trans, ret);
3659 			}
3660 		}
3661 
3662 		/* if it's not on the io list, we need to put the block group */
3663 		if (should_put)
3664 			btrfs_put_block_group(cache);
3665 
3666 		if (ret)
3667 			break;
3668 
3669 		/*
3670 		 * Avoid blocking other tasks for too long. It might even save
3671 		 * us from writing caches for block groups that are going to be
3672 		 * removed.
3673 		 */
3674 		mutex_unlock(&trans->transaction->cache_write_mutex);
3675 		mutex_lock(&trans->transaction->cache_write_mutex);
3676 	}
3677 	mutex_unlock(&trans->transaction->cache_write_mutex);
3678 
3679 	/*
3680 	 * go through delayed refs for all the stuff we've just kicked off
3681 	 * and then loop back (just once)
3682 	 */
3683 	ret = btrfs_run_delayed_refs(trans, root, 0);
3684 	if (!ret && loops == 0) {
3685 		loops++;
3686 		spin_lock(&cur_trans->dirty_bgs_lock);
3687 		list_splice_init(&cur_trans->dirty_bgs, &dirty);
3688 		/*
3689 		 * dirty_bgs_lock protects us from concurrent block group
3690 		 * deletes too (not just cache_write_mutex).
3691 		 */
3692 		if (!list_empty(&dirty)) {
3693 			spin_unlock(&cur_trans->dirty_bgs_lock);
3694 			goto again;
3695 		}
3696 		spin_unlock(&cur_trans->dirty_bgs_lock);
3697 	} else if (ret < 0) {
3698 		btrfs_cleanup_dirty_bgs(cur_trans, root);
3699 	}
3700 
3701 	btrfs_free_path(path);
3702 	return ret;
3703 }
3704 
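/*
 * Write out the space caches and block group items for all remaining dirty
 * block groups.  This runs in the critical section of the transaction commit
 * (see the comment above btrfs_start_dirty_block_groups()), so what is
 * written here matches the final state of the block groups.
 */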
3705 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3706 				   struct btrfs_root *root)
3707 {
3708 	struct btrfs_block_group_cache *cache;
3709 	struct btrfs_transaction *cur_trans = trans->transaction;
3710 	int ret = 0;
3711 	int should_put;
3712 	struct btrfs_path *path;
3713 	struct list_head *io = &cur_trans->io_bgs;
3714 	int num_started = 0;
3715 
3716 	path = btrfs_alloc_path();
3717 	if (!path)
3718 		return -ENOMEM;
3719 
3720 	/*
3721 	 * Even though we are in the critical section of the transaction commit,
3722 	 * we can still have concurrent tasks adding elements to this
3723 	 * transaction's list of dirty block groups. These tasks correspond to
3724 	 * endio free space workers started when writeback finishes for a
3725 	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3726 	 * allocate new block groups as a result of COWing nodes of the root
3727 	 * tree when updating the free space inode. The writeback for the space
3728 	 * caches is triggered by an earlier call to
3729 	 * btrfs_start_dirty_block_groups() and iterations of the following
3730 	 * loop.
3731 	 * Also we want to do the cache_save_setup first and then run the
3732 	 * delayed refs to make sure we have the best chance at doing this all
3733 	 * in one shot.
3734 	 */
3735 	spin_lock(&cur_trans->dirty_bgs_lock);
3736 	while (!list_empty(&cur_trans->dirty_bgs)) {
3737 		cache = list_first_entry(&cur_trans->dirty_bgs,
3738 					 struct btrfs_block_group_cache,
3739 					 dirty_list);
3740 
3741 		/*
3742 		 * this can happen if cache_save_setup re-dirties a block
3743 		 * group that is already under IO.  Just wait for it to
3744 		 * finish and then do it all again
3745 		 */
3746 		if (!list_empty(&cache->io_list)) {
3747 			spin_unlock(&cur_trans->dirty_bgs_lock);
3748 			list_del_init(&cache->io_list);
3749 			btrfs_wait_cache_io(root, trans, cache,
3750 					    &cache->io_ctl, path,
3751 					    cache->key.objectid);
3752 			btrfs_put_block_group(cache);
3753 			spin_lock(&cur_trans->dirty_bgs_lock);
3754 		}
3755 
3756 		/*
3757 		 * don't remove from the dirty list until after we've waited
3758 		 * on any pending IO
3759 		 */
3760 		list_del_init(&cache->dirty_list);
3761 		spin_unlock(&cur_trans->dirty_bgs_lock);
3762 		should_put = 1;
3763 
3764 		cache_save_setup(cache, trans, path);
3765 
3766 		if (!ret)
3767 			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3768 
3769 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3770 			cache->io_ctl.inode = NULL;
3771 			ret = btrfs_write_out_cache(root, trans, cache, path);
3772 			if (ret == 0 && cache->io_ctl.inode) {
3773 				num_started++;
3774 				should_put = 0;
3775 				list_add_tail(&cache->io_list, io);
3776 			} else {
3777 				/*
3778 				 * if we failed to write the cache, the
3779 				 * generation will be bad and life goes on
3780 				 */
3781 				ret = 0;
3782 			}
3783 		}
3784 		if (!ret) {
3785 			ret = write_one_cache_group(trans, root, path, cache);
3786 			/*
3787 			 * One of the free space endio workers might have
3788 			 * created a new block group while updating a free space
3789 			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3790 			 * and hasn't released its transaction handle yet, in
3791 			 * which case the new block group is still attached to
3792 			 * its transaction handle and its creation has not
3793 			 * finished yet (no block group item in the extent tree
3794 			 * yet, etc). If this is the case, wait for all free
3795 			 * space endio workers to finish and retry. This is a
3796 			 * very rare case, so there is no need for a more efficient and
3797 			 * complex approach.
3798 			 */
3799 			if (ret == -ENOENT) {
3800 				wait_event(cur_trans->writer_wait,
3801 				   atomic_read(&cur_trans->num_writers) == 1);
3802 				ret = write_one_cache_group(trans, root, path,
3803 							    cache);
3804 			}
3805 			if (ret)
3806 				btrfs_abort_transaction(trans, ret);
3807 		}
3808 
3809 		/* if it's not on the io list, we need to put the block group */
3810 		if (should_put)
3811 			btrfs_put_block_group(cache);
3812 		spin_lock(&cur_trans->dirty_bgs_lock);
3813 	}
3814 	spin_unlock(&cur_trans->dirty_bgs_lock);
3815 
3816 	while (!list_empty(io)) {
3817 		cache = list_first_entry(io, struct btrfs_block_group_cache,
3818 					 io_list);
3819 		list_del_init(&cache->io_list);
3820 		btrfs_wait_cache_io(root, trans, cache,
3821 				    &cache->io_ctl, path, cache->key.objectid);
3822 		btrfs_put_block_group(cache);
3823 	}
3824 
3825 	btrfs_free_path(path);
3826 	return ret;
3827 }
3828 
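/*
 * Return 1 if the extent at @bytenr is in a read-only block group (or in no
 * block group at all), 0 otherwise.
 */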
3829 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3830 {
3831 	struct btrfs_block_group_cache *block_group;
3832 	int readonly = 0;
3833 
3834 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3835 	if (!block_group || block_group->ro)
3836 		readonly = 1;
3837 	if (block_group)
3838 		btrfs_put_block_group(block_group);
3839 	return readonly;
3840 }
3841 
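/*
 * Record a nocow writer in the block group containing @bytenr.  Returns false
 * if the block group is read-only or can't be found; on success the block
 * group reference taken here is dropped later by btrfs_dec_nocow_writers().
 */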
3842 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3843 {
3844 	struct btrfs_block_group_cache *bg;
3845 	bool ret = true;
3846 
3847 	bg = btrfs_lookup_block_group(fs_info, bytenr);
3848 	if (!bg)
3849 		return false;
3850 
3851 	spin_lock(&bg->lock);
3852 	if (bg->ro)
3853 		ret = false;
3854 	else
3855 		atomic_inc(&bg->nocow_writers);
3856 	spin_unlock(&bg->lock);
3857 
3858 	/* no put on block group, done by btrfs_dec_nocow_writers */
3859 	if (!ret)
3860 		btrfs_put_block_group(bg);
3861 
3862 	return ret;
3863 
3864 }
3865 
3866 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3867 {
3868 	struct btrfs_block_group_cache *bg;
3869 
3870 	bg = btrfs_lookup_block_group(fs_info, bytenr);
3871 	ASSERT(bg);
3872 	if (atomic_dec_and_test(&bg->nocow_writers))
3873 		wake_up_atomic_t(&bg->nocow_writers);
3874 	/*
3875 	 * Once for our lookup and once for the lookup done by a previous call
3876 	 * to btrfs_inc_nocow_writers()
3877 	 */
3878 	btrfs_put_block_group(bg);
3879 	btrfs_put_block_group(bg);
3880 }
3881 
3882 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
3883 {
3884 	schedule();
3885 	return 0;
3886 }
3887 
3888 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3889 {
3890 	wait_on_atomic_t(&bg->nocow_writers,
3891 			 btrfs_wait_nocow_writers_atomic_t,
3892 			 TASK_UNINTERRUPTIBLE);
3893 }
3894 
3895 static const char *alloc_name(u64 flags)
3896 {
3897 	switch (flags) {
3898 	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3899 		return "mixed";
3900 	case BTRFS_BLOCK_GROUP_METADATA:
3901 		return "metadata";
3902 	case BTRFS_BLOCK_GROUP_DATA:
3903 		return "data";
3904 	case BTRFS_BLOCK_GROUP_SYSTEM:
3905 		return "system";
3906 	default:
3907 		WARN_ON(1);
3908 		return "invalid-combination";
3909 	}
3910 }
3911 
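/*
 * Add @total_bytes/@bytes_used/@bytes_readonly to the space_info matching
 * @flags, creating the space_info (and its sysfs kobject) if it does not
 * exist yet, and return it in @space_info.
 */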
3912 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3913 			     u64 total_bytes, u64 bytes_used,
3914 			     u64 bytes_readonly,
3915 			     struct btrfs_space_info **space_info)
3916 {
3917 	struct btrfs_space_info *found;
3918 	int i;
3919 	int factor;
3920 	int ret;
3921 
3922 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3923 		     BTRFS_BLOCK_GROUP_RAID10))
3924 		factor = 2;
3925 	else
3926 		factor = 1;
3927 
3928 	found = __find_space_info(info, flags);
3929 	if (found) {
3930 		spin_lock(&found->lock);
3931 		found->total_bytes += total_bytes;
3932 		found->disk_total += total_bytes * factor;
3933 		found->bytes_used += bytes_used;
3934 		found->disk_used += bytes_used * factor;
3935 		found->bytes_readonly += bytes_readonly;
3936 		if (total_bytes > 0)
3937 			found->full = 0;
3938 		space_info_add_new_bytes(info, found, total_bytes -
3939 					 bytes_used - bytes_readonly);
3940 		spin_unlock(&found->lock);
3941 		*space_info = found;
3942 		return 0;
3943 	}
3944 	found = kzalloc(sizeof(*found), GFP_NOFS);
3945 	if (!found)
3946 		return -ENOMEM;
3947 
3948 	ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3949 	if (ret) {
3950 		kfree(found);
3951 		return ret;
3952 	}
3953 
3954 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3955 		INIT_LIST_HEAD(&found->block_groups[i]);
3956 	init_rwsem(&found->groups_sem);
3957 	spin_lock_init(&found->lock);
3958 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3959 	found->total_bytes = total_bytes;
3960 	found->disk_total = total_bytes * factor;
3961 	found->bytes_used = bytes_used;
3962 	found->disk_used = bytes_used * factor;
3963 	found->bytes_pinned = 0;
3964 	found->bytes_reserved = 0;
3965 	found->bytes_readonly = bytes_readonly;
3966 	found->bytes_may_use = 0;
3967 	found->full = 0;
3968 	found->max_extent_size = 0;
3969 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3970 	found->chunk_alloc = 0;
3971 	found->flush = 0;
3972 	init_waitqueue_head(&found->wait);
3973 	INIT_LIST_HEAD(&found->ro_bgs);
3974 	INIT_LIST_HEAD(&found->tickets);
3975 	INIT_LIST_HEAD(&found->priority_tickets);
3976 
3977 	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3978 				    info->space_info_kobj, "%s",
3979 				    alloc_name(found->flags));
3980 	if (ret) {
3981 		kfree(found);
3982 		return ret;
3983 	}
3984 
3985 	*space_info = found;
3986 	list_add_rcu(&found->list, &info->space_info);
3987 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3988 		info->data_sinfo = found;
3989 
3990 	return ret;
3991 }
3992 
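/* Record that chunks with the given profile bits now exist in the fs. */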
3993 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3994 {
3995 	u64 extra_flags = chunk_to_extended(flags) &
3996 				BTRFS_EXTENDED_PROFILE_MASK;
3997 
3998 	write_seqlock(&fs_info->profiles_lock);
3999 	if (flags & BTRFS_BLOCK_GROUP_DATA)
4000 		fs_info->avail_data_alloc_bits |= extra_flags;
4001 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
4002 		fs_info->avail_metadata_alloc_bits |= extra_flags;
4003 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4004 		fs_info->avail_system_alloc_bits |= extra_flags;
4005 	write_sequnlock(&fs_info->profiles_lock);
4006 }
4007 
4008 /*
4009  * returns target flags in extended format or 0 if restripe for this
4010  * chunk_type is not in progress
4011  *
4012  * should be called with either volume_mutex or balance_lock held
4013  */
4014 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4015 {
4016 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4017 	u64 target = 0;
4018 
4019 	if (!bctl)
4020 		return 0;
4021 
4022 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
4023 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4024 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4025 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4026 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4027 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4028 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4029 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4030 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4031 	}
4032 
4033 	return target;
4034 }
4035 
4036 /*
4037  * @flags: available profiles in extended format (see ctree.h)
4038  *
4039  * Returns reduced profile in chunk format.  If profile changing is in
4040  * progress (either running or paused) picks the target profile (if it's
4041  * already available), otherwise falls back to plain reducing.
4042  */
4043 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
4044 {
4045 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
4046 	u64 target;
4047 	u64 raid_type;
4048 	u64 allowed = 0;
4049 
4050 	/*
4051 	 * see if restripe for this chunk_type is in progress, if so
4052 	 * try to reduce to the target profile
4053 	 */
4054 	spin_lock(&root->fs_info->balance_lock);
4055 	target = get_restripe_target(root->fs_info, flags);
4056 	if (target) {
4057 		/* pick target profile only if it's already available */
4058 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4059 			spin_unlock(&root->fs_info->balance_lock);
4060 			return extended_to_chunk(target);
4061 		}
4062 	}
4063 	spin_unlock(&root->fs_info->balance_lock);
4064 
4065 	/* First, mask out the RAID levels which aren't possible */
4066 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4067 		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4068 			allowed |= btrfs_raid_group[raid_type];
4069 	}
4070 	allowed &= flags;
4071 
4072 	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4073 		allowed = BTRFS_BLOCK_GROUP_RAID6;
4074 	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4075 		allowed = BTRFS_BLOCK_GROUP_RAID5;
4076 	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4077 		allowed = BTRFS_BLOCK_GROUP_RAID10;
4078 	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4079 		allowed = BTRFS_BLOCK_GROUP_RAID1;
4080 	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4081 		allowed = BTRFS_BLOCK_GROUP_RAID0;
4082 
4083 	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4084 
4085 	return extended_to_chunk(flags | allowed);
4086 }
4087 
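/*
 * Combine @orig_flags with the profile bits currently available for that
 * block group type and reduce the result to a usable chunk profile.
 */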
4088 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
4089 {
4090 	unsigned seq;
4091 	u64 flags;
4092 
4093 	do {
4094 		flags = orig_flags;
4095 		seq = read_seqbegin(&root->fs_info->profiles_lock);
4096 
4097 		if (flags & BTRFS_BLOCK_GROUP_DATA)
4098 			flags |= root->fs_info->avail_data_alloc_bits;
4099 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4100 			flags |= root->fs_info->avail_system_alloc_bits;
4101 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4102 			flags |= root->fs_info->avail_metadata_alloc_bits;
4103 	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
4104 
4105 	return btrfs_reduce_alloc_profile(root, flags);
4106 }
4107 
4108 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
4109 {
4110 	u64 flags;
4111 	u64 ret;
4112 
4113 	if (data)
4114 		flags = BTRFS_BLOCK_GROUP_DATA;
4115 	else if (root == root->fs_info->chunk_root)
4116 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
4117 	else
4118 		flags = BTRFS_BLOCK_GROUP_METADATA;
4119 
4120 	ret = get_alloc_profile(root, flags);
4121 	return ret;
4122 }
4123 
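/*
 * Reserve @bytes of data space for @inode, allocating a new data chunk or
 * committing the transaction to release pinned space if the data space_info
 * does not have enough room.
 */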
4124 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
4125 {
4126 	struct btrfs_space_info *data_sinfo;
4127 	struct btrfs_root *root = BTRFS_I(inode)->root;
4128 	struct btrfs_fs_info *fs_info = root->fs_info;
4129 	u64 used;
4130 	int ret = 0;
4131 	int need_commit = 2;
4132 	int have_pinned_space;
4133 
4134 	/* make sure bytes are sectorsize aligned */
4135 	bytes = ALIGN(bytes, root->sectorsize);
4136 
4137 	if (btrfs_is_free_space_inode(inode)) {
4138 		need_commit = 0;
4139 		ASSERT(current->journal_info);
4140 	}
4141 
4142 	data_sinfo = fs_info->data_sinfo;
4143 	if (!data_sinfo)
4144 		goto alloc;
4145 
4146 again:
4147 	/* make sure we have enough space to handle the data first */
4148 	spin_lock(&data_sinfo->lock);
4149 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
4150 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
4151 		data_sinfo->bytes_may_use;
4152 
4153 	if (used + bytes > data_sinfo->total_bytes) {
4154 		struct btrfs_trans_handle *trans;
4155 
4156 		/*
4157 		 * if we don't have enough free bytes in this space then we need
4158 		 * to alloc a new chunk.
4159 		 */
4160 		if (!data_sinfo->full) {
4161 			u64 alloc_target;
4162 
4163 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4164 			spin_unlock(&data_sinfo->lock);
4165 alloc:
4166 			alloc_target = btrfs_get_alloc_profile(root, 1);
4167 			/*
4168 			 * It is ugly that we don't call the nolock join
4169 			 * transaction for the free space inode case here.
4170 			 * But it is safe because we only do the data space
4171 			 * reservation for the free space cache in the
4172 			 * transaction context; the common join transaction
4173 			 * just increases the counter of the current
4174 			 * transaction handle and doesn't try to acquire the
4175 			 * trans_lock of the fs.
4176 			 */
4177 			trans = btrfs_join_transaction(root);
4178 			if (IS_ERR(trans))
4179 				return PTR_ERR(trans);
4180 
4181 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4182 					     alloc_target,
4183 					     CHUNK_ALLOC_NO_FORCE);
4184 			btrfs_end_transaction(trans, root);
4185 			if (ret < 0) {
4186 				if (ret != -ENOSPC)
4187 					return ret;
4188 				else {
4189 					have_pinned_space = 1;
4190 					goto commit_trans;
4191 				}
4192 			}
4193 
4194 			if (!data_sinfo)
4195 				data_sinfo = fs_info->data_sinfo;
4196 
4197 			goto again;
4198 		}
4199 
4200 		/*
4201 		 * If we don't have enough pinned space to deal with this
4202 		 * allocation, and no chunk was removed in the current transaction,
4203 		 * don't bother committing the transaction.
4204 		 */
4205 		have_pinned_space = percpu_counter_compare(
4206 			&data_sinfo->total_bytes_pinned,
4207 			used + bytes - data_sinfo->total_bytes);
4208 		spin_unlock(&data_sinfo->lock);
4209 
4210 		/* commit the current transaction and try again */
4211 commit_trans:
4212 		if (need_commit &&
4213 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
4214 			need_commit--;
4215 
4216 			if (need_commit > 0) {
4217 				btrfs_start_delalloc_roots(fs_info, 0, -1);
4218 				btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
4219 			}
4220 
4221 			trans = btrfs_join_transaction(root);
4222 			if (IS_ERR(trans))
4223 				return PTR_ERR(trans);
4224 			if (have_pinned_space >= 0 ||
4225 			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4226 				     &trans->transaction->flags) ||
4227 			    need_commit > 0) {
4228 				ret = btrfs_commit_transaction(trans, root);
4229 				if (ret)
4230 					return ret;
4231 				/*
4232 				 * The cleaner kthread might still be doing iput
4233 				 * operations. Wait for it to finish so that
4234 				 * more space is released.
4235 				 */
4236 				mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
4237 				mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
4238 				goto again;
4239 			} else {
4240 				btrfs_end_transaction(trans, root);
4241 			}
4242 		}
4243 
4244 		trace_btrfs_space_reservation(root->fs_info,
4245 					      "space_info:enospc",
4246 					      data_sinfo->flags, bytes, 1);
4247 		return -ENOSPC;
4248 	}
4249 	data_sinfo->bytes_may_use += bytes;
4250 	trace_btrfs_space_reservation(root->fs_info, "space_info",
4251 				      data_sinfo->flags, bytes, 1);
4252 	spin_unlock(&data_sinfo->lock);
4253 
4254 	return ret;
4255 }
4256 
4257 /*
4258  * New check_data_free_space() with the ability for precise data reservation.
4259  * Will replace the old btrfs_check_data_free_space(), but for the patch split,
4260  * add a new function first and then replace it.
4261  */
4262 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4263 {
4264 	struct btrfs_root *root = BTRFS_I(inode)->root;
4265 	int ret;
4266 
4267 	/* align the range */
4268 	len = round_up(start + len, root->sectorsize) -
4269 	      round_down(start, root->sectorsize);
4270 	start = round_down(start, root->sectorsize);
4271 
4272 	ret = btrfs_alloc_data_chunk_ondemand(inode, len);
4273 	if (ret < 0)
4274 		return ret;
4275 
4276 	/* Use new btrfs_qgroup_reserve_data to reserve precise data space. */
4277 	ret = btrfs_qgroup_reserve_data(inode, start, len);
4278 	if (ret)
4279 		btrfs_free_reserved_data_space_noquota(inode, start, len);
4280 	return ret;
4281 }
4282 
4283 /*
4284  * Called if we need to clear a data reservation for this inode
4285  * Normally in an error case.
4286  *
4287  * This one will *NOT* use the accurate qgroup reserved space API; it is only
4288  * for cases where we can't sleep and are sure it won't affect qgroup reserved
4289  * space, like clear_bit_hook().
4290  */
4291 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4292 					    u64 len)
4293 {
4294 	struct btrfs_root *root = BTRFS_I(inode)->root;
4295 	struct btrfs_space_info *data_sinfo;
4296 
4297 	/* Make sure the range is aligned to sectorsize */
4298 	len = round_up(start + len, root->sectorsize) -
4299 	      round_down(start, root->sectorsize);
4300 	start = round_down(start, root->sectorsize);
4301 
4302 	data_sinfo = root->fs_info->data_sinfo;
4303 	spin_lock(&data_sinfo->lock);
4304 	if (WARN_ON(data_sinfo->bytes_may_use < len))
4305 		data_sinfo->bytes_may_use = 0;
4306 	else
4307 		data_sinfo->bytes_may_use -= len;
4308 	trace_btrfs_space_reservation(root->fs_info, "space_info",
4309 				      data_sinfo->flags, len, 0);
4310 	spin_unlock(&data_sinfo->lock);
4311 }
4312 
4313 /*
4314  * Called if we need to clear a data reservation for this inode
4315  * Normally in an error case.
4316  *
4317  * This one will handle the per-inode data rsv map for accurate reserved
4318  * space framework.
4319  */
4320 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4321 {
4322 	btrfs_free_reserved_data_space_noquota(inode, start, len);
4323 	btrfs_qgroup_free_data(inode, start, len);
4324 }
4325 
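/* Force the next chunk allocation for every metadata space_info. */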
4326 static void force_metadata_allocation(struct btrfs_fs_info *info)
4327 {
4328 	struct list_head *head = &info->space_info;
4329 	struct btrfs_space_info *found;
4330 
4331 	rcu_read_lock();
4332 	list_for_each_entry_rcu(found, head, list) {
4333 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4334 			found->force_alloc = CHUNK_ALLOC_FORCE;
4335 	}
4336 	rcu_read_unlock();
4337 }
4338 
4339 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4340 {
4341 	return (global->size << 1);
4342 }
4343 
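/*
 * Decide whether a new chunk should be allocated for @sinfo at the given
 * @force level, counting the global block reserve as used space for metadata.
 */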
4344 static int should_alloc_chunk(struct btrfs_root *root,
4345 			      struct btrfs_space_info *sinfo, int force)
4346 {
4347 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4348 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4349 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4350 	u64 thresh;
4351 
4352 	if (force == CHUNK_ALLOC_FORCE)
4353 		return 1;
4354 
4355 	/*
4356 	 * We need to take into account the global rsv because for all intents
4357 	 * and purposes it's used space.  Don't worry about locking the
4358 	 * global_rsv, it doesn't change except when the transaction commits.
4359 	 */
4360 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4361 		num_allocated += calc_global_rsv_need_space(global_rsv);
4362 
4363 	/*
4364 	 * in limited mode, we want to have some free space up to
4365 	 * about 1% of the FS size.
4366 	 */
4367 	if (force == CHUNK_ALLOC_LIMITED) {
4368 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4369 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4370 
4371 		if (num_bytes - num_allocated < thresh)
4372 			return 1;
4373 	}
4374 
4375 	if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
4376 		return 0;
4377 	return 1;
4378 }
4379 
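/*
 * Number of devices whose device items may need to be updated when adding or
 * removing a chunk with the given profile @type.
 */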
4380 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4381 {
4382 	u64 num_dev;
4383 
4384 	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4385 		    BTRFS_BLOCK_GROUP_RAID0 |
4386 		    BTRFS_BLOCK_GROUP_RAID5 |
4387 		    BTRFS_BLOCK_GROUP_RAID6))
4388 		num_dev = root->fs_info->fs_devices->rw_devices;
4389 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
4390 		num_dev = 2;
4391 	else
4392 		num_dev = 1;	/* DUP or single */
4393 
4394 	return num_dev;
4395 }
4396 
4397 /*
4398  * Reserve space in the system space info necessary for allocating or removing
4399  * a chunk of the given @type: num_devs device items to update plus one chunk
4400  * item to add or remove.
4401  */
4402 void check_system_chunk(struct btrfs_trans_handle *trans,
4403 			struct btrfs_root *root,
4404 			u64 type)
4405 {
4406 	struct btrfs_space_info *info;
4407 	u64 left;
4408 	u64 thresh;
4409 	int ret = 0;
4410 	u64 num_devs;
4411 
4412 	/*
4413 	 * Needed because we can end up allocating a system chunk and need an
4414 	 * atomic and race-free space reservation in the chunk block reserve.
4415 	 */
4416 	ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4417 
4418 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4419 	spin_lock(&info->lock);
4420 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4421 		info->bytes_reserved - info->bytes_readonly -
4422 		info->bytes_may_use;
4423 	spin_unlock(&info->lock);
4424 
4425 	num_devs = get_profile_num_devs(root, type);
4426 
4427 	/* num_devs device items to update and 1 chunk item to add or remove */
4428 	thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4429 		btrfs_calc_trans_metadata_size(root, 1);
4430 
4431 	if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
4432 		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4433 			left, thresh, type);
4434 		dump_space_info(root->fs_info, info, 0, 0);
4435 	}
4436 
4437 	if (left < thresh) {
4438 		u64 flags;
4439 
4440 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4441 		/*
4442 		 * Ignore failure to create system chunk. We might end up not
4443 		 * needing it, as we might not need to COW all nodes/leafs from
4444 		 * the paths we visit in the chunk tree (they were already COWed
4445 		 * or created in the current transaction for example).
4446 		 */
4447 		ret = btrfs_alloc_chunk(trans, root, flags);
4448 	}
4449 
4450 	if (!ret) {
4451 		ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4452 					  &root->fs_info->chunk_block_rsv,
4453 					  thresh, BTRFS_RESERVE_NO_FLUSH);
4454 		if (!ret)
4455 			trans->chunk_bytes_reserved += thresh;
4456 	}
4457 }
4458 
4459 /*
4460  * If force is CHUNK_ALLOC_FORCE:
4461  *    - return 1 if it successfully allocates a chunk,
4462  *    - return errors including -ENOSPC otherwise.
4463  * If force is NOT CHUNK_ALLOC_FORCE:
4464  *    - return 0 if it doesn't need to allocate a new chunk,
4465  *    - return 1 if it successfully allocates a chunk,
4466  *    - return errors including -ENOSPC otherwise.
4467  */
4468 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4469 			  struct btrfs_root *extent_root, u64 flags, int force)
4470 {
4471 	struct btrfs_space_info *space_info;
4472 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
4473 	int wait_for_alloc = 0;
4474 	int ret = 0;
4475 
4476 	/* Don't re-enter if we're already allocating a chunk */
4477 	if (trans->allocating_chunk)
4478 		return -ENOSPC;
4479 
4480 	space_info = __find_space_info(extent_root->fs_info, flags);
4481 	if (!space_info) {
4482 		ret = update_space_info(extent_root->fs_info, flags,
4483 					0, 0, 0, &space_info);
4484 		BUG_ON(ret); /* -ENOMEM */
4485 	}
4486 	BUG_ON(!space_info); /* Logic error */
4487 
4488 again:
4489 	spin_lock(&space_info->lock);
4490 	if (force < space_info->force_alloc)
4491 		force = space_info->force_alloc;
4492 	if (space_info->full) {
4493 		if (should_alloc_chunk(extent_root, space_info, force))
4494 			ret = -ENOSPC;
4495 		else
4496 			ret = 0;
4497 		spin_unlock(&space_info->lock);
4498 		return ret;
4499 	}
4500 
4501 	if (!should_alloc_chunk(extent_root, space_info, force)) {
4502 		spin_unlock(&space_info->lock);
4503 		return 0;
4504 	} else if (space_info->chunk_alloc) {
4505 		wait_for_alloc = 1;
4506 	} else {
4507 		space_info->chunk_alloc = 1;
4508 	}
4509 
4510 	spin_unlock(&space_info->lock);
4511 
4512 	mutex_lock(&fs_info->chunk_mutex);
4513 
4514 	/*
4515 	 * The chunk_mutex is held throughout the entirety of a chunk
4516 	 * allocation, so once we've acquired the chunk_mutex we know that the
4517 	 * other guy is done and we need to recheck and see if we should
4518 	 * allocate.
4519 	 */
4520 	if (wait_for_alloc) {
4521 		mutex_unlock(&fs_info->chunk_mutex);
4522 		wait_for_alloc = 0;
4523 		goto again;
4524 	}
4525 
4526 	trans->allocating_chunk = true;
4527 
4528 	/*
4529 	 * If we have mixed data/metadata chunks we want to make sure we keep
4530 	 * allocating mixed chunks instead of individual chunks.
4531 	 */
4532 	if (btrfs_mixed_space_info(space_info))
4533 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4534 
4535 	/*
4536 	 * if we're doing a data chunk, go ahead and make sure that
4537 	 * we keep a reasonable number of metadata chunks allocated in the
4538 	 * FS as well.
4539 	 */
4540 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4541 		fs_info->data_chunk_allocations++;
4542 		if (!(fs_info->data_chunk_allocations %
4543 		      fs_info->metadata_ratio))
4544 			force_metadata_allocation(fs_info);
4545 	}
4546 
4547 	/*
4548 	 * Check if we have enough space in SYSTEM chunk because we may need
4549 	 * to update devices.
4550 	 */
4551 	check_system_chunk(trans, extent_root, flags);
4552 
4553 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
4554 	trans->allocating_chunk = false;
4555 
4556 	spin_lock(&space_info->lock);
4557 	if (ret < 0 && ret != -ENOSPC)
4558 		goto out;
4559 	if (ret)
4560 		space_info->full = 1;
4561 	else
4562 		ret = 1;
4563 
4564 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4565 out:
4566 	space_info->chunk_alloc = 0;
4567 	spin_unlock(&space_info->lock);
4568 	mutex_unlock(&fs_info->chunk_mutex);
4569 	/*
4570 	 * When we allocate a new chunk we reserve space in the chunk block
4571 	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4572 	 * add new nodes/leafs to it if we end up needing to do it when
4573 	 * inserting the chunk item and updating device items as part of the
4574 	 * second phase of chunk allocation, performed by
4575 	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4576 	 * large number of new block groups to create in our transaction
4577 	 * handle's new_bgs list to avoid exhausting the chunk block reserve
4578 	 * in extreme cases - like having a single transaction create many new
4579 	 * block groups when starting to write out the free space caches of all
4580 	 * the block groups that were made dirty during the lifetime of the
4581 	 * transaction.
4582 	 */
4583 	if (trans->can_flush_pending_bgs &&
4584 	    trans->chunk_bytes_reserved >= (u64)SZ_2M) {
4585 		btrfs_create_pending_block_groups(trans, extent_root);
4586 		btrfs_trans_release_chunk_metadata(trans);
4587 	}
4588 	return ret;
4589 }
4590 
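/*
 * Return 1 if a metadata reservation of @bytes may be overcommitted at the
 * given flush level, based on the unallocated device space still available
 * for new chunks.
 */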
4591 static int can_overcommit(struct btrfs_root *root,
4592 			  struct btrfs_space_info *space_info, u64 bytes,
4593 			  enum btrfs_reserve_flush_enum flush)
4594 {
4595 	struct btrfs_block_rsv *global_rsv;
4596 	u64 profile;
4597 	u64 space_size;
4598 	u64 avail;
4599 	u64 used;
4600 
4601 	/* Don't overcommit when in mixed mode. */
4602 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4603 		return 0;
4604 
4605 	BUG_ON(root->fs_info == NULL);
4606 	global_rsv = &root->fs_info->global_block_rsv;
4607 	profile = btrfs_get_alloc_profile(root, 0);
4608 	used = space_info->bytes_used + space_info->bytes_reserved +
4609 		space_info->bytes_pinned + space_info->bytes_readonly;
4610 
4611 	/*
4612 	 * We only want to allow over committing if we have lots of actual space
4613 	 * free, but if we don't have enough space to handle the global reserve
4614 	 * space then we could end up having a real enospc problem when trying
4615 	 * to allocate a chunk or some other such important allocation.
4616 	 */
4617 	spin_lock(&global_rsv->lock);
4618 	space_size = calc_global_rsv_need_space(global_rsv);
4619 	spin_unlock(&global_rsv->lock);
4620 	if (used + space_size >= space_info->total_bytes)
4621 		return 0;
4622 
4623 	used += space_info->bytes_may_use;
4624 
4625 	spin_lock(&root->fs_info->free_chunk_lock);
4626 	avail = root->fs_info->free_chunk_space;
4627 	spin_unlock(&root->fs_info->free_chunk_lock);
4628 
4629 	/*
4630 	 * If we have dup, raid1 or raid10 then only half of the free
4631 	 * space is actually usable.  For raid56, the space info used
4632 	 * doesn't include the parity drive, so we don't have to
4633 	 * change the math
4634 	 */
4635 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
4636 		       BTRFS_BLOCK_GROUP_RAID1 |
4637 		       BTRFS_BLOCK_GROUP_RAID10))
4638 		avail >>= 1;
4639 
4640 	/*
4641 	 * If we aren't flushing all things, let us overcommit up to
4642 	 * half of the space. If we can flush, don't let us overcommit
4643 	 * too much, let it overcommit up to 1/8 of the space.
4644 	 */
4645 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
4646 		avail >>= 3;
4647 	else
4648 		avail >>= 1;
4649 
4650 	if (used + bytes < space_info->total_bytes + avail)
4651 		return 1;
4652 	return 0;
4653 }
4654 
4655 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4656 					 unsigned long nr_pages, int nr_items)
4657 {
4658 	struct super_block *sb = root->fs_info->sb;
4659 
4660 	if (down_read_trylock(&sb->s_umount)) {
4661 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4662 		up_read(&sb->s_umount);
4663 	} else {
4664 		/*
4665 		 * We needn't worry about the filesystem going from r/w to r/o
4666 		 * even though we don't acquire the ->s_umount mutex, because
4667 		 * the filesystem should guarantee that the delalloc inode list
4668 		 * is empty after the filesystem becomes read-only (all dirty
4669 		 * pages are written to the disk).
4670 		 */
4671 		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4672 		if (!current->journal_info)
4673 			btrfs_wait_ordered_roots(root->fs_info, nr_items,
4674 						 0, (u64)-1);
4675 	}
4676 }
4677 
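/* Convert @to_reclaim bytes into a number of metadata items to flush. */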
4678 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4679 {
4680 	u64 bytes;
4681 	int nr;
4682 
4683 	bytes = btrfs_calc_trans_metadata_size(root, 1);
4684 	nr = (int)div64_u64(to_reclaim, bytes);
4685 	if (!nr)
4686 		nr = 1;
4687 	return nr;
4688 }
4689 
4690 #define EXTENT_SIZE_PER_ITEM	SZ_256K
4691 
4692 /*
4693  * shrink metadata reservation for delalloc
4694  */
4695 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4696 			    bool wait_ordered)
4697 {
4698 	struct btrfs_block_rsv *block_rsv;
4699 	struct btrfs_space_info *space_info;
4700 	struct btrfs_trans_handle *trans;
4701 	u64 delalloc_bytes;
4702 	u64 max_reclaim;
4703 	long time_left;
4704 	unsigned long nr_pages;
4705 	int loops;
4706 	int items;
4707 	enum btrfs_reserve_flush_enum flush;
4708 
4709 	/* Calc the number of the pages we need flush for space reservation */
4710 	items = calc_reclaim_items_nr(root, to_reclaim);
4711 	to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
4712 
4713 	trans = (struct btrfs_trans_handle *)current->journal_info;
4714 	block_rsv = &root->fs_info->delalloc_block_rsv;
4715 	space_info = block_rsv->space_info;
4716 
4717 	delalloc_bytes = percpu_counter_sum_positive(
4718 						&root->fs_info->delalloc_bytes);
4719 	if (delalloc_bytes == 0) {
4720 		if (trans)
4721 			return;
4722 		if (wait_ordered)
4723 			btrfs_wait_ordered_roots(root->fs_info, items,
4724 						 0, (u64)-1);
4725 		return;
4726 	}
4727 
4728 	loops = 0;
4729 	while (delalloc_bytes && loops < 3) {
4730 		max_reclaim = min(delalloc_bytes, to_reclaim);
4731 		nr_pages = max_reclaim >> PAGE_SHIFT;
4732 		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4733 		/*
4734 		 * We need to wait for the async pages to actually start before
4735 		 * we do anything.
4736 		 */
4737 		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4738 		if (!max_reclaim)
4739 			goto skip_async;
4740 
4741 		if (max_reclaim <= nr_pages)
4742 			max_reclaim = 0;
4743 		else
4744 			max_reclaim -= nr_pages;
4745 
4746 		wait_event(root->fs_info->async_submit_wait,
4747 			   atomic_read(&root->fs_info->async_delalloc_pages) <=
4748 			   (int)max_reclaim);
4749 skip_async:
4750 		if (!trans)
4751 			flush = BTRFS_RESERVE_FLUSH_ALL;
4752 		else
4753 			flush = BTRFS_RESERVE_NO_FLUSH;
4754 		spin_lock(&space_info->lock);
4755 		if (can_overcommit(root, space_info, orig, flush)) {
4756 			spin_unlock(&space_info->lock);
4757 			break;
4758 		}
4759 		if (list_empty(&space_info->tickets) &&
4760 		    list_empty(&space_info->priority_tickets)) {
4761 			spin_unlock(&space_info->lock);
4762 			break;
4763 		}
4764 		spin_unlock(&space_info->lock);
4765 
4766 		loops++;
4767 		if (wait_ordered && !trans) {
4768 			btrfs_wait_ordered_roots(root->fs_info, items,
4769 						 0, (u64)-1);
4770 		} else {
4771 			time_left = schedule_timeout_killable(1);
4772 			if (time_left)
4773 				break;
4774 		}
4775 		delalloc_bytes = percpu_counter_sum_positive(
4776 						&root->fs_info->delalloc_bytes);
4777 	}
4778 }
4779 
4780 /**
4781  * may_commit_transaction - possibly commit the transaction if it's OK to
4782  * @root - the root we're allocating for
4783  * @bytes - the number of bytes we want to reserve
4784  * @force - force the commit
4785  *
4786  * This will check to make sure that committing the transaction will actually
4787  * get us somewhere and then commit the transaction if it does.  Otherwise it
4788  * will return -ENOSPC.
4789  */
4790 static int may_commit_transaction(struct btrfs_root *root,
4791 				  struct btrfs_space_info *space_info,
4792 				  u64 bytes, int force)
4793 {
4794 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4795 	struct btrfs_trans_handle *trans;
4796 
4797 	trans = (struct btrfs_trans_handle *)current->journal_info;
4798 	if (trans)
4799 		return -EAGAIN;
4800 
4801 	if (force)
4802 		goto commit;
4803 
4804 	/* See if there is enough pinned space to make this reservation */
4805 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4806 				   bytes) >= 0)
4807 		goto commit;
4808 
4809 	/*
4810 	 * See if there is some space in the delayed insertion reservation for
4811 	 * this reservation.
4812 	 */
4813 	if (space_info != delayed_rsv->space_info)
4814 		return -ENOSPC;
4815 
4816 	spin_lock(&delayed_rsv->lock);
4817 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4818 				   bytes - delayed_rsv->size) >= 0) {
4819 		spin_unlock(&delayed_rsv->lock);
4820 		return -ENOSPC;
4821 	}
4822 	spin_unlock(&delayed_rsv->lock);
4823 
4824 commit:
4825 	trans = btrfs_join_transaction(root);
4826 	if (IS_ERR(trans))
4827 		return -ENOSPC;
4828 
4829 	return btrfs_commit_transaction(trans, root);
4830 }
4831 
4832 struct reserve_ticket {
4833 	u64 bytes;
4834 	int error;
4835 	struct list_head list;
4836 	wait_queue_head_t wait;
4837 };
4838 
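/*
 * Run a single flush state (delayed items, delalloc, chunk allocation or
 * transaction commit) to try to free up @num_bytes of metadata space.
 */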
4839 static int flush_space(struct btrfs_root *root,
4840 		       struct btrfs_space_info *space_info, u64 num_bytes,
4841 		       u64 orig_bytes, int state)
4842 {
4843 	struct btrfs_trans_handle *trans;
4844 	int nr;
4845 	int ret = 0;
4846 
4847 	switch (state) {
4848 	case FLUSH_DELAYED_ITEMS_NR:
4849 	case FLUSH_DELAYED_ITEMS:
4850 		if (state == FLUSH_DELAYED_ITEMS_NR)
4851 			nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4852 		else
4853 			nr = -1;
4854 
4855 		trans = btrfs_join_transaction(root);
4856 		if (IS_ERR(trans)) {
4857 			ret = PTR_ERR(trans);
4858 			break;
4859 		}
4860 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
4861 		btrfs_end_transaction(trans, root);
4862 		break;
4863 	case FLUSH_DELALLOC:
4864 	case FLUSH_DELALLOC_WAIT:
4865 		shrink_delalloc(root, num_bytes * 2, orig_bytes,
4866 				state == FLUSH_DELALLOC_WAIT);
4867 		break;
4868 	case ALLOC_CHUNK:
4869 		trans = btrfs_join_transaction(root);
4870 		if (IS_ERR(trans)) {
4871 			ret = PTR_ERR(trans);
4872 			break;
4873 		}
4874 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4875 				     btrfs_get_alloc_profile(root, 0),
4876 				     CHUNK_ALLOC_NO_FORCE);
4877 		btrfs_end_transaction(trans, root);
4878 		if (ret > 0 || ret == -ENOSPC)
4879 			ret = 0;
4880 		break;
4881 	case COMMIT_TRANS:
4882 		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4883 		break;
4884 	default:
4885 		ret = -ENOSPC;
4886 		break;
4887 	}
4888 
4889 	trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
4890 				orig_bytes, state, ret);
4891 	return ret;
4892 }
4893 
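/*
 * How much metadata space we should try to reclaim: the sum of all pending
 * ticket reservations, or, if there are no tickets, an estimate based on how
 * close the space_info is to being full.
 */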
4894 static inline u64
4895 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4896 				 struct btrfs_space_info *space_info)
4897 {
4898 	struct reserve_ticket *ticket;
4899 	u64 used;
4900 	u64 expected;
4901 	u64 to_reclaim = 0;
4902 
4903 	list_for_each_entry(ticket, &space_info->tickets, list)
4904 		to_reclaim += ticket->bytes;
4905 	list_for_each_entry(ticket, &space_info->priority_tickets, list)
4906 		to_reclaim += ticket->bytes;
4907 	if (to_reclaim)
4908 		return to_reclaim;
4909 
4910 	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4911 	if (can_overcommit(root, space_info, to_reclaim,
4912 			   BTRFS_RESERVE_FLUSH_ALL))
4913 		return 0;
4914 
4915 	used = space_info->bytes_used + space_info->bytes_reserved +
4916 	       space_info->bytes_pinned + space_info->bytes_readonly +
4917 	       space_info->bytes_may_use;
4918 	if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
4919 		expected = div_factor_fine(space_info->total_bytes, 95);
4920 	else
4921 		expected = div_factor_fine(space_info->total_bytes, 90);
4922 
4923 	if (used > expected)
4924 		to_reclaim = used - expected;
4925 	else
4926 		to_reclaim = 0;
4927 	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4928 				     space_info->bytes_reserved);
4929 	return to_reclaim;
4930 }
4931 
4932 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4933 					struct btrfs_root *root, u64 used)
4934 {
4935 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4936 
4937 	/* If we're just plain full then async reclaim just slows us down. */
4938 	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4939 		return 0;
4940 
4941 	if (!btrfs_calc_reclaim_metadata_size(root, space_info))
4942 		return 0;
4943 
4944 	return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
4945 		!test_bit(BTRFS_FS_STATE_REMOUNTING,
4946 			  &root->fs_info->fs_state));
4947 }
4948 
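/* Fail every pending ticket on @head with -ENOSPC and wake its waiter. */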
4949 static void wake_all_tickets(struct list_head *head)
4950 {
4951 	struct reserve_ticket *ticket;
4952 
4953 	while (!list_empty(head)) {
4954 		ticket = list_first_entry(head, struct reserve_ticket, list);
4955 		list_del_init(&ticket->list);
4956 		ticket->error = -ENOSPC;
4957 		wake_up(&ticket->wait);
4958 	}
4959 }
4960 
4961 /*
4962  * This is for normal flushers, we can wait all goddamned day if we want to.  We
4963  * will loop and continuously try to flush as long as we are making progress.
4964  * We count progress as clearing off tickets each time we have to loop.
4965  */
4966 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4967 {
4968 	struct btrfs_fs_info *fs_info;
4969 	struct btrfs_space_info *space_info;
4970 	u64 to_reclaim;
4971 	int flush_state;
4972 	int commit_cycles = 0;
4973 	u64 last_tickets_id;
4974 
4975 	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4976 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4977 
4978 	spin_lock(&space_info->lock);
4979 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4980 						      space_info);
4981 	if (!to_reclaim) {
4982 		space_info->flush = 0;
4983 		spin_unlock(&space_info->lock);
4984 		return;
4985 	}
4986 	last_tickets_id = space_info->tickets_id;
4987 	spin_unlock(&space_info->lock);
4988 
4989 	flush_state = FLUSH_DELAYED_ITEMS_NR;
4990 	do {
4991 		struct reserve_ticket *ticket;
4992 		int ret;
4993 
4994 		ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
4995 			    to_reclaim, flush_state);
4996 		spin_lock(&space_info->lock);
4997 		if (list_empty(&space_info->tickets)) {
4998 			space_info->flush = 0;
4999 			spin_unlock(&space_info->lock);
5000 			return;
5001 		}
5002 		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5003 							      space_info);
5004 		ticket = list_first_entry(&space_info->tickets,
5005 					  struct reserve_ticket, list);
5006 		if (last_tickets_id == space_info->tickets_id) {
5007 			flush_state++;
5008 		} else {
5009 			last_tickets_id = space_info->tickets_id;
5010 			flush_state = FLUSH_DELAYED_ITEMS_NR;
5011 			if (commit_cycles)
5012 				commit_cycles--;
5013 		}
5014 
5015 		if (flush_state > COMMIT_TRANS) {
5016 			commit_cycles++;
5017 			if (commit_cycles > 2) {
5018 				wake_all_tickets(&space_info->tickets);
5019 				space_info->flush = 0;
5020 			} else {
5021 				flush_state = FLUSH_DELAYED_ITEMS_NR;
5022 			}
5023 		}
5024 		spin_unlock(&space_info->lock);
5025 	} while (flush_state <= COMMIT_TRANS);
5026 }
5027 
5028 void btrfs_init_async_reclaim_work(struct work_struct *work)
5029 {
5030 	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5031 }
5032 
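/*
 * Flush space on behalf of a single priority ticket, skipping the delalloc
 * states that a priority flusher can't wait on without deadlocking.
 */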
5033 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5034 					    struct btrfs_space_info *space_info,
5035 					    struct reserve_ticket *ticket)
5036 {
5037 	u64 to_reclaim;
5038 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
5039 
5040 	spin_lock(&space_info->lock);
5041 	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5042 						      space_info);
5043 	if (!to_reclaim) {
5044 		spin_unlock(&space_info->lock);
5045 		return;
5046 	}
5047 	spin_unlock(&space_info->lock);
5048 
5049 	do {
5050 		flush_space(fs_info->fs_root, space_info, to_reclaim,
5051 			    to_reclaim, flush_state);
5052 		flush_state++;
5053 		spin_lock(&space_info->lock);
5054 		if (ticket->bytes == 0) {
5055 			spin_unlock(&space_info->lock);
5056 			return;
5057 		}
5058 		spin_unlock(&space_info->lock);
5059 
5060 		/*
5061 		 * Priority flushers can't wait on delalloc without
5062 		 * deadlocking.
5063 		 */
5064 		if (flush_state == FLUSH_DELALLOC ||
5065 		    flush_state == FLUSH_DELALLOC_WAIT)
5066 			flush_state = ALLOC_CHUNK;
5067 	} while (flush_state < COMMIT_TRANS);
5068 }
5069 
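/*
 * Sleep until the ticket is either satisfied (bytes == 0) or failed, then
 * release any partially granted bytes back to the space_info.
 */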
5070 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5071 			       struct btrfs_space_info *space_info,
5072 			       struct reserve_ticket *ticket, u64 orig_bytes)
5073 
5074 {
5075 	DEFINE_WAIT(wait);
5076 	int ret = 0;
5077 
5078 	spin_lock(&space_info->lock);
5079 	while (ticket->bytes > 0 && ticket->error == 0) {
5080 		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5081 		if (ret) {
5082 			ret = -EINTR;
5083 			break;
5084 		}
5085 		spin_unlock(&space_info->lock);
5086 
5087 		schedule();
5088 
5089 		finish_wait(&ticket->wait, &wait);
5090 		spin_lock(&space_info->lock);
5091 	}
5092 	if (!ret)
5093 		ret = ticket->error;
5094 	if (!list_empty(&ticket->list))
5095 		list_del_init(&ticket->list);
5096 	if (ticket->bytes && ticket->bytes < orig_bytes) {
5097 		u64 num_bytes = orig_bytes - ticket->bytes;
5098 		space_info->bytes_may_use -= num_bytes;
5099 		trace_btrfs_space_reservation(fs_info, "space_info",
5100 					      space_info->flags, num_bytes, 0);
5101 	}
5102 	spin_unlock(&space_info->lock);
5103 
5104 	return ret;
5105 }
5106 
5107 /**
5108  * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
5109  * @root - the root we're allocating for
5110  * @space_info - the space info we want to allocate from
5111  * @orig_bytes - the number of bytes we want
5112  * @flush - whether or not we can flush to make our reservation
5113  *
5114  * This will reserve orig_bytes number of bytes from the given space info.
5115  * If there is not enough space it will make an attempt to
5116  * flush out space to make room.  It will do this by flushing delalloc if
5117  * possible or committing the transaction.  If flush is 0 then no attempts to
5118  * regain reservations will be made and this will fail if there is not enough
5119  * space already.
5120  */
5121 static int __reserve_metadata_bytes(struct btrfs_root *root,
5122 				    struct btrfs_space_info *space_info,
5123 				    u64 orig_bytes,
5124 				    enum btrfs_reserve_flush_enum flush)
5125 {
5126 	struct reserve_ticket ticket;
5127 	u64 used;
5128 	int ret = 0;
5129 
5130 	ASSERT(orig_bytes);
5131 	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5132 
5133 	spin_lock(&space_info->lock);
5134 	ret = -ENOSPC;
5135 	used = space_info->bytes_used + space_info->bytes_reserved +
5136 		space_info->bytes_pinned + space_info->bytes_readonly +
5137 		space_info->bytes_may_use;
5138 
5139 	/*
5140 	 * If we have enough space then hooray, make our reservation and carry
5141 	 * on.  If not, see if we can overcommit, and if we can, hooray, carry on.
5142 	 * If not, things get more complicated.
5143 	 */
5144 	if (used + orig_bytes <= space_info->total_bytes) {
5145 		space_info->bytes_may_use += orig_bytes;
5146 		trace_btrfs_space_reservation(root->fs_info, "space_info",
5147 					      space_info->flags, orig_bytes,
5148 					      1);
5149 		ret = 0;
5150 	} else if (can_overcommit(root, space_info, orig_bytes, flush)) {
5151 		space_info->bytes_may_use += orig_bytes;
5152 		trace_btrfs_space_reservation(root->fs_info, "space_info",
5153 					      space_info->flags, orig_bytes,
5154 					      1);
5155 		ret = 0;
5156 	}
5157 
5158 	/*
5159 	 * If we couldn't make a reservation then set up our reservation ticket
5160 	 * and kick the async worker if it's not already running.
5161 	 *
5162 	 * If we are a priority flusher then we just need to add our ticket to
5163 	 * the list and we will do our own flushing further down.
5164 	 */
5165 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5166 		ticket.bytes = orig_bytes;
5167 		ticket.error = 0;
5168 		init_waitqueue_head(&ticket.wait);
5169 		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5170 			list_add_tail(&ticket.list, &space_info->tickets);
5171 			if (!space_info->flush) {
5172 				space_info->flush = 1;
5173 				trace_btrfs_trigger_flush(root->fs_info,
5174 							  space_info->flags,
5175 							  orig_bytes, flush,
5176 							  "enospc");
5177 				queue_work(system_unbound_wq,
5178 					   &root->fs_info->async_reclaim_work);
5179 			}
5180 		} else {
5181 			list_add_tail(&ticket.list,
5182 				      &space_info->priority_tickets);
5183 		}
5184 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5185 		used += orig_bytes;
5186 		/*
5187 		 * We will do the space reservation dance during log replay,
5188 		 * which means we won't have fs_info->fs_root set, so don't do
5189 		 * the async reclaim as we will panic.
5190 		 */
5191 		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags) &&
5192 		    need_do_async_reclaim(space_info, root, used) &&
5193 		    !work_busy(&root->fs_info->async_reclaim_work)) {
5194 			trace_btrfs_trigger_flush(root->fs_info,
5195 						  space_info->flags,
5196 						  orig_bytes, flush,
5197 						  "preempt");
5198 			queue_work(system_unbound_wq,
5199 				   &root->fs_info->async_reclaim_work);
5200 		}
5201 	}
5202 	spin_unlock(&space_info->lock);
5203 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5204 		return ret;
5205 
5206 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
5207 		return wait_reserve_ticket(root->fs_info, space_info, &ticket,
5208 					   orig_bytes);
5209 
5210 	ret = 0;
5211 	priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
5212 	spin_lock(&space_info->lock);
5213 	if (ticket.bytes) {
5214 		if (ticket.bytes < orig_bytes) {
5215 			u64 num_bytes = orig_bytes - ticket.bytes;
5216 			space_info->bytes_may_use -= num_bytes;
5217 			trace_btrfs_space_reservation(root->fs_info,
5218 					"space_info", space_info->flags,
5219 					num_bytes, 0);
5220 
5221 		}
5222 		list_del_init(&ticket.list);
5223 		ret = -ENOSPC;
5224 	}
5225 	spin_unlock(&space_info->lock);
5226 	ASSERT(list_empty(&ticket.list));
5227 	return ret;
5228 }
5229 
5230 /**
5231  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5232  * @root - the root we're allocating for
5233  * @block_rsv - the block_rsv we're allocating for
5234  * @orig_bytes - the number of bytes we want
5235  * @flush - whether or not we can flush to make our reservation
5236  *
5237  * This will reserve orig_bytes number of bytes from the space info associated
5238  * with the block_rsv.  If there is not enough space it will make an attempt to
5239  * flush out space to make room.  It will do this by flushing delalloc if
5240  * possible or committing the transaction.  If flush is 0 then no attempts to
5241  * regain reservations will be made and this will fail if there is not enough
5242  * space already.
5243  */
5244 static int reserve_metadata_bytes(struct btrfs_root *root,
5245 				  struct btrfs_block_rsv *block_rsv,
5246 				  u64 orig_bytes,
5247 				  enum btrfs_reserve_flush_enum flush)
5248 {
5249 	int ret;
5250 
5251 	ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
5252 				       flush);
5253 	if (ret == -ENOSPC &&
5254 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5255 		struct btrfs_block_rsv *global_rsv =
5256 			&root->fs_info->global_block_rsv;
5257 
5258 		if (block_rsv != global_rsv &&
5259 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
5260 			ret = 0;
5261 	}
5262 	if (ret == -ENOSPC)
5263 		trace_btrfs_space_reservation(root->fs_info,
5264 					      "space_info:enospc",
5265 					      block_rsv->space_info->flags,
5266 					      orig_bytes, 1);
5267 	return ret;
5268 }
5269 
5270 static struct btrfs_block_rsv *get_block_rsv(
5271 					const struct btrfs_trans_handle *trans,
5272 					const struct btrfs_root *root)
5273 {
5274 	struct btrfs_block_rsv *block_rsv = NULL;
5275 
5276 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5277 	    (root == root->fs_info->csum_root && trans->adding_csums) ||
5278 	     (root == root->fs_info->uuid_root))
5279 		block_rsv = trans->block_rsv;
5280 
5281 	if (!block_rsv)
5282 		block_rsv = root->block_rsv;
5283 
5284 	if (!block_rsv)
5285 		block_rsv = &root->fs_info->empty_block_rsv;
5286 
5287 	return block_rsv;
5288 }
5289 
5290 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5291 			       u64 num_bytes)
5292 {
5293 	int ret = -ENOSPC;
5294 	spin_lock(&block_rsv->lock);
5295 	if (block_rsv->reserved >= num_bytes) {
5296 		block_rsv->reserved -= num_bytes;
5297 		if (block_rsv->reserved < block_rsv->size)
5298 			block_rsv->full = 0;
5299 		ret = 0;
5300 	}
5301 	spin_unlock(&block_rsv->lock);
5302 	return ret;
5303 }
5304 
5305 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5306 				u64 num_bytes, int update_size)
5307 {
5308 	spin_lock(&block_rsv->lock);
5309 	block_rsv->reserved += num_bytes;
5310 	if (update_size)
5311 		block_rsv->size += num_bytes;
5312 	else if (block_rsv->reserved >= block_rsv->size)
5313 		block_rsv->full = 1;
5314 	spin_unlock(&block_rsv->lock);
5315 }
5316 
5317 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5318 			     struct btrfs_block_rsv *dest, u64 num_bytes,
5319 			     int min_factor)
5320 {
5321 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5322 	u64 min_bytes;
5323 
5324 	if (global_rsv->space_info != dest->space_info)
5325 		return -ENOSPC;
5326 
5327 	spin_lock(&global_rsv->lock);
5328 	min_bytes = div_factor(global_rsv->size, min_factor);
5329 	if (global_rsv->reserved < min_bytes + num_bytes) {
5330 		spin_unlock(&global_rsv->lock);
5331 		return -ENOSPC;
5332 	}
5333 	global_rsv->reserved -= num_bytes;
5334 	if (global_rsv->reserved < global_rsv->size)
5335 		global_rsv->full = 0;
5336 	spin_unlock(&global_rsv->lock);
5337 
5338 	block_rsv_add_bytes(dest, num_bytes, 1);
5339 	return 0;
5340 }
5341 
5342 /*
5343  * This is for space we already have accounted in space_info->bytes_may_use, so
5344  * basically when we're returning space from block_rsvs.
5345  */
5346 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5347 				     struct btrfs_space_info *space_info,
5348 				     u64 num_bytes)
5349 {
5350 	struct reserve_ticket *ticket;
5351 	struct list_head *head;
5352 	u64 used;
5353 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5354 	bool check_overcommit = false;
5355 
5356 	spin_lock(&space_info->lock);
5357 	head = &space_info->priority_tickets;
5358 
5359 	/*
5360 	 * If we are over our limit then we need to check and see if we can
5361 	 * overcommit, and if we can't then we just need to free up our space
5362 	 * and not satisfy any requests.
5363 	 */
5364 	used = space_info->bytes_used + space_info->bytes_reserved +
5365 		space_info->bytes_pinned + space_info->bytes_readonly +
5366 		space_info->bytes_may_use;
5367 	if (used - num_bytes >= space_info->total_bytes)
5368 		check_overcommit = true;
5369 again:
5370 	while (!list_empty(head) && num_bytes) {
5371 		ticket = list_first_entry(head, struct reserve_ticket,
5372 					  list);
5373 		/*
5374 		 * We use 0 bytes because this space is already reserved, so
5375 		 * adding the ticket space would be a double count.
5376 		 */
5377 		if (check_overcommit &&
5378 		    !can_overcommit(fs_info->extent_root, space_info, 0,
5379 				    flush))
5380 			break;
5381 		if (num_bytes >= ticket->bytes) {
5382 			list_del_init(&ticket->list);
5383 			num_bytes -= ticket->bytes;
5384 			ticket->bytes = 0;
5385 			space_info->tickets_id++;
5386 			wake_up(&ticket->wait);
5387 		} else {
5388 			ticket->bytes -= num_bytes;
5389 			num_bytes = 0;
5390 		}
5391 	}
5392 
5393 	if (num_bytes && head == &space_info->priority_tickets) {
5394 		head = &space_info->tickets;
5395 		flush = BTRFS_RESERVE_FLUSH_ALL;
5396 		goto again;
5397 	}
5398 	space_info->bytes_may_use -= num_bytes;
5399 	trace_btrfs_space_reservation(fs_info, "space_info",
5400 				      space_info->flags, num_bytes, 0);
5401 	spin_unlock(&space_info->lock);
5402 }
5403 
5404 /*
5405  * This is for newly allocated space that isn't accounted in
5406  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5407  * we use this helper.
5408  */
5409 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5410 				     struct btrfs_space_info *space_info,
5411 				     u64 num_bytes)
5412 {
5413 	struct reserve_ticket *ticket;
5414 	struct list_head *head = &space_info->priority_tickets;
5415 
5416 again:
5417 	while (!list_empty(head) && num_bytes) {
5418 		ticket = list_first_entry(head, struct reserve_ticket,
5419 					  list);
5420 		if (num_bytes >= ticket->bytes) {
5421 			trace_btrfs_space_reservation(fs_info, "space_info",
5422 						      space_info->flags,
5423 						      ticket->bytes, 1);
5424 			list_del_init(&ticket->list);
5425 			num_bytes -= ticket->bytes;
5426 			space_info->bytes_may_use += ticket->bytes;
5427 			ticket->bytes = 0;
5428 			space_info->tickets_id++;
5429 			wake_up(&ticket->wait);
5430 		} else {
5431 			trace_btrfs_space_reservation(fs_info, "space_info",
5432 						      space_info->flags,
5433 						      num_bytes, 1);
5434 			space_info->bytes_may_use += num_bytes;
5435 			ticket->bytes -= num_bytes;
5436 			num_bytes = 0;
5437 		}
5438 	}
5439 
5440 	if (num_bytes && head == &space_info->priority_tickets) {
5441 		head = &space_info->tickets;
5442 		goto again;
5443 	}
5444 }
5445 
5446 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5447 				    struct btrfs_block_rsv *block_rsv,
5448 				    struct btrfs_block_rsv *dest, u64 num_bytes)
5449 {
5450 	struct btrfs_space_info *space_info = block_rsv->space_info;
5451 
5452 	spin_lock(&block_rsv->lock);
5453 	if (num_bytes == (u64)-1)
5454 		num_bytes = block_rsv->size;
5455 	block_rsv->size -= num_bytes;
5456 	if (block_rsv->reserved >= block_rsv->size) {
5457 		num_bytes = block_rsv->reserved - block_rsv->size;
5458 		block_rsv->reserved = block_rsv->size;
5459 		block_rsv->full = 1;
5460 	} else {
5461 		num_bytes = 0;
5462 	}
5463 	spin_unlock(&block_rsv->lock);
5464 
5465 	if (num_bytes > 0) {
5466 		if (dest) {
5467 			spin_lock(&dest->lock);
5468 			if (!dest->full) {
5469 				u64 bytes_to_add;
5470 
5471 				bytes_to_add = dest->size - dest->reserved;
5472 				bytes_to_add = min(num_bytes, bytes_to_add);
5473 				dest->reserved += bytes_to_add;
5474 				if (dest->reserved >= dest->size)
5475 					dest->full = 1;
5476 				num_bytes -= bytes_to_add;
5477 			}
5478 			spin_unlock(&dest->lock);
5479 		}
5480 		if (num_bytes)
5481 			space_info_add_old_bytes(fs_info, space_info,
5482 						 num_bytes);
5483 	}
5484 }
5485 
5486 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5487 			    struct btrfs_block_rsv *dst, u64 num_bytes,
5488 			    int update_size)
5489 {
5490 	int ret;
5491 
5492 	ret = block_rsv_use_bytes(src, num_bytes);
5493 	if (ret)
5494 		return ret;
5495 
5496 	block_rsv_add_bytes(dst, num_bytes, update_size);
5497 	return 0;
5498 }
5499 
5500 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5501 {
5502 	memset(rsv, 0, sizeof(*rsv));
5503 	spin_lock_init(&rsv->lock);
5504 	rsv->type = type;
5505 }
5506 
5507 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5508 					      unsigned short type)
5509 {
5510 	struct btrfs_block_rsv *block_rsv;
5511 	struct btrfs_fs_info *fs_info = root->fs_info;
5512 
5513 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5514 	if (!block_rsv)
5515 		return NULL;
5516 
5517 	btrfs_init_block_rsv(block_rsv, type);
5518 	block_rsv->space_info = __find_space_info(fs_info,
5519 						  BTRFS_BLOCK_GROUP_METADATA);
5520 	return block_rsv;
5521 }
5522 
5523 void btrfs_free_block_rsv(struct btrfs_root *root,
5524 			  struct btrfs_block_rsv *rsv)
5525 {
5526 	if (!rsv)
5527 		return;
5528 	btrfs_block_rsv_release(root, rsv, (u64)-1);
5529 	kfree(rsv);
5530 }
5531 
5532 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5533 {
5534 	kfree(rsv);
5535 }
5536 
5537 int btrfs_block_rsv_add(struct btrfs_root *root,
5538 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5539 			enum btrfs_reserve_flush_enum flush)
5540 {
5541 	int ret;
5542 
5543 	if (num_bytes == 0)
5544 		return 0;
5545 
5546 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5547 	if (!ret) {
5548 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
5549 		return 0;
5550 	}
5551 
5552 	return ret;
5553 }
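
/*
 * Example usage (an illustrative sketch only; BTRFS_BLOCK_RSV_TEMP is assumed
 * as the rsv type here, use whatever type the caller actually needs):
 *
 *	struct btrfs_block_rsv *rsv;
 *	int ret;
 *
 *	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 *				  BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret) {
 *		... consume the reservation ...
 *	}
 *	btrfs_free_block_rsv(root, rsv);
 *
 * Nothing is reserved when btrfs_block_rsv_add() fails, and
 * btrfs_free_block_rsv() hands any remaining reservation back via
 * btrfs_block_rsv_release(), so no explicit release is needed here.
 */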
5554 
5555 int btrfs_block_rsv_check(struct btrfs_root *root,
5556 			  struct btrfs_block_rsv *block_rsv, int min_factor)
5557 {
5558 	u64 num_bytes = 0;
5559 	int ret = -ENOSPC;
5560 
5561 	if (!block_rsv)
5562 		return 0;
5563 
5564 	spin_lock(&block_rsv->lock);
5565 	num_bytes = div_factor(block_rsv->size, min_factor);
5566 	if (block_rsv->reserved >= num_bytes)
5567 		ret = 0;
5568 	spin_unlock(&block_rsv->lock);
5569 
5570 	return ret;
5571 }
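
/*
 * Example: assuming div_factor(x, f) evaluates to roughly x * f / 10, a call
 * like
 *
 *	if (btrfs_block_rsv_check(root, rsv, 5))
 *		... refill the rsv or force a commit ...
 *
 * only takes the branch once the rsv has dropped below half of its target
 * ->size; the check itself never reserves anything.
 */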
5572 
5573 int btrfs_block_rsv_refill(struct btrfs_root *root,
5574 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5575 			   enum btrfs_reserve_flush_enum flush)
5576 {
5577 	u64 num_bytes = 0;
5578 	int ret = -ENOSPC;
5579 
5580 	if (!block_rsv)
5581 		return 0;
5582 
5583 	spin_lock(&block_rsv->lock);
5584 	num_bytes = min_reserved;
5585 	if (block_rsv->reserved >= num_bytes)
5586 		ret = 0;
5587 	else
5588 		num_bytes -= block_rsv->reserved;
5589 	spin_unlock(&block_rsv->lock);
5590 
5591 	if (!ret)
5592 		return 0;
5593 
5594 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5595 	if (!ret) {
5596 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
5597 		return 0;
5598 	}
5599 
5600 	return ret;
5601 }
5602 
5603 void btrfs_block_rsv_release(struct btrfs_root *root,
5604 			     struct btrfs_block_rsv *block_rsv,
5605 			     u64 num_bytes)
5606 {
5607 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5608 	if (global_rsv == block_rsv ||
5609 	    block_rsv->space_info != global_rsv->space_info)
5610 		global_rsv = NULL;
5611 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5612 				num_bytes);
5613 }
5614 
5615 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5616 {
5617 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5618 	struct btrfs_space_info *sinfo = block_rsv->space_info;
5619 	u64 num_bytes;
5620 
5621 	/*
5622 	 * The global block rsv is based on the size of the extent tree, the
5623 	 * checksum tree and the root tree.  If the fs is empty we want to set
5624 	 * it to a minimal amount for safety.
5625 	 */
5626 	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5627 		btrfs_root_used(&fs_info->csum_root->root_item) +
5628 		btrfs_root_used(&fs_info->tree_root->root_item);
5629 	num_bytes = max_t(u64, num_bytes, SZ_16M);
5630 
5631 	spin_lock(&sinfo->lock);
5632 	spin_lock(&block_rsv->lock);
5633 
5634 	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5635 
5636 	if (block_rsv->reserved < block_rsv->size) {
5637 		num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5638 			sinfo->bytes_reserved + sinfo->bytes_readonly +
5639 			sinfo->bytes_may_use;
5640 		if (sinfo->total_bytes > num_bytes) {
5641 			num_bytes = sinfo->total_bytes - num_bytes;
5642 			num_bytes = min(num_bytes,
5643 					block_rsv->size - block_rsv->reserved);
5644 			block_rsv->reserved += num_bytes;
5645 			sinfo->bytes_may_use += num_bytes;
5646 			trace_btrfs_space_reservation(fs_info, "space_info",
5647 						      sinfo->flags, num_bytes,
5648 						      1);
5649 		}
5650 	} else if (block_rsv->reserved > block_rsv->size) {
5651 		num_bytes = block_rsv->reserved - block_rsv->size;
5652 		sinfo->bytes_may_use -= num_bytes;
5653 		trace_btrfs_space_reservation(fs_info, "space_info",
5654 				      sinfo->flags, num_bytes, 0);
5655 		block_rsv->reserved = block_rsv->size;
5656 	}
5657 
5658 	if (block_rsv->reserved == block_rsv->size)
5659 		block_rsv->full = 1;
5660 	else
5661 		block_rsv->full = 0;
5662 
5663 	spin_unlock(&block_rsv->lock);
5664 	spin_unlock(&sinfo->lock);
5665 }
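
/*
 * Worked example for the sizing above: if the extent, csum and root trees
 * currently account for 40MiB of used metadata, the global rsv target becomes
 * min(max(40MiB, 16MiB), 512MiB) = 40MiB; a nearly empty filesystem is
 * clamped up to the 16MiB floor and a very large one down to the 512MiB cap.
 */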
5666 
5667 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5668 {
5669 	struct btrfs_space_info *space_info;
5670 
5671 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5672 	fs_info->chunk_block_rsv.space_info = space_info;
5673 
5674 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5675 	fs_info->global_block_rsv.space_info = space_info;
5676 	fs_info->delalloc_block_rsv.space_info = space_info;
5677 	fs_info->trans_block_rsv.space_info = space_info;
5678 	fs_info->empty_block_rsv.space_info = space_info;
5679 	fs_info->delayed_block_rsv.space_info = space_info;
5680 
5681 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5682 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5683 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5684 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5685 	if (fs_info->quota_root)
5686 		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5687 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5688 
5689 	update_global_block_rsv(fs_info);
5690 }
5691 
5692 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5693 {
5694 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5695 				(u64)-1);
5696 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5697 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5698 	WARN_ON(fs_info->trans_block_rsv.size > 0);
5699 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5700 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
5701 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5702 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
5703 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5704 }
5705 
5706 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5707 				  struct btrfs_root *root)
5708 {
5709 	if (!trans->block_rsv)
5710 		return;
5711 
5712 	if (!trans->bytes_reserved)
5713 		return;
5714 
5715 	trace_btrfs_space_reservation(root->fs_info, "transaction",
5716 				      trans->transid, trans->bytes_reserved, 0);
5717 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5718 	trans->bytes_reserved = 0;
5719 }
5720 
5721 /*
5722  * To be called after all the new block groups attached to the transaction
5723  * handle have been created (btrfs_create_pending_block_groups()).
5724  */
5725 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5726 {
5727 	struct btrfs_fs_info *fs_info = trans->fs_info;
5728 
5729 	if (!trans->chunk_bytes_reserved)
5730 		return;
5731 
5732 	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5733 
5734 	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5735 				trans->chunk_bytes_reserved);
5736 	trans->chunk_bytes_reserved = 0;
5737 }
5738 
5739 /* Can only return 0 or -ENOSPC */
5740 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5741 				  struct inode *inode)
5742 {
5743 	struct btrfs_root *root = BTRFS_I(inode)->root;
5744 	/*
5745 	 * We always use trans->block_rsv here as we will have reserved space
5746 	 * for our orphan when starting the transaction; using get_block_rsv()
5747 	 * here will sometimes make us choose the wrong block rsv, as we could
5748 	 * be doing a reloc inode for a non-refcounted root.
5749 	 */
5750 	struct btrfs_block_rsv *src_rsv = trans->block_rsv;
5751 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5752 
5753 	/*
5754 	 * We need to hold space in order to delete our orphan item once we've
5755 	 * added it, so this takes the reservation so we can release it later
5756 	 * when we are truly done with the orphan item.
5757 	 */
5758 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5759 	trace_btrfs_space_reservation(root->fs_info, "orphan",
5760 				      btrfs_ino(inode), num_bytes, 1);
5761 	return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
5762 }
5763 
5764 void btrfs_orphan_release_metadata(struct inode *inode)
5765 {
5766 	struct btrfs_root *root = BTRFS_I(inode)->root;
5767 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5768 	trace_btrfs_space_reservation(root->fs_info, "orphan",
5769 				      btrfs_ino(inode), num_bytes, 0);
5770 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5771 }
5772 
5773 /*
5774  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5775  * @root: the root of the parent directory
5776  * @rsv: block reservation
5777  * @items: the number of items that we need to reserve
5778  * @qgroup_reserved: used to return the reserved size in qgroup
5779  *
5780  * This function is used to reserve the space for snapshot/subvolume
5781  * creation and deletion.  Those operations are different from the
5782  * common file/directory operations: they change two fs/file trees
5783  * and the root tree, and the number of items that the qgroup reserves
5784  * differs from the free space reservation.  So we cannot use the
5785  * space reservation mechanism in start_transaction().
5786  */
5787 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5788 				     struct btrfs_block_rsv *rsv,
5789 				     int items,
5790 				     u64 *qgroup_reserved,
5791 				     bool use_global_rsv)
5792 {
5793 	u64 num_bytes;
5794 	int ret;
5795 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5796 
5797 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
5798 		/* One for parent inode, two for dir entries */
5799 		num_bytes = 3 * root->nodesize;
5800 		ret = btrfs_qgroup_reserve_meta(root, num_bytes);
5801 		if (ret)
5802 			return ret;
5803 	} else {
5804 		num_bytes = 0;
5805 	}
5806 
5807 	*qgroup_reserved = num_bytes;
5808 
5809 	num_bytes = btrfs_calc_trans_metadata_size(root, items);
5810 	rsv->space_info = __find_space_info(root->fs_info,
5811 					    BTRFS_BLOCK_GROUP_METADATA);
5812 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5813 				  BTRFS_RESERVE_FLUSH_ALL);
5814 
5815 	if (ret == -ENOSPC && use_global_rsv)
5816 		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5817 
5818 	if (ret && *qgroup_reserved)
5819 		btrfs_qgroup_free_meta(root, *qgroup_reserved);
5820 
5821 	return ret;
5822 }
5823 
5824 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5825 				      struct btrfs_block_rsv *rsv,
5826 				      u64 qgroup_reserved)
5827 {
5828 	btrfs_block_rsv_release(root, rsv, (u64)-1);
5829 }
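
/*
 * Sketch of the expected pairing of the two helpers above (local names are
 * illustrative and BTRFS_BLOCK_RSV_TEMP is just an assumed rsv type):
 *
 *	struct btrfs_block_rsv rsv;
 *	u64 qgroup_reserved;
 *	int ret;
 *
 *	btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);
 *	ret = btrfs_subvolume_reserve_metadata(root, &rsv, nr_items,
 *					       &qgroup_reserved, true);
 *	if (ret)
 *		return ret;
 *	... create or delete the snapshot/subvolume ...
 *	btrfs_subvolume_release_metadata(root, &rsv, qgroup_reserved);
 */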
5830 
5831 /**
5832  * drop_outstanding_extent - drop an outstanding extent
5833  * @inode: the inode we're dropping the extent for
5834  * @num_bytes: the number of bytes we're releasing.
5835  *
5836  * This is called when we are freeing up an outstanding extent, either
5837  * after an error or after an extent is written.  This will return the number of
5838  * reserved extents that need to be freed.  This must be called with
5839  * BTRFS_I(inode)->lock held.
5840  */
5841 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5842 {
5843 	unsigned drop_inode_space = 0;
5844 	unsigned dropped_extents = 0;
5845 	unsigned num_extents = 0;
5846 
5847 	num_extents = (unsigned)div64_u64(num_bytes +
5848 					  BTRFS_MAX_EXTENT_SIZE - 1,
5849 					  BTRFS_MAX_EXTENT_SIZE);
5850 	ASSERT(num_extents);
5851 	ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5852 	BTRFS_I(inode)->outstanding_extents -= num_extents;
5853 
5854 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
5855 	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5856 			       &BTRFS_I(inode)->runtime_flags))
5857 		drop_inode_space = 1;
5858 
5859 	/*
5860 	 * If we have at least as many outstanding extents as reserved extents
5861 	 * then we need to leave the reserved extents count alone.
5862 	 */
5863 	if (BTRFS_I(inode)->outstanding_extents >=
5864 	    BTRFS_I(inode)->reserved_extents)
5865 		return drop_inode_space;
5866 
5867 	dropped_extents = BTRFS_I(inode)->reserved_extents -
5868 		BTRFS_I(inode)->outstanding_extents;
5869 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
5870 	return dropped_extents + drop_inode_space;
5871 }
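
/*
 * Worked example for the math above: assuming BTRFS_MAX_EXTENT_SIZE is 128MiB,
 * dropping a 200MiB range gives
 *
 *	num_extents = div64_u64(200MiB + 128MiB - 1, 128MiB) = 2
 *
 * i.e. the div64_u64(x + max - 1, max) form is just a 64-bit ceiling
 * division, so outstanding_extents shrinks by two for that range.
 */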
5872 
5873 /**
5874  * calc_csum_metadata_size - return the amount of metadata space that must be
5875  *	reserved/freed for the given bytes.
5876  * @inode: the inode we're manipulating
5877  * @num_bytes: the number of bytes in question
5878  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5879  *
5880  * This adjusts the number of csum_bytes in the inode and then returns the
5881  * correct amount of metadata that must either be reserved or freed.  We
5882  * calculate how many checksums we can fit into one leaf and then divide the
5883  * number of bytes that will need to be checksummed by this value to figure out
5884  * how many checksums will be required.  If we are adding bytes then the number
5885  * may go up and we will return the number of additional bytes that must be
5886  * reserved.  If it is going down we will return the number of bytes that must
5887  * be freed.
5888  *
5889  * This must be called with BTRFS_I(inode)->lock held.
5890  */
5891 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5892 				   int reserve)
5893 {
5894 	struct btrfs_root *root = BTRFS_I(inode)->root;
5895 	u64 old_csums, num_csums;
5896 
5897 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5898 	    BTRFS_I(inode)->csum_bytes == 0)
5899 		return 0;
5900 
5901 	old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5902 	if (reserve)
5903 		BTRFS_I(inode)->csum_bytes += num_bytes;
5904 	else
5905 		BTRFS_I(inode)->csum_bytes -= num_bytes;
5906 	num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5907 
5908 	/* No change, no need to reserve more */
5909 	if (old_csums == num_csums)
5910 		return 0;
5911 
5912 	if (reserve)
5913 		return btrfs_calc_trans_metadata_size(root,
5914 						      num_csums - old_csums);
5915 
5916 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5917 }
5918 
5919 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5920 {
5921 	struct btrfs_root *root = BTRFS_I(inode)->root;
5922 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5923 	u64 to_reserve = 0;
5924 	u64 csum_bytes;
5925 	unsigned nr_extents = 0;
5926 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5927 	int ret = 0;
5928 	bool delalloc_lock = true;
5929 	u64 to_free = 0;
5930 	unsigned dropped;
5931 	bool release_extra = false;
5932 
5933 	/* If we are a free space inode we must not flush, since we will be in
5934 	 * the middle of a transaction commit.  We also don't need the delalloc
5935 	 * mutex since we won't race with anybody.  We need this mostly to make
5936 	 * lockdep shut its filthy mouth.
5937 	 *
5938 	 * If we have a transaction open (can happen if we call truncate_block
5939 	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5940 	 */
5941 	if (btrfs_is_free_space_inode(inode)) {
5942 		flush = BTRFS_RESERVE_NO_FLUSH;
5943 		delalloc_lock = false;
5944 	} else if (current->journal_info) {
5945 		flush = BTRFS_RESERVE_FLUSH_LIMIT;
5946 	}
5947 
5948 	if (flush != BTRFS_RESERVE_NO_FLUSH &&
5949 	    btrfs_transaction_in_commit(root->fs_info))
5950 		schedule_timeout(1);
5951 
5952 	if (delalloc_lock)
5953 		mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5954 
5955 	num_bytes = ALIGN(num_bytes, root->sectorsize);
5956 
5957 	spin_lock(&BTRFS_I(inode)->lock);
5958 	nr_extents = (unsigned)div64_u64(num_bytes +
5959 					 BTRFS_MAX_EXTENT_SIZE - 1,
5960 					 BTRFS_MAX_EXTENT_SIZE);
5961 	BTRFS_I(inode)->outstanding_extents += nr_extents;
5962 
5963 	nr_extents = 0;
5964 	if (BTRFS_I(inode)->outstanding_extents >
5965 	    BTRFS_I(inode)->reserved_extents)
5966 		nr_extents += BTRFS_I(inode)->outstanding_extents -
5967 			BTRFS_I(inode)->reserved_extents;
5968 
5969 	/* We always want to reserve a slot for updating the inode. */
5970 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
5971 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5972 	csum_bytes = BTRFS_I(inode)->csum_bytes;
5973 	spin_unlock(&BTRFS_I(inode)->lock);
5974 
5975 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
5976 		ret = btrfs_qgroup_reserve_meta(root,
5977 				nr_extents * root->nodesize);
5978 		if (ret)
5979 			goto out_fail;
5980 	}
5981 
5982 	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
5983 	if (unlikely(ret)) {
5984 		btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
5985 		goto out_fail;
5986 	}
5987 
5988 	spin_lock(&BTRFS_I(inode)->lock);
5989 	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5990 			     &BTRFS_I(inode)->runtime_flags)) {
5991 		to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
5992 		release_extra = true;
5993 	}
5994 	BTRFS_I(inode)->reserved_extents += nr_extents;
5995 	spin_unlock(&BTRFS_I(inode)->lock);
5996 
5997 	if (delalloc_lock)
5998 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5999 
6000 	if (to_reserve)
6001 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
6002 					      btrfs_ino(inode), to_reserve, 1);
6003 	if (release_extra)
6004 		btrfs_block_rsv_release(root, block_rsv,
6005 					btrfs_calc_trans_metadata_size(root,
6006 								       1));
6007 	return 0;
6008 
6009 out_fail:
6010 	spin_lock(&BTRFS_I(inode)->lock);
6011 	dropped = drop_outstanding_extent(inode, num_bytes);
6012 	/*
6013 	 * If the inode's csum_bytes is the same as the original
6014 	 * csum_bytes then we know we haven't raced with any free()ers
6015 	 * so we can just reduce our inode's csum bytes and carry on.
6016 	 */
6017 	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
6018 		calc_csum_metadata_size(inode, num_bytes, 0);
6019 	} else {
6020 		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
6021 		u64 bytes;
6022 
6023 		/*
6024 		 * This is tricky, but first we need to figure out how much we
6025 		 * freed from any free()ers that occurred during this
6026 		 * reservation, so we reset ->csum_bytes to the csum_bytes
6027 		 * before we dropped our lock, and then call the free for the
6028 		 * number of bytes that were freed while we were trying our
6029 		 * reservation.
6030 		 */
6031 		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
6032 		BTRFS_I(inode)->csum_bytes = csum_bytes;
6033 		to_free = calc_csum_metadata_size(inode, bytes, 0);
6034 
6035 
6036 		/*
6037 		 * Now we need to see how much we would have freed had we not
6038 		 * been making this reservation and our ->csum_bytes were not
6039 		 * artificially inflated.
6040 		 */
6041 		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
6042 		bytes = csum_bytes - orig_csum_bytes;
6043 		bytes = calc_csum_metadata_size(inode, bytes, 0);
6044 
6045 		/*
6046 		 * Now reset ->csum_bytes to what it should be.  If bytes is
6047 		 * more than to_free then we would have freed more space had we
6048 		 * not had an artificially high ->csum_bytes, so we need to free
6049 		 * the remainder.  If bytes is the same or less then we don't
6050 		 * need to do anything, the other free-ers did the correct
6051 		 * thing.
6052 		 */
6053 		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
6054 		if (bytes > to_free)
6055 			to_free = bytes - to_free;
6056 		else
6057 			to_free = 0;
6058 	}
6059 	spin_unlock(&BTRFS_I(inode)->lock);
6060 	if (dropped)
6061 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
6062 
6063 	if (to_free) {
6064 		btrfs_block_rsv_release(root, block_rsv, to_free);
6065 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
6066 					      btrfs_ino(inode), to_free, 0);
6067 	}
6068 	if (delalloc_lock)
6069 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
6070 	return ret;
6071 }
6072 
6073 /**
6074  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6075  * @inode: the inode to release the reservation for
6076  * @num_bytes: the number of bytes we're releasing
6077  *
6078  * This will release the metadata reservation for an inode.  This can be called
6079  * once we complete IO for a given set of bytes to release their metadata
6080  * reservations.
6081  */
6082 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
6083 {
6084 	struct btrfs_root *root = BTRFS_I(inode)->root;
6085 	u64 to_free = 0;
6086 	unsigned dropped;
6087 
6088 	num_bytes = ALIGN(num_bytes, root->sectorsize);
6089 	spin_lock(&BTRFS_I(inode)->lock);
6090 	dropped = drop_outstanding_extent(inode, num_bytes);
6091 
6092 	if (num_bytes)
6093 		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6094 	spin_unlock(&BTRFS_I(inode)->lock);
6095 	if (dropped > 0)
6096 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
6097 
6098 	if (btrfs_is_testing(root->fs_info))
6099 		return;
6100 
6101 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
6102 				      btrfs_ino(inode), to_free, 0);
6103 
6104 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
6105 				to_free);
6106 }
6107 
6108 /**
6109  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6110  * delalloc
6111  * @inode: inode we're writing to
6112  * @start: start range we are writing to
6113  * @len: the length of the range we are writing to
6114  *
6115  * This will do the following things
6116  *
6117  * o reserve space in data space info for num bytes
6118  *   and reserve precious corresponding qgroup space
6119  *   (Done in check_data_free_space)
6120  *
6121  * o reserve space for metadata space, based on the number of outstanding
6122  *   extents and how many csums will be needed
6123  *   also reserve metadata space in a per root over-reserve method.
6124  * o add to the inodes->delalloc_bytes
6125  * o add it to the fs_info's delalloc inodes list.
6126  *   (Above 3 all done in delalloc_reserve_metadata)
6127  *
6128  * Return 0 for success
6129  * Return <0 for error (-ENOSPC or -EDQUOT)
6130  */
6131 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
6132 {
6133 	int ret;
6134 
6135 	ret = btrfs_check_data_free_space(inode, start, len);
6136 	if (ret < 0)
6137 		return ret;
6138 	ret = btrfs_delalloc_reserve_metadata(inode, len);
6139 	if (ret < 0)
6140 		btrfs_free_reserved_data_space(inode, start, len);
6141 	return ret;
6142 }
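
/*
 * Sketch of the expected pairing for a buffered write over
 * [start, start + len), per the comments above and below:
 *
 *	ret = btrfs_delalloc_reserve_space(inode, start, len);
 *	if (ret)
 *		return ret;
 *	ret = ... dirty the pages / set up the delalloc range ...
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, start, len);
 *
 * Once the delalloc range has been created, the data reservation is consumed
 * by the write and only the metadata part is returned later (after IO) via
 * btrfs_delalloc_release_metadata().
 */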
6143 
6144 /**
6145  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6146  * @inode: inode we're releasing space for
6147  * @start: start position of the space already reserved
6148  * @len: the len of the space already reserved
6149  *
6150  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
6151  * called in the case that we don't need the metadata AND data reservations
6152  * anymore, for example if there is an error or we insert an inline extent.
6153  *
6154  * This function will release the metadata space that was not used and will
6155  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6156  * list if there are no delalloc bytes left.
6157  * Also it will handle the qgroup reserved space.
6158  */
6159 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
6160 {
6161 	btrfs_delalloc_release_metadata(inode, len);
6162 	btrfs_free_reserved_data_space(inode, start, len);
6163 }
6164 
6165 static int update_block_group(struct btrfs_trans_handle *trans,
6166 			      struct btrfs_root *root, u64 bytenr,
6167 			      u64 num_bytes, int alloc)
6168 {
6169 	struct btrfs_block_group_cache *cache = NULL;
6170 	struct btrfs_fs_info *info = root->fs_info;
6171 	u64 total = num_bytes;
6172 	u64 old_val;
6173 	u64 byte_in_group;
6174 	int factor;
6175 
6176 	/* block accounting for super block */
6177 	spin_lock(&info->delalloc_root_lock);
6178 	old_val = btrfs_super_bytes_used(info->super_copy);
6179 	if (alloc)
6180 		old_val += num_bytes;
6181 	else
6182 		old_val -= num_bytes;
6183 	btrfs_set_super_bytes_used(info->super_copy, old_val);
6184 	spin_unlock(&info->delalloc_root_lock);
6185 
6186 	while (total) {
6187 		cache = btrfs_lookup_block_group(info, bytenr);
6188 		if (!cache)
6189 			return -ENOENT;
6190 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6191 				    BTRFS_BLOCK_GROUP_RAID1 |
6192 				    BTRFS_BLOCK_GROUP_RAID10))
6193 			factor = 2;
6194 		else
6195 			factor = 1;
6196 		/*
6197 		 * If this block group has free space cache written out, we
6198 		 * need to make sure to load it if we are removing space.  This
6199 		 * is because we need the unpinning stage to actually add the
6200 		 * space back to the block group, otherwise we will leak space.
6201 		 */
6202 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
6203 			cache_block_group(cache, 1);
6204 
6205 		byte_in_group = bytenr - cache->key.objectid;
6206 		WARN_ON(byte_in_group > cache->key.offset);
6207 
6208 		spin_lock(&cache->space_info->lock);
6209 		spin_lock(&cache->lock);
6210 
6211 		if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
6212 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
6213 			cache->disk_cache_state = BTRFS_DC_CLEAR;
6214 
6215 		old_val = btrfs_block_group_used(&cache->item);
6216 		num_bytes = min(total, cache->key.offset - byte_in_group);
6217 		if (alloc) {
6218 			old_val += num_bytes;
6219 			btrfs_set_block_group_used(&cache->item, old_val);
6220 			cache->reserved -= num_bytes;
6221 			cache->space_info->bytes_reserved -= num_bytes;
6222 			cache->space_info->bytes_used += num_bytes;
6223 			cache->space_info->disk_used += num_bytes * factor;
6224 			spin_unlock(&cache->lock);
6225 			spin_unlock(&cache->space_info->lock);
6226 		} else {
6227 			old_val -= num_bytes;
6228 			btrfs_set_block_group_used(&cache->item, old_val);
6229 			cache->pinned += num_bytes;
6230 			cache->space_info->bytes_pinned += num_bytes;
6231 			cache->space_info->bytes_used -= num_bytes;
6232 			cache->space_info->disk_used -= num_bytes * factor;
6233 			spin_unlock(&cache->lock);
6234 			spin_unlock(&cache->space_info->lock);
6235 
6236 			trace_btrfs_space_reservation(root->fs_info, "pinned",
6237 						      cache->space_info->flags,
6238 						      num_bytes, 1);
6239 			set_extent_dirty(info->pinned_extents,
6240 					 bytenr, bytenr + num_bytes - 1,
6241 					 GFP_NOFS | __GFP_NOFAIL);
6242 		}
6243 
6244 		spin_lock(&trans->transaction->dirty_bgs_lock);
6245 		if (list_empty(&cache->dirty_list)) {
6246 			list_add_tail(&cache->dirty_list,
6247 				      &trans->transaction->dirty_bgs);
6248 			trans->transaction->num_dirty_bgs++;
6249 			btrfs_get_block_group(cache);
6250 		}
6251 		spin_unlock(&trans->transaction->dirty_bgs_lock);
6252 
6253 		/*
6254 		 * No longer have used bytes in this block group, queue it for
6255 		 * deletion. We do this after adding the block group to the
6256 		 * dirty list to avoid races between cleaner kthread and space
6257 		 * cache writeout.
6258 		 */
6259 		if (!alloc && old_val == 0) {
6260 			spin_lock(&info->unused_bgs_lock);
6261 			if (list_empty(&cache->bg_list)) {
6262 				btrfs_get_block_group(cache);
6263 				list_add_tail(&cache->bg_list,
6264 					      &info->unused_bgs);
6265 			}
6266 			spin_unlock(&info->unused_bgs_lock);
6267 		}
6268 
6269 		btrfs_put_block_group(cache);
6270 		total -= num_bytes;
6271 		bytenr += num_bytes;
6272 	}
6273 	return 0;
6274 }
6275 
6276 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
6277 {
6278 	struct btrfs_block_group_cache *cache;
6279 	u64 bytenr;
6280 
6281 	spin_lock(&root->fs_info->block_group_cache_lock);
6282 	bytenr = root->fs_info->first_logical_byte;
6283 	spin_unlock(&root->fs_info->block_group_cache_lock);
6284 
6285 	if (bytenr < (u64)-1)
6286 		return bytenr;
6287 
6288 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
6289 	if (!cache)
6290 		return 0;
6291 
6292 	bytenr = cache->key.objectid;
6293 	btrfs_put_block_group(cache);
6294 
6295 	return bytenr;
6296 }
6297 
6298 static int pin_down_extent(struct btrfs_root *root,
6299 			   struct btrfs_block_group_cache *cache,
6300 			   u64 bytenr, u64 num_bytes, int reserved)
6301 {
6302 	spin_lock(&cache->space_info->lock);
6303 	spin_lock(&cache->lock);
6304 	cache->pinned += num_bytes;
6305 	cache->space_info->bytes_pinned += num_bytes;
6306 	if (reserved) {
6307 		cache->reserved -= num_bytes;
6308 		cache->space_info->bytes_reserved -= num_bytes;
6309 	}
6310 	spin_unlock(&cache->lock);
6311 	spin_unlock(&cache->space_info->lock);
6312 
6313 	trace_btrfs_space_reservation(root->fs_info, "pinned",
6314 				      cache->space_info->flags, num_bytes, 1);
6315 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
6316 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6317 	return 0;
6318 }
6319 
6320 /*
6321  * this function must be called within transaction
6322  */
6323 int btrfs_pin_extent(struct btrfs_root *root,
6324 		     u64 bytenr, u64 num_bytes, int reserved)
6325 {
6326 	struct btrfs_block_group_cache *cache;
6327 
6328 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6329 	BUG_ON(!cache); /* Logic error */
6330 
6331 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
6332 
6333 	btrfs_put_block_group(cache);
6334 	return 0;
6335 }
6336 
6337 /*
6338  * this function must be called within transaction
6339  */
6340 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
6341 				    u64 bytenr, u64 num_bytes)
6342 {
6343 	struct btrfs_block_group_cache *cache;
6344 	int ret;
6345 
6346 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6347 	if (!cache)
6348 		return -EINVAL;
6349 
6350 	/*
6351 	 * pull in the free space cache (if any) so that our pin
6352 	 * removes the free space from the cache.  We have load_only set
6353 	 * to one because the slow code to read in the free extents does check
6354 	 * the pinned extents.
6355 	 */
6356 	cache_block_group(cache, 1);
6357 
6358 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
6359 
6360 	/* remove us from the free space cache (if we're there at all) */
6361 	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6362 	btrfs_put_block_group(cache);
6363 	return ret;
6364 }
6365 
6366 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
6367 {
6368 	int ret;
6369 	struct btrfs_block_group_cache *block_group;
6370 	struct btrfs_caching_control *caching_ctl;
6371 
6372 	block_group = btrfs_lookup_block_group(root->fs_info, start);
6373 	if (!block_group)
6374 		return -EINVAL;
6375 
6376 	cache_block_group(block_group, 0);
6377 	caching_ctl = get_caching_control(block_group);
6378 
6379 	if (!caching_ctl) {
6380 		/* Logic error */
6381 		BUG_ON(!block_group_cache_done(block_group));
6382 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
6383 	} else {
6384 		mutex_lock(&caching_ctl->mutex);
6385 
6386 		if (start >= caching_ctl->progress) {
6387 			ret = add_excluded_extent(root, start, num_bytes);
6388 		} else if (start + num_bytes <= caching_ctl->progress) {
6389 			ret = btrfs_remove_free_space(block_group,
6390 						      start, num_bytes);
6391 		} else {
6392 			num_bytes = caching_ctl->progress - start;
6393 			ret = btrfs_remove_free_space(block_group,
6394 						      start, num_bytes);
6395 			if (ret)
6396 				goto out_lock;
6397 
6398 			num_bytes = (start + num_bytes) -
6399 				caching_ctl->progress;
6400 			start = caching_ctl->progress;
6401 			ret = add_excluded_extent(root, start, num_bytes);
6402 		}
6403 out_lock:
6404 		mutex_unlock(&caching_ctl->mutex);
6405 		put_caching_control(caching_ctl);
6406 	}
6407 	btrfs_put_block_group(block_group);
6408 	return ret;
6409 }
6410 
6411 int btrfs_exclude_logged_extents(struct btrfs_root *log,
6412 				 struct extent_buffer *eb)
6413 {
6414 	struct btrfs_file_extent_item *item;
6415 	struct btrfs_key key;
6416 	int found_type;
6417 	int i;
6418 
6419 	if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
6420 		return 0;
6421 
6422 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
6423 		btrfs_item_key_to_cpu(eb, &key, i);
6424 		if (key.type != BTRFS_EXTENT_DATA_KEY)
6425 			continue;
6426 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6427 		found_type = btrfs_file_extent_type(eb, item);
6428 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
6429 			continue;
6430 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6431 			continue;
6432 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6433 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6434 		__exclude_logged_extent(log, key.objectid, key.offset);
6435 	}
6436 
6437 	return 0;
6438 }
6439 
6440 static void
6441 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6442 {
6443 	atomic_inc(&bg->reservations);
6444 }
6445 
6446 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6447 					const u64 start)
6448 {
6449 	struct btrfs_block_group_cache *bg;
6450 
6451 	bg = btrfs_lookup_block_group(fs_info, start);
6452 	ASSERT(bg);
6453 	if (atomic_dec_and_test(&bg->reservations))
6454 		wake_up_atomic_t(&bg->reservations);
6455 	btrfs_put_block_group(bg);
6456 }
6457 
6458 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6459 {
6460 	schedule();
6461 	return 0;
6462 }
6463 
6464 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6465 {
6466 	struct btrfs_space_info *space_info = bg->space_info;
6467 
6468 	ASSERT(bg->ro);
6469 
6470 	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6471 		return;
6472 
6473 	/*
6474 	 * Our block group is read only but before we set it to read only,
6475 	 * some task might have allocated an extent from it already, but it
6476 	 * has not yet created a respective ordered extent (and added it to a
6477 	 * root's list of ordered extents).
6478 	 * Therefore wait for any task currently allocating extents, since the
6479 	 * block group's reservations counter is incremented while a read lock
6480 	 * on the groups' semaphore is held and decremented after releasing
6481 	 * the read access on that semaphore and creating the ordered extent.
6482 	 */
6483 	down_write(&space_info->groups_sem);
6484 	up_write(&space_info->groups_sem);
6485 
6486 	wait_on_atomic_t(&bg->reservations,
6487 			 btrfs_wait_bg_reservations_atomic_t,
6488 			 TASK_UNINTERRUPTIBLE);
6489 }
6490 
6491 /**
6492  * btrfs_add_reserved_bytes - update the block_group and space info counters
6493  * @cache:	The cache we are manipulating
6494  * @ram_bytes:  The number of bytes of file content, and will be same to
6495  *              @num_bytes except for the compress path.
6496  * @num_bytes:	The number of bytes in question
6497  * @delalloc:   The blocks are allocated for the delalloc write
6498  *
6499  * This is called by the allocator when it reserves space.  Metadata
6500  * reservations are accounted for here so we do the proper
6501  * ENOSPC accounting.  For data we handle the reservation through clearing the
6502  * delalloc bits in the io_tree.  We have to do this since we could end up
6503  * allocating less disk space for the amount of data we have reserved in the
6504  * case of compression.
6505  *
6506  * If this is a reservation and the block group has become read only we cannot
6507  * make the reservation and return -EAGAIN, otherwise this function always
6508  * succeeds.
6509  */
6510 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6511 				    u64 ram_bytes, u64 num_bytes, int delalloc)
6512 {
6513 	struct btrfs_space_info *space_info = cache->space_info;
6514 	int ret = 0;
6515 
6516 	spin_lock(&space_info->lock);
6517 	spin_lock(&cache->lock);
6518 	if (cache->ro) {
6519 		ret = -EAGAIN;
6520 	} else {
6521 		cache->reserved += num_bytes;
6522 		space_info->bytes_reserved += num_bytes;
6523 
6524 		trace_btrfs_space_reservation(cache->fs_info,
6525 				"space_info", space_info->flags,
6526 				ram_bytes, 0);
6527 		space_info->bytes_may_use -= ram_bytes;
6528 		if (delalloc)
6529 			cache->delalloc_bytes += num_bytes;
6530 	}
6531 	spin_unlock(&cache->lock);
6532 	spin_unlock(&space_info->lock);
6533 	return ret;
6534 }
6535 
6536 /**
6537  * btrfs_free_reserved_bytes - update the block_group and space info counters
6538  * @cache:      The cache we are manipulating
6539  * @num_bytes:  The number of bytes in question
6540  * @delalloc:   The blocks are allocated for the delalloc write
6541  *
6542  * This is called by somebody who is freeing space that was never actually used
6543  * on disk.  For example if you reserve some space for a new leaf in transaction
6544  * A and before transaction A commits you free that leaf, you call this with
6545  * A and before transaction A commits you free that leaf, you call this
6546  * to clear the reservation.
6547 
6548 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6549 				     u64 num_bytes, int delalloc)
6550 {
6551 	struct btrfs_space_info *space_info = cache->space_info;
6552 	int ret = 0;
6553 
6554 	spin_lock(&space_info->lock);
6555 	spin_lock(&cache->lock);
6556 	if (cache->ro)
6557 		space_info->bytes_readonly += num_bytes;
6558 	cache->reserved -= num_bytes;
6559 	space_info->bytes_reserved -= num_bytes;
6560 
6561 	if (delalloc)
6562 		cache->delalloc_bytes -= num_bytes;
6563 	spin_unlock(&cache->lock);
6564 	spin_unlock(&space_info->lock);
6565 	return ret;
6566 }
6567 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6568 				struct btrfs_root *root)
6569 {
6570 	struct btrfs_fs_info *fs_info = root->fs_info;
6571 	struct btrfs_caching_control *next;
6572 	struct btrfs_caching_control *caching_ctl;
6573 	struct btrfs_block_group_cache *cache;
6574 
6575 	down_write(&fs_info->commit_root_sem);
6576 
6577 	list_for_each_entry_safe(caching_ctl, next,
6578 				 &fs_info->caching_block_groups, list) {
6579 		cache = caching_ctl->block_group;
6580 		if (block_group_cache_done(cache)) {
6581 			cache->last_byte_to_unpin = (u64)-1;
6582 			list_del_init(&caching_ctl->list);
6583 			put_caching_control(caching_ctl);
6584 		} else {
6585 			cache->last_byte_to_unpin = caching_ctl->progress;
6586 		}
6587 	}
6588 
6589 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6590 		fs_info->pinned_extents = &fs_info->freed_extents[1];
6591 	else
6592 		fs_info->pinned_extents = &fs_info->freed_extents[0];
6593 
6594 	up_write(&fs_info->commit_root_sem);
6595 
6596 	update_global_block_rsv(fs_info);
6597 }
6598 
6599 /*
6600  * Returns the free cluster for the given space info and sets empty_cluster to
6601  * what it should be based on the mount options.
6602  */
6603 static struct btrfs_free_cluster *
6604 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
6605 		   u64 *empty_cluster)
6606 {
6607 	struct btrfs_free_cluster *ret = NULL;
6608 	bool ssd = btrfs_test_opt(root->fs_info, SSD);
6609 
6610 	*empty_cluster = 0;
6611 	if (btrfs_mixed_space_info(space_info))
6612 		return ret;
6613 
6614 	if (ssd)
6615 		*empty_cluster = SZ_2M;
6616 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6617 		ret = &root->fs_info->meta_alloc_cluster;
6618 		if (!ssd)
6619 			*empty_cluster = SZ_64K;
6620 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
6621 		ret = &root->fs_info->data_alloc_cluster;
6622 	}
6623 
6624 	return ret;
6625 }
6626 
6627 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6628 			      const bool return_free_space)
6629 {
6630 	struct btrfs_fs_info *fs_info = root->fs_info;
6631 	struct btrfs_block_group_cache *cache = NULL;
6632 	struct btrfs_space_info *space_info;
6633 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6634 	struct btrfs_free_cluster *cluster = NULL;
6635 	u64 len;
6636 	u64 total_unpinned = 0;
6637 	u64 empty_cluster = 0;
6638 	bool readonly;
6639 
6640 	while (start <= end) {
6641 		readonly = false;
6642 		if (!cache ||
6643 		    start >= cache->key.objectid + cache->key.offset) {
6644 			if (cache)
6645 				btrfs_put_block_group(cache);
6646 			total_unpinned = 0;
6647 			cache = btrfs_lookup_block_group(fs_info, start);
6648 			BUG_ON(!cache); /* Logic error */
6649 
6650 			cluster = fetch_cluster_info(root,
6651 						     cache->space_info,
6652 						     &empty_cluster);
6653 			empty_cluster <<= 1;
6654 		}
6655 
6656 		len = cache->key.objectid + cache->key.offset - start;
6657 		len = min(len, end + 1 - start);
6658 
6659 		if (start < cache->last_byte_to_unpin) {
6660 			len = min(len, cache->last_byte_to_unpin - start);
6661 			if (return_free_space)
6662 				btrfs_add_free_space(cache, start, len);
6663 		}
6664 
6665 		start += len;
6666 		total_unpinned += len;
6667 		space_info = cache->space_info;
6668 
6669 		/*
6670 		 * If this space cluster has been marked as fragmented and we've
6671 		 * unpinned enough in this block group to potentially allow a
6672 		 * cluster to be created inside of it, go ahead and clear the
6673 		 * fragmented flag.
6674 		 */
6675 		if (cluster && cluster->fragmented &&
6676 		    total_unpinned > empty_cluster) {
6677 			spin_lock(&cluster->lock);
6678 			cluster->fragmented = 0;
6679 			spin_unlock(&cluster->lock);
6680 		}
6681 
6682 		spin_lock(&space_info->lock);
6683 		spin_lock(&cache->lock);
6684 		cache->pinned -= len;
6685 		space_info->bytes_pinned -= len;
6686 
6687 		trace_btrfs_space_reservation(fs_info, "pinned",
6688 					      space_info->flags, len, 0);
6689 		space_info->max_extent_size = 0;
6690 		percpu_counter_add(&space_info->total_bytes_pinned, -len);
6691 		if (cache->ro) {
6692 			space_info->bytes_readonly += len;
6693 			readonly = true;
6694 		}
6695 		spin_unlock(&cache->lock);
6696 		if (!readonly && return_free_space &&
6697 		    global_rsv->space_info == space_info) {
6698 			u64 to_add = len;
6699 			WARN_ON(!return_free_space);
6700 			spin_lock(&global_rsv->lock);
6701 			if (!global_rsv->full) {
6702 				to_add = min(len, global_rsv->size -
6703 					     global_rsv->reserved);
6704 				global_rsv->reserved += to_add;
6705 				space_info->bytes_may_use += to_add;
6706 				if (global_rsv->reserved >= global_rsv->size)
6707 					global_rsv->full = 1;
6708 				trace_btrfs_space_reservation(fs_info,
6709 							      "space_info",
6710 							      space_info->flags,
6711 							      to_add, 1);
6712 				len -= to_add;
6713 			}
6714 			spin_unlock(&global_rsv->lock);
6715 			/* Add to any tickets we may have */
6716 			if (len)
6717 				space_info_add_new_bytes(fs_info, space_info,
6718 							 len);
6719 		}
6720 		spin_unlock(&space_info->lock);
6721 	}
6722 
6723 	if (cache)
6724 		btrfs_put_block_group(cache);
6725 	return 0;
6726 }
6727 
6728 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6729 			       struct btrfs_root *root)
6730 {
6731 	struct btrfs_fs_info *fs_info = root->fs_info;
6732 	struct btrfs_block_group_cache *block_group, *tmp;
6733 	struct list_head *deleted_bgs;
6734 	struct extent_io_tree *unpin;
6735 	u64 start;
6736 	u64 end;
6737 	int ret;
6738 
6739 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6740 		unpin = &fs_info->freed_extents[1];
6741 	else
6742 		unpin = &fs_info->freed_extents[0];
6743 
6744 	while (!trans->aborted) {
6745 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
6746 		ret = find_first_extent_bit(unpin, 0, &start, &end,
6747 					    EXTENT_DIRTY, NULL);
6748 		if (ret) {
6749 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6750 			break;
6751 		}
6752 
6753 		if (btrfs_test_opt(root->fs_info, DISCARD))
6754 			ret = btrfs_discard_extent(root, start,
6755 						   end + 1 - start, NULL);
6756 
6757 		clear_extent_dirty(unpin, start, end);
6758 		unpin_extent_range(root, start, end, true);
6759 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6760 		cond_resched();
6761 	}
6762 
6763 	/*
6764 	 * Transaction is finished.  We don't need the lock anymore.  We
6765 	 * do need to clean up the block groups in case of a transaction
6766 	 * abort.
6767 	 */
6768 	deleted_bgs = &trans->transaction->deleted_bgs;
6769 	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6770 		u64 trimmed = 0;
6771 
6772 		ret = -EROFS;
6773 		if (!trans->aborted)
6774 			ret = btrfs_discard_extent(root,
6775 						   block_group->key.objectid,
6776 						   block_group->key.offset,
6777 						   &trimmed);
6778 
6779 		list_del_init(&block_group->bg_list);
6780 		btrfs_put_block_group_trimming(block_group);
6781 		btrfs_put_block_group(block_group);
6782 
6783 		if (ret) {
6784 			const char *errstr = btrfs_decode_error(ret);
6785 			btrfs_warn(fs_info,
6786 				   "discard failed while removing blockgroup: errno=%d %s",
6787 				   ret, errstr);
6788 		}
6789 	}
6790 
6791 	return 0;
6792 }
6793 
6794 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6795 			     u64 owner, u64 root_objectid)
6796 {
6797 	struct btrfs_space_info *space_info;
6798 	u64 flags;
6799 
6800 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6801 		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6802 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
6803 		else
6804 			flags = BTRFS_BLOCK_GROUP_METADATA;
6805 	} else {
6806 		flags = BTRFS_BLOCK_GROUP_DATA;
6807 	}
6808 
6809 	space_info = __find_space_info(fs_info, flags);
6810 	BUG_ON(!space_info); /* Logic bug */
6811 	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6812 }
6813 
6814 
6815 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6816 				struct btrfs_root *root,
6817 				struct btrfs_delayed_ref_node *node, u64 parent,
6818 				u64 root_objectid, u64 owner_objectid,
6819 				u64 owner_offset, int refs_to_drop,
6820 				struct btrfs_delayed_extent_op *extent_op)
6821 {
6822 	struct btrfs_key key;
6823 	struct btrfs_path *path;
6824 	struct btrfs_fs_info *info = root->fs_info;
6825 	struct btrfs_root *extent_root = info->extent_root;
6826 	struct extent_buffer *leaf;
6827 	struct btrfs_extent_item *ei;
6828 	struct btrfs_extent_inline_ref *iref;
6829 	int ret;
6830 	int is_data;
6831 	int extent_slot = 0;
6832 	int found_extent = 0;
6833 	int num_to_del = 1;
6834 	u32 item_size;
6835 	u64 refs;
6836 	u64 bytenr = node->bytenr;
6837 	u64 num_bytes = node->num_bytes;
6838 	int last_ref = 0;
6839 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6840 						 SKINNY_METADATA);
6841 
6842 	path = btrfs_alloc_path();
6843 	if (!path)
6844 		return -ENOMEM;
6845 
6846 	path->reada = READA_FORWARD;
6847 	path->leave_spinning = 1;
6848 
6849 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6850 	BUG_ON(!is_data && refs_to_drop != 1);
6851 
6852 	if (is_data)
6853 		skinny_metadata = 0;
6854 
6855 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
6856 				    bytenr, num_bytes, parent,
6857 				    root_objectid, owner_objectid,
6858 				    owner_offset);
6859 	if (ret == 0) {
6860 		extent_slot = path->slots[0];
6861 		while (extent_slot >= 0) {
6862 			btrfs_item_key_to_cpu(path->nodes[0], &key,
6863 					      extent_slot);
6864 			if (key.objectid != bytenr)
6865 				break;
6866 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6867 			    key.offset == num_bytes) {
6868 				found_extent = 1;
6869 				break;
6870 			}
6871 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
6872 			    key.offset == owner_objectid) {
6873 				found_extent = 1;
6874 				break;
6875 			}
6876 			if (path->slots[0] - extent_slot > 5)
6877 				break;
6878 			extent_slot--;
6879 		}
6880 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6881 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6882 		if (found_extent && item_size < sizeof(*ei))
6883 			found_extent = 0;
6884 #endif
6885 		if (!found_extent) {
6886 			BUG_ON(iref);
6887 			ret = remove_extent_backref(trans, extent_root, path,
6888 						    NULL, refs_to_drop,
6889 						    is_data, &last_ref);
6890 			if (ret) {
6891 				btrfs_abort_transaction(trans, ret);
6892 				goto out;
6893 			}
6894 			btrfs_release_path(path);
6895 			path->leave_spinning = 1;
6896 
6897 			key.objectid = bytenr;
6898 			key.type = BTRFS_EXTENT_ITEM_KEY;
6899 			key.offset = num_bytes;
6900 
6901 			if (!is_data && skinny_metadata) {
6902 				key.type = BTRFS_METADATA_ITEM_KEY;
6903 				key.offset = owner_objectid;
6904 			}
6905 
6906 			ret = btrfs_search_slot(trans, extent_root,
6907 						&key, path, -1, 1);
6908 			if (ret > 0 && skinny_metadata && path->slots[0]) {
6909 				/*
6910 				 * Couldn't find our skinny metadata item,
6911 				 * see if we have ye olde extent item.
6912 				 */
6913 				path->slots[0]--;
6914 				btrfs_item_key_to_cpu(path->nodes[0], &key,
6915 						      path->slots[0]);
6916 				if (key.objectid == bytenr &&
6917 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
6918 				    key.offset == num_bytes)
6919 					ret = 0;
6920 			}
6921 
6922 			if (ret > 0 && skinny_metadata) {
6923 				skinny_metadata = false;
6924 				key.objectid = bytenr;
6925 				key.type = BTRFS_EXTENT_ITEM_KEY;
6926 				key.offset = num_bytes;
6927 				btrfs_release_path(path);
6928 				ret = btrfs_search_slot(trans, extent_root,
6929 							&key, path, -1, 1);
6930 			}
6931 
6932 			if (ret) {
6933 				btrfs_err(info,
6934 					  "umm, got %d back from search, was looking for %llu",
6935 					  ret, bytenr);
6936 				if (ret > 0)
6937 					btrfs_print_leaf(extent_root,
6938 							 path->nodes[0]);
6939 			}
6940 			if (ret < 0) {
6941 				btrfs_abort_transaction(trans, ret);
6942 				goto out;
6943 			}
6944 			extent_slot = path->slots[0];
6945 		}
6946 	} else if (WARN_ON(ret == -ENOENT)) {
6947 		btrfs_print_leaf(extent_root, path->nodes[0]);
6948 		btrfs_err(info,
6949 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6950 			bytenr, parent, root_objectid, owner_objectid,
6951 			owner_offset);
6952 		btrfs_abort_transaction(trans, ret);
6953 		goto out;
6954 	} else {
6955 		btrfs_abort_transaction(trans, ret);
6956 		goto out;
6957 	}
6958 
6959 	leaf = path->nodes[0];
6960 	item_size = btrfs_item_size_nr(leaf, extent_slot);
6961 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6962 	if (item_size < sizeof(*ei)) {
6963 		BUG_ON(found_extent || extent_slot != path->slots[0]);
6964 		ret = convert_extent_item_v0(trans, extent_root, path,
6965 					     owner_objectid, 0);
6966 		if (ret < 0) {
6967 			btrfs_abort_transaction(trans, ret);
6968 			goto out;
6969 		}
6970 
6971 		btrfs_release_path(path);
6972 		path->leave_spinning = 1;
6973 
6974 		key.objectid = bytenr;
6975 		key.type = BTRFS_EXTENT_ITEM_KEY;
6976 		key.offset = num_bytes;
6977 
6978 		ret = btrfs_search_slot(trans, extent_root, &key, path,
6979 					-1, 1);
6980 		if (ret) {
6981 			btrfs_err(info,
6982 				  "umm, got %d back from search, was looking for %llu",
6983 				ret, bytenr);
6984 			btrfs_print_leaf(extent_root, path->nodes[0]);
6985 		}
6986 		if (ret < 0) {
6987 			btrfs_abort_transaction(trans, ret);
6988 			goto out;
6989 		}
6990 
6991 		extent_slot = path->slots[0];
6992 		leaf = path->nodes[0];
6993 		item_size = btrfs_item_size_nr(leaf, extent_slot);
6994 	}
6995 #endif
6996 	BUG_ON(item_size < sizeof(*ei));
6997 	ei = btrfs_item_ptr(leaf, extent_slot,
6998 			    struct btrfs_extent_item);
6999 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7000 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
7001 		struct btrfs_tree_block_info *bi;
7002 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7003 		bi = (struct btrfs_tree_block_info *)(ei + 1);
7004 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7005 	}
7006 
7007 	refs = btrfs_extent_refs(leaf, ei);
7008 	if (refs < refs_to_drop) {
7009 		btrfs_err(info,
7010 			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7011 			  refs_to_drop, refs, bytenr);
7012 		ret = -EINVAL;
7013 		btrfs_abort_transaction(trans, ret);
7014 		goto out;
7015 	}
7016 	refs -= refs_to_drop;
7017 
7018 	if (refs > 0) {
7019 		if (extent_op)
7020 			__run_delayed_extent_op(extent_op, leaf, ei);
7021 		/*
7022 		 * In the case of an inline back ref, the reference count will
7023 		 * be updated by remove_extent_backref
7024 		 */
7025 		if (iref) {
7026 			BUG_ON(!found_extent);
7027 		} else {
7028 			btrfs_set_extent_refs(leaf, ei, refs);
7029 			btrfs_mark_buffer_dirty(leaf);
7030 		}
7031 		if (found_extent) {
7032 			ret = remove_extent_backref(trans, extent_root, path,
7033 						    iref, refs_to_drop,
7034 						    is_data, &last_ref);
7035 			if (ret) {
7036 				btrfs_abort_transaction(trans, ret);
7037 				goto out;
7038 			}
7039 		}
7040 		add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
7041 				 root_objectid);
7042 	} else {
7043 		if (found_extent) {
7044 			BUG_ON(is_data && refs_to_drop !=
7045 			       extent_data_ref_count(path, iref));
7046 			if (iref) {
7047 				BUG_ON(path->slots[0] != extent_slot);
7048 			} else {
7049 				BUG_ON(path->slots[0] != extent_slot + 1);
7050 				path->slots[0] = extent_slot;
7051 				num_to_del = 2;
7052 			}
7053 		}
7054 
7055 		last_ref = 1;
7056 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7057 				      num_to_del);
7058 		if (ret) {
7059 			btrfs_abort_transaction(trans, ret);
7060 			goto out;
7061 		}
7062 		btrfs_release_path(path);
7063 
7064 		if (is_data) {
7065 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
7066 			if (ret) {
7067 				btrfs_abort_transaction(trans, ret);
7068 				goto out;
7069 			}
7070 		}
7071 
7072 		ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
7073 					     num_bytes);
7074 		if (ret) {
7075 			btrfs_abort_transaction(trans, ret);
7076 			goto out;
7077 		}
7078 
7079 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
7080 		if (ret) {
7081 			btrfs_abort_transaction(trans, ret);
7082 			goto out;
7083 		}
7084 	}
7085 	btrfs_release_path(path);
7086 
7087 out:
7088 	btrfs_free_path(path);
7089 	return ret;
7090 }
7091 
7092 /*
7093  * when we free a block, it is possible (and likely) that we free the last
7094  * delayed ref for that extent as well.  This searches the delayed ref tree for
7095  * a given extent, and if there are no other delayed refs to be processed, it
7096  * removes it from the tree.
7097  */
7098 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7099 				      struct btrfs_root *root, u64 bytenr)
7100 {
7101 	struct btrfs_delayed_ref_head *head;
7102 	struct btrfs_delayed_ref_root *delayed_refs;
7103 	int ret = 0;
7104 
7105 	delayed_refs = &trans->transaction->delayed_refs;
7106 	spin_lock(&delayed_refs->lock);
7107 	head = btrfs_find_delayed_ref_head(trans, bytenr);
7108 	if (!head)
7109 		goto out_delayed_unlock;
7110 
7111 	spin_lock(&head->lock);
7112 	if (!list_empty(&head->ref_list))
7113 		goto out;
7114 
7115 	if (head->extent_op) {
7116 		if (!head->must_insert_reserved)
7117 			goto out;
7118 		btrfs_free_delayed_extent_op(head->extent_op);
7119 		head->extent_op = NULL;
7120 	}
7121 
7122 	/*
7123 	 * waiting for the lock here would deadlock.  If someone else has it
7124 	 * locked, they are already in the process of dropping it anyway
7125 	 */
7126 	if (!mutex_trylock(&head->mutex))
7127 		goto out;
7128 
7129 	/*
7130 	 * at this point we have a head with no other entries.  Go
7131 	 * ahead and process it.
7132 	 */
7133 	head->node.in_tree = 0;
7134 	rb_erase(&head->href_node, &delayed_refs->href_root);
7135 
7136 	atomic_dec(&delayed_refs->num_entries);
7137 
7138 	/*
7139 	 * we don't take a ref on the node because we're removing it from the
7140 	 * tree, so we just steal the ref the tree was holding.
7141 	 */
7142 	delayed_refs->num_heads--;
7143 	if (head->processing == 0)
7144 		delayed_refs->num_heads_ready--;
7145 	head->processing = 0;
7146 	spin_unlock(&head->lock);
7147 	spin_unlock(&delayed_refs->lock);
7148 
7149 	BUG_ON(head->extent_op);
7150 	if (head->must_insert_reserved)
7151 		ret = 1;
7152 
7153 	mutex_unlock(&head->mutex);
7154 	btrfs_put_delayed_ref(&head->node);
7155 	return ret;
7156 out:
7157 	spin_unlock(&head->lock);
7158 
7159 out_delayed_unlock:
7160 	spin_unlock(&delayed_refs->lock);
7161 	return 0;
7162 }
7163 
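/*
 * Free a tree block.  For non-log trees a delayed ref drop is queued.  If
 * this was the last reference and the block was allocated in the current
 * transaction and never written, its space can be reused immediately;
 * otherwise the block is pinned until the transaction commits.
 */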
7164 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7165 			   struct btrfs_root *root,
7166 			   struct extent_buffer *buf,
7167 			   u64 parent, int last_ref)
7168 {
7169 	int pin = 1;
7170 	int ret;
7171 
7172 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7173 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7174 					buf->start, buf->len,
7175 					parent, root->root_key.objectid,
7176 					btrfs_header_level(buf),
7177 					BTRFS_DROP_DELAYED_REF, NULL);
7178 		BUG_ON(ret); /* -ENOMEM */
7179 	}
7180 
7181 	if (!last_ref)
7182 		return;
7183 
7184 	if (btrfs_header_generation(buf) == trans->transid) {
7185 		struct btrfs_block_group_cache *cache;
7186 
7187 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7188 			ret = check_ref_cleanup(trans, root, buf->start);
7189 			if (!ret)
7190 				goto out;
7191 		}
7192 
7193 		cache = btrfs_lookup_block_group(root->fs_info, buf->start);
7194 
7195 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7196 			pin_down_extent(root, cache, buf->start, buf->len, 1);
7197 			btrfs_put_block_group(cache);
7198 			goto out;
7199 		}
7200 
7201 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7202 
7203 		btrfs_add_free_space(cache, buf->start, buf->len);
7204 		btrfs_free_reserved_bytes(cache, buf->len, 0);
7205 		btrfs_put_block_group(cache);
7206 		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
7207 		pin = 0;
7208 	}
7209 out:
7210 	if (pin)
7211 		add_pinned_bytes(root->fs_info, buf->len,
7212 				 btrfs_header_level(buf),
7213 				 root->root_key.objectid);
7214 
7215 	/*
7216 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
7217 	 * anymore.
7218 	 */
7219 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7220 }
7221 
7222 /* Can return -ENOMEM */
7223 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7224 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7225 		      u64 owner, u64 offset)
7226 {
7227 	int ret;
7228 	struct btrfs_fs_info *fs_info = root->fs_info;
7229 
7230 	if (btrfs_is_testing(fs_info))
7231 		return 0;
7232 
7233 	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
7234 
7235 	/*
7236 	 * tree log blocks never actually go into the extent allocation
7237 	 * tree, just update pinning info and exit early.
7238 	 */
7239 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7240 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7241 		/* unlocks the pinned mutex */
7242 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
7243 		ret = 0;
7244 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7245 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7246 					num_bytes,
7247 					parent, root_objectid, (int)owner,
7248 					BTRFS_DROP_DELAYED_REF, NULL);
7249 	} else {
7250 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7251 						num_bytes,
7252 						parent, root_objectid, owner,
7253 						offset, 0,
7254 						BTRFS_DROP_DELAYED_REF, NULL);
7255 	}
7256 	return ret;
7257 }
7258 
7259 /*
7260  * when we wait for progress in the block group caching, it's because
7261  * our allocation attempt failed at least once.  So, we must sleep
7262  * and let some progress happen before we try again.
7263  *
7264  * This function will sleep at least once waiting for new free space to
7265  * show up, and then it will check the block group free space numbers
7266  * for our min num_bytes.  Another option is to have it go ahead
7267  * and look in the rbtree for a free extent of a given size, but this
7268  * is a good start.
7269  *
7270  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7271  * any of the information in this block group.
7272  */
7273 static noinline void
7274 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7275 				u64 num_bytes)
7276 {
7277 	struct btrfs_caching_control *caching_ctl;
7278 
7279 	caching_ctl = get_caching_control(cache);
7280 	if (!caching_ctl)
7281 		return;
7282 
7283 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7284 		   (cache->free_space_ctl->free_space >= num_bytes));
7285 
7286 	put_caching_control(caching_ctl);
7287 }
7288 
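/*
 * Wait until caching of the block group has finished.  Returns -EIO if the
 * caching thread hit an error, 0 otherwise.
 */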
7289 static noinline int
7290 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7291 {
7292 	struct btrfs_caching_control *caching_ctl;
7293 	int ret = 0;
7294 
7295 	caching_ctl = get_caching_control(cache);
7296 	if (!caching_ctl)
7297 		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7298 
7299 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
7300 	if (cache->cached == BTRFS_CACHE_ERROR)
7301 		ret = -EIO;
7302 	put_caching_control(caching_ctl);
7303 	return ret;
7304 }
7305 
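/*
 * Map block group allocation flags to the btrfs_raid_types index used for
 * the per-profile lists in space_info->block_groups[].
 */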
7306 int __get_raid_index(u64 flags)
7307 {
7308 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
7309 		return BTRFS_RAID_RAID10;
7310 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
7311 		return BTRFS_RAID_RAID1;
7312 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
7313 		return BTRFS_RAID_DUP;
7314 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
7315 		return BTRFS_RAID_RAID0;
7316 	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
7317 		return BTRFS_RAID_RAID5;
7318 	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
7319 		return BTRFS_RAID_RAID6;
7320 
7321 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
7322 }
7323 
7324 int get_block_group_index(struct btrfs_block_group_cache *cache)
7325 {
7326 	return __get_raid_index(cache->flags);
7327 }
7328 
7329 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
7330 	[BTRFS_RAID_RAID10]	= "raid10",
7331 	[BTRFS_RAID_RAID1]	= "raid1",
7332 	[BTRFS_RAID_DUP]	= "dup",
7333 	[BTRFS_RAID_RAID0]	= "raid0",
7334 	[BTRFS_RAID_SINGLE]	= "single",
7335 	[BTRFS_RAID_RAID5]	= "raid5",
7336 	[BTRFS_RAID_RAID6]	= "raid6",
7337 };
7338 
7339 static const char *get_raid_name(enum btrfs_raid_types type)
7340 {
7341 	if (type >= BTRFS_NR_RAID_TYPES)
7342 		return NULL;
7343 
7344 	return btrfs_raid_type_names[type];
7345 }
7346 
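/*
 * Allocation loop stages for find_free_extent(), from least to most
 * aggressive; see the comment near the end of that function.
 */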
7347 enum btrfs_loop_type {
7348 	LOOP_CACHING_NOWAIT = 0,
7349 	LOOP_CACHING_WAIT = 1,
7350 	LOOP_ALLOC_CHUNK = 2,
7351 	LOOP_NO_EMPTY_SIZE = 3,
7352 };
7353 
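/*
 * Helpers for find_free_extent(): for delalloc allocations the block
 * group's data_rwsem is held for reading while we allocate from it; the
 * grab/release variants also take/drop a reference on the group.
 */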
7354 static inline void
7355 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7356 		       int delalloc)
7357 {
7358 	if (delalloc)
7359 		down_read(&cache->data_rwsem);
7360 }
7361 
7362 static inline void
7363 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7364 		       int delalloc)
7365 {
7366 	btrfs_get_block_group(cache);
7367 	if (delalloc)
7368 		down_read(&cache->data_rwsem);
7369 }
7370 
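/*
 * Return the block group that currently backs @cluster, holding a reference
 * on it (unless it is @block_group, which the caller already holds).  For
 * delalloc allocations its data_rwsem is taken as well; if we had to drop
 * the refill_lock to block on that rwsem, re-check that the cluster still
 * points at the same group and retry if it changed.
 */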
7371 static struct btrfs_block_group_cache *
7372 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7373 		   struct btrfs_free_cluster *cluster,
7374 		   int delalloc)
7375 {
7376 	struct btrfs_block_group_cache *used_bg = NULL;
7377 
7378 	spin_lock(&cluster->refill_lock);
7379 	while (1) {
7380 		used_bg = cluster->block_group;
7381 		if (!used_bg)
7382 			return NULL;
7383 
7384 		if (used_bg == block_group)
7385 			return used_bg;
7386 
7387 		btrfs_get_block_group(used_bg);
7388 
7389 		if (!delalloc)
7390 			return used_bg;
7391 
7392 		if (down_read_trylock(&used_bg->data_rwsem))
7393 			return used_bg;
7394 
7395 		spin_unlock(&cluster->refill_lock);
7396 
7397 		down_read(&used_bg->data_rwsem);
7398 
7399 		spin_lock(&cluster->refill_lock);
7400 		if (used_bg == cluster->block_group)
7401 			return used_bg;
7402 
7403 		up_read(&used_bg->data_rwsem);
7404 		btrfs_put_block_group(used_bg);
7405 	}
7406 }
7407 
7408 static inline void
7409 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7410 			 int delalloc)
7411 {
7412 	if (delalloc)
7413 		up_read(&cache->data_rwsem);
7414 	btrfs_put_block_group(cache);
7415 }
7416 
7417 /*
7418  * walks the btree of allocated extents and finds a hole of a given size.
7419  * The key ins is changed to record the hole:
7420  * ins->objectid == start position
7421  * ins->type == BTRFS_EXTENT_ITEM_KEY
7422  * ins->offset == the size of the hole.
7423  * Any available blocks before search_start are skipped.
7424  *
7425  * If there is no suitable free space, we will record the max size of
7426  * the free space extent currently.
7427  */
7428 static noinline int find_free_extent(struct btrfs_root *orig_root,
7429 				u64 ram_bytes, u64 num_bytes, u64 empty_size,
7430 				u64 hint_byte, struct btrfs_key *ins,
7431 				u64 flags, int delalloc)
7432 {
7433 	int ret = 0;
7434 	struct btrfs_root *root = orig_root->fs_info->extent_root;
7435 	struct btrfs_free_cluster *last_ptr = NULL;
7436 	struct btrfs_block_group_cache *block_group = NULL;
7437 	u64 search_start = 0;
7438 	u64 max_extent_size = 0;
7439 	u64 empty_cluster = 0;
7440 	struct btrfs_space_info *space_info;
7441 	int loop = 0;
7442 	int index = __get_raid_index(flags);
7443 	bool failed_cluster_refill = false;
7444 	bool failed_alloc = false;
7445 	bool use_cluster = true;
7446 	bool have_caching_bg = false;
7447 	bool orig_have_caching_bg = false;
7448 	bool full_search = false;
7449 
7450 	WARN_ON(num_bytes < root->sectorsize);
7451 	ins->type = BTRFS_EXTENT_ITEM_KEY;
7452 	ins->objectid = 0;
7453 	ins->offset = 0;
7454 
7455 	trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
7456 
7457 	space_info = __find_space_info(root->fs_info, flags);
7458 	if (!space_info) {
7459 		btrfs_err(root->fs_info, "No space info for %llu", flags);
7460 		return -ENOSPC;
7461 	}
7462 
7463 	/*
7464 	 * If our free space is heavily fragmented we may not be able to make
7465 	 * big contiguous allocations, so instead of doing the expensive search
7466 	 * for free space, simply return ENOSPC with our max_extent_size so we
7467 	 * can go ahead and search for a more manageable chunk.
7468 	 *
7469 	 * If our max_extent_size is large enough for our allocation simply
7470 	 * disable clustering since we will likely not be able to find enough
7471 	 * space to create a cluster and induce latency trying.
7472 	 */
7473 	if (unlikely(space_info->max_extent_size)) {
7474 		spin_lock(&space_info->lock);
7475 		if (space_info->max_extent_size &&
7476 		    num_bytes > space_info->max_extent_size) {
7477 			ins->offset = space_info->max_extent_size;
7478 			spin_unlock(&space_info->lock);
7479 			return -ENOSPC;
7480 		} else if (space_info->max_extent_size) {
7481 			use_cluster = false;
7482 		}
7483 		spin_unlock(&space_info->lock);
7484 	}
7485 
7486 	last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
7487 	if (last_ptr) {
7488 		spin_lock(&last_ptr->lock);
7489 		if (last_ptr->block_group)
7490 			hint_byte = last_ptr->window_start;
7491 		if (last_ptr->fragmented) {
7492 			/*
7493 			 * We still set window_start so we can keep track of the
7494 			 * last place we found an allocation to try and save
7495 			 * some time.
7496 			 */
7497 			hint_byte = last_ptr->window_start;
7498 			use_cluster = false;
7499 		}
7500 		spin_unlock(&last_ptr->lock);
7501 	}
7502 
7503 	search_start = max(search_start, first_logical_byte(root, 0));
7504 	search_start = max(search_start, hint_byte);
7505 	if (search_start == hint_byte) {
7506 		block_group = btrfs_lookup_block_group(root->fs_info,
7507 						       search_start);
7508 		/*
7509 		 * we don't want to use the block group if it doesn't match our
7510 		 * allocation bits, or if it's not cached.
7511 		 *
7512 		 * However if we are re-searching with an ideal block group
7513 		 * picked out then we don't care that the block group is cached.
7514 		 */
7515 		if (block_group && block_group_bits(block_group, flags) &&
7516 		    block_group->cached != BTRFS_CACHE_NO) {
7517 			down_read(&space_info->groups_sem);
7518 			if (list_empty(&block_group->list) ||
7519 			    block_group->ro) {
7520 				/*
7521 				 * someone is removing this block group,
7522 				 * we can't jump into the have_block_group
7523 				 * target because our list pointers are not
7524 				 * valid
7525 				 */
7526 				btrfs_put_block_group(block_group);
7527 				up_read(&space_info->groups_sem);
7528 			} else {
7529 				index = get_block_group_index(block_group);
7530 				btrfs_lock_block_group(block_group, delalloc);
7531 				goto have_block_group;
7532 			}
7533 		} else if (block_group) {
7534 			btrfs_put_block_group(block_group);
7535 		}
7536 	}
7537 search:
7538 	have_caching_bg = false;
7539 	if (index == 0 || index == __get_raid_index(flags))
7540 		full_search = true;
7541 	down_read(&space_info->groups_sem);
7542 	list_for_each_entry(block_group, &space_info->block_groups[index],
7543 			    list) {
7544 		u64 offset;
7545 		int cached;
7546 
7547 		btrfs_grab_block_group(block_group, delalloc);
7548 		search_start = block_group->key.objectid;
7549 
7550 		/*
7551 		 * this can happen if we end up cycling through all the
7552 		 * raid types, but we want to make sure we only allocate
7553 		 * for the proper type.
7554 		 */
7555 		if (!block_group_bits(block_group, flags)) {
7556 			u64 extra = BTRFS_BLOCK_GROUP_DUP |
7557 				    BTRFS_BLOCK_GROUP_RAID1 |
7558 				    BTRFS_BLOCK_GROUP_RAID5 |
7559 				    BTRFS_BLOCK_GROUP_RAID6 |
7560 				    BTRFS_BLOCK_GROUP_RAID10;
7561 
7562 			/*
7563 			 * if they asked for extra copies and this block group
7564 			 * doesn't provide them, bail.  This does allow us to
7565 			 * fill raid0 from raid1.
7566 			 */
7567 			if ((flags & extra) && !(block_group->flags & extra))
7568 				goto loop;
7569 		}
7570 
7571 have_block_group:
7572 		cached = block_group_cache_done(block_group);
7573 		if (unlikely(!cached)) {
7574 			have_caching_bg = true;
7575 			ret = cache_block_group(block_group, 0);
7576 			BUG_ON(ret < 0);
7577 			ret = 0;
7578 		}
7579 
7580 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7581 			goto loop;
7582 		if (unlikely(block_group->ro))
7583 			goto loop;
7584 
7585 		/*
7586 		 * OK, we want to try and use the cluster allocator, so
7587 		 * let's look there
7588 		 */
7589 		if (last_ptr && use_cluster) {
7590 			struct btrfs_block_group_cache *used_block_group;
7591 			unsigned long aligned_cluster;
7592 			/*
7593 			 * the refill lock keeps out other
7594 			 * people trying to start a new cluster
7595 			 */
7596 			used_block_group = btrfs_lock_cluster(block_group,
7597 							      last_ptr,
7598 							      delalloc);
7599 			if (!used_block_group)
7600 				goto refill_cluster;
7601 
7602 			if (used_block_group != block_group &&
7603 			    (used_block_group->ro ||
7604 			     !block_group_bits(used_block_group, flags)))
7605 				goto release_cluster;
7606 
7607 			offset = btrfs_alloc_from_cluster(used_block_group,
7608 						last_ptr,
7609 						num_bytes,
7610 						used_block_group->key.objectid,
7611 						&max_extent_size);
7612 			if (offset) {
7613 				/* we have a block, we're done */
7614 				spin_unlock(&last_ptr->refill_lock);
7615 				trace_btrfs_reserve_extent_cluster(root,
7616 						used_block_group,
7617 						search_start, num_bytes);
7618 				if (used_block_group != block_group) {
7619 					btrfs_release_block_group(block_group,
7620 								  delalloc);
7621 					block_group = used_block_group;
7622 				}
7623 				goto checks;
7624 			}
7625 
7626 			WARN_ON(last_ptr->block_group != used_block_group);
7627 release_cluster:
7628 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
7629 			 * set up a new cluster, so let's just skip it
7630 			 * and let the allocator find whatever block
7631 			 * it can find.  If we reach this point, we
7632 			 * will have tried the cluster allocator
7633 			 * plenty of times and not have found
7634 			 * anything, so we are likely way too
7635 			 * fragmented for the clustering stuff to find
7636 			 * anything.
7637 			 *
7638 			 * However, if the cluster is taken from the
7639 			 * current block group, release the cluster
7640 			 * first, so that we stand a better chance of
7641 			 * succeeding in the unclustered
7642 			 * allocation.  */
7643 			if (loop >= LOOP_NO_EMPTY_SIZE &&
7644 			    used_block_group != block_group) {
7645 				spin_unlock(&last_ptr->refill_lock);
7646 				btrfs_release_block_group(used_block_group,
7647 							  delalloc);
7648 				goto unclustered_alloc;
7649 			}
7650 
7651 			/*
7652 			 * this cluster didn't work out, free it and
7653 			 * start over
7654 			 */
7655 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
7656 
7657 			if (used_block_group != block_group)
7658 				btrfs_release_block_group(used_block_group,
7659 							  delalloc);
7660 refill_cluster:
7661 			if (loop >= LOOP_NO_EMPTY_SIZE) {
7662 				spin_unlock(&last_ptr->refill_lock);
7663 				goto unclustered_alloc;
7664 			}
7665 
7666 			aligned_cluster = max_t(unsigned long,
7667 						empty_cluster + empty_size,
7668 					      block_group->full_stripe_len);
7669 
7670 			/* allocate a cluster in this block group */
7671 			ret = btrfs_find_space_cluster(root, block_group,
7672 						       last_ptr, search_start,
7673 						       num_bytes,
7674 						       aligned_cluster);
7675 			if (ret == 0) {
7676 				/*
7677 				 * now pull our allocation out of this
7678 				 * cluster
7679 				 */
7680 				offset = btrfs_alloc_from_cluster(block_group,
7681 							last_ptr,
7682 							num_bytes,
7683 							search_start,
7684 							&max_extent_size);
7685 				if (offset) {
7686 					/* we found one, proceed */
7687 					spin_unlock(&last_ptr->refill_lock);
7688 					trace_btrfs_reserve_extent_cluster(root,
7689 						block_group, search_start,
7690 						num_bytes);
7691 					goto checks;
7692 				}
7693 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
7694 				   && !failed_cluster_refill) {
7695 				spin_unlock(&last_ptr->refill_lock);
7696 
7697 				failed_cluster_refill = true;
7698 				wait_block_group_cache_progress(block_group,
7699 				       num_bytes + empty_cluster + empty_size);
7700 				goto have_block_group;
7701 			}
7702 
7703 			/*
7704 			 * at this point we either didn't find a cluster
7705 			 * or we weren't able to allocate a block from our
7706 			 * cluster.  Free the cluster we've been trying
7707 			 * to use, and go to the next block group
7708 			 */
7709 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
7710 			spin_unlock(&last_ptr->refill_lock);
7711 			goto loop;
7712 		}
7713 
7714 unclustered_alloc:
7715 		/*
7716 		 * We are doing an unclustered alloc, set the fragmented flag so
7717 		 * we don't bother trying to setup a cluster again until we get
7718 		 * more space.
7719 		 */
7720 		if (unlikely(last_ptr)) {
7721 			spin_lock(&last_ptr->lock);
7722 			last_ptr->fragmented = 1;
7723 			spin_unlock(&last_ptr->lock);
7724 		}
7725 		spin_lock(&block_group->free_space_ctl->tree_lock);
7726 		if (cached &&
7727 		    block_group->free_space_ctl->free_space <
7728 		    num_bytes + empty_cluster + empty_size) {
7729 			if (block_group->free_space_ctl->free_space >
7730 			    max_extent_size)
7731 				max_extent_size =
7732 					block_group->free_space_ctl->free_space;
7733 			spin_unlock(&block_group->free_space_ctl->tree_lock);
7734 			goto loop;
7735 		}
7736 		spin_unlock(&block_group->free_space_ctl->tree_lock);
7737 
7738 		offset = btrfs_find_space_for_alloc(block_group, search_start,
7739 						    num_bytes, empty_size,
7740 						    &max_extent_size);
7741 		/*
7742 		 * If we didn't find a chunk, and we haven't failed on this
7743 		 * block group before, and this block group is in the middle of
7744 		 * caching and we are ok with waiting, then go ahead and wait
7745 		 * for progress to be made, and set failed_alloc to true.
7746 		 *
7747 		 * If failed_alloc is true then we've already waited on this
7748 		 * block group once and should move on to the next block group.
7749 		 */
7750 		if (!offset && !failed_alloc && !cached &&
7751 		    loop > LOOP_CACHING_NOWAIT) {
7752 			wait_block_group_cache_progress(block_group,
7753 						num_bytes + empty_size);
7754 			failed_alloc = true;
7755 			goto have_block_group;
7756 		} else if (!offset) {
7757 			goto loop;
7758 		}
7759 checks:
7760 		search_start = ALIGN(offset, root->stripesize);
7761 
7762 		/* move on to the next group */
7763 		if (search_start + num_bytes >
7764 		    block_group->key.objectid + block_group->key.offset) {
7765 			btrfs_add_free_space(block_group, offset, num_bytes);
7766 			goto loop;
7767 		}
7768 
7769 		if (offset < search_start)
7770 			btrfs_add_free_space(block_group, offset,
7771 					     search_start - offset);
7772 		BUG_ON(offset > search_start);
7773 
7774 		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7775 				num_bytes, delalloc);
7776 		if (ret == -EAGAIN) {
7777 			btrfs_add_free_space(block_group, offset, num_bytes);
7778 			goto loop;
7779 		}
7780 		btrfs_inc_block_group_reservations(block_group);
7781 
7782 		/* we are all good, let's return */
7783 		ins->objectid = search_start;
7784 		ins->offset = num_bytes;
7785 
7786 		trace_btrfs_reserve_extent(orig_root, block_group,
7787 					   search_start, num_bytes);
7788 		btrfs_release_block_group(block_group, delalloc);
7789 		break;
7790 loop:
7791 		failed_cluster_refill = false;
7792 		failed_alloc = false;
7793 		BUG_ON(index != get_block_group_index(block_group));
7794 		btrfs_release_block_group(block_group, delalloc);
7795 	}
7796 	up_read(&space_info->groups_sem);
7797 
7798 	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7799 		&& !orig_have_caching_bg)
7800 		orig_have_caching_bg = true;
7801 
7802 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7803 		goto search;
7804 
7805 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7806 		goto search;
7807 
7808 	/*
7809 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7810 	 *			caching kthreads as we move along
7811 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7812 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7813 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7814 	 *			again
7815 	 */
7816 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7817 		index = 0;
7818 		if (loop == LOOP_CACHING_NOWAIT) {
7819 			/*
7820 			 * We want to skip the LOOP_CACHING_WAIT step if we
7821 			 * don't have any uncached bgs and we've already done a
7822 			 * full search through.
7823 			 */
7824 			if (orig_have_caching_bg || !full_search)
7825 				loop = LOOP_CACHING_WAIT;
7826 			else
7827 				loop = LOOP_ALLOC_CHUNK;
7828 		} else {
7829 			loop++;
7830 		}
7831 
7832 		if (loop == LOOP_ALLOC_CHUNK) {
7833 			struct btrfs_trans_handle *trans;
7834 			int exist = 0;
7835 
7836 			trans = current->journal_info;
7837 			if (trans)
7838 				exist = 1;
7839 			else
7840 				trans = btrfs_join_transaction(root);
7841 
7842 			if (IS_ERR(trans)) {
7843 				ret = PTR_ERR(trans);
7844 				goto out;
7845 			}
7846 
7847 			ret = do_chunk_alloc(trans, root, flags,
7848 					     CHUNK_ALLOC_FORCE);
7849 
7850 			/*
7851 			 * If we can't allocate a new chunk, we've already looped
7852 			 * through at least once, so move on to the NO_EMPTY_SIZE
7853 			 * case.
7854 			 */
7855 			if (ret == -ENOSPC)
7856 				loop = LOOP_NO_EMPTY_SIZE;
7857 
7858 			/*
7859 			 * Do not bail out on ENOSPC since we
7860 			 * can do more things.
7861 			 */
7862 			if (ret < 0 && ret != -ENOSPC)
7863 				btrfs_abort_transaction(trans, ret);
7864 			else
7865 				ret = 0;
7866 			if (!exist)
7867 				btrfs_end_transaction(trans, root);
7868 			if (ret)
7869 				goto out;
7870 		}
7871 
7872 		if (loop == LOOP_NO_EMPTY_SIZE) {
7873 			/*
7874 			 * Don't loop again if we already have no empty_size and
7875 			 * no empty_cluster.
7876 			 */
7877 			if (empty_size == 0 &&
7878 			    empty_cluster == 0) {
7879 				ret = -ENOSPC;
7880 				goto out;
7881 			}
7882 			empty_size = 0;
7883 			empty_cluster = 0;
7884 		}
7885 
7886 		goto search;
7887 	} else if (!ins->objectid) {
7888 		ret = -ENOSPC;
7889 	} else if (ins->objectid) {
7890 		if (!use_cluster && last_ptr) {
7891 			spin_lock(&last_ptr->lock);
7892 			last_ptr->window_start = ins->objectid;
7893 			spin_unlock(&last_ptr->lock);
7894 		}
7895 		ret = 0;
7896 	}
7897 out:
7898 	if (ret == -ENOSPC) {
7899 		spin_lock(&space_info->lock);
7900 		space_info->max_extent_size = max_extent_size;
7901 		spin_unlock(&space_info->lock);
7902 		ins->offset = max_extent_size;
7903 	}
7904 	return ret;
7905 }
7906 
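/*
 * Dump the counters of a space_info, and optionally of each of its block
 * groups, to the kernel log.  Used for ENOSPC debugging.
 */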
7907 static void dump_space_info(struct btrfs_fs_info *fs_info,
7908 			    struct btrfs_space_info *info, u64 bytes,
7909 			    int dump_block_groups)
7910 {
7911 	struct btrfs_block_group_cache *cache;
7912 	int index = 0;
7913 
7914 	spin_lock(&info->lock);
7915 	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7916 		   info->flags,
7917 		   info->total_bytes - info->bytes_used - info->bytes_pinned -
7918 		   info->bytes_reserved - info->bytes_readonly -
7919 		   info->bytes_may_use, (info->full) ? "" : "not ");
7920 	btrfs_info(fs_info,
7921 		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7922 		info->total_bytes, info->bytes_used, info->bytes_pinned,
7923 		info->bytes_reserved, info->bytes_may_use,
7924 		info->bytes_readonly);
7925 	spin_unlock(&info->lock);
7926 
7927 	if (!dump_block_groups)
7928 		return;
7929 
7930 	down_read(&info->groups_sem);
7931 again:
7932 	list_for_each_entry(cache, &info->block_groups[index], list) {
7933 		spin_lock(&cache->lock);
7934 		btrfs_info(fs_info,
7935 			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7936 			cache->key.objectid, cache->key.offset,
7937 			btrfs_block_group_used(&cache->item), cache->pinned,
7938 			cache->reserved, cache->ro ? "[readonly]" : "");
7939 		btrfs_dump_free_space(cache, bytes);
7940 		spin_unlock(&cache->lock);
7941 	}
7942 	if (++index < BTRFS_NR_RAID_TYPES)
7943 		goto again;
7944 	up_read(&info->groups_sem);
7945 }
7946 
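/*
 * Reserve an extent of @num_bytes.  On ENOSPC the request is retried with
 * progressively smaller sizes (halved and rounded down to the sectorsize)
 * until it reaches @min_alloc_size, at which point we give up.
 */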
7947 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7948 			 u64 num_bytes, u64 min_alloc_size,
7949 			 u64 empty_size, u64 hint_byte,
7950 			 struct btrfs_key *ins, int is_data, int delalloc)
7951 {
7952 	struct btrfs_fs_info *fs_info = root->fs_info;
7953 	bool final_tried = num_bytes == min_alloc_size;
7954 	u64 flags;
7955 	int ret;
7956 
7957 	flags = btrfs_get_alloc_profile(root, is_data);
7958 again:
7959 	WARN_ON(num_bytes < root->sectorsize);
7960 	ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
7961 			       hint_byte, ins, flags, delalloc);
7962 	if (!ret && !is_data) {
7963 		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
7964 	} else if (ret == -ENOSPC) {
7965 		if (!final_tried && ins->offset) {
7966 			num_bytes = min(num_bytes >> 1, ins->offset);
7967 			num_bytes = round_down(num_bytes, root->sectorsize);
7968 			num_bytes = max(num_bytes, min_alloc_size);
7969 			ram_bytes = num_bytes;
7970 			if (num_bytes == min_alloc_size)
7971 				final_tried = true;
7972 			goto again;
7973 		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
7974 			struct btrfs_space_info *sinfo;
7975 
7976 			sinfo = __find_space_info(fs_info, flags);
7977 			btrfs_err(root->fs_info,
7978 				  "allocation failed flags %llu, wanted %llu",
7979 				  flags, num_bytes);
7980 			if (sinfo)
7981 				dump_space_info(fs_info, sinfo, num_bytes, 1);
7982 		}
7983 	}
7984 
7985 	return ret;
7986 }
7987 
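/*
 * Return a reserved but unused extent to its block group.  Depending on
 * @pin the range is either pinned until transaction commit or put straight
 * back into the free space cache (optionally issuing a discard).
 */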
7988 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7989 					u64 start, u64 len,
7990 					int pin, int delalloc)
7991 {
7992 	struct btrfs_block_group_cache *cache;
7993 	int ret = 0;
7994 
7995 	cache = btrfs_lookup_block_group(root->fs_info, start);
7996 	if (!cache) {
7997 		btrfs_err(root->fs_info, "Unable to find block group for %llu",
7998 			start);
7999 		return -ENOSPC;
8000 	}
8001 
8002 	if (pin)
8003 		pin_down_extent(root, cache, start, len, 1);
8004 	else {
8005 		if (btrfs_test_opt(root->fs_info, DISCARD))
8006 			ret = btrfs_discard_extent(root, start, len, NULL);
8007 		btrfs_add_free_space(cache, start, len);
8008 		btrfs_free_reserved_bytes(cache, len, delalloc);
8009 		trace_btrfs_reserved_extent_free(root, start, len);
8010 	}
8011 
8012 	btrfs_put_block_group(cache);
8013 	return ret;
8014 }
8015 
8016 int btrfs_free_reserved_extent(struct btrfs_root *root,
8017 			       u64 start, u64 len, int delalloc)
8018 {
8019 	return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
8020 }
8021 
8022 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
8023 				       u64 start, u64 len)
8024 {
8025 	return __btrfs_free_reserved_extent(root, start, len, 1, 0);
8026 }
8027 
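/*
 * Insert the extent item plus an inline data backref for a newly allocated
 * data extent, then update the free space tree and block group accounting.
 */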
8028 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8029 				      struct btrfs_root *root,
8030 				      u64 parent, u64 root_objectid,
8031 				      u64 flags, u64 owner, u64 offset,
8032 				      struct btrfs_key *ins, int ref_mod)
8033 {
8034 	int ret;
8035 	struct btrfs_fs_info *fs_info = root->fs_info;
8036 	struct btrfs_extent_item *extent_item;
8037 	struct btrfs_extent_inline_ref *iref;
8038 	struct btrfs_path *path;
8039 	struct extent_buffer *leaf;
8040 	int type;
8041 	u32 size;
8042 
8043 	if (parent > 0)
8044 		type = BTRFS_SHARED_DATA_REF_KEY;
8045 	else
8046 		type = BTRFS_EXTENT_DATA_REF_KEY;
8047 
8048 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8049 
8050 	path = btrfs_alloc_path();
8051 	if (!path)
8052 		return -ENOMEM;
8053 
8054 	path->leave_spinning = 1;
8055 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8056 				      ins, size);
8057 	if (ret) {
8058 		btrfs_free_path(path);
8059 		return ret;
8060 	}
8061 
8062 	leaf = path->nodes[0];
8063 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8064 				     struct btrfs_extent_item);
8065 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8066 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8067 	btrfs_set_extent_flags(leaf, extent_item,
8068 			       flags | BTRFS_EXTENT_FLAG_DATA);
8069 
8070 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8071 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
8072 	if (parent > 0) {
8073 		struct btrfs_shared_data_ref *ref;
8074 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
8075 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8076 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8077 	} else {
8078 		struct btrfs_extent_data_ref *ref;
8079 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8080 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8081 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8082 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8083 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8084 	}
8085 
8086 	btrfs_mark_buffer_dirty(path->nodes[0]);
8087 	btrfs_free_path(path);
8088 
8089 	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8090 					  ins->offset);
8091 	if (ret)
8092 		return ret;
8093 
8094 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
8095 	if (ret) { /* -ENOENT, logic error */
8096 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8097 			ins->objectid, ins->offset);
8098 		BUG();
8099 	}
8100 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
8101 	return ret;
8102 }
8103 
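/*
 * Insert the extent item plus an inline backref for a newly allocated tree
 * block.  With the SKINNY_METADATA incompat flag a metadata item keyed by
 * level is used instead of an extent item with embedded tree block info.
 */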
8104 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8105 				     struct btrfs_root *root,
8106 				     u64 parent, u64 root_objectid,
8107 				     u64 flags, struct btrfs_disk_key *key,
8108 				     int level, struct btrfs_key *ins)
8109 {
8110 	int ret;
8111 	struct btrfs_fs_info *fs_info = root->fs_info;
8112 	struct btrfs_extent_item *extent_item;
8113 	struct btrfs_tree_block_info *block_info;
8114 	struct btrfs_extent_inline_ref *iref;
8115 	struct btrfs_path *path;
8116 	struct extent_buffer *leaf;
8117 	u32 size = sizeof(*extent_item) + sizeof(*iref);
8118 	u64 num_bytes = ins->offset;
8119 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8120 						 SKINNY_METADATA);
8121 
8122 	if (!skinny_metadata)
8123 		size += sizeof(*block_info);
8124 
8125 	path = btrfs_alloc_path();
8126 	if (!path) {
8127 		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8128 						   root->nodesize);
8129 		return -ENOMEM;
8130 	}
8131 
8132 	path->leave_spinning = 1;
8133 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8134 				      ins, size);
8135 	if (ret) {
8136 		btrfs_free_path(path);
8137 		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8138 						   root->nodesize);
8139 		return ret;
8140 	}
8141 
8142 	leaf = path->nodes[0];
8143 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8144 				     struct btrfs_extent_item);
8145 	btrfs_set_extent_refs(leaf, extent_item, 1);
8146 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8147 	btrfs_set_extent_flags(leaf, extent_item,
8148 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8149 
8150 	if (skinny_metadata) {
8151 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8152 		num_bytes = root->nodesize;
8153 	} else {
8154 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8155 		btrfs_set_tree_block_key(leaf, block_info, key);
8156 		btrfs_set_tree_block_level(leaf, block_info, level);
8157 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8158 	}
8159 
8160 	if (parent > 0) {
8161 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8162 		btrfs_set_extent_inline_ref_type(leaf, iref,
8163 						 BTRFS_SHARED_BLOCK_REF_KEY);
8164 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8165 	} else {
8166 		btrfs_set_extent_inline_ref_type(leaf, iref,
8167 						 BTRFS_TREE_BLOCK_REF_KEY);
8168 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8169 	}
8170 
8171 	btrfs_mark_buffer_dirty(leaf);
8172 	btrfs_free_path(path);
8173 
8174 	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8175 					  num_bytes);
8176 	if (ret)
8177 		return ret;
8178 
8179 	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
8180 				 1);
8181 	if (ret) { /* -ENOENT, logic error */
8182 		btrfs_err(fs_info, "update block group failed for %llu %llu",
8183 			ins->objectid, ins->offset);
8184 		BUG();
8185 	}
8186 
8187 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
8188 	return ret;
8189 }
8190 
8191 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8192 				     struct btrfs_root *root,
8193 				     u64 root_objectid, u64 owner,
8194 				     u64 offset, u64 ram_bytes,
8195 				     struct btrfs_key *ins)
8196 {
8197 	int ret;
8198 
8199 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
8200 
8201 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
8202 					 ins->offset, 0,
8203 					 root_objectid, owner, offset,
8204 					 ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
8205 					 NULL);
8206 	return ret;
8207 }
8208 
8209 /*
8210  * this is used by the tree logging recovery code.  It records that
8211  * an extent has been allocated and makes sure to clear the free
8212  * space cache bits as well
8213  */
8214 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8215 				   struct btrfs_root *root,
8216 				   u64 root_objectid, u64 owner, u64 offset,
8217 				   struct btrfs_key *ins)
8218 {
8219 	int ret;
8220 	struct btrfs_block_group_cache *block_group;
8221 	struct btrfs_space_info *space_info;
8222 
8223 	/*
8224 	 * Mixed block groups will exclude before processing the log so we only
8225 	 * need to do the exclude dance if this fs isn't mixed.
8226 	 */
8227 	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
8228 		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
8229 		if (ret)
8230 			return ret;
8231 	}
8232 
8233 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
8234 	if (!block_group)
8235 		return -EINVAL;
8236 
8237 	space_info = block_group->space_info;
8238 	spin_lock(&space_info->lock);
8239 	spin_lock(&block_group->lock);
8240 	space_info->bytes_reserved += ins->offset;
8241 	block_group->reserved += ins->offset;
8242 	spin_unlock(&block_group->lock);
8243 	spin_unlock(&space_info->lock);
8244 
8245 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
8246 					 0, owner, offset, ins, 1);
8247 	btrfs_put_block_group(block_group);
8248 	return ret;
8249 }
8250 
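/*
 * Initialize a freshly allocated tree block: set its generation, lock it,
 * mark it uptodate and record it in the appropriate dirty pages set (log
 * trees track dirty pages separately per log transaction).
 */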
8251 static struct extent_buffer *
8252 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8253 		      u64 bytenr, int level)
8254 {
8255 	struct extent_buffer *buf;
8256 
8257 	buf = btrfs_find_create_tree_block(root, bytenr);
8258 	if (IS_ERR(buf))
8259 		return buf;
8260 
8261 	btrfs_set_header_generation(buf, trans->transid);
8262 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8263 	btrfs_tree_lock(buf);
8264 	clean_tree_block(trans, root->fs_info, buf);
8265 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8266 
8267 	btrfs_set_lock_blocking(buf);
8268 	set_extent_buffer_uptodate(buf);
8269 
8270 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8271 		buf->log_index = root->log_transid % 2;
8272 		/*
8273 		 * we allow two log transactions at a time, use different
8274 		 * EXENT bit to differentiate dirty pages.
8275 		 */
8276 		if (buf->log_index == 0)
8277 			set_extent_dirty(&root->dirty_log_pages, buf->start,
8278 					buf->start + buf->len - 1, GFP_NOFS);
8279 		else
8280 			set_extent_new(&root->dirty_log_pages, buf->start,
8281 					buf->start + buf->len - 1);
8282 	} else {
8283 		buf->log_index = -1;
8284 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8285 			 buf->start + buf->len - 1, GFP_NOFS);
8286 	}
8287 	trans->dirty = true;
8288 	/* this returns a buffer locked for blocking */
8289 	return buf;
8290 }
8291 
8292 static struct btrfs_block_rsv *
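/*
 * Pick the block reserve to charge a new tree block to.  Falls back to
 * reserving fresh metadata space and, as a last resort, to the global
 * reserve when the space info matches.
 */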
8293 use_block_rsv(struct btrfs_trans_handle *trans,
8294 	      struct btrfs_root *root, u32 blocksize)
8295 {
8296 	struct btrfs_block_rsv *block_rsv;
8297 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
8298 	int ret;
8299 	bool global_updated = false;
8300 
8301 	block_rsv = get_block_rsv(trans, root);
8302 
8303 	if (unlikely(block_rsv->size == 0))
8304 		goto try_reserve;
8305 again:
8306 	ret = block_rsv_use_bytes(block_rsv, blocksize);
8307 	if (!ret)
8308 		return block_rsv;
8309 
8310 	if (block_rsv->failfast)
8311 		return ERR_PTR(ret);
8312 
8313 	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8314 		global_updated = true;
8315 		update_global_block_rsv(root->fs_info);
8316 		goto again;
8317 	}
8318 
8319 	if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
8320 		static DEFINE_RATELIMIT_STATE(_rs,
8321 				DEFAULT_RATELIMIT_INTERVAL * 10,
8322 				/*DEFAULT_RATELIMIT_BURST*/ 1);
8323 		if (__ratelimit(&_rs))
8324 			WARN(1, KERN_DEBUG
8325 				"BTRFS: block rsv returned %d\n", ret);
8326 	}
8327 try_reserve:
8328 	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8329 				     BTRFS_RESERVE_NO_FLUSH);
8330 	if (!ret)
8331 		return block_rsv;
8332 	/*
8333 	 * If we couldn't reserve metadata bytes, try to use some from
8334 	 * the global reserve if its space type is the same as the global
8335 	 * reservation.
8336 	 */
8337 	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8338 	    block_rsv->space_info == global_rsv->space_info) {
8339 		ret = block_rsv_use_bytes(global_rsv, blocksize);
8340 		if (!ret)
8341 			return global_rsv;
8342 	}
8343 	return ERR_PTR(ret);
8344 }
8345 
8346 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8347 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
8348 {
8349 	block_rsv_add_bytes(block_rsv, blocksize, 0);
8350 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
8351 }
8352 
8353 /*
8354  * finds a free extent and does all the dirty work required for allocation.
8355  * Returns the tree buffer or an ERR_PTR on error.
8356  */
8357 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8358 					struct btrfs_root *root,
8359 					u64 parent, u64 root_objectid,
8360 					struct btrfs_disk_key *key, int level,
8361 					u64 hint, u64 empty_size)
8362 {
8363 	struct btrfs_key ins;
8364 	struct btrfs_block_rsv *block_rsv;
8365 	struct extent_buffer *buf;
8366 	struct btrfs_delayed_extent_op *extent_op;
8367 	u64 flags = 0;
8368 	int ret;
8369 	u32 blocksize = root->nodesize;
8370 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8371 						 SKINNY_METADATA);
8372 
8373 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8374 	if (btrfs_is_testing(root->fs_info)) {
8375 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8376 					    level);
8377 		if (!IS_ERR(buf))
8378 			root->alloc_bytenr += blocksize;
8379 		return buf;
8380 	}
8381 #endif
8382 
8383 	block_rsv = use_block_rsv(trans, root, blocksize);
8384 	if (IS_ERR(block_rsv))
8385 		return ERR_CAST(block_rsv);
8386 
8387 	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8388 				   empty_size, hint, &ins, 0, 0);
8389 	if (ret)
8390 		goto out_unuse;
8391 
8392 	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8393 	if (IS_ERR(buf)) {
8394 		ret = PTR_ERR(buf);
8395 		goto out_free_reserved;
8396 	}
8397 
8398 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8399 		if (parent == 0)
8400 			parent = ins.objectid;
8401 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8402 	} else
8403 		BUG_ON(parent > 0);
8404 
8405 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8406 		extent_op = btrfs_alloc_delayed_extent_op();
8407 		if (!extent_op) {
8408 			ret = -ENOMEM;
8409 			goto out_free_buf;
8410 		}
8411 		if (key)
8412 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
8413 		else
8414 			memset(&extent_op->key, 0, sizeof(extent_op->key));
8415 		extent_op->flags_to_set = flags;
8416 		extent_op->update_key = skinny_metadata ? false : true;
8417 		extent_op->update_flags = true;
8418 		extent_op->is_data = false;
8419 		extent_op->level = level;
8420 
8421 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
8422 						 ins.objectid, ins.offset,
8423 						 parent, root_objectid, level,
8424 						 BTRFS_ADD_DELAYED_EXTENT,
8425 						 extent_op);
8426 		if (ret)
8427 			goto out_free_delayed;
8428 	}
8429 	return buf;
8430 
8431 out_free_delayed:
8432 	btrfs_free_delayed_extent_op(extent_op);
8433 out_free_buf:
8434 	free_extent_buffer(buf);
8435 out_free_reserved:
8436 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
8437 out_unuse:
8438 	unuse_block_rsv(root->fs_info, block_rsv, blocksize);
8439 	return ERR_PTR(ret);
8440 }
8441 
8442 struct walk_control {
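/*
 * State shared by the reference-dropping tree walk; see walk_down_proc()
 * and friends below.  The walk runs in two stages, DROP_REFERENCE and
 * UPDATE_BACKREF.
 */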
8443 	u64 refs[BTRFS_MAX_LEVEL];
8444 	u64 flags[BTRFS_MAX_LEVEL];
8445 	struct btrfs_key update_progress;
8446 	int stage;
8447 	int level;
8448 	int shared_level;
8449 	int update_ref;
8450 	int keep_locks;
8451 	int reada_slot;
8452 	int reada_count;
8453 	int for_reloc;
8454 };
8455 
8456 #define DROP_REFERENCE	1
8457 #define UPDATE_BACKREF	2
8458 
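/*
 * Read ahead the child blocks we are about to visit during the walk.  The
 * readahead window (wc->reada_count) grows or shrinks depending on how far
 * the walk has progressed, and blocks the current stage will not descend
 * into are skipped.
 */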
8459 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8460 				     struct btrfs_root *root,
8461 				     struct walk_control *wc,
8462 				     struct btrfs_path *path)
8463 {
8464 	u64 bytenr;
8465 	u64 generation;
8466 	u64 refs;
8467 	u64 flags;
8468 	u32 nritems;
8469 	struct btrfs_key key;
8470 	struct extent_buffer *eb;
8471 	int ret;
8472 	int slot;
8473 	int nread = 0;
8474 
8475 	if (path->slots[wc->level] < wc->reada_slot) {
8476 		wc->reada_count = wc->reada_count * 2 / 3;
8477 		wc->reada_count = max(wc->reada_count, 2);
8478 	} else {
8479 		wc->reada_count = wc->reada_count * 3 / 2;
8480 		wc->reada_count = min_t(int, wc->reada_count,
8481 					BTRFS_NODEPTRS_PER_BLOCK(root));
8482 	}
8483 
8484 	eb = path->nodes[wc->level];
8485 	nritems = btrfs_header_nritems(eb);
8486 
8487 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8488 		if (nread >= wc->reada_count)
8489 			break;
8490 
8491 		cond_resched();
8492 		bytenr = btrfs_node_blockptr(eb, slot);
8493 		generation = btrfs_node_ptr_generation(eb, slot);
8494 
8495 		if (slot == path->slots[wc->level])
8496 			goto reada;
8497 
8498 		if (wc->stage == UPDATE_BACKREF &&
8499 		    generation <= root->root_key.offset)
8500 			continue;
8501 
8502 		/* We don't lock the tree block, it's OK to be racy here */
8503 		ret = btrfs_lookup_extent_info(trans, root, bytenr,
8504 					       wc->level - 1, 1, &refs,
8505 					       &flags);
8506 		/* We don't care about errors in readahead. */
8507 		if (ret < 0)
8508 			continue;
8509 		BUG_ON(refs == 0);
8510 
8511 		if (wc->stage == DROP_REFERENCE) {
8512 			if (refs == 1)
8513 				goto reada;
8514 
8515 			if (wc->level == 1 &&
8516 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8517 				continue;
8518 			if (!wc->update_ref ||
8519 			    generation <= root->root_key.offset)
8520 				continue;
8521 			btrfs_node_key_to_cpu(eb, &key, slot);
8522 			ret = btrfs_comp_cpu_keys(&key,
8523 						  &wc->update_progress);
8524 			if (ret < 0)
8525 				continue;
8526 		} else {
8527 			if (wc->level == 1 &&
8528 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8529 				continue;
8530 		}
8531 reada:
8532 		readahead_tree_block(root, bytenr);
8533 		nread++;
8534 	}
8535 	wc->reada_slot = slot;
8536 }
8537 
8538 static int account_leaf_items(struct btrfs_trans_handle *trans,
8539 			      struct btrfs_root *root,
8540 			      struct extent_buffer *eb)
8541 {
8542 	int nr = btrfs_header_nritems(eb);
8543 	int i, extent_type, ret;
8544 	struct btrfs_key key;
8545 	struct btrfs_file_extent_item *fi;
8546 	u64 bytenr, num_bytes;
8547 
8548 	/* We can be called directly from walk_up_proc() */
8549 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
8550 		return 0;
8551 
8552 	for (i = 0; i < nr; i++) {
8553 		btrfs_item_key_to_cpu(eb, &key, i);
8554 
8555 		if (key.type != BTRFS_EXTENT_DATA_KEY)
8556 			continue;
8557 
8558 		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
8559 		/* filter out non-qgroup-accountable extents */
8560 		extent_type = btrfs_file_extent_type(eb, fi);
8561 
8562 		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
8563 			continue;
8564 
8565 		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
8566 		if (!bytenr)
8567 			continue;
8568 
8569 		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8570 
8571 		ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
8572 				bytenr, num_bytes, GFP_NOFS);
8573 		if (ret)
8574 			return ret;
8575 	}
8576 	return 0;
8577 }
8578 
8579 /*
8580  * Walk up the tree from the bottom, freeing leaves and any interior
8581  * nodes which have had all slots visited. If a node (leaf or
8582  * interior) is freed, the node above it will have its slot
8583  * incremented. The root node will never be freed.
8584  *
8585  * At the end of this function, we should have a path which has all
8586  * slots incremented to the next position for a search. If we need to
8587  * read a new node it will be NULL and the node above it will have the
8588  * correct slot selected for a later read.
8589  *
8590  * If we increment the root nodes slot counter past the number of
8591  * elements, 1 is returned to signal completion of the search.
8592  */
8593 static int adjust_slots_upwards(struct btrfs_root *root,
8594 				struct btrfs_path *path, int root_level)
8595 {
8596 	int level = 0;
8597 	int nr, slot;
8598 	struct extent_buffer *eb;
8599 
8600 	if (root_level == 0)
8601 		return 1;
8602 
8603 	while (level <= root_level) {
8604 		eb = path->nodes[level];
8605 		nr = btrfs_header_nritems(eb);
8606 		path->slots[level]++;
8607 		slot = path->slots[level];
8608 		if (slot >= nr || level == 0) {
8609 			/*
8610 			 * Don't free the root - we will detect this
8611 			 * condition after our loop and return a
8612 			 * positive value for the caller to stop walking the tree.
8613 			 */
8614 			if (level != root_level) {
8615 				btrfs_tree_unlock_rw(eb, path->locks[level]);
8616 				path->locks[level] = 0;
8617 
8618 				free_extent_buffer(eb);
8619 				path->nodes[level] = NULL;
8620 				path->slots[level] = 0;
8621 			}
8622 		} else {
8623 			/*
8624 			 * We have a valid slot to walk back down
8625 			 * from. Stop here so the caller can process these
8626 			 * new nodes.
8627 			 */
8628 			break;
8629 		}
8630 
8631 		level++;
8632 	}
8633 
8634 	eb = path->nodes[root_level];
8635 	if (path->slots[root_level] >= btrfs_header_nritems(eb))
8636 		return 1;
8637 
8638 	return 0;
8639 }
8640 
8641 /*
8642  * root_eb is the subtree root and is locked before this function is called.
8643  */
8644 static int account_shared_subtree(struct btrfs_trans_handle *trans,
8645 				  struct btrfs_root *root,
8646 				  struct extent_buffer *root_eb,
8647 				  u64 root_gen,
8648 				  int root_level)
8649 {
8650 	int ret = 0;
8651 	int level;
8652 	struct extent_buffer *eb = root_eb;
8653 	struct btrfs_path *path = NULL;
8654 
8655 	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
8656 	BUG_ON(root_eb == NULL);
8657 
8658 	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
8659 		return 0;
8660 
8661 	if (!extent_buffer_uptodate(root_eb)) {
8662 		ret = btrfs_read_buffer(root_eb, root_gen);
8663 		if (ret)
8664 			goto out;
8665 	}
8666 
8667 	if (root_level == 0) {
8668 		ret = account_leaf_items(trans, root, root_eb);
8669 		goto out;
8670 	}
8671 
8672 	path = btrfs_alloc_path();
8673 	if (!path)
8674 		return -ENOMEM;
8675 
8676 	/*
8677 	 * Walk down the tree.  Missing extent blocks are filled in as
8678 	 * we go. Metadata is accounted every time we read a new
8679 	 * extent block.
8680 	 *
8681 	 * When we reach a leaf, we account for file extent items in it,
8682 	 * walk back up the tree (adjusting slot pointers as we go)
8683 	 * and restart the search process.
8684 	 */
8685 	extent_buffer_get(root_eb); /* For path */
8686 	path->nodes[root_level] = root_eb;
8687 	path->slots[root_level] = 0;
8688 	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
8689 walk_down:
8690 	level = root_level;
8691 	while (level >= 0) {
8692 		if (path->nodes[level] == NULL) {
8693 			int parent_slot;
8694 			u64 child_gen;
8695 			u64 child_bytenr;
8696 
8697 			/* We need to get child blockptr/gen from
8698 			 * parent before we can read it. */
8699 			eb = path->nodes[level + 1];
8700 			parent_slot = path->slots[level + 1];
8701 			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
8702 			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
8703 
8704 			eb = read_tree_block(root, child_bytenr, child_gen);
8705 			if (IS_ERR(eb)) {
8706 				ret = PTR_ERR(eb);
8707 				goto out;
8708 			} else if (!extent_buffer_uptodate(eb)) {
8709 				free_extent_buffer(eb);
8710 				ret = -EIO;
8711 				goto out;
8712 			}
8713 
8714 			path->nodes[level] = eb;
8715 			path->slots[level] = 0;
8716 
8717 			btrfs_tree_read_lock(eb);
8718 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8719 			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8720 
8721 			ret = btrfs_qgroup_insert_dirty_extent(trans,
8722 					root->fs_info, child_bytenr,
8723 					root->nodesize, GFP_NOFS);
8724 			if (ret)
8725 				goto out;
8726 		}
8727 
8728 		if (level == 0) {
8729 			ret = account_leaf_items(trans, root, path->nodes[level]);
8730 			if (ret)
8731 				goto out;
8732 
8733 			/* Nonzero return here means we completed our search */
8734 			ret = adjust_slots_upwards(root, path, root_level);
8735 			if (ret)
8736 				break;
8737 
8738 			/* Restart search with new slots */
8739 			goto walk_down;
8740 		}
8741 
8742 		level--;
8743 	}
8744 
8745 	ret = 0;
8746 out:
8747 	btrfs_free_path(path);
8748 
8749 	return ret;
8750 }
8751 
8752 /*
8753  * helper to process tree block while walking down the tree.
8754  *
8755  * when wc->stage == UPDATE_BACKREF, this function updates
8756  * back refs for pointers in the block.
8757  *
8758  * NOTE: return value 1 means we should stop walking down.
8759  */
8760 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8761 				   struct btrfs_root *root,
8762 				   struct btrfs_path *path,
8763 				   struct walk_control *wc, int lookup_info)
8764 {
8765 	int level = wc->level;
8766 	struct extent_buffer *eb = path->nodes[level];
8767 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8768 	int ret;
8769 
8770 	if (wc->stage == UPDATE_BACKREF &&
8771 	    btrfs_header_owner(eb) != root->root_key.objectid)
8772 		return 1;
8773 
8774 	/*
8775 	 * when the reference count of a tree block is 1, it won't increase
8776 	 * again. Once the full backref flag is set, we never clear it.
8777 	 */
8778 	if (lookup_info &&
8779 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8780 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8781 		BUG_ON(!path->locks[level]);
8782 		ret = btrfs_lookup_extent_info(trans, root,
8783 					       eb->start, level, 1,
8784 					       &wc->refs[level],
8785 					       &wc->flags[level]);
8786 		BUG_ON(ret == -ENOMEM);
8787 		if (ret)
8788 			return ret;
8789 		BUG_ON(wc->refs[level] == 0);
8790 	}
8791 
8792 	if (wc->stage == DROP_REFERENCE) {
8793 		if (wc->refs[level] > 1)
8794 			return 1;
8795 
8796 		if (path->locks[level] && !wc->keep_locks) {
8797 			btrfs_tree_unlock_rw(eb, path->locks[level]);
8798 			path->locks[level] = 0;
8799 		}
8800 		return 0;
8801 	}
8802 
8803 	/* wc->stage == UPDATE_BACKREF */
8804 	if (!(wc->flags[level] & flag)) {
8805 		BUG_ON(!path->locks[level]);
8806 		ret = btrfs_inc_ref(trans, root, eb, 1);
8807 		BUG_ON(ret); /* -ENOMEM */
8808 		ret = btrfs_dec_ref(trans, root, eb, 0);
8809 		BUG_ON(ret); /* -ENOMEM */
8810 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8811 						  eb->len, flag,
8812 						  btrfs_header_level(eb), 0);
8813 		BUG_ON(ret); /* -ENOMEM */
8814 		wc->flags[level] |= flag;
8815 	}
8816 
8817 	/*
8818 	 * the block is shared by multiple trees, so it's not good to
8819 	 * keep the tree lock
8820 	 */
8821 	if (path->locks[level] && level > 0) {
8822 		btrfs_tree_unlock_rw(eb, path->locks[level]);
8823 		path->locks[level] = 0;
8824 	}
8825 	return 0;
8826 }
8827 
8828 /*
8829  * helper to process tree block pointer.
8830  *
8831  * when wc->stage == DROP_REFERENCE, this function checks the
8832  * reference count of the block pointed to. If the block
8833  * is shared and we need to update back refs for the subtree
8834  * rooted at the block, this function changes wc->stage to
8835  * UPDATE_BACKREF. If the block is shared and there is no
8836  * need to update back refs, this function drops the reference
8837  * to the block.
8838  *
8839  * NOTE: return value 1 means we should stop walking down.
8840  */
8841 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8842 				 struct btrfs_root *root,
8843 				 struct btrfs_path *path,
8844 				 struct walk_control *wc, int *lookup_info)
8845 {
8846 	u64 bytenr;
8847 	u64 generation;
8848 	u64 parent;
8849 	u32 blocksize;
8850 	struct btrfs_key key;
8851 	struct extent_buffer *next;
8852 	int level = wc->level;
8853 	int reada = 0;
8854 	int ret = 0;
8855 	bool need_account = false;
8856 
8857 	generation = btrfs_node_ptr_generation(path->nodes[level],
8858 					       path->slots[level]);
8859 	/*
8860 	 * if the lower level block was created before the snapshot
8861 	 * was created, we know there is no need to update back refs
8862 	 * for the subtree
8863 	 */
8864 	if (wc->stage == UPDATE_BACKREF &&
8865 	    generation <= root->root_key.offset) {
8866 		*lookup_info = 1;
8867 		return 1;
8868 	}
8869 
8870 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8871 	blocksize = root->nodesize;
8872 
8873 	next = btrfs_find_tree_block(root->fs_info, bytenr);
8874 	if (!next) {
8875 		next = btrfs_find_create_tree_block(root, bytenr);
8876 		if (IS_ERR(next))
8877 			return PTR_ERR(next);
8878 
8879 		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8880 					       level - 1);
8881 		reada = 1;
8882 	}
8883 	btrfs_tree_lock(next);
8884 	btrfs_set_lock_blocking(next);
8885 
8886 	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8887 				       &wc->refs[level - 1],
8888 				       &wc->flags[level - 1]);
8889 	if (ret < 0)
8890 		goto out_unlock;
8891 
8892 	if (unlikely(wc->refs[level - 1] == 0)) {
8893 		btrfs_err(root->fs_info, "Missing references.");
8894 		ret = -EIO;
8895 		goto out_unlock;
8896 	}
8897 	*lookup_info = 0;
8898 
8899 	if (wc->stage == DROP_REFERENCE) {
8900 		if (wc->refs[level - 1] > 1) {
8901 			need_account = true;
8902 			if (level == 1 &&
8903 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8904 				goto skip;
8905 
8906 			if (!wc->update_ref ||
8907 			    generation <= root->root_key.offset)
8908 				goto skip;
8909 
8910 			btrfs_node_key_to_cpu(path->nodes[level], &key,
8911 					      path->slots[level]);
8912 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8913 			if (ret < 0)
8914 				goto skip;
8915 
8916 			wc->stage = UPDATE_BACKREF;
8917 			wc->shared_level = level - 1;
8918 		}
8919 	} else {
8920 		if (level == 1 &&
8921 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8922 			goto skip;
8923 	}
8924 
8925 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
8926 		btrfs_tree_unlock(next);
8927 		free_extent_buffer(next);
8928 		next = NULL;
8929 		*lookup_info = 1;
8930 	}
8931 
8932 	if (!next) {
8933 		if (reada && level == 1)
8934 			reada_walk_down(trans, root, wc, path);
8935 		next = read_tree_block(root, bytenr, generation);
8936 		if (IS_ERR(next)) {
8937 			return PTR_ERR(next);
8938 		} else if (!extent_buffer_uptodate(next)) {
8939 			free_extent_buffer(next);
8940 			return -EIO;
8941 		}
8942 		btrfs_tree_lock(next);
8943 		btrfs_set_lock_blocking(next);
8944 	}
8945 
8946 	level--;
8947 	ASSERT(level == btrfs_header_level(next));
8948 	if (level != btrfs_header_level(next)) {
8949 		btrfs_err(root->fs_info, "mismatched level");
8950 		ret = -EIO;
8951 		goto out_unlock;
8952 	}
8953 	path->nodes[level] = next;
8954 	path->slots[level] = 0;
8955 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8956 	wc->level = level;
8957 	if (wc->level == 1)
8958 		wc->reada_slot = 0;
8959 	return 0;
8960 skip:
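	/*
	 * We are not descending into 'next'.  Under DROP_REFERENCE we still
	 * account the subtree for qgroups if needed and drop our reference.
	 */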
8961 	wc->refs[level - 1] = 0;
8962 	wc->flags[level - 1] = 0;
8963 	if (wc->stage == DROP_REFERENCE) {
8964 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8965 			parent = path->nodes[level]->start;
8966 		} else {
8967 			ASSERT(root->root_key.objectid ==
8968 			       btrfs_header_owner(path->nodes[level]));
8969 			if (root->root_key.objectid !=
8970 			    btrfs_header_owner(path->nodes[level])) {
8971 				btrfs_err(root->fs_info,
8972 						"mismatched block owner");
8973 				ret = -EIO;
8974 				goto out_unlock;
8975 			}
8976 			parent = 0;
8977 		}
8978 
8979 		if (need_account) {
8980 			ret = account_shared_subtree(trans, root, next,
8981 						     generation, level - 1);
8982 			if (ret) {
8983 				btrfs_err_rl(root->fs_info,
8984 					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8985 					     ret);
8986 			}
8987 		}
8988 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8989 				root->root_key.objectid, level - 1, 0);
8990 		if (ret)
8991 			goto out_unlock;
8992 	}
8993 
8994 	*lookup_info = 1;
8995 	ret = 1;
8996 
8997 out_unlock:
8998 	btrfs_tree_unlock(next);
8999 	free_extent_buffer(next);
9000 
9001 	return ret;
9002 }
9003 
9004 /*
9005  * helper to process tree block while walking up the tree.
9006  *
9007  * when wc->stage == DROP_REFERENCE, this function drops
9008  * reference count on the block.
9009  *
9010  * when wc->stage == UPDATE_BACKREF, this function changes
9011  * wc->stage back to DROP_REFERENCE if we changed wc->stage
9012  * to UPDATE_BACKREF previously while processing the block.
9013  *
9014  * NOTE: return value 1 means we should stop walking up.
9015  */
9016 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9017 				 struct btrfs_root *root,
9018 				 struct btrfs_path *path,
9019 				 struct walk_control *wc)
9020 {
9021 	int ret;
9022 	int level = wc->level;
9023 	struct extent_buffer *eb = path->nodes[level];
9024 	u64 parent = 0;
9025 
9026 	if (wc->stage == UPDATE_BACKREF) {
9027 		BUG_ON(wc->shared_level < level);
9028 		if (level < wc->shared_level)
9029 			goto out;
9030 
9031 		ret = find_next_key(path, level + 1, &wc->update_progress);
9032 		if (ret > 0)
9033 			wc->update_ref = 0;
9034 
9035 		wc->stage = DROP_REFERENCE;
9036 		wc->shared_level = -1;
9037 		path->slots[level] = 0;
9038 
9039 		/*
9040 		 * check reference count again if the block isn't locked.
9041 		 * we should start walking down the tree again if reference
9042 		 * count is one.
9043 		 */
9044 		if (!path->locks[level]) {
9045 			BUG_ON(level == 0);
9046 			btrfs_tree_lock(eb);
9047 			btrfs_set_lock_blocking(eb);
9048 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9049 
9050 			ret = btrfs_lookup_extent_info(trans, root,
9051 						       eb->start, level, 1,
9052 						       &wc->refs[level],
9053 						       &wc->flags[level]);
9054 			if (ret < 0) {
9055 				btrfs_tree_unlock_rw(eb, path->locks[level]);
9056 				path->locks[level] = 0;
9057 				return ret;
9058 			}
9059 			BUG_ON(wc->refs[level] == 0);
9060 			if (wc->refs[level] == 1) {
9061 				btrfs_tree_unlock_rw(eb, path->locks[level]);
9062 				path->locks[level] = 0;
9063 				return 1;
9064 			}
9065 		}
9066 	}
9067 
9068 	/* wc->stage == DROP_REFERENCE */
9069 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9070 
9071 	if (wc->refs[level] == 1) {
9072 		if (level == 0) {
9073 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9074 				ret = btrfs_dec_ref(trans, root, eb, 1);
9075 			else
9076 				ret = btrfs_dec_ref(trans, root, eb, 0);
9077 			BUG_ON(ret); /* -ENOMEM */
9078 			ret = account_leaf_items(trans, root, eb);
9079 			if (ret) {
9080 				btrfs_err_rl(root->fs_info,
9081 					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
9082 					     ret);
9083 			}
9084 		}
9085 		/* make block locked assertion in clean_tree_block happy */
9086 		if (!path->locks[level] &&
9087 		    btrfs_header_generation(eb) == trans->transid) {
9088 			btrfs_tree_lock(eb);
9089 			btrfs_set_lock_blocking(eb);
9090 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9091 		}
9092 		clean_tree_block(trans, root->fs_info, eb);
9093 	}
9094 
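	/*
	 * Work out the 'parent' passed to btrfs_free_tree_block(): non-zero
	 * only when this block is referenced via a full backref.
	 */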
9095 	if (eb == root->node) {
9096 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9097 			parent = eb->start;
9098 		else
9099 			BUG_ON(root->root_key.objectid !=
9100 			       btrfs_header_owner(eb));
9101 	} else {
9102 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9103 			parent = path->nodes[level + 1]->start;
9104 		else
9105 			BUG_ON(root->root_key.objectid !=
9106 			       btrfs_header_owner(path->nodes[level + 1]));
9107 	}
9108 
9109 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9110 out:
9111 	wc->refs[level] = 0;
9112 	wc->flags[level] = 0;
9113 	return 0;
9114 }
9115 
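/*
 * walk down from wc->level towards the leaves, processing each block with
 * walk_down_proc() and descending into children via do_walk_down() until
 * we reach a leaf or a block we should not descend into.
 */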
9116 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9117 				   struct btrfs_root *root,
9118 				   struct btrfs_path *path,
9119 				   struct walk_control *wc)
9120 {
9121 	int level = wc->level;
9122 	int lookup_info = 1;
9123 	int ret;
9124 
9125 	while (level >= 0) {
9126 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
9127 		if (ret > 0)
9128 			break;
9129 
9130 		if (level == 0)
9131 			break;
9132 
9133 		if (path->slots[level] >=
9134 		    btrfs_header_nritems(path->nodes[level]))
9135 			break;
9136 
9137 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
9138 		if (ret > 0) {
9139 			path->slots[level]++;
9140 			continue;
9141 		} else if (ret < 0)
9142 			return ret;
9143 		level = wc->level;
9144 	}
9145 	return 0;
9146 }
9147 
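/*
 * walk back up from wc->level, dropping fully processed blocks via
 * walk_up_proc().  Returns 0 when the caller should walk back down again,
 * 1 once everything below max_level has been processed.
 */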
9148 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9149 				 struct btrfs_root *root,
9150 				 struct btrfs_path *path,
9151 				 struct walk_control *wc, int max_level)
9152 {
9153 	int level = wc->level;
9154 	int ret;
9155 
9156 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9157 	while (level < max_level && path->nodes[level]) {
9158 		wc->level = level;
9159 		if (path->slots[level] + 1 <
9160 		    btrfs_header_nritems(path->nodes[level])) {
9161 			path->slots[level]++;
9162 			return 0;
9163 		} else {
9164 			ret = walk_up_proc(trans, root, path, wc);
9165 			if (ret > 0)
9166 				return 0;
9167 
9168 			if (path->locks[level]) {
9169 				btrfs_tree_unlock_rw(path->nodes[level],
9170 						     path->locks[level]);
9171 				path->locks[level] = 0;
9172 			}
9173 			free_extent_buffer(path->nodes[level]);
9174 			path->nodes[level] = NULL;
9175 			level++;
9176 		}
9177 	}
9178 	return 1;
9179 }
9180 
9181 /*
9182  * drop a subvolume tree.
9183  *
9184  * this function traverses the tree freeing any blocks that are only
9185  * referenced by the tree.
9186  *
9187  * when a shared tree block is found, this function decreases its
9188  * reference count by one. if update_ref is true, this function
9189  * also makes sure backrefs for the shared block and all lower level
9190  * blocks are properly updated.
9191  *
9192  * If called with for_reloc == 0, may exit early with -EAGAIN
9193  */
9194 int btrfs_drop_snapshot(struct btrfs_root *root,
9195 			 struct btrfs_block_rsv *block_rsv, int update_ref,
9196 			 int for_reloc)
9197 {
9198 	struct btrfs_fs_info *fs_info = root->fs_info;
9199 	struct btrfs_path *path;
9200 	struct btrfs_trans_handle *trans;
9201 	struct btrfs_root *tree_root = fs_info->tree_root;
9202 	struct btrfs_root_item *root_item = &root->root_item;
9203 	struct walk_control *wc;
9204 	struct btrfs_key key;
9205 	int err = 0;
9206 	int ret;
9207 	int level;
9208 	bool root_dropped = false;
9209 
9210 	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
9211 
9212 	path = btrfs_alloc_path();
9213 	if (!path) {
9214 		err = -ENOMEM;
9215 		goto out;
9216 	}
9217 
9218 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9219 	if (!wc) {
9220 		btrfs_free_path(path);
9221 		err = -ENOMEM;
9222 		goto out;
9223 	}
9224 
9225 	trans = btrfs_start_transaction(tree_root, 0);
9226 	if (IS_ERR(trans)) {
9227 		err = PTR_ERR(trans);
9228 		goto out_free;
9229 	}
9230 
9231 	if (block_rsv)
9232 		trans->block_rsv = block_rsv;
9233 
9234 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9235 		level = btrfs_header_level(root->node);
9236 		path->nodes[level] = btrfs_lock_root_node(root);
9237 		btrfs_set_lock_blocking(path->nodes[level]);
9238 		path->slots[level] = 0;
9239 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9240 		memset(&wc->update_progress, 0,
9241 		       sizeof(wc->update_progress));
9242 	} else {
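		/*
		 * drop_progress is set: resume a previously interrupted drop
		 * by searching back down to the recorded key.
		 */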
9243 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9244 		memcpy(&wc->update_progress, &key,
9245 		       sizeof(wc->update_progress));
9246 
9247 		level = root_item->drop_level;
9248 		BUG_ON(level == 0);
9249 		path->lowest_level = level;
9250 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9251 		path->lowest_level = 0;
9252 		if (ret < 0) {
9253 			err = ret;
9254 			goto out_end_trans;
9255 		}
9256 		WARN_ON(ret > 0);
9257 
9258 		/*
9259 		 * unlock our path, this is safe because only this
9260 		 * function is allowed to delete this snapshot
9261 		 */
9262 		btrfs_unlock_up_safe(path, 0);
9263 
9264 		level = btrfs_header_level(root->node);
9265 		while (1) {
9266 			btrfs_tree_lock(path->nodes[level]);
9267 			btrfs_set_lock_blocking(path->nodes[level]);
9268 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9269 
9270 			ret = btrfs_lookup_extent_info(trans, root,
9271 						path->nodes[level]->start,
9272 						level, 1, &wc->refs[level],
9273 						&wc->flags[level]);
9274 			if (ret < 0) {
9275 				err = ret;
9276 				goto out_end_trans;
9277 			}
9278 			BUG_ON(wc->refs[level] == 0);
9279 
9280 			if (level == root_item->drop_level)
9281 				break;
9282 
9283 			btrfs_tree_unlock(path->nodes[level]);
9284 			path->locks[level] = 0;
9285 			WARN_ON(wc->refs[level] != 1);
9286 			level--;
9287 		}
9288 	}
9289 
9290 	wc->level = level;
9291 	wc->shared_level = -1;
9292 	wc->stage = DROP_REFERENCE;
9293 	wc->update_ref = update_ref;
9294 	wc->keep_locks = 0;
9295 	wc->for_reloc = for_reloc;
9296 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9297 
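	/*
	 * alternate between walking down and walking back up until
	 * walk_up_tree() reports the whole tree has been processed.
	 */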
9298 	while (1) {
9299 
9300 		ret = walk_down_tree(trans, root, path, wc);
9301 		if (ret < 0) {
9302 			err = ret;
9303 			break;
9304 		}
9305 
9306 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9307 		if (ret < 0) {
9308 			err = ret;
9309 			break;
9310 		}
9311 
9312 		if (ret > 0) {
9313 			BUG_ON(wc->stage != DROP_REFERENCE);
9314 			break;
9315 		}
9316 
9317 		if (wc->stage == DROP_REFERENCE) {
9318 			level = wc->level;
9319 			btrfs_node_key(path->nodes[level],
9320 				       &root_item->drop_progress,
9321 				       path->slots[level]);
9322 			root_item->drop_level = level;
9323 		}
9324 
9325 		BUG_ON(wc->level == 0);
9326 		if (btrfs_should_end_transaction(trans, tree_root) ||
9327 		    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
9328 			ret = btrfs_update_root(trans, tree_root,
9329 						&root->root_key,
9330 						root_item);
9331 			if (ret) {
9332 				btrfs_abort_transaction(trans, ret);
9333 				err = ret;
9334 				goto out_end_trans;
9335 			}
9336 
9337 			btrfs_end_transaction_throttle(trans, tree_root);
9338 			if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
9339 				btrfs_debug(fs_info,
9340 					    "drop snapshot early exit");
9341 				err = -EAGAIN;
9342 				goto out_free;
9343 			}
9344 
9345 			trans = btrfs_start_transaction(tree_root, 0);
9346 			if (IS_ERR(trans)) {
9347 				err = PTR_ERR(trans);
9348 				goto out_free;
9349 			}
9350 			if (block_rsv)
9351 				trans->block_rsv = block_rsv;
9352 		}
9353 	}
9354 	btrfs_release_path(path);
9355 	if (err)
9356 		goto out_end_trans;
9357 
9358 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
9359 	if (ret) {
9360 		btrfs_abort_transaction(trans, ret);
9361 		goto out_end_trans;
9362 	}
9363 
9364 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9365 		ret = btrfs_find_root(tree_root, &root->root_key, path,
9366 				      NULL, NULL);
9367 		if (ret < 0) {
9368 			btrfs_abort_transaction(trans, ret);
9369 			err = ret;
9370 			goto out_end_trans;
9371 		} else if (ret > 0) {
9372 			/*
9373 			 * If we fail to delete the orphan item this time around,
9374 			 * it'll get picked up the next time.  The most common
9375 			 * failure here is just -ENOENT.
9376 			 */
9377 			btrfs_del_orphan_item(trans, tree_root,
9378 					      root->root_key.objectid);
9379 		}
9380 	}
9381 
9382 	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9383 		btrfs_add_dropped_root(trans, root);
9384 	} else {
9385 		free_extent_buffer(root->node);
9386 		free_extent_buffer(root->commit_root);
9387 		btrfs_put_fs_root(root);
9388 	}
9389 	root_dropped = true;
9390 out_end_trans:
9391 	btrfs_end_transaction_throttle(trans, tree_root);
9392 out_free:
9393 	kfree(wc);
9394 	btrfs_free_path(path);
9395 out:
9396 	/*
9397 	 * So if we need to stop dropping the snapshot for whatever reason, we
9398 	 * need to make sure to add it back to the dead root list so that we
9399 	 * keep trying to do the work later.  This also cleans up roots if we
9400 	 * don't have them in the radix tree (like when we recover after a power
9401 	 * failure or unmount) so we don't leak memory.
9402 	 */
9403 	if (!for_reloc && !root_dropped)
9404 		btrfs_add_dead_root(root);
9405 	if (err && err != -EAGAIN)
9406 		btrfs_handle_fs_error(fs_info, err, NULL);
9407 	return err;
9408 }
9409 
9410 /*
9411  * drop subtree rooted at tree block 'node'.
9412  *
9413  * NOTE: this function will unlock and release tree block 'node'.
9414  * Only used by relocation code.
9415  */
9416 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9417 			struct btrfs_root *root,
9418 			struct extent_buffer *node,
9419 			struct extent_buffer *parent)
9420 {
9421 	struct btrfs_path *path;
9422 	struct walk_control *wc;
9423 	int level;
9424 	int parent_level;
9425 	int ret = 0;
9426 	int wret;
9427 
9428 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9429 
9430 	path = btrfs_alloc_path();
9431 	if (!path)
9432 		return -ENOMEM;
9433 
9434 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9435 	if (!wc) {
9436 		btrfs_free_path(path);
9437 		return -ENOMEM;
9438 	}
9439 
9440 	btrfs_assert_tree_locked(parent);
9441 	parent_level = btrfs_header_level(parent);
9442 	extent_buffer_get(parent);
9443 	path->nodes[parent_level] = parent;
9444 	path->slots[parent_level] = btrfs_header_nritems(parent);
9445 
9446 	btrfs_assert_tree_locked(node);
9447 	level = btrfs_header_level(node);
9448 	path->nodes[level] = node;
9449 	path->slots[level] = 0;
9450 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9451 
9452 	wc->refs[parent_level] = 1;
9453 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9454 	wc->level = level;
9455 	wc->shared_level = -1;
9456 	wc->stage = DROP_REFERENCE;
9457 	wc->update_ref = 0;
9458 	wc->keep_locks = 1;
9459 	wc->for_reloc = 1;
9460 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9461 
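	/*
	 * run the same down/up walk as btrfs_drop_snapshot(), but stop the
	 * walk up at parent_level.
	 */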
9462 	while (1) {
9463 		wret = walk_down_tree(trans, root, path, wc);
9464 		if (wret < 0) {
9465 			ret = wret;
9466 			break;
9467 		}
9468 
9469 		wret = walk_up_tree(trans, root, path, wc, parent_level);
9470 		if (wret < 0)
9471 			ret = wret;
9472 		if (wret != 0)
9473 			break;
9474 	}
9475 
9476 	kfree(wc);
9477 	btrfs_free_path(path);
9478 	return ret;
9479 }
9480 
9481 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
9482 {
9483 	u64 num_devices;
9484 	u64 stripped;
9485 
9486 	/*
9487 	 * if restripe for this chunk_type is on, pick the target profile and
9488 	 * return, otherwise do the usual balance
9489 	 */
9490 	stripped = get_restripe_target(root->fs_info, flags);
9491 	if (stripped)
9492 		return extended_to_chunk(stripped);
9493 
9494 	num_devices = root->fs_info->fs_devices->rw_devices;
9495 
9496 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
9497 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9498 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9499 
9500 	if (num_devices == 1) {
9501 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9502 		stripped = flags & ~stripped;
9503 
9504 		/* turn raid0 into single device chunks */
9505 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
9506 			return stripped;
9507 
9508 		/* turn mirroring into duplication */
9509 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9510 			     BTRFS_BLOCK_GROUP_RAID10))
9511 			return stripped | BTRFS_BLOCK_GROUP_DUP;
9512 	} else {
9513 		/* they already had raid on here, just return */
9514 		if (flags & stripped)
9515 			return flags;
9516 
9517 		stripped |= BTRFS_BLOCK_GROUP_DUP;
9518 		stripped = flags & ~stripped;
9519 
9520 		/* switch duplicated blocks with raid1 */
9521 		if (flags & BTRFS_BLOCK_GROUP_DUP)
9522 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
9523 
9524 		/* this is drive concat, leave it alone */
9525 	}
9526 
9527 	return flags;
9528 }
9529 
9530 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9531 {
9532 	struct btrfs_space_info *sinfo = cache->space_info;
9533 	u64 num_bytes;
9534 	u64 min_allocable_bytes;
9535 	int ret = -ENOSPC;
9536 
9537 	/*
9538 	 * We need some metadata space and system metadata space for
9539 	 * allocating chunks in some corner cases, so unless the caller forces
9540 	 * it, keep some allocatable space before setting the group readonly.
9541 	 */
9542 	if ((sinfo->flags &
9543 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9544 	    !force)
9545 		min_allocable_bytes = SZ_1M;
9546 	else
9547 		min_allocable_bytes = 0;
9548 
9549 	spin_lock(&sinfo->lock);
9550 	spin_lock(&cache->lock);
9551 
9552 	if (cache->ro) {
9553 		cache->ro++;
9554 		ret = 0;
9555 		goto out;
9556 	}
9557 
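	/* num_bytes is the currently unused space in this block group */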
9558 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9559 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
9560 
9561 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
9562 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
9563 	    min_allocable_bytes <= sinfo->total_bytes) {
9564 		sinfo->bytes_readonly += num_bytes;
9565 		cache->ro++;
9566 		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9567 		ret = 0;
9568 	}
9569 out:
9570 	spin_unlock(&cache->lock);
9571 	spin_unlock(&sinfo->lock);
9572 	return ret;
9573 }
9574 
9575 int btrfs_inc_block_group_ro(struct btrfs_root *root,
9576 			     struct btrfs_block_group_cache *cache)
9577 
9578 {
9579 	struct btrfs_trans_handle *trans;
9580 	u64 alloc_flags;
9581 	int ret;
9582 
9583 again:
9584 	trans = btrfs_join_transaction(root);
9585 	if (IS_ERR(trans))
9586 		return PTR_ERR(trans);
9587 
9588 	/*
9589 	 * we're not allowed to set block groups readonly after the dirty
9590 	 * block groups cache has started writing.  If it already started,
9591 	 * back off and let this transaction commit
9592 	 */
9593 	mutex_lock(&root->fs_info->ro_block_group_mutex);
9594 	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9595 		u64 transid = trans->transid;
9596 
9597 		mutex_unlock(&root->fs_info->ro_block_group_mutex);
9598 		btrfs_end_transaction(trans, root);
9599 
9600 		ret = btrfs_wait_for_commit(root, transid);
9601 		if (ret)
9602 			return ret;
9603 		goto again;
9604 	}
9605 
9606 	/*
9607 	 * if we are changing raid levels, try to allocate a corresponding
9608 	 * block group with the new raid level.
9609 	 */
9610 	alloc_flags = update_block_group_flags(root, cache->flags);
9611 	if (alloc_flags != cache->flags) {
9612 		ret = do_chunk_alloc(trans, root, alloc_flags,
9613 				     CHUNK_ALLOC_FORCE);
9614 		/*
9615 		 * ENOSPC is allowed here, we may have enough space
9616 		 * already allocated at the new raid level to
9617 		 * carry on
9618 		 */
9619 		if (ret == -ENOSPC)
9620 			ret = 0;
9621 		if (ret < 0)
9622 			goto out;
9623 	}
9624 
9625 	ret = inc_block_group_ro(cache, 0);
9626 	if (!ret)
9627 		goto out;
9628 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
9629 	ret = do_chunk_alloc(trans, root, alloc_flags,
9630 			     CHUNK_ALLOC_FORCE);
9631 	if (ret < 0)
9632 		goto out;
9633 	ret = inc_block_group_ro(cache, 0);
9634 out:
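	/*
	 * For system block groups, make sure the system space_info still has
	 * room for a chunk allocation (may allocate a new system chunk).
	 */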
9635 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9636 		alloc_flags = update_block_group_flags(root, cache->flags);
9637 		lock_chunks(root->fs_info->chunk_root);
9638 		check_system_chunk(trans, root, alloc_flags);
9639 		unlock_chunks(root->fs_info->chunk_root);
9640 	}
9641 	mutex_unlock(&root->fs_info->ro_block_group_mutex);
9642 
9643 	btrfs_end_transaction(trans, root);
9644 	return ret;
9645 }
9646 
9647 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9648 			    struct btrfs_root *root, u64 type)
9649 {
9650 	u64 alloc_flags = get_alloc_profile(root, type);
9651 	return do_chunk_alloc(trans, root, alloc_flags,
9652 			      CHUNK_ALLOC_FORCE);
9653 }
9654 
9655 /*
9656  * helper to account the unused space of all the readonly block groups in the
9657  * space_info. takes mirrors into account.
9658  */
9659 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9660 {
9661 	struct btrfs_block_group_cache *block_group;
9662 	u64 free_bytes = 0;
9663 	int factor;
9664 
9665 	/* It's df, we don't care if it's racy */
9666 	if (list_empty(&sinfo->ro_bgs))
9667 		return 0;
9668 
9669 	spin_lock(&sinfo->lock);
9670 	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9671 		spin_lock(&block_group->lock);
9672 
9673 		if (!block_group->ro) {
9674 			spin_unlock(&block_group->lock);
9675 			continue;
9676 		}
9677 
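		/* mirrored profiles store two copies, so free space counts double */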
9678 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9679 					  BTRFS_BLOCK_GROUP_RAID10 |
9680 					  BTRFS_BLOCK_GROUP_DUP))
9681 			factor = 2;
9682 		else
9683 			factor = 1;
9684 
9685 		free_bytes += (block_group->key.offset -
9686 			       btrfs_block_group_used(&block_group->item)) *
9687 			       factor;
9688 
9689 		spin_unlock(&block_group->lock);
9690 	}
9691 	spin_unlock(&sinfo->lock);
9692 
9693 	return free_bytes;
9694 }
9695 
9696 void btrfs_dec_block_group_ro(struct btrfs_root *root,
9697 			      struct btrfs_block_group_cache *cache)
9698 {
9699 	struct btrfs_space_info *sinfo = cache->space_info;
9700 	u64 num_bytes;
9701 
9702 	BUG_ON(!cache->ro);
9703 
9704 	spin_lock(&sinfo->lock);
9705 	spin_lock(&cache->lock);
9706 	if (!--cache->ro) {
9707 		num_bytes = cache->key.offset - cache->reserved -
9708 			    cache->pinned - cache->bytes_super -
9709 			    btrfs_block_group_used(&cache->item);
9710 		sinfo->bytes_readonly -= num_bytes;
9711 		list_del_init(&cache->ro_list);
9712 	}
9713 	spin_unlock(&cache->lock);
9714 	spin_unlock(&sinfo->lock);
9715 }
9716 
9717 /*
9718  * checks to see if it's even possible to relocate this block group.
9719  *
9720  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
9721  * ok to go ahead and try.
9722  */
9723 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
9724 {
9725 	struct btrfs_block_group_cache *block_group;
9726 	struct btrfs_space_info *space_info;
9727 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
9728 	struct btrfs_device *device;
9729 	struct btrfs_trans_handle *trans;
9730 	u64 min_free;
9731 	u64 dev_min = 1;
9732 	u64 dev_nr = 0;
9733 	u64 target;
9734 	int debug;
9735 	int index;
9736 	int full = 0;
9737 	int ret = 0;
9738 
9739 	debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
9740 
9741 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9742 
9743 	/* odd, couldn't find the block group, leave it alone */
9744 	if (!block_group) {
9745 		if (debug)
9746 			btrfs_warn(root->fs_info,
9747 				   "can't find block group for bytenr %llu",
9748 				   bytenr);
9749 		return -1;
9750 	}
9751 
9752 	min_free = btrfs_block_group_used(&block_group->item);
9753 
9754 	/* no bytes used, we're good */
9755 	if (!min_free)
9756 		goto out;
9757 
9758 	space_info = block_group->space_info;
9759 	spin_lock(&space_info->lock);
9760 
9761 	full = space_info->full;
9762 
9763 	/*
9764 	 * if this is the last block group we have in this space, we can't
9765 	 * relocate it unless we're able to allocate a new chunk below.
9766 	 *
9767 	 * Otherwise, we need to make sure we have room in the space to handle
9768 	 * all of the extents from this block group.  If we can, we're good
9769 	 */
9770 	if ((space_info->total_bytes != block_group->key.offset) &&
9771 	    (space_info->bytes_used + space_info->bytes_reserved +
9772 	     space_info->bytes_pinned + space_info->bytes_readonly +
9773 	     min_free < space_info->total_bytes)) {
9774 		spin_unlock(&space_info->lock);
9775 		goto out;
9776 	}
9777 	spin_unlock(&space_info->lock);
9778 
9779 	/*
9780 	 * ok we don't have enough space, but maybe we have free space on our
9781 	 * devices to allocate new chunks for relocation, so loop through our
9782 	 * alloc devices and guess if we have enough space.  if this block
9783 	 * group is going to be restriped, run checks against the target
9784 	 * profile instead of the current one.
9785 	 */
9786 	ret = -1;
9787 
9788 	/*
9789 	 * index:
9790 	 *      0: raid10
9791 	 *      1: raid1
9792 	 *      2: dup
9793 	 *      3: raid0
9794 	 *      4: single
9795 	 */
9796 	target = get_restripe_target(root->fs_info, block_group->flags);
9797 	if (target) {
9798 		index = __get_raid_index(extended_to_chunk(target));
9799 	} else {
9800 		/*
9801 		 * this is just a balance, so if we were marked as full
9802 		 * we know there is no space for a new chunk
9803 		 */
9804 		if (full) {
9805 			if (debug)
9806 				btrfs_warn(root->fs_info,
9807 					"no space to alloc new chunk for block group %llu",
9808 					block_group->key.objectid);
9809 			goto out;
9810 		}
9811 
9812 		index = get_block_group_index(block_group);
9813 	}
9814 
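	/*
	 * scale the free space we must find, and the number of devices it
	 * must be found on, according to the profile of the new chunk.
	 */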
9815 	if (index == BTRFS_RAID_RAID10) {
9816 		dev_min = 4;
9817 		/* Divide by 2 */
9818 		min_free >>= 1;
9819 	} else if (index == BTRFS_RAID_RAID1) {
9820 		dev_min = 2;
9821 	} else if (index == BTRFS_RAID_DUP) {
9822 		/* Multiply by 2 */
9823 		min_free <<= 1;
9824 	} else if (index == BTRFS_RAID_RAID0) {
9825 		dev_min = fs_devices->rw_devices;
9826 		min_free = div64_u64(min_free, dev_min);
9827 	}
9828 
9829 	/* We need to do this so that we can look at pending chunks */
9830 	trans = btrfs_join_transaction(root);
9831 	if (IS_ERR(trans)) {
9832 		ret = PTR_ERR(trans);
9833 		goto out;
9834 	}
9835 
9836 	mutex_lock(&root->fs_info->chunk_mutex);
9837 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9838 		u64 dev_offset;
9839 
9840 		/*
9841 		 * check to make sure we can actually find a chunk with enough
9842 		 * space to fit our block group in.
9843 		 */
9844 		if (device->total_bytes > device->bytes_used + min_free &&
9845 		    !device->is_tgtdev_for_dev_replace) {
9846 			ret = find_free_dev_extent(trans, device, min_free,
9847 						   &dev_offset, NULL);
9848 			if (!ret)
9849 				dev_nr++;
9850 
9851 			if (dev_nr >= dev_min)
9852 				break;
9853 
9854 			ret = -1;
9855 		}
9856 	}
9857 	if (debug && ret == -1)
9858 		btrfs_warn(root->fs_info,
9859 			"no space to allocate a new chunk for block group %llu",
9860 			block_group->key.objectid);
9861 	mutex_unlock(&root->fs_info->chunk_mutex);
9862 	btrfs_end_transaction(trans, root);
9863 out:
9864 	btrfs_put_block_group(block_group);
9865 	return ret;
9866 }
9867 
9868 static int find_first_block_group(struct btrfs_root *root,
9869 		struct btrfs_path *path, struct btrfs_key *key)
9870 {
9871 	int ret = 0;
9872 	struct btrfs_key found_key;
9873 	struct extent_buffer *leaf;
9874 	int slot;
9875 
9876 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9877 	if (ret < 0)
9878 		goto out;
9879 
9880 	while (1) {
9881 		slot = path->slots[0];
9882 		leaf = path->nodes[0];
9883 		if (slot >= btrfs_header_nritems(leaf)) {
9884 			ret = btrfs_next_leaf(root, path);
9885 			if (ret == 0)
9886 				continue;
9887 			if (ret < 0)
9888 				goto out;
9889 			break;
9890 		}
9891 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
9892 
9893 		if (found_key.objectid >= key->objectid &&
9894 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9895 			struct extent_map_tree *em_tree;
9896 			struct extent_map *em;
9897 
9898 			em_tree = &root->fs_info->mapping_tree.map_tree;
9899 			read_lock(&em_tree->lock);
9900 			em = lookup_extent_mapping(em_tree, found_key.objectid,
9901 						   found_key.offset);
9902 			read_unlock(&em_tree->lock);
9903 			if (!em) {
9904 				btrfs_err(root->fs_info,
9905 			"logical %llu len %llu found bg but no related chunk",
9906 					  found_key.objectid, found_key.offset);
9907 				ret = -ENOENT;
9908 			} else {
9909 				ret = 0;
9910 			}
9911 			free_extent_map(em);
9912 			goto out;
9913 		}
9914 		path->slots[0]++;
9915 	}
9916 out:
9917 	return ret;
9918 }
9919 
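/*
 * Iterate all block groups and drop the free space cache inode reference
 * (->iref) each one may still hold.
 */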
9920 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9921 {
9922 	struct btrfs_block_group_cache *block_group;
9923 	u64 last = 0;
9924 
9925 	while (1) {
9926 		struct inode *inode;
9927 
9928 		block_group = btrfs_lookup_first_block_group(info, last);
9929 		while (block_group) {
9930 			spin_lock(&block_group->lock);
9931 			if (block_group->iref)
9932 				break;
9933 			spin_unlock(&block_group->lock);
9934 			block_group = next_block_group(info->tree_root,
9935 						       block_group);
9936 		}
9937 		if (!block_group) {
9938 			if (last == 0)
9939 				break;
9940 			last = 0;
9941 			continue;
9942 		}
9943 
9944 		inode = block_group->inode;
9945 		block_group->iref = 0;
9946 		block_group->inode = NULL;
9947 		spin_unlock(&block_group->lock);
9948 		ASSERT(block_group->io_ctl.inode == NULL);
9949 		iput(inode);
9950 		last = block_group->key.objectid + block_group->key.offset;
9951 		btrfs_put_block_group(block_group);
9952 	}
9953 }
9954 
9955 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9956 {
9957 	struct btrfs_block_group_cache *block_group;
9958 	struct btrfs_space_info *space_info;
9959 	struct btrfs_caching_control *caching_ctl;
9960 	struct rb_node *n;
9961 
9962 	down_write(&info->commit_root_sem);
9963 	while (!list_empty(&info->caching_block_groups)) {
9964 		caching_ctl = list_entry(info->caching_block_groups.next,
9965 					 struct btrfs_caching_control, list);
9966 		list_del(&caching_ctl->list);
9967 		put_caching_control(caching_ctl);
9968 	}
9969 	up_write(&info->commit_root_sem);
9970 
9971 	spin_lock(&info->unused_bgs_lock);
9972 	while (!list_empty(&info->unused_bgs)) {
9973 		block_group = list_first_entry(&info->unused_bgs,
9974 					       struct btrfs_block_group_cache,
9975 					       bg_list);
9976 		list_del_init(&block_group->bg_list);
9977 		btrfs_put_block_group(block_group);
9978 	}
9979 	spin_unlock(&info->unused_bgs_lock);
9980 
9981 	spin_lock(&info->block_group_cache_lock);
9982 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9983 		block_group = rb_entry(n, struct btrfs_block_group_cache,
9984 				       cache_node);
9985 		rb_erase(&block_group->cache_node,
9986 			 &info->block_group_cache_tree);
9987 		RB_CLEAR_NODE(&block_group->cache_node);
9988 		spin_unlock(&info->block_group_cache_lock);
9989 
9990 		down_write(&block_group->space_info->groups_sem);
9991 		list_del(&block_group->list);
9992 		up_write(&block_group->space_info->groups_sem);
9993 
9994 		if (block_group->cached == BTRFS_CACHE_STARTED)
9995 			wait_block_group_cache_done(block_group);
9996 
9997 		/*
9998 		 * We haven't cached this block group, which means we could
9999 		 * possibly have excluded extents on this block group.
10000 		 */
10001 		if (block_group->cached == BTRFS_CACHE_NO ||
10002 		    block_group->cached == BTRFS_CACHE_ERROR)
10003 			free_excluded_extents(info->extent_root, block_group);
10004 
10005 		btrfs_remove_free_space_cache(block_group);
10006 		ASSERT(list_empty(&block_group->dirty_list));
10007 		ASSERT(list_empty(&block_group->io_list));
10008 		ASSERT(list_empty(&block_group->bg_list));
10009 		ASSERT(atomic_read(&block_group->count) == 1);
10010 		btrfs_put_block_group(block_group);
10011 
10012 		spin_lock(&info->block_group_cache_lock);
10013 	}
10014 	spin_unlock(&info->block_group_cache_lock);
10015 
10016 	/*
10017 	 * Now that all the block groups are freed, go through and free all
10018 	 * the space_info structs.  This is only called during the final
10019 	 * stages of unmount, and so we know nobody is using them.  We call
10020 	 * synchronize_rcu() once before we start, just to be on the safe side.
10021 	 */
10022 	synchronize_rcu();
10023 
10024 	release_global_block_rsv(info);
10025 
10026 	while (!list_empty(&info->space_info)) {
10027 		int i;
10028 
10029 		space_info = list_entry(info->space_info.next,
10030 					struct btrfs_space_info,
10031 					list);
10032 
10033 		/*
10034 		 * Do not hide this behind enospc_debug, this is actually
10035 		 * important and indicates a real bug if this happens.
10036 		 */
10037 		if (WARN_ON(space_info->bytes_pinned > 0 ||
10038 			    space_info->bytes_reserved > 0 ||
10039 			    space_info->bytes_may_use > 0))
10040 			dump_space_info(info, space_info, 0, 0);
10041 		list_del(&space_info->list);
10042 		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10043 			struct kobject *kobj;
10044 			kobj = space_info->block_group_kobjs[i];
10045 			space_info->block_group_kobjs[i] = NULL;
10046 			if (kobj) {
10047 				kobject_del(kobj);
10048 				kobject_put(kobj);
10049 			}
10050 		}
10051 		kobject_del(&space_info->kobj);
10052 		kobject_put(&space_info->kobj);
10053 	}
10054 	return 0;
10055 }
10056 
10057 static void __link_block_group(struct btrfs_space_info *space_info,
10058 			       struct btrfs_block_group_cache *cache)
10059 {
10060 	int index = get_block_group_index(cache);
10061 	bool first = false;
10062 
10063 	down_write(&space_info->groups_sem);
10064 	if (list_empty(&space_info->block_groups[index]))
10065 		first = true;
10066 	list_add_tail(&cache->list, &space_info->block_groups[index]);
10067 	up_write(&space_info->groups_sem);
10068 
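	/*
	 * the first block group of a given raid type in this space_info gets
	 * a sysfs kobject created for that raid type.
	 */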
10069 	if (first) {
10070 		struct raid_kobject *rkobj;
10071 		int ret;
10072 
10073 		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10074 		if (!rkobj)
10075 			goto out_err;
10076 		rkobj->raid_type = index;
10077 		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10078 		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10079 				  "%s", get_raid_name(index));
10080 		if (ret) {
10081 			kobject_put(&rkobj->kobj);
10082 			goto out_err;
10083 		}
10084 		space_info->block_group_kobjs[index] = &rkobj->kobj;
10085 	}
10086 
10087 	return;
10088 out_err:
10089 	btrfs_warn(cache->fs_info,
10090 		   "failed to add kobject for block cache, ignoring");
10091 }
10092 
10093 static struct btrfs_block_group_cache *
10094 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
10095 {
10096 	struct btrfs_block_group_cache *cache;
10097 
10098 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
10099 	if (!cache)
10100 		return NULL;
10101 
10102 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10103 					GFP_NOFS);
10104 	if (!cache->free_space_ctl) {
10105 		kfree(cache);
10106 		return NULL;
10107 	}
10108 
10109 	cache->key.objectid = start;
10110 	cache->key.offset = size;
10111 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10112 
10113 	cache->sectorsize = root->sectorsize;
10114 	cache->fs_info = root->fs_info;
10115 	cache->full_stripe_len = btrfs_full_stripe_len(root,
10116 					       &root->fs_info->mapping_tree,
10117 					       start);
10118 	set_free_space_tree_thresholds(cache);
10119 
10120 	atomic_set(&cache->count, 1);
10121 	spin_lock_init(&cache->lock);
10122 	init_rwsem(&cache->data_rwsem);
10123 	INIT_LIST_HEAD(&cache->list);
10124 	INIT_LIST_HEAD(&cache->cluster_list);
10125 	INIT_LIST_HEAD(&cache->bg_list);
10126 	INIT_LIST_HEAD(&cache->ro_list);
10127 	INIT_LIST_HEAD(&cache->dirty_list);
10128 	INIT_LIST_HEAD(&cache->io_list);
10129 	btrfs_init_free_space_ctl(cache);
10130 	atomic_set(&cache->trimming, 0);
10131 	mutex_init(&cache->free_space_lock);
10132 
10133 	return cache;
10134 }
10135 
10136 int btrfs_read_block_groups(struct btrfs_root *root)
10137 {
10138 	struct btrfs_path *path;
10139 	int ret;
10140 	struct btrfs_block_group_cache *cache;
10141 	struct btrfs_fs_info *info = root->fs_info;
10142 	struct btrfs_space_info *space_info;
10143 	struct btrfs_key key;
10144 	struct btrfs_key found_key;
10145 	struct extent_buffer *leaf;
10146 	int need_clear = 0;
10147 	u64 cache_gen;
10148 	u64 feature;
10149 	int mixed;
10150 
10151 	feature = btrfs_super_incompat_flags(info->super_copy);
10152 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10153 
10154 	root = info->extent_root;
10155 	key.objectid = 0;
10156 	key.offset = 0;
10157 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10158 	path = btrfs_alloc_path();
10159 	if (!path)
10160 		return -ENOMEM;
10161 	path->reada = READA_FORWARD;
10162 
10163 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
10164 	if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
10165 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
10166 		need_clear = 1;
10167 	if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
10168 		need_clear = 1;
10169 
10170 	while (1) {
10171 		ret = find_first_block_group(root, path, &key);
10172 		if (ret > 0)
10173 			break;
10174 		if (ret != 0)
10175 			goto error;
10176 
10177 		leaf = path->nodes[0];
10178 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10179 
10180 		cache = btrfs_create_block_group_cache(root, found_key.objectid,
10181 						       found_key.offset);
10182 		if (!cache) {
10183 			ret = -ENOMEM;
10184 			goto error;
10185 		}
10186 
10187 		if (need_clear) {
10188 			/*
10189 			 * When we mount with old space cache, we need to
10190 			 * set BTRFS_DC_CLEAR and set dirty flag.
10191 			 *
10192 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10193 			 *    truncate the old free space cache inode and
10194 			 *    set up a new one.
10195 			 * b) Setting 'dirty flag' makes sure that we flush
10196 			 *    the new space cache info onto disk.
10197 			 */
10198 			if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
10199 				cache->disk_cache_state = BTRFS_DC_CLEAR;
10200 		}
10201 
10202 		read_extent_buffer(leaf, &cache->item,
10203 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
10204 				   sizeof(cache->item));
10205 		cache->flags = btrfs_block_group_flags(&cache->item);
10206 		if (!mixed &&
10207 		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10208 		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10209 			btrfs_err(info,
10210 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10211 				  cache->key.objectid);
10212 			ret = -EINVAL;
10213 			goto error;
10214 		}
10215 
10216 		key.objectid = found_key.objectid + found_key.offset;
10217 		btrfs_release_path(path);
10218 
10219 		/*
10220 		 * We need to exclude the super stripes now so that the space
10221 		 * info has super bytes accounted for, otherwise we'll think
10222 		 * we have more space than we actually do.
10223 		 */
10224 		ret = exclude_super_stripes(root, cache);
10225 		if (ret) {
10226 			/*
10227 			 * We may have excluded something, so call this just in
10228 			 * case.
10229 			 */
10230 			free_excluded_extents(root, cache);
10231 			btrfs_put_block_group(cache);
10232 			goto error;
10233 		}
10234 
10235 		/*
10236 		 * check for two cases, either we are full, and therefore
10237 		 * don't need to bother with the caching work since we won't
10238 		 * find any space, or we are empty, and we can just add all
10239 		 * the space in and be done with it.  This saves us a lot of
10240 		 * time, particularly in the full case.
10241 		 */
10242 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10243 			cache->last_byte_to_unpin = (u64)-1;
10244 			cache->cached = BTRFS_CACHE_FINISHED;
10245 			free_excluded_extents(root, cache);
10246 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10247 			cache->last_byte_to_unpin = (u64)-1;
10248 			cache->cached = BTRFS_CACHE_FINISHED;
10249 			add_new_free_space(cache, root->fs_info,
10250 					   found_key.objectid,
10251 					   found_key.objectid +
10252 					   found_key.offset);
10253 			free_excluded_extents(root, cache);
10254 		}
10255 
10256 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
10257 		if (ret) {
10258 			btrfs_remove_free_space_cache(cache);
10259 			btrfs_put_block_group(cache);
10260 			goto error;
10261 		}
10262 
10263 		trace_btrfs_add_block_group(root->fs_info, cache, 0);
10264 		ret = update_space_info(info, cache->flags, found_key.offset,
10265 					btrfs_block_group_used(&cache->item),
10266 					cache->bytes_super, &space_info);
10267 		if (ret) {
10268 			btrfs_remove_free_space_cache(cache);
10269 			spin_lock(&info->block_group_cache_lock);
10270 			rb_erase(&cache->cache_node,
10271 				 &info->block_group_cache_tree);
10272 			RB_CLEAR_NODE(&cache->cache_node);
10273 			spin_unlock(&info->block_group_cache_lock);
10274 			btrfs_put_block_group(cache);
10275 			goto error;
10276 		}
10277 
10278 		cache->space_info = space_info;
10279 
10280 		__link_block_group(space_info, cache);
10281 
10282 		set_avail_alloc_bits(root->fs_info, cache->flags);
10283 		if (btrfs_chunk_readonly(root, cache->key.objectid)) {
10284 			inc_block_group_ro(cache, 1);
10285 		} else if (btrfs_block_group_used(&cache->item) == 0) {
10286 			spin_lock(&info->unused_bgs_lock);
10287 			/* Should always be true but just in case. */
10288 			if (list_empty(&cache->bg_list)) {
10289 				btrfs_get_block_group(cache);
10290 				list_add_tail(&cache->bg_list,
10291 					      &info->unused_bgs);
10292 			}
10293 			spin_unlock(&info->unused_bgs_lock);
10294 		}
10295 	}
10296 
10297 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
10298 		if (!(get_alloc_profile(root, space_info->flags) &
10299 		      (BTRFS_BLOCK_GROUP_RAID10 |
10300 		       BTRFS_BLOCK_GROUP_RAID1 |
10301 		       BTRFS_BLOCK_GROUP_RAID5 |
10302 		       BTRFS_BLOCK_GROUP_RAID6 |
10303 		       BTRFS_BLOCK_GROUP_DUP)))
10304 			continue;
10305 		/*
10306 		 * avoid allocating from un-mirrored block groups if there are
10307 		 * mirrored block groups.
10308 		 */
10309 		list_for_each_entry(cache,
10310 				&space_info->block_groups[BTRFS_RAID_RAID0],
10311 				list)
10312 			inc_block_group_ro(cache, 1);
10313 		list_for_each_entry(cache,
10314 				&space_info->block_groups[BTRFS_RAID_SINGLE],
10315 				list)
10316 			inc_block_group_ro(cache, 1);
10317 	}
10318 
10319 	init_global_block_rsv(info);
10320 	ret = 0;
10321 error:
10322 	btrfs_free_path(path);
10323 	return ret;
10324 }
10325 
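/*
 * Insert block group items for the block groups created earlier in this
 * transaction (trans->new_bgs) and finish their chunk allocation.
 */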
10326 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
10327 				       struct btrfs_root *root)
10328 {
10329 	struct btrfs_block_group_cache *block_group, *tmp;
10330 	struct btrfs_root *extent_root = root->fs_info->extent_root;
10331 	struct btrfs_block_group_item item;
10332 	struct btrfs_key key;
10333 	int ret = 0;
10334 	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10335 
10336 	trans->can_flush_pending_bgs = false;
10337 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10338 		if (ret)
10339 			goto next;
10340 
10341 		spin_lock(&block_group->lock);
10342 		memcpy(&item, &block_group->item, sizeof(item));
10343 		memcpy(&key, &block_group->key, sizeof(key));
10344 		spin_unlock(&block_group->lock);
10345 
10346 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
10347 					sizeof(item));
10348 		if (ret)
10349 			btrfs_abort_transaction(trans, ret);
10350 		ret = btrfs_finish_chunk_alloc(trans, extent_root,
10351 					       key.objectid, key.offset);
10352 		if (ret)
10353 			btrfs_abort_transaction(trans, ret);
10354 		add_block_group_free_space(trans, root->fs_info, block_group);
10355 		/* already aborted the transaction if it failed. */
10356 next:
10357 		list_del_init(&block_group->bg_list);
10358 	}
10359 	trans->can_flush_pending_bgs = can_flush_pending_bgs;
10360 }
10361 
10362 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10363 			   struct btrfs_root *root, u64 bytes_used,
10364 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
10365 			   u64 size)
10366 {
10367 	int ret;
10368 	struct btrfs_root *extent_root;
10369 	struct btrfs_block_group_cache *cache;
10370 	extent_root = root->fs_info->extent_root;
10371 
10372 	btrfs_set_log_full_commit(root->fs_info, trans);
10373 
10374 	cache = btrfs_create_block_group_cache(root, chunk_offset, size);
10375 	if (!cache)
10376 		return -ENOMEM;
10377 
10378 	btrfs_set_block_group_used(&cache->item, bytes_used);
10379 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
10380 	btrfs_set_block_group_flags(&cache->item, type);
10381 
10382 	cache->flags = type;
10383 	cache->last_byte_to_unpin = (u64)-1;
10384 	cache->cached = BTRFS_CACHE_FINISHED;
10385 	cache->needs_free_space = 1;
10386 	ret = exclude_super_stripes(root, cache);
10387 	if (ret) {
10388 		/*
10389 		 * We may have excluded something, so call this just in
10390 		 * case.
10391 		 */
10392 		free_excluded_extents(root, cache);
10393 		btrfs_put_block_group(cache);
10394 		return ret;
10395 	}
10396 
10397 	add_new_free_space(cache, root->fs_info, chunk_offset,
10398 			   chunk_offset + size);
10399 
10400 	free_excluded_extents(root, cache);
10401 
10402 #ifdef CONFIG_BTRFS_DEBUG
10403 	if (btrfs_should_fragment_free_space(root, cache)) {
10404 		u64 new_bytes_used = size - bytes_used;
10405 
10406 		bytes_used += new_bytes_used >> 1;
10407 		fragment_free_space(root, cache);
10408 	}
10409 #endif
10410 	/*
10411 	 * Call to ensure the corresponding space_info object is created and
10412 	 * assigned to our block group, but don't update its counters just yet.
10413 	 * We want our bg to be added to the rbtree with its ->space_info set.
10414 	 */
10415 	ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
10416 				&cache->space_info);
10417 	if (ret) {
10418 		btrfs_remove_free_space_cache(cache);
10419 		btrfs_put_block_group(cache);
10420 		return ret;
10421 	}
10422 
10423 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
10424 	if (ret) {
10425 		btrfs_remove_free_space_cache(cache);
10426 		btrfs_put_block_group(cache);
10427 		return ret;
10428 	}
10429 
10430 	/*
10431 	 * Now that our block group has its ->space_info set and is inserted in
10432 	 * the rbtree, update the space info's counters.
10433 	 */
10434 	trace_btrfs_add_block_group(root->fs_info, cache, 1);
10435 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
10436 				cache->bytes_super, &cache->space_info);
10437 	if (ret) {
10438 		btrfs_remove_free_space_cache(cache);
10439 		spin_lock(&root->fs_info->block_group_cache_lock);
10440 		rb_erase(&cache->cache_node,
10441 			 &root->fs_info->block_group_cache_tree);
10442 		RB_CLEAR_NODE(&cache->cache_node);
10443 		spin_unlock(&root->fs_info->block_group_cache_lock);
10444 		btrfs_put_block_group(cache);
10445 		return ret;
10446 	}
10447 	update_global_block_rsv(root->fs_info);
10448 
10449 	__link_block_group(cache->space_info, cache);
10450 
10451 	list_add_tail(&cache->bg_list, &trans->new_bgs);
10452 
10453 	set_avail_alloc_bits(extent_root->fs_info, type);
10454 	return 0;
10455 }
10456 
10457 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10458 {
10459 	u64 extra_flags = chunk_to_extended(flags) &
10460 				BTRFS_EXTENDED_PROFILE_MASK;
10461 
10462 	write_seqlock(&fs_info->profiles_lock);
10463 	if (flags & BTRFS_BLOCK_GROUP_DATA)
10464 		fs_info->avail_data_alloc_bits &= ~extra_flags;
10465 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
10466 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10467 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10468 		fs_info->avail_system_alloc_bits &= ~extra_flags;
10469 	write_sequnlock(&fs_info->profiles_lock);
10470 }
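
/*
 * Illustrative sketch, not part of the original file: readers of the
 * avail_*_alloc_bits fields pair with the write_seqlock() above by retrying
 * until they observe a stable value.  The helper name is hypothetical.
 */
static inline u64
example_read_avail_data_alloc_bits(struct btrfs_fs_info *fs_info)
{
	unsigned int seq;
	u64 bits;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);
		bits = fs_info->avail_data_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return bits;
}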
10471 
10472 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10473 			     struct btrfs_root *root, u64 group_start,
10474 			     struct extent_map *em)
10475 {
10476 	struct btrfs_path *path;
10477 	struct btrfs_block_group_cache *block_group;
10478 	struct btrfs_free_cluster *cluster;
10479 	struct btrfs_root *tree_root = root->fs_info->tree_root;
10480 	struct btrfs_key key;
10481 	struct inode *inode;
10482 	struct kobject *kobj = NULL;
10483 	int ret;
10484 	int index;
10485 	int factor;
10486 	struct btrfs_caching_control *caching_ctl = NULL;
10487 	bool remove_em;
10488 
10489 	root = root->fs_info->extent_root;
10490 
10491 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
10492 	BUG_ON(!block_group);
10493 	BUG_ON(!block_group->ro);
10494 
10495 	/*
10496 	 * Free the reserved super bytes from this block group before
10497 	 * removing it.
10498 	 */
10499 	free_excluded_extents(root, block_group);
10500 
10501 	memcpy(&key, &block_group->key, sizeof(key));
10502 	index = get_block_group_index(block_group);
10503 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10504 				  BTRFS_BLOCK_GROUP_RAID1 |
10505 				  BTRFS_BLOCK_GROUP_RAID10))
10506 		factor = 2;
10507 	else
10508 		factor = 1;
10509 
10510 	/* make sure this block group isn't part of an allocation cluster */
10511 	cluster = &root->fs_info->data_alloc_cluster;
10512 	spin_lock(&cluster->refill_lock);
10513 	btrfs_return_cluster_to_free_space(block_group, cluster);
10514 	spin_unlock(&cluster->refill_lock);
10515 
10516 	/*
10517 	 * make sure this block group isn't part of a metadata
10518 	 * allocation cluster
10519 	 */
10520 	cluster = &root->fs_info->meta_alloc_cluster;
10521 	spin_lock(&cluster->refill_lock);
10522 	btrfs_return_cluster_to_free_space(block_group, cluster);
10523 	spin_unlock(&cluster->refill_lock);
10524 
10525 	path = btrfs_alloc_path();
10526 	if (!path) {
10527 		ret = -ENOMEM;
10528 		goto out;
10529 	}
10530 
10531 	/*
10532 	 * get the inode first so any iput calls done for the io_list
10533 	 * aren't the final iput (no unlinks allowed now)
10534 	 */
10535 	inode = lookup_free_space_inode(tree_root, block_group, path);
10536 
10537 	mutex_lock(&trans->transaction->cache_write_mutex);
10538 	/*
10539 	 * make sure our free space cache IO is done before removing the
10540 	 * free space inode
10541 	 */
10542 	spin_lock(&trans->transaction->dirty_bgs_lock);
10543 	if (!list_empty(&block_group->io_list)) {
10544 		list_del_init(&block_group->io_list);
10545 
10546 		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10547 
10548 		spin_unlock(&trans->transaction->dirty_bgs_lock);
10549 		btrfs_wait_cache_io(root, trans, block_group,
10550 				    &block_group->io_ctl, path,
10551 				    block_group->key.objectid);
10552 		btrfs_put_block_group(block_group);
10553 		spin_lock(&trans->transaction->dirty_bgs_lock);
10554 	}
10555 
10556 	if (!list_empty(&block_group->dirty_list)) {
10557 		list_del_init(&block_group->dirty_list);
10558 		btrfs_put_block_group(block_group);
10559 	}
10560 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10561 	mutex_unlock(&trans->transaction->cache_write_mutex);
10562 
10563 	if (!IS_ERR(inode)) {
10564 		ret = btrfs_orphan_add(trans, inode);
10565 		if (ret) {
10566 			btrfs_add_delayed_iput(inode);
10567 			goto out;
10568 		}
10569 		clear_nlink(inode);
10570 		/* One for the block group's ref */
10571 		spin_lock(&block_group->lock);
10572 		if (block_group->iref) {
10573 			block_group->iref = 0;
10574 			block_group->inode = NULL;
10575 			spin_unlock(&block_group->lock);
10576 			iput(inode);
10577 		} else {
10578 			spin_unlock(&block_group->lock);
10579 		}
10580 		/* One for our lookup ref */
10581 		btrfs_add_delayed_iput(inode);
10582 	}
10583 
10584 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10585 	key.offset = block_group->key.objectid;
10586 	key.type = 0;
10587 
10588 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10589 	if (ret < 0)
10590 		goto out;
10591 	if (ret > 0)
10592 		btrfs_release_path(path);
10593 	if (ret == 0) {
10594 		ret = btrfs_del_item(trans, tree_root, path);
10595 		if (ret)
10596 			goto out;
10597 		btrfs_release_path(path);
10598 	}
10599 
10600 	spin_lock(&root->fs_info->block_group_cache_lock);
10601 	rb_erase(&block_group->cache_node,
10602 		 &root->fs_info->block_group_cache_tree);
10603 	RB_CLEAR_NODE(&block_group->cache_node);
10604 
10605 	if (root->fs_info->first_logical_byte == block_group->key.objectid)
10606 		root->fs_info->first_logical_byte = (u64)-1;
10607 	spin_unlock(&root->fs_info->block_group_cache_lock);
10608 
10609 	down_write(&block_group->space_info->groups_sem);
10610 	/*
10611 	 * we must use list_del_init so people can check to see if they
10612 	 * are still on the list after taking the semaphore
10613 	 */
10614 	list_del_init(&block_group->list);
10615 	if (list_empty(&block_group->space_info->block_groups[index])) {
10616 		kobj = block_group->space_info->block_group_kobjs[index];
10617 		block_group->space_info->block_group_kobjs[index] = NULL;
10618 		clear_avail_alloc_bits(root->fs_info, block_group->flags);
10619 	}
10620 	up_write(&block_group->space_info->groups_sem);
10621 	if (kobj) {
10622 		kobject_del(kobj);
10623 		kobject_put(kobj);
10624 	}
10625 
10626 	if (block_group->has_caching_ctl)
10627 		caching_ctl = get_caching_control(block_group);
10628 	if (block_group->cached == BTRFS_CACHE_STARTED)
10629 		wait_block_group_cache_done(block_group);
10630 	if (block_group->has_caching_ctl) {
10631 		down_write(&root->fs_info->commit_root_sem);
10632 		if (!caching_ctl) {
10633 			struct btrfs_caching_control *ctl;
10634 
10635 			list_for_each_entry(ctl,
10636 				    &root->fs_info->caching_block_groups, list)
10637 				if (ctl->block_group == block_group) {
10638 					caching_ctl = ctl;
10639 					atomic_inc(&caching_ctl->count);
10640 					break;
10641 				}
10642 		}
10643 		if (caching_ctl)
10644 			list_del_init(&caching_ctl->list);
10645 		up_write(&root->fs_info->commit_root_sem);
10646 		if (caching_ctl) {
10647 			/* Once for the caching bgs list and once for us. */
10648 			put_caching_control(caching_ctl);
10649 			put_caching_control(caching_ctl);
10650 		}
10651 	}
10652 
10653 	spin_lock(&trans->transaction->dirty_bgs_lock);
10654 	WARN_ON(!list_empty(&block_group->dirty_list));
10655 	WARN_ON(!list_empty(&block_group->io_list));
10660 	spin_unlock(&trans->transaction->dirty_bgs_lock);
10661 	btrfs_remove_free_space_cache(block_group);
10662 
10663 	spin_lock(&block_group->space_info->lock);
10664 	list_del_init(&block_group->ro_list);
10665 
10666 	if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
10667 		WARN_ON(block_group->space_info->total_bytes
10668 			< block_group->key.offset);
10669 		WARN_ON(block_group->space_info->bytes_readonly
10670 			< block_group->key.offset);
10671 		WARN_ON(block_group->space_info->disk_total
10672 			< block_group->key.offset * factor);
10673 	}
10674 	block_group->space_info->total_bytes -= block_group->key.offset;
10675 	block_group->space_info->bytes_readonly -= block_group->key.offset;
10676 	block_group->space_info->disk_total -= block_group->key.offset * factor;
10677 
10678 	spin_unlock(&block_group->space_info->lock);
10679 
10680 	memcpy(&key, &block_group->key, sizeof(key));
10681 
10682 	lock_chunks(root);
10683 	if (!list_empty(&em->list)) {
10684 		/* We're in the transaction->pending_chunks list. */
10685 		free_extent_map(em);
10686 	}
10687 	spin_lock(&block_group->lock);
10688 	block_group->removed = 1;
10689 	/*
10690 	 * At this point trimming can't start on this block group, because we
10691 	 * removed the block group from the tree fs_info->block_group_cache_tree
10692 	 * so no one can find it anymore, and even if someone already got this
10693 	 * block group before we removed it from the rbtree, they have already
10694 	 * incremented block_group->trimming - if they didn't, they won't find
10695 	 * any free space entries because we already removed them all when we
10696 	 * called btrfs_remove_free_space_cache().
10697 	 *
10698 	 * And we must not remove the extent map from the fs_info->mapping_tree
10699 	 * to prevent the same logical address range and physical device space
10700 	 * ranges from being reused for a new block group. This is because our
10701 	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10702 	 * completely transactionless, so while it is trimming a range the
10703 	 * currently running transaction might finish and a new one start,
10704 	 * allowing for new block groups to be created that can reuse the same
10705 	 * physical device locations unless we take this special care.
10706 	 *
10707 	 * There may also be an implicit trim operation if the file system
10708 	 * is mounted with -odiscard. The same protections must remain
10709 	 * in place until the extents have been discarded completely when
10710 	 * the transaction commit has completed.
10711 	 */
10712 	remove_em = (atomic_read(&block_group->trimming) == 0);
10713 	/*
10714 	 * Make sure a trimmer task always sees the em in the pinned_chunks list
10715 	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10716 	 * before checking block_group->removed).
10717 	 */
10718 	if (!remove_em) {
10719 		/*
10720 		 * Our em might be in trans->transaction->pending_chunks which
10721 		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10722 		 * and so is the fs_info->pinned_chunks list.
10723 		 *
10724 		 * So at this point we must be holding the chunk_mutex to avoid
10725 		 * any races with chunk allocation (more specifically at
10726 		 * volumes.c:contains_pending_extent()), to ensure it always
10727 		 * sees the em, either in the pending_chunks list or in the
10728 		 * pinned_chunks list.
10729 		 */
10730 		list_move_tail(&em->list, &root->fs_info->pinned_chunks);
10731 	}
10732 	spin_unlock(&block_group->lock);
10733 
10734 	if (remove_em) {
10735 		struct extent_map_tree *em_tree;
10736 
10737 		em_tree = &root->fs_info->mapping_tree.map_tree;
10738 		write_lock(&em_tree->lock);
10739 		/*
10740 		 * The em might be in the pending_chunks list, so make sure the
10741 		 * chunk mutex is locked, since remove_extent_mapping() will
10742 		 * delete us from that list.
10743 		 */
10744 		remove_extent_mapping(em_tree, em);
10745 		write_unlock(&em_tree->lock);
10746 		/* once for the tree */
10747 		free_extent_map(em);
10748 	}
10749 
10750 	unlock_chunks(root);
10751 
10752 	ret = remove_block_group_free_space(trans, root->fs_info, block_group);
10753 	if (ret)
10754 		goto out;
10755 
10756 	btrfs_put_block_group(block_group);
10757 	btrfs_put_block_group(block_group);
10758 
10759 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10760 	if (ret > 0)
10761 		ret = -EIO;
10762 	if (ret < 0)
10763 		goto out;
10764 
10765 	ret = btrfs_del_item(trans, root, path);
10766 out:
10767 	btrfs_free_path(path);
10768 	return ret;
10769 }
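
/*
 * Illustrative sketch, not part of the original file: the 'factor' computed
 * in btrfs_remove_block_group() above converts a block group's logical size
 * into raw disk usage.  DUP, RAID1 and RAID10 keep two copies of every byte,
 * so removing a 1GiB block group with one of those profiles releases 2GiB of
 * disk_total, while everything else is counted at its logical size here.
 * The helper name is hypothetical.
 */
static inline u64 example_raw_disk_bytes(u64 flags, u64 chunk_size)
{
	if (flags & (BTRFS_BLOCK_GROUP_DUP |
		     BTRFS_BLOCK_GROUP_RAID1 |
		     BTRFS_BLOCK_GROUP_RAID10))
		return chunk_size * 2;	/* two copies on disk */
	return chunk_size;
}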
10770 
10771 struct btrfs_trans_handle *
10772 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10773 				     const u64 chunk_offset)
10774 {
10775 	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10776 	struct extent_map *em;
10777 	struct map_lookup *map;
10778 	unsigned int num_items;
10779 
10780 	read_lock(&em_tree->lock);
10781 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10782 	read_unlock(&em_tree->lock);
10783 	ASSERT(em && em->start == chunk_offset);
10784 
10785 	/*
10786 	 * We need to reserve 3 + N units from the metadata space info in order
10787 	 * to remove a block group (done at btrfs_remove_chunk() and at
10788 	 * btrfs_remove_block_group()), which are used for:
10789 	 *
10790 	 * 1 unit for adding the free space inode's orphan (located in the tree
10791 	 * of tree roots).
10792 	 * 1 unit for deleting the block group item (located in the extent
10793 	 * tree).
10794 	 * 1 unit for deleting the free space item (located in tree of tree
10795 	 * roots).
10796 	 * N units for deleting N device extent items corresponding to each
10797 	 * stripe (located in the device tree).
10798 	 *
10799 	 * In order to remove a block group we also need to reserve units in the
10800 	 * system space info in order to update the chunk tree (update one or
10801 	 * more device items and remove one chunk item), but this is done at
10802 	 * btrfs_remove_chunk() through a call to check_system_chunk().
10803 	 */
10804 	map = em->map_lookup;
10805 	num_items = 3 + map->num_stripes;
10806 	free_extent_map(em);
10807 
10808 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10809 							   num_items, 1);
10810 }
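
/*
 * Illustrative sketch, not part of the original file: a worked example of the
 * reservation rule documented above.  A chunk striped over 4 devices (e.g.
 * RAID10 with 4 stripes) needs 3 + 4 = 7 metadata units: one for the free
 * space inode's orphan, one for the block group item, one for the free space
 * item and one per device extent item.  The helper name is hypothetical.
 */
static inline unsigned int example_remove_bg_num_items(unsigned int num_stripes)
{
	return 3 + num_stripes;
}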
10811 
10812 /*
10813  * Process the unused_bgs list and remove any that don't have any allocated
10814  * space inside of them.
10815  */
10816 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10817 {
10818 	struct btrfs_block_group_cache *block_group;
10819 	struct btrfs_space_info *space_info;
10820 	struct btrfs_root *root = fs_info->extent_root;
10821 	struct btrfs_trans_handle *trans;
10822 	int ret = 0;
10823 
10824 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10825 		return;
10826 
10827 	spin_lock(&fs_info->unused_bgs_lock);
10828 	while (!list_empty(&fs_info->unused_bgs)) {
10829 		u64 start, end;
10830 		int trimming;
10831 
10832 		block_group = list_first_entry(&fs_info->unused_bgs,
10833 					       struct btrfs_block_group_cache,
10834 					       bg_list);
10835 		list_del_init(&block_group->bg_list);
10836 
10837 		space_info = block_group->space_info;
10838 
10839 		if (ret || btrfs_mixed_space_info(space_info)) {
10840 			btrfs_put_block_group(block_group);
10841 			continue;
10842 		}
10843 		spin_unlock(&fs_info->unused_bgs_lock);
10844 
10845 		mutex_lock(&fs_info->delete_unused_bgs_mutex);
10846 
10847 		/* Don't want to race with allocators so take the groups_sem */
10848 		down_write(&space_info->groups_sem);
10849 		spin_lock(&block_group->lock);
10850 		if (block_group->reserved ||
10851 		    btrfs_block_group_used(&block_group->item) ||
10852 		    block_group->ro ||
10853 		    list_is_singular(&block_group->list)) {
10854 			/*
10855 			 * We want to bail if we made new allocations or have
10856 			 * outstanding allocations in this block group.  We do
10857 			 * the ro check in case balance is currently acting on
10858 			 * this block group.
10859 			 */
10860 			spin_unlock(&block_group->lock);
10861 			up_write(&space_info->groups_sem);
10862 			goto next;
10863 		}
10864 		spin_unlock(&block_group->lock);
10865 
10866 		/* We don't want to force the issue, only flip if it's ok. */
10867 		ret = inc_block_group_ro(block_group, 0);
10868 		up_write(&space_info->groups_sem);
10869 		if (ret < 0) {
10870 			ret = 0;
10871 			goto next;
10872 		}
10873 
10874 		/*
10875 		 * Want to do this before we do anything else so we can recover
10876 		 * properly if we fail to join the transaction.
10877 		 */
10878 		trans = btrfs_start_trans_remove_block_group(fs_info,
10879 						     block_group->key.objectid);
10880 		if (IS_ERR(trans)) {
10881 			btrfs_dec_block_group_ro(root, block_group);
10882 			ret = PTR_ERR(trans);
10883 			goto next;
10884 		}
10885 
10886 		/*
10887 		 * We could have pending pinned extents for this block group,
10888 		 * just delete them, we don't care about them anymore.
10889 		 */
10890 		start = block_group->key.objectid;
10891 		end = start + block_group->key.offset - 1;
10892 		/*
10893 		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10894 		 * btrfs_finish_extent_commit(). If we are at transaction N,
10895 		 * another task might be running finish_extent_commit() for the
10896 		 * previous transaction N - 1, and have seen a range belonging
10897 		 * to the block group in freed_extents[] before we were able to
10898 		 * clear the whole block group range from freed_extents[]. This
10899 		 * means that task can look up the block group after we
10900 		 * unpinned it from freed_extents[] and removed it, leading to
10901 		 * a BUG_ON() at btrfs_unpin_extent_range().
10902 		 */
10903 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
10904 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10905 				  EXTENT_DIRTY);
10906 		if (ret) {
10907 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10908 			btrfs_dec_block_group_ro(root, block_group);
10909 			goto end_trans;
10910 		}
10911 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10912 				  EXTENT_DIRTY);
10913 		if (ret) {
10914 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10915 			btrfs_dec_block_group_ro(root, block_group);
10916 			goto end_trans;
10917 		}
10918 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10919 
10920 		/* Reset pinned so btrfs_put_block_group doesn't complain */
10921 		spin_lock(&space_info->lock);
10922 		spin_lock(&block_group->lock);
10923 
10924 		space_info->bytes_pinned -= block_group->pinned;
10925 		space_info->bytes_readonly += block_group->pinned;
10926 		percpu_counter_add(&space_info->total_bytes_pinned,
10927 				   -block_group->pinned);
10928 		block_group->pinned = 0;
10929 
10930 		spin_unlock(&block_group->lock);
10931 		spin_unlock(&space_info->lock);
10932 
10933 		/* DISCARD can flip during remount */
10934 		trimming = btrfs_test_opt(root->fs_info, DISCARD);
10935 
10936 		/* Implicit trim during transaction commit. */
10937 		if (trimming)
10938 			btrfs_get_block_group_trimming(block_group);
10939 
10940 		/*
10941 		 * btrfs_remove_chunk() will abort the transaction if things go
10942 		 * horribly wrong.
10943 		 */
10944 		ret = btrfs_remove_chunk(trans, root,
10945 					 block_group->key.objectid);
10946 
10947 		if (ret) {
10948 			if (trimming)
10949 				btrfs_put_block_group_trimming(block_group);
10950 			goto end_trans;
10951 		}
10952 
10953 		/*
10954 		 * If we're not mounted with -odiscard, we can just forget
10955 		 * about this block group. Otherwise we'll need to wait
10956 		 * until transaction commit to do the actual discard.
10957 		 */
10958 		if (trimming) {
10959 			spin_lock(&fs_info->unused_bgs_lock);
10960 			/*
10961 			 * A concurrent scrub might have added us to the list
10962 			 * fs_info->unused_bgs, so use a list_move operation
10963 			 * to add the block group to the deleted_bgs list.
10964 			 */
10965 			list_move(&block_group->bg_list,
10966 				  &trans->transaction->deleted_bgs);
10967 			spin_unlock(&fs_info->unused_bgs_lock);
10968 			btrfs_get_block_group(block_group);
10969 		}
10970 end_trans:
10971 		btrfs_end_transaction(trans, root);
10972 next:
10973 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10974 		btrfs_put_block_group(block_group);
10975 		spin_lock(&fs_info->unused_bgs_lock);
10976 	}
10977 	spin_unlock(&fs_info->unused_bgs_lock);
10978 }
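
/*
 * Illustrative sketch, not part of the original file: it mirrors the
 * accounting flip done above when an unused block group is deleted.  Any
 * bytes still pinned in the group are reclassified as read-only so the
 * space_info counters stay consistent until the group is fully removed.
 * The helper name is hypothetical.
 */
static inline void example_reset_pinned(struct btrfs_space_info *space_info,
					struct btrfs_block_group_cache *block_group)
{
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);

	space_info->bytes_pinned -= block_group->pinned;
	space_info->bytes_readonly += block_group->pinned;
	percpu_counter_add(&space_info->total_bytes_pinned,
			   -block_group->pinned);
	block_group->pinned = 0;

	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);
}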
10979 
10980 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10981 {
10982 	struct btrfs_space_info *space_info;
10983 	struct btrfs_super_block *disk_super;
10984 	u64 features;
10985 	u64 flags;
10986 	int mixed = 0;
10987 	int ret;
10988 
10989 	disk_super = fs_info->super_copy;
10990 	if (!btrfs_super_root(disk_super))
10991 		return -EINVAL;
10992 
10993 	features = btrfs_super_incompat_flags(disk_super);
10994 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10995 		mixed = 1;
10996 
10997 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
10998 	ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10999 	if (ret)
11000 		goto out;
11001 
11002 	if (mixed) {
11003 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
11004 		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
11005 	} else {
11006 		flags = BTRFS_BLOCK_GROUP_METADATA;
11007 		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
11008 		if (ret)
11009 			goto out;
11010 
11011 		flags = BTRFS_BLOCK_GROUP_DATA;
11012 		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
11013 	}
11014 out:
11015 	return ret;
11016 }
11017 
11018 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
11019 {
11020 	return unpin_extent_range(root, start, end, false);
11021 }
11022 
11023 /*
11024  * It used to be that old block groups would be left around forever.
11025  * Iterating over them would be enough to trim unused space.  Since we
11026  * now automatically remove them, we also need to iterate over unallocated
11027  * space.
11028  *
11029  * We don't want a transaction for this since the discard may take a
11030  * substantial amount of time.  We don't require that a transaction be
11031  * running, but we do need to take a running transaction into account
11032  * to ensure that we're not discarding chunks that were released in
11033  * the current transaction.
11034  *
11035  * Holding the chunks lock will prevent other threads from allocating
11036  * or releasing chunks, but it won't prevent a running transaction
11037  * from committing and releasing the memory that the pending chunks
11038  * list head uses.  For that, we need to take a reference to the
11039  * transaction.
11040  */
11041 static int btrfs_trim_free_extents(struct btrfs_device *device,
11042 				   u64 minlen, u64 *trimmed)
11043 {
11044 	u64 start = 0, len = 0;
11045 	int ret;
11046 
11047 	*trimmed = 0;
11048 
11049 	/* Not writeable = nothing to do. */
11050 	if (!device->writeable)
11051 		return 0;
11052 
11053 	/* No free space = nothing to do. */
11054 	if (device->total_bytes <= device->bytes_used)
11055 		return 0;
11056 
11057 	ret = 0;
11058 
11059 	while (1) {
11060 		struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
11061 		struct btrfs_transaction *trans;
11062 		u64 bytes;
11063 
11064 		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11065 		if (ret)
11066 			return ret;
11067 
11068 		down_read(&fs_info->commit_root_sem);
11069 
11070 		spin_lock(&fs_info->trans_lock);
11071 		trans = fs_info->running_transaction;
11072 		if (trans)
11073 			atomic_inc(&trans->use_count);
11074 		spin_unlock(&fs_info->trans_lock);
11075 
11076 		ret = find_free_dev_extent_start(trans, device, minlen, start,
11077 						 &start, &len);
11078 		if (trans)
11079 			btrfs_put_transaction(trans);
11080 
11081 		if (ret) {
11082 			up_read(&fs_info->commit_root_sem);
11083 			mutex_unlock(&fs_info->chunk_mutex);
11084 			if (ret == -ENOSPC)
11085 				ret = 0;
11086 			break;
11087 		}
11088 
11089 		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11090 		up_read(&fs_info->commit_root_sem);
11091 		mutex_unlock(&fs_info->chunk_mutex);
11092 
11093 		if (ret)
11094 			break;
11095 
11096 		start += len;
11097 		*trimmed += bytes;
11098 
11099 		if (fatal_signal_pending(current)) {
11100 			ret = -ERESTARTSYS;
11101 			break;
11102 		}
11103 
11104 		cond_resched();
11105 	}
11106 
11107 	return ret;
11108 }
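
/*
 * Illustrative sketch, not part of the original file: the loop above only
 * needs the running transaction (if any) long enough to avoid discarding
 * ranges released in it, and it pins that transaction with this pattern.
 * The helper name is hypothetical.
 */
static inline struct btrfs_transaction *
example_grab_running_transaction(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *trans;

	spin_lock(&fs_info->trans_lock);
	trans = fs_info->running_transaction;
	if (trans)
		atomic_inc(&trans->use_count);
	spin_unlock(&fs_info->trans_lock);

	/* The caller drops the reference with btrfs_put_transaction(). */
	return trans;
}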
11109 
11110 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
11111 {
11112 	struct btrfs_fs_info *fs_info = root->fs_info;
11113 	struct btrfs_block_group_cache *cache = NULL;
11114 	struct btrfs_device *device;
11115 	struct list_head *devices;
11116 	u64 group_trimmed;
11117 	u64 start;
11118 	u64 end;
11119 	u64 trimmed = 0;
11120 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
11121 	int ret = 0;
11122 
11123 	/*
11124 	 * Try to trim all FS space; our block group may start at a non-zero offset.
11125 	 */
11126 	if (range->len == total_bytes)
11127 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
11128 	else
11129 		cache = btrfs_lookup_block_group(fs_info, range->start);
11130 
11131 	while (cache) {
11132 		if (cache->key.objectid >= (range->start + range->len)) {
11133 			btrfs_put_block_group(cache);
11134 			break;
11135 		}
11136 
11137 		start = max(range->start, cache->key.objectid);
11138 		end = min(range->start + range->len,
11139 				cache->key.objectid + cache->key.offset);
11140 
11141 		if (end - start >= range->minlen) {
11142 			if (!block_group_cache_done(cache)) {
11143 				ret = cache_block_group(cache, 0);
11144 				if (ret) {
11145 					btrfs_put_block_group(cache);
11146 					break;
11147 				}
11148 				ret = wait_block_group_cache_done(cache);
11149 				if (ret) {
11150 					btrfs_put_block_group(cache);
11151 					break;
11152 				}
11153 			}
11154 			ret = btrfs_trim_block_group(cache,
11155 						     &group_trimmed,
11156 						     start,
11157 						     end,
11158 						     range->minlen);
11159 
11160 			trimmed += group_trimmed;
11161 			if (ret) {
11162 				btrfs_put_block_group(cache);
11163 				break;
11164 			}
11165 		}
11166 
11167 		cache = next_block_group(fs_info->tree_root, cache);
11168 	}
11169 
11170 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
11171 	devices = &root->fs_info->fs_devices->alloc_list;
11172 	list_for_each_entry(device, devices, dev_alloc_list) {
11173 		ret = btrfs_trim_free_extents(device, range->minlen,
11174 					      &group_trimmed);
11175 		if (ret)
11176 			break;
11177 
11178 		trimmed += group_trimmed;
11179 	}
11180 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
11181 
11182 	range->len = trimmed;
11183 	return ret;
11184 }
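
/*
 * Illustrative sketch, not part of the original file: btrfs_trim_fs() above
 * clamps the user supplied fstrim range to each block group.  For example, a
 * block group covering [1GiB, 2GiB) and a request for [0, 1.5GiB) result in
 * trimming [1GiB, 1.5GiB) of that group.  The helper name is hypothetical.
 */
static inline void example_clamp_trim_range(u64 req_start, u64 req_len,
					    u64 bg_start, u64 bg_len,
					    u64 *start, u64 *end)
{
	*start = max(req_start, bg_start);
	*end = min(req_start + req_len, bg_start + bg_len);
}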
11185 
11186 /*
11187  * btrfs_{start,end}_write_no_snapshoting() are similar to
11188  * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
11189  * data into the page cache through nocow before the subvolume is snapshotted
11190  * (such data would only reach disk after the snapshot is created), or to
11191  * prevent operations while snapshotting is ongoing that would make the
11192  * snapshot inconsistent (writes followed by expanding truncates, for example).
11193  */
11194 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
11195 {
11196 	percpu_counter_dec(&root->subv_writers->counter);
11197 	/*
11198 	 * Make sure counter is updated before we wake up waiters.
11199 	 */
11200 	smp_mb();
11201 	if (waitqueue_active(&root->subv_writers->wait))
11202 		wake_up(&root->subv_writers->wait);
11203 }
11204 
11205 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
11206 {
11207 	if (atomic_read(&root->will_be_snapshoted))
11208 		return 0;
11209 
11210 	percpu_counter_inc(&root->subv_writers->counter);
11211 	/*
11212 	 * Make sure counter is updated before we check for snapshot creation.
11213 	 */
11214 	smp_mb();
11215 	if (atomic_read(&root->will_be_snapshoted)) {
11216 		btrfs_end_write_no_snapshoting(root);
11217 		return 0;
11218 	}
11219 	return 1;
11220 }
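
/*
 * Illustrative usage sketch, not part of the original file: a nocow writer
 * brackets its page cache writes with the helpers above and falls back when
 * a snapshot is about to be created.  The helper name is hypothetical.
 */
static inline bool example_try_nocow_write(struct btrfs_root *root)
{
	if (!btrfs_start_write_no_snapshoting(root))
		return false;	/* snapshot pending, caller must cow instead */

	/* ... dirty pages through the nocow path here ... */

	btrfs_end_write_no_snapshoting(root);
	return true;
}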
11221 
11222 static int wait_snapshoting_atomic_t(atomic_t *a)
11223 {
11224 	schedule();
11225 	return 0;
11226 }
11227 
11228 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11229 {
11230 	while (true) {
11231 		int ret;
11232 
11233 		ret = btrfs_start_write_no_snapshoting(root);
11234 		if (ret)
11235 			break;
11236 		wait_on_atomic_t(&root->will_be_snapshoted,
11237 				 wait_snapshoting_atomic_t,
11238 				 TASK_UNINTERRUPTIBLE);
11239 	}
11240 }
11241