xref: /openbmc/linux/fs/btrfs/extent-tree.c (revision c1c9ff7c94e83fae89a742df74db51156869bad5)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include <linux/percpu_counter.h>
28 #include "compat.h"
29 #include "hash.h"
30 #include "ctree.h"
31 #include "disk-io.h"
32 #include "print-tree.h"
33 #include "transaction.h"
34 #include "volumes.h"
35 #include "raid56.h"
36 #include "locking.h"
37 #include "free-space-cache.h"
38 #include "math.h"
39 
40 #undef SCRAMBLE_DELAYED_REFS
41 
42 /*
43  * control flags for do_chunk_alloc's force field
44  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
45  * if we really need one.
46  *
47  * CHUNK_ALLOC_LIMITED means to only try and allocate one
48  * if we have very few chunks already allocated.  This is
49  * used as part of the clustering code to help make sure
50  * we have a good pool of storage to cluster in, without
51  * filling the FS with empty chunks
52  *
53  * CHUNK_ALLOC_FORCE means it must try to allocate one
54  *
55  */
56 enum {
57 	CHUNK_ALLOC_NO_FORCE = 0,
58 	CHUNK_ALLOC_LIMITED = 1,
59 	CHUNK_ALLOC_FORCE = 2,
60 };
61 
62 /*
63  * Control how reservations are dealt with.
64  *
65  * RESERVE_FREE - freeing a reservation.
66  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
67  *   ENOSPC accounting
68  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
69  *   bytes_may_use as the ENOSPC accounting is done elsewhere
70  */
71 enum {
72 	RESERVE_FREE = 0,
73 	RESERVE_ALLOC = 1,
74 	RESERVE_ALLOC_NO_ACCOUNT = 2,
75 };
76 
77 static int update_block_group(struct btrfs_root *root,
78 			      u64 bytenr, u64 num_bytes, int alloc);
79 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
80 				struct btrfs_root *root,
81 				u64 bytenr, u64 num_bytes, u64 parent,
82 				u64 root_objectid, u64 owner_objectid,
83 				u64 owner_offset, int refs_to_drop,
84 				struct btrfs_delayed_extent_op *extra_op);
85 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
86 				    struct extent_buffer *leaf,
87 				    struct btrfs_extent_item *ei);
88 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
89 				      struct btrfs_root *root,
90 				      u64 parent, u64 root_objectid,
91 				      u64 flags, u64 owner, u64 offset,
92 				      struct btrfs_key *ins, int ref_mod);
93 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
94 				     struct btrfs_root *root,
95 				     u64 parent, u64 root_objectid,
96 				     u64 flags, struct btrfs_disk_key *key,
97 				     int level, struct btrfs_key *ins);
98 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
99 			  struct btrfs_root *extent_root, u64 flags,
100 			  int force);
101 static int find_next_key(struct btrfs_path *path, int level,
102 			 struct btrfs_key *key);
103 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
104 			    int dump_block_groups);
105 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
106 				       u64 num_bytes, int reserve);
107 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
108 			       u64 num_bytes);
109 int btrfs_pin_extent(struct btrfs_root *root,
110 		     u64 bytenr, u64 num_bytes, int reserved);
111 
112 static noinline int
113 block_group_cache_done(struct btrfs_block_group_cache *cache)
114 {
115 	smp_mb();
116 	return cache->cached == BTRFS_CACHE_FINISHED ||
117 		cache->cached == BTRFS_CACHE_ERROR;
118 }
119 
120 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
121 {
122 	return (cache->flags & bits) == bits;
123 }
124 
125 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
126 {
127 	atomic_inc(&cache->count);
128 }
129 
130 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
131 {
132 	if (atomic_dec_and_test(&cache->count)) {
133 		WARN_ON(cache->pinned > 0);
134 		WARN_ON(cache->reserved > 0);
135 		kfree(cache->free_space_ctl);
136 		kfree(cache);
137 	}
138 }
139 
140 /*
141  * this adds the block group to the fs_info rb tree for the block group
142  * cache
143  */
144 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
145 				struct btrfs_block_group_cache *block_group)
146 {
147 	struct rb_node **p;
148 	struct rb_node *parent = NULL;
149 	struct btrfs_block_group_cache *cache;
150 
151 	spin_lock(&info->block_group_cache_lock);
152 	p = &info->block_group_cache_tree.rb_node;
153 
154 	while (*p) {
155 		parent = *p;
156 		cache = rb_entry(parent, struct btrfs_block_group_cache,
157 				 cache_node);
158 		if (block_group->key.objectid < cache->key.objectid) {
159 			p = &(*p)->rb_left;
160 		} else if (block_group->key.objectid > cache->key.objectid) {
161 			p = &(*p)->rb_right;
162 		} else {
163 			spin_unlock(&info->block_group_cache_lock);
164 			return -EEXIST;
165 		}
166 	}
167 
168 	rb_link_node(&block_group->cache_node, parent, p);
169 	rb_insert_color(&block_group->cache_node,
170 			&info->block_group_cache_tree);
171 
172 	if (info->first_logical_byte > block_group->key.objectid)
173 		info->first_logical_byte = block_group->key.objectid;
174 
175 	spin_unlock(&info->block_group_cache_lock);
176 
177 	return 0;
178 }
179 
180 /*
181  * This will return the block group at or after bytenr if contains is 0, else
182  * it will return the block group that contains the bytenr
183  */
184 static struct btrfs_block_group_cache *
185 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
186 			      int contains)
187 {
188 	struct btrfs_block_group_cache *cache, *ret = NULL;
189 	struct rb_node *n;
190 	u64 end, start;
191 
192 	spin_lock(&info->block_group_cache_lock);
193 	n = info->block_group_cache_tree.rb_node;
194 
195 	while (n) {
196 		cache = rb_entry(n, struct btrfs_block_group_cache,
197 				 cache_node);
198 		end = cache->key.objectid + cache->key.offset - 1;
199 		start = cache->key.objectid;
200 
201 		if (bytenr < start) {
202 			if (!contains && (!ret || start < ret->key.objectid))
203 				ret = cache;
204 			n = n->rb_left;
205 		} else if (bytenr > start) {
206 			if (contains && bytenr <= end) {
207 				ret = cache;
208 				break;
209 			}
210 			n = n->rb_right;
211 		} else {
212 			ret = cache;
213 			break;
214 		}
215 	}
216 	if (ret) {
217 		btrfs_get_block_group(ret);
218 		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
219 			info->first_logical_byte = ret->key.objectid;
220 	}
221 	spin_unlock(&info->block_group_cache_lock);
222 
223 	return ret;
224 }
225 
226 static int add_excluded_extent(struct btrfs_root *root,
227 			       u64 start, u64 num_bytes)
228 {
229 	u64 end = start + num_bytes - 1;
230 	set_extent_bits(&root->fs_info->freed_extents[0],
231 			start, end, EXTENT_UPTODATE, GFP_NOFS);
232 	set_extent_bits(&root->fs_info->freed_extents[1],
233 			start, end, EXTENT_UPTODATE, GFP_NOFS);
234 	return 0;
235 }
236 
237 static void free_excluded_extents(struct btrfs_root *root,
238 				  struct btrfs_block_group_cache *cache)
239 {
240 	u64 start, end;
241 
242 	start = cache->key.objectid;
243 	end = start + cache->key.offset - 1;
244 
245 	clear_extent_bits(&root->fs_info->freed_extents[0],
246 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
247 	clear_extent_bits(&root->fs_info->freed_extents[1],
248 			  start, end, EXTENT_UPTODATE, GFP_NOFS);
249 }
250 
251 static int exclude_super_stripes(struct btrfs_root *root,
252 				 struct btrfs_block_group_cache *cache)
253 {
254 	u64 bytenr;
255 	u64 *logical;
256 	int stripe_len;
257 	int i, nr, ret;
258 
259 	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
260 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
261 		cache->bytes_super += stripe_len;
262 		ret = add_excluded_extent(root, cache->key.objectid,
263 					  stripe_len);
264 		if (ret)
265 			return ret;
266 	}
267 
268 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
269 		bytenr = btrfs_sb_offset(i);
270 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
271 				       cache->key.objectid, bytenr,
272 				       0, &logical, &nr, &stripe_len);
273 		if (ret)
274 			return ret;
275 
276 		while (nr--) {
277 			u64 start, len;
278 
279 			if (logical[nr] > cache->key.objectid +
280 			    cache->key.offset)
281 				continue;
282 
283 			if (logical[nr] + stripe_len <= cache->key.objectid)
284 				continue;
285 
286 			start = logical[nr];
287 			if (start < cache->key.objectid) {
288 				start = cache->key.objectid;
289 				len = (logical[nr] + stripe_len) - start;
290 			} else {
291 				len = min_t(u64, stripe_len,
292 					    cache->key.objectid +
293 					    cache->key.offset - start);
294 			}
295 
296 			cache->bytes_super += len;
297 			ret = add_excluded_extent(root, start, len);
298 			if (ret) {
299 				kfree(logical);
300 				return ret;
301 			}
302 		}
303 
304 		kfree(logical);
305 	}
306 	return 0;
307 }
308 
309 static struct btrfs_caching_control *
310 get_caching_control(struct btrfs_block_group_cache *cache)
311 {
312 	struct btrfs_caching_control *ctl;
313 
314 	spin_lock(&cache->lock);
315 	if (cache->cached != BTRFS_CACHE_STARTED) {
316 		spin_unlock(&cache->lock);
317 		return NULL;
318 	}
319 
320 	/* We're loading it the fast way, so we don't have a caching_ctl. */
321 	if (!cache->caching_ctl) {
322 		spin_unlock(&cache->lock);
323 		return NULL;
324 	}
325 
326 	ctl = cache->caching_ctl;
327 	atomic_inc(&ctl->count);
328 	spin_unlock(&cache->lock);
329 	return ctl;
330 }
331 
332 static void put_caching_control(struct btrfs_caching_control *ctl)
333 {
334 	if (atomic_dec_and_test(&ctl->count))
335 		kfree(ctl);
336 }
337 
338 /*
339  * this is only called by cache_block_group, since we could have freed extents
340  * we need to check the pinned_extents for any extents that can't be used yet
341  * since their free space will be released as soon as the transaction commits.
342  */
343 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
344 			      struct btrfs_fs_info *info, u64 start, u64 end)
345 {
346 	u64 extent_start, extent_end, size, total_added = 0;
347 	int ret;
348 
349 	while (start < end) {
350 		ret = find_first_extent_bit(info->pinned_extents, start,
351 					    &extent_start, &extent_end,
352 					    EXTENT_DIRTY | EXTENT_UPTODATE,
353 					    NULL);
354 		if (ret)
355 			break;
356 
357 		if (extent_start <= start) {
358 			start = extent_end + 1;
359 		} else if (extent_start > start && extent_start < end) {
360 			size = extent_start - start;
361 			total_added += size;
362 			ret = btrfs_add_free_space(block_group, start,
363 						   size);
364 			BUG_ON(ret); /* -ENOMEM or logic error */
365 			start = extent_end + 1;
366 		} else {
367 			break;
368 		}
369 	}
370 
371 	if (start < end) {
372 		size = end - start;
373 		total_added += size;
374 		ret = btrfs_add_free_space(block_group, start, size);
375 		BUG_ON(ret); /* -ENOMEM or logic error */
376 	}
377 
378 	return total_added;
379 }
380 
381 static noinline void caching_thread(struct btrfs_work *work)
382 {
383 	struct btrfs_block_group_cache *block_group;
384 	struct btrfs_fs_info *fs_info;
385 	struct btrfs_caching_control *caching_ctl;
386 	struct btrfs_root *extent_root;
387 	struct btrfs_path *path;
388 	struct extent_buffer *leaf;
389 	struct btrfs_key key;
390 	u64 total_found = 0;
391 	u64 last = 0;
392 	u32 nritems;
393 	int ret = -ENOMEM;
394 
395 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
396 	block_group = caching_ctl->block_group;
397 	fs_info = block_group->fs_info;
398 	extent_root = fs_info->extent_root;
399 
400 	path = btrfs_alloc_path();
401 	if (!path)
402 		goto out;
403 
404 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
405 
406 	/*
407 	 * We don't want to deadlock with somebody trying to allocate a new
408 	 * extent for the extent root while also trying to search the extent
409 	 * root to add free space.  So we skip locking and search the commit
410 	 * root, since its read-only
411 	 */
412 	path->skip_locking = 1;
413 	path->search_commit_root = 1;
414 	path->reada = 1;
415 
416 	key.objectid = last;
417 	key.offset = 0;
418 	key.type = BTRFS_EXTENT_ITEM_KEY;
419 again:
420 	mutex_lock(&caching_ctl->mutex);
421 	/* need to make sure the commit_root doesn't disappear */
422 	down_read(&fs_info->extent_commit_sem);
423 
424 next:
425 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
426 	if (ret < 0)
427 		goto err;
428 
429 	leaf = path->nodes[0];
430 	nritems = btrfs_header_nritems(leaf);
431 
432 	while (1) {
433 		if (btrfs_fs_closing(fs_info) > 1) {
434 			last = (u64)-1;
435 			break;
436 		}
437 
438 		if (path->slots[0] < nritems) {
439 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
440 		} else {
441 			ret = find_next_key(path, 0, &key);
442 			if (ret)
443 				break;
444 
445 			if (need_resched()) {
446 				caching_ctl->progress = last;
447 				btrfs_release_path(path);
448 				up_read(&fs_info->extent_commit_sem);
449 				mutex_unlock(&caching_ctl->mutex);
450 				cond_resched();
451 				goto again;
452 			}
453 
454 			ret = btrfs_next_leaf(extent_root, path);
455 			if (ret < 0)
456 				goto err;
457 			if (ret)
458 				break;
459 			leaf = path->nodes[0];
460 			nritems = btrfs_header_nritems(leaf);
461 			continue;
462 		}
463 
464 		if (key.objectid < last) {
465 			key.objectid = last;
466 			key.offset = 0;
467 			key.type = BTRFS_EXTENT_ITEM_KEY;
468 
469 			caching_ctl->progress = last;
470 			btrfs_release_path(path);
471 			goto next;
472 		}
473 
474 		if (key.objectid < block_group->key.objectid) {
475 			path->slots[0]++;
476 			continue;
477 		}
478 
479 		if (key.objectid >= block_group->key.objectid +
480 		    block_group->key.offset)
481 			break;
482 
483 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
484 		    key.type == BTRFS_METADATA_ITEM_KEY) {
485 			total_found += add_new_free_space(block_group,
486 							  fs_info, last,
487 							  key.objectid);
488 			if (key.type == BTRFS_METADATA_ITEM_KEY)
489 				last = key.objectid +
490 					fs_info->tree_root->leafsize;
491 			else
492 				last = key.objectid + key.offset;
493 
494 			if (total_found > (1024 * 1024 * 2)) {
495 				total_found = 0;
496 				wake_up(&caching_ctl->wait);
497 			}
498 		}
499 		path->slots[0]++;
500 	}
501 	ret = 0;
502 
503 	total_found += add_new_free_space(block_group, fs_info, last,
504 					  block_group->key.objectid +
505 					  block_group->key.offset);
506 	caching_ctl->progress = (u64)-1;
507 
508 	spin_lock(&block_group->lock);
509 	block_group->caching_ctl = NULL;
510 	block_group->cached = BTRFS_CACHE_FINISHED;
511 	spin_unlock(&block_group->lock);
512 
513 err:
514 	btrfs_free_path(path);
515 	up_read(&fs_info->extent_commit_sem);
516 
517 	free_excluded_extents(extent_root, block_group);
518 
519 	mutex_unlock(&caching_ctl->mutex);
520 out:
521 	if (ret) {
522 		spin_lock(&block_group->lock);
523 		block_group->caching_ctl = NULL;
524 		block_group->cached = BTRFS_CACHE_ERROR;
525 		spin_unlock(&block_group->lock);
526 	}
527 	wake_up(&caching_ctl->wait);
528 
529 	put_caching_control(caching_ctl);
530 	btrfs_put_block_group(block_group);
531 }
532 
533 static int cache_block_group(struct btrfs_block_group_cache *cache,
534 			     int load_cache_only)
535 {
536 	DEFINE_WAIT(wait);
537 	struct btrfs_fs_info *fs_info = cache->fs_info;
538 	struct btrfs_caching_control *caching_ctl;
539 	int ret = 0;
540 
541 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
542 	if (!caching_ctl)
543 		return -ENOMEM;
544 
545 	INIT_LIST_HEAD(&caching_ctl->list);
546 	mutex_init(&caching_ctl->mutex);
547 	init_waitqueue_head(&caching_ctl->wait);
548 	caching_ctl->block_group = cache;
549 	caching_ctl->progress = cache->key.objectid;
550 	atomic_set(&caching_ctl->count, 1);
551 	caching_ctl->work.func = caching_thread;
552 
553 	spin_lock(&cache->lock);
554 	/*
555 	 * This should be a rare occasion, but this could happen I think in the
556 	 * case where one thread starts to load the space cache info, and then
557 	 * some other thread starts a transaction commit which tries to do an
558 	 * allocation while the other thread is still loading the space cache
559 	 * info.  The previous loop should have kept us from choosing this block
560 	 * group, but if we've moved to the state where we will wait on caching
561 	 * block groups we need to first check if we're doing a fast load here,
562 	 * so we can wait for it to finish, otherwise we could end up allocating
563 	 * from a block group who's cache gets evicted for one reason or
564 	 * another.
565 	 */
566 	while (cache->cached == BTRFS_CACHE_FAST) {
567 		struct btrfs_caching_control *ctl;
568 
569 		ctl = cache->caching_ctl;
570 		atomic_inc(&ctl->count);
571 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
572 		spin_unlock(&cache->lock);
573 
574 		schedule();
575 
576 		finish_wait(&ctl->wait, &wait);
577 		put_caching_control(ctl);
578 		spin_lock(&cache->lock);
579 	}
580 
581 	if (cache->cached != BTRFS_CACHE_NO) {
582 		spin_unlock(&cache->lock);
583 		kfree(caching_ctl);
584 		return 0;
585 	}
586 	WARN_ON(cache->caching_ctl);
587 	cache->caching_ctl = caching_ctl;
588 	cache->cached = BTRFS_CACHE_FAST;
589 	spin_unlock(&cache->lock);
590 
591 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
592 		ret = load_free_space_cache(fs_info, cache);
593 
594 		spin_lock(&cache->lock);
595 		if (ret == 1) {
596 			cache->caching_ctl = NULL;
597 			cache->cached = BTRFS_CACHE_FINISHED;
598 			cache->last_byte_to_unpin = (u64)-1;
599 		} else {
600 			if (load_cache_only) {
601 				cache->caching_ctl = NULL;
602 				cache->cached = BTRFS_CACHE_NO;
603 			} else {
604 				cache->cached = BTRFS_CACHE_STARTED;
605 			}
606 		}
607 		spin_unlock(&cache->lock);
608 		wake_up(&caching_ctl->wait);
609 		if (ret == 1) {
610 			put_caching_control(caching_ctl);
611 			free_excluded_extents(fs_info->extent_root, cache);
612 			return 0;
613 		}
614 	} else {
615 		/*
616 		 * We are not going to do the fast caching, set cached to the
617 		 * appropriate value and wakeup any waiters.
618 		 */
619 		spin_lock(&cache->lock);
620 		if (load_cache_only) {
621 			cache->caching_ctl = NULL;
622 			cache->cached = BTRFS_CACHE_NO;
623 		} else {
624 			cache->cached = BTRFS_CACHE_STARTED;
625 		}
626 		spin_unlock(&cache->lock);
627 		wake_up(&caching_ctl->wait);
628 	}
629 
630 	if (load_cache_only) {
631 		put_caching_control(caching_ctl);
632 		return 0;
633 	}
634 
635 	down_write(&fs_info->extent_commit_sem);
636 	atomic_inc(&caching_ctl->count);
637 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
638 	up_write(&fs_info->extent_commit_sem);
639 
640 	btrfs_get_block_group(cache);
641 
642 	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
643 
644 	return ret;
645 }
646 
647 /*
648  * return the block group that starts at or after bytenr
649  */
650 static struct btrfs_block_group_cache *
651 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
652 {
653 	struct btrfs_block_group_cache *cache;
654 
655 	cache = block_group_cache_tree_search(info, bytenr, 0);
656 
657 	return cache;
658 }
659 
660 /*
661  * return the block group that contains the given bytenr
662  */
663 struct btrfs_block_group_cache *btrfs_lookup_block_group(
664 						 struct btrfs_fs_info *info,
665 						 u64 bytenr)
666 {
667 	struct btrfs_block_group_cache *cache;
668 
669 	cache = block_group_cache_tree_search(info, bytenr, 1);
670 
671 	return cache;
672 }
673 
674 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
675 						  u64 flags)
676 {
677 	struct list_head *head = &info->space_info;
678 	struct btrfs_space_info *found;
679 
680 	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
681 
682 	rcu_read_lock();
683 	list_for_each_entry_rcu(found, head, list) {
684 		if (found->flags & flags) {
685 			rcu_read_unlock();
686 			return found;
687 		}
688 	}
689 	rcu_read_unlock();
690 	return NULL;
691 }
692 
693 /*
694  * after adding space to the filesystem, we need to clear the full flags
695  * on all the space infos.
696  */
697 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
698 {
699 	struct list_head *head = &info->space_info;
700 	struct btrfs_space_info *found;
701 
702 	rcu_read_lock();
703 	list_for_each_entry_rcu(found, head, list)
704 		found->full = 0;
705 	rcu_read_unlock();
706 }
707 
708 /* simple helper to search for an existing extent at a given offset */
709 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
710 {
711 	int ret;
712 	struct btrfs_key key;
713 	struct btrfs_path *path;
714 
715 	path = btrfs_alloc_path();
716 	if (!path)
717 		return -ENOMEM;
718 
719 	key.objectid = start;
720 	key.offset = len;
721 	key.type = BTRFS_EXTENT_ITEM_KEY;
722 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
723 				0, 0);
724 	if (ret > 0) {
725 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
726 		if (key.objectid == start &&
727 		    key.type == BTRFS_METADATA_ITEM_KEY)
728 			ret = 0;
729 	}
730 	btrfs_free_path(path);
731 	return ret;
732 }
733 
734 /*
735  * helper function to lookup reference count and flags of a tree block.
736  *
737  * the head node for delayed ref is used to store the sum of all the
738  * reference count modifications queued up in the rbtree. the head
739  * node may also store the extent flags to set. This way you can check
740  * to see what the reference count and extent flags would be if all of
741  * the delayed refs are not processed.
742  */
743 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
744 			     struct btrfs_root *root, u64 bytenr,
745 			     u64 offset, int metadata, u64 *refs, u64 *flags)
746 {
747 	struct btrfs_delayed_ref_head *head;
748 	struct btrfs_delayed_ref_root *delayed_refs;
749 	struct btrfs_path *path;
750 	struct btrfs_extent_item *ei;
751 	struct extent_buffer *leaf;
752 	struct btrfs_key key;
753 	u32 item_size;
754 	u64 num_refs;
755 	u64 extent_flags;
756 	int ret;
757 
758 	/*
759 	 * If we don't have skinny metadata, don't bother doing anything
760 	 * different
761 	 */
762 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
763 		offset = root->leafsize;
764 		metadata = 0;
765 	}
766 
767 	path = btrfs_alloc_path();
768 	if (!path)
769 		return -ENOMEM;
770 
771 	if (metadata) {
772 		key.objectid = bytenr;
773 		key.type = BTRFS_METADATA_ITEM_KEY;
774 		key.offset = offset;
775 	} else {
776 		key.objectid = bytenr;
777 		key.type = BTRFS_EXTENT_ITEM_KEY;
778 		key.offset = offset;
779 	}
780 
781 	if (!trans) {
782 		path->skip_locking = 1;
783 		path->search_commit_root = 1;
784 	}
785 again:
786 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
787 				&key, path, 0, 0);
788 	if (ret < 0)
789 		goto out_free;
790 
791 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
792 		metadata = 0;
793 		if (path->slots[0]) {
794 			path->slots[0]--;
795 			btrfs_item_key_to_cpu(path->nodes[0], &key,
796 					      path->slots[0]);
797 			if (key.objectid == bytenr &&
798 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
799 			    key.offset == root->leafsize)
800 				ret = 0;
801 		}
802 		if (ret) {
803 			key.objectid = bytenr;
804 			key.type = BTRFS_EXTENT_ITEM_KEY;
805 			key.offset = root->leafsize;
806 			btrfs_release_path(path);
807 			goto again;
808 		}
809 	}
810 
811 	if (ret == 0) {
812 		leaf = path->nodes[0];
813 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
814 		if (item_size >= sizeof(*ei)) {
815 			ei = btrfs_item_ptr(leaf, path->slots[0],
816 					    struct btrfs_extent_item);
817 			num_refs = btrfs_extent_refs(leaf, ei);
818 			extent_flags = btrfs_extent_flags(leaf, ei);
819 		} else {
820 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
821 			struct btrfs_extent_item_v0 *ei0;
822 			BUG_ON(item_size != sizeof(*ei0));
823 			ei0 = btrfs_item_ptr(leaf, path->slots[0],
824 					     struct btrfs_extent_item_v0);
825 			num_refs = btrfs_extent_refs_v0(leaf, ei0);
826 			/* FIXME: this isn't correct for data */
827 			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
828 #else
829 			BUG();
830 #endif
831 		}
832 		BUG_ON(num_refs == 0);
833 	} else {
834 		num_refs = 0;
835 		extent_flags = 0;
836 		ret = 0;
837 	}
838 
839 	if (!trans)
840 		goto out;
841 
842 	delayed_refs = &trans->transaction->delayed_refs;
843 	spin_lock(&delayed_refs->lock);
844 	head = btrfs_find_delayed_ref_head(trans, bytenr);
845 	if (head) {
846 		if (!mutex_trylock(&head->mutex)) {
847 			atomic_inc(&head->node.refs);
848 			spin_unlock(&delayed_refs->lock);
849 
850 			btrfs_release_path(path);
851 
852 			/*
853 			 * Mutex was contended, block until it's released and try
854 			 * again
855 			 */
856 			mutex_lock(&head->mutex);
857 			mutex_unlock(&head->mutex);
858 			btrfs_put_delayed_ref(&head->node);
859 			goto again;
860 		}
861 		if (head->extent_op && head->extent_op->update_flags)
862 			extent_flags |= head->extent_op->flags_to_set;
863 		else
864 			BUG_ON(num_refs == 0);
865 
866 		num_refs += head->node.ref_mod;
867 		mutex_unlock(&head->mutex);
868 	}
869 	spin_unlock(&delayed_refs->lock);
870 out:
871 	WARN_ON(num_refs == 0);
872 	if (refs)
873 		*refs = num_refs;
874 	if (flags)
875 		*flags = extent_flags;
876 out_free:
877 	btrfs_free_path(path);
878 	return ret;
879 }
880 
881 /*
882  * Back reference rules.  Back refs have three main goals:
883  *
884  * 1) differentiate between all holders of references to an extent so that
885  *    when a reference is dropped we can make sure it was a valid reference
886  *    before freeing the extent.
887  *
888  * 2) Provide enough information to quickly find the holders of an extent
889  *    if we notice a given block is corrupted or bad.
890  *
891  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
892  *    maintenance.  This is actually the same as #2, but with a slightly
893  *    different use case.
894  *
895  * There are two kinds of back refs. The implicit back refs is optimized
896  * for pointers in non-shared tree blocks. For a given pointer in a block,
897  * back refs of this kind provide information about the block's owner tree
898  * and the pointer's key. These information allow us to find the block by
899  * b-tree searching. The full back refs is for pointers in tree blocks not
900  * referenced by their owner trees. The location of tree block is recorded
901  * in the back refs. Actually the full back refs is generic, and can be
902  * used in all cases the implicit back refs is used. The major shortcoming
903  * of the full back refs is its overhead. Every time a tree block gets
904  * COWed, we have to update back refs entry for all pointers in it.
905  *
906  * For a newly allocated tree block, we use implicit back refs for
907  * pointers in it. This means most tree related operations only involve
908  * implicit back refs. For a tree block created in old transaction, the
909  * only way to drop a reference to it is COW it. So we can detect the
910  * event that tree block loses its owner tree's reference and do the
911  * back refs conversion.
912  *
913  * When a tree block is COW'd through a tree, there are four cases:
914  *
915  * The reference count of the block is one and the tree is the block's
916  * owner tree. Nothing to do in this case.
917  *
918  * The reference count of the block is one and the tree is not the
919  * block's owner tree. In this case, full back refs is used for pointers
920  * in the block. Remove these full back refs, add implicit back refs for
921  * every pointers in the new block.
922  *
923  * The reference count of the block is greater than one and the tree is
924  * the block's owner tree. In this case, implicit back refs is used for
925  * pointers in the block. Add full back refs for every pointers in the
926  * block, increase lower level extents' reference counts. The original
927  * implicit back refs are entailed to the new block.
928  *
929  * The reference count of the block is greater than one and the tree is
930  * not the block's owner tree. Add implicit back refs for every pointer in
931  * the new block, increase lower level extents' reference count.
932  *
933  * Back Reference Key composing:
934  *
935  * The key objectid corresponds to the first byte in the extent,
936  * The key type is used to differentiate between types of back refs.
937  * There are different meanings of the key offset for different types
938  * of back refs.
939  *
940  * File extents can be referenced by:
941  *
942  * - multiple snapshots, subvolumes, or different generations in one subvol
943  * - different files inside a single subvolume
944  * - different offsets inside a file (bookend extents in file.c)
945  *
946  * The extent ref structure for the implicit back refs has fields for:
947  *
948  * - Objectid of the subvolume root
949  * - objectid of the file holding the reference
950  * - original offset in the file
951  * - how many bookend extents
952  *
953  * The key offset for the implicit back refs is hash of the first
954  * three fields.
955  *
956  * The extent ref structure for the full back refs has field for:
957  *
958  * - number of pointers in the tree leaf
959  *
960  * The key offset for the implicit back refs is the first byte of
961  * the tree leaf
962  *
963  * When a file extent is allocated, The implicit back refs is used.
964  * the fields are filled in:
965  *
966  *     (root_key.objectid, inode objectid, offset in file, 1)
967  *
968  * When a file extent is removed file truncation, we find the
969  * corresponding implicit back refs and check the following fields:
970  *
971  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
972  *
973  * Btree extents can be referenced by:
974  *
975  * - Different subvolumes
976  *
977  * Both the implicit back refs and the full back refs for tree blocks
978  * only consist of key. The key offset for the implicit back refs is
979  * objectid of block's owner tree. The key offset for the full back refs
980  * is the first byte of parent block.
981  *
982  * When implicit back refs is used, information about the lowest key and
983  * level of the tree block are required. These information are stored in
984  * tree block info structure.
985  */
986 
987 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
988 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
989 				  struct btrfs_root *root,
990 				  struct btrfs_path *path,
991 				  u64 owner, u32 extra_size)
992 {
993 	struct btrfs_extent_item *item;
994 	struct btrfs_extent_item_v0 *ei0;
995 	struct btrfs_extent_ref_v0 *ref0;
996 	struct btrfs_tree_block_info *bi;
997 	struct extent_buffer *leaf;
998 	struct btrfs_key key;
999 	struct btrfs_key found_key;
1000 	u32 new_size = sizeof(*item);
1001 	u64 refs;
1002 	int ret;
1003 
1004 	leaf = path->nodes[0];
1005 	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1006 
1007 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1008 	ei0 = btrfs_item_ptr(leaf, path->slots[0],
1009 			     struct btrfs_extent_item_v0);
1010 	refs = btrfs_extent_refs_v0(leaf, ei0);
1011 
1012 	if (owner == (u64)-1) {
1013 		while (1) {
1014 			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1015 				ret = btrfs_next_leaf(root, path);
1016 				if (ret < 0)
1017 					return ret;
1018 				BUG_ON(ret > 0); /* Corruption */
1019 				leaf = path->nodes[0];
1020 			}
1021 			btrfs_item_key_to_cpu(leaf, &found_key,
1022 					      path->slots[0]);
1023 			BUG_ON(key.objectid != found_key.objectid);
1024 			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1025 				path->slots[0]++;
1026 				continue;
1027 			}
1028 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1029 					      struct btrfs_extent_ref_v0);
1030 			owner = btrfs_ref_objectid_v0(leaf, ref0);
1031 			break;
1032 		}
1033 	}
1034 	btrfs_release_path(path);
1035 
1036 	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1037 		new_size += sizeof(*bi);
1038 
1039 	new_size -= sizeof(*ei0);
1040 	ret = btrfs_search_slot(trans, root, &key, path,
1041 				new_size + extra_size, 1);
1042 	if (ret < 0)
1043 		return ret;
1044 	BUG_ON(ret); /* Corruption */
1045 
1046 	btrfs_extend_item(root, path, new_size);
1047 
1048 	leaf = path->nodes[0];
1049 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1050 	btrfs_set_extent_refs(leaf, item, refs);
1051 	/* FIXME: get real generation */
1052 	btrfs_set_extent_generation(leaf, item, 0);
1053 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1054 		btrfs_set_extent_flags(leaf, item,
1055 				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1056 				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1057 		bi = (struct btrfs_tree_block_info *)(item + 1);
1058 		/* FIXME: get first key of the block */
1059 		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1060 		btrfs_set_tree_block_level(leaf, bi, (int)owner);
1061 	} else {
1062 		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1063 	}
1064 	btrfs_mark_buffer_dirty(leaf);
1065 	return 0;
1066 }
1067 #endif
1068 
1069 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1070 {
1071 	u32 high_crc = ~(u32)0;
1072 	u32 low_crc = ~(u32)0;
1073 	__le64 lenum;
1074 
1075 	lenum = cpu_to_le64(root_objectid);
1076 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1077 	lenum = cpu_to_le64(owner);
1078 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1079 	lenum = cpu_to_le64(offset);
1080 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1081 
1082 	return ((u64)high_crc << 31) ^ (u64)low_crc;
1083 }
1084 
1085 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1086 				     struct btrfs_extent_data_ref *ref)
1087 {
1088 	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1089 				    btrfs_extent_data_ref_objectid(leaf, ref),
1090 				    btrfs_extent_data_ref_offset(leaf, ref));
1091 }
1092 
1093 static int match_extent_data_ref(struct extent_buffer *leaf,
1094 				 struct btrfs_extent_data_ref *ref,
1095 				 u64 root_objectid, u64 owner, u64 offset)
1096 {
1097 	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1098 	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1099 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
1100 		return 0;
1101 	return 1;
1102 }
1103 
1104 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1105 					   struct btrfs_root *root,
1106 					   struct btrfs_path *path,
1107 					   u64 bytenr, u64 parent,
1108 					   u64 root_objectid,
1109 					   u64 owner, u64 offset)
1110 {
1111 	struct btrfs_key key;
1112 	struct btrfs_extent_data_ref *ref;
1113 	struct extent_buffer *leaf;
1114 	u32 nritems;
1115 	int ret;
1116 	int recow;
1117 	int err = -ENOENT;
1118 
1119 	key.objectid = bytenr;
1120 	if (parent) {
1121 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1122 		key.offset = parent;
1123 	} else {
1124 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1125 		key.offset = hash_extent_data_ref(root_objectid,
1126 						  owner, offset);
1127 	}
1128 again:
1129 	recow = 0;
1130 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1131 	if (ret < 0) {
1132 		err = ret;
1133 		goto fail;
1134 	}
1135 
1136 	if (parent) {
1137 		if (!ret)
1138 			return 0;
1139 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1140 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1141 		btrfs_release_path(path);
1142 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1143 		if (ret < 0) {
1144 			err = ret;
1145 			goto fail;
1146 		}
1147 		if (!ret)
1148 			return 0;
1149 #endif
1150 		goto fail;
1151 	}
1152 
1153 	leaf = path->nodes[0];
1154 	nritems = btrfs_header_nritems(leaf);
1155 	while (1) {
1156 		if (path->slots[0] >= nritems) {
1157 			ret = btrfs_next_leaf(root, path);
1158 			if (ret < 0)
1159 				err = ret;
1160 			if (ret)
1161 				goto fail;
1162 
1163 			leaf = path->nodes[0];
1164 			nritems = btrfs_header_nritems(leaf);
1165 			recow = 1;
1166 		}
1167 
1168 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1169 		if (key.objectid != bytenr ||
1170 		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1171 			goto fail;
1172 
1173 		ref = btrfs_item_ptr(leaf, path->slots[0],
1174 				     struct btrfs_extent_data_ref);
1175 
1176 		if (match_extent_data_ref(leaf, ref, root_objectid,
1177 					  owner, offset)) {
1178 			if (recow) {
1179 				btrfs_release_path(path);
1180 				goto again;
1181 			}
1182 			err = 0;
1183 			break;
1184 		}
1185 		path->slots[0]++;
1186 	}
1187 fail:
1188 	return err;
1189 }
1190 
1191 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1192 					   struct btrfs_root *root,
1193 					   struct btrfs_path *path,
1194 					   u64 bytenr, u64 parent,
1195 					   u64 root_objectid, u64 owner,
1196 					   u64 offset, int refs_to_add)
1197 {
1198 	struct btrfs_key key;
1199 	struct extent_buffer *leaf;
1200 	u32 size;
1201 	u32 num_refs;
1202 	int ret;
1203 
1204 	key.objectid = bytenr;
1205 	if (parent) {
1206 		key.type = BTRFS_SHARED_DATA_REF_KEY;
1207 		key.offset = parent;
1208 		size = sizeof(struct btrfs_shared_data_ref);
1209 	} else {
1210 		key.type = BTRFS_EXTENT_DATA_REF_KEY;
1211 		key.offset = hash_extent_data_ref(root_objectid,
1212 						  owner, offset);
1213 		size = sizeof(struct btrfs_extent_data_ref);
1214 	}
1215 
1216 	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1217 	if (ret && ret != -EEXIST)
1218 		goto fail;
1219 
1220 	leaf = path->nodes[0];
1221 	if (parent) {
1222 		struct btrfs_shared_data_ref *ref;
1223 		ref = btrfs_item_ptr(leaf, path->slots[0],
1224 				     struct btrfs_shared_data_ref);
1225 		if (ret == 0) {
1226 			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1227 		} else {
1228 			num_refs = btrfs_shared_data_ref_count(leaf, ref);
1229 			num_refs += refs_to_add;
1230 			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1231 		}
1232 	} else {
1233 		struct btrfs_extent_data_ref *ref;
1234 		while (ret == -EEXIST) {
1235 			ref = btrfs_item_ptr(leaf, path->slots[0],
1236 					     struct btrfs_extent_data_ref);
1237 			if (match_extent_data_ref(leaf, ref, root_objectid,
1238 						  owner, offset))
1239 				break;
1240 			btrfs_release_path(path);
1241 			key.offset++;
1242 			ret = btrfs_insert_empty_item(trans, root, path, &key,
1243 						      size);
1244 			if (ret && ret != -EEXIST)
1245 				goto fail;
1246 
1247 			leaf = path->nodes[0];
1248 		}
1249 		ref = btrfs_item_ptr(leaf, path->slots[0],
1250 				     struct btrfs_extent_data_ref);
1251 		if (ret == 0) {
1252 			btrfs_set_extent_data_ref_root(leaf, ref,
1253 						       root_objectid);
1254 			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1255 			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1256 			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1257 		} else {
1258 			num_refs = btrfs_extent_data_ref_count(leaf, ref);
1259 			num_refs += refs_to_add;
1260 			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1261 		}
1262 	}
1263 	btrfs_mark_buffer_dirty(leaf);
1264 	ret = 0;
1265 fail:
1266 	btrfs_release_path(path);
1267 	return ret;
1268 }
1269 
1270 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1271 					   struct btrfs_root *root,
1272 					   struct btrfs_path *path,
1273 					   int refs_to_drop)
1274 {
1275 	struct btrfs_key key;
1276 	struct btrfs_extent_data_ref *ref1 = NULL;
1277 	struct btrfs_shared_data_ref *ref2 = NULL;
1278 	struct extent_buffer *leaf;
1279 	u32 num_refs = 0;
1280 	int ret = 0;
1281 
1282 	leaf = path->nodes[0];
1283 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1284 
1285 	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1286 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1287 				      struct btrfs_extent_data_ref);
1288 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1289 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1290 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1291 				      struct btrfs_shared_data_ref);
1292 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1293 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1294 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1295 		struct btrfs_extent_ref_v0 *ref0;
1296 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1297 				      struct btrfs_extent_ref_v0);
1298 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1299 #endif
1300 	} else {
1301 		BUG();
1302 	}
1303 
1304 	BUG_ON(num_refs < refs_to_drop);
1305 	num_refs -= refs_to_drop;
1306 
1307 	if (num_refs == 0) {
1308 		ret = btrfs_del_item(trans, root, path);
1309 	} else {
1310 		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1311 			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1312 		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1313 			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1314 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1315 		else {
1316 			struct btrfs_extent_ref_v0 *ref0;
1317 			ref0 = btrfs_item_ptr(leaf, path->slots[0],
1318 					struct btrfs_extent_ref_v0);
1319 			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1320 		}
1321 #endif
1322 		btrfs_mark_buffer_dirty(leaf);
1323 	}
1324 	return ret;
1325 }
1326 
1327 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1328 					  struct btrfs_path *path,
1329 					  struct btrfs_extent_inline_ref *iref)
1330 {
1331 	struct btrfs_key key;
1332 	struct extent_buffer *leaf;
1333 	struct btrfs_extent_data_ref *ref1;
1334 	struct btrfs_shared_data_ref *ref2;
1335 	u32 num_refs = 0;
1336 
1337 	leaf = path->nodes[0];
1338 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1339 	if (iref) {
1340 		if (btrfs_extent_inline_ref_type(leaf, iref) ==
1341 		    BTRFS_EXTENT_DATA_REF_KEY) {
1342 			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1343 			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1344 		} else {
1345 			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1346 			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1347 		}
1348 	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1349 		ref1 = btrfs_item_ptr(leaf, path->slots[0],
1350 				      struct btrfs_extent_data_ref);
1351 		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1352 	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1353 		ref2 = btrfs_item_ptr(leaf, path->slots[0],
1354 				      struct btrfs_shared_data_ref);
1355 		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1356 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1357 	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1358 		struct btrfs_extent_ref_v0 *ref0;
1359 		ref0 = btrfs_item_ptr(leaf, path->slots[0],
1360 				      struct btrfs_extent_ref_v0);
1361 		num_refs = btrfs_ref_count_v0(leaf, ref0);
1362 #endif
1363 	} else {
1364 		WARN_ON(1);
1365 	}
1366 	return num_refs;
1367 }
1368 
1369 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1370 					  struct btrfs_root *root,
1371 					  struct btrfs_path *path,
1372 					  u64 bytenr, u64 parent,
1373 					  u64 root_objectid)
1374 {
1375 	struct btrfs_key key;
1376 	int ret;
1377 
1378 	key.objectid = bytenr;
1379 	if (parent) {
1380 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1381 		key.offset = parent;
1382 	} else {
1383 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1384 		key.offset = root_objectid;
1385 	}
1386 
1387 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1388 	if (ret > 0)
1389 		ret = -ENOENT;
1390 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1391 	if (ret == -ENOENT && parent) {
1392 		btrfs_release_path(path);
1393 		key.type = BTRFS_EXTENT_REF_V0_KEY;
1394 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1395 		if (ret > 0)
1396 			ret = -ENOENT;
1397 	}
1398 #endif
1399 	return ret;
1400 }
1401 
1402 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1403 					  struct btrfs_root *root,
1404 					  struct btrfs_path *path,
1405 					  u64 bytenr, u64 parent,
1406 					  u64 root_objectid)
1407 {
1408 	struct btrfs_key key;
1409 	int ret;
1410 
1411 	key.objectid = bytenr;
1412 	if (parent) {
1413 		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1414 		key.offset = parent;
1415 	} else {
1416 		key.type = BTRFS_TREE_BLOCK_REF_KEY;
1417 		key.offset = root_objectid;
1418 	}
1419 
1420 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1421 	btrfs_release_path(path);
1422 	return ret;
1423 }
1424 
1425 static inline int extent_ref_type(u64 parent, u64 owner)
1426 {
1427 	int type;
1428 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1429 		if (parent > 0)
1430 			type = BTRFS_SHARED_BLOCK_REF_KEY;
1431 		else
1432 			type = BTRFS_TREE_BLOCK_REF_KEY;
1433 	} else {
1434 		if (parent > 0)
1435 			type = BTRFS_SHARED_DATA_REF_KEY;
1436 		else
1437 			type = BTRFS_EXTENT_DATA_REF_KEY;
1438 	}
1439 	return type;
1440 }
1441 
1442 static int find_next_key(struct btrfs_path *path, int level,
1443 			 struct btrfs_key *key)
1444 
1445 {
1446 	for (; level < BTRFS_MAX_LEVEL; level++) {
1447 		if (!path->nodes[level])
1448 			break;
1449 		if (path->slots[level] + 1 >=
1450 		    btrfs_header_nritems(path->nodes[level]))
1451 			continue;
1452 		if (level == 0)
1453 			btrfs_item_key_to_cpu(path->nodes[level], key,
1454 					      path->slots[level] + 1);
1455 		else
1456 			btrfs_node_key_to_cpu(path->nodes[level], key,
1457 					      path->slots[level] + 1);
1458 		return 0;
1459 	}
1460 	return 1;
1461 }
1462 
1463 /*
1464  * look for inline back ref. if back ref is found, *ref_ret is set
1465  * to the address of inline back ref, and 0 is returned.
1466  *
1467  * if back ref isn't found, *ref_ret is set to the address where it
1468  * should be inserted, and -ENOENT is returned.
1469  *
1470  * if insert is true and there are too many inline back refs, the path
1471  * points to the extent item, and -EAGAIN is returned.
1472  *
1473  * NOTE: inline back refs are ordered in the same way that back ref
1474  *	 items in the tree are ordered.
1475  */
1476 static noinline_for_stack
1477 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1478 				 struct btrfs_root *root,
1479 				 struct btrfs_path *path,
1480 				 struct btrfs_extent_inline_ref **ref_ret,
1481 				 u64 bytenr, u64 num_bytes,
1482 				 u64 parent, u64 root_objectid,
1483 				 u64 owner, u64 offset, int insert)
1484 {
1485 	struct btrfs_key key;
1486 	struct extent_buffer *leaf;
1487 	struct btrfs_extent_item *ei;
1488 	struct btrfs_extent_inline_ref *iref;
1489 	u64 flags;
1490 	u64 item_size;
1491 	unsigned long ptr;
1492 	unsigned long end;
1493 	int extra_size;
1494 	int type;
1495 	int want;
1496 	int ret;
1497 	int err = 0;
1498 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1499 						 SKINNY_METADATA);
1500 
1501 	key.objectid = bytenr;
1502 	key.type = BTRFS_EXTENT_ITEM_KEY;
1503 	key.offset = num_bytes;
1504 
1505 	want = extent_ref_type(parent, owner);
1506 	if (insert) {
1507 		extra_size = btrfs_extent_inline_ref_size(want);
1508 		path->keep_locks = 1;
1509 	} else
1510 		extra_size = -1;
1511 
1512 	/*
1513 	 * Owner is our parent level, so we can just add one to get the level
1514 	 * for the block we are interested in.
1515 	 */
1516 	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1517 		key.type = BTRFS_METADATA_ITEM_KEY;
1518 		key.offset = owner;
1519 	}
1520 
1521 again:
1522 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1523 	if (ret < 0) {
1524 		err = ret;
1525 		goto out;
1526 	}
1527 
1528 	/*
1529 	 * We may be a newly converted file system which still has the old fat
1530 	 * extent entries for metadata, so try and see if we have one of those.
1531 	 */
1532 	if (ret > 0 && skinny_metadata) {
1533 		skinny_metadata = false;
1534 		if (path->slots[0]) {
1535 			path->slots[0]--;
1536 			btrfs_item_key_to_cpu(path->nodes[0], &key,
1537 					      path->slots[0]);
1538 			if (key.objectid == bytenr &&
1539 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
1540 			    key.offset == num_bytes)
1541 				ret = 0;
1542 		}
1543 		if (ret) {
1544 			key.type = BTRFS_EXTENT_ITEM_KEY;
1545 			key.offset = num_bytes;
1546 			btrfs_release_path(path);
1547 			goto again;
1548 		}
1549 	}
1550 
1551 	if (ret && !insert) {
1552 		err = -ENOENT;
1553 		goto out;
1554 	} else if (ret) {
1555 		err = -EIO;
1556 		WARN_ON(1);
1557 		goto out;
1558 	}
1559 
1560 	leaf = path->nodes[0];
1561 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1562 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1563 	if (item_size < sizeof(*ei)) {
1564 		if (!insert) {
1565 			err = -ENOENT;
1566 			goto out;
1567 		}
1568 		ret = convert_extent_item_v0(trans, root, path, owner,
1569 					     extra_size);
1570 		if (ret < 0) {
1571 			err = ret;
1572 			goto out;
1573 		}
1574 		leaf = path->nodes[0];
1575 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1576 	}
1577 #endif
1578 	BUG_ON(item_size < sizeof(*ei));
1579 
1580 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1581 	flags = btrfs_extent_flags(leaf, ei);
1582 
1583 	ptr = (unsigned long)(ei + 1);
1584 	end = (unsigned long)ei + item_size;
1585 
1586 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1587 		ptr += sizeof(struct btrfs_tree_block_info);
1588 		BUG_ON(ptr > end);
1589 	}
1590 
1591 	err = -ENOENT;
1592 	while (1) {
1593 		if (ptr >= end) {
1594 			WARN_ON(ptr > end);
1595 			break;
1596 		}
1597 		iref = (struct btrfs_extent_inline_ref *)ptr;
1598 		type = btrfs_extent_inline_ref_type(leaf, iref);
1599 		if (want < type)
1600 			break;
1601 		if (want > type) {
1602 			ptr += btrfs_extent_inline_ref_size(type);
1603 			continue;
1604 		}
1605 
1606 		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1607 			struct btrfs_extent_data_ref *dref;
1608 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1609 			if (match_extent_data_ref(leaf, dref, root_objectid,
1610 						  owner, offset)) {
1611 				err = 0;
1612 				break;
1613 			}
1614 			if (hash_extent_data_ref_item(leaf, dref) <
1615 			    hash_extent_data_ref(root_objectid, owner, offset))
1616 				break;
1617 		} else {
1618 			u64 ref_offset;
1619 			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1620 			if (parent > 0) {
1621 				if (parent == ref_offset) {
1622 					err = 0;
1623 					break;
1624 				}
1625 				if (ref_offset < parent)
1626 					break;
1627 			} else {
1628 				if (root_objectid == ref_offset) {
1629 					err = 0;
1630 					break;
1631 				}
1632 				if (ref_offset < root_objectid)
1633 					break;
1634 			}
1635 		}
1636 		ptr += btrfs_extent_inline_ref_size(type);
1637 	}
1638 	if (err == -ENOENT && insert) {
1639 		if (item_size + extra_size >=
1640 		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1641 			err = -EAGAIN;
1642 			goto out;
1643 		}
1644 		/*
1645 		 * To add new inline back ref, we have to make sure
1646 		 * there is no corresponding back ref item.
1647 		 * For simplicity, we just do not add new inline back
1648 		 * ref if there is any kind of item for this block
1649 		 */
1650 		if (find_next_key(path, 0, &key) == 0 &&
1651 		    key.objectid == bytenr &&
1652 		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1653 			err = -EAGAIN;
1654 			goto out;
1655 		}
1656 	}
1657 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1658 out:
1659 	if (insert) {
1660 		path->keep_locks = 0;
1661 		btrfs_unlock_up_safe(path, 1);
1662 	}
1663 	return err;
1664 }
1665 
1666 /*
1667  * helper to add new inline back ref
1668  */
1669 static noinline_for_stack
1670 void setup_inline_extent_backref(struct btrfs_root *root,
1671 				 struct btrfs_path *path,
1672 				 struct btrfs_extent_inline_ref *iref,
1673 				 u64 parent, u64 root_objectid,
1674 				 u64 owner, u64 offset, int refs_to_add,
1675 				 struct btrfs_delayed_extent_op *extent_op)
1676 {
1677 	struct extent_buffer *leaf;
1678 	struct btrfs_extent_item *ei;
1679 	unsigned long ptr;
1680 	unsigned long end;
1681 	unsigned long item_offset;
1682 	u64 refs;
1683 	int size;
1684 	int type;
1685 
1686 	leaf = path->nodes[0];
1687 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1688 	item_offset = (unsigned long)iref - (unsigned long)ei;
1689 
1690 	type = extent_ref_type(parent, owner);
1691 	size = btrfs_extent_inline_ref_size(type);
1692 
1693 	btrfs_extend_item(root, path, size);
1694 
1695 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1696 	refs = btrfs_extent_refs(leaf, ei);
1697 	refs += refs_to_add;
1698 	btrfs_set_extent_refs(leaf, ei, refs);
1699 	if (extent_op)
1700 		__run_delayed_extent_op(extent_op, leaf, ei);
1701 
1702 	ptr = (unsigned long)ei + item_offset;
1703 	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1704 	if (ptr < end - size)
1705 		memmove_extent_buffer(leaf, ptr + size, ptr,
1706 				      end - size - ptr);
1707 
1708 	iref = (struct btrfs_extent_inline_ref *)ptr;
1709 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1710 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1711 		struct btrfs_extent_data_ref *dref;
1712 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1713 		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1714 		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1715 		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1716 		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1717 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1718 		struct btrfs_shared_data_ref *sref;
1719 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1720 		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1721 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1722 	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1723 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1724 	} else {
1725 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1726 	}
1727 	btrfs_mark_buffer_dirty(leaf);
1728 }
1729 
1730 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1731 				 struct btrfs_root *root,
1732 				 struct btrfs_path *path,
1733 				 struct btrfs_extent_inline_ref **ref_ret,
1734 				 u64 bytenr, u64 num_bytes, u64 parent,
1735 				 u64 root_objectid, u64 owner, u64 offset)
1736 {
1737 	int ret;
1738 
1739 	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1740 					   bytenr, num_bytes, parent,
1741 					   root_objectid, owner, offset, 0);
1742 	if (ret != -ENOENT)
1743 		return ret;
1744 
1745 	btrfs_release_path(path);
1746 	*ref_ret = NULL;
1747 
1748 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1749 		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1750 					    root_objectid);
1751 	} else {
1752 		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1753 					     root_objectid, owner, offset);
1754 	}
1755 	return ret;
1756 }
1757 
1758 /*
1759  * helper to update/remove inline back ref
1760  */
1761 static noinline_for_stack
1762 void update_inline_extent_backref(struct btrfs_root *root,
1763 				  struct btrfs_path *path,
1764 				  struct btrfs_extent_inline_ref *iref,
1765 				  int refs_to_mod,
1766 				  struct btrfs_delayed_extent_op *extent_op)
1767 {
1768 	struct extent_buffer *leaf;
1769 	struct btrfs_extent_item *ei;
1770 	struct btrfs_extent_data_ref *dref = NULL;
1771 	struct btrfs_shared_data_ref *sref = NULL;
1772 	unsigned long ptr;
1773 	unsigned long end;
1774 	u32 item_size;
1775 	int size;
1776 	int type;
1777 	u64 refs;
1778 
1779 	leaf = path->nodes[0];
1780 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1781 	refs = btrfs_extent_refs(leaf, ei);
1782 	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1783 	refs += refs_to_mod;
1784 	btrfs_set_extent_refs(leaf, ei, refs);
1785 	if (extent_op)
1786 		__run_delayed_extent_op(extent_op, leaf, ei);
1787 
1788 	type = btrfs_extent_inline_ref_type(leaf, iref);
1789 
1790 	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1791 		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1792 		refs = btrfs_extent_data_ref_count(leaf, dref);
1793 	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1794 		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1795 		refs = btrfs_shared_data_ref_count(leaf, sref);
1796 	} else {
1797 		refs = 1;
1798 		BUG_ON(refs_to_mod != -1);
1799 	}
1800 
1801 	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1802 	refs += refs_to_mod;
1803 
1804 	if (refs > 0) {
1805 		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1806 			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1807 		else
1808 			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1809 	} else {
1810 		size =  btrfs_extent_inline_ref_size(type);
1811 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1812 		ptr = (unsigned long)iref;
1813 		end = (unsigned long)ei + item_size;
1814 		if (ptr + size < end)
1815 			memmove_extent_buffer(leaf, ptr, ptr + size,
1816 					      end - ptr - size);
1817 		item_size -= size;
1818 		btrfs_truncate_item(root, path, item_size, 1);
1819 	}
1820 	btrfs_mark_buffer_dirty(leaf);
1821 }
1822 
1823 static noinline_for_stack
1824 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1825 				 struct btrfs_root *root,
1826 				 struct btrfs_path *path,
1827 				 u64 bytenr, u64 num_bytes, u64 parent,
1828 				 u64 root_objectid, u64 owner,
1829 				 u64 offset, int refs_to_add,
1830 				 struct btrfs_delayed_extent_op *extent_op)
1831 {
1832 	struct btrfs_extent_inline_ref *iref;
1833 	int ret;
1834 
1835 	ret = lookup_inline_extent_backref(trans, root, path, &iref,
1836 					   bytenr, num_bytes, parent,
1837 					   root_objectid, owner, offset, 1);
1838 	if (ret == 0) {
1839 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1840 		update_inline_extent_backref(root, path, iref,
1841 					     refs_to_add, extent_op);
1842 	} else if (ret == -ENOENT) {
1843 		setup_inline_extent_backref(root, path, iref, parent,
1844 					    root_objectid, owner, offset,
1845 					    refs_to_add, extent_op);
1846 		ret = 0;
1847 	}
1848 	return ret;
1849 }
1850 
1851 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1852 				 struct btrfs_root *root,
1853 				 struct btrfs_path *path,
1854 				 u64 bytenr, u64 parent, u64 root_objectid,
1855 				 u64 owner, u64 offset, int refs_to_add)
1856 {
1857 	int ret;
1858 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1859 		BUG_ON(refs_to_add != 1);
1860 		ret = insert_tree_block_ref(trans, root, path, bytenr,
1861 					    parent, root_objectid);
1862 	} else {
1863 		ret = insert_extent_data_ref(trans, root, path, bytenr,
1864 					     parent, root_objectid,
1865 					     owner, offset, refs_to_add);
1866 	}
1867 	return ret;
1868 }
1869 
1870 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1871 				 struct btrfs_root *root,
1872 				 struct btrfs_path *path,
1873 				 struct btrfs_extent_inline_ref *iref,
1874 				 int refs_to_drop, int is_data)
1875 {
1876 	int ret = 0;
1877 
1878 	BUG_ON(!is_data && refs_to_drop != 1);
1879 	if (iref) {
1880 		update_inline_extent_backref(root, path, iref,
1881 					     -refs_to_drop, NULL);
1882 	} else if (is_data) {
1883 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1884 	} else {
1885 		ret = btrfs_del_item(trans, root, path);
1886 	}
1887 	return ret;
1888 }
1889 
1890 static int btrfs_issue_discard(struct block_device *bdev,
1891 				u64 start, u64 len)
1892 {
1893 	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1894 }
1895 
1896 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1897 				u64 num_bytes, u64 *actual_bytes)
1898 {
1899 	int ret;
1900 	u64 discarded_bytes = 0;
1901 	struct btrfs_bio *bbio = NULL;
1902 
1903 
1904 	/* Tell the block device(s) that the sectors can be discarded */
1905 	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1906 			      bytenr, &num_bytes, &bbio, 0);
1907 	/* Error condition is -ENOMEM */
1908 	if (!ret) {
1909 		struct btrfs_bio_stripe *stripe = bbio->stripes;
1910 		int i;
1911 
1912 
1913 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1914 			if (!stripe->dev->can_discard)
1915 				continue;
1916 
1917 			ret = btrfs_issue_discard(stripe->dev->bdev,
1918 						  stripe->physical,
1919 						  stripe->length);
1920 			if (!ret)
1921 				discarded_bytes += stripe->length;
1922 			else if (ret != -EOPNOTSUPP)
1923 				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1924 
1925 			/*
1926 			 * Just in case we get back EOPNOTSUPP for some reason,
1927 			 * just ignore the return value so we don't screw up
1928 			 * people calling discard_extent.
1929 			 */
1930 			ret = 0;
1931 		}
1932 		kfree(bbio);
1933 	}
1934 
1935 	if (actual_bytes)
1936 		*actual_bytes = discarded_bytes;
1937 
1938 
1939 	if (ret == -EOPNOTSUPP)
1940 		ret = 0;
1941 	return ret;
1942 }
1943 
1944 /* Can return -ENOMEM */
1945 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1946 			 struct btrfs_root *root,
1947 			 u64 bytenr, u64 num_bytes, u64 parent,
1948 			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1949 {
1950 	int ret;
1951 	struct btrfs_fs_info *fs_info = root->fs_info;
1952 
1953 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1954 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
1955 
1956 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1957 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1958 					num_bytes,
1959 					parent, root_objectid, (int)owner,
1960 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1961 	} else {
1962 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1963 					num_bytes,
1964 					parent, root_objectid, owner, offset,
1965 					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1966 	}
1967 	return ret;
1968 }
1969 
1970 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1971 				  struct btrfs_root *root,
1972 				  u64 bytenr, u64 num_bytes,
1973 				  u64 parent, u64 root_objectid,
1974 				  u64 owner, u64 offset, int refs_to_add,
1975 				  struct btrfs_delayed_extent_op *extent_op)
1976 {
1977 	struct btrfs_path *path;
1978 	struct extent_buffer *leaf;
1979 	struct btrfs_extent_item *item;
1980 	u64 refs;
1981 	int ret;
1982 	int err = 0;
1983 
1984 	path = btrfs_alloc_path();
1985 	if (!path)
1986 		return -ENOMEM;
1987 
1988 	path->reada = 1;
1989 	path->leave_spinning = 1;
1990 	/* this will setup the path even if it fails to insert the back ref */
1991 	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1992 					   path, bytenr, num_bytes, parent,
1993 					   root_objectid, owner, offset,
1994 					   refs_to_add, extent_op);
1995 	if (ret == 0)
1996 		goto out;
1997 
1998 	if (ret != -EAGAIN) {
1999 		err = ret;
2000 		goto out;
2001 	}
2002 
2003 	leaf = path->nodes[0];
2004 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2005 	refs = btrfs_extent_refs(leaf, item);
2006 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2007 	if (extent_op)
2008 		__run_delayed_extent_op(extent_op, leaf, item);
2009 
2010 	btrfs_mark_buffer_dirty(leaf);
2011 	btrfs_release_path(path);
2012 
2013 	path->reada = 1;
2014 	path->leave_spinning = 1;
2015 
2016 	/* now insert the actual backref */
2017 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
2018 				    path, bytenr, parent, root_objectid,
2019 				    owner, offset, refs_to_add);
2020 	if (ret)
2021 		btrfs_abort_transaction(trans, root, ret);
2022 out:
2023 	btrfs_free_path(path);
2024 	return err;
2025 }
2026 
2027 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2028 				struct btrfs_root *root,
2029 				struct btrfs_delayed_ref_node *node,
2030 				struct btrfs_delayed_extent_op *extent_op,
2031 				int insert_reserved)
2032 {
2033 	int ret = 0;
2034 	struct btrfs_delayed_data_ref *ref;
2035 	struct btrfs_key ins;
2036 	u64 parent = 0;
2037 	u64 ref_root = 0;
2038 	u64 flags = 0;
2039 
2040 	ins.objectid = node->bytenr;
2041 	ins.offset = node->num_bytes;
2042 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2043 
2044 	ref = btrfs_delayed_node_to_data_ref(node);
2045 	trace_run_delayed_data_ref(node, ref, node->action);
2046 
2047 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2048 		parent = ref->parent;
2049 	else
2050 		ref_root = ref->root;
2051 
2052 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2053 		if (extent_op)
2054 			flags |= extent_op->flags_to_set;
2055 		ret = alloc_reserved_file_extent(trans, root,
2056 						 parent, ref_root, flags,
2057 						 ref->objectid, ref->offset,
2058 						 &ins, node->ref_mod);
2059 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2060 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2061 					     node->num_bytes, parent,
2062 					     ref_root, ref->objectid,
2063 					     ref->offset, node->ref_mod,
2064 					     extent_op);
2065 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2066 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2067 					  node->num_bytes, parent,
2068 					  ref_root, ref->objectid,
2069 					  ref->offset, node->ref_mod,
2070 					  extent_op);
2071 	} else {
2072 		BUG();
2073 	}
2074 	return ret;
2075 }
2076 
2077 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2078 				    struct extent_buffer *leaf,
2079 				    struct btrfs_extent_item *ei)
2080 {
2081 	u64 flags = btrfs_extent_flags(leaf, ei);
2082 	if (extent_op->update_flags) {
2083 		flags |= extent_op->flags_to_set;
2084 		btrfs_set_extent_flags(leaf, ei, flags);
2085 	}
2086 
2087 	if (extent_op->update_key) {
2088 		struct btrfs_tree_block_info *bi;
2089 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2090 		bi = (struct btrfs_tree_block_info *)(ei + 1);
2091 		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2092 	}
2093 }
2094 
2095 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2096 				 struct btrfs_root *root,
2097 				 struct btrfs_delayed_ref_node *node,
2098 				 struct btrfs_delayed_extent_op *extent_op)
2099 {
2100 	struct btrfs_key key;
2101 	struct btrfs_path *path;
2102 	struct btrfs_extent_item *ei;
2103 	struct extent_buffer *leaf;
2104 	u32 item_size;
2105 	int ret;
2106 	int err = 0;
2107 	int metadata = !extent_op->is_data;
2108 
2109 	if (trans->aborted)
2110 		return 0;
2111 
2112 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2113 		metadata = 0;
2114 
2115 	path = btrfs_alloc_path();
2116 	if (!path)
2117 		return -ENOMEM;
2118 
2119 	key.objectid = node->bytenr;
2120 
2121 	if (metadata) {
2122 		key.type = BTRFS_METADATA_ITEM_KEY;
2123 		key.offset = extent_op->level;
2124 	} else {
2125 		key.type = BTRFS_EXTENT_ITEM_KEY;
2126 		key.offset = node->num_bytes;
2127 	}
2128 
2129 again:
2130 	path->reada = 1;
2131 	path->leave_spinning = 1;
2132 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2133 				path, 0, 1);
2134 	if (ret < 0) {
2135 		err = ret;
2136 		goto out;
2137 	}
2138 	if (ret > 0) {
2139 		if (metadata) {
2140 			btrfs_release_path(path);
2141 			metadata = 0;
2142 
2143 			key.offset = node->num_bytes;
2144 			key.type = BTRFS_EXTENT_ITEM_KEY;
2145 			goto again;
2146 		}
2147 		err = -EIO;
2148 		goto out;
2149 	}
2150 
2151 	leaf = path->nodes[0];
2152 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2153 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2154 	if (item_size < sizeof(*ei)) {
2155 		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2156 					     path, (u64)-1, 0);
2157 		if (ret < 0) {
2158 			err = ret;
2159 			goto out;
2160 		}
2161 		leaf = path->nodes[0];
2162 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2163 	}
2164 #endif
2165 	BUG_ON(item_size < sizeof(*ei));
2166 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2167 	__run_delayed_extent_op(extent_op, leaf, ei);
2168 
2169 	btrfs_mark_buffer_dirty(leaf);
2170 out:
2171 	btrfs_free_path(path);
2172 	return err;
2173 }
2174 
2175 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2176 				struct btrfs_root *root,
2177 				struct btrfs_delayed_ref_node *node,
2178 				struct btrfs_delayed_extent_op *extent_op,
2179 				int insert_reserved)
2180 {
2181 	int ret = 0;
2182 	struct btrfs_delayed_tree_ref *ref;
2183 	struct btrfs_key ins;
2184 	u64 parent = 0;
2185 	u64 ref_root = 0;
2186 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2187 						 SKINNY_METADATA);
2188 
2189 	ref = btrfs_delayed_node_to_tree_ref(node);
2190 	trace_run_delayed_tree_ref(node, ref, node->action);
2191 
2192 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2193 		parent = ref->parent;
2194 	else
2195 		ref_root = ref->root;
2196 
2197 	ins.objectid = node->bytenr;
2198 	if (skinny_metadata) {
2199 		ins.offset = ref->level;
2200 		ins.type = BTRFS_METADATA_ITEM_KEY;
2201 	} else {
2202 		ins.offset = node->num_bytes;
2203 		ins.type = BTRFS_EXTENT_ITEM_KEY;
2204 	}
2205 
2206 	BUG_ON(node->ref_mod != 1);
2207 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2208 		BUG_ON(!extent_op || !extent_op->update_flags);
2209 		ret = alloc_reserved_tree_block(trans, root,
2210 						parent, ref_root,
2211 						extent_op->flags_to_set,
2212 						&extent_op->key,
2213 						ref->level, &ins);
2214 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
2215 		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2216 					     node->num_bytes, parent, ref_root,
2217 					     ref->level, 0, 1, extent_op);
2218 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
2219 		ret = __btrfs_free_extent(trans, root, node->bytenr,
2220 					  node->num_bytes, parent, ref_root,
2221 					  ref->level, 0, 1, extent_op);
2222 	} else {
2223 		BUG();
2224 	}
2225 	return ret;
2226 }
2227 
2228 /* helper function to actually process a single delayed ref entry */
2229 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2230 			       struct btrfs_root *root,
2231 			       struct btrfs_delayed_ref_node *node,
2232 			       struct btrfs_delayed_extent_op *extent_op,
2233 			       int insert_reserved)
2234 {
2235 	int ret = 0;
2236 
2237 	if (trans->aborted)
2238 		return 0;
2239 
2240 	if (btrfs_delayed_ref_is_head(node)) {
2241 		struct btrfs_delayed_ref_head *head;
2242 		/*
2243 		 * we've hit the end of the chain and we were supposed
2244 		 * to insert this extent into the tree.  But, it got
2245 		 * deleted before we ever needed to insert it, so all
2246 		 * we have to do is clean up the accounting
2247 		 */
2248 		BUG_ON(extent_op);
2249 		head = btrfs_delayed_node_to_head(node);
2250 		trace_run_delayed_ref_head(node, head, node->action);
2251 
2252 		if (insert_reserved) {
2253 			btrfs_pin_extent(root, node->bytenr,
2254 					 node->num_bytes, 1);
2255 			if (head->is_data) {
2256 				ret = btrfs_del_csums(trans, root,
2257 						      node->bytenr,
2258 						      node->num_bytes);
2259 			}
2260 		}
2261 		return ret;
2262 	}
2263 
2264 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2265 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2266 		ret = run_delayed_tree_ref(trans, root, node, extent_op,
2267 					   insert_reserved);
2268 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2269 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
2270 		ret = run_delayed_data_ref(trans, root, node, extent_op,
2271 					   insert_reserved);
2272 	else
2273 		BUG();
2274 	return ret;
2275 }
2276 
2277 static noinline struct btrfs_delayed_ref_node *
2278 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2279 {
2280 	struct rb_node *node;
2281 	struct btrfs_delayed_ref_node *ref;
2282 	int action = BTRFS_ADD_DELAYED_REF;
2283 again:
2284 	/*
2285 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2286 	 * this prevents ref count from going down to zero when
2287 	 * there still are pending delayed ref.
2288 	 */
2289 	node = rb_prev(&head->node.rb_node);
2290 	while (1) {
2291 		if (!node)
2292 			break;
2293 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
2294 				rb_node);
2295 		if (ref->bytenr != head->node.bytenr)
2296 			break;
2297 		if (ref->action == action)
2298 			return ref;
2299 		node = rb_prev(node);
2300 	}
2301 	if (action == BTRFS_ADD_DELAYED_REF) {
2302 		action = BTRFS_DROP_DELAYED_REF;
2303 		goto again;
2304 	}
2305 	return NULL;
2306 }
2307 
2308 /*
2309  * Returns 0 on success or if called with an already aborted transaction.
2310  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2311  */
2312 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2313 				       struct btrfs_root *root,
2314 				       struct list_head *cluster)
2315 {
2316 	struct btrfs_delayed_ref_root *delayed_refs;
2317 	struct btrfs_delayed_ref_node *ref;
2318 	struct btrfs_delayed_ref_head *locked_ref = NULL;
2319 	struct btrfs_delayed_extent_op *extent_op;
2320 	struct btrfs_fs_info *fs_info = root->fs_info;
2321 	int ret;
2322 	int count = 0;
2323 	int must_insert_reserved = 0;
2324 
2325 	delayed_refs = &trans->transaction->delayed_refs;
2326 	while (1) {
2327 		if (!locked_ref) {
2328 			/* pick a new head ref from the cluster list */
2329 			if (list_empty(cluster))
2330 				break;
2331 
2332 			locked_ref = list_entry(cluster->next,
2333 				     struct btrfs_delayed_ref_head, cluster);
2334 
2335 			/* grab the lock that says we are going to process
2336 			 * all the refs for this head */
2337 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
2338 
2339 			/*
2340 			 * we may have dropped the spin lock to get the head
2341 			 * mutex lock, and that might have given someone else
2342 			 * time to free the head.  If that's true, it has been
2343 			 * removed from our list and we can move on.
2344 			 */
2345 			if (ret == -EAGAIN) {
2346 				locked_ref = NULL;
2347 				count++;
2348 				continue;
2349 			}
2350 		}
2351 
2352 		/*
2353 		 * We need to try and merge add/drops of the same ref since we
2354 		 * can run into issues with relocate dropping the implicit ref
2355 		 * and then it being added back again before the drop can
2356 		 * finish.  If we merged anything we need to re-loop so we can
2357 		 * get a good ref.
2358 		 */
2359 		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2360 					 locked_ref);
2361 
2362 		/*
2363 		 * locked_ref is the head node, so we have to go one
2364 		 * node back for any delayed ref updates
2365 		 */
2366 		ref = select_delayed_ref(locked_ref);
2367 
2368 		if (ref && ref->seq &&
2369 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2370 			/*
2371 			 * there are still refs with lower seq numbers in the
2372 			 * process of being added. Don't run this ref yet.
2373 			 */
2374 			list_del_init(&locked_ref->cluster);
2375 			btrfs_delayed_ref_unlock(locked_ref);
2376 			locked_ref = NULL;
2377 			delayed_refs->num_heads_ready++;
2378 			spin_unlock(&delayed_refs->lock);
2379 			cond_resched();
2380 			spin_lock(&delayed_refs->lock);
2381 			continue;
2382 		}
2383 
2384 		/*
2385 		 * record the must insert reserved flag before we
2386 		 * drop the spin lock.
2387 		 */
2388 		must_insert_reserved = locked_ref->must_insert_reserved;
2389 		locked_ref->must_insert_reserved = 0;
2390 
2391 		extent_op = locked_ref->extent_op;
2392 		locked_ref->extent_op = NULL;
2393 
2394 		if (!ref) {
2395 			/* All delayed refs have been processed, Go ahead
2396 			 * and send the head node to run_one_delayed_ref,
2397 			 * so that any accounting fixes can happen
2398 			 */
2399 			ref = &locked_ref->node;
2400 
2401 			if (extent_op && must_insert_reserved) {
2402 				btrfs_free_delayed_extent_op(extent_op);
2403 				extent_op = NULL;
2404 			}
2405 
2406 			if (extent_op) {
2407 				spin_unlock(&delayed_refs->lock);
2408 
2409 				ret = run_delayed_extent_op(trans, root,
2410 							    ref, extent_op);
2411 				btrfs_free_delayed_extent_op(extent_op);
2412 
2413 				if (ret) {
2414 					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2415 					spin_lock(&delayed_refs->lock);
2416 					btrfs_delayed_ref_unlock(locked_ref);
2417 					return ret;
2418 				}
2419 
2420 				goto next;
2421 			}
2422 		}
2423 
2424 		ref->in_tree = 0;
2425 		rb_erase(&ref->rb_node, &delayed_refs->root);
2426 		delayed_refs->num_entries--;
2427 		if (!btrfs_delayed_ref_is_head(ref)) {
2428 			/*
2429 			 * when we play the delayed ref, also correct the
2430 			 * ref_mod on head
2431 			 */
2432 			switch (ref->action) {
2433 			case BTRFS_ADD_DELAYED_REF:
2434 			case BTRFS_ADD_DELAYED_EXTENT:
2435 				locked_ref->node.ref_mod -= ref->ref_mod;
2436 				break;
2437 			case BTRFS_DROP_DELAYED_REF:
2438 				locked_ref->node.ref_mod += ref->ref_mod;
2439 				break;
2440 			default:
2441 				WARN_ON(1);
2442 			}
2443 		}
2444 		spin_unlock(&delayed_refs->lock);
2445 
2446 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
2447 					  must_insert_reserved);
2448 
2449 		btrfs_free_delayed_extent_op(extent_op);
2450 		if (ret) {
2451 			btrfs_delayed_ref_unlock(locked_ref);
2452 			btrfs_put_delayed_ref(ref);
2453 			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2454 			spin_lock(&delayed_refs->lock);
2455 			return ret;
2456 		}
2457 
2458 		/*
2459 		 * If this node is a head, that means all the refs in this head
2460 		 * have been dealt with, and we will pick the next head to deal
2461 		 * with, so we must unlock the head and drop it from the cluster
2462 		 * list before we release it.
2463 		 */
2464 		if (btrfs_delayed_ref_is_head(ref)) {
2465 			list_del_init(&locked_ref->cluster);
2466 			btrfs_delayed_ref_unlock(locked_ref);
2467 			locked_ref = NULL;
2468 		}
2469 		btrfs_put_delayed_ref(ref);
2470 		count++;
2471 next:
2472 		cond_resched();
2473 		spin_lock(&delayed_refs->lock);
2474 	}
2475 	return count;
2476 }
2477 
2478 #ifdef SCRAMBLE_DELAYED_REFS
2479 /*
2480  * Normally delayed refs get processed in ascending bytenr order. This
2481  * correlates in most cases to the order added. To expose dependencies on this
2482  * order, we start to process the tree in the middle instead of the beginning
2483  */
2484 static u64 find_middle(struct rb_root *root)
2485 {
2486 	struct rb_node *n = root->rb_node;
2487 	struct btrfs_delayed_ref_node *entry;
2488 	int alt = 1;
2489 	u64 middle;
2490 	u64 first = 0, last = 0;
2491 
2492 	n = rb_first(root);
2493 	if (n) {
2494 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2495 		first = entry->bytenr;
2496 	}
2497 	n = rb_last(root);
2498 	if (n) {
2499 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2500 		last = entry->bytenr;
2501 	}
2502 	n = root->rb_node;
2503 
2504 	while (n) {
2505 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2506 		WARN_ON(!entry->in_tree);
2507 
2508 		middle = entry->bytenr;
2509 
2510 		if (alt)
2511 			n = n->rb_left;
2512 		else
2513 			n = n->rb_right;
2514 
2515 		alt = 1 - alt;
2516 	}
2517 	return middle;
2518 }
2519 #endif
2520 
2521 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2522 					 struct btrfs_fs_info *fs_info)
2523 {
2524 	struct qgroup_update *qgroup_update;
2525 	int ret = 0;
2526 
2527 	if (list_empty(&trans->qgroup_ref_list) !=
2528 	    !trans->delayed_ref_elem.seq) {
2529 		/* list without seq or seq without list */
2530 		btrfs_err(fs_info,
2531 			"qgroup accounting update error, list is%s empty, seq is %#x.%x",
2532 			list_empty(&trans->qgroup_ref_list) ? "" : " not",
2533 			(u32)(trans->delayed_ref_elem.seq >> 32),
2534 			(u32)trans->delayed_ref_elem.seq);
2535 		BUG();
2536 	}
2537 
2538 	if (!trans->delayed_ref_elem.seq)
2539 		return 0;
2540 
2541 	while (!list_empty(&trans->qgroup_ref_list)) {
2542 		qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2543 						 struct qgroup_update, list);
2544 		list_del(&qgroup_update->list);
2545 		if (!ret)
2546 			ret = btrfs_qgroup_account_ref(
2547 					trans, fs_info, qgroup_update->node,
2548 					qgroup_update->extent_op);
2549 		kfree(qgroup_update);
2550 	}
2551 
2552 	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2553 
2554 	return ret;
2555 }
2556 
2557 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2558 		      int count)
2559 {
2560 	int val = atomic_read(&delayed_refs->ref_seq);
2561 
2562 	if (val < seq || val >= seq + count)
2563 		return 1;
2564 	return 0;
2565 }
2566 
2567 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2568 {
2569 	u64 num_bytes;
2570 
2571 	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2572 			     sizeof(struct btrfs_extent_inline_ref));
2573 	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2574 		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2575 
2576 	/*
2577 	 * We don't ever fill up leaves all the way so multiply by 2 just to be
2578 	 * closer to what we're really going to want to ouse.
2579 	 */
2580 	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2581 }
2582 
2583 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2584 				       struct btrfs_root *root)
2585 {
2586 	struct btrfs_block_rsv *global_rsv;
2587 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2588 	u64 num_bytes;
2589 	int ret = 0;
2590 
2591 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2592 	num_heads = heads_to_leaves(root, num_heads);
2593 	if (num_heads > 1)
2594 		num_bytes += (num_heads - 1) * root->leafsize;
2595 	num_bytes <<= 1;
2596 	global_rsv = &root->fs_info->global_block_rsv;
2597 
2598 	/*
2599 	 * If we can't allocate any more chunks lets make sure we have _lots_ of
2600 	 * wiggle room since running delayed refs can create more delayed refs.
2601 	 */
2602 	if (global_rsv->space_info->full)
2603 		num_bytes <<= 1;
2604 
2605 	spin_lock(&global_rsv->lock);
2606 	if (global_rsv->reserved <= num_bytes)
2607 		ret = 1;
2608 	spin_unlock(&global_rsv->lock);
2609 	return ret;
2610 }
2611 
2612 /*
2613  * this starts processing the delayed reference count updates and
2614  * extent insertions we have queued up so far.  count can be
2615  * 0, which means to process everything in the tree at the start
2616  * of the run (but not newly added entries), or it can be some target
2617  * number you'd like to process.
2618  *
2619  * Returns 0 on success or if called with an aborted transaction
2620  * Returns <0 on error and aborts the transaction
2621  */
2622 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2623 			   struct btrfs_root *root, unsigned long count)
2624 {
2625 	struct rb_node *node;
2626 	struct btrfs_delayed_ref_root *delayed_refs;
2627 	struct btrfs_delayed_ref_node *ref;
2628 	struct list_head cluster;
2629 	int ret;
2630 	u64 delayed_start;
2631 	int run_all = count == (unsigned long)-1;
2632 	int run_most = 0;
2633 	int loops;
2634 
2635 	/* We'll clean this up in btrfs_cleanup_transaction */
2636 	if (trans->aborted)
2637 		return 0;
2638 
2639 	if (root == root->fs_info->extent_root)
2640 		root = root->fs_info->tree_root;
2641 
2642 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2643 
2644 	delayed_refs = &trans->transaction->delayed_refs;
2645 	INIT_LIST_HEAD(&cluster);
2646 	if (count == 0) {
2647 		count = delayed_refs->num_entries * 2;
2648 		run_most = 1;
2649 	}
2650 
2651 	if (!run_all && !run_most) {
2652 		int old;
2653 		int seq = atomic_read(&delayed_refs->ref_seq);
2654 
2655 progress:
2656 		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2657 		if (old) {
2658 			DEFINE_WAIT(__wait);
2659 			if (delayed_refs->flushing ||
2660 			    !btrfs_should_throttle_delayed_refs(trans, root))
2661 				return 0;
2662 
2663 			prepare_to_wait(&delayed_refs->wait, &__wait,
2664 					TASK_UNINTERRUPTIBLE);
2665 
2666 			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2667 			if (old) {
2668 				schedule();
2669 				finish_wait(&delayed_refs->wait, &__wait);
2670 
2671 				if (!refs_newer(delayed_refs, seq, 256))
2672 					goto progress;
2673 				else
2674 					return 0;
2675 			} else {
2676 				finish_wait(&delayed_refs->wait, &__wait);
2677 				goto again;
2678 			}
2679 		}
2680 
2681 	} else {
2682 		atomic_inc(&delayed_refs->procs_running_refs);
2683 	}
2684 
2685 again:
2686 	loops = 0;
2687 	spin_lock(&delayed_refs->lock);
2688 
2689 #ifdef SCRAMBLE_DELAYED_REFS
2690 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2691 #endif
2692 
2693 	while (1) {
2694 		if (!(run_all || run_most) &&
2695 		    !btrfs_should_throttle_delayed_refs(trans, root))
2696 			break;
2697 
2698 		/*
2699 		 * go find something we can process in the rbtree.  We start at
2700 		 * the beginning of the tree, and then build a cluster
2701 		 * of refs to process starting at the first one we are able to
2702 		 * lock
2703 		 */
2704 		delayed_start = delayed_refs->run_delayed_start;
2705 		ret = btrfs_find_ref_cluster(trans, &cluster,
2706 					     delayed_refs->run_delayed_start);
2707 		if (ret)
2708 			break;
2709 
2710 		ret = run_clustered_refs(trans, root, &cluster);
2711 		if (ret < 0) {
2712 			btrfs_release_ref_cluster(&cluster);
2713 			spin_unlock(&delayed_refs->lock);
2714 			btrfs_abort_transaction(trans, root, ret);
2715 			atomic_dec(&delayed_refs->procs_running_refs);
2716 			wake_up(&delayed_refs->wait);
2717 			return ret;
2718 		}
2719 
2720 		atomic_add(ret, &delayed_refs->ref_seq);
2721 
2722 		count -= min_t(unsigned long, ret, count);
2723 
2724 		if (count == 0)
2725 			break;
2726 
2727 		if (delayed_start >= delayed_refs->run_delayed_start) {
2728 			if (loops == 0) {
2729 				/*
2730 				 * btrfs_find_ref_cluster looped. let's do one
2731 				 * more cycle. if we don't run any delayed ref
2732 				 * during that cycle (because we can't because
2733 				 * all of them are blocked), bail out.
2734 				 */
2735 				loops = 1;
2736 			} else {
2737 				/*
2738 				 * no runnable refs left, stop trying
2739 				 */
2740 				BUG_ON(run_all);
2741 				break;
2742 			}
2743 		}
2744 		if (ret) {
2745 			/* refs were run, let's reset staleness detection */
2746 			loops = 0;
2747 		}
2748 	}
2749 
2750 	if (run_all) {
2751 		if (!list_empty(&trans->new_bgs)) {
2752 			spin_unlock(&delayed_refs->lock);
2753 			btrfs_create_pending_block_groups(trans, root);
2754 			spin_lock(&delayed_refs->lock);
2755 		}
2756 
2757 		node = rb_first(&delayed_refs->root);
2758 		if (!node)
2759 			goto out;
2760 		count = (unsigned long)-1;
2761 
2762 		while (node) {
2763 			ref = rb_entry(node, struct btrfs_delayed_ref_node,
2764 				       rb_node);
2765 			if (btrfs_delayed_ref_is_head(ref)) {
2766 				struct btrfs_delayed_ref_head *head;
2767 
2768 				head = btrfs_delayed_node_to_head(ref);
2769 				atomic_inc(&ref->refs);
2770 
2771 				spin_unlock(&delayed_refs->lock);
2772 				/*
2773 				 * Mutex was contended, block until it's
2774 				 * released and try again
2775 				 */
2776 				mutex_lock(&head->mutex);
2777 				mutex_unlock(&head->mutex);
2778 
2779 				btrfs_put_delayed_ref(ref);
2780 				cond_resched();
2781 				goto again;
2782 			}
2783 			node = rb_next(node);
2784 		}
2785 		spin_unlock(&delayed_refs->lock);
2786 		schedule_timeout(1);
2787 		goto again;
2788 	}
2789 out:
2790 	atomic_dec(&delayed_refs->procs_running_refs);
2791 	smp_mb();
2792 	if (waitqueue_active(&delayed_refs->wait))
2793 		wake_up(&delayed_refs->wait);
2794 
2795 	spin_unlock(&delayed_refs->lock);
2796 	assert_qgroups_uptodate(trans);
2797 	return 0;
2798 }
2799 
2800 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2801 				struct btrfs_root *root,
2802 				u64 bytenr, u64 num_bytes, u64 flags,
2803 				int level, int is_data)
2804 {
2805 	struct btrfs_delayed_extent_op *extent_op;
2806 	int ret;
2807 
2808 	extent_op = btrfs_alloc_delayed_extent_op();
2809 	if (!extent_op)
2810 		return -ENOMEM;
2811 
2812 	extent_op->flags_to_set = flags;
2813 	extent_op->update_flags = 1;
2814 	extent_op->update_key = 0;
2815 	extent_op->is_data = is_data ? 1 : 0;
2816 	extent_op->level = level;
2817 
2818 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2819 					  num_bytes, extent_op);
2820 	if (ret)
2821 		btrfs_free_delayed_extent_op(extent_op);
2822 	return ret;
2823 }
2824 
2825 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2826 				      struct btrfs_root *root,
2827 				      struct btrfs_path *path,
2828 				      u64 objectid, u64 offset, u64 bytenr)
2829 {
2830 	struct btrfs_delayed_ref_head *head;
2831 	struct btrfs_delayed_ref_node *ref;
2832 	struct btrfs_delayed_data_ref *data_ref;
2833 	struct btrfs_delayed_ref_root *delayed_refs;
2834 	struct rb_node *node;
2835 	int ret = 0;
2836 
2837 	ret = -ENOENT;
2838 	delayed_refs = &trans->transaction->delayed_refs;
2839 	spin_lock(&delayed_refs->lock);
2840 	head = btrfs_find_delayed_ref_head(trans, bytenr);
2841 	if (!head)
2842 		goto out;
2843 
2844 	if (!mutex_trylock(&head->mutex)) {
2845 		atomic_inc(&head->node.refs);
2846 		spin_unlock(&delayed_refs->lock);
2847 
2848 		btrfs_release_path(path);
2849 
2850 		/*
2851 		 * Mutex was contended, block until it's released and let
2852 		 * caller try again
2853 		 */
2854 		mutex_lock(&head->mutex);
2855 		mutex_unlock(&head->mutex);
2856 		btrfs_put_delayed_ref(&head->node);
2857 		return -EAGAIN;
2858 	}
2859 
2860 	node = rb_prev(&head->node.rb_node);
2861 	if (!node)
2862 		goto out_unlock;
2863 
2864 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2865 
2866 	if (ref->bytenr != bytenr)
2867 		goto out_unlock;
2868 
2869 	ret = 1;
2870 	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2871 		goto out_unlock;
2872 
2873 	data_ref = btrfs_delayed_node_to_data_ref(ref);
2874 
2875 	node = rb_prev(node);
2876 	if (node) {
2877 		int seq = ref->seq;
2878 
2879 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2880 		if (ref->bytenr == bytenr && ref->seq == seq)
2881 			goto out_unlock;
2882 	}
2883 
2884 	if (data_ref->root != root->root_key.objectid ||
2885 	    data_ref->objectid != objectid || data_ref->offset != offset)
2886 		goto out_unlock;
2887 
2888 	ret = 0;
2889 out_unlock:
2890 	mutex_unlock(&head->mutex);
2891 out:
2892 	spin_unlock(&delayed_refs->lock);
2893 	return ret;
2894 }
2895 
2896 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2897 					struct btrfs_root *root,
2898 					struct btrfs_path *path,
2899 					u64 objectid, u64 offset, u64 bytenr)
2900 {
2901 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2902 	struct extent_buffer *leaf;
2903 	struct btrfs_extent_data_ref *ref;
2904 	struct btrfs_extent_inline_ref *iref;
2905 	struct btrfs_extent_item *ei;
2906 	struct btrfs_key key;
2907 	u32 item_size;
2908 	int ret;
2909 
2910 	key.objectid = bytenr;
2911 	key.offset = (u64)-1;
2912 	key.type = BTRFS_EXTENT_ITEM_KEY;
2913 
2914 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2915 	if (ret < 0)
2916 		goto out;
2917 	BUG_ON(ret == 0); /* Corruption */
2918 
2919 	ret = -ENOENT;
2920 	if (path->slots[0] == 0)
2921 		goto out;
2922 
2923 	path->slots[0]--;
2924 	leaf = path->nodes[0];
2925 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2926 
2927 	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2928 		goto out;
2929 
2930 	ret = 1;
2931 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2932 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2933 	if (item_size < sizeof(*ei)) {
2934 		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2935 		goto out;
2936 	}
2937 #endif
2938 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2939 
2940 	if (item_size != sizeof(*ei) +
2941 	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2942 		goto out;
2943 
2944 	if (btrfs_extent_generation(leaf, ei) <=
2945 	    btrfs_root_last_snapshot(&root->root_item))
2946 		goto out;
2947 
2948 	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2949 	if (btrfs_extent_inline_ref_type(leaf, iref) !=
2950 	    BTRFS_EXTENT_DATA_REF_KEY)
2951 		goto out;
2952 
2953 	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2954 	if (btrfs_extent_refs(leaf, ei) !=
2955 	    btrfs_extent_data_ref_count(leaf, ref) ||
2956 	    btrfs_extent_data_ref_root(leaf, ref) !=
2957 	    root->root_key.objectid ||
2958 	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2959 	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2960 		goto out;
2961 
2962 	ret = 0;
2963 out:
2964 	return ret;
2965 }
2966 
2967 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2968 			  struct btrfs_root *root,
2969 			  u64 objectid, u64 offset, u64 bytenr)
2970 {
2971 	struct btrfs_path *path;
2972 	int ret;
2973 	int ret2;
2974 
2975 	path = btrfs_alloc_path();
2976 	if (!path)
2977 		return -ENOENT;
2978 
2979 	do {
2980 		ret = check_committed_ref(trans, root, path, objectid,
2981 					  offset, bytenr);
2982 		if (ret && ret != -ENOENT)
2983 			goto out;
2984 
2985 		ret2 = check_delayed_ref(trans, root, path, objectid,
2986 					 offset, bytenr);
2987 	} while (ret2 == -EAGAIN);
2988 
2989 	if (ret2 && ret2 != -ENOENT) {
2990 		ret = ret2;
2991 		goto out;
2992 	}
2993 
2994 	if (ret != -ENOENT || ret2 != -ENOENT)
2995 		ret = 0;
2996 out:
2997 	btrfs_free_path(path);
2998 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2999 		WARN_ON(ret > 0);
3000 	return ret;
3001 }
3002 
3003 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3004 			   struct btrfs_root *root,
3005 			   struct extent_buffer *buf,
3006 			   int full_backref, int inc, int for_cow)
3007 {
3008 	u64 bytenr;
3009 	u64 num_bytes;
3010 	u64 parent;
3011 	u64 ref_root;
3012 	u32 nritems;
3013 	struct btrfs_key key;
3014 	struct btrfs_file_extent_item *fi;
3015 	int i;
3016 	int level;
3017 	int ret = 0;
3018 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3019 			    u64, u64, u64, u64, u64, u64, int);
3020 
3021 	ref_root = btrfs_header_owner(buf);
3022 	nritems = btrfs_header_nritems(buf);
3023 	level = btrfs_header_level(buf);
3024 
3025 	if (!root->ref_cows && level == 0)
3026 		return 0;
3027 
3028 	if (inc)
3029 		process_func = btrfs_inc_extent_ref;
3030 	else
3031 		process_func = btrfs_free_extent;
3032 
3033 	if (full_backref)
3034 		parent = buf->start;
3035 	else
3036 		parent = 0;
3037 
3038 	for (i = 0; i < nritems; i++) {
3039 		if (level == 0) {
3040 			btrfs_item_key_to_cpu(buf, &key, i);
3041 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3042 				continue;
3043 			fi = btrfs_item_ptr(buf, i,
3044 					    struct btrfs_file_extent_item);
3045 			if (btrfs_file_extent_type(buf, fi) ==
3046 			    BTRFS_FILE_EXTENT_INLINE)
3047 				continue;
3048 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3049 			if (bytenr == 0)
3050 				continue;
3051 
3052 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3053 			key.offset -= btrfs_file_extent_offset(buf, fi);
3054 			ret = process_func(trans, root, bytenr, num_bytes,
3055 					   parent, ref_root, key.objectid,
3056 					   key.offset, for_cow);
3057 			if (ret)
3058 				goto fail;
3059 		} else {
3060 			bytenr = btrfs_node_blockptr(buf, i);
3061 			num_bytes = btrfs_level_size(root, level - 1);
3062 			ret = process_func(trans, root, bytenr, num_bytes,
3063 					   parent, ref_root, level - 1, 0,
3064 					   for_cow);
3065 			if (ret)
3066 				goto fail;
3067 		}
3068 	}
3069 	return 0;
3070 fail:
3071 	return ret;
3072 }
3073 
3074 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3075 		  struct extent_buffer *buf, int full_backref, int for_cow)
3076 {
3077 	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
3078 }
3079 
3080 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3081 		  struct extent_buffer *buf, int full_backref, int for_cow)
3082 {
3083 	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
3084 }
3085 
3086 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3087 				 struct btrfs_root *root,
3088 				 struct btrfs_path *path,
3089 				 struct btrfs_block_group_cache *cache)
3090 {
3091 	int ret;
3092 	struct btrfs_root *extent_root = root->fs_info->extent_root;
3093 	unsigned long bi;
3094 	struct extent_buffer *leaf;
3095 
3096 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3097 	if (ret < 0)
3098 		goto fail;
3099 	BUG_ON(ret); /* Corruption */
3100 
3101 	leaf = path->nodes[0];
3102 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3103 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3104 	btrfs_mark_buffer_dirty(leaf);
3105 	btrfs_release_path(path);
3106 fail:
3107 	if (ret) {
3108 		btrfs_abort_transaction(trans, root, ret);
3109 		return ret;
3110 	}
3111 	return 0;
3112 
3113 }
3114 
3115 static struct btrfs_block_group_cache *
3116 next_block_group(struct btrfs_root *root,
3117 		 struct btrfs_block_group_cache *cache)
3118 {
3119 	struct rb_node *node;
3120 	spin_lock(&root->fs_info->block_group_cache_lock);
3121 	node = rb_next(&cache->cache_node);
3122 	btrfs_put_block_group(cache);
3123 	if (node) {
3124 		cache = rb_entry(node, struct btrfs_block_group_cache,
3125 				 cache_node);
3126 		btrfs_get_block_group(cache);
3127 	} else
3128 		cache = NULL;
3129 	spin_unlock(&root->fs_info->block_group_cache_lock);
3130 	return cache;
3131 }
3132 
3133 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3134 			    struct btrfs_trans_handle *trans,
3135 			    struct btrfs_path *path)
3136 {
3137 	struct btrfs_root *root = block_group->fs_info->tree_root;
3138 	struct inode *inode = NULL;
3139 	u64 alloc_hint = 0;
3140 	int dcs = BTRFS_DC_ERROR;
3141 	int num_pages = 0;
3142 	int retries = 0;
3143 	int ret = 0;
3144 
3145 	/*
3146 	 * If this block group is smaller than 100 megs don't bother caching the
3147 	 * block group.
3148 	 */
3149 	if (block_group->key.offset < (100 * 1024 * 1024)) {
3150 		spin_lock(&block_group->lock);
3151 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3152 		spin_unlock(&block_group->lock);
3153 		return 0;
3154 	}
3155 
3156 again:
3157 	inode = lookup_free_space_inode(root, block_group, path);
3158 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3159 		ret = PTR_ERR(inode);
3160 		btrfs_release_path(path);
3161 		goto out;
3162 	}
3163 
3164 	if (IS_ERR(inode)) {
3165 		BUG_ON(retries);
3166 		retries++;
3167 
3168 		if (block_group->ro)
3169 			goto out_free;
3170 
3171 		ret = create_free_space_inode(root, trans, block_group, path);
3172 		if (ret)
3173 			goto out_free;
3174 		goto again;
3175 	}
3176 
3177 	/* We've already setup this transaction, go ahead and exit */
3178 	if (block_group->cache_generation == trans->transid &&
3179 	    i_size_read(inode)) {
3180 		dcs = BTRFS_DC_SETUP;
3181 		goto out_put;
3182 	}
3183 
3184 	/*
3185 	 * We want to set the generation to 0, that way if anything goes wrong
3186 	 * from here on out we know not to trust this cache when we load up next
3187 	 * time.
3188 	 */
3189 	BTRFS_I(inode)->generation = 0;
3190 	ret = btrfs_update_inode(trans, root, inode);
3191 	WARN_ON(ret);
3192 
3193 	if (i_size_read(inode) > 0) {
3194 		ret = btrfs_check_trunc_cache_free_space(root,
3195 					&root->fs_info->global_block_rsv);
3196 		if (ret)
3197 			goto out_put;
3198 
3199 		ret = btrfs_truncate_free_space_cache(root, trans, path,
3200 						      inode);
3201 		if (ret)
3202 			goto out_put;
3203 	}
3204 
3205 	spin_lock(&block_group->lock);
3206 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3207 	    !btrfs_test_opt(root, SPACE_CACHE)) {
3208 		/*
3209 		 * don't bother trying to write stuff out _if_
3210 		 * a) we're not cached,
3211 		 * b) we're with nospace_cache mount option.
3212 		 */
3213 		dcs = BTRFS_DC_WRITTEN;
3214 		spin_unlock(&block_group->lock);
3215 		goto out_put;
3216 	}
3217 	spin_unlock(&block_group->lock);
3218 
3219 	/*
3220 	 * Try to preallocate enough space based on how big the block group is.
3221 	 * Keep in mind this has to include any pinned space which could end up
3222 	 * taking up quite a bit since it's not folded into the other space
3223 	 * cache.
3224 	 */
3225 	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3226 	if (!num_pages)
3227 		num_pages = 1;
3228 
3229 	num_pages *= 16;
3230 	num_pages *= PAGE_CACHE_SIZE;
3231 
3232 	ret = btrfs_check_data_free_space(inode, num_pages);
3233 	if (ret)
3234 		goto out_put;
3235 
3236 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3237 					      num_pages, num_pages,
3238 					      &alloc_hint);
3239 	if (!ret)
3240 		dcs = BTRFS_DC_SETUP;
3241 	btrfs_free_reserved_data_space(inode, num_pages);
3242 
3243 out_put:
3244 	iput(inode);
3245 out_free:
3246 	btrfs_release_path(path);
3247 out:
3248 	spin_lock(&block_group->lock);
3249 	if (!ret && dcs == BTRFS_DC_SETUP)
3250 		block_group->cache_generation = trans->transid;
3251 	block_group->disk_cache_state = dcs;
3252 	spin_unlock(&block_group->lock);
3253 
3254 	return ret;
3255 }
3256 
3257 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3258 				   struct btrfs_root *root)
3259 {
3260 	struct btrfs_block_group_cache *cache;
3261 	int err = 0;
3262 	struct btrfs_path *path;
3263 	u64 last = 0;
3264 
3265 	path = btrfs_alloc_path();
3266 	if (!path)
3267 		return -ENOMEM;
3268 
3269 again:
3270 	while (1) {
3271 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3272 		while (cache) {
3273 			if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3274 				break;
3275 			cache = next_block_group(root, cache);
3276 		}
3277 		if (!cache) {
3278 			if (last == 0)
3279 				break;
3280 			last = 0;
3281 			continue;
3282 		}
3283 		err = cache_save_setup(cache, trans, path);
3284 		last = cache->key.objectid + cache->key.offset;
3285 		btrfs_put_block_group(cache);
3286 	}
3287 
3288 	while (1) {
3289 		if (last == 0) {
3290 			err = btrfs_run_delayed_refs(trans, root,
3291 						     (unsigned long)-1);
3292 			if (err) /* File system offline */
3293 				goto out;
3294 		}
3295 
3296 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3297 		while (cache) {
3298 			if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3299 				btrfs_put_block_group(cache);
3300 				goto again;
3301 			}
3302 
3303 			if (cache->dirty)
3304 				break;
3305 			cache = next_block_group(root, cache);
3306 		}
3307 		if (!cache) {
3308 			if (last == 0)
3309 				break;
3310 			last = 0;
3311 			continue;
3312 		}
3313 
3314 		if (cache->disk_cache_state == BTRFS_DC_SETUP)
3315 			cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3316 		cache->dirty = 0;
3317 		last = cache->key.objectid + cache->key.offset;
3318 
3319 		err = write_one_cache_group(trans, root, path, cache);
3320 		if (err) /* File system offline */
3321 			goto out;
3322 
3323 		btrfs_put_block_group(cache);
3324 	}
3325 
3326 	while (1) {
3327 		/*
3328 		 * I don't think this is needed since we're just marking our
3329 		 * preallocated extent as written, but just in case it can't
3330 		 * hurt.
3331 		 */
3332 		if (last == 0) {
3333 			err = btrfs_run_delayed_refs(trans, root,
3334 						     (unsigned long)-1);
3335 			if (err) /* File system offline */
3336 				goto out;
3337 		}
3338 
3339 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
3340 		while (cache) {
3341 			/*
3342 			 * Really this shouldn't happen, but it could if we
3343 			 * couldn't write the entire preallocated extent and
3344 			 * splitting the extent resulted in a new block.
3345 			 */
3346 			if (cache->dirty) {
3347 				btrfs_put_block_group(cache);
3348 				goto again;
3349 			}
3350 			if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3351 				break;
3352 			cache = next_block_group(root, cache);
3353 		}
3354 		if (!cache) {
3355 			if (last == 0)
3356 				break;
3357 			last = 0;
3358 			continue;
3359 		}
3360 
3361 		err = btrfs_write_out_cache(root, trans, cache, path);
3362 
3363 		/*
3364 		 * If we didn't have an error then the cache state is still
3365 		 * NEED_WRITE, so we can set it to WRITTEN.
3366 		 */
3367 		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3368 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
3369 		last = cache->key.objectid + cache->key.offset;
3370 		btrfs_put_block_group(cache);
3371 	}
3372 out:
3373 
3374 	btrfs_free_path(path);
3375 	return err;
3376 }
3377 
3378 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3379 {
3380 	struct btrfs_block_group_cache *block_group;
3381 	int readonly = 0;
3382 
3383 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3384 	if (!block_group || block_group->ro)
3385 		readonly = 1;
3386 	if (block_group)
3387 		btrfs_put_block_group(block_group);
3388 	return readonly;
3389 }
3390 
3391 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3392 			     u64 total_bytes, u64 bytes_used,
3393 			     struct btrfs_space_info **space_info)
3394 {
3395 	struct btrfs_space_info *found;
3396 	int i;
3397 	int factor;
3398 	int ret;
3399 
3400 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3401 		     BTRFS_BLOCK_GROUP_RAID10))
3402 		factor = 2;
3403 	else
3404 		factor = 1;
3405 
3406 	found = __find_space_info(info, flags);
3407 	if (found) {
3408 		spin_lock(&found->lock);
3409 		found->total_bytes += total_bytes;
3410 		found->disk_total += total_bytes * factor;
3411 		found->bytes_used += bytes_used;
3412 		found->disk_used += bytes_used * factor;
3413 		found->full = 0;
3414 		spin_unlock(&found->lock);
3415 		*space_info = found;
3416 		return 0;
3417 	}
3418 	found = kzalloc(sizeof(*found), GFP_NOFS);
3419 	if (!found)
3420 		return -ENOMEM;
3421 
3422 	ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3423 	if (ret) {
3424 		kfree(found);
3425 		return ret;
3426 	}
3427 
3428 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3429 		INIT_LIST_HEAD(&found->block_groups[i]);
3430 	init_rwsem(&found->groups_sem);
3431 	spin_lock_init(&found->lock);
3432 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3433 	found->total_bytes = total_bytes;
3434 	found->disk_total = total_bytes * factor;
3435 	found->bytes_used = bytes_used;
3436 	found->disk_used = bytes_used * factor;
3437 	found->bytes_pinned = 0;
3438 	found->bytes_reserved = 0;
3439 	found->bytes_readonly = 0;
3440 	found->bytes_may_use = 0;
3441 	found->full = 0;
3442 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3443 	found->chunk_alloc = 0;
3444 	found->flush = 0;
3445 	init_waitqueue_head(&found->wait);
3446 	*space_info = found;
3447 	list_add_rcu(&found->list, &info->space_info);
3448 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3449 		info->data_sinfo = found;
3450 	return 0;
3451 }
3452 
3453 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3454 {
3455 	u64 extra_flags = chunk_to_extended(flags) &
3456 				BTRFS_EXTENDED_PROFILE_MASK;
3457 
3458 	write_seqlock(&fs_info->profiles_lock);
3459 	if (flags & BTRFS_BLOCK_GROUP_DATA)
3460 		fs_info->avail_data_alloc_bits |= extra_flags;
3461 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
3462 		fs_info->avail_metadata_alloc_bits |= extra_flags;
3463 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3464 		fs_info->avail_system_alloc_bits |= extra_flags;
3465 	write_sequnlock(&fs_info->profiles_lock);
3466 }
3467 
3468 /*
3469  * returns target flags in extended format or 0 if restripe for this
3470  * chunk_type is not in progress
3471  *
3472  * should be called with either volume_mutex or balance_lock held
3473  */
3474 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3475 {
3476 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3477 	u64 target = 0;
3478 
3479 	if (!bctl)
3480 		return 0;
3481 
3482 	if (flags & BTRFS_BLOCK_GROUP_DATA &&
3483 	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3484 		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3485 	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3486 		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3487 		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3488 	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3489 		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3490 		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3491 	}
3492 
3493 	return target;
3494 }
3495 
3496 /*
3497  * @flags: available profiles in extended format (see ctree.h)
3498  *
3499  * Returns reduced profile in chunk format.  If profile changing is in
3500  * progress (either running or paused) picks the target profile (if it's
3501  * already available), otherwise falls back to plain reducing.
3502  */
3503 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3504 {
3505 	/*
3506 	 * we add in the count of missing devices because we want
3507 	 * to make sure that any RAID levels on a degraded FS
3508 	 * continue to be honored.
3509 	 */
3510 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
3511 		root->fs_info->fs_devices->missing_devices;
3512 	u64 target;
3513 	u64 tmp;
3514 
3515 	/*
3516 	 * see if restripe for this chunk_type is in progress, if so
3517 	 * try to reduce to the target profile
3518 	 */
3519 	spin_lock(&root->fs_info->balance_lock);
3520 	target = get_restripe_target(root->fs_info, flags);
3521 	if (target) {
3522 		/* pick target profile only if it's already available */
3523 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3524 			spin_unlock(&root->fs_info->balance_lock);
3525 			return extended_to_chunk(target);
3526 		}
3527 	}
3528 	spin_unlock(&root->fs_info->balance_lock);
3529 
3530 	/* First, mask out the RAID levels which aren't possible */
3531 	if (num_devices == 1)
3532 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3533 			   BTRFS_BLOCK_GROUP_RAID5);
3534 	if (num_devices < 3)
3535 		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3536 	if (num_devices < 4)
3537 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3538 
3539 	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3540 		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3541 		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3542 	flags &= ~tmp;
3543 
3544 	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3545 		tmp = BTRFS_BLOCK_GROUP_RAID6;
3546 	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3547 		tmp = BTRFS_BLOCK_GROUP_RAID5;
3548 	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3549 		tmp = BTRFS_BLOCK_GROUP_RAID10;
3550 	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3551 		tmp = BTRFS_BLOCK_GROUP_RAID1;
3552 	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3553 		tmp = BTRFS_BLOCK_GROUP_RAID0;
3554 
3555 	return extended_to_chunk(flags | tmp);
3556 }
3557 
3558 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3559 {
3560 	unsigned seq;
3561 
3562 	do {
3563 		seq = read_seqbegin(&root->fs_info->profiles_lock);
3564 
3565 		if (flags & BTRFS_BLOCK_GROUP_DATA)
3566 			flags |= root->fs_info->avail_data_alloc_bits;
3567 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3568 			flags |= root->fs_info->avail_system_alloc_bits;
3569 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3570 			flags |= root->fs_info->avail_metadata_alloc_bits;
3571 	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
3572 
3573 	return btrfs_reduce_alloc_profile(root, flags);
3574 }
3575 
3576 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3577 {
3578 	u64 flags;
3579 	u64 ret;
3580 
3581 	if (data)
3582 		flags = BTRFS_BLOCK_GROUP_DATA;
3583 	else if (root == root->fs_info->chunk_root)
3584 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
3585 	else
3586 		flags = BTRFS_BLOCK_GROUP_METADATA;
3587 
3588 	ret = get_alloc_profile(root, flags);
3589 	return ret;
3590 }
3591 
3592 /*
3593  * This will check the space that the inode allocates from to make sure we have
3594  * enough space for bytes.
3595  */
3596 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3597 {
3598 	struct btrfs_space_info *data_sinfo;
3599 	struct btrfs_root *root = BTRFS_I(inode)->root;
3600 	struct btrfs_fs_info *fs_info = root->fs_info;
3601 	u64 used;
3602 	int ret = 0, committed = 0, alloc_chunk = 1;
3603 
3604 	/* make sure bytes are sectorsize aligned */
3605 	bytes = ALIGN(bytes, root->sectorsize);
3606 
3607 	if (root == root->fs_info->tree_root ||
3608 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3609 		alloc_chunk = 0;
3610 		committed = 1;
3611 	}
3612 
3613 	data_sinfo = fs_info->data_sinfo;
3614 	if (!data_sinfo)
3615 		goto alloc;
3616 
3617 again:
3618 	/* make sure we have enough space to handle the data first */
3619 	spin_lock(&data_sinfo->lock);
3620 	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3621 		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3622 		data_sinfo->bytes_may_use;
3623 
3624 	if (used + bytes > data_sinfo->total_bytes) {
3625 		struct btrfs_trans_handle *trans;
3626 
3627 		/*
3628 		 * if we don't have enough free bytes in this space then we need
3629 		 * to alloc a new chunk.
3630 		 */
3631 		if (!data_sinfo->full && alloc_chunk) {
3632 			u64 alloc_target;
3633 
3634 			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3635 			spin_unlock(&data_sinfo->lock);
3636 alloc:
3637 			alloc_target = btrfs_get_alloc_profile(root, 1);
3638 			trans = btrfs_join_transaction(root);
3639 			if (IS_ERR(trans))
3640 				return PTR_ERR(trans);
3641 
3642 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3643 					     alloc_target,
3644 					     CHUNK_ALLOC_NO_FORCE);
3645 			btrfs_end_transaction(trans, root);
3646 			if (ret < 0) {
3647 				if (ret != -ENOSPC)
3648 					return ret;
3649 				else
3650 					goto commit_trans;
3651 			}
3652 
3653 			if (!data_sinfo)
3654 				data_sinfo = fs_info->data_sinfo;
3655 
3656 			goto again;
3657 		}
3658 
3659 		/*
3660 		 * If we don't have enough pinned space to deal with this
3661 		 * allocation don't bother committing the transaction.
3662 		 */
3663 		if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3664 					   bytes) < 0)
3665 			committed = 1;
3666 		spin_unlock(&data_sinfo->lock);
3667 
3668 		/* commit the current transaction and try again */
3669 commit_trans:
3670 		if (!committed &&
3671 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3672 			committed = 1;
3673 
3674 			trans = btrfs_join_transaction(root);
3675 			if (IS_ERR(trans))
3676 				return PTR_ERR(trans);
3677 			ret = btrfs_commit_transaction(trans, root);
3678 			if (ret)
3679 				return ret;
3680 			goto again;
3681 		}
3682 
3683 		return -ENOSPC;
3684 	}
3685 	data_sinfo->bytes_may_use += bytes;
3686 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3687 				      data_sinfo->flags, bytes, 1);
3688 	spin_unlock(&data_sinfo->lock);
3689 
3690 	return 0;
3691 }
3692 
3693 /*
3694  * Called if we need to clear a data reservation for this inode.
3695  */
3696 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3697 {
3698 	struct btrfs_root *root = BTRFS_I(inode)->root;
3699 	struct btrfs_space_info *data_sinfo;
3700 
3701 	/* make sure bytes are sectorsize aligned */
3702 	bytes = ALIGN(bytes, root->sectorsize);
3703 
3704 	data_sinfo = root->fs_info->data_sinfo;
3705 	spin_lock(&data_sinfo->lock);
3706 	WARN_ON(data_sinfo->bytes_may_use < bytes);
3707 	data_sinfo->bytes_may_use -= bytes;
3708 	trace_btrfs_space_reservation(root->fs_info, "space_info",
3709 				      data_sinfo->flags, bytes, 0);
3710 	spin_unlock(&data_sinfo->lock);
3711 }
3712 
3713 static void force_metadata_allocation(struct btrfs_fs_info *info)
3714 {
3715 	struct list_head *head = &info->space_info;
3716 	struct btrfs_space_info *found;
3717 
3718 	rcu_read_lock();
3719 	list_for_each_entry_rcu(found, head, list) {
3720 		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3721 			found->force_alloc = CHUNK_ALLOC_FORCE;
3722 	}
3723 	rcu_read_unlock();
3724 }
3725 
3726 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
3727 {
3728 	return (global->size << 1);
3729 }
3730 
3731 static int should_alloc_chunk(struct btrfs_root *root,
3732 			      struct btrfs_space_info *sinfo, int force)
3733 {
3734 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3735 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3736 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3737 	u64 thresh;
3738 
3739 	if (force == CHUNK_ALLOC_FORCE)
3740 		return 1;
3741 
3742 	/*
3743 	 * We need to take into account the global rsv because for all intents
3744 	 * and purposes it's used space.  Don't worry about locking the
3745 	 * global_rsv, it doesn't change except when the transaction commits.
3746 	 */
3747 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3748 		num_allocated += calc_global_rsv_need_space(global_rsv);
3749 
3750 	/*
3751 	 * in limited mode, we want to have some free space up to
3752 	 * about 1% of the FS size.
3753 	 */
3754 	if (force == CHUNK_ALLOC_LIMITED) {
3755 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3756 		thresh = max_t(u64, 64 * 1024 * 1024,
3757 			       div_factor_fine(thresh, 1));
3758 
3759 		if (num_bytes - num_allocated < thresh)
3760 			return 1;
3761 	}
3762 
3763 	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3764 		return 0;
3765 	return 1;
3766 }
3767 
3768 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3769 {
3770 	u64 num_dev;
3771 
3772 	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3773 		    BTRFS_BLOCK_GROUP_RAID0 |
3774 		    BTRFS_BLOCK_GROUP_RAID5 |
3775 		    BTRFS_BLOCK_GROUP_RAID6))
3776 		num_dev = root->fs_info->fs_devices->rw_devices;
3777 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
3778 		num_dev = 2;
3779 	else
3780 		num_dev = 1;	/* DUP or single */
3781 
3782 	/* metadata for updaing devices and chunk tree */
3783 	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3784 }
3785 
3786 static void check_system_chunk(struct btrfs_trans_handle *trans,
3787 			       struct btrfs_root *root, u64 type)
3788 {
3789 	struct btrfs_space_info *info;
3790 	u64 left;
3791 	u64 thresh;
3792 
3793 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3794 	spin_lock(&info->lock);
3795 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3796 		info->bytes_reserved - info->bytes_readonly;
3797 	spin_unlock(&info->lock);
3798 
3799 	thresh = get_system_chunk_thresh(root, type);
3800 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3801 		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
3802 			left, thresh, type);
3803 		dump_space_info(info, 0, 0);
3804 	}
3805 
3806 	if (left < thresh) {
3807 		u64 flags;
3808 
3809 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3810 		btrfs_alloc_chunk(trans, root, flags);
3811 	}
3812 }
3813 
3814 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3815 			  struct btrfs_root *extent_root, u64 flags, int force)
3816 {
3817 	struct btrfs_space_info *space_info;
3818 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
3819 	int wait_for_alloc = 0;
3820 	int ret = 0;
3821 
3822 	/* Don't re-enter if we're already allocating a chunk */
3823 	if (trans->allocating_chunk)
3824 		return -ENOSPC;
3825 
3826 	space_info = __find_space_info(extent_root->fs_info, flags);
3827 	if (!space_info) {
3828 		ret = update_space_info(extent_root->fs_info, flags,
3829 					0, 0, &space_info);
3830 		BUG_ON(ret); /* -ENOMEM */
3831 	}
3832 	BUG_ON(!space_info); /* Logic error */
3833 
3834 again:
3835 	spin_lock(&space_info->lock);
3836 	if (force < space_info->force_alloc)
3837 		force = space_info->force_alloc;
3838 	if (space_info->full) {
3839 		if (should_alloc_chunk(extent_root, space_info, force))
3840 			ret = -ENOSPC;
3841 		else
3842 			ret = 0;
3843 		spin_unlock(&space_info->lock);
3844 		return ret;
3845 	}
3846 
3847 	if (!should_alloc_chunk(extent_root, space_info, force)) {
3848 		spin_unlock(&space_info->lock);
3849 		return 0;
3850 	} else if (space_info->chunk_alloc) {
3851 		wait_for_alloc = 1;
3852 	} else {
3853 		space_info->chunk_alloc = 1;
3854 	}
3855 
3856 	spin_unlock(&space_info->lock);
3857 
3858 	mutex_lock(&fs_info->chunk_mutex);
3859 
3860 	/*
3861 	 * The chunk_mutex is held throughout the entirety of a chunk
3862 	 * allocation, so once we've acquired the chunk_mutex we know that the
3863 	 * other guy is done and we need to recheck and see if we should
3864 	 * allocate.
3865 	 */
3866 	if (wait_for_alloc) {
3867 		mutex_unlock(&fs_info->chunk_mutex);
3868 		wait_for_alloc = 0;
3869 		goto again;
3870 	}
3871 
3872 	trans->allocating_chunk = true;
3873 
3874 	/*
3875 	 * If we have mixed data/metadata chunks we want to make sure we keep
3876 	 * allocating mixed chunks instead of individual chunks.
3877 	 */
3878 	if (btrfs_mixed_space_info(space_info))
3879 		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3880 
3881 	/*
3882 	 * if we're doing a data chunk, go ahead and make sure that
3883 	 * we keep a reasonable number of metadata chunks allocated in the
3884 	 * FS as well.
3885 	 */
3886 	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3887 		fs_info->data_chunk_allocations++;
3888 		if (!(fs_info->data_chunk_allocations %
3889 		      fs_info->metadata_ratio))
3890 			force_metadata_allocation(fs_info);
3891 	}
3892 
3893 	/*
3894 	 * Check if we have enough space in SYSTEM chunk because we may need
3895 	 * to update devices.
3896 	 */
3897 	check_system_chunk(trans, extent_root, flags);
3898 
3899 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
3900 	trans->allocating_chunk = false;
3901 
3902 	spin_lock(&space_info->lock);
3903 	if (ret < 0 && ret != -ENOSPC)
3904 		goto out;
3905 	if (ret)
3906 		space_info->full = 1;
3907 	else
3908 		ret = 1;
3909 
3910 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3911 out:
3912 	space_info->chunk_alloc = 0;
3913 	spin_unlock(&space_info->lock);
3914 	mutex_unlock(&fs_info->chunk_mutex);
3915 	return ret;
3916 }
3917 
3918 static int can_overcommit(struct btrfs_root *root,
3919 			  struct btrfs_space_info *space_info, u64 bytes,
3920 			  enum btrfs_reserve_flush_enum flush)
3921 {
3922 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3923 	u64 profile = btrfs_get_alloc_profile(root, 0);
3924 	u64 space_size;
3925 	u64 avail;
3926 	u64 used;
3927 	u64 to_add;
3928 
3929 	used = space_info->bytes_used + space_info->bytes_reserved +
3930 		space_info->bytes_pinned + space_info->bytes_readonly;
3931 
3932 	/*
3933 	 * We only want to allow over committing if we have lots of actual space
3934 	 * free, but if we don't have enough space to handle the global reserve
3935 	 * space then we could end up having a real enospc problem when trying
3936 	 * to allocate a chunk or some other such important allocation.
3937 	 */
3938 	spin_lock(&global_rsv->lock);
3939 	space_size = calc_global_rsv_need_space(global_rsv);
3940 	spin_unlock(&global_rsv->lock);
3941 	if (used + space_size >= space_info->total_bytes)
3942 		return 0;
3943 
3944 	used += space_info->bytes_may_use;
3945 
3946 	spin_lock(&root->fs_info->free_chunk_lock);
3947 	avail = root->fs_info->free_chunk_space;
3948 	spin_unlock(&root->fs_info->free_chunk_lock);
3949 
3950 	/*
3951 	 * If we have dup, raid1 or raid10 then only half of the free
3952 	 * space is actually useable.  For raid56, the space info used
3953 	 * doesn't include the parity drive, so we don't have to
3954 	 * change the math
3955 	 */
3956 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
3957 		       BTRFS_BLOCK_GROUP_RAID1 |
3958 		       BTRFS_BLOCK_GROUP_RAID10))
3959 		avail >>= 1;
3960 
3961 	to_add = space_info->total_bytes;
3962 
3963 	/*
3964 	 * If we aren't flushing all things, let us overcommit up to
3965 	 * 1/2th of the space. If we can flush, don't let us overcommit
3966 	 * too much, let it overcommit up to 1/8 of the space.
3967 	 */
3968 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
3969 		to_add >>= 3;
3970 	else
3971 		to_add >>= 1;
3972 
3973 	/*
3974 	 * Limit the overcommit to the amount of free space we could possibly
3975 	 * allocate for chunks.
3976 	 */
3977 	to_add = min(avail, to_add);
3978 
3979 	if (used + bytes < space_info->total_bytes + to_add)
3980 		return 1;
3981 	return 0;
3982 }
3983 
3984 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3985 					 unsigned long nr_pages)
3986 {
3987 	struct super_block *sb = root->fs_info->sb;
3988 
3989 	if (down_read_trylock(&sb->s_umount)) {
3990 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3991 		up_read(&sb->s_umount);
3992 	} else {
3993 		/*
3994 		 * We needn't worry the filesystem going from r/w to r/o though
3995 		 * we don't acquire ->s_umount mutex, because the filesystem
3996 		 * should guarantee the delalloc inodes list be empty after
3997 		 * the filesystem is readonly(all dirty pages are written to
3998 		 * the disk).
3999 		 */
4000 		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
4001 		if (!current->journal_info)
4002 			btrfs_wait_all_ordered_extents(root->fs_info, 0);
4003 	}
4004 }
4005 
4006 /*
4007  * shrink metadata reservation for delalloc
4008  */
4009 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4010 			    bool wait_ordered)
4011 {
4012 	struct btrfs_block_rsv *block_rsv;
4013 	struct btrfs_space_info *space_info;
4014 	struct btrfs_trans_handle *trans;
4015 	u64 delalloc_bytes;
4016 	u64 max_reclaim;
4017 	long time_left;
4018 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
4019 	int loops = 0;
4020 	enum btrfs_reserve_flush_enum flush;
4021 
4022 	trans = (struct btrfs_trans_handle *)current->journal_info;
4023 	block_rsv = &root->fs_info->delalloc_block_rsv;
4024 	space_info = block_rsv->space_info;
4025 
4026 	smp_mb();
4027 	delalloc_bytes = percpu_counter_sum_positive(
4028 						&root->fs_info->delalloc_bytes);
4029 	if (delalloc_bytes == 0) {
4030 		if (trans)
4031 			return;
4032 		btrfs_wait_all_ordered_extents(root->fs_info, 0);
4033 		return;
4034 	}
4035 
4036 	while (delalloc_bytes && loops < 3) {
4037 		max_reclaim = min(delalloc_bytes, to_reclaim);
4038 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4039 		btrfs_writeback_inodes_sb_nr(root, nr_pages);
4040 		/*
4041 		 * We need to wait for the async pages to actually start before
4042 		 * we do anything.
4043 		 */
4044 		wait_event(root->fs_info->async_submit_wait,
4045 			   !atomic_read(&root->fs_info->async_delalloc_pages));
4046 
4047 		if (!trans)
4048 			flush = BTRFS_RESERVE_FLUSH_ALL;
4049 		else
4050 			flush = BTRFS_RESERVE_NO_FLUSH;
4051 		spin_lock(&space_info->lock);
4052 		if (can_overcommit(root, space_info, orig, flush)) {
4053 			spin_unlock(&space_info->lock);
4054 			break;
4055 		}
4056 		spin_unlock(&space_info->lock);
4057 
4058 		loops++;
4059 		if (wait_ordered && !trans) {
4060 			btrfs_wait_all_ordered_extents(root->fs_info, 0);
4061 		} else {
4062 			time_left = schedule_timeout_killable(1);
4063 			if (time_left)
4064 				break;
4065 		}
4066 		smp_mb();
4067 		delalloc_bytes = percpu_counter_sum_positive(
4068 						&root->fs_info->delalloc_bytes);
4069 	}
4070 }
4071 
4072 /**
4073  * maybe_commit_transaction - possibly commit the transaction if its ok to
4074  * @root - the root we're allocating for
4075  * @bytes - the number of bytes we want to reserve
4076  * @force - force the commit
4077  *
4078  * This will check to make sure that committing the transaction will actually
4079  * get us somewhere and then commit the transaction if it does.  Otherwise it
4080  * will return -ENOSPC.
4081  */
4082 static int may_commit_transaction(struct btrfs_root *root,
4083 				  struct btrfs_space_info *space_info,
4084 				  u64 bytes, int force)
4085 {
4086 	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4087 	struct btrfs_trans_handle *trans;
4088 
4089 	trans = (struct btrfs_trans_handle *)current->journal_info;
4090 	if (trans)
4091 		return -EAGAIN;
4092 
4093 	if (force)
4094 		goto commit;
4095 
4096 	/* See if there is enough pinned space to make this reservation */
4097 	spin_lock(&space_info->lock);
4098 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4099 				   bytes) >= 0) {
4100 		spin_unlock(&space_info->lock);
4101 		goto commit;
4102 	}
4103 	spin_unlock(&space_info->lock);
4104 
4105 	/*
4106 	 * See if there is some space in the delayed insertion reservation for
4107 	 * this reservation.
4108 	 */
4109 	if (space_info != delayed_rsv->space_info)
4110 		return -ENOSPC;
4111 
4112 	spin_lock(&space_info->lock);
4113 	spin_lock(&delayed_rsv->lock);
4114 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4115 				   bytes - delayed_rsv->size) >= 0) {
4116 		spin_unlock(&delayed_rsv->lock);
4117 		spin_unlock(&space_info->lock);
4118 		return -ENOSPC;
4119 	}
4120 	spin_unlock(&delayed_rsv->lock);
4121 	spin_unlock(&space_info->lock);
4122 
4123 commit:
4124 	trans = btrfs_join_transaction(root);
4125 	if (IS_ERR(trans))
4126 		return -ENOSPC;
4127 
4128 	return btrfs_commit_transaction(trans, root);
4129 }
4130 
4131 enum flush_state {
4132 	FLUSH_DELAYED_ITEMS_NR	=	1,
4133 	FLUSH_DELAYED_ITEMS	=	2,
4134 	FLUSH_DELALLOC		=	3,
4135 	FLUSH_DELALLOC_WAIT	=	4,
4136 	ALLOC_CHUNK		=	5,
4137 	COMMIT_TRANS		=	6,
4138 };
4139 
4140 static int flush_space(struct btrfs_root *root,
4141 		       struct btrfs_space_info *space_info, u64 num_bytes,
4142 		       u64 orig_bytes, int state)
4143 {
4144 	struct btrfs_trans_handle *trans;
4145 	int nr;
4146 	int ret = 0;
4147 
4148 	switch (state) {
4149 	case FLUSH_DELAYED_ITEMS_NR:
4150 	case FLUSH_DELAYED_ITEMS:
4151 		if (state == FLUSH_DELAYED_ITEMS_NR) {
4152 			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
4153 
4154 			nr = (int)div64_u64(num_bytes, bytes);
4155 			if (!nr)
4156 				nr = 1;
4157 			nr *= 2;
4158 		} else {
4159 			nr = -1;
4160 		}
4161 		trans = btrfs_join_transaction(root);
4162 		if (IS_ERR(trans)) {
4163 			ret = PTR_ERR(trans);
4164 			break;
4165 		}
4166 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
4167 		btrfs_end_transaction(trans, root);
4168 		break;
4169 	case FLUSH_DELALLOC:
4170 	case FLUSH_DELALLOC_WAIT:
4171 		shrink_delalloc(root, num_bytes, orig_bytes,
4172 				state == FLUSH_DELALLOC_WAIT);
4173 		break;
4174 	case ALLOC_CHUNK:
4175 		trans = btrfs_join_transaction(root);
4176 		if (IS_ERR(trans)) {
4177 			ret = PTR_ERR(trans);
4178 			break;
4179 		}
4180 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4181 				     btrfs_get_alloc_profile(root, 0),
4182 				     CHUNK_ALLOC_NO_FORCE);
4183 		btrfs_end_transaction(trans, root);
4184 		if (ret == -ENOSPC)
4185 			ret = 0;
4186 		break;
4187 	case COMMIT_TRANS:
4188 		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4189 		break;
4190 	default:
4191 		ret = -ENOSPC;
4192 		break;
4193 	}
4194 
4195 	return ret;
4196 }
4197 /**
4198  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4199  * @root - the root we're allocating for
4200  * @block_rsv - the block_rsv we're allocating for
4201  * @orig_bytes - the number of bytes we want
4202  * @flush - whether or not we can flush to make our reservation
4203  *
4204  * This will reserve orgi_bytes number of bytes from the space info associated
4205  * with the block_rsv.  If there is not enough space it will make an attempt to
4206  * flush out space to make room.  It will do this by flushing delalloc if
4207  * possible or committing the transaction.  If flush is 0 then no attempts to
4208  * regain reservations will be made and this will fail if there is not enough
4209  * space already.
4210  */
4211 static int reserve_metadata_bytes(struct btrfs_root *root,
4212 				  struct btrfs_block_rsv *block_rsv,
4213 				  u64 orig_bytes,
4214 				  enum btrfs_reserve_flush_enum flush)
4215 {
4216 	struct btrfs_space_info *space_info = block_rsv->space_info;
4217 	u64 used;
4218 	u64 num_bytes = orig_bytes;
4219 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
4220 	int ret = 0;
4221 	bool flushing = false;
4222 
4223 again:
4224 	ret = 0;
4225 	spin_lock(&space_info->lock);
4226 	/*
4227 	 * We only want to wait if somebody other than us is flushing and we
4228 	 * are actually allowed to flush all things.
4229 	 */
4230 	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4231 	       space_info->flush) {
4232 		spin_unlock(&space_info->lock);
4233 		/*
4234 		 * If we have a trans handle we can't wait because the flusher
4235 		 * may have to commit the transaction, which would mean we would
4236 		 * deadlock since we are waiting for the flusher to finish, but
4237 		 * hold the current transaction open.
4238 		 */
4239 		if (current->journal_info)
4240 			return -EAGAIN;
4241 		ret = wait_event_killable(space_info->wait, !space_info->flush);
4242 		/* Must have been killed, return */
4243 		if (ret)
4244 			return -EINTR;
4245 
4246 		spin_lock(&space_info->lock);
4247 	}
4248 
4249 	ret = -ENOSPC;
4250 	used = space_info->bytes_used + space_info->bytes_reserved +
4251 		space_info->bytes_pinned + space_info->bytes_readonly +
4252 		space_info->bytes_may_use;
4253 
4254 	/*
4255 	 * The idea here is that we've not already over-reserved the block group
4256 	 * then we can go ahead and save our reservation first and then start
4257 	 * flushing if we need to.  Otherwise if we've already overcommitted
4258 	 * lets start flushing stuff first and then come back and try to make
4259 	 * our reservation.
4260 	 */
4261 	if (used <= space_info->total_bytes) {
4262 		if (used + orig_bytes <= space_info->total_bytes) {
4263 			space_info->bytes_may_use += orig_bytes;
4264 			trace_btrfs_space_reservation(root->fs_info,
4265 				"space_info", space_info->flags, orig_bytes, 1);
4266 			ret = 0;
4267 		} else {
4268 			/*
4269 			 * Ok set num_bytes to orig_bytes since we aren't
4270 			 * overocmmitted, this way we only try and reclaim what
4271 			 * we need.
4272 			 */
4273 			num_bytes = orig_bytes;
4274 		}
4275 	} else {
4276 		/*
4277 		 * Ok we're over committed, set num_bytes to the overcommitted
4278 		 * amount plus the amount of bytes that we need for this
4279 		 * reservation.
4280 		 */
4281 		num_bytes = used - space_info->total_bytes +
4282 			(orig_bytes * 2);
4283 	}
4284 
4285 	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4286 		space_info->bytes_may_use += orig_bytes;
4287 		trace_btrfs_space_reservation(root->fs_info, "space_info",
4288 					      space_info->flags, orig_bytes,
4289 					      1);
4290 		ret = 0;
4291 	}
4292 
4293 	/*
4294 	 * Couldn't make our reservation, save our place so while we're trying
4295 	 * to reclaim space we can actually use it instead of somebody else
4296 	 * stealing it from us.
4297 	 *
4298 	 * We make the other tasks wait for the flush only when we can flush
4299 	 * all things.
4300 	 */
4301 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4302 		flushing = true;
4303 		space_info->flush = 1;
4304 	}
4305 
4306 	spin_unlock(&space_info->lock);
4307 
4308 	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4309 		goto out;
4310 
4311 	ret = flush_space(root, space_info, num_bytes, orig_bytes,
4312 			  flush_state);
4313 	flush_state++;
4314 
4315 	/*
4316 	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4317 	 * would happen. So skip delalloc flush.
4318 	 */
4319 	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4320 	    (flush_state == FLUSH_DELALLOC ||
4321 	     flush_state == FLUSH_DELALLOC_WAIT))
4322 		flush_state = ALLOC_CHUNK;
4323 
4324 	if (!ret)
4325 		goto again;
4326 	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4327 		 flush_state < COMMIT_TRANS)
4328 		goto again;
4329 	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4330 		 flush_state <= COMMIT_TRANS)
4331 		goto again;
4332 
4333 out:
4334 	if (ret == -ENOSPC &&
4335 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4336 		struct btrfs_block_rsv *global_rsv =
4337 			&root->fs_info->global_block_rsv;
4338 
4339 		if (block_rsv != global_rsv &&
4340 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
4341 			ret = 0;
4342 	}
4343 	if (flushing) {
4344 		spin_lock(&space_info->lock);
4345 		space_info->flush = 0;
4346 		wake_up_all(&space_info->wait);
4347 		spin_unlock(&space_info->lock);
4348 	}
4349 	return ret;
4350 }
4351 
4352 static struct btrfs_block_rsv *get_block_rsv(
4353 					const struct btrfs_trans_handle *trans,
4354 					const struct btrfs_root *root)
4355 {
4356 	struct btrfs_block_rsv *block_rsv = NULL;
4357 
4358 	if (root->ref_cows)
4359 		block_rsv = trans->block_rsv;
4360 
4361 	if (root == root->fs_info->csum_root && trans->adding_csums)
4362 		block_rsv = trans->block_rsv;
4363 
4364 	if (root == root->fs_info->uuid_root)
4365 		block_rsv = trans->block_rsv;
4366 
4367 	if (!block_rsv)
4368 		block_rsv = root->block_rsv;
4369 
4370 	if (!block_rsv)
4371 		block_rsv = &root->fs_info->empty_block_rsv;
4372 
4373 	return block_rsv;
4374 }
4375 
4376 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4377 			       u64 num_bytes)
4378 {
4379 	int ret = -ENOSPC;
4380 	spin_lock(&block_rsv->lock);
4381 	if (block_rsv->reserved >= num_bytes) {
4382 		block_rsv->reserved -= num_bytes;
4383 		if (block_rsv->reserved < block_rsv->size)
4384 			block_rsv->full = 0;
4385 		ret = 0;
4386 	}
4387 	spin_unlock(&block_rsv->lock);
4388 	return ret;
4389 }
4390 
4391 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4392 				u64 num_bytes, int update_size)
4393 {
4394 	spin_lock(&block_rsv->lock);
4395 	block_rsv->reserved += num_bytes;
4396 	if (update_size)
4397 		block_rsv->size += num_bytes;
4398 	else if (block_rsv->reserved >= block_rsv->size)
4399 		block_rsv->full = 1;
4400 	spin_unlock(&block_rsv->lock);
4401 }
4402 
4403 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4404 			     struct btrfs_block_rsv *dest, u64 num_bytes,
4405 			     int min_factor)
4406 {
4407 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4408 	u64 min_bytes;
4409 
4410 	if (global_rsv->space_info != dest->space_info)
4411 		return -ENOSPC;
4412 
4413 	spin_lock(&global_rsv->lock);
4414 	min_bytes = div_factor(global_rsv->size, min_factor);
4415 	if (global_rsv->reserved < min_bytes + num_bytes) {
4416 		spin_unlock(&global_rsv->lock);
4417 		return -ENOSPC;
4418 	}
4419 	global_rsv->reserved -= num_bytes;
4420 	if (global_rsv->reserved < global_rsv->size)
4421 		global_rsv->full = 0;
4422 	spin_unlock(&global_rsv->lock);
4423 
4424 	block_rsv_add_bytes(dest, num_bytes, 1);
4425 	return 0;
4426 }
4427 
4428 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4429 				    struct btrfs_block_rsv *block_rsv,
4430 				    struct btrfs_block_rsv *dest, u64 num_bytes)
4431 {
4432 	struct btrfs_space_info *space_info = block_rsv->space_info;
4433 
4434 	spin_lock(&block_rsv->lock);
4435 	if (num_bytes == (u64)-1)
4436 		num_bytes = block_rsv->size;
4437 	block_rsv->size -= num_bytes;
4438 	if (block_rsv->reserved >= block_rsv->size) {
4439 		num_bytes = block_rsv->reserved - block_rsv->size;
4440 		block_rsv->reserved = block_rsv->size;
4441 		block_rsv->full = 1;
4442 	} else {
4443 		num_bytes = 0;
4444 	}
4445 	spin_unlock(&block_rsv->lock);
4446 
4447 	if (num_bytes > 0) {
4448 		if (dest) {
4449 			spin_lock(&dest->lock);
4450 			if (!dest->full) {
4451 				u64 bytes_to_add;
4452 
4453 				bytes_to_add = dest->size - dest->reserved;
4454 				bytes_to_add = min(num_bytes, bytes_to_add);
4455 				dest->reserved += bytes_to_add;
4456 				if (dest->reserved >= dest->size)
4457 					dest->full = 1;
4458 				num_bytes -= bytes_to_add;
4459 			}
4460 			spin_unlock(&dest->lock);
4461 		}
4462 		if (num_bytes) {
4463 			spin_lock(&space_info->lock);
4464 			space_info->bytes_may_use -= num_bytes;
4465 			trace_btrfs_space_reservation(fs_info, "space_info",
4466 					space_info->flags, num_bytes, 0);
4467 			space_info->reservation_progress++;
4468 			spin_unlock(&space_info->lock);
4469 		}
4470 	}
4471 }
4472 
4473 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4474 				   struct btrfs_block_rsv *dst, u64 num_bytes)
4475 {
4476 	int ret;
4477 
4478 	ret = block_rsv_use_bytes(src, num_bytes);
4479 	if (ret)
4480 		return ret;
4481 
4482 	block_rsv_add_bytes(dst, num_bytes, 1);
4483 	return 0;
4484 }
4485 
4486 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4487 {
4488 	memset(rsv, 0, sizeof(*rsv));
4489 	spin_lock_init(&rsv->lock);
4490 	rsv->type = type;
4491 }
4492 
4493 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4494 					      unsigned short type)
4495 {
4496 	struct btrfs_block_rsv *block_rsv;
4497 	struct btrfs_fs_info *fs_info = root->fs_info;
4498 
4499 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4500 	if (!block_rsv)
4501 		return NULL;
4502 
4503 	btrfs_init_block_rsv(block_rsv, type);
4504 	block_rsv->space_info = __find_space_info(fs_info,
4505 						  BTRFS_BLOCK_GROUP_METADATA);
4506 	return block_rsv;
4507 }
4508 
4509 void btrfs_free_block_rsv(struct btrfs_root *root,
4510 			  struct btrfs_block_rsv *rsv)
4511 {
4512 	if (!rsv)
4513 		return;
4514 	btrfs_block_rsv_release(root, rsv, (u64)-1);
4515 	kfree(rsv);
4516 }
4517 
4518 int btrfs_block_rsv_add(struct btrfs_root *root,
4519 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4520 			enum btrfs_reserve_flush_enum flush)
4521 {
4522 	int ret;
4523 
4524 	if (num_bytes == 0)
4525 		return 0;
4526 
4527 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4528 	if (!ret) {
4529 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
4530 		return 0;
4531 	}
4532 
4533 	return ret;
4534 }
4535 
4536 int btrfs_block_rsv_check(struct btrfs_root *root,
4537 			  struct btrfs_block_rsv *block_rsv, int min_factor)
4538 {
4539 	u64 num_bytes = 0;
4540 	int ret = -ENOSPC;
4541 
4542 	if (!block_rsv)
4543 		return 0;
4544 
4545 	spin_lock(&block_rsv->lock);
4546 	num_bytes = div_factor(block_rsv->size, min_factor);
4547 	if (block_rsv->reserved >= num_bytes)
4548 		ret = 0;
4549 	spin_unlock(&block_rsv->lock);
4550 
4551 	return ret;
4552 }
4553 
4554 int btrfs_block_rsv_refill(struct btrfs_root *root,
4555 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4556 			   enum btrfs_reserve_flush_enum flush)
4557 {
4558 	u64 num_bytes = 0;
4559 	int ret = -ENOSPC;
4560 
4561 	if (!block_rsv)
4562 		return 0;
4563 
4564 	spin_lock(&block_rsv->lock);
4565 	num_bytes = min_reserved;
4566 	if (block_rsv->reserved >= num_bytes)
4567 		ret = 0;
4568 	else
4569 		num_bytes -= block_rsv->reserved;
4570 	spin_unlock(&block_rsv->lock);
4571 
4572 	if (!ret)
4573 		return 0;
4574 
4575 	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4576 	if (!ret) {
4577 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
4578 		return 0;
4579 	}
4580 
4581 	return ret;
4582 }
4583 
4584 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4585 			    struct btrfs_block_rsv *dst_rsv,
4586 			    u64 num_bytes)
4587 {
4588 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4589 }
4590 
4591 void btrfs_block_rsv_release(struct btrfs_root *root,
4592 			     struct btrfs_block_rsv *block_rsv,
4593 			     u64 num_bytes)
4594 {
4595 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4596 	if (global_rsv->full || global_rsv == block_rsv ||
4597 	    block_rsv->space_info != global_rsv->space_info)
4598 		global_rsv = NULL;
4599 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4600 				num_bytes);
4601 }
4602 
4603 /*
4604  * helper to calculate size of global block reservation.
4605  * the desired value is sum of space used by extent tree,
4606  * checksum tree and root tree
4607  */
4608 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4609 {
4610 	struct btrfs_space_info *sinfo;
4611 	u64 num_bytes;
4612 	u64 meta_used;
4613 	u64 data_used;
4614 	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4615 
4616 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4617 	spin_lock(&sinfo->lock);
4618 	data_used = sinfo->bytes_used;
4619 	spin_unlock(&sinfo->lock);
4620 
4621 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4622 	spin_lock(&sinfo->lock);
4623 	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4624 		data_used = 0;
4625 	meta_used = sinfo->bytes_used;
4626 	spin_unlock(&sinfo->lock);
4627 
4628 	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4629 		    csum_size * 2;
4630 	num_bytes += div64_u64(data_used + meta_used, 50);
4631 
4632 	if (num_bytes * 3 > meta_used)
4633 		num_bytes = div64_u64(meta_used, 3);
4634 
4635 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4636 }
4637 
4638 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4639 {
4640 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4641 	struct btrfs_space_info *sinfo = block_rsv->space_info;
4642 	u64 num_bytes;
4643 
4644 	num_bytes = calc_global_metadata_size(fs_info);
4645 
4646 	spin_lock(&sinfo->lock);
4647 	spin_lock(&block_rsv->lock);
4648 
4649 	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
4650 
4651 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4652 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
4653 		    sinfo->bytes_may_use;
4654 
4655 	if (sinfo->total_bytes > num_bytes) {
4656 		num_bytes = sinfo->total_bytes - num_bytes;
4657 		block_rsv->reserved += num_bytes;
4658 		sinfo->bytes_may_use += num_bytes;
4659 		trace_btrfs_space_reservation(fs_info, "space_info",
4660 				      sinfo->flags, num_bytes, 1);
4661 	}
4662 
4663 	if (block_rsv->reserved >= block_rsv->size) {
4664 		num_bytes = block_rsv->reserved - block_rsv->size;
4665 		sinfo->bytes_may_use -= num_bytes;
4666 		trace_btrfs_space_reservation(fs_info, "space_info",
4667 				      sinfo->flags, num_bytes, 0);
4668 		sinfo->reservation_progress++;
4669 		block_rsv->reserved = block_rsv->size;
4670 		block_rsv->full = 1;
4671 	}
4672 
4673 	spin_unlock(&block_rsv->lock);
4674 	spin_unlock(&sinfo->lock);
4675 }
4676 
4677 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4678 {
4679 	struct btrfs_space_info *space_info;
4680 
4681 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4682 	fs_info->chunk_block_rsv.space_info = space_info;
4683 
4684 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4685 	fs_info->global_block_rsv.space_info = space_info;
4686 	fs_info->delalloc_block_rsv.space_info = space_info;
4687 	fs_info->trans_block_rsv.space_info = space_info;
4688 	fs_info->empty_block_rsv.space_info = space_info;
4689 	fs_info->delayed_block_rsv.space_info = space_info;
4690 
4691 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4692 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4693 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4694 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4695 	if (fs_info->quota_root)
4696 		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
4697 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4698 
4699 	update_global_block_rsv(fs_info);
4700 }
4701 
4702 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4703 {
4704 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4705 				(u64)-1);
4706 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4707 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4708 	WARN_ON(fs_info->trans_block_rsv.size > 0);
4709 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4710 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
4711 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4712 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
4713 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4714 }
4715 
4716 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4717 				  struct btrfs_root *root)
4718 {
4719 	if (!trans->block_rsv)
4720 		return;
4721 
4722 	if (!trans->bytes_reserved)
4723 		return;
4724 
4725 	trace_btrfs_space_reservation(root->fs_info, "transaction",
4726 				      trans->transid, trans->bytes_reserved, 0);
4727 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4728 	trans->bytes_reserved = 0;
4729 }
4730 
4731 /* Can only return 0 or -ENOSPC */
4732 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4733 				  struct inode *inode)
4734 {
4735 	struct btrfs_root *root = BTRFS_I(inode)->root;
4736 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4737 	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4738 
4739 	/*
4740 	 * We need to hold space in order to delete our orphan item once we've
4741 	 * added it, so this takes the reservation so we can release it later
4742 	 * when we are truly done with the orphan item.
4743 	 */
4744 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4745 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4746 				      btrfs_ino(inode), num_bytes, 1);
4747 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4748 }
4749 
4750 void btrfs_orphan_release_metadata(struct inode *inode)
4751 {
4752 	struct btrfs_root *root = BTRFS_I(inode)->root;
4753 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4754 	trace_btrfs_space_reservation(root->fs_info, "orphan",
4755 				      btrfs_ino(inode), num_bytes, 0);
4756 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4757 }
4758 
4759 /*
4760  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4761  * root: the root of the parent directory
4762  * rsv: block reservation
4763  * items: the number of items that we need do reservation
4764  * qgroup_reserved: used to return the reserved size in qgroup
4765  *
4766  * This function is used to reserve the space for snapshot/subvolume
4767  * creation and deletion. Those operations are different with the
4768  * common file/directory operations, they change two fs/file trees
4769  * and root tree, the number of items that the qgroup reserves is
4770  * different with the free space reservation. So we can not use
4771  * the space reseravtion mechanism in start_transaction().
4772  */
4773 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4774 				     struct btrfs_block_rsv *rsv,
4775 				     int items,
4776 				     u64 *qgroup_reserved,
4777 				     bool use_global_rsv)
4778 {
4779 	u64 num_bytes;
4780 	int ret;
4781 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4782 
4783 	if (root->fs_info->quota_enabled) {
4784 		/* One for parent inode, two for dir entries */
4785 		num_bytes = 3 * root->leafsize;
4786 		ret = btrfs_qgroup_reserve(root, num_bytes);
4787 		if (ret)
4788 			return ret;
4789 	} else {
4790 		num_bytes = 0;
4791 	}
4792 
4793 	*qgroup_reserved = num_bytes;
4794 
4795 	num_bytes = btrfs_calc_trans_metadata_size(root, items);
4796 	rsv->space_info = __find_space_info(root->fs_info,
4797 					    BTRFS_BLOCK_GROUP_METADATA);
4798 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4799 				  BTRFS_RESERVE_FLUSH_ALL);
4800 
4801 	if (ret == -ENOSPC && use_global_rsv)
4802 		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
4803 
4804 	if (ret) {
4805 		if (*qgroup_reserved)
4806 			btrfs_qgroup_free(root, *qgroup_reserved);
4807 	}
4808 
4809 	return ret;
4810 }
4811 
4812 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4813 				      struct btrfs_block_rsv *rsv,
4814 				      u64 qgroup_reserved)
4815 {
4816 	btrfs_block_rsv_release(root, rsv, (u64)-1);
4817 	if (qgroup_reserved)
4818 		btrfs_qgroup_free(root, qgroup_reserved);
4819 }
4820 
4821 /**
4822  * drop_outstanding_extent - drop an outstanding extent
4823  * @inode: the inode we're dropping the extent for
4824  *
4825  * This is called when we are freeing up an outstanding extent, either called
4826  * after an error or after an extent is written.  This will return the number of
4827  * reserved extents that need to be freed.  This must be called with
4828  * BTRFS_I(inode)->lock held.
4829  */
4830 static unsigned drop_outstanding_extent(struct inode *inode)
4831 {
4832 	unsigned drop_inode_space = 0;
4833 	unsigned dropped_extents = 0;
4834 
4835 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4836 	BTRFS_I(inode)->outstanding_extents--;
4837 
4838 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
4839 	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4840 			       &BTRFS_I(inode)->runtime_flags))
4841 		drop_inode_space = 1;
4842 
4843 	/*
4844 	 * If we have more or the same amount of outsanding extents than we have
4845 	 * reserved then we need to leave the reserved extents count alone.
4846 	 */
4847 	if (BTRFS_I(inode)->outstanding_extents >=
4848 	    BTRFS_I(inode)->reserved_extents)
4849 		return drop_inode_space;
4850 
4851 	dropped_extents = BTRFS_I(inode)->reserved_extents -
4852 		BTRFS_I(inode)->outstanding_extents;
4853 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
4854 	return dropped_extents + drop_inode_space;
4855 }
4856 
4857 /**
4858  * calc_csum_metadata_size - return the amount of metada space that must be
4859  *	reserved/free'd for the given bytes.
4860  * @inode: the inode we're manipulating
4861  * @num_bytes: the number of bytes in question
4862  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4863  *
4864  * This adjusts the number of csum_bytes in the inode and then returns the
4865  * correct amount of metadata that must either be reserved or freed.  We
4866  * calculate how many checksums we can fit into one leaf and then divide the
4867  * number of bytes that will need to be checksumed by this value to figure out
4868  * how many checksums will be required.  If we are adding bytes then the number
4869  * may go up and we will return the number of additional bytes that must be
4870  * reserved.  If it is going down we will return the number of bytes that must
4871  * be freed.
4872  *
4873  * This must be called with BTRFS_I(inode)->lock held.
4874  */
4875 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4876 				   int reserve)
4877 {
4878 	struct btrfs_root *root = BTRFS_I(inode)->root;
4879 	u64 csum_size;
4880 	int num_csums_per_leaf;
4881 	int num_csums;
4882 	int old_csums;
4883 
4884 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4885 	    BTRFS_I(inode)->csum_bytes == 0)
4886 		return 0;
4887 
4888 	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4889 	if (reserve)
4890 		BTRFS_I(inode)->csum_bytes += num_bytes;
4891 	else
4892 		BTRFS_I(inode)->csum_bytes -= num_bytes;
4893 	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4894 	num_csums_per_leaf = (int)div64_u64(csum_size,
4895 					    sizeof(struct btrfs_csum_item) +
4896 					    sizeof(struct btrfs_disk_key));
4897 	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4898 	num_csums = num_csums + num_csums_per_leaf - 1;
4899 	num_csums = num_csums / num_csums_per_leaf;
4900 
4901 	old_csums = old_csums + num_csums_per_leaf - 1;
4902 	old_csums = old_csums / num_csums_per_leaf;
4903 
4904 	/* No change, no need to reserve more */
4905 	if (old_csums == num_csums)
4906 		return 0;
4907 
4908 	if (reserve)
4909 		return btrfs_calc_trans_metadata_size(root,
4910 						      num_csums - old_csums);
4911 
4912 	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4913 }
4914 
4915 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4916 {
4917 	struct btrfs_root *root = BTRFS_I(inode)->root;
4918 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4919 	u64 to_reserve = 0;
4920 	u64 csum_bytes;
4921 	unsigned nr_extents = 0;
4922 	int extra_reserve = 0;
4923 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4924 	int ret = 0;
4925 	bool delalloc_lock = true;
4926 	u64 to_free = 0;
4927 	unsigned dropped;
4928 
4929 	/* If we are a free space inode we need to not flush since we will be in
4930 	 * the middle of a transaction commit.  We also don't need the delalloc
4931 	 * mutex since we won't race with anybody.  We need this mostly to make
4932 	 * lockdep shut its filthy mouth.
4933 	 */
4934 	if (btrfs_is_free_space_inode(inode)) {
4935 		flush = BTRFS_RESERVE_NO_FLUSH;
4936 		delalloc_lock = false;
4937 	}
4938 
4939 	if (flush != BTRFS_RESERVE_NO_FLUSH &&
4940 	    btrfs_transaction_in_commit(root->fs_info))
4941 		schedule_timeout(1);
4942 
4943 	if (delalloc_lock)
4944 		mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4945 
4946 	num_bytes = ALIGN(num_bytes, root->sectorsize);
4947 
4948 	spin_lock(&BTRFS_I(inode)->lock);
4949 	BTRFS_I(inode)->outstanding_extents++;
4950 
4951 	if (BTRFS_I(inode)->outstanding_extents >
4952 	    BTRFS_I(inode)->reserved_extents)
4953 		nr_extents = BTRFS_I(inode)->outstanding_extents -
4954 			BTRFS_I(inode)->reserved_extents;
4955 
4956 	/*
4957 	 * Add an item to reserve for updating the inode when we complete the
4958 	 * delalloc io.
4959 	 */
4960 	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4961 		      &BTRFS_I(inode)->runtime_flags)) {
4962 		nr_extents++;
4963 		extra_reserve = 1;
4964 	}
4965 
4966 	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4967 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4968 	csum_bytes = BTRFS_I(inode)->csum_bytes;
4969 	spin_unlock(&BTRFS_I(inode)->lock);
4970 
4971 	if (root->fs_info->quota_enabled) {
4972 		ret = btrfs_qgroup_reserve(root, num_bytes +
4973 					   nr_extents * root->leafsize);
4974 		if (ret)
4975 			goto out_fail;
4976 	}
4977 
4978 	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4979 	if (unlikely(ret)) {
4980 		if (root->fs_info->quota_enabled)
4981 			btrfs_qgroup_free(root, num_bytes +
4982 						nr_extents * root->leafsize);
4983 		goto out_fail;
4984 	}
4985 
4986 	spin_lock(&BTRFS_I(inode)->lock);
4987 	if (extra_reserve) {
4988 		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4989 			&BTRFS_I(inode)->runtime_flags);
4990 		nr_extents--;
4991 	}
4992 	BTRFS_I(inode)->reserved_extents += nr_extents;
4993 	spin_unlock(&BTRFS_I(inode)->lock);
4994 
4995 	if (delalloc_lock)
4996 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4997 
4998 	if (to_reserve)
4999 		trace_btrfs_space_reservation(root->fs_info,"delalloc",
5000 					      btrfs_ino(inode), to_reserve, 1);
5001 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
5002 
5003 	return 0;
5004 
5005 out_fail:
5006 	spin_lock(&BTRFS_I(inode)->lock);
5007 	dropped = drop_outstanding_extent(inode);
5008 	/*
5009 	 * If the inodes csum_bytes is the same as the original
5010 	 * csum_bytes then we know we haven't raced with any free()ers
5011 	 * so we can just reduce our inodes csum bytes and carry on.
5012 	 */
5013 	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5014 		calc_csum_metadata_size(inode, num_bytes, 0);
5015 	} else {
5016 		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5017 		u64 bytes;
5018 
5019 		/*
5020 		 * This is tricky, but first we need to figure out how much we
5021 		 * free'd from any free-ers that occured during this
5022 		 * reservation, so we reset ->csum_bytes to the csum_bytes
5023 		 * before we dropped our lock, and then call the free for the
5024 		 * number of bytes that were freed while we were trying our
5025 		 * reservation.
5026 		 */
5027 		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5028 		BTRFS_I(inode)->csum_bytes = csum_bytes;
5029 		to_free = calc_csum_metadata_size(inode, bytes, 0);
5030 
5031 
5032 		/*
5033 		 * Now we need to see how much we would have freed had we not
5034 		 * been making this reservation and our ->csum_bytes were not
5035 		 * artificially inflated.
5036 		 */
5037 		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5038 		bytes = csum_bytes - orig_csum_bytes;
5039 		bytes = calc_csum_metadata_size(inode, bytes, 0);
5040 
5041 		/*
5042 		 * Now reset ->csum_bytes to what it should be.  If bytes is
5043 		 * more than to_free then we would have free'd more space had we
5044 		 * not had an artificially high ->csum_bytes, so we need to free
5045 		 * the remainder.  If bytes is the same or less then we don't
5046 		 * need to do anything, the other free-ers did the correct
5047 		 * thing.
5048 		 */
5049 		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5050 		if (bytes > to_free)
5051 			to_free = bytes - to_free;
5052 		else
5053 			to_free = 0;
5054 	}
5055 	spin_unlock(&BTRFS_I(inode)->lock);
5056 	if (dropped)
5057 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
5058 
5059 	if (to_free) {
5060 		btrfs_block_rsv_release(root, block_rsv, to_free);
5061 		trace_btrfs_space_reservation(root->fs_info, "delalloc",
5062 					      btrfs_ino(inode), to_free, 0);
5063 	}
5064 	if (delalloc_lock)
5065 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5066 	return ret;
5067 }
5068 
5069 /**
5070  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5071  * @inode: the inode to release the reservation for
5072  * @num_bytes: the number of bytes we're releasing
5073  *
5074  * This will release the metadata reservation for an inode.  This can be called
5075  * once we complete IO for a given set of bytes to release their metadata
5076  * reservations.
5077  */
5078 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5079 {
5080 	struct btrfs_root *root = BTRFS_I(inode)->root;
5081 	u64 to_free = 0;
5082 	unsigned dropped;
5083 
5084 	num_bytes = ALIGN(num_bytes, root->sectorsize);
5085 	spin_lock(&BTRFS_I(inode)->lock);
5086 	dropped = drop_outstanding_extent(inode);
5087 
5088 	if (num_bytes)
5089 		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5090 	spin_unlock(&BTRFS_I(inode)->lock);
5091 	if (dropped > 0)
5092 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
5093 
5094 	trace_btrfs_space_reservation(root->fs_info, "delalloc",
5095 				      btrfs_ino(inode), to_free, 0);
5096 	if (root->fs_info->quota_enabled) {
5097 		btrfs_qgroup_free(root, num_bytes +
5098 					dropped * root->leafsize);
5099 	}
5100 
5101 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5102 				to_free);
5103 }
5104 
5105 /**
5106  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5107  * @inode: inode we're writing to
5108  * @num_bytes: the number of bytes we want to allocate
5109  *
5110  * This will do the following things
5111  *
5112  * o reserve space in the data space info for num_bytes
5113  * o reserve space in the metadata space info based on number of outstanding
5114  *   extents and how much csums will be needed
5115  * o add to the inodes ->delalloc_bytes
5116  * o add it to the fs_info's delalloc inodes list.
5117  *
5118  * This will return 0 for success and -ENOSPC if there is no space left.
5119  */
5120 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5121 {
5122 	int ret;
5123 
5124 	ret = btrfs_check_data_free_space(inode, num_bytes);
5125 	if (ret)
5126 		return ret;
5127 
5128 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5129 	if (ret) {
5130 		btrfs_free_reserved_data_space(inode, num_bytes);
5131 		return ret;
5132 	}
5133 
5134 	return 0;
5135 }
5136 
5137 /**
5138  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5139  * @inode: inode we're releasing space for
5140  * @num_bytes: the number of bytes we want to free up
5141  *
5142  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5143  * called in the case that we don't need the metadata AND data reservations
5144  * anymore.  So if there is an error or we insert an inline extent.
5145  *
5146  * This function will release the metadata space that was not used and will
5147  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5148  * list if there are no delalloc bytes left.
5149  */
5150 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5151 {
5152 	btrfs_delalloc_release_metadata(inode, num_bytes);
5153 	btrfs_free_reserved_data_space(inode, num_bytes);
5154 }
5155 
5156 static int update_block_group(struct btrfs_root *root,
5157 			      u64 bytenr, u64 num_bytes, int alloc)
5158 {
5159 	struct btrfs_block_group_cache *cache = NULL;
5160 	struct btrfs_fs_info *info = root->fs_info;
5161 	u64 total = num_bytes;
5162 	u64 old_val;
5163 	u64 byte_in_group;
5164 	int factor;
5165 
5166 	/* block accounting for super block */
5167 	spin_lock(&info->delalloc_root_lock);
5168 	old_val = btrfs_super_bytes_used(info->super_copy);
5169 	if (alloc)
5170 		old_val += num_bytes;
5171 	else
5172 		old_val -= num_bytes;
5173 	btrfs_set_super_bytes_used(info->super_copy, old_val);
5174 	spin_unlock(&info->delalloc_root_lock);
5175 
5176 	while (total) {
5177 		cache = btrfs_lookup_block_group(info, bytenr);
5178 		if (!cache)
5179 			return -ENOENT;
5180 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5181 				    BTRFS_BLOCK_GROUP_RAID1 |
5182 				    BTRFS_BLOCK_GROUP_RAID10))
5183 			factor = 2;
5184 		else
5185 			factor = 1;
5186 		/*
5187 		 * If this block group has free space cache written out, we
5188 		 * need to make sure to load it if we are removing space.  This
5189 		 * is because we need the unpinning stage to actually add the
5190 		 * space back to the block group, otherwise we will leak space.
5191 		 */
5192 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
5193 			cache_block_group(cache, 1);
5194 
5195 		byte_in_group = bytenr - cache->key.objectid;
5196 		WARN_ON(byte_in_group > cache->key.offset);
5197 
5198 		spin_lock(&cache->space_info->lock);
5199 		spin_lock(&cache->lock);
5200 
5201 		if (btrfs_test_opt(root, SPACE_CACHE) &&
5202 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
5203 			cache->disk_cache_state = BTRFS_DC_CLEAR;
5204 
5205 		cache->dirty = 1;
5206 		old_val = btrfs_block_group_used(&cache->item);
5207 		num_bytes = min(total, cache->key.offset - byte_in_group);
5208 		if (alloc) {
5209 			old_val += num_bytes;
5210 			btrfs_set_block_group_used(&cache->item, old_val);
5211 			cache->reserved -= num_bytes;
5212 			cache->space_info->bytes_reserved -= num_bytes;
5213 			cache->space_info->bytes_used += num_bytes;
5214 			cache->space_info->disk_used += num_bytes * factor;
5215 			spin_unlock(&cache->lock);
5216 			spin_unlock(&cache->space_info->lock);
5217 		} else {
5218 			old_val -= num_bytes;
5219 			btrfs_set_block_group_used(&cache->item, old_val);
5220 			cache->pinned += num_bytes;
5221 			cache->space_info->bytes_pinned += num_bytes;
5222 			cache->space_info->bytes_used -= num_bytes;
5223 			cache->space_info->disk_used -= num_bytes * factor;
5224 			spin_unlock(&cache->lock);
5225 			spin_unlock(&cache->space_info->lock);
5226 
5227 			set_extent_dirty(info->pinned_extents,
5228 					 bytenr, bytenr + num_bytes - 1,
5229 					 GFP_NOFS | __GFP_NOFAIL);
5230 		}
5231 		btrfs_put_block_group(cache);
5232 		total -= num_bytes;
5233 		bytenr += num_bytes;
5234 	}
5235 	return 0;
5236 }
5237 
5238 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5239 {
5240 	struct btrfs_block_group_cache *cache;
5241 	u64 bytenr;
5242 
5243 	spin_lock(&root->fs_info->block_group_cache_lock);
5244 	bytenr = root->fs_info->first_logical_byte;
5245 	spin_unlock(&root->fs_info->block_group_cache_lock);
5246 
5247 	if (bytenr < (u64)-1)
5248 		return bytenr;
5249 
5250 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5251 	if (!cache)
5252 		return 0;
5253 
5254 	bytenr = cache->key.objectid;
5255 	btrfs_put_block_group(cache);
5256 
5257 	return bytenr;
5258 }
5259 
5260 static int pin_down_extent(struct btrfs_root *root,
5261 			   struct btrfs_block_group_cache *cache,
5262 			   u64 bytenr, u64 num_bytes, int reserved)
5263 {
5264 	spin_lock(&cache->space_info->lock);
5265 	spin_lock(&cache->lock);
5266 	cache->pinned += num_bytes;
5267 	cache->space_info->bytes_pinned += num_bytes;
5268 	if (reserved) {
5269 		cache->reserved -= num_bytes;
5270 		cache->space_info->bytes_reserved -= num_bytes;
5271 	}
5272 	spin_unlock(&cache->lock);
5273 	spin_unlock(&cache->space_info->lock);
5274 
5275 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5276 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5277 	return 0;
5278 }
5279 
5280 /*
5281  * this function must be called within transaction
5282  */
5283 int btrfs_pin_extent(struct btrfs_root *root,
5284 		     u64 bytenr, u64 num_bytes, int reserved)
5285 {
5286 	struct btrfs_block_group_cache *cache;
5287 
5288 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5289 	BUG_ON(!cache); /* Logic error */
5290 
5291 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5292 
5293 	btrfs_put_block_group(cache);
5294 	return 0;
5295 }
5296 
5297 /*
5298  * this function must be called within transaction
5299  */
5300 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5301 				    u64 bytenr, u64 num_bytes)
5302 {
5303 	struct btrfs_block_group_cache *cache;
5304 	int ret;
5305 
5306 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5307 	if (!cache)
5308 		return -EINVAL;
5309 
5310 	/*
5311 	 * pull in the free space cache (if any) so that our pin
5312 	 * removes the free space from the cache.  We have load_only set
5313 	 * to one because the slow code to read in the free extents does check
5314 	 * the pinned extents.
5315 	 */
5316 	cache_block_group(cache, 1);
5317 
5318 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
5319 
5320 	/* remove us from the free space cache (if we're there at all) */
5321 	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5322 	btrfs_put_block_group(cache);
5323 	return ret;
5324 }
5325 
5326 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5327 {
5328 	int ret;
5329 	struct btrfs_block_group_cache *block_group;
5330 	struct btrfs_caching_control *caching_ctl;
5331 
5332 	block_group = btrfs_lookup_block_group(root->fs_info, start);
5333 	if (!block_group)
5334 		return -EINVAL;
5335 
5336 	cache_block_group(block_group, 0);
5337 	caching_ctl = get_caching_control(block_group);
5338 
5339 	if (!caching_ctl) {
5340 		/* Logic error */
5341 		BUG_ON(!block_group_cache_done(block_group));
5342 		ret = btrfs_remove_free_space(block_group, start, num_bytes);
5343 	} else {
5344 		mutex_lock(&caching_ctl->mutex);
5345 
5346 		if (start >= caching_ctl->progress) {
5347 			ret = add_excluded_extent(root, start, num_bytes);
5348 		} else if (start + num_bytes <= caching_ctl->progress) {
5349 			ret = btrfs_remove_free_space(block_group,
5350 						      start, num_bytes);
5351 		} else {
5352 			num_bytes = caching_ctl->progress - start;
5353 			ret = btrfs_remove_free_space(block_group,
5354 						      start, num_bytes);
5355 			if (ret)
5356 				goto out_lock;
5357 
5358 			num_bytes = (start + num_bytes) -
5359 				caching_ctl->progress;
5360 			start = caching_ctl->progress;
5361 			ret = add_excluded_extent(root, start, num_bytes);
5362 		}
5363 out_lock:
5364 		mutex_unlock(&caching_ctl->mutex);
5365 		put_caching_control(caching_ctl);
5366 	}
5367 	btrfs_put_block_group(block_group);
5368 	return ret;
5369 }
5370 
5371 int btrfs_exclude_logged_extents(struct btrfs_root *log,
5372 				 struct extent_buffer *eb)
5373 {
5374 	struct btrfs_file_extent_item *item;
5375 	struct btrfs_key key;
5376 	int found_type;
5377 	int i;
5378 
5379 	if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5380 		return 0;
5381 
5382 	for (i = 0; i < btrfs_header_nritems(eb); i++) {
5383 		btrfs_item_key_to_cpu(eb, &key, i);
5384 		if (key.type != BTRFS_EXTENT_DATA_KEY)
5385 			continue;
5386 		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5387 		found_type = btrfs_file_extent_type(eb, item);
5388 		if (found_type == BTRFS_FILE_EXTENT_INLINE)
5389 			continue;
5390 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5391 			continue;
5392 		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5393 		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5394 		__exclude_logged_extent(log, key.objectid, key.offset);
5395 	}
5396 
5397 	return 0;
5398 }
5399 
5400 /**
5401  * btrfs_update_reserved_bytes - update the block_group and space info counters
5402  * @cache:	The cache we are manipulating
5403  * @num_bytes:	The number of bytes in question
5404  * @reserve:	One of the reservation enums
5405  *
5406  * This is called by the allocator when it reserves space, or by somebody who is
5407  * freeing space that was never actually used on disk.  For example if you
5408  * reserve some space for a new leaf in transaction A and before transaction A
5409  * commits you free that leaf, you call this with reserve set to 0 in order to
5410  * clear the reservation.
5411  *
5412  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
5413  * ENOSPC accounting.  For data we handle the reservation through clearing the
5414  * delalloc bits in the io_tree.  We have to do this since we could end up
5415  * allocating less disk space for the amount of data we have reserved in the
5416  * case of compression.
5417  *
5418  * If this is a reservation and the block group has become read only we cannot
5419  * make the reservation and return -EAGAIN, otherwise this function always
5420  * succeeds.
5421  */
5422 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5423 				       u64 num_bytes, int reserve)
5424 {
5425 	struct btrfs_space_info *space_info = cache->space_info;
5426 	int ret = 0;
5427 
5428 	spin_lock(&space_info->lock);
5429 	spin_lock(&cache->lock);
5430 	if (reserve != RESERVE_FREE) {
5431 		if (cache->ro) {
5432 			ret = -EAGAIN;
5433 		} else {
5434 			cache->reserved += num_bytes;
5435 			space_info->bytes_reserved += num_bytes;
5436 			if (reserve == RESERVE_ALLOC) {
5437 				trace_btrfs_space_reservation(cache->fs_info,
5438 						"space_info", space_info->flags,
5439 						num_bytes, 0);
5440 				space_info->bytes_may_use -= num_bytes;
5441 			}
5442 		}
5443 	} else {
5444 		if (cache->ro)
5445 			space_info->bytes_readonly += num_bytes;
5446 		cache->reserved -= num_bytes;
5447 		space_info->bytes_reserved -= num_bytes;
5448 		space_info->reservation_progress++;
5449 	}
5450 	spin_unlock(&cache->lock);
5451 	spin_unlock(&space_info->lock);
5452 	return ret;
5453 }
5454 
5455 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5456 				struct btrfs_root *root)
5457 {
5458 	struct btrfs_fs_info *fs_info = root->fs_info;
5459 	struct btrfs_caching_control *next;
5460 	struct btrfs_caching_control *caching_ctl;
5461 	struct btrfs_block_group_cache *cache;
5462 	struct btrfs_space_info *space_info;
5463 
5464 	down_write(&fs_info->extent_commit_sem);
5465 
5466 	list_for_each_entry_safe(caching_ctl, next,
5467 				 &fs_info->caching_block_groups, list) {
5468 		cache = caching_ctl->block_group;
5469 		if (block_group_cache_done(cache)) {
5470 			cache->last_byte_to_unpin = (u64)-1;
5471 			list_del_init(&caching_ctl->list);
5472 			put_caching_control(caching_ctl);
5473 		} else {
5474 			cache->last_byte_to_unpin = caching_ctl->progress;
5475 		}
5476 	}
5477 
5478 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5479 		fs_info->pinned_extents = &fs_info->freed_extents[1];
5480 	else
5481 		fs_info->pinned_extents = &fs_info->freed_extents[0];
5482 
5483 	up_write(&fs_info->extent_commit_sem);
5484 
5485 	list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5486 		percpu_counter_set(&space_info->total_bytes_pinned, 0);
5487 
5488 	update_global_block_rsv(fs_info);
5489 }
5490 
5491 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5492 {
5493 	struct btrfs_fs_info *fs_info = root->fs_info;
5494 	struct btrfs_block_group_cache *cache = NULL;
5495 	struct btrfs_space_info *space_info;
5496 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5497 	u64 len;
5498 	bool readonly;
5499 
5500 	while (start <= end) {
5501 		readonly = false;
5502 		if (!cache ||
5503 		    start >= cache->key.objectid + cache->key.offset) {
5504 			if (cache)
5505 				btrfs_put_block_group(cache);
5506 			cache = btrfs_lookup_block_group(fs_info, start);
5507 			BUG_ON(!cache); /* Logic error */
5508 		}
5509 
5510 		len = cache->key.objectid + cache->key.offset - start;
5511 		len = min(len, end + 1 - start);
5512 
5513 		if (start < cache->last_byte_to_unpin) {
5514 			len = min(len, cache->last_byte_to_unpin - start);
5515 			btrfs_add_free_space(cache, start, len);
5516 		}
5517 
5518 		start += len;
5519 		space_info = cache->space_info;
5520 
5521 		spin_lock(&space_info->lock);
5522 		spin_lock(&cache->lock);
5523 		cache->pinned -= len;
5524 		space_info->bytes_pinned -= len;
5525 		if (cache->ro) {
5526 			space_info->bytes_readonly += len;
5527 			readonly = true;
5528 		}
5529 		spin_unlock(&cache->lock);
5530 		if (!readonly && global_rsv->space_info == space_info) {
5531 			spin_lock(&global_rsv->lock);
5532 			if (!global_rsv->full) {
5533 				len = min(len, global_rsv->size -
5534 					  global_rsv->reserved);
5535 				global_rsv->reserved += len;
5536 				space_info->bytes_may_use += len;
5537 				if (global_rsv->reserved >= global_rsv->size)
5538 					global_rsv->full = 1;
5539 			}
5540 			spin_unlock(&global_rsv->lock);
5541 		}
5542 		spin_unlock(&space_info->lock);
5543 	}
5544 
5545 	if (cache)
5546 		btrfs_put_block_group(cache);
5547 	return 0;
5548 }
5549 
5550 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5551 			       struct btrfs_root *root)
5552 {
5553 	struct btrfs_fs_info *fs_info = root->fs_info;
5554 	struct extent_io_tree *unpin;
5555 	u64 start;
5556 	u64 end;
5557 	int ret;
5558 
5559 	if (trans->aborted)
5560 		return 0;
5561 
5562 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5563 		unpin = &fs_info->freed_extents[1];
5564 	else
5565 		unpin = &fs_info->freed_extents[0];
5566 
5567 	while (1) {
5568 		ret = find_first_extent_bit(unpin, 0, &start, &end,
5569 					    EXTENT_DIRTY, NULL);
5570 		if (ret)
5571 			break;
5572 
5573 		if (btrfs_test_opt(root, DISCARD))
5574 			ret = btrfs_discard_extent(root, start,
5575 						   end + 1 - start, NULL);
5576 
5577 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
5578 		unpin_extent_range(root, start, end);
5579 		cond_resched();
5580 	}
5581 
5582 	return 0;
5583 }
5584 
5585 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5586 			     u64 owner, u64 root_objectid)
5587 {
5588 	struct btrfs_space_info *space_info;
5589 	u64 flags;
5590 
5591 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5592 		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5593 			flags = BTRFS_BLOCK_GROUP_SYSTEM;
5594 		else
5595 			flags = BTRFS_BLOCK_GROUP_METADATA;
5596 	} else {
5597 		flags = BTRFS_BLOCK_GROUP_DATA;
5598 	}
5599 
5600 	space_info = __find_space_info(fs_info, flags);
5601 	BUG_ON(!space_info); /* Logic bug */
5602 	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5603 }
5604 
5605 
5606 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5607 				struct btrfs_root *root,
5608 				u64 bytenr, u64 num_bytes, u64 parent,
5609 				u64 root_objectid, u64 owner_objectid,
5610 				u64 owner_offset, int refs_to_drop,
5611 				struct btrfs_delayed_extent_op *extent_op)
5612 {
5613 	struct btrfs_key key;
5614 	struct btrfs_path *path;
5615 	struct btrfs_fs_info *info = root->fs_info;
5616 	struct btrfs_root *extent_root = info->extent_root;
5617 	struct extent_buffer *leaf;
5618 	struct btrfs_extent_item *ei;
5619 	struct btrfs_extent_inline_ref *iref;
5620 	int ret;
5621 	int is_data;
5622 	int extent_slot = 0;
5623 	int found_extent = 0;
5624 	int num_to_del = 1;
5625 	u32 item_size;
5626 	u64 refs;
5627 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5628 						 SKINNY_METADATA);
5629 
5630 	path = btrfs_alloc_path();
5631 	if (!path)
5632 		return -ENOMEM;
5633 
5634 	path->reada = 1;
5635 	path->leave_spinning = 1;
5636 
5637 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5638 	BUG_ON(!is_data && refs_to_drop != 1);
5639 
5640 	if (is_data)
5641 		skinny_metadata = 0;
5642 
5643 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
5644 				    bytenr, num_bytes, parent,
5645 				    root_objectid, owner_objectid,
5646 				    owner_offset);
5647 	if (ret == 0) {
5648 		extent_slot = path->slots[0];
5649 		while (extent_slot >= 0) {
5650 			btrfs_item_key_to_cpu(path->nodes[0], &key,
5651 					      extent_slot);
5652 			if (key.objectid != bytenr)
5653 				break;
5654 			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5655 			    key.offset == num_bytes) {
5656 				found_extent = 1;
5657 				break;
5658 			}
5659 			if (key.type == BTRFS_METADATA_ITEM_KEY &&
5660 			    key.offset == owner_objectid) {
5661 				found_extent = 1;
5662 				break;
5663 			}
5664 			if (path->slots[0] - extent_slot > 5)
5665 				break;
5666 			extent_slot--;
5667 		}
5668 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5669 		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5670 		if (found_extent && item_size < sizeof(*ei))
5671 			found_extent = 0;
5672 #endif
5673 		if (!found_extent) {
5674 			BUG_ON(iref);
5675 			ret = remove_extent_backref(trans, extent_root, path,
5676 						    NULL, refs_to_drop,
5677 						    is_data);
5678 			if (ret) {
5679 				btrfs_abort_transaction(trans, extent_root, ret);
5680 				goto out;
5681 			}
5682 			btrfs_release_path(path);
5683 			path->leave_spinning = 1;
5684 
5685 			key.objectid = bytenr;
5686 			key.type = BTRFS_EXTENT_ITEM_KEY;
5687 			key.offset = num_bytes;
5688 
5689 			if (!is_data && skinny_metadata) {
5690 				key.type = BTRFS_METADATA_ITEM_KEY;
5691 				key.offset = owner_objectid;
5692 			}
5693 
5694 			ret = btrfs_search_slot(trans, extent_root,
5695 						&key, path, -1, 1);
5696 			if (ret > 0 && skinny_metadata && path->slots[0]) {
5697 				/*
5698 				 * Couldn't find our skinny metadata item,
5699 				 * see if we have ye olde extent item.
5700 				 */
5701 				path->slots[0]--;
5702 				btrfs_item_key_to_cpu(path->nodes[0], &key,
5703 						      path->slots[0]);
5704 				if (key.objectid == bytenr &&
5705 				    key.type == BTRFS_EXTENT_ITEM_KEY &&
5706 				    key.offset == num_bytes)
5707 					ret = 0;
5708 			}
5709 
5710 			if (ret > 0 && skinny_metadata) {
5711 				skinny_metadata = false;
5712 				key.type = BTRFS_EXTENT_ITEM_KEY;
5713 				key.offset = num_bytes;
5714 				btrfs_release_path(path);
5715 				ret = btrfs_search_slot(trans, extent_root,
5716 							&key, path, -1, 1);
5717 			}
5718 
5719 			if (ret) {
5720 				btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5721 					ret, bytenr);
5722 				if (ret > 0)
5723 					btrfs_print_leaf(extent_root,
5724 							 path->nodes[0]);
5725 			}
5726 			if (ret < 0) {
5727 				btrfs_abort_transaction(trans, extent_root, ret);
5728 				goto out;
5729 			}
5730 			extent_slot = path->slots[0];
5731 		}
5732 	} else if (ret == -ENOENT) {
5733 		btrfs_print_leaf(extent_root, path->nodes[0]);
5734 		WARN_ON(1);
5735 		btrfs_err(info,
5736 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
5737 			bytenr, parent, root_objectid, owner_objectid,
5738 			owner_offset);
5739 	} else {
5740 		btrfs_abort_transaction(trans, extent_root, ret);
5741 		goto out;
5742 	}
5743 
5744 	leaf = path->nodes[0];
5745 	item_size = btrfs_item_size_nr(leaf, extent_slot);
5746 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5747 	if (item_size < sizeof(*ei)) {
5748 		BUG_ON(found_extent || extent_slot != path->slots[0]);
5749 		ret = convert_extent_item_v0(trans, extent_root, path,
5750 					     owner_objectid, 0);
5751 		if (ret < 0) {
5752 			btrfs_abort_transaction(trans, extent_root, ret);
5753 			goto out;
5754 		}
5755 
5756 		btrfs_release_path(path);
5757 		path->leave_spinning = 1;
5758 
5759 		key.objectid = bytenr;
5760 		key.type = BTRFS_EXTENT_ITEM_KEY;
5761 		key.offset = num_bytes;
5762 
5763 		ret = btrfs_search_slot(trans, extent_root, &key, path,
5764 					-1, 1);
5765 		if (ret) {
5766 			btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5767 				ret, bytenr);
5768 			btrfs_print_leaf(extent_root, path->nodes[0]);
5769 		}
5770 		if (ret < 0) {
5771 			btrfs_abort_transaction(trans, extent_root, ret);
5772 			goto out;
5773 		}
5774 
5775 		extent_slot = path->slots[0];
5776 		leaf = path->nodes[0];
5777 		item_size = btrfs_item_size_nr(leaf, extent_slot);
5778 	}
5779 #endif
5780 	BUG_ON(item_size < sizeof(*ei));
5781 	ei = btrfs_item_ptr(leaf, extent_slot,
5782 			    struct btrfs_extent_item);
5783 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
5784 	    key.type == BTRFS_EXTENT_ITEM_KEY) {
5785 		struct btrfs_tree_block_info *bi;
5786 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5787 		bi = (struct btrfs_tree_block_info *)(ei + 1);
5788 		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5789 	}
5790 
5791 	refs = btrfs_extent_refs(leaf, ei);
5792 	if (refs < refs_to_drop) {
5793 		btrfs_err(info, "trying to drop %d refs but we only have %Lu "
5794 			  "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
5795 		ret = -EINVAL;
5796 		btrfs_abort_transaction(trans, extent_root, ret);
5797 		goto out;
5798 	}
5799 	refs -= refs_to_drop;
5800 
5801 	if (refs > 0) {
5802 		if (extent_op)
5803 			__run_delayed_extent_op(extent_op, leaf, ei);
5804 		/*
5805 		 * In the case of inline back ref, reference count will
5806 		 * be updated by remove_extent_backref
5807 		 */
5808 		if (iref) {
5809 			BUG_ON(!found_extent);
5810 		} else {
5811 			btrfs_set_extent_refs(leaf, ei, refs);
5812 			btrfs_mark_buffer_dirty(leaf);
5813 		}
5814 		if (found_extent) {
5815 			ret = remove_extent_backref(trans, extent_root, path,
5816 						    iref, refs_to_drop,
5817 						    is_data);
5818 			if (ret) {
5819 				btrfs_abort_transaction(trans, extent_root, ret);
5820 				goto out;
5821 			}
5822 		}
5823 		add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5824 				 root_objectid);
5825 	} else {
5826 		if (found_extent) {
5827 			BUG_ON(is_data && refs_to_drop !=
5828 			       extent_data_ref_count(root, path, iref));
5829 			if (iref) {
5830 				BUG_ON(path->slots[0] != extent_slot);
5831 			} else {
5832 				BUG_ON(path->slots[0] != extent_slot + 1);
5833 				path->slots[0] = extent_slot;
5834 				num_to_del = 2;
5835 			}
5836 		}
5837 
5838 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5839 				      num_to_del);
5840 		if (ret) {
5841 			btrfs_abort_transaction(trans, extent_root, ret);
5842 			goto out;
5843 		}
5844 		btrfs_release_path(path);
5845 
5846 		if (is_data) {
5847 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5848 			if (ret) {
5849 				btrfs_abort_transaction(trans, extent_root, ret);
5850 				goto out;
5851 			}
5852 		}
5853 
5854 		ret = update_block_group(root, bytenr, num_bytes, 0);
5855 		if (ret) {
5856 			btrfs_abort_transaction(trans, extent_root, ret);
5857 			goto out;
5858 		}
5859 	}
5860 out:
5861 	btrfs_free_path(path);
5862 	return ret;
5863 }
5864 
5865 /*
5866  * when we free an block, it is possible (and likely) that we free the last
5867  * delayed ref for that extent as well.  This searches the delayed ref tree for
5868  * a given extent, and if there are no other delayed refs to be processed, it
5869  * removes it from the tree.
5870  */
5871 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5872 				      struct btrfs_root *root, u64 bytenr)
5873 {
5874 	struct btrfs_delayed_ref_head *head;
5875 	struct btrfs_delayed_ref_root *delayed_refs;
5876 	struct btrfs_delayed_ref_node *ref;
5877 	struct rb_node *node;
5878 	int ret = 0;
5879 
5880 	delayed_refs = &trans->transaction->delayed_refs;
5881 	spin_lock(&delayed_refs->lock);
5882 	head = btrfs_find_delayed_ref_head(trans, bytenr);
5883 	if (!head)
5884 		goto out;
5885 
5886 	node = rb_prev(&head->node.rb_node);
5887 	if (!node)
5888 		goto out;
5889 
5890 	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5891 
5892 	/* there are still entries for this ref, we can't drop it */
5893 	if (ref->bytenr == bytenr)
5894 		goto out;
5895 
5896 	if (head->extent_op) {
5897 		if (!head->must_insert_reserved)
5898 			goto out;
5899 		btrfs_free_delayed_extent_op(head->extent_op);
5900 		head->extent_op = NULL;
5901 	}
5902 
5903 	/*
5904 	 * waiting for the lock here would deadlock.  If someone else has it
5905 	 * locked they are already in the process of dropping it anyway
5906 	 */
5907 	if (!mutex_trylock(&head->mutex))
5908 		goto out;
5909 
5910 	/*
5911 	 * at this point we have a head with no other entries.  Go
5912 	 * ahead and process it.
5913 	 */
5914 	head->node.in_tree = 0;
5915 	rb_erase(&head->node.rb_node, &delayed_refs->root);
5916 
5917 	delayed_refs->num_entries--;
5918 
5919 	/*
5920 	 * we don't take a ref on the node because we're removing it from the
5921 	 * tree, so we just steal the ref the tree was holding.
5922 	 */
5923 	delayed_refs->num_heads--;
5924 	if (list_empty(&head->cluster))
5925 		delayed_refs->num_heads_ready--;
5926 
5927 	list_del_init(&head->cluster);
5928 	spin_unlock(&delayed_refs->lock);
5929 
5930 	BUG_ON(head->extent_op);
5931 	if (head->must_insert_reserved)
5932 		ret = 1;
5933 
5934 	mutex_unlock(&head->mutex);
5935 	btrfs_put_delayed_ref(&head->node);
5936 	return ret;
5937 out:
5938 	spin_unlock(&delayed_refs->lock);
5939 	return 0;
5940 }
5941 
5942 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5943 			   struct btrfs_root *root,
5944 			   struct extent_buffer *buf,
5945 			   u64 parent, int last_ref)
5946 {
5947 	struct btrfs_block_group_cache *cache = NULL;
5948 	int pin = 1;
5949 	int ret;
5950 
5951 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5952 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5953 					buf->start, buf->len,
5954 					parent, root->root_key.objectid,
5955 					btrfs_header_level(buf),
5956 					BTRFS_DROP_DELAYED_REF, NULL, 0);
5957 		BUG_ON(ret); /* -ENOMEM */
5958 	}
5959 
5960 	if (!last_ref)
5961 		return;
5962 
5963 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5964 
5965 	if (btrfs_header_generation(buf) == trans->transid) {
5966 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5967 			ret = check_ref_cleanup(trans, root, buf->start);
5968 			if (!ret)
5969 				goto out;
5970 		}
5971 
5972 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5973 			pin_down_extent(root, cache, buf->start, buf->len, 1);
5974 			goto out;
5975 		}
5976 
5977 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5978 
5979 		btrfs_add_free_space(cache, buf->start, buf->len);
5980 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5981 		pin = 0;
5982 	}
5983 out:
5984 	if (pin)
5985 		add_pinned_bytes(root->fs_info, buf->len,
5986 				 btrfs_header_level(buf),
5987 				 root->root_key.objectid);
5988 
5989 	/*
5990 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5991 	 * anymore.
5992 	 */
5993 	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5994 	btrfs_put_block_group(cache);
5995 }
5996 
5997 /* Can return -ENOMEM */
5998 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5999 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6000 		      u64 owner, u64 offset, int for_cow)
6001 {
6002 	int ret;
6003 	struct btrfs_fs_info *fs_info = root->fs_info;
6004 
6005 	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6006 
6007 	/*
6008 	 * tree log blocks never actually go into the extent allocation
6009 	 * tree, just update pinning info and exit early.
6010 	 */
6011 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6012 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6013 		/* unlocks the pinned mutex */
6014 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
6015 		ret = 0;
6016 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6017 		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6018 					num_bytes,
6019 					parent, root_objectid, (int)owner,
6020 					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
6021 	} else {
6022 		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6023 						num_bytes,
6024 						parent, root_objectid, owner,
6025 						offset, BTRFS_DROP_DELAYED_REF,
6026 						NULL, for_cow);
6027 	}
6028 	return ret;
6029 }
6030 
6031 static u64 stripe_align(struct btrfs_root *root,
6032 			struct btrfs_block_group_cache *cache,
6033 			u64 val, u64 num_bytes)
6034 {
6035 	u64 ret = ALIGN(val, root->stripesize);
6036 	return ret;
6037 }
6038 
6039 /*
6040  * when we wait for progress in the block group caching, its because
6041  * our allocation attempt failed at least once.  So, we must sleep
6042  * and let some progress happen before we try again.
6043  *
6044  * This function will sleep at least once waiting for new free space to
6045  * show up, and then it will check the block group free space numbers
6046  * for our min num_bytes.  Another option is to have it go ahead
6047  * and look in the rbtree for a free extent of a given size, but this
6048  * is a good start.
6049  *
6050  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6051  * any of the information in this block group.
6052  */
6053 static noinline void
6054 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6055 				u64 num_bytes)
6056 {
6057 	struct btrfs_caching_control *caching_ctl;
6058 
6059 	caching_ctl = get_caching_control(cache);
6060 	if (!caching_ctl)
6061 		return;
6062 
6063 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6064 		   (cache->free_space_ctl->free_space >= num_bytes));
6065 
6066 	put_caching_control(caching_ctl);
6067 }
6068 
6069 static noinline int
6070 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6071 {
6072 	struct btrfs_caching_control *caching_ctl;
6073 	int ret = 0;
6074 
6075 	caching_ctl = get_caching_control(cache);
6076 	if (!caching_ctl)
6077 		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6078 
6079 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
6080 	if (cache->cached == BTRFS_CACHE_ERROR)
6081 		ret = -EIO;
6082 	put_caching_control(caching_ctl);
6083 	return ret;
6084 }
6085 
6086 int __get_raid_index(u64 flags)
6087 {
6088 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
6089 		return BTRFS_RAID_RAID10;
6090 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6091 		return BTRFS_RAID_RAID1;
6092 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
6093 		return BTRFS_RAID_DUP;
6094 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6095 		return BTRFS_RAID_RAID0;
6096 	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6097 		return BTRFS_RAID_RAID5;
6098 	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6099 		return BTRFS_RAID_RAID6;
6100 
6101 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6102 }
6103 
6104 static int get_block_group_index(struct btrfs_block_group_cache *cache)
6105 {
6106 	return __get_raid_index(cache->flags);
6107 }
6108 
6109 enum btrfs_loop_type {
6110 	LOOP_CACHING_NOWAIT = 0,
6111 	LOOP_CACHING_WAIT = 1,
6112 	LOOP_ALLOC_CHUNK = 2,
6113 	LOOP_NO_EMPTY_SIZE = 3,
6114 };
6115 
6116 /*
6117  * walks the btree of allocated extents and find a hole of a given size.
6118  * The key ins is changed to record the hole:
6119  * ins->objectid == block start
6120  * ins->flags = BTRFS_EXTENT_ITEM_KEY
6121  * ins->offset == number of blocks
6122  * Any available blocks before search_start are skipped.
6123  */
6124 static noinline int find_free_extent(struct btrfs_root *orig_root,
6125 				     u64 num_bytes, u64 empty_size,
6126 				     u64 hint_byte, struct btrfs_key *ins,
6127 				     u64 flags)
6128 {
6129 	int ret = 0;
6130 	struct btrfs_root *root = orig_root->fs_info->extent_root;
6131 	struct btrfs_free_cluster *last_ptr = NULL;
6132 	struct btrfs_block_group_cache *block_group = NULL;
6133 	struct btrfs_block_group_cache *used_block_group;
6134 	u64 search_start = 0;
6135 	int empty_cluster = 2 * 1024 * 1024;
6136 	struct btrfs_space_info *space_info;
6137 	int loop = 0;
6138 	int index = __get_raid_index(flags);
6139 	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
6140 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
6141 	bool found_uncached_bg = false;
6142 	bool failed_cluster_refill = false;
6143 	bool failed_alloc = false;
6144 	bool use_cluster = true;
6145 	bool have_caching_bg = false;
6146 
6147 	WARN_ON(num_bytes < root->sectorsize);
6148 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
6149 	ins->objectid = 0;
6150 	ins->offset = 0;
6151 
6152 	trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
6153 
6154 	space_info = __find_space_info(root->fs_info, flags);
6155 	if (!space_info) {
6156 		btrfs_err(root->fs_info, "No space info for %llu", flags);
6157 		return -ENOSPC;
6158 	}
6159 
6160 	/*
6161 	 * If the space info is for both data and metadata it means we have a
6162 	 * small filesystem and we can't use the clustering stuff.
6163 	 */
6164 	if (btrfs_mixed_space_info(space_info))
6165 		use_cluster = false;
6166 
6167 	if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
6168 		last_ptr = &root->fs_info->meta_alloc_cluster;
6169 		if (!btrfs_test_opt(root, SSD))
6170 			empty_cluster = 64 * 1024;
6171 	}
6172 
6173 	if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
6174 	    btrfs_test_opt(root, SSD)) {
6175 		last_ptr = &root->fs_info->data_alloc_cluster;
6176 	}
6177 
6178 	if (last_ptr) {
6179 		spin_lock(&last_ptr->lock);
6180 		if (last_ptr->block_group)
6181 			hint_byte = last_ptr->window_start;
6182 		spin_unlock(&last_ptr->lock);
6183 	}
6184 
6185 	search_start = max(search_start, first_logical_byte(root, 0));
6186 	search_start = max(search_start, hint_byte);
6187 
6188 	if (!last_ptr)
6189 		empty_cluster = 0;
6190 
6191 	if (search_start == hint_byte) {
6192 		block_group = btrfs_lookup_block_group(root->fs_info,
6193 						       search_start);
6194 		used_block_group = block_group;
6195 		/*
6196 		 * we don't want to use the block group if it doesn't match our
6197 		 * allocation bits, or if its not cached.
6198 		 *
6199 		 * However if we are re-searching with an ideal block group
6200 		 * picked out then we don't care that the block group is cached.
6201 		 */
6202 		if (block_group && block_group_bits(block_group, flags) &&
6203 		    block_group->cached != BTRFS_CACHE_NO) {
6204 			down_read(&space_info->groups_sem);
6205 			if (list_empty(&block_group->list) ||
6206 			    block_group->ro) {
6207 				/*
6208 				 * someone is removing this block group,
6209 				 * we can't jump into the have_block_group
6210 				 * target because our list pointers are not
6211 				 * valid
6212 				 */
6213 				btrfs_put_block_group(block_group);
6214 				up_read(&space_info->groups_sem);
6215 			} else {
6216 				index = get_block_group_index(block_group);
6217 				goto have_block_group;
6218 			}
6219 		} else if (block_group) {
6220 			btrfs_put_block_group(block_group);
6221 		}
6222 	}
6223 search:
6224 	have_caching_bg = false;
6225 	down_read(&space_info->groups_sem);
6226 	list_for_each_entry(block_group, &space_info->block_groups[index],
6227 			    list) {
6228 		u64 offset;
6229 		int cached;
6230 
6231 		used_block_group = block_group;
6232 		btrfs_get_block_group(block_group);
6233 		search_start = block_group->key.objectid;
6234 
6235 		/*
6236 		 * this can happen if we end up cycling through all the
6237 		 * raid types, but we want to make sure we only allocate
6238 		 * for the proper type.
6239 		 */
6240 		if (!block_group_bits(block_group, flags)) {
6241 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
6242 				BTRFS_BLOCK_GROUP_RAID1 |
6243 				BTRFS_BLOCK_GROUP_RAID5 |
6244 				BTRFS_BLOCK_GROUP_RAID6 |
6245 				BTRFS_BLOCK_GROUP_RAID10;
6246 
6247 			/*
6248 			 * if they asked for extra copies and this block group
6249 			 * doesn't provide them, bail.  This does allow us to
6250 			 * fill raid0 from raid1.
6251 			 */
6252 			if ((flags & extra) && !(block_group->flags & extra))
6253 				goto loop;
6254 		}
6255 
6256 have_block_group:
6257 		cached = block_group_cache_done(block_group);
6258 		if (unlikely(!cached)) {
6259 			found_uncached_bg = true;
6260 			ret = cache_block_group(block_group, 0);
6261 			BUG_ON(ret < 0);
6262 			ret = 0;
6263 		}
6264 
6265 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6266 			goto loop;
6267 		if (unlikely(block_group->ro))
6268 			goto loop;
6269 
6270 		/*
6271 		 * Ok we want to try and use the cluster allocator, so
6272 		 * lets look there
6273 		 */
6274 		if (last_ptr) {
6275 			unsigned long aligned_cluster;
6276 			/*
6277 			 * the refill lock keeps out other
6278 			 * people trying to start a new cluster
6279 			 */
6280 			spin_lock(&last_ptr->refill_lock);
6281 			used_block_group = last_ptr->block_group;
6282 			if (used_block_group != block_group &&
6283 			    (!used_block_group ||
6284 			     used_block_group->ro ||
6285 			     !block_group_bits(used_block_group, flags))) {
6286 				used_block_group = block_group;
6287 				goto refill_cluster;
6288 			}
6289 
6290 			if (used_block_group != block_group)
6291 				btrfs_get_block_group(used_block_group);
6292 
6293 			offset = btrfs_alloc_from_cluster(used_block_group,
6294 			  last_ptr, num_bytes, used_block_group->key.objectid);
6295 			if (offset) {
6296 				/* we have a block, we're done */
6297 				spin_unlock(&last_ptr->refill_lock);
6298 				trace_btrfs_reserve_extent_cluster(root,
6299 					block_group, search_start, num_bytes);
6300 				goto checks;
6301 			}
6302 
6303 			WARN_ON(last_ptr->block_group != used_block_group);
6304 			if (used_block_group != block_group) {
6305 				btrfs_put_block_group(used_block_group);
6306 				used_block_group = block_group;
6307 			}
6308 refill_cluster:
6309 			BUG_ON(used_block_group != block_group);
6310 			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
6311 			 * set up a new clusters, so lets just skip it
6312 			 * and let the allocator find whatever block
6313 			 * it can find.  If we reach this point, we
6314 			 * will have tried the cluster allocator
6315 			 * plenty of times and not have found
6316 			 * anything, so we are likely way too
6317 			 * fragmented for the clustering stuff to find
6318 			 * anything.
6319 			 *
6320 			 * However, if the cluster is taken from the
6321 			 * current block group, release the cluster
6322 			 * first, so that we stand a better chance of
6323 			 * succeeding in the unclustered
6324 			 * allocation.  */
6325 			if (loop >= LOOP_NO_EMPTY_SIZE &&
6326 			    last_ptr->block_group != block_group) {
6327 				spin_unlock(&last_ptr->refill_lock);
6328 				goto unclustered_alloc;
6329 			}
6330 
6331 			/*
6332 			 * this cluster didn't work out, free it and
6333 			 * start over
6334 			 */
6335 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
6336 
6337 			if (loop >= LOOP_NO_EMPTY_SIZE) {
6338 				spin_unlock(&last_ptr->refill_lock);
6339 				goto unclustered_alloc;
6340 			}
6341 
6342 			aligned_cluster = max_t(unsigned long,
6343 						empty_cluster + empty_size,
6344 					      block_group->full_stripe_len);
6345 
6346 			/* allocate a cluster in this block group */
6347 			ret = btrfs_find_space_cluster(root, block_group,
6348 						       last_ptr, search_start,
6349 						       num_bytes,
6350 						       aligned_cluster);
6351 			if (ret == 0) {
6352 				/*
6353 				 * now pull our allocation out of this
6354 				 * cluster
6355 				 */
6356 				offset = btrfs_alloc_from_cluster(block_group,
6357 						  last_ptr, num_bytes,
6358 						  search_start);
6359 				if (offset) {
6360 					/* we found one, proceed */
6361 					spin_unlock(&last_ptr->refill_lock);
6362 					trace_btrfs_reserve_extent_cluster(root,
6363 						block_group, search_start,
6364 						num_bytes);
6365 					goto checks;
6366 				}
6367 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
6368 				   && !failed_cluster_refill) {
6369 				spin_unlock(&last_ptr->refill_lock);
6370 
6371 				failed_cluster_refill = true;
6372 				wait_block_group_cache_progress(block_group,
6373 				       num_bytes + empty_cluster + empty_size);
6374 				goto have_block_group;
6375 			}
6376 
6377 			/*
6378 			 * at this point we either didn't find a cluster
6379 			 * or we weren't able to allocate a block from our
6380 			 * cluster.  Free the cluster we've been trying
6381 			 * to use, and go to the next block group
6382 			 */
6383 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
6384 			spin_unlock(&last_ptr->refill_lock);
6385 			goto loop;
6386 		}
6387 
6388 unclustered_alloc:
6389 		spin_lock(&block_group->free_space_ctl->tree_lock);
6390 		if (cached &&
6391 		    block_group->free_space_ctl->free_space <
6392 		    num_bytes + empty_cluster + empty_size) {
6393 			spin_unlock(&block_group->free_space_ctl->tree_lock);
6394 			goto loop;
6395 		}
6396 		spin_unlock(&block_group->free_space_ctl->tree_lock);
6397 
6398 		offset = btrfs_find_space_for_alloc(block_group, search_start,
6399 						    num_bytes, empty_size);
6400 		/*
6401 		 * If we didn't find a chunk, and we haven't failed on this
6402 		 * block group before, and this block group is in the middle of
6403 		 * caching and we are ok with waiting, then go ahead and wait
6404 		 * for progress to be made, and set failed_alloc to true.
6405 		 *
6406 		 * If failed_alloc is true then we've already waited on this
6407 		 * block group once and should move on to the next block group.
6408 		 */
6409 		if (!offset && !failed_alloc && !cached &&
6410 		    loop > LOOP_CACHING_NOWAIT) {
6411 			wait_block_group_cache_progress(block_group,
6412 						num_bytes + empty_size);
6413 			failed_alloc = true;
6414 			goto have_block_group;
6415 		} else if (!offset) {
6416 			if (!cached)
6417 				have_caching_bg = true;
6418 			goto loop;
6419 		}
6420 checks:
6421 		search_start = stripe_align(root, used_block_group,
6422 					    offset, num_bytes);
6423 
6424 		/* move on to the next group */
6425 		if (search_start + num_bytes >
6426 		    used_block_group->key.objectid + used_block_group->key.offset) {
6427 			btrfs_add_free_space(used_block_group, offset, num_bytes);
6428 			goto loop;
6429 		}
6430 
6431 		if (offset < search_start)
6432 			btrfs_add_free_space(used_block_group, offset,
6433 					     search_start - offset);
6434 		BUG_ON(offset > search_start);
6435 
6436 		ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
6437 						  alloc_type);
6438 		if (ret == -EAGAIN) {
6439 			btrfs_add_free_space(used_block_group, offset, num_bytes);
6440 			goto loop;
6441 		}
6442 
6443 		/* we are all good, lets return */
6444 		ins->objectid = search_start;
6445 		ins->offset = num_bytes;
6446 
6447 		trace_btrfs_reserve_extent(orig_root, block_group,
6448 					   search_start, num_bytes);
6449 		if (used_block_group != block_group)
6450 			btrfs_put_block_group(used_block_group);
6451 		btrfs_put_block_group(block_group);
6452 		break;
6453 loop:
6454 		failed_cluster_refill = false;
6455 		failed_alloc = false;
6456 		BUG_ON(index != get_block_group_index(block_group));
6457 		if (used_block_group != block_group)
6458 			btrfs_put_block_group(used_block_group);
6459 		btrfs_put_block_group(block_group);
6460 	}
6461 	up_read(&space_info->groups_sem);
6462 
6463 	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
6464 		goto search;
6465 
6466 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
6467 		goto search;
6468 
6469 	/*
6470 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6471 	 *			caching kthreads as we move along
6472 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6473 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6474 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6475 	 *			again
6476 	 */
6477 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
6478 		index = 0;
6479 		loop++;
6480 		if (loop == LOOP_ALLOC_CHUNK) {
6481 			struct btrfs_trans_handle *trans;
6482 
6483 			trans = btrfs_join_transaction(root);
6484 			if (IS_ERR(trans)) {
6485 				ret = PTR_ERR(trans);
6486 				goto out;
6487 			}
6488 
6489 			ret = do_chunk_alloc(trans, root, flags,
6490 					     CHUNK_ALLOC_FORCE);
6491 			/*
6492 			 * Do not bail out on ENOSPC since we
6493 			 * can do more things.
6494 			 */
6495 			if (ret < 0 && ret != -ENOSPC)
6496 				btrfs_abort_transaction(trans,
6497 							root, ret);
6498 			else
6499 				ret = 0;
6500 			btrfs_end_transaction(trans, root);
6501 			if (ret)
6502 				goto out;
6503 		}
6504 
6505 		if (loop == LOOP_NO_EMPTY_SIZE) {
6506 			empty_size = 0;
6507 			empty_cluster = 0;
6508 		}
6509 
6510 		goto search;
6511 	} else if (!ins->objectid) {
6512 		ret = -ENOSPC;
6513 	} else if (ins->objectid) {
6514 		ret = 0;
6515 	}
6516 out:
6517 
6518 	return ret;
6519 }
6520 
6521 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6522 			    int dump_block_groups)
6523 {
6524 	struct btrfs_block_group_cache *cache;
6525 	int index = 0;
6526 
6527 	spin_lock(&info->lock);
6528 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
6529 	       info->flags,
6530 	       info->total_bytes - info->bytes_used - info->bytes_pinned -
6531 	       info->bytes_reserved - info->bytes_readonly,
6532 	       (info->full) ? "" : "not ");
6533 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
6534 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
6535 	       info->total_bytes, info->bytes_used, info->bytes_pinned,
6536 	       info->bytes_reserved, info->bytes_may_use,
6537 	       info->bytes_readonly);
6538 	spin_unlock(&info->lock);
6539 
6540 	if (!dump_block_groups)
6541 		return;
6542 
6543 	down_read(&info->groups_sem);
6544 again:
6545 	list_for_each_entry(cache, &info->block_groups[index], list) {
6546 		spin_lock(&cache->lock);
6547 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
6548 		       cache->key.objectid, cache->key.offset,
6549 		       btrfs_block_group_used(&cache->item), cache->pinned,
6550 		       cache->reserved, cache->ro ? "[readonly]" : "");
6551 		btrfs_dump_free_space(cache, bytes);
6552 		spin_unlock(&cache->lock);
6553 	}
6554 	if (++index < BTRFS_NR_RAID_TYPES)
6555 		goto again;
6556 	up_read(&info->groups_sem);
6557 }
6558 
6559 int btrfs_reserve_extent(struct btrfs_root *root,
6560 			 u64 num_bytes, u64 min_alloc_size,
6561 			 u64 empty_size, u64 hint_byte,
6562 			 struct btrfs_key *ins, int is_data)
6563 {
6564 	bool final_tried = false;
6565 	u64 flags;
6566 	int ret;
6567 
6568 	flags = btrfs_get_alloc_profile(root, is_data);
6569 again:
6570 	WARN_ON(num_bytes < root->sectorsize);
6571 	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
6572 			       flags);
6573 
6574 	if (ret == -ENOSPC) {
6575 		if (!final_tried) {
6576 			num_bytes = num_bytes >> 1;
6577 			num_bytes = round_down(num_bytes, root->sectorsize);
6578 			num_bytes = max(num_bytes, min_alloc_size);
6579 			if (num_bytes == min_alloc_size)
6580 				final_tried = true;
6581 			goto again;
6582 		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6583 			struct btrfs_space_info *sinfo;
6584 
6585 			sinfo = __find_space_info(root->fs_info, flags);
6586 			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
6587 				flags, num_bytes);
6588 			if (sinfo)
6589 				dump_space_info(sinfo, num_bytes, 1);
6590 		}
6591 	}
6592 
6593 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6594 
6595 	return ret;
6596 }
6597 
6598 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6599 					u64 start, u64 len, int pin)
6600 {
6601 	struct btrfs_block_group_cache *cache;
6602 	int ret = 0;
6603 
6604 	cache = btrfs_lookup_block_group(root->fs_info, start);
6605 	if (!cache) {
6606 		btrfs_err(root->fs_info, "Unable to find block group for %llu",
6607 			start);
6608 		return -ENOSPC;
6609 	}
6610 
6611 	if (btrfs_test_opt(root, DISCARD))
6612 		ret = btrfs_discard_extent(root, start, len, NULL);
6613 
6614 	if (pin)
6615 		pin_down_extent(root, cache, start, len, 1);
6616 	else {
6617 		btrfs_add_free_space(cache, start, len);
6618 		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6619 	}
6620 	btrfs_put_block_group(cache);
6621 
6622 	trace_btrfs_reserved_extent_free(root, start, len);
6623 
6624 	return ret;
6625 }
6626 
6627 int btrfs_free_reserved_extent(struct btrfs_root *root,
6628 					u64 start, u64 len)
6629 {
6630 	return __btrfs_free_reserved_extent(root, start, len, 0);
6631 }
6632 
6633 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6634 				       u64 start, u64 len)
6635 {
6636 	return __btrfs_free_reserved_extent(root, start, len, 1);
6637 }
6638 
6639 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6640 				      struct btrfs_root *root,
6641 				      u64 parent, u64 root_objectid,
6642 				      u64 flags, u64 owner, u64 offset,
6643 				      struct btrfs_key *ins, int ref_mod)
6644 {
6645 	int ret;
6646 	struct btrfs_fs_info *fs_info = root->fs_info;
6647 	struct btrfs_extent_item *extent_item;
6648 	struct btrfs_extent_inline_ref *iref;
6649 	struct btrfs_path *path;
6650 	struct extent_buffer *leaf;
6651 	int type;
6652 	u32 size;
6653 
6654 	if (parent > 0)
6655 		type = BTRFS_SHARED_DATA_REF_KEY;
6656 	else
6657 		type = BTRFS_EXTENT_DATA_REF_KEY;
6658 
6659 	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6660 
6661 	path = btrfs_alloc_path();
6662 	if (!path)
6663 		return -ENOMEM;
6664 
6665 	path->leave_spinning = 1;
6666 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6667 				      ins, size);
6668 	if (ret) {
6669 		btrfs_free_path(path);
6670 		return ret;
6671 	}
6672 
6673 	leaf = path->nodes[0];
6674 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6675 				     struct btrfs_extent_item);
6676 	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6677 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6678 	btrfs_set_extent_flags(leaf, extent_item,
6679 			       flags | BTRFS_EXTENT_FLAG_DATA);
6680 
6681 	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6682 	btrfs_set_extent_inline_ref_type(leaf, iref, type);
6683 	if (parent > 0) {
6684 		struct btrfs_shared_data_ref *ref;
6685 		ref = (struct btrfs_shared_data_ref *)(iref + 1);
6686 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6687 		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6688 	} else {
6689 		struct btrfs_extent_data_ref *ref;
6690 		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6691 		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6692 		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6693 		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6694 		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6695 	}
6696 
6697 	btrfs_mark_buffer_dirty(path->nodes[0]);
6698 	btrfs_free_path(path);
6699 
6700 	ret = update_block_group(root, ins->objectid, ins->offset, 1);
6701 	if (ret) { /* -ENOENT, logic error */
6702 		btrfs_err(fs_info, "update block group failed for %llu %llu",
6703 			ins->objectid, ins->offset);
6704 		BUG();
6705 	}
6706 	return ret;
6707 }
6708 
6709 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6710 				     struct btrfs_root *root,
6711 				     u64 parent, u64 root_objectid,
6712 				     u64 flags, struct btrfs_disk_key *key,
6713 				     int level, struct btrfs_key *ins)
6714 {
6715 	int ret;
6716 	struct btrfs_fs_info *fs_info = root->fs_info;
6717 	struct btrfs_extent_item *extent_item;
6718 	struct btrfs_tree_block_info *block_info;
6719 	struct btrfs_extent_inline_ref *iref;
6720 	struct btrfs_path *path;
6721 	struct extent_buffer *leaf;
6722 	u32 size = sizeof(*extent_item) + sizeof(*iref);
6723 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6724 						 SKINNY_METADATA);
6725 
6726 	if (!skinny_metadata)
6727 		size += sizeof(*block_info);
6728 
6729 	path = btrfs_alloc_path();
6730 	if (!path)
6731 		return -ENOMEM;
6732 
6733 	path->leave_spinning = 1;
6734 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6735 				      ins, size);
6736 	if (ret) {
6737 		btrfs_free_path(path);
6738 		return ret;
6739 	}
6740 
6741 	leaf = path->nodes[0];
6742 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
6743 				     struct btrfs_extent_item);
6744 	btrfs_set_extent_refs(leaf, extent_item, 1);
6745 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6746 	btrfs_set_extent_flags(leaf, extent_item,
6747 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6748 
6749 	if (skinny_metadata) {
6750 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6751 	} else {
6752 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6753 		btrfs_set_tree_block_key(leaf, block_info, key);
6754 		btrfs_set_tree_block_level(leaf, block_info, level);
6755 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6756 	}
6757 
6758 	if (parent > 0) {
6759 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6760 		btrfs_set_extent_inline_ref_type(leaf, iref,
6761 						 BTRFS_SHARED_BLOCK_REF_KEY);
6762 		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6763 	} else {
6764 		btrfs_set_extent_inline_ref_type(leaf, iref,
6765 						 BTRFS_TREE_BLOCK_REF_KEY);
6766 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6767 	}
6768 
6769 	btrfs_mark_buffer_dirty(leaf);
6770 	btrfs_free_path(path);
6771 
6772 	ret = update_block_group(root, ins->objectid, root->leafsize, 1);
6773 	if (ret) { /* -ENOENT, logic error */
6774 		btrfs_err(fs_info, "update block group failed for %llu %llu",
6775 			ins->objectid, ins->offset);
6776 		BUG();
6777 	}
6778 	return ret;
6779 }
6780 
6781 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6782 				     struct btrfs_root *root,
6783 				     u64 root_objectid, u64 owner,
6784 				     u64 offset, struct btrfs_key *ins)
6785 {
6786 	int ret;
6787 
6788 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6789 
6790 	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6791 					 ins->offset, 0,
6792 					 root_objectid, owner, offset,
6793 					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6794 	return ret;
6795 }
6796 
6797 /*
6798  * this is used by the tree logging recovery code.  It records that
6799  * an extent has been allocated and makes sure to clear the free
6800  * space cache bits as well
6801  */
6802 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6803 				   struct btrfs_root *root,
6804 				   u64 root_objectid, u64 owner, u64 offset,
6805 				   struct btrfs_key *ins)
6806 {
6807 	int ret;
6808 	struct btrfs_block_group_cache *block_group;
6809 
6810 	/*
6811 	 * Mixed block groups will exclude before processing the log so we only
6812 	 * need to do the exlude dance if this fs isn't mixed.
6813 	 */
6814 	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6815 		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6816 		if (ret)
6817 			return ret;
6818 	}
6819 
6820 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6821 	if (!block_group)
6822 		return -EINVAL;
6823 
6824 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6825 					  RESERVE_ALLOC_NO_ACCOUNT);
6826 	BUG_ON(ret); /* logic error */
6827 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6828 					 0, owner, offset, ins, 1);
6829 	btrfs_put_block_group(block_group);
6830 	return ret;
6831 }
6832 
6833 static struct extent_buffer *
6834 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6835 		      u64 bytenr, u32 blocksize, int level)
6836 {
6837 	struct extent_buffer *buf;
6838 
6839 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6840 	if (!buf)
6841 		return ERR_PTR(-ENOMEM);
6842 	btrfs_set_header_generation(buf, trans->transid);
6843 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6844 	btrfs_tree_lock(buf);
6845 	clean_tree_block(trans, root, buf);
6846 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6847 
6848 	btrfs_set_lock_blocking(buf);
6849 	btrfs_set_buffer_uptodate(buf);
6850 
6851 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6852 		/*
6853 		 * we allow two log transactions at a time, use different
6854 		 * EXENT bit to differentiate dirty pages.
6855 		 */
6856 		if (root->log_transid % 2 == 0)
6857 			set_extent_dirty(&root->dirty_log_pages, buf->start,
6858 					buf->start + buf->len - 1, GFP_NOFS);
6859 		else
6860 			set_extent_new(&root->dirty_log_pages, buf->start,
6861 					buf->start + buf->len - 1, GFP_NOFS);
6862 	} else {
6863 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6864 			 buf->start + buf->len - 1, GFP_NOFS);
6865 	}
6866 	trans->blocks_used++;
6867 	/* this returns a buffer locked for blocking */
6868 	return buf;
6869 }
6870 
6871 static struct btrfs_block_rsv *
6872 use_block_rsv(struct btrfs_trans_handle *trans,
6873 	      struct btrfs_root *root, u32 blocksize)
6874 {
6875 	struct btrfs_block_rsv *block_rsv;
6876 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6877 	int ret;
6878 	bool global_updated = false;
6879 
6880 	block_rsv = get_block_rsv(trans, root);
6881 
6882 	if (unlikely(block_rsv->size == 0))
6883 		goto try_reserve;
6884 again:
6885 	ret = block_rsv_use_bytes(block_rsv, blocksize);
6886 	if (!ret)
6887 		return block_rsv;
6888 
6889 	if (block_rsv->failfast)
6890 		return ERR_PTR(ret);
6891 
6892 	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
6893 		global_updated = true;
6894 		update_global_block_rsv(root->fs_info);
6895 		goto again;
6896 	}
6897 
6898 	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6899 		static DEFINE_RATELIMIT_STATE(_rs,
6900 				DEFAULT_RATELIMIT_INTERVAL * 10,
6901 				/*DEFAULT_RATELIMIT_BURST*/ 1);
6902 		if (__ratelimit(&_rs))
6903 			WARN(1, KERN_DEBUG
6904 				"btrfs: block rsv returned %d\n", ret);
6905 	}
6906 try_reserve:
6907 	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6908 				     BTRFS_RESERVE_NO_FLUSH);
6909 	if (!ret)
6910 		return block_rsv;
6911 	/*
6912 	 * If we couldn't reserve metadata bytes try and use some from
6913 	 * the global reserve if its space type is the same as the global
6914 	 * reservation.
6915 	 */
6916 	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
6917 	    block_rsv->space_info == global_rsv->space_info) {
6918 		ret = block_rsv_use_bytes(global_rsv, blocksize);
6919 		if (!ret)
6920 			return global_rsv;
6921 	}
6922 	return ERR_PTR(ret);
6923 }
6924 
6925 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6926 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
6927 {
6928 	block_rsv_add_bytes(block_rsv, blocksize, 0);
6929 	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6930 }
6931 
6932 /*
6933  * finds a free extent and does all the dirty work required for allocation
6934  * returns the key for the extent through ins, and a tree buffer for
6935  * the first block of the extent through buf.
6936  *
6937  * returns the tree buffer or NULL.
6938  */
6939 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6940 					struct btrfs_root *root, u32 blocksize,
6941 					u64 parent, u64 root_objectid,
6942 					struct btrfs_disk_key *key, int level,
6943 					u64 hint, u64 empty_size)
6944 {
6945 	struct btrfs_key ins;
6946 	struct btrfs_block_rsv *block_rsv;
6947 	struct extent_buffer *buf;
6948 	u64 flags = 0;
6949 	int ret;
6950 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6951 						 SKINNY_METADATA);
6952 
6953 	block_rsv = use_block_rsv(trans, root, blocksize);
6954 	if (IS_ERR(block_rsv))
6955 		return ERR_CAST(block_rsv);
6956 
6957 	ret = btrfs_reserve_extent(root, blocksize, blocksize,
6958 				   empty_size, hint, &ins, 0);
6959 	if (ret) {
6960 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6961 		return ERR_PTR(ret);
6962 	}
6963 
6964 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6965 				    blocksize, level);
6966 	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6967 
6968 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6969 		if (parent == 0)
6970 			parent = ins.objectid;
6971 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6972 	} else
6973 		BUG_ON(parent > 0);
6974 
6975 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6976 		struct btrfs_delayed_extent_op *extent_op;
6977 		extent_op = btrfs_alloc_delayed_extent_op();
6978 		BUG_ON(!extent_op); /* -ENOMEM */
6979 		if (key)
6980 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
6981 		else
6982 			memset(&extent_op->key, 0, sizeof(extent_op->key));
6983 		extent_op->flags_to_set = flags;
6984 		if (skinny_metadata)
6985 			extent_op->update_key = 0;
6986 		else
6987 			extent_op->update_key = 1;
6988 		extent_op->update_flags = 1;
6989 		extent_op->is_data = 0;
6990 		extent_op->level = level;
6991 
6992 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6993 					ins.objectid,
6994 					ins.offset, parent, root_objectid,
6995 					level, BTRFS_ADD_DELAYED_EXTENT,
6996 					extent_op, 0);
6997 		BUG_ON(ret); /* -ENOMEM */
6998 	}
6999 	return buf;
7000 }
7001 
7002 struct walk_control {
7003 	u64 refs[BTRFS_MAX_LEVEL];
7004 	u64 flags[BTRFS_MAX_LEVEL];
7005 	struct btrfs_key update_progress;
7006 	int stage;
7007 	int level;
7008 	int shared_level;
7009 	int update_ref;
7010 	int keep_locks;
7011 	int reada_slot;
7012 	int reada_count;
7013 	int for_reloc;
7014 };
7015 
7016 #define DROP_REFERENCE	1
7017 #define UPDATE_BACKREF	2
7018 
7019 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7020 				     struct btrfs_root *root,
7021 				     struct walk_control *wc,
7022 				     struct btrfs_path *path)
7023 {
7024 	u64 bytenr;
7025 	u64 generation;
7026 	u64 refs;
7027 	u64 flags;
7028 	u32 nritems;
7029 	u32 blocksize;
7030 	struct btrfs_key key;
7031 	struct extent_buffer *eb;
7032 	int ret;
7033 	int slot;
7034 	int nread = 0;
7035 
7036 	if (path->slots[wc->level] < wc->reada_slot) {
7037 		wc->reada_count = wc->reada_count * 2 / 3;
7038 		wc->reada_count = max(wc->reada_count, 2);
7039 	} else {
7040 		wc->reada_count = wc->reada_count * 3 / 2;
7041 		wc->reada_count = min_t(int, wc->reada_count,
7042 					BTRFS_NODEPTRS_PER_BLOCK(root));
7043 	}
7044 
7045 	eb = path->nodes[wc->level];
7046 	nritems = btrfs_header_nritems(eb);
7047 	blocksize = btrfs_level_size(root, wc->level - 1);
7048 
7049 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7050 		if (nread >= wc->reada_count)
7051 			break;
7052 
7053 		cond_resched();
7054 		bytenr = btrfs_node_blockptr(eb, slot);
7055 		generation = btrfs_node_ptr_generation(eb, slot);
7056 
7057 		if (slot == path->slots[wc->level])
7058 			goto reada;
7059 
7060 		if (wc->stage == UPDATE_BACKREF &&
7061 		    generation <= root->root_key.offset)
7062 			continue;
7063 
7064 		/* We don't lock the tree block, it's OK to be racy here */
7065 		ret = btrfs_lookup_extent_info(trans, root, bytenr,
7066 					       wc->level - 1, 1, &refs,
7067 					       &flags);
7068 		/* We don't care about errors in readahead. */
7069 		if (ret < 0)
7070 			continue;
7071 		BUG_ON(refs == 0);
7072 
7073 		if (wc->stage == DROP_REFERENCE) {
7074 			if (refs == 1)
7075 				goto reada;
7076 
7077 			if (wc->level == 1 &&
7078 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7079 				continue;
7080 			if (!wc->update_ref ||
7081 			    generation <= root->root_key.offset)
7082 				continue;
7083 			btrfs_node_key_to_cpu(eb, &key, slot);
7084 			ret = btrfs_comp_cpu_keys(&key,
7085 						  &wc->update_progress);
7086 			if (ret < 0)
7087 				continue;
7088 		} else {
7089 			if (wc->level == 1 &&
7090 			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7091 				continue;
7092 		}
7093 reada:
7094 		ret = readahead_tree_block(root, bytenr, blocksize,
7095 					   generation);
7096 		if (ret)
7097 			break;
7098 		nread++;
7099 	}
7100 	wc->reada_slot = slot;
7101 }
7102 
7103 /*
7104  * helper to process tree block while walking down the tree.
7105  *
7106  * when wc->stage == UPDATE_BACKREF, this function updates
7107  * back refs for pointers in the block.
7108  *
7109  * NOTE: return value 1 means we should stop walking down.
7110  */
7111 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7112 				   struct btrfs_root *root,
7113 				   struct btrfs_path *path,
7114 				   struct walk_control *wc, int lookup_info)
7115 {
7116 	int level = wc->level;
7117 	struct extent_buffer *eb = path->nodes[level];
7118 	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7119 	int ret;
7120 
7121 	if (wc->stage == UPDATE_BACKREF &&
7122 	    btrfs_header_owner(eb) != root->root_key.objectid)
7123 		return 1;
7124 
7125 	/*
7126 	 * when reference count of tree block is 1, it won't increase
7127 	 * again. once full backref flag is set, we never clear it.
7128 	 */
7129 	if (lookup_info &&
7130 	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
7131 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
7132 		BUG_ON(!path->locks[level]);
7133 		ret = btrfs_lookup_extent_info(trans, root,
7134 					       eb->start, level, 1,
7135 					       &wc->refs[level],
7136 					       &wc->flags[level]);
7137 		BUG_ON(ret == -ENOMEM);
7138 		if (ret)
7139 			return ret;
7140 		BUG_ON(wc->refs[level] == 0);
7141 	}
7142 
7143 	if (wc->stage == DROP_REFERENCE) {
7144 		if (wc->refs[level] > 1)
7145 			return 1;
7146 
7147 		if (path->locks[level] && !wc->keep_locks) {
7148 			btrfs_tree_unlock_rw(eb, path->locks[level]);
7149 			path->locks[level] = 0;
7150 		}
7151 		return 0;
7152 	}
7153 
7154 	/* wc->stage == UPDATE_BACKREF */
7155 	if (!(wc->flags[level] & flag)) {
7156 		BUG_ON(!path->locks[level]);
7157 		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
7158 		BUG_ON(ret); /* -ENOMEM */
7159 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
7160 		BUG_ON(ret); /* -ENOMEM */
7161 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
7162 						  eb->len, flag,
7163 						  btrfs_header_level(eb), 0);
7164 		BUG_ON(ret); /* -ENOMEM */
7165 		wc->flags[level] |= flag;
7166 	}
7167 
7168 	/*
7169 	 * the block is shared by multiple trees, so it's not good to
7170 	 * keep the tree lock
7171 	 */
7172 	if (path->locks[level] && level > 0) {
7173 		btrfs_tree_unlock_rw(eb, path->locks[level]);
7174 		path->locks[level] = 0;
7175 	}
7176 	return 0;
7177 }
7178 
7179 /*
7180  * helper to process tree block pointer.
7181  *
7182  * when wc->stage == DROP_REFERENCE, this function checks
7183  * reference count of the block pointed to. if the block
7184  * is shared and we need update back refs for the subtree
7185  * rooted at the block, this function changes wc->stage to
7186  * UPDATE_BACKREF. if the block is shared and there is no
7187  * need to update back, this function drops the reference
7188  * to the block.
7189  *
7190  * NOTE: return value 1 means we should stop walking down.
7191  */
7192 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7193 				 struct btrfs_root *root,
7194 				 struct btrfs_path *path,
7195 				 struct walk_control *wc, int *lookup_info)
7196 {
7197 	u64 bytenr;
7198 	u64 generation;
7199 	u64 parent;
7200 	u32 blocksize;
7201 	struct btrfs_key key;
7202 	struct extent_buffer *next;
7203 	int level = wc->level;
7204 	int reada = 0;
7205 	int ret = 0;
7206 
7207 	generation = btrfs_node_ptr_generation(path->nodes[level],
7208 					       path->slots[level]);
7209 	/*
7210 	 * if the lower level block was created before the snapshot
7211 	 * was created, we know there is no need to update back refs
7212 	 * for the subtree
7213 	 */
7214 	if (wc->stage == UPDATE_BACKREF &&
7215 	    generation <= root->root_key.offset) {
7216 		*lookup_info = 1;
7217 		return 1;
7218 	}
7219 
7220 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7221 	blocksize = btrfs_level_size(root, level - 1);
7222 
7223 	next = btrfs_find_tree_block(root, bytenr, blocksize);
7224 	if (!next) {
7225 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
7226 		if (!next)
7227 			return -ENOMEM;
7228 		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
7229 					       level - 1);
7230 		reada = 1;
7231 	}
7232 	btrfs_tree_lock(next);
7233 	btrfs_set_lock_blocking(next);
7234 
7235 	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
7236 				       &wc->refs[level - 1],
7237 				       &wc->flags[level - 1]);
7238 	if (ret < 0) {
7239 		btrfs_tree_unlock(next);
7240 		return ret;
7241 	}
7242 
7243 	if (unlikely(wc->refs[level - 1] == 0)) {
7244 		btrfs_err(root->fs_info, "Missing references.");
7245 		BUG();
7246 	}
7247 	*lookup_info = 0;
7248 
7249 	if (wc->stage == DROP_REFERENCE) {
7250 		if (wc->refs[level - 1] > 1) {
7251 			if (level == 1 &&
7252 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7253 				goto skip;
7254 
7255 			if (!wc->update_ref ||
7256 			    generation <= root->root_key.offset)
7257 				goto skip;
7258 
7259 			btrfs_node_key_to_cpu(path->nodes[level], &key,
7260 					      path->slots[level]);
7261 			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7262 			if (ret < 0)
7263 				goto skip;
7264 
7265 			wc->stage = UPDATE_BACKREF;
7266 			wc->shared_level = level - 1;
7267 		}
7268 	} else {
7269 		if (level == 1 &&
7270 		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7271 			goto skip;
7272 	}
7273 
7274 	if (!btrfs_buffer_uptodate(next, generation, 0)) {
7275 		btrfs_tree_unlock(next);
7276 		free_extent_buffer(next);
7277 		next = NULL;
7278 		*lookup_info = 1;
7279 	}
7280 
7281 	if (!next) {
7282 		if (reada && level == 1)
7283 			reada_walk_down(trans, root, wc, path);
7284 		next = read_tree_block(root, bytenr, blocksize, generation);
7285 		if (!next || !extent_buffer_uptodate(next)) {
7286 			free_extent_buffer(next);
7287 			return -EIO;
7288 		}
7289 		btrfs_tree_lock(next);
7290 		btrfs_set_lock_blocking(next);
7291 	}
7292 
7293 	level--;
7294 	BUG_ON(level != btrfs_header_level(next));
7295 	path->nodes[level] = next;
7296 	path->slots[level] = 0;
7297 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7298 	wc->level = level;
7299 	if (wc->level == 1)
7300 		wc->reada_slot = 0;
7301 	return 0;
7302 skip:
7303 	wc->refs[level - 1] = 0;
7304 	wc->flags[level - 1] = 0;
7305 	if (wc->stage == DROP_REFERENCE) {
7306 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7307 			parent = path->nodes[level]->start;
7308 		} else {
7309 			BUG_ON(root->root_key.objectid !=
7310 			       btrfs_header_owner(path->nodes[level]));
7311 			parent = 0;
7312 		}
7313 
7314 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7315 				root->root_key.objectid, level - 1, 0, 0);
7316 		BUG_ON(ret); /* -ENOMEM */
7317 	}
7318 	btrfs_tree_unlock(next);
7319 	free_extent_buffer(next);
7320 	*lookup_info = 1;
7321 	return 1;
7322 }
7323 
7324 /*
7325  * helper to process tree block while walking up the tree.
7326  *
7327  * when wc->stage == DROP_REFERENCE, this function drops
7328  * reference count on the block.
7329  *
7330  * when wc->stage == UPDATE_BACKREF, this function changes
7331  * wc->stage back to DROP_REFERENCE if we changed wc->stage
7332  * to UPDATE_BACKREF previously while processing the block.
7333  *
7334  * NOTE: return value 1 means we should stop walking up.
7335  */
7336 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7337 				 struct btrfs_root *root,
7338 				 struct btrfs_path *path,
7339 				 struct walk_control *wc)
7340 {
7341 	int ret;
7342 	int level = wc->level;
7343 	struct extent_buffer *eb = path->nodes[level];
7344 	u64 parent = 0;
7345 
7346 	if (wc->stage == UPDATE_BACKREF) {
7347 		BUG_ON(wc->shared_level < level);
7348 		if (level < wc->shared_level)
7349 			goto out;
7350 
7351 		ret = find_next_key(path, level + 1, &wc->update_progress);
7352 		if (ret > 0)
7353 			wc->update_ref = 0;
7354 
7355 		wc->stage = DROP_REFERENCE;
7356 		wc->shared_level = -1;
7357 		path->slots[level] = 0;
7358 
7359 		/*
7360 		 * check reference count again if the block isn't locked.
7361 		 * we should start walking down the tree again if reference
7362 		 * count is one.
7363 		 */
7364 		if (!path->locks[level]) {
7365 			BUG_ON(level == 0);
7366 			btrfs_tree_lock(eb);
7367 			btrfs_set_lock_blocking(eb);
7368 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7369 
7370 			ret = btrfs_lookup_extent_info(trans, root,
7371 						       eb->start, level, 1,
7372 						       &wc->refs[level],
7373 						       &wc->flags[level]);
7374 			if (ret < 0) {
7375 				btrfs_tree_unlock_rw(eb, path->locks[level]);
7376 				path->locks[level] = 0;
7377 				return ret;
7378 			}
7379 			BUG_ON(wc->refs[level] == 0);
7380 			if (wc->refs[level] == 1) {
7381 				btrfs_tree_unlock_rw(eb, path->locks[level]);
7382 				path->locks[level] = 0;
7383 				return 1;
7384 			}
7385 		}
7386 	}
7387 
7388 	/* wc->stage == DROP_REFERENCE */
7389 	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
7390 
7391 	if (wc->refs[level] == 1) {
7392 		if (level == 0) {
7393 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7394 				ret = btrfs_dec_ref(trans, root, eb, 1,
7395 						    wc->for_reloc);
7396 			else
7397 				ret = btrfs_dec_ref(trans, root, eb, 0,
7398 						    wc->for_reloc);
7399 			BUG_ON(ret); /* -ENOMEM */
7400 		}
7401 		/* make block locked assertion in clean_tree_block happy */
7402 		if (!path->locks[level] &&
7403 		    btrfs_header_generation(eb) == trans->transid) {
7404 			btrfs_tree_lock(eb);
7405 			btrfs_set_lock_blocking(eb);
7406 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7407 		}
7408 		clean_tree_block(trans, root, eb);
7409 	}
7410 
7411 	if (eb == root->node) {
7412 		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7413 			parent = eb->start;
7414 		else
7415 			BUG_ON(root->root_key.objectid !=
7416 			       btrfs_header_owner(eb));
7417 	} else {
7418 		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7419 			parent = path->nodes[level + 1]->start;
7420 		else
7421 			BUG_ON(root->root_key.objectid !=
7422 			       btrfs_header_owner(path->nodes[level + 1]));
7423 	}
7424 
7425 	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
7426 out:
7427 	wc->refs[level] = 0;
7428 	wc->flags[level] = 0;
7429 	return 0;
7430 }
7431 
7432 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7433 				   struct btrfs_root *root,
7434 				   struct btrfs_path *path,
7435 				   struct walk_control *wc)
7436 {
7437 	int level = wc->level;
7438 	int lookup_info = 1;
7439 	int ret;
7440 
7441 	while (level >= 0) {
7442 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
7443 		if (ret > 0)
7444 			break;
7445 
7446 		if (level == 0)
7447 			break;
7448 
7449 		if (path->slots[level] >=
7450 		    btrfs_header_nritems(path->nodes[level]))
7451 			break;
7452 
7453 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
7454 		if (ret > 0) {
7455 			path->slots[level]++;
7456 			continue;
7457 		} else if (ret < 0)
7458 			return ret;
7459 		level = wc->level;
7460 	}
7461 	return 0;
7462 }
7463 
7464 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
7465 				 struct btrfs_root *root,
7466 				 struct btrfs_path *path,
7467 				 struct walk_control *wc, int max_level)
7468 {
7469 	int level = wc->level;
7470 	int ret;
7471 
7472 	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7473 	while (level < max_level && path->nodes[level]) {
7474 		wc->level = level;
7475 		if (path->slots[level] + 1 <
7476 		    btrfs_header_nritems(path->nodes[level])) {
7477 			path->slots[level]++;
7478 			return 0;
7479 		} else {
7480 			ret = walk_up_proc(trans, root, path, wc);
7481 			if (ret > 0)
7482 				return 0;
7483 
7484 			if (path->locks[level]) {
7485 				btrfs_tree_unlock_rw(path->nodes[level],
7486 						     path->locks[level]);
7487 				path->locks[level] = 0;
7488 			}
7489 			free_extent_buffer(path->nodes[level]);
7490 			path->nodes[level] = NULL;
7491 			level++;
7492 		}
7493 	}
7494 	return 1;
7495 }
7496 
7497 /*
7498  * drop a subvolume tree.
7499  *
7500  * this function traverses the tree freeing any blocks that only
7501  * referenced by the tree.
7502  *
7503  * when a shared tree block is found. this function decreases its
7504  * reference count by one. if update_ref is true, this function
7505  * also make sure backrefs for the shared block and all lower level
7506  * blocks are properly updated.
7507  *
7508  * If called with for_reloc == 0, may exit early with -EAGAIN
7509  */
7510 int btrfs_drop_snapshot(struct btrfs_root *root,
7511 			 struct btrfs_block_rsv *block_rsv, int update_ref,
7512 			 int for_reloc)
7513 {
7514 	struct btrfs_path *path;
7515 	struct btrfs_trans_handle *trans;
7516 	struct btrfs_root *tree_root = root->fs_info->tree_root;
7517 	struct btrfs_root_item *root_item = &root->root_item;
7518 	struct walk_control *wc;
7519 	struct btrfs_key key;
7520 	int err = 0;
7521 	int ret;
7522 	int level;
7523 	bool root_dropped = false;
7524 
7525 	path = btrfs_alloc_path();
7526 	if (!path) {
7527 		err = -ENOMEM;
7528 		goto out;
7529 	}
7530 
7531 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
7532 	if (!wc) {
7533 		btrfs_free_path(path);
7534 		err = -ENOMEM;
7535 		goto out;
7536 	}
7537 
7538 	trans = btrfs_start_transaction(tree_root, 0);
7539 	if (IS_ERR(trans)) {
7540 		err = PTR_ERR(trans);
7541 		goto out_free;
7542 	}
7543 
7544 	if (block_rsv)
7545 		trans->block_rsv = block_rsv;
7546 
7547 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7548 		level = btrfs_header_level(root->node);
7549 		path->nodes[level] = btrfs_lock_root_node(root);
7550 		btrfs_set_lock_blocking(path->nodes[level]);
7551 		path->slots[level] = 0;
7552 		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7553 		memset(&wc->update_progress, 0,
7554 		       sizeof(wc->update_progress));
7555 	} else {
7556 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7557 		memcpy(&wc->update_progress, &key,
7558 		       sizeof(wc->update_progress));
7559 
7560 		level = root_item->drop_level;
7561 		BUG_ON(level == 0);
7562 		path->lowest_level = level;
7563 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7564 		path->lowest_level = 0;
7565 		if (ret < 0) {
7566 			err = ret;
7567 			goto out_end_trans;
7568 		}
7569 		WARN_ON(ret > 0);
7570 
7571 		/*
7572 		 * unlock our path, this is safe because only this
7573 		 * function is allowed to delete this snapshot
7574 		 */
7575 		btrfs_unlock_up_safe(path, 0);
7576 
7577 		level = btrfs_header_level(root->node);
7578 		while (1) {
7579 			btrfs_tree_lock(path->nodes[level]);
7580 			btrfs_set_lock_blocking(path->nodes[level]);
7581 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7582 
7583 			ret = btrfs_lookup_extent_info(trans, root,
7584 						path->nodes[level]->start,
7585 						level, 1, &wc->refs[level],
7586 						&wc->flags[level]);
7587 			if (ret < 0) {
7588 				err = ret;
7589 				goto out_end_trans;
7590 			}
7591 			BUG_ON(wc->refs[level] == 0);
7592 
7593 			if (level == root_item->drop_level)
7594 				break;
7595 
7596 			btrfs_tree_unlock(path->nodes[level]);
7597 			path->locks[level] = 0;
7598 			WARN_ON(wc->refs[level] != 1);
7599 			level--;
7600 		}
7601 	}
7602 
7603 	wc->level = level;
7604 	wc->shared_level = -1;
7605 	wc->stage = DROP_REFERENCE;
7606 	wc->update_ref = update_ref;
7607 	wc->keep_locks = 0;
7608 	wc->for_reloc = for_reloc;
7609 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7610 
7611 	while (1) {
7612 
7613 		ret = walk_down_tree(trans, root, path, wc);
7614 		if (ret < 0) {
7615 			err = ret;
7616 			break;
7617 		}
7618 
7619 		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7620 		if (ret < 0) {
7621 			err = ret;
7622 			break;
7623 		}
7624 
7625 		if (ret > 0) {
7626 			BUG_ON(wc->stage != DROP_REFERENCE);
7627 			break;
7628 		}
7629 
7630 		if (wc->stage == DROP_REFERENCE) {
7631 			level = wc->level;
7632 			btrfs_node_key(path->nodes[level],
7633 				       &root_item->drop_progress,
7634 				       path->slots[level]);
7635 			root_item->drop_level = level;
7636 		}
7637 
7638 		BUG_ON(wc->level == 0);
7639 		if (btrfs_should_end_transaction(trans, tree_root) ||
7640 		    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
7641 			ret = btrfs_update_root(trans, tree_root,
7642 						&root->root_key,
7643 						root_item);
7644 			if (ret) {
7645 				btrfs_abort_transaction(trans, tree_root, ret);
7646 				err = ret;
7647 				goto out_end_trans;
7648 			}
7649 
7650 			btrfs_end_transaction_throttle(trans, tree_root);
7651 			if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7652 				pr_debug("btrfs: drop snapshot early exit\n");
7653 				err = -EAGAIN;
7654 				goto out_free;
7655 			}
7656 
7657 			trans = btrfs_start_transaction(tree_root, 0);
7658 			if (IS_ERR(trans)) {
7659 				err = PTR_ERR(trans);
7660 				goto out_free;
7661 			}
7662 			if (block_rsv)
7663 				trans->block_rsv = block_rsv;
7664 		}
7665 	}
7666 	btrfs_release_path(path);
7667 	if (err)
7668 		goto out_end_trans;
7669 
7670 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
7671 	if (ret) {
7672 		btrfs_abort_transaction(trans, tree_root, ret);
7673 		goto out_end_trans;
7674 	}
7675 
7676 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7677 		ret = btrfs_find_root(tree_root, &root->root_key, path,
7678 				      NULL, NULL);
7679 		if (ret < 0) {
7680 			btrfs_abort_transaction(trans, tree_root, ret);
7681 			err = ret;
7682 			goto out_end_trans;
7683 		} else if (ret > 0) {
7684 			/* if we fail to delete the orphan item this time
7685 			 * around, it'll get picked up the next time.
7686 			 *
7687 			 * The most common failure here is just -ENOENT.
7688 			 */
7689 			btrfs_del_orphan_item(trans, tree_root,
7690 					      root->root_key.objectid);
7691 		}
7692 	}
7693 
7694 	if (root->in_radix) {
7695 		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7696 	} else {
7697 		free_extent_buffer(root->node);
7698 		free_extent_buffer(root->commit_root);
7699 		btrfs_put_fs_root(root);
7700 	}
7701 	root_dropped = true;
7702 out_end_trans:
7703 	btrfs_end_transaction_throttle(trans, tree_root);
7704 out_free:
7705 	kfree(wc);
7706 	btrfs_free_path(path);
7707 out:
7708 	/*
7709 	 * So if we need to stop dropping the snapshot for whatever reason we
7710 	 * need to make sure to add it back to the dead root list so that we
7711 	 * keep trying to do the work later.  This also cleans up roots if we
7712 	 * don't have it in the radix (like when we recover after a power fail
7713 	 * or unmount) so we don't leak memory.
7714 	 */
7715 	if (!for_reloc && root_dropped == false)
7716 		btrfs_add_dead_root(root);
7717 	if (err)
7718 		btrfs_std_error(root->fs_info, err);
7719 	return err;
7720 }
7721 
7722 /*
7723  * drop subtree rooted at tree block 'node'.
7724  *
7725  * NOTE: this function will unlock and release tree block 'node'
7726  * only used by relocation code
7727  */
7728 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7729 			struct btrfs_root *root,
7730 			struct extent_buffer *node,
7731 			struct extent_buffer *parent)
7732 {
7733 	struct btrfs_path *path;
7734 	struct walk_control *wc;
7735 	int level;
7736 	int parent_level;
7737 	int ret = 0;
7738 	int wret;
7739 
7740 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7741 
7742 	path = btrfs_alloc_path();
7743 	if (!path)
7744 		return -ENOMEM;
7745 
7746 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
7747 	if (!wc) {
7748 		btrfs_free_path(path);
7749 		return -ENOMEM;
7750 	}
7751 
7752 	btrfs_assert_tree_locked(parent);
7753 	parent_level = btrfs_header_level(parent);
7754 	extent_buffer_get(parent);
7755 	path->nodes[parent_level] = parent;
7756 	path->slots[parent_level] = btrfs_header_nritems(parent);
7757 
7758 	btrfs_assert_tree_locked(node);
7759 	level = btrfs_header_level(node);
7760 	path->nodes[level] = node;
7761 	path->slots[level] = 0;
7762 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7763 
7764 	wc->refs[parent_level] = 1;
7765 	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7766 	wc->level = level;
7767 	wc->shared_level = -1;
7768 	wc->stage = DROP_REFERENCE;
7769 	wc->update_ref = 0;
7770 	wc->keep_locks = 1;
7771 	wc->for_reloc = 1;
7772 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7773 
7774 	while (1) {
7775 		wret = walk_down_tree(trans, root, path, wc);
7776 		if (wret < 0) {
7777 			ret = wret;
7778 			break;
7779 		}
7780 
7781 		wret = walk_up_tree(trans, root, path, wc, parent_level);
7782 		if (wret < 0)
7783 			ret = wret;
7784 		if (wret != 0)
7785 			break;
7786 	}
7787 
7788 	kfree(wc);
7789 	btrfs_free_path(path);
7790 	return ret;
7791 }
7792 
7793 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7794 {
7795 	u64 num_devices;
7796 	u64 stripped;
7797 
7798 	/*
7799 	 * if restripe for this chunk_type is on pick target profile and
7800 	 * return, otherwise do the usual balance
7801 	 */
7802 	stripped = get_restripe_target(root->fs_info, flags);
7803 	if (stripped)
7804 		return extended_to_chunk(stripped);
7805 
7806 	/*
7807 	 * we add in the count of missing devices because we want
7808 	 * to make sure that any RAID levels on a degraded FS
7809 	 * continue to be honored.
7810 	 */
7811 	num_devices = root->fs_info->fs_devices->rw_devices +
7812 		root->fs_info->fs_devices->missing_devices;
7813 
7814 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
7815 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7816 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7817 
7818 	if (num_devices == 1) {
7819 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7820 		stripped = flags & ~stripped;
7821 
7822 		/* turn raid0 into single device chunks */
7823 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
7824 			return stripped;
7825 
7826 		/* turn mirroring into duplication */
7827 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7828 			     BTRFS_BLOCK_GROUP_RAID10))
7829 			return stripped | BTRFS_BLOCK_GROUP_DUP;
7830 	} else {
7831 		/* they already had raid on here, just return */
7832 		if (flags & stripped)
7833 			return flags;
7834 
7835 		stripped |= BTRFS_BLOCK_GROUP_DUP;
7836 		stripped = flags & ~stripped;
7837 
7838 		/* switch duplicated blocks with raid1 */
7839 		if (flags & BTRFS_BLOCK_GROUP_DUP)
7840 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
7841 
7842 		/* this is drive concat, leave it alone */
7843 	}
7844 
7845 	return flags;
7846 }
7847 
7848 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7849 {
7850 	struct btrfs_space_info *sinfo = cache->space_info;
7851 	u64 num_bytes;
7852 	u64 min_allocable_bytes;
7853 	int ret = -ENOSPC;
7854 
7855 
7856 	/*
7857 	 * We need some metadata space and system metadata space for
7858 	 * allocating chunks in some corner cases until we force to set
7859 	 * it to be readonly.
7860 	 */
7861 	if ((sinfo->flags &
7862 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7863 	    !force)
7864 		min_allocable_bytes = 1 * 1024 * 1024;
7865 	else
7866 		min_allocable_bytes = 0;
7867 
7868 	spin_lock(&sinfo->lock);
7869 	spin_lock(&cache->lock);
7870 
7871 	if (cache->ro) {
7872 		ret = 0;
7873 		goto out;
7874 	}
7875 
7876 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7877 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
7878 
7879 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7880 	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7881 	    min_allocable_bytes <= sinfo->total_bytes) {
7882 		sinfo->bytes_readonly += num_bytes;
7883 		cache->ro = 1;
7884 		ret = 0;
7885 	}
7886 out:
7887 	spin_unlock(&cache->lock);
7888 	spin_unlock(&sinfo->lock);
7889 	return ret;
7890 }
7891 
7892 int btrfs_set_block_group_ro(struct btrfs_root *root,
7893 			     struct btrfs_block_group_cache *cache)
7894 
7895 {
7896 	struct btrfs_trans_handle *trans;
7897 	u64 alloc_flags;
7898 	int ret;
7899 
7900 	BUG_ON(cache->ro);
7901 
7902 	trans = btrfs_join_transaction(root);
7903 	if (IS_ERR(trans))
7904 		return PTR_ERR(trans);
7905 
7906 	alloc_flags = update_block_group_flags(root, cache->flags);
7907 	if (alloc_flags != cache->flags) {
7908 		ret = do_chunk_alloc(trans, root, alloc_flags,
7909 				     CHUNK_ALLOC_FORCE);
7910 		if (ret < 0)
7911 			goto out;
7912 	}
7913 
7914 	ret = set_block_group_ro(cache, 0);
7915 	if (!ret)
7916 		goto out;
7917 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7918 	ret = do_chunk_alloc(trans, root, alloc_flags,
7919 			     CHUNK_ALLOC_FORCE);
7920 	if (ret < 0)
7921 		goto out;
7922 	ret = set_block_group_ro(cache, 0);
7923 out:
7924 	btrfs_end_transaction(trans, root);
7925 	return ret;
7926 }
7927 
7928 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7929 			    struct btrfs_root *root, u64 type)
7930 {
7931 	u64 alloc_flags = get_alloc_profile(root, type);
7932 	return do_chunk_alloc(trans, root, alloc_flags,
7933 			      CHUNK_ALLOC_FORCE);
7934 }
7935 
7936 /*
7937  * helper to account the unused space of all the readonly block group in the
7938  * list. takes mirrors into account.
7939  */
7940 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7941 {
7942 	struct btrfs_block_group_cache *block_group;
7943 	u64 free_bytes = 0;
7944 	int factor;
7945 
7946 	list_for_each_entry(block_group, groups_list, list) {
7947 		spin_lock(&block_group->lock);
7948 
7949 		if (!block_group->ro) {
7950 			spin_unlock(&block_group->lock);
7951 			continue;
7952 		}
7953 
7954 		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7955 					  BTRFS_BLOCK_GROUP_RAID10 |
7956 					  BTRFS_BLOCK_GROUP_DUP))
7957 			factor = 2;
7958 		else
7959 			factor = 1;
7960 
7961 		free_bytes += (block_group->key.offset -
7962 			       btrfs_block_group_used(&block_group->item)) *
7963 			       factor;
7964 
7965 		spin_unlock(&block_group->lock);
7966 	}
7967 
7968 	return free_bytes;
7969 }
7970 
7971 /*
7972  * helper to account the unused space of all the readonly block group in the
7973  * space_info. takes mirrors into account.
7974  */
7975 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7976 {
7977 	int i;
7978 	u64 free_bytes = 0;
7979 
7980 	spin_lock(&sinfo->lock);
7981 
7982 	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7983 		if (!list_empty(&sinfo->block_groups[i]))
7984 			free_bytes += __btrfs_get_ro_block_group_free_space(
7985 						&sinfo->block_groups[i]);
7986 
7987 	spin_unlock(&sinfo->lock);
7988 
7989 	return free_bytes;
7990 }
7991 
7992 void btrfs_set_block_group_rw(struct btrfs_root *root,
7993 			      struct btrfs_block_group_cache *cache)
7994 {
7995 	struct btrfs_space_info *sinfo = cache->space_info;
7996 	u64 num_bytes;
7997 
7998 	BUG_ON(!cache->ro);
7999 
8000 	spin_lock(&sinfo->lock);
8001 	spin_lock(&cache->lock);
8002 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8003 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
8004 	sinfo->bytes_readonly -= num_bytes;
8005 	cache->ro = 0;
8006 	spin_unlock(&cache->lock);
8007 	spin_unlock(&sinfo->lock);
8008 }
8009 
8010 /*
8011  * checks to see if its even possible to relocate this block group.
8012  *
8013  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
8014  * ok to go ahead and try.
8015  */
8016 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8017 {
8018 	struct btrfs_block_group_cache *block_group;
8019 	struct btrfs_space_info *space_info;
8020 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8021 	struct btrfs_device *device;
8022 	struct btrfs_trans_handle *trans;
8023 	u64 min_free;
8024 	u64 dev_min = 1;
8025 	u64 dev_nr = 0;
8026 	u64 target;
8027 	int index;
8028 	int full = 0;
8029 	int ret = 0;
8030 
8031 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
8032 
8033 	/* odd, couldn't find the block group, leave it alone */
8034 	if (!block_group)
8035 		return -1;
8036 
8037 	min_free = btrfs_block_group_used(&block_group->item);
8038 
8039 	/* no bytes used, we're good */
8040 	if (!min_free)
8041 		goto out;
8042 
8043 	space_info = block_group->space_info;
8044 	spin_lock(&space_info->lock);
8045 
8046 	full = space_info->full;
8047 
8048 	/*
8049 	 * if this is the last block group we have in this space, we can't
8050 	 * relocate it unless we're able to allocate a new chunk below.
8051 	 *
8052 	 * Otherwise, we need to make sure we have room in the space to handle
8053 	 * all of the extents from this block group.  If we can, we're good
8054 	 */
8055 	if ((space_info->total_bytes != block_group->key.offset) &&
8056 	    (space_info->bytes_used + space_info->bytes_reserved +
8057 	     space_info->bytes_pinned + space_info->bytes_readonly +
8058 	     min_free < space_info->total_bytes)) {
8059 		spin_unlock(&space_info->lock);
8060 		goto out;
8061 	}
8062 	spin_unlock(&space_info->lock);
8063 
8064 	/*
8065 	 * ok we don't have enough space, but maybe we have free space on our
8066 	 * devices to allocate new chunks for relocation, so loop through our
8067 	 * alloc devices and guess if we have enough space.  if this block
8068 	 * group is going to be restriped, run checks against the target
8069 	 * profile instead of the current one.
8070 	 */
8071 	ret = -1;
8072 
8073 	/*
8074 	 * index:
8075 	 *      0: raid10
8076 	 *      1: raid1
8077 	 *      2: dup
8078 	 *      3: raid0
8079 	 *      4: single
8080 	 */
8081 	target = get_restripe_target(root->fs_info, block_group->flags);
8082 	if (target) {
8083 		index = __get_raid_index(extended_to_chunk(target));
8084 	} else {
8085 		/*
8086 		 * this is just a balance, so if we were marked as full
8087 		 * we know there is no space for a new chunk
8088 		 */
8089 		if (full)
8090 			goto out;
8091 
8092 		index = get_block_group_index(block_group);
8093 	}
8094 
8095 	if (index == BTRFS_RAID_RAID10) {
8096 		dev_min = 4;
8097 		/* Divide by 2 */
8098 		min_free >>= 1;
8099 	} else if (index == BTRFS_RAID_RAID1) {
8100 		dev_min = 2;
8101 	} else if (index == BTRFS_RAID_DUP) {
8102 		/* Multiply by 2 */
8103 		min_free <<= 1;
8104 	} else if (index == BTRFS_RAID_RAID0) {
8105 		dev_min = fs_devices->rw_devices;
8106 		do_div(min_free, dev_min);
8107 	}
8108 
8109 	/* We need to do this so that we can look at pending chunks */
8110 	trans = btrfs_join_transaction(root);
8111 	if (IS_ERR(trans)) {
8112 		ret = PTR_ERR(trans);
8113 		goto out;
8114 	}
8115 
8116 	mutex_lock(&root->fs_info->chunk_mutex);
8117 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8118 		u64 dev_offset;
8119 
8120 		/*
8121 		 * check to make sure we can actually find a chunk with enough
8122 		 * space to fit our block group in.
8123 		 */
8124 		if (device->total_bytes > device->bytes_used + min_free &&
8125 		    !device->is_tgtdev_for_dev_replace) {
8126 			ret = find_free_dev_extent(trans, device, min_free,
8127 						   &dev_offset, NULL);
8128 			if (!ret)
8129 				dev_nr++;
8130 
8131 			if (dev_nr >= dev_min)
8132 				break;
8133 
8134 			ret = -1;
8135 		}
8136 	}
8137 	mutex_unlock(&root->fs_info->chunk_mutex);
8138 	btrfs_end_transaction(trans, root);
8139 out:
8140 	btrfs_put_block_group(block_group);
8141 	return ret;
8142 }
8143 
8144 static int find_first_block_group(struct btrfs_root *root,
8145 		struct btrfs_path *path, struct btrfs_key *key)
8146 {
8147 	int ret = 0;
8148 	struct btrfs_key found_key;
8149 	struct extent_buffer *leaf;
8150 	int slot;
8151 
8152 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8153 	if (ret < 0)
8154 		goto out;
8155 
8156 	while (1) {
8157 		slot = path->slots[0];
8158 		leaf = path->nodes[0];
8159 		if (slot >= btrfs_header_nritems(leaf)) {
8160 			ret = btrfs_next_leaf(root, path);
8161 			if (ret == 0)
8162 				continue;
8163 			if (ret < 0)
8164 				goto out;
8165 			break;
8166 		}
8167 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
8168 
8169 		if (found_key.objectid >= key->objectid &&
8170 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
8171 			ret = 0;
8172 			goto out;
8173 		}
8174 		path->slots[0]++;
8175 	}
8176 out:
8177 	return ret;
8178 }
8179 
8180 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8181 {
8182 	struct btrfs_block_group_cache *block_group;
8183 	u64 last = 0;
8184 
8185 	while (1) {
8186 		struct inode *inode;
8187 
8188 		block_group = btrfs_lookup_first_block_group(info, last);
8189 		while (block_group) {
8190 			spin_lock(&block_group->lock);
8191 			if (block_group->iref)
8192 				break;
8193 			spin_unlock(&block_group->lock);
8194 			block_group = next_block_group(info->tree_root,
8195 						       block_group);
8196 		}
8197 		if (!block_group) {
8198 			if (last == 0)
8199 				break;
8200 			last = 0;
8201 			continue;
8202 		}
8203 
8204 		inode = block_group->inode;
8205 		block_group->iref = 0;
8206 		block_group->inode = NULL;
8207 		spin_unlock(&block_group->lock);
8208 		iput(inode);
8209 		last = block_group->key.objectid + block_group->key.offset;
8210 		btrfs_put_block_group(block_group);
8211 	}
8212 }
8213 
8214 int btrfs_free_block_groups(struct btrfs_fs_info *info)
8215 {
8216 	struct btrfs_block_group_cache *block_group;
8217 	struct btrfs_space_info *space_info;
8218 	struct btrfs_caching_control *caching_ctl;
8219 	struct rb_node *n;
8220 
8221 	down_write(&info->extent_commit_sem);
8222 	while (!list_empty(&info->caching_block_groups)) {
8223 		caching_ctl = list_entry(info->caching_block_groups.next,
8224 					 struct btrfs_caching_control, list);
8225 		list_del(&caching_ctl->list);
8226 		put_caching_control(caching_ctl);
8227 	}
8228 	up_write(&info->extent_commit_sem);
8229 
8230 	spin_lock(&info->block_group_cache_lock);
8231 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8232 		block_group = rb_entry(n, struct btrfs_block_group_cache,
8233 				       cache_node);
8234 		rb_erase(&block_group->cache_node,
8235 			 &info->block_group_cache_tree);
8236 		spin_unlock(&info->block_group_cache_lock);
8237 
8238 		down_write(&block_group->space_info->groups_sem);
8239 		list_del(&block_group->list);
8240 		up_write(&block_group->space_info->groups_sem);
8241 
8242 		if (block_group->cached == BTRFS_CACHE_STARTED)
8243 			wait_block_group_cache_done(block_group);
8244 
8245 		/*
8246 		 * We haven't cached this block group, which means we could
8247 		 * possibly have excluded extents on this block group.
8248 		 */
8249 		if (block_group->cached == BTRFS_CACHE_NO ||
8250 		    block_group->cached == BTRFS_CACHE_ERROR)
8251 			free_excluded_extents(info->extent_root, block_group);
8252 
8253 		btrfs_remove_free_space_cache(block_group);
8254 		btrfs_put_block_group(block_group);
8255 
8256 		spin_lock(&info->block_group_cache_lock);
8257 	}
8258 	spin_unlock(&info->block_group_cache_lock);
8259 
8260 	/* now that all the block groups are freed, go through and
8261 	 * free all the space_info structs.  This is only called during
8262 	 * the final stages of unmount, and so we know nobody is
8263 	 * using them.  We call synchronize_rcu() once before we start,
8264 	 * just to be on the safe side.
8265 	 */
8266 	synchronize_rcu();
8267 
8268 	release_global_block_rsv(info);
8269 
8270 	while(!list_empty(&info->space_info)) {
8271 		space_info = list_entry(info->space_info.next,
8272 					struct btrfs_space_info,
8273 					list);
8274 		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
8275 			if (space_info->bytes_pinned > 0 ||
8276 			    space_info->bytes_reserved > 0 ||
8277 			    space_info->bytes_may_use > 0) {
8278 				WARN_ON(1);
8279 				dump_space_info(space_info, 0, 0);
8280 			}
8281 		}
8282 		percpu_counter_destroy(&space_info->total_bytes_pinned);
8283 		list_del(&space_info->list);
8284 		kfree(space_info);
8285 	}
8286 	return 0;
8287 }
8288 
8289 static void __link_block_group(struct btrfs_space_info *space_info,
8290 			       struct btrfs_block_group_cache *cache)
8291 {
8292 	int index = get_block_group_index(cache);
8293 
8294 	down_write(&space_info->groups_sem);
8295 	list_add_tail(&cache->list, &space_info->block_groups[index]);
8296 	up_write(&space_info->groups_sem);
8297 }
8298 
8299 int btrfs_read_block_groups(struct btrfs_root *root)
8300 {
8301 	struct btrfs_path *path;
8302 	int ret;
8303 	struct btrfs_block_group_cache *cache;
8304 	struct btrfs_fs_info *info = root->fs_info;
8305 	struct btrfs_space_info *space_info;
8306 	struct btrfs_key key;
8307 	struct btrfs_key found_key;
8308 	struct extent_buffer *leaf;
8309 	int need_clear = 0;
8310 	u64 cache_gen;
8311 
8312 	root = info->extent_root;
8313 	key.objectid = 0;
8314 	key.offset = 0;
8315 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
8316 	path = btrfs_alloc_path();
8317 	if (!path)
8318 		return -ENOMEM;
8319 	path->reada = 1;
8320 
8321 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
8322 	if (btrfs_test_opt(root, SPACE_CACHE) &&
8323 	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
8324 		need_clear = 1;
8325 	if (btrfs_test_opt(root, CLEAR_CACHE))
8326 		need_clear = 1;
8327 
8328 	while (1) {
8329 		ret = find_first_block_group(root, path, &key);
8330 		if (ret > 0)
8331 			break;
8332 		if (ret != 0)
8333 			goto error;
8334 		leaf = path->nodes[0];
8335 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8336 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
8337 		if (!cache) {
8338 			ret = -ENOMEM;
8339 			goto error;
8340 		}
8341 		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8342 						GFP_NOFS);
8343 		if (!cache->free_space_ctl) {
8344 			kfree(cache);
8345 			ret = -ENOMEM;
8346 			goto error;
8347 		}
8348 
8349 		atomic_set(&cache->count, 1);
8350 		spin_lock_init(&cache->lock);
8351 		cache->fs_info = info;
8352 		INIT_LIST_HEAD(&cache->list);
8353 		INIT_LIST_HEAD(&cache->cluster_list);
8354 
8355 		if (need_clear) {
8356 			/*
8357 			 * When we mount with old space cache, we need to
8358 			 * set BTRFS_DC_CLEAR and set dirty flag.
8359 			 *
8360 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
8361 			 *    truncate the old free space cache inode and
8362 			 *    setup a new one.
8363 			 * b) Setting 'dirty flag' makes sure that we flush
8364 			 *    the new space cache info onto disk.
8365 			 */
8366 			cache->disk_cache_state = BTRFS_DC_CLEAR;
8367 			if (btrfs_test_opt(root, SPACE_CACHE))
8368 				cache->dirty = 1;
8369 		}
8370 
8371 		read_extent_buffer(leaf, &cache->item,
8372 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
8373 				   sizeof(cache->item));
8374 		memcpy(&cache->key, &found_key, sizeof(found_key));
8375 
8376 		key.objectid = found_key.objectid + found_key.offset;
8377 		btrfs_release_path(path);
8378 		cache->flags = btrfs_block_group_flags(&cache->item);
8379 		cache->sectorsize = root->sectorsize;
8380 		cache->full_stripe_len = btrfs_full_stripe_len(root,
8381 					       &root->fs_info->mapping_tree,
8382 					       found_key.objectid);
8383 		btrfs_init_free_space_ctl(cache);
8384 
8385 		/*
8386 		 * We need to exclude the super stripes now so that the space
8387 		 * info has super bytes accounted for, otherwise we'll think
8388 		 * we have more space than we actually do.
8389 		 */
8390 		ret = exclude_super_stripes(root, cache);
8391 		if (ret) {
8392 			/*
8393 			 * We may have excluded something, so call this just in
8394 			 * case.
8395 			 */
8396 			free_excluded_extents(root, cache);
8397 			kfree(cache->free_space_ctl);
8398 			kfree(cache);
8399 			goto error;
8400 		}
8401 
8402 		/*
8403 		 * check for two cases, either we are full, and therefore
8404 		 * don't need to bother with the caching work since we won't
8405 		 * find any space, or we are empty, and we can just add all
8406 		 * the space in and be done with it.  This saves us _alot_ of
8407 		 * time, particularly in the full case.
8408 		 */
8409 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8410 			cache->last_byte_to_unpin = (u64)-1;
8411 			cache->cached = BTRFS_CACHE_FINISHED;
8412 			free_excluded_extents(root, cache);
8413 		} else if (btrfs_block_group_used(&cache->item) == 0) {
8414 			cache->last_byte_to_unpin = (u64)-1;
8415 			cache->cached = BTRFS_CACHE_FINISHED;
8416 			add_new_free_space(cache, root->fs_info,
8417 					   found_key.objectid,
8418 					   found_key.objectid +
8419 					   found_key.offset);
8420 			free_excluded_extents(root, cache);
8421 		}
8422 
8423 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
8424 		if (ret) {
8425 			btrfs_remove_free_space_cache(cache);
8426 			btrfs_put_block_group(cache);
8427 			goto error;
8428 		}
8429 
8430 		ret = update_space_info(info, cache->flags, found_key.offset,
8431 					btrfs_block_group_used(&cache->item),
8432 					&space_info);
8433 		if (ret) {
8434 			btrfs_remove_free_space_cache(cache);
8435 			spin_lock(&info->block_group_cache_lock);
8436 			rb_erase(&cache->cache_node,
8437 				 &info->block_group_cache_tree);
8438 			spin_unlock(&info->block_group_cache_lock);
8439 			btrfs_put_block_group(cache);
8440 			goto error;
8441 		}
8442 
8443 		cache->space_info = space_info;
8444 		spin_lock(&cache->space_info->lock);
8445 		cache->space_info->bytes_readonly += cache->bytes_super;
8446 		spin_unlock(&cache->space_info->lock);
8447 
8448 		__link_block_group(space_info, cache);
8449 
8450 		set_avail_alloc_bits(root->fs_info, cache->flags);
8451 		if (btrfs_chunk_readonly(root, cache->key.objectid))
8452 			set_block_group_ro(cache, 1);
8453 	}
8454 
8455 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8456 		if (!(get_alloc_profile(root, space_info->flags) &
8457 		      (BTRFS_BLOCK_GROUP_RAID10 |
8458 		       BTRFS_BLOCK_GROUP_RAID1 |
8459 		       BTRFS_BLOCK_GROUP_RAID5 |
8460 		       BTRFS_BLOCK_GROUP_RAID6 |
8461 		       BTRFS_BLOCK_GROUP_DUP)))
8462 			continue;
8463 		/*
8464 		 * avoid allocating from un-mirrored block group if there are
8465 		 * mirrored block groups.
8466 		 */
8467 		list_for_each_entry(cache,
8468 				&space_info->block_groups[BTRFS_RAID_RAID0],
8469 				list)
8470 			set_block_group_ro(cache, 1);
8471 		list_for_each_entry(cache,
8472 				&space_info->block_groups[BTRFS_RAID_SINGLE],
8473 				list)
8474 			set_block_group_ro(cache, 1);
8475 	}
8476 
8477 	init_global_block_rsv(info);
8478 	ret = 0;
8479 error:
8480 	btrfs_free_path(path);
8481 	return ret;
8482 }
8483 
8484 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8485 				       struct btrfs_root *root)
8486 {
8487 	struct btrfs_block_group_cache *block_group, *tmp;
8488 	struct btrfs_root *extent_root = root->fs_info->extent_root;
8489 	struct btrfs_block_group_item item;
8490 	struct btrfs_key key;
8491 	int ret = 0;
8492 
8493 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
8494 				 new_bg_list) {
8495 		list_del_init(&block_group->new_bg_list);
8496 
8497 		if (ret)
8498 			continue;
8499 
8500 		spin_lock(&block_group->lock);
8501 		memcpy(&item, &block_group->item, sizeof(item));
8502 		memcpy(&key, &block_group->key, sizeof(key));
8503 		spin_unlock(&block_group->lock);
8504 
8505 		ret = btrfs_insert_item(trans, extent_root, &key, &item,
8506 					sizeof(item));
8507 		if (ret)
8508 			btrfs_abort_transaction(trans, extent_root, ret);
8509 		ret = btrfs_finish_chunk_alloc(trans, extent_root,
8510 					       key.objectid, key.offset);
8511 		if (ret)
8512 			btrfs_abort_transaction(trans, extent_root, ret);
8513 	}
8514 }
8515 
8516 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8517 			   struct btrfs_root *root, u64 bytes_used,
8518 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
8519 			   u64 size)
8520 {
8521 	int ret;
8522 	struct btrfs_root *extent_root;
8523 	struct btrfs_block_group_cache *cache;
8524 
8525 	extent_root = root->fs_info->extent_root;
8526 
8527 	root->fs_info->last_trans_log_full_commit = trans->transid;
8528 
8529 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
8530 	if (!cache)
8531 		return -ENOMEM;
8532 	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8533 					GFP_NOFS);
8534 	if (!cache->free_space_ctl) {
8535 		kfree(cache);
8536 		return -ENOMEM;
8537 	}
8538 
8539 	cache->key.objectid = chunk_offset;
8540 	cache->key.offset = size;
8541 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8542 	cache->sectorsize = root->sectorsize;
8543 	cache->fs_info = root->fs_info;
8544 	cache->full_stripe_len = btrfs_full_stripe_len(root,
8545 					       &root->fs_info->mapping_tree,
8546 					       chunk_offset);
8547 
8548 	atomic_set(&cache->count, 1);
8549 	spin_lock_init(&cache->lock);
8550 	INIT_LIST_HEAD(&cache->list);
8551 	INIT_LIST_HEAD(&cache->cluster_list);
8552 	INIT_LIST_HEAD(&cache->new_bg_list);
8553 
8554 	btrfs_init_free_space_ctl(cache);
8555 
8556 	btrfs_set_block_group_used(&cache->item, bytes_used);
8557 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8558 	cache->flags = type;
8559 	btrfs_set_block_group_flags(&cache->item, type);
8560 
8561 	cache->last_byte_to_unpin = (u64)-1;
8562 	cache->cached = BTRFS_CACHE_FINISHED;
8563 	ret = exclude_super_stripes(root, cache);
8564 	if (ret) {
8565 		/*
8566 		 * We may have excluded something, so call this just in
8567 		 * case.
8568 		 */
8569 		free_excluded_extents(root, cache);
8570 		kfree(cache->free_space_ctl);
8571 		kfree(cache);
8572 		return ret;
8573 	}
8574 
8575 	add_new_free_space(cache, root->fs_info, chunk_offset,
8576 			   chunk_offset + size);
8577 
8578 	free_excluded_extents(root, cache);
8579 
8580 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
8581 	if (ret) {
8582 		btrfs_remove_free_space_cache(cache);
8583 		btrfs_put_block_group(cache);
8584 		return ret;
8585 	}
8586 
8587 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8588 				&cache->space_info);
8589 	if (ret) {
8590 		btrfs_remove_free_space_cache(cache);
8591 		spin_lock(&root->fs_info->block_group_cache_lock);
8592 		rb_erase(&cache->cache_node,
8593 			 &root->fs_info->block_group_cache_tree);
8594 		spin_unlock(&root->fs_info->block_group_cache_lock);
8595 		btrfs_put_block_group(cache);
8596 		return ret;
8597 	}
8598 	update_global_block_rsv(root->fs_info);
8599 
8600 	spin_lock(&cache->space_info->lock);
8601 	cache->space_info->bytes_readonly += cache->bytes_super;
8602 	spin_unlock(&cache->space_info->lock);
8603 
8604 	__link_block_group(cache->space_info, cache);
8605 
8606 	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
8607 
8608 	set_avail_alloc_bits(extent_root->fs_info, type);
8609 
8610 	return 0;
8611 }
8612 
8613 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
8614 {
8615 	u64 extra_flags = chunk_to_extended(flags) &
8616 				BTRFS_EXTENDED_PROFILE_MASK;
8617 
8618 	write_seqlock(&fs_info->profiles_lock);
8619 	if (flags & BTRFS_BLOCK_GROUP_DATA)
8620 		fs_info->avail_data_alloc_bits &= ~extra_flags;
8621 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
8622 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8623 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8624 		fs_info->avail_system_alloc_bits &= ~extra_flags;
8625 	write_sequnlock(&fs_info->profiles_lock);
8626 }
8627 
8628 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8629 			     struct btrfs_root *root, u64 group_start)
8630 {
8631 	struct btrfs_path *path;
8632 	struct btrfs_block_group_cache *block_group;
8633 	struct btrfs_free_cluster *cluster;
8634 	struct btrfs_root *tree_root = root->fs_info->tree_root;
8635 	struct btrfs_key key;
8636 	struct inode *inode;
8637 	int ret;
8638 	int index;
8639 	int factor;
8640 
8641 	root = root->fs_info->extent_root;
8642 
8643 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8644 	BUG_ON(!block_group);
8645 	BUG_ON(!block_group->ro);
8646 
8647 	/*
8648 	 * Free the reserved super bytes from this block group before
8649 	 * remove it.
8650 	 */
8651 	free_excluded_extents(root, block_group);
8652 
8653 	memcpy(&key, &block_group->key, sizeof(key));
8654 	index = get_block_group_index(block_group);
8655 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8656 				  BTRFS_BLOCK_GROUP_RAID1 |
8657 				  BTRFS_BLOCK_GROUP_RAID10))
8658 		factor = 2;
8659 	else
8660 		factor = 1;
8661 
8662 	/* make sure this block group isn't part of an allocation cluster */
8663 	cluster = &root->fs_info->data_alloc_cluster;
8664 	spin_lock(&cluster->refill_lock);
8665 	btrfs_return_cluster_to_free_space(block_group, cluster);
8666 	spin_unlock(&cluster->refill_lock);
8667 
8668 	/*
8669 	 * make sure this block group isn't part of a metadata
8670 	 * allocation cluster
8671 	 */
8672 	cluster = &root->fs_info->meta_alloc_cluster;
8673 	spin_lock(&cluster->refill_lock);
8674 	btrfs_return_cluster_to_free_space(block_group, cluster);
8675 	spin_unlock(&cluster->refill_lock);
8676 
8677 	path = btrfs_alloc_path();
8678 	if (!path) {
8679 		ret = -ENOMEM;
8680 		goto out;
8681 	}
8682 
8683 	inode = lookup_free_space_inode(tree_root, block_group, path);
8684 	if (!IS_ERR(inode)) {
8685 		ret = btrfs_orphan_add(trans, inode);
8686 		if (ret) {
8687 			btrfs_add_delayed_iput(inode);
8688 			goto out;
8689 		}
8690 		clear_nlink(inode);
8691 		/* One for the block groups ref */
8692 		spin_lock(&block_group->lock);
8693 		if (block_group->iref) {
8694 			block_group->iref = 0;
8695 			block_group->inode = NULL;
8696 			spin_unlock(&block_group->lock);
8697 			iput(inode);
8698 		} else {
8699 			spin_unlock(&block_group->lock);
8700 		}
8701 		/* One for our lookup ref */
8702 		btrfs_add_delayed_iput(inode);
8703 	}
8704 
8705 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8706 	key.offset = block_group->key.objectid;
8707 	key.type = 0;
8708 
8709 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8710 	if (ret < 0)
8711 		goto out;
8712 	if (ret > 0)
8713 		btrfs_release_path(path);
8714 	if (ret == 0) {
8715 		ret = btrfs_del_item(trans, tree_root, path);
8716 		if (ret)
8717 			goto out;
8718 		btrfs_release_path(path);
8719 	}
8720 
8721 	spin_lock(&root->fs_info->block_group_cache_lock);
8722 	rb_erase(&block_group->cache_node,
8723 		 &root->fs_info->block_group_cache_tree);
8724 
8725 	if (root->fs_info->first_logical_byte == block_group->key.objectid)
8726 		root->fs_info->first_logical_byte = (u64)-1;
8727 	spin_unlock(&root->fs_info->block_group_cache_lock);
8728 
8729 	down_write(&block_group->space_info->groups_sem);
8730 	/*
8731 	 * we must use list_del_init so people can check to see if they
8732 	 * are still on the list after taking the semaphore
8733 	 */
8734 	list_del_init(&block_group->list);
8735 	if (list_empty(&block_group->space_info->block_groups[index]))
8736 		clear_avail_alloc_bits(root->fs_info, block_group->flags);
8737 	up_write(&block_group->space_info->groups_sem);
8738 
8739 	if (block_group->cached == BTRFS_CACHE_STARTED)
8740 		wait_block_group_cache_done(block_group);
8741 
8742 	btrfs_remove_free_space_cache(block_group);
8743 
8744 	spin_lock(&block_group->space_info->lock);
8745 	block_group->space_info->total_bytes -= block_group->key.offset;
8746 	block_group->space_info->bytes_readonly -= block_group->key.offset;
8747 	block_group->space_info->disk_total -= block_group->key.offset * factor;
8748 	spin_unlock(&block_group->space_info->lock);
8749 
8750 	memcpy(&key, &block_group->key, sizeof(key));
8751 
8752 	btrfs_clear_space_info_full(root->fs_info);
8753 
8754 	btrfs_put_block_group(block_group);
8755 	btrfs_put_block_group(block_group);
8756 
8757 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8758 	if (ret > 0)
8759 		ret = -EIO;
8760 	if (ret < 0)
8761 		goto out;
8762 
8763 	ret = btrfs_del_item(trans, root, path);
8764 out:
8765 	btrfs_free_path(path);
8766 	return ret;
8767 }
8768 
8769 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8770 {
8771 	struct btrfs_space_info *space_info;
8772 	struct btrfs_super_block *disk_super;
8773 	u64 features;
8774 	u64 flags;
8775 	int mixed = 0;
8776 	int ret;
8777 
8778 	disk_super = fs_info->super_copy;
8779 	if (!btrfs_super_root(disk_super))
8780 		return 1;
8781 
8782 	features = btrfs_super_incompat_flags(disk_super);
8783 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8784 		mixed = 1;
8785 
8786 	flags = BTRFS_BLOCK_GROUP_SYSTEM;
8787 	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8788 	if (ret)
8789 		goto out;
8790 
8791 	if (mixed) {
8792 		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8793 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8794 	} else {
8795 		flags = BTRFS_BLOCK_GROUP_METADATA;
8796 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8797 		if (ret)
8798 			goto out;
8799 
8800 		flags = BTRFS_BLOCK_GROUP_DATA;
8801 		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8802 	}
8803 out:
8804 	return ret;
8805 }
8806 
8807 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8808 {
8809 	return unpin_extent_range(root, start, end);
8810 }
8811 
8812 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8813 			       u64 num_bytes, u64 *actual_bytes)
8814 {
8815 	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8816 }
8817 
8818 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8819 {
8820 	struct btrfs_fs_info *fs_info = root->fs_info;
8821 	struct btrfs_block_group_cache *cache = NULL;
8822 	u64 group_trimmed;
8823 	u64 start;
8824 	u64 end;
8825 	u64 trimmed = 0;
8826 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8827 	int ret = 0;
8828 
8829 	/*
8830 	 * try to trim all FS space, our block group may start from non-zero.
8831 	 */
8832 	if (range->len == total_bytes)
8833 		cache = btrfs_lookup_first_block_group(fs_info, range->start);
8834 	else
8835 		cache = btrfs_lookup_block_group(fs_info, range->start);
8836 
8837 	while (cache) {
8838 		if (cache->key.objectid >= (range->start + range->len)) {
8839 			btrfs_put_block_group(cache);
8840 			break;
8841 		}
8842 
8843 		start = max(range->start, cache->key.objectid);
8844 		end = min(range->start + range->len,
8845 				cache->key.objectid + cache->key.offset);
8846 
8847 		if (end - start >= range->minlen) {
8848 			if (!block_group_cache_done(cache)) {
8849 				ret = cache_block_group(cache, 0);
8850 				if (ret) {
8851 					btrfs_put_block_group(cache);
8852 					break;
8853 				}
8854 				ret = wait_block_group_cache_done(cache);
8855 				if (ret) {
8856 					btrfs_put_block_group(cache);
8857 					break;
8858 				}
8859 			}
8860 			ret = btrfs_trim_block_group(cache,
8861 						     &group_trimmed,
8862 						     start,
8863 						     end,
8864 						     range->minlen);
8865 
8866 			trimmed += group_trimmed;
8867 			if (ret) {
8868 				btrfs_put_block_group(cache);
8869 				break;
8870 			}
8871 		}
8872 
8873 		cache = next_block_group(fs_info->tree_root, cache);
8874 	}
8875 
8876 	range->len = trimmed;
8877 	return ret;
8878 }
8879