1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/sched.h>
7  #include <linux/sched/signal.h>
8  #include <linux/pagemap.h>
9  #include <linux/writeback.h>
10  #include <linux/blkdev.h>
11  #include <linux/sort.h>
12  #include <linux/rcupdate.h>
13  #include <linux/kthread.h>
14  #include <linux/slab.h>
15  #include <linux/ratelimit.h>
16  #include <linux/percpu_counter.h>
17  #include <linux/lockdep.h>
18  #include <linux/crc32c.h>
19  #include "ctree.h"
20  #include "extent-tree.h"
21  #include "tree-log.h"
22  #include "disk-io.h"
23  #include "print-tree.h"
24  #include "volumes.h"
25  #include "raid56.h"
26  #include "locking.h"
27  #include "free-space-cache.h"
28  #include "free-space-tree.h"
29  #include "sysfs.h"
30  #include "qgroup.h"
31  #include "ref-verify.h"
32  #include "space-info.h"
33  #include "block-rsv.h"
34  #include "delalloc-space.h"
35  #include "discard.h"
36  #include "rcu-string.h"
37  #include "zoned.h"
38  #include "dev-replace.h"
39  #include "fs.h"
40  #include "accessors.h"
41  #include "root-tree.h"
42  #include "file-item.h"
43  #include "orphan.h"
44  #include "tree-checker.h"
45  
46  #undef SCRAMBLE_DELAYED_REFS
47  
48  
49  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
50  			       struct btrfs_delayed_ref_node *node, u64 parent,
51  			       u64 root_objectid, u64 owner_objectid,
52  			       u64 owner_offset, int refs_to_drop,
53  			       struct btrfs_delayed_extent_op *extra_op);
54  static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
55  				    struct extent_buffer *leaf,
56  				    struct btrfs_extent_item *ei);
57  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
58  				      u64 parent, u64 root_objectid,
59  				      u64 flags, u64 owner, u64 offset,
60  				      struct btrfs_key *ins, int ref_mod);
61  static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
62  				     struct btrfs_delayed_ref_node *node,
63  				     struct btrfs_delayed_extent_op *extent_op);
64  static int find_next_key(struct btrfs_path *path, int level,
65  			 struct btrfs_key *key);
66  
67  static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
68  {
69  	return (cache->flags & bits) == bits;
70  }
71  
72  /* simple helper to search for an existing data extent at a given offset */
73  int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
74  {
75  	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
76  	int ret;
77  	struct btrfs_key key;
78  	struct btrfs_path *path;
79  
80  	path = btrfs_alloc_path();
81  	if (!path)
82  		return -ENOMEM;
83  
84  	key.objectid = start;
85  	key.offset = len;
86  	key.type = BTRFS_EXTENT_ITEM_KEY;
87  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
88  	btrfs_free_path(path);
89  	return ret;
90  }
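
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * btrfs_lookup_data_extent() simply forwards the return value of
 * btrfs_search_slot(), so 0 means an EXTENT_ITEM with the exact (start, len)
 * key exists, > 0 means it does not, and < 0 is an error.  The helper name
 * below is hypothetical.
 */
static int example_data_extent_exists(struct btrfs_fs_info *fs_info,
				      u64 start, u64 len)
{
	int ret;

	ret = btrfs_lookup_data_extent(fs_info, start, len);
	if (ret < 0)
		return ret;		/* -ENOMEM or a search error */
	return ret == 0;		/* 1 if the extent item was found */
}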
91  
92  /*
93   * Helper function to look up the reference count and flags of a tree block.
94   *
95   * The head node for a delayed ref is used to store the sum of all the
96   * reference count modifications queued up in the rbtree. The head
97   * node may also store the extent flags to set. This way you can check
98   * to see what the reference count and extent flags will be once all of
99   * the queued delayed refs are processed.
100   */
101  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
102  			     struct btrfs_fs_info *fs_info, u64 bytenr,
103  			     u64 offset, int metadata, u64 *refs, u64 *flags)
104  {
105  	struct btrfs_root *extent_root;
106  	struct btrfs_delayed_ref_head *head;
107  	struct btrfs_delayed_ref_root *delayed_refs;
108  	struct btrfs_path *path;
109  	struct btrfs_extent_item *ei;
110  	struct extent_buffer *leaf;
111  	struct btrfs_key key;
112  	u32 item_size;
113  	u64 num_refs;
114  	u64 extent_flags;
115  	int ret;
116  
117  	/*
118  	 * If we don't have skinny metadata, don't bother doing anything
119  	 * different
120  	 */
121  	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
122  		offset = fs_info->nodesize;
123  		metadata = 0;
124  	}
125  
126  	path = btrfs_alloc_path();
127  	if (!path)
128  		return -ENOMEM;
129  
130  	if (!trans) {
131  		path->skip_locking = 1;
132  		path->search_commit_root = 1;
133  	}
134  
135  search_again:
136  	key.objectid = bytenr;
137  	key.offset = offset;
138  	if (metadata)
139  		key.type = BTRFS_METADATA_ITEM_KEY;
140  	else
141  		key.type = BTRFS_EXTENT_ITEM_KEY;
142  
143  	extent_root = btrfs_extent_root(fs_info, bytenr);
144  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
145  	if (ret < 0)
146  		goto out_free;
147  
148  	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
149  		if (path->slots[0]) {
150  			path->slots[0]--;
151  			btrfs_item_key_to_cpu(path->nodes[0], &key,
152  					      path->slots[0]);
153  			if (key.objectid == bytenr &&
154  			    key.type == BTRFS_EXTENT_ITEM_KEY &&
155  			    key.offset == fs_info->nodesize)
156  				ret = 0;
157  		}
158  	}
159  
160  	if (ret == 0) {
161  		leaf = path->nodes[0];
162  		item_size = btrfs_item_size(leaf, path->slots[0]);
163  		if (item_size >= sizeof(*ei)) {
164  			ei = btrfs_item_ptr(leaf, path->slots[0],
165  					    struct btrfs_extent_item);
166  			num_refs = btrfs_extent_refs(leaf, ei);
167  			extent_flags = btrfs_extent_flags(leaf, ei);
168  		} else {
169  			ret = -EUCLEAN;
170  			btrfs_err(fs_info,
171  			"unexpected extent item size, has %u expect >= %zu",
172  				  item_size, sizeof(*ei));
173  			if (trans)
174  				btrfs_abort_transaction(trans, ret);
175  			else
176  				btrfs_handle_fs_error(fs_info, ret, NULL);
177  
178  			goto out_free;
179  		}
180  
181  		BUG_ON(num_refs == 0);
182  	} else {
183  		num_refs = 0;
184  		extent_flags = 0;
185  		ret = 0;
186  	}
187  
188  	if (!trans)
189  		goto out;
190  
191  	delayed_refs = &trans->transaction->delayed_refs;
192  	spin_lock(&delayed_refs->lock);
193  	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
194  	if (head) {
195  		if (!mutex_trylock(&head->mutex)) {
196  			refcount_inc(&head->refs);
197  			spin_unlock(&delayed_refs->lock);
198  
199  			btrfs_release_path(path);
200  
201  			/*
202  			 * Mutex was contended, block until it's released and try
203  			 * again
204  			 */
205  			mutex_lock(&head->mutex);
206  			mutex_unlock(&head->mutex);
207  			btrfs_put_delayed_ref_head(head);
208  			goto search_again;
209  		}
210  		spin_lock(&head->lock);
211  		if (head->extent_op && head->extent_op->update_flags)
212  			extent_flags |= head->extent_op->flags_to_set;
213  		else
214  			BUG_ON(num_refs == 0);
215  
216  		num_refs += head->ref_mod;
217  		spin_unlock(&head->lock);
218  		mutex_unlock(&head->mutex);
219  	}
220  	spin_unlock(&delayed_refs->lock);
221  out:
222  	WARN_ON(num_refs == 0);
223  	if (refs)
224  		*refs = num_refs;
225  	if (flags)
226  		*flags = extent_flags;
227  out_free:
228  	btrfs_free_path(path);
229  	return ret;
230  }
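
/*
 * Illustrative sketch (editorial addition): checking whether a tree block is
 * shared by reading its reference count with btrfs_lookup_extent_info().
 * Because the delayed ref head's ref_mod is folded in above, an on-disk count
 * of 3 with a queued ref_mod of -1 is reported as 2.  The helper name is
 * hypothetical; @eb is assumed to be a tree block at the given @level.
 */
static int example_tree_block_is_shared(struct btrfs_trans_handle *trans,
					struct btrfs_fs_info *fs_info,
					struct extent_buffer *eb, int level)
{
	u64 refs = 0;
	u64 flags = 0;
	int ret;

	ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1,
				       &refs, &flags);
	if (ret < 0)
		return ret;
	return refs > 1;
}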
231  
232  /*
233   * Back reference rules.  Back refs have three main goals:
234   *
235   * 1) differentiate between all holders of references to an extent so that
236   *    when a reference is dropped we can make sure it was a valid reference
237   *    before freeing the extent.
238   *
239   * 2) Provide enough information to quickly find the holders of an extent
240   *    if we notice a given block is corrupted or bad.
241   *
242   * 3) Make it easy to migrate blocks for FS shrinking or storage pool
243   *    maintenance.  This is actually the same as #2, but with a slightly
244   *    different use case.
245   *
246   * There are two kinds of back refs. The implicit back refs is optimized
247   * for pointers in non-shared tree blocks. For a given pointer in a block,
248   * back refs of this kind provide information about the block's owner tree
249   * and the pointer's key. This information allows us to find the block by
250   * b-tree searching. The full back refs is for pointers in tree blocks not
251   * referenced by their owner trees. The location of tree block is recorded
252   * in the back refs. Actually the full back refs is generic, and can be
253   * used in all cases the implicit back refs is used. The major shortcoming
254   * of the full back refs is its overhead. Every time a tree block gets
255   * COWed, we have to update back refs entry for all pointers in it.
256   *
257   * For a newly allocated tree block, we use implicit back refs for
258   * pointers in it. This means most tree related operations only involve
259   * implicit back refs. For a tree block created in old transaction, the
260   * only way to drop a reference to it is COW it. So we can detect the
261   * event that tree block loses its owner tree's reference and do the
262   * back refs conversion.
263   *
264   * When a tree block is COWed through a tree, there are four cases:
265   *
266   * The reference count of the block is one and the tree is the block's
267   * owner tree. Nothing to do in this case.
268   *
269   * The reference count of the block is one and the tree is not the
270   * block's owner tree. In this case, full back refs is used for pointers
271   * in the block. Remove these full back refs, add implicit back refs for
272   * every pointer in the new block.
273   *
274   * The reference count of the block is greater than one and the tree is
275   * the block's owner tree. In this case, implicit back refs is used for
276   * pointers in the block. Add full back refs for every pointer in the
277   * block, increase lower level extents' reference counts. The original
278   * implicit back refs are carried over to the new block.
279   *
280   * The reference count of the block is greater than one and the tree is
281   * not the block's owner tree. Add implicit back refs for every pointer in
282   * the new block, increase lower level extents' reference count.
283   *
284   * Back Reference Key composing:
285   *
286   * The key objectid corresponds to the first byte in the extent,
287   * The key type is used to differentiate between types of back refs.
288   * There are different meanings of the key offset for different types
289   * of back refs.
290   *
291   * File extents can be referenced by:
292   *
293   * - multiple snapshots, subvolumes, or different generations in one subvol
294   * - different files inside a single subvolume
295   * - different offsets inside a file (bookend extents in file.c)
296   *
297   * The extent ref structure for the implicit back refs has fields for:
298   *
299   * - Objectid of the subvolume root
300   * - objectid of the file holding the reference
301   * - original offset in the file
302   * - how many bookend extents
303   *
304   * The key offset for the implicit back refs is the hash of the first
305   * three fields.
306   *
307   * The extent ref structure for the full back refs has a field for:
308   *
309   * - number of pointers in the tree leaf
310   *
311   * The key offset for the full back refs is the first byte of
312   * the tree leaf
313   *
314   * When a file extent is allocated, the implicit back refs is used.
315   * The fields are filled in:
316   *
317   *     (root_key.objectid, inode objectid, offset in file, 1)
318   *
319   * When a file extent is removed by file truncation, we find the
320   * corresponding implicit back refs and check the following fields:
321   *
322   *     (btrfs_header_owner(leaf), inode objectid, offset in file)
323   *
324   * Btree extents can be referenced by:
325   *
326   * - Different subvolumes
327   *
328   * Both the implicit back refs and the full back refs for tree blocks
329   * only consist of key. The key offset for the implicit back refs is
330   * objectid of block's owner tree. The key offset for the full back refs
331   * is the first byte of parent block.
332   *
333   * When implicit back refs is used, information about the lowest key and
334   * level of the tree block is required. This information is stored in
335   * the tree block info structure.
336   */
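
/*
 * Worked example (editorial addition): key composition for the back refs
 * described above.  Assuming a data extent at bytenr E referenced by
 * subvolume 5, inode 257, file offset 0, and a tree block at bytenr B:
 *
 *   implicit data back ref:   (E, BTRFS_EXTENT_DATA_REF_KEY,
 *                              hash_extent_data_ref(5, 257, 0))
 *   full (shared) data ref:   (E, BTRFS_SHARED_DATA_REF_KEY,
 *                              bytenr of the parent leaf)
 *   implicit tree block ref:  (B, BTRFS_TREE_BLOCK_REF_KEY, 5)
 *   full (shared) block ref:  (B, BTRFS_SHARED_BLOCK_REF_KEY,
 *                              bytenr of the parent node)
 */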
337  
338  /*
339   * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
340   * is_data == BTRFS_REF_TYPE_DATA, data type is required,
341   * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
342   */
343  int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
344  				     struct btrfs_extent_inline_ref *iref,
345  				     enum btrfs_inline_ref_type is_data)
346  {
347  	int type = btrfs_extent_inline_ref_type(eb, iref);
348  	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
349  
350  	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
351  	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
352  	    type == BTRFS_SHARED_DATA_REF_KEY ||
353  	    type == BTRFS_EXTENT_DATA_REF_KEY) {
354  		if (is_data == BTRFS_REF_TYPE_BLOCK) {
355  			if (type == BTRFS_TREE_BLOCK_REF_KEY)
356  				return type;
357  			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
358  				ASSERT(eb->fs_info);
359  				/*
360  				 * Every shared one has parent tree block,
361  				 * which must be aligned to sector size.
362  				 */
363  				if (offset &&
364  				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
365  					return type;
366  			}
367  		} else if (is_data == BTRFS_REF_TYPE_DATA) {
368  			if (type == BTRFS_EXTENT_DATA_REF_KEY)
369  				return type;
370  			if (type == BTRFS_SHARED_DATA_REF_KEY) {
371  				ASSERT(eb->fs_info);
372  				/*
373  				 * Every shared one has parent tree block,
374  				 * which must be aligned to sector size.
375  				 */
376  				if (offset &&
377  				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
378  					return type;
379  			}
380  		} else {
381  			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
382  			return type;
383  		}
384  	}
385  
386  	WARN_ON(1);
387  	btrfs_print_leaf(eb);
388  	btrfs_err(eb->fs_info,
389  		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
390  		  eb->start, (unsigned long)iref, type);
391  
392  	return BTRFS_REF_TYPE_INVALID;
393  }
394  
395  u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
396  {
397  	u32 high_crc = ~(u32)0;
398  	u32 low_crc = ~(u32)0;
399  	__le64 lenum;
400  
401  	lenum = cpu_to_le64(root_objectid);
402  	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
403  	lenum = cpu_to_le64(owner);
404  	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
405  	lenum = cpu_to_le64(offset);
406  	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
407  
408  	return ((u64)high_crc << 31) ^ (u64)low_crc;
409  }
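
/*
 * Illustrative sketch (editorial addition): the hash above becomes the key
 * offset of an implicit data back ref item, which is exactly how
 * lookup_extent_data_ref() and insert_extent_data_ref() below build their
 * search keys.  The helper name is hypothetical.
 */
static inline void example_data_ref_key(struct btrfs_key *key, u64 bytenr,
					u64 root_objectid, u64 owner,
					u64 offset)
{
	key->objectid = bytenr;
	key->type = BTRFS_EXTENT_DATA_REF_KEY;
	key->offset = hash_extent_data_ref(root_objectid, owner, offset);
}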
410  
411  static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
412  				     struct btrfs_extent_data_ref *ref)
413  {
414  	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
415  				    btrfs_extent_data_ref_objectid(leaf, ref),
416  				    btrfs_extent_data_ref_offset(leaf, ref));
417  }
418  
419  static int match_extent_data_ref(struct extent_buffer *leaf,
420  				 struct btrfs_extent_data_ref *ref,
421  				 u64 root_objectid, u64 owner, u64 offset)
422  {
423  	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
424  	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
425  	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
426  		return 0;
427  	return 1;
428  }
429  
430  static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
431  					   struct btrfs_path *path,
432  					   u64 bytenr, u64 parent,
433  					   u64 root_objectid,
434  					   u64 owner, u64 offset)
435  {
436  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
437  	struct btrfs_key key;
438  	struct btrfs_extent_data_ref *ref;
439  	struct extent_buffer *leaf;
440  	u32 nritems;
441  	int ret;
442  	int recow;
443  	int err = -ENOENT;
444  
445  	key.objectid = bytenr;
446  	if (parent) {
447  		key.type = BTRFS_SHARED_DATA_REF_KEY;
448  		key.offset = parent;
449  	} else {
450  		key.type = BTRFS_EXTENT_DATA_REF_KEY;
451  		key.offset = hash_extent_data_ref(root_objectid,
452  						  owner, offset);
453  	}
454  again:
455  	recow = 0;
456  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
457  	if (ret < 0) {
458  		err = ret;
459  		goto fail;
460  	}
461  
462  	if (parent) {
463  		if (!ret)
464  			return 0;
465  		goto fail;
466  	}
467  
468  	leaf = path->nodes[0];
469  	nritems = btrfs_header_nritems(leaf);
470  	while (1) {
471  		if (path->slots[0] >= nritems) {
472  			ret = btrfs_next_leaf(root, path);
473  			if (ret < 0)
474  				err = ret;
475  			if (ret)
476  				goto fail;
477  
478  			leaf = path->nodes[0];
479  			nritems = btrfs_header_nritems(leaf);
480  			recow = 1;
481  		}
482  
483  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
484  		if (key.objectid != bytenr ||
485  		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
486  			goto fail;
487  
488  		ref = btrfs_item_ptr(leaf, path->slots[0],
489  				     struct btrfs_extent_data_ref);
490  
491  		if (match_extent_data_ref(leaf, ref, root_objectid,
492  					  owner, offset)) {
493  			if (recow) {
494  				btrfs_release_path(path);
495  				goto again;
496  			}
497  			err = 0;
498  			break;
499  		}
500  		path->slots[0]++;
501  	}
502  fail:
503  	return err;
504  }
505  
506  static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
507  					   struct btrfs_path *path,
508  					   u64 bytenr, u64 parent,
509  					   u64 root_objectid, u64 owner,
510  					   u64 offset, int refs_to_add)
511  {
512  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
513  	struct btrfs_key key;
514  	struct extent_buffer *leaf;
515  	u32 size;
516  	u32 num_refs;
517  	int ret;
518  
519  	key.objectid = bytenr;
520  	if (parent) {
521  		key.type = BTRFS_SHARED_DATA_REF_KEY;
522  		key.offset = parent;
523  		size = sizeof(struct btrfs_shared_data_ref);
524  	} else {
525  		key.type = BTRFS_EXTENT_DATA_REF_KEY;
526  		key.offset = hash_extent_data_ref(root_objectid,
527  						  owner, offset);
528  		size = sizeof(struct btrfs_extent_data_ref);
529  	}
530  
531  	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
532  	if (ret && ret != -EEXIST)
533  		goto fail;
534  
535  	leaf = path->nodes[0];
536  	if (parent) {
537  		struct btrfs_shared_data_ref *ref;
538  		ref = btrfs_item_ptr(leaf, path->slots[0],
539  				     struct btrfs_shared_data_ref);
540  		if (ret == 0) {
541  			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
542  		} else {
543  			num_refs = btrfs_shared_data_ref_count(leaf, ref);
544  			num_refs += refs_to_add;
545  			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
546  		}
547  	} else {
548  		struct btrfs_extent_data_ref *ref;
549  		while (ret == -EEXIST) {
550  			ref = btrfs_item_ptr(leaf, path->slots[0],
551  					     struct btrfs_extent_data_ref);
552  			if (match_extent_data_ref(leaf, ref, root_objectid,
553  						  owner, offset))
554  				break;
555  			btrfs_release_path(path);
556  			key.offset++;
557  			ret = btrfs_insert_empty_item(trans, root, path, &key,
558  						      size);
559  			if (ret && ret != -EEXIST)
560  				goto fail;
561  
562  			leaf = path->nodes[0];
563  		}
564  		ref = btrfs_item_ptr(leaf, path->slots[0],
565  				     struct btrfs_extent_data_ref);
566  		if (ret == 0) {
567  			btrfs_set_extent_data_ref_root(leaf, ref,
568  						       root_objectid);
569  			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
570  			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
571  			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
572  		} else {
573  			num_refs = btrfs_extent_data_ref_count(leaf, ref);
574  			num_refs += refs_to_add;
575  			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
576  		}
577  	}
578  	btrfs_mark_buffer_dirty(trans, leaf);
579  	ret = 0;
580  fail:
581  	btrfs_release_path(path);
582  	return ret;
583  }
584  
585  static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
586  					   struct btrfs_root *root,
587  					   struct btrfs_path *path,
588  					   int refs_to_drop)
589  {
590  	struct btrfs_key key;
591  	struct btrfs_extent_data_ref *ref1 = NULL;
592  	struct btrfs_shared_data_ref *ref2 = NULL;
593  	struct extent_buffer *leaf;
594  	u32 num_refs = 0;
595  	int ret = 0;
596  
597  	leaf = path->nodes[0];
598  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
599  
600  	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
601  		ref1 = btrfs_item_ptr(leaf, path->slots[0],
602  				      struct btrfs_extent_data_ref);
603  		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
604  	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
605  		ref2 = btrfs_item_ptr(leaf, path->slots[0],
606  				      struct btrfs_shared_data_ref);
607  		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
608  	} else {
609  		btrfs_err(trans->fs_info,
610  			  "unrecognized backref key (%llu %u %llu)",
611  			  key.objectid, key.type, key.offset);
612  		btrfs_abort_transaction(trans, -EUCLEAN);
613  		return -EUCLEAN;
614  	}
615  
616  	BUG_ON(num_refs < refs_to_drop);
617  	num_refs -= refs_to_drop;
618  
619  	if (num_refs == 0) {
620  		ret = btrfs_del_item(trans, root, path);
621  	} else {
622  		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
623  			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
624  		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
625  			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
626  		btrfs_mark_buffer_dirty(trans, leaf);
627  	}
628  	return ret;
629  }
630  
631  static noinline u32 extent_data_ref_count(struct btrfs_path *path,
632  					  struct btrfs_extent_inline_ref *iref)
633  {
634  	struct btrfs_key key;
635  	struct extent_buffer *leaf;
636  	struct btrfs_extent_data_ref *ref1;
637  	struct btrfs_shared_data_ref *ref2;
638  	u32 num_refs = 0;
639  	int type;
640  
641  	leaf = path->nodes[0];
642  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
643  
644  	if (iref) {
645  		/*
646  		 * If type is invalid, we should have bailed out earlier than
647  		 * this call.
648  		 */
649  		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
650  		ASSERT(type != BTRFS_REF_TYPE_INVALID);
651  		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
652  			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
653  			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
654  		} else {
655  			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
656  			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
657  		}
658  	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
659  		ref1 = btrfs_item_ptr(leaf, path->slots[0],
660  				      struct btrfs_extent_data_ref);
661  		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
662  	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
663  		ref2 = btrfs_item_ptr(leaf, path->slots[0],
664  				      struct btrfs_shared_data_ref);
665  		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
666  	} else {
667  		WARN_ON(1);
668  	}
669  	return num_refs;
670  }
671  
672  static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
673  					  struct btrfs_path *path,
674  					  u64 bytenr, u64 parent,
675  					  u64 root_objectid)
676  {
677  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
678  	struct btrfs_key key;
679  	int ret;
680  
681  	key.objectid = bytenr;
682  	if (parent) {
683  		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
684  		key.offset = parent;
685  	} else {
686  		key.type = BTRFS_TREE_BLOCK_REF_KEY;
687  		key.offset = root_objectid;
688  	}
689  
690  	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
691  	if (ret > 0)
692  		ret = -ENOENT;
693  	return ret;
694  }
695  
696  static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
697  					  struct btrfs_path *path,
698  					  u64 bytenr, u64 parent,
699  					  u64 root_objectid)
700  {
701  	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
702  	struct btrfs_key key;
703  	int ret;
704  
705  	key.objectid = bytenr;
706  	if (parent) {
707  		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
708  		key.offset = parent;
709  	} else {
710  		key.type = BTRFS_TREE_BLOCK_REF_KEY;
711  		key.offset = root_objectid;
712  	}
713  
714  	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
715  	btrfs_release_path(path);
716  	return ret;
717  }
718  
719  static inline int extent_ref_type(u64 parent, u64 owner)
720  {
721  	int type;
722  	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
723  		if (parent > 0)
724  			type = BTRFS_SHARED_BLOCK_REF_KEY;
725  		else
726  			type = BTRFS_TREE_BLOCK_REF_KEY;
727  	} else {
728  		if (parent > 0)
729  			type = BTRFS_SHARED_DATA_REF_KEY;
730  		else
731  			type = BTRFS_EXTENT_DATA_REF_KEY;
732  	}
733  	return type;
734  }
735  
736  static int find_next_key(struct btrfs_path *path, int level,
737  			 struct btrfs_key *key)
738  
739  {
740  	for (; level < BTRFS_MAX_LEVEL; level++) {
741  		if (!path->nodes[level])
742  			break;
743  		if (path->slots[level] + 1 >=
744  		    btrfs_header_nritems(path->nodes[level]))
745  			continue;
746  		if (level == 0)
747  			btrfs_item_key_to_cpu(path->nodes[level], key,
748  					      path->slots[level] + 1);
749  		else
750  			btrfs_node_key_to_cpu(path->nodes[level], key,
751  					      path->slots[level] + 1);
752  		return 0;
753  	}
754  	return 1;
755  }
756  
757  /*
758   * look for inline back ref. if back ref is found, *ref_ret is set
759   * to the address of inline back ref, and 0 is returned.
760   *
761   * if back ref isn't found, *ref_ret is set to the address where it
762   * should be inserted, and -ENOENT is returned.
763   *
764   * if insert is true and there are too many inline back refs, the path
765   * points to the extent item, and -EAGAIN is returned.
766   *
767   * NOTE: inline back refs are ordered in the same way that back ref
768   *	 items in the tree are ordered.
769   */
770  static noinline_for_stack
771  int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
772  				 struct btrfs_path *path,
773  				 struct btrfs_extent_inline_ref **ref_ret,
774  				 u64 bytenr, u64 num_bytes,
775  				 u64 parent, u64 root_objectid,
776  				 u64 owner, u64 offset, int insert)
777  {
778  	struct btrfs_fs_info *fs_info = trans->fs_info;
779  	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
780  	struct btrfs_key key;
781  	struct extent_buffer *leaf;
782  	struct btrfs_extent_item *ei;
783  	struct btrfs_extent_inline_ref *iref;
784  	u64 flags;
785  	u64 item_size;
786  	unsigned long ptr;
787  	unsigned long end;
788  	int extra_size;
789  	int type;
790  	int want;
791  	int ret;
792  	int err = 0;
793  	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
794  	int needed;
795  
796  	key.objectid = bytenr;
797  	key.type = BTRFS_EXTENT_ITEM_KEY;
798  	key.offset = num_bytes;
799  
800  	want = extent_ref_type(parent, owner);
801  	if (insert) {
802  		extra_size = btrfs_extent_inline_ref_size(want);
803  		path->search_for_extension = 1;
804  		path->keep_locks = 1;
805  	} else
806  		extra_size = -1;
807  
808  	/*
809  	 * Owner is our level, so we can just add one to get the level for the
810  	 * block we are interested in.
811  	 */
812  	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
813  		key.type = BTRFS_METADATA_ITEM_KEY;
814  		key.offset = owner;
815  	}
816  
817  again:
818  	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
819  	if (ret < 0) {
820  		err = ret;
821  		goto out;
822  	}
823  
824  	/*
825  	 * We may be a newly converted file system which still has the old fat
826  	 * extent entries for metadata, so try and see if we have one of those.
827  	 */
828  	if (ret > 0 && skinny_metadata) {
829  		skinny_metadata = false;
830  		if (path->slots[0]) {
831  			path->slots[0]--;
832  			btrfs_item_key_to_cpu(path->nodes[0], &key,
833  					      path->slots[0]);
834  			if (key.objectid == bytenr &&
835  			    key.type == BTRFS_EXTENT_ITEM_KEY &&
836  			    key.offset == num_bytes)
837  				ret = 0;
838  		}
839  		if (ret) {
840  			key.objectid = bytenr;
841  			key.type = BTRFS_EXTENT_ITEM_KEY;
842  			key.offset = num_bytes;
843  			btrfs_release_path(path);
844  			goto again;
845  		}
846  	}
847  
848  	if (ret && !insert) {
849  		err = -ENOENT;
850  		goto out;
851  	} else if (WARN_ON(ret)) {
852  		btrfs_print_leaf(path->nodes[0]);
853  		btrfs_err(fs_info,
854  "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
855  			  bytenr, num_bytes, parent, root_objectid, owner,
856  			  offset);
857  		err = -EIO;
858  		goto out;
859  	}
860  
861  	leaf = path->nodes[0];
862  	item_size = btrfs_item_size(leaf, path->slots[0]);
863  	if (unlikely(item_size < sizeof(*ei))) {
864  		err = -EUCLEAN;
865  		btrfs_err(fs_info,
866  			  "unexpected extent item size, has %llu expect >= %zu",
867  			  item_size, sizeof(*ei));
868  		btrfs_abort_transaction(trans, err);
869  		goto out;
870  	}
871  
872  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
873  	flags = btrfs_extent_flags(leaf, ei);
874  
875  	ptr = (unsigned long)(ei + 1);
876  	end = (unsigned long)ei + item_size;
877  
878  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
879  		ptr += sizeof(struct btrfs_tree_block_info);
880  		BUG_ON(ptr > end);
881  	}
882  
883  	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
884  		needed = BTRFS_REF_TYPE_DATA;
885  	else
886  		needed = BTRFS_REF_TYPE_BLOCK;
887  
888  	err = -ENOENT;
889  	while (1) {
890  		if (ptr >= end) {
891  			if (ptr > end) {
892  				err = -EUCLEAN;
893  				btrfs_print_leaf(path->nodes[0]);
894  				btrfs_crit(fs_info,
895  "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
896  					path->slots[0], root_objectid, owner, offset, parent);
897  			}
898  			break;
899  		}
900  		iref = (struct btrfs_extent_inline_ref *)ptr;
901  		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
902  		if (type == BTRFS_REF_TYPE_INVALID) {
903  			err = -EUCLEAN;
904  			goto out;
905  		}
906  
907  		if (want < type)
908  			break;
909  		if (want > type) {
910  			ptr += btrfs_extent_inline_ref_size(type);
911  			continue;
912  		}
913  
914  		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
915  			struct btrfs_extent_data_ref *dref;
916  			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
917  			if (match_extent_data_ref(leaf, dref, root_objectid,
918  						  owner, offset)) {
919  				err = 0;
920  				break;
921  			}
922  			if (hash_extent_data_ref_item(leaf, dref) <
923  			    hash_extent_data_ref(root_objectid, owner, offset))
924  				break;
925  		} else {
926  			u64 ref_offset;
927  			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
928  			if (parent > 0) {
929  				if (parent == ref_offset) {
930  					err = 0;
931  					break;
932  				}
933  				if (ref_offset < parent)
934  					break;
935  			} else {
936  				if (root_objectid == ref_offset) {
937  					err = 0;
938  					break;
939  				}
940  				if (ref_offset < root_objectid)
941  					break;
942  			}
943  		}
944  		ptr += btrfs_extent_inline_ref_size(type);
945  	}
946  	if (err == -ENOENT && insert) {
947  		if (item_size + extra_size >=
948  		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
949  			err = -EAGAIN;
950  			goto out;
951  		}
952  		/*
953  		 * To add new inline back ref, we have to make sure
954  		 * there is no corresponding back ref item.
955  		 * For simplicity, we just do not add new inline back
956  		 * ref if there is any kind of item for this block
957  		 */
958  		if (find_next_key(path, 0, &key) == 0 &&
959  		    key.objectid == bytenr &&
960  		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
961  			err = -EAGAIN;
962  			goto out;
963  		}
964  	}
965  	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
966  out:
967  	if (insert) {
968  		path->keep_locks = 0;
969  		path->search_for_extension = 0;
970  		btrfs_unlock_up_safe(path, 1);
971  	}
972  	return err;
973  }
974  
975  /*
976   * helper to add new inline back ref
977   */
978  static noinline_for_stack
979  void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
980  				 struct btrfs_path *path,
981  				 struct btrfs_extent_inline_ref *iref,
982  				 u64 parent, u64 root_objectid,
983  				 u64 owner, u64 offset, int refs_to_add,
984  				 struct btrfs_delayed_extent_op *extent_op)
985  {
986  	struct extent_buffer *leaf;
987  	struct btrfs_extent_item *ei;
988  	unsigned long ptr;
989  	unsigned long end;
990  	unsigned long item_offset;
991  	u64 refs;
992  	int size;
993  	int type;
994  
995  	leaf = path->nodes[0];
996  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
997  	item_offset = (unsigned long)iref - (unsigned long)ei;
998  
999  	type = extent_ref_type(parent, owner);
1000  	size = btrfs_extent_inline_ref_size(type);
1001  
1002  	btrfs_extend_item(trans, path, size);
1003  
1004  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1005  	refs = btrfs_extent_refs(leaf, ei);
1006  	refs += refs_to_add;
1007  	btrfs_set_extent_refs(leaf, ei, refs);
1008  	if (extent_op)
1009  		__run_delayed_extent_op(extent_op, leaf, ei);
1010  
1011  	ptr = (unsigned long)ei + item_offset;
1012  	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1013  	if (ptr < end - size)
1014  		memmove_extent_buffer(leaf, ptr + size, ptr,
1015  				      end - size - ptr);
1016  
1017  	iref = (struct btrfs_extent_inline_ref *)ptr;
1018  	btrfs_set_extent_inline_ref_type(leaf, iref, type);
1019  	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1020  		struct btrfs_extent_data_ref *dref;
1021  		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1022  		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1023  		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1024  		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1025  		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1026  	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1027  		struct btrfs_shared_data_ref *sref;
1028  		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1029  		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1030  		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1031  	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1032  		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1033  	} else {
1034  		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1035  	}
1036  	btrfs_mark_buffer_dirty(trans, leaf);
1037  }
1038  
1039  static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1040  				 struct btrfs_path *path,
1041  				 struct btrfs_extent_inline_ref **ref_ret,
1042  				 u64 bytenr, u64 num_bytes, u64 parent,
1043  				 u64 root_objectid, u64 owner, u64 offset)
1044  {
1045  	int ret;
1046  
1047  	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1048  					   num_bytes, parent, root_objectid,
1049  					   owner, offset, 0);
1050  	if (ret != -ENOENT)
1051  		return ret;
1052  
1053  	btrfs_release_path(path);
1054  	*ref_ret = NULL;
1055  
1056  	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1057  		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1058  					    root_objectid);
1059  	} else {
1060  		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1061  					     root_objectid, owner, offset);
1062  	}
1063  	return ret;
1064  }
1065  
1066  /*
1067   * helper to update/remove inline back ref
1068   */
1069  static noinline_for_stack int update_inline_extent_backref(
1070  				  struct btrfs_trans_handle *trans,
1071  				  struct btrfs_path *path,
1072  				  struct btrfs_extent_inline_ref *iref,
1073  				  int refs_to_mod,
1074  				  struct btrfs_delayed_extent_op *extent_op)
1075  {
1076  	struct extent_buffer *leaf = path->nodes[0];
1077  	struct btrfs_fs_info *fs_info = leaf->fs_info;
1078  	struct btrfs_extent_item *ei;
1079  	struct btrfs_extent_data_ref *dref = NULL;
1080  	struct btrfs_shared_data_ref *sref = NULL;
1081  	unsigned long ptr;
1082  	unsigned long end;
1083  	u32 item_size;
1084  	int size;
1085  	int type;
1086  	u64 refs;
1087  
1088  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1089  	refs = btrfs_extent_refs(leaf, ei);
1090  	if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1091  		struct btrfs_key key;
1092  		u32 extent_size;
1093  
1094  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1095  		if (key.type == BTRFS_METADATA_ITEM_KEY)
1096  			extent_size = fs_info->nodesize;
1097  		else
1098  			extent_size = key.offset;
1099  		btrfs_print_leaf(leaf);
1100  		btrfs_err(fs_info,
1101  	"invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1102  			  key.objectid, extent_size, refs_to_mod, refs);
1103  		return -EUCLEAN;
1104  	}
1105  	refs += refs_to_mod;
1106  	btrfs_set_extent_refs(leaf, ei, refs);
1107  	if (extent_op)
1108  		__run_delayed_extent_op(extent_op, leaf, ei);
1109  
1110  	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1111  	/*
1112  	 * Function btrfs_get_extent_inline_ref_type() has already printed
1113  	 * error messages.
1114  	 */
1115  	if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1116  		return -EUCLEAN;
1117  
1118  	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1119  		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1120  		refs = btrfs_extent_data_ref_count(leaf, dref);
1121  	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1122  		sref = (struct btrfs_shared_data_ref *)(iref + 1);
1123  		refs = btrfs_shared_data_ref_count(leaf, sref);
1124  	} else {
1125  		refs = 1;
1126  		/*
1127  		 * For tree blocks we can only drop one ref for it, and tree
1128  		 * blocks should not have refs > 1.
1129  		 *
1130  		 * Furthermore if we're inserting a new inline backref, we
1131  		 * won't reach this path either. That would be
1132  		 * setup_inline_extent_backref().
1133  		 */
1134  		if (unlikely(refs_to_mod != -1)) {
1135  			struct btrfs_key key;
1136  
1137  			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1138  
1139  			btrfs_print_leaf(leaf);
1140  			btrfs_err(fs_info,
1141  			"invalid refs_to_mod for tree block %llu, has %d expect -1",
1142  				  key.objectid, refs_to_mod);
1143  			return -EUCLEAN;
1144  		}
1145  	}
1146  
1147  	if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1148  		struct btrfs_key key;
1149  		u32 extent_size;
1150  
1151  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1152  		if (key.type == BTRFS_METADATA_ITEM_KEY)
1153  			extent_size = fs_info->nodesize;
1154  		else
1155  			extent_size = key.offset;
1156  		btrfs_print_leaf(leaf);
1157  		btrfs_err(fs_info,
1158  "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1159  			  (unsigned long)iref, key.objectid, extent_size,
1160  			  refs_to_mod, refs);
1161  		return -EUCLEAN;
1162  	}
1163  	refs += refs_to_mod;
1164  
1165  	if (refs > 0) {
1166  		if (type == BTRFS_EXTENT_DATA_REF_KEY)
1167  			btrfs_set_extent_data_ref_count(leaf, dref, refs);
1168  		else
1169  			btrfs_set_shared_data_ref_count(leaf, sref, refs);
1170  	} else {
1171  		size =  btrfs_extent_inline_ref_size(type);
1172  		item_size = btrfs_item_size(leaf, path->slots[0]);
1173  		ptr = (unsigned long)iref;
1174  		end = (unsigned long)ei + item_size;
1175  		if (ptr + size < end)
1176  			memmove_extent_buffer(leaf, ptr, ptr + size,
1177  					      end - ptr - size);
1178  		item_size -= size;
1179  		btrfs_truncate_item(trans, path, item_size, 1);
1180  	}
1181  	btrfs_mark_buffer_dirty(trans, leaf);
1182  	return 0;
1183  }
1184  
1185  static noinline_for_stack
1186  int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1187  				 struct btrfs_path *path,
1188  				 u64 bytenr, u64 num_bytes, u64 parent,
1189  				 u64 root_objectid, u64 owner,
1190  				 u64 offset, int refs_to_add,
1191  				 struct btrfs_delayed_extent_op *extent_op)
1192  {
1193  	struct btrfs_extent_inline_ref *iref;
1194  	int ret;
1195  
1196  	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1197  					   num_bytes, parent, root_objectid,
1198  					   owner, offset, 1);
1199  	if (ret == 0) {
1200  		/*
1201  		 * We're adding refs to a tree block we already own, this
1202  		 * should not happen at all.
1203  		 */
1204  		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1205  			btrfs_print_leaf(path->nodes[0]);
1206  			btrfs_crit(trans->fs_info,
1207  "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1208  				   bytenr, num_bytes, root_objectid, path->slots[0]);
1209  			return -EUCLEAN;
1210  		}
1211  		ret = update_inline_extent_backref(trans, path, iref,
1212  						   refs_to_add, extent_op);
1213  	} else if (ret == -ENOENT) {
1214  		setup_inline_extent_backref(trans, path, iref, parent,
1215  					    root_objectid, owner, offset,
1216  					    refs_to_add, extent_op);
1217  		ret = 0;
1218  	}
1219  	return ret;
1220  }
1221  
1222  static int remove_extent_backref(struct btrfs_trans_handle *trans,
1223  				 struct btrfs_root *root,
1224  				 struct btrfs_path *path,
1225  				 struct btrfs_extent_inline_ref *iref,
1226  				 int refs_to_drop, int is_data)
1227  {
1228  	int ret = 0;
1229  
1230  	BUG_ON(!is_data && refs_to_drop != 1);
1231  	if (iref)
1232  		ret = update_inline_extent_backref(trans, path, iref,
1233  						   -refs_to_drop, NULL);
1234  	else if (is_data)
1235  		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1236  	else
1237  		ret = btrfs_del_item(trans, root, path);
1238  	return ret;
1239  }
1240  
1241  static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1242  			       u64 *discarded_bytes)
1243  {
1244  	int j, ret = 0;
1245  	u64 bytes_left, end;
1246  	u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1247  
1248  	/* Adjust the range to be aligned to 512B sectors if necessary. */
1249  	if (start != aligned_start) {
1250  		len -= aligned_start - start;
1251  		len = round_down(len, 1 << SECTOR_SHIFT);
1252  		start = aligned_start;
1253  	}
1254  
1255  	*discarded_bytes = 0;
1256  
1257  	if (!len)
1258  		return 0;
1259  
1260  	end = start + len;
1261  	bytes_left = len;
1262  
1263  	/* Skip any superblocks on this device. */
1264  	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1265  		u64 sb_start = btrfs_sb_offset(j);
1266  		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1267  		u64 size = sb_start - start;
1268  
1269  		if (!in_range(sb_start, start, bytes_left) &&
1270  		    !in_range(sb_end, start, bytes_left) &&
1271  		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1272  			continue;
1273  
1274  		/*
1275  		 * Superblock spans beginning of range.  Adjust start and
1276  		 * try again.
1277  		 */
1278  		if (sb_start <= start) {
1279  			start += sb_end - start;
1280  			if (start > end) {
1281  				bytes_left = 0;
1282  				break;
1283  			}
1284  			bytes_left = end - start;
1285  			continue;
1286  		}
1287  
1288  		if (size) {
1289  			ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1290  						   size >> SECTOR_SHIFT,
1291  						   GFP_NOFS);
1292  			if (!ret)
1293  				*discarded_bytes += size;
1294  			else if (ret != -EOPNOTSUPP)
1295  				return ret;
1296  		}
1297  
1298  		start = sb_end;
1299  		if (start > end) {
1300  			bytes_left = 0;
1301  			break;
1302  		}
1303  		bytes_left = end - start;
1304  	}
1305  
1306  	while (bytes_left) {
1307  		u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left);
1308  
1309  		ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1310  					   bytes_to_discard >> SECTOR_SHIFT,
1311  					   GFP_NOFS);
1312  
1313  		if (ret) {
1314  			if (ret != -EOPNOTSUPP)
1315  				break;
1316  			continue;
1317  		}
1318  
1319  		start += bytes_to_discard;
1320  		bytes_left -= bytes_to_discard;
1321  		*discarded_bytes += bytes_to_discard;
1322  
1323  		if (btrfs_trim_interrupted()) {
1324  			ret = -ERESTARTSYS;
1325  			break;
1326  		}
1327  	}
1328  
1329  	return ret;
1330  }
1331  
1332  static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1333  {
1334  	struct btrfs_device *dev = stripe->dev;
1335  	struct btrfs_fs_info *fs_info = dev->fs_info;
1336  	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1337  	u64 phys = stripe->physical;
1338  	u64 len = stripe->length;
1339  	u64 discarded = 0;
1340  	int ret = 0;
1341  
1342  	/* Zone reset on a zoned filesystem */
1343  	if (btrfs_can_zone_reset(dev, phys, len)) {
1344  		u64 src_disc;
1345  
1346  		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1347  		if (ret)
1348  			goto out;
1349  
1350  		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1351  		    dev != dev_replace->srcdev)
1352  			goto out;
1353  
1354  		src_disc = discarded;
1355  
1356  		/* Send to replace target as well */
1357  		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1358  					      &discarded);
1359  		discarded += src_disc;
1360  	} else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1361  		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1362  	} else {
1363  		ret = 0;
1364  		*bytes = 0;
1365  	}
1366  
1367  out:
1368  	*bytes = discarded;
1369  	return ret;
1370  }
1371  
1372  int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1373  			 u64 num_bytes, u64 *actual_bytes)
1374  {
1375  	int ret = 0;
1376  	u64 discarded_bytes = 0;
1377  	u64 end = bytenr + num_bytes;
1378  	u64 cur = bytenr;
1379  
1380  	/*
1381  	 * Avoid races with device replace and make sure the devices in the
1382  	 * stripes don't go away while we are discarding.
1383  	 */
1384  	btrfs_bio_counter_inc_blocked(fs_info);
1385  	while (cur < end) {
1386  		struct btrfs_discard_stripe *stripes;
1387  		unsigned int num_stripes;
1388  		int i;
1389  
1390  		num_bytes = end - cur;
1391  		stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1392  		if (IS_ERR(stripes)) {
1393  			ret = PTR_ERR(stripes);
1394  			if (ret == -EOPNOTSUPP)
1395  				ret = 0;
1396  			break;
1397  		}
1398  
1399  		for (i = 0; i < num_stripes; i++) {
1400  			struct btrfs_discard_stripe *stripe = stripes + i;
1401  			u64 bytes;
1402  
1403  			if (!stripe->dev->bdev) {
1404  				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1405  				continue;
1406  			}
1407  
1408  			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1409  					&stripe->dev->dev_state))
1410  				continue;
1411  
1412  			ret = do_discard_extent(stripe, &bytes);
1413  			if (ret) {
1414  				/*
1415  				 * Keep going if discard is not supported by the
1416  				 * device.
1417  				 */
1418  				if (ret != -EOPNOTSUPP)
1419  					break;
1420  				ret = 0;
1421  			} else {
1422  				discarded_bytes += bytes;
1423  			}
1424  		}
1425  		kfree(stripes);
1426  		if (ret)
1427  			break;
1428  		cur += num_bytes;
1429  	}
1430  	btrfs_bio_counter_dec(fs_info);
1431  	if (actual_bytes)
1432  		*actual_bytes = discarded_bytes;
1433  	return ret;
1434  }
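
/*
 * Illustrative sketch (editorial addition): trimming a freed byte range and
 * reporting how much the devices actually discarded.  btrfs_discard_extent()
 * returns 0 even when some stripes do not support discard, so *actual_bytes
 * is the only reliable indication of how much was issued.  The helper name
 * is hypothetical.
 */
static int example_trim_freed_range(struct btrfs_fs_info *fs_info,
				    u64 bytenr, u64 num_bytes)
{
	u64 trimmed = 0;
	int ret;

	ret = btrfs_discard_extent(fs_info, bytenr, num_bytes, &trimmed);
	if (ret)
		return ret;
	btrfs_debug(fs_info, "trimmed %llu of %llu bytes at %llu",
		    trimmed, num_bytes, bytenr);
	return 0;
}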
1435  
1436  /* Can return -ENOMEM */
1437  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1438  			 struct btrfs_ref *generic_ref)
1439  {
1440  	struct btrfs_fs_info *fs_info = trans->fs_info;
1441  	int ret;
1442  
1443  	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1444  	       generic_ref->action);
1445  	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1446  	       generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
1447  
1448  	if (generic_ref->type == BTRFS_REF_METADATA)
1449  		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1450  	else
1451  		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1452  
1453  	btrfs_ref_tree_mod(fs_info, generic_ref);
1454  
1455  	return ret;
1456  }
1457  
1458  /*
1459   * __btrfs_inc_extent_ref - insert backreference for a given extent
1460   *
1461   * The counterpart is in __btrfs_free_extent(), with examples and more details
1462   * on how it works.
1463   *
1464   * @trans:	    Handle of transaction
1465   *
1466   * @node:	    The delayed ref node used to get the bytenr/length for
1467   *		    extent whose references are incremented.
1468   *
1469   * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1470   *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1471   *		    bytenr of the parent block. Since new extents are always
1472   *		    created with indirect references, this will only be the case
1473   *		    when relocating a shared extent. In that case, root_objectid
1474   *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1475   *		    be 0
1476   *
1477   * @root_objectid:  The id of the root where this modification has originated,
1478   *		    this can be either one of the well-known metadata trees or
1479   *		    the subvolume id which references this extent.
1480   *
1481   * @owner:	    For data extents it is the inode number of the owning file.
1482   *		    For metadata extents this parameter holds the level in the
1483   *		    tree of the extent.
1484   *
1485   * @offset:	    For metadata extents the offset is ignored and is currently
1486   *		    always passed as 0. For data extents it is the fileoffset
1487   *		    this extent belongs to.
1488   *
1489   * @refs_to_add     Number of references to add
1490   *
1491   * @extent_op       Pointer to a structure, holding information necessary when
1492   *                  updating a tree block's flags
1493   *
1494   */
1495  static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1496  				  struct btrfs_delayed_ref_node *node,
1497  				  u64 parent, u64 root_objectid,
1498  				  u64 owner, u64 offset, int refs_to_add,
1499  				  struct btrfs_delayed_extent_op *extent_op)
1500  {
1501  	struct btrfs_path *path;
1502  	struct extent_buffer *leaf;
1503  	struct btrfs_extent_item *item;
1504  	struct btrfs_key key;
1505  	u64 bytenr = node->bytenr;
1506  	u64 num_bytes = node->num_bytes;
1507  	u64 refs;
1508  	int ret;
1509  
1510  	path = btrfs_alloc_path();
1511  	if (!path)
1512  		return -ENOMEM;
1513  
1514  	/* this will setup the path even if it fails to insert the back ref */
1515  	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1516  					   parent, root_objectid, owner,
1517  					   offset, refs_to_add, extent_op);
1518  	if ((ret < 0 && ret != -EAGAIN) || !ret)
1519  		goto out;
1520  
1521  	/*
1522  	 * Ok we had -EAGAIN which means we didn't have space to insert an
1523  	 * inline extent ref, so just update the reference count and add a
1524  	 * normal backref.
1525  	 */
1526  	leaf = path->nodes[0];
1527  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1528  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1529  	refs = btrfs_extent_refs(leaf, item);
1530  	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1531  	if (extent_op)
1532  		__run_delayed_extent_op(extent_op, leaf, item);
1533  
1534  	btrfs_mark_buffer_dirty(trans, leaf);
1535  	btrfs_release_path(path);
1536  
1537  	/* now insert the actual backref */
1538  	if (owner < BTRFS_FIRST_FREE_OBJECTID)
1539  		ret = insert_tree_block_ref(trans, path, bytenr, parent,
1540  					    root_objectid);
1541  	else
1542  		ret = insert_extent_data_ref(trans, path, bytenr, parent,
1543  					     root_objectid, owner, offset,
1544  					     refs_to_add);
1545  
1546  	if (ret)
1547  		btrfs_abort_transaction(trans, ret);
1548  out:
1549  	btrfs_free_path(path);
1550  	return ret;
1551  }
1552  
1553  static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1554  				struct btrfs_delayed_ref_node *node,
1555  				struct btrfs_delayed_extent_op *extent_op,
1556  				bool insert_reserved)
1557  {
1558  	int ret = 0;
1559  	struct btrfs_delayed_data_ref *ref;
1560  	struct btrfs_key ins;
1561  	u64 parent = 0;
1562  	u64 ref_root = 0;
1563  	u64 flags = 0;
1564  
1565  	ins.objectid = node->bytenr;
1566  	ins.offset = node->num_bytes;
1567  	ins.type = BTRFS_EXTENT_ITEM_KEY;
1568  
1569  	ref = btrfs_delayed_node_to_data_ref(node);
1570  	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1571  
1572  	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1573  		parent = ref->parent;
1574  	ref_root = ref->root;
1575  
1576  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1577  		if (extent_op)
1578  			flags |= extent_op->flags_to_set;
1579  		ret = alloc_reserved_file_extent(trans, parent, ref_root,
1580  						 flags, ref->objectid,
1581  						 ref->offset, &ins,
1582  						 node->ref_mod);
1583  	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1584  		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1585  					     ref->objectid, ref->offset,
1586  					     node->ref_mod, extent_op);
1587  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1588  		ret = __btrfs_free_extent(trans, node, parent,
1589  					  ref_root, ref->objectid,
1590  					  ref->offset, node->ref_mod,
1591  					  extent_op);
1592  	} else {
1593  		BUG();
1594  	}
1595  	return ret;
1596  }
1597  
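/*
 * Apply a delayed extent op to an extent item in the given leaf: set the
 * requested flags and, for tree blocks, update the first key stored in the
 * btrfs_tree_block_info that follows the extent item.
 */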
1598  static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1599  				    struct extent_buffer *leaf,
1600  				    struct btrfs_extent_item *ei)
1601  {
1602  	u64 flags = btrfs_extent_flags(leaf, ei);
1603  	if (extent_op->update_flags) {
1604  		flags |= extent_op->flags_to_set;
1605  		btrfs_set_extent_flags(leaf, ei, flags);
1606  	}
1607  
1608  	if (extent_op->update_key) {
1609  		struct btrfs_tree_block_info *bi;
1610  		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1611  		bi = (struct btrfs_tree_block_info *)(ei + 1);
1612  		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1613  	}
1614  }
1615  
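/*
 * Run a delayed extent op against the on-disk extent item of a ref head.
 * Look up the METADATA_ITEM (or the old style EXTENT_ITEM when skinny
 * metadata is not in use or the skinny item is missing) and apply the op.
 */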
1616  static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1617  				 struct btrfs_delayed_ref_head *head,
1618  				 struct btrfs_delayed_extent_op *extent_op)
1619  {
1620  	struct btrfs_fs_info *fs_info = trans->fs_info;
1621  	struct btrfs_root *root;
1622  	struct btrfs_key key;
1623  	struct btrfs_path *path;
1624  	struct btrfs_extent_item *ei;
1625  	struct extent_buffer *leaf;
1626  	u32 item_size;
1627  	int ret;
1628  	int err = 0;
1629  	int metadata = 1;
1630  
1631  	if (TRANS_ABORTED(trans))
1632  		return 0;
1633  
1634  	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1635  		metadata = 0;
1636  
1637  	path = btrfs_alloc_path();
1638  	if (!path)
1639  		return -ENOMEM;
1640  
1641  	key.objectid = head->bytenr;
1642  
1643  	if (metadata) {
1644  		key.type = BTRFS_METADATA_ITEM_KEY;
1645  		key.offset = extent_op->level;
1646  	} else {
1647  		key.type = BTRFS_EXTENT_ITEM_KEY;
1648  		key.offset = head->num_bytes;
1649  	}
1650  
1651  	root = btrfs_extent_root(fs_info, key.objectid);
1652  again:
1653  	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1654  	if (ret < 0) {
1655  		err = ret;
1656  		goto out;
1657  	}
1658  	if (ret > 0) {
1659  		if (metadata) {
1660  			if (path->slots[0] > 0) {
1661  				path->slots[0]--;
1662  				btrfs_item_key_to_cpu(path->nodes[0], &key,
1663  						      path->slots[0]);
1664  				if (key.objectid == head->bytenr &&
1665  				    key.type == BTRFS_EXTENT_ITEM_KEY &&
1666  				    key.offset == head->num_bytes)
1667  					ret = 0;
1668  			}
1669  			if (ret > 0) {
1670  				btrfs_release_path(path);
1671  				metadata = 0;
1672  
1673  				key.objectid = head->bytenr;
1674  				key.offset = head->num_bytes;
1675  				key.type = BTRFS_EXTENT_ITEM_KEY;
1676  				goto again;
1677  			}
1678  		} else {
1679  			err = -EUCLEAN;
1680  			btrfs_err(fs_info,
1681  		  "missing extent item for extent %llu num_bytes %llu level %d",
1682  				  head->bytenr, head->num_bytes, extent_op->level);
1683  			goto out;
1684  		}
1685  	}
1686  
1687  	leaf = path->nodes[0];
1688  	item_size = btrfs_item_size(leaf, path->slots[0]);
1689  
1690  	if (unlikely(item_size < sizeof(*ei))) {
1691  		err = -EUCLEAN;
1692  		btrfs_err(fs_info,
1693  			  "unexpected extent item size, has %u expect >= %zu",
1694  			  item_size, sizeof(*ei));
1695  		btrfs_abort_transaction(trans, err);
1696  		goto out;
1697  	}
1698  
1699  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1700  	__run_delayed_extent_op(extent_op, leaf, ei);
1701  
1702  	btrfs_mark_buffer_dirty(trans, leaf);
1703  out:
1704  	btrfs_free_path(path);
1705  	return err;
1706  }
1707  
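/*
 * Process one delayed tree block ref node.  Tree blocks always carry a
 * single reference per node, so ref_mod must be 1; beyond that check this
 * is the same add/inc/drop dispatch as for data refs.
 */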
1708  static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1709  				struct btrfs_delayed_ref_node *node,
1710  				struct btrfs_delayed_extent_op *extent_op,
1711  				bool insert_reserved)
1712  {
1713  	int ret = 0;
1714  	struct btrfs_delayed_tree_ref *ref;
1715  	u64 parent = 0;
1716  	u64 ref_root = 0;
1717  
1718  	ref = btrfs_delayed_node_to_tree_ref(node);
1719  	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1720  
1721  	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1722  		parent = ref->parent;
1723  	ref_root = ref->root;
1724  
1725  	if (unlikely(node->ref_mod != 1)) {
1726  		btrfs_err(trans->fs_info,
1727  	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1728  			  node->bytenr, node->ref_mod, node->action, ref_root,
1729  			  parent);
1730  		return -EUCLEAN;
1731  	}
1732  	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1733  		BUG_ON(!extent_op || !extent_op->update_flags);
1734  		ret = alloc_reserved_tree_block(trans, node, extent_op);
1735  	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1736  		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1737  					     ref->level, 0, 1, extent_op);
1738  	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1739  		ret = __btrfs_free_extent(trans, node, parent, ref_root,
1740  					  ref->level, 0, 1, extent_op);
1741  	} else {
1742  		BUG();
1743  	}
1744  	return ret;
1745  }
1746  
1747  /* helper function to actually process a single delayed ref entry */
1748  static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1749  			       struct btrfs_delayed_ref_node *node,
1750  			       struct btrfs_delayed_extent_op *extent_op,
1751  			       bool insert_reserved)
1752  {
1753  	int ret = 0;
1754  
1755  	if (TRANS_ABORTED(trans)) {
1756  		if (insert_reserved)
1757  			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1758  		return 0;
1759  	}
1760  
1761  	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1762  	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1763  		ret = run_delayed_tree_ref(trans, node, extent_op,
1764  					   insert_reserved);
1765  	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1766  		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1767  		ret = run_delayed_data_ref(trans, node, extent_op,
1768  					   insert_reserved);
1769  	else
1770  		BUG();
1771  	if (ret && insert_reserved)
1772  		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1773  	if (ret < 0)
1774  		btrfs_err(trans->fs_info,
1775  "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1776  			  node->bytenr, node->num_bytes, node->type,
1777  			  node->action, node->ref_mod, ret);
1778  	return ret;
1779  }
1780  
1781  static inline struct btrfs_delayed_ref_node *
1782  select_delayed_ref(struct btrfs_delayed_ref_head *head)
1783  {
1784  	struct btrfs_delayed_ref_node *ref;
1785  
1786  	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1787  		return NULL;
1788  
1789  	/*
1790  	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1791  	 * This is to prevent a ref count from going down to zero, which deletes
1792  	 * the extent item from the extent tree, when there still are references
1793  	 * to add, which would fail because they would not find the extent item.
1794  	 */
1795  	if (!list_empty(&head->ref_add_list))
1796  		return list_first_entry(&head->ref_add_list,
1797  				struct btrfs_delayed_ref_node, add_list);
1798  
1799  	ref = rb_entry(rb_first_cached(&head->ref_tree),
1800  		       struct btrfs_delayed_ref_node, ref_node);
1801  	ASSERT(list_empty(&ref->add_list));
1802  	return ref;
1803  }
1804  
1805  static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1806  				      struct btrfs_delayed_ref_head *head)
1807  {
1808  	spin_lock(&delayed_refs->lock);
1809  	head->processing = false;
1810  	delayed_refs->num_heads_ready++;
1811  	spin_unlock(&delayed_refs->lock);
1812  	btrfs_delayed_ref_unlock(head);
1813  }
1814  
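/*
 * Return the head's pending extent op, if any.  When the head still has to
 * insert its reserved extent there is no extent item to update yet, so the
 * op is freed and NULL is returned instead.
 */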
1815  static struct btrfs_delayed_extent_op *cleanup_extent_op(
1816  				struct btrfs_delayed_ref_head *head)
1817  {
1818  	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1819  
1820  	if (!extent_op)
1821  		return NULL;
1822  
1823  	if (head->must_insert_reserved) {
1824  		head->extent_op = NULL;
1825  		btrfs_free_delayed_extent_op(extent_op);
1826  		return NULL;
1827  	}
1828  	return extent_op;
1829  }
1830  
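/*
 * Run the head's pending extent op, dropping head->lock while doing so.
 * Returns 1 if an op was run, 0 if there was nothing to do, or a negative
 * error.  Called with head->lock held.
 */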
1831  static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1832  				     struct btrfs_delayed_ref_head *head)
1833  {
1834  	struct btrfs_delayed_extent_op *extent_op;
1835  	int ret;
1836  
1837  	extent_op = cleanup_extent_op(head);
1838  	if (!extent_op)
1839  		return 0;
1840  	head->extent_op = NULL;
1841  	spin_unlock(&head->lock);
1842  	ret = run_delayed_extent_op(trans, head, extent_op);
1843  	btrfs_free_delayed_extent_op(extent_op);
1844  	return ret ? ret : 1;
1845  }
1846  
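/*
 * Release the delayed refs rsv space held for a ref head that is going
 * away: one item for the head update itself, plus the csum leaves for the
 * range when the head ends up freeing a data extent overall.
 */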
1847  void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1848  				  struct btrfs_delayed_ref_root *delayed_refs,
1849  				  struct btrfs_delayed_ref_head *head)
1850  {
1851  	int nr_items = 1;	/* Dropping this ref head update. */
1852  
1853  	/*
1854  	 * We had csum deletions accounted for in our delayed refs rsv, so we
1855  	 * need to drop the csum leaves for this update from our delayed_refs_rsv.
1856  	 */
1857  	if (head->total_ref_mod < 0 && head->is_data) {
1858  		spin_lock(&delayed_refs->lock);
1859  		delayed_refs->pending_csums -= head->num_bytes;
1860  		spin_unlock(&delayed_refs->lock);
1861  		nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1862  	}
1863  
1864  	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1865  }
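/*
 * Final stage of processing a ref head that has no more ref nodes queued:
 * run any pending extent op, re-check under the delayed refs lock that
 * nothing new was added, remove the head from the tree, pin the extent and
 * drop its csums if the reserved insert never happened, and release the
 * reservation accounting.
 */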
1866  
1867  static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1868  			    struct btrfs_delayed_ref_head *head)
1869  {
1870  
1871  	struct btrfs_fs_info *fs_info = trans->fs_info;
1872  	struct btrfs_delayed_ref_root *delayed_refs;
1873  	int ret;
1874  
1875  	delayed_refs = &trans->transaction->delayed_refs;
1876  
1877  	ret = run_and_cleanup_extent_op(trans, head);
1878  	if (ret < 0) {
1879  		unselect_delayed_ref_head(delayed_refs, head);
1880  		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1881  		return ret;
1882  	} else if (ret) {
1883  		return ret;
1884  	}
1885  
1886  	/*
1887  	 * Need to drop our head ref lock and re-acquire the delayed ref lock
1888  	 * and then re-check to make sure nobody got added.
1889  	 */
1890  	spin_unlock(&head->lock);
1891  	spin_lock(&delayed_refs->lock);
1892  	spin_lock(&head->lock);
1893  	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1894  		spin_unlock(&head->lock);
1895  		spin_unlock(&delayed_refs->lock);
1896  		return 1;
1897  	}
1898  	btrfs_delete_ref_head(delayed_refs, head);
1899  	spin_unlock(&head->lock);
1900  	spin_unlock(&delayed_refs->lock);
1901  
1902  	if (head->must_insert_reserved) {
1903  		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1904  		if (head->is_data) {
1905  			struct btrfs_root *csum_root;
1906  
1907  			csum_root = btrfs_csum_root(fs_info, head->bytenr);
1908  			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1909  					      head->num_bytes);
1910  		}
1911  	}
1912  
1913  	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1914  
1915  	trace_run_delayed_ref_head(fs_info, head, 0);
1916  	btrfs_delayed_ref_unlock(head);
1917  	btrfs_put_delayed_ref_head(head);
1918  	return ret;
1919  }
1920  
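/*
 * Pick the next delayed ref head to process and lock it.  Returns NULL when
 * there is nothing left to do, or ERR_PTR(-EAGAIN) if the head went away
 * while we waited for its mutex and the caller should retry.
 */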
1921  static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1922  					struct btrfs_trans_handle *trans)
1923  {
1924  	struct btrfs_delayed_ref_root *delayed_refs =
1925  		&trans->transaction->delayed_refs;
1926  	struct btrfs_delayed_ref_head *head = NULL;
1927  	int ret;
1928  
1929  	spin_lock(&delayed_refs->lock);
1930  	head = btrfs_select_ref_head(delayed_refs);
1931  	if (!head) {
1932  		spin_unlock(&delayed_refs->lock);
1933  		return head;
1934  	}
1935  
1936  	/*
1937  	 * Grab the lock that says we are going to process all the refs for
1938  	 * this head
1939  	 */
1940  	ret = btrfs_delayed_ref_lock(delayed_refs, head);
1941  	spin_unlock(&delayed_refs->lock);
1942  
1943  	/*
1944  	 * We may have dropped the spin lock to get the head mutex lock, and
1945  	 * that might have given someone else time to free the head.  If that's
1946  	 * true, it has been removed from our list and we can move on.
1947  	 */
1948  	if (ret == -EAGAIN)
1949  		head = ERR_PTR(-EAGAIN);
1950  
1951  	return head;
1952  }
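/*
 * Run all ref nodes queued on an already locked head, one at a time,
 * adjusting head->ref_mod as each ref is played.  Returns -EAGAIN if a ref
 * is blocked by a pending tree mod log sequence, in which case the head is
 * unselected and the caller must pick another one.
 */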
1953  
1954  static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1955  					   struct btrfs_delayed_ref_head *locked_ref)
1956  {
1957  	struct btrfs_fs_info *fs_info = trans->fs_info;
1958  	struct btrfs_delayed_ref_root *delayed_refs;
1959  	struct btrfs_delayed_extent_op *extent_op;
1960  	struct btrfs_delayed_ref_node *ref;
1961  	bool must_insert_reserved;
1962  	int ret;
1963  
1964  	delayed_refs = &trans->transaction->delayed_refs;
1965  
1966  	lockdep_assert_held(&locked_ref->mutex);
1967  	lockdep_assert_held(&locked_ref->lock);
1968  
1969  	while ((ref = select_delayed_ref(locked_ref))) {
1970  		if (ref->seq &&
1971  		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
1972  			spin_unlock(&locked_ref->lock);
1973  			unselect_delayed_ref_head(delayed_refs, locked_ref);
1974  			return -EAGAIN;
1975  		}
1976  
1977  		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1978  		RB_CLEAR_NODE(&ref->ref_node);
1979  		if (!list_empty(&ref->add_list))
1980  			list_del(&ref->add_list);
1981  		/*
1982  		 * When we play the delayed ref, also correct the ref_mod on
1983  		 * head
1984  		 */
1985  		switch (ref->action) {
1986  		case BTRFS_ADD_DELAYED_REF:
1987  		case BTRFS_ADD_DELAYED_EXTENT:
1988  			locked_ref->ref_mod -= ref->ref_mod;
1989  			break;
1990  		case BTRFS_DROP_DELAYED_REF:
1991  			locked_ref->ref_mod += ref->ref_mod;
1992  			break;
1993  		default:
1994  			WARN_ON(1);
1995  		}
1996  		atomic_dec(&delayed_refs->num_entries);
1997  
1998  		/*
1999  		 * Record the must_insert_reserved flag before we drop the
2000  		 * spin lock.
2001  		 */
2002  		must_insert_reserved = locked_ref->must_insert_reserved;
2003  		locked_ref->must_insert_reserved = false;
2004  
2005  		extent_op = locked_ref->extent_op;
2006  		locked_ref->extent_op = NULL;
2007  		spin_unlock(&locked_ref->lock);
2008  
2009  		ret = run_one_delayed_ref(trans, ref, extent_op,
2010  					  must_insert_reserved);
2011  
2012  		btrfs_free_delayed_extent_op(extent_op);
2013  		if (ret) {
2014  			unselect_delayed_ref_head(delayed_refs, locked_ref);
2015  			btrfs_put_delayed_ref(ref);
2016  			return ret;
2017  		}
2018  
2019  		btrfs_put_delayed_ref(ref);
2020  		cond_resched();
2021  
2022  		spin_lock(&locked_ref->lock);
2023  		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2024  	}
2025  
2026  	return 0;
2027  }
2028  
2029  /*
2030   * Returns 0 on success or if called with an already aborted transaction.
2031   * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2032   */
2033  static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2034  					     unsigned long nr)
2035  {
2036  	struct btrfs_fs_info *fs_info = trans->fs_info;
2037  	struct btrfs_delayed_ref_root *delayed_refs;
2038  	struct btrfs_delayed_ref_head *locked_ref = NULL;
2039  	int ret;
2040  	unsigned long count = 0;
2041  
2042  	delayed_refs = &trans->transaction->delayed_refs;
2043  	do {
2044  		if (!locked_ref) {
2045  			locked_ref = btrfs_obtain_ref_head(trans);
2046  			if (IS_ERR_OR_NULL(locked_ref)) {
2047  				if (PTR_ERR(locked_ref) == -EAGAIN) {
2048  					continue;
2049  				} else {
2050  					break;
2051  				}
2052  			}
2053  			count++;
2054  		}
2055  		/*
2056  		 * We need to try and merge add/drops of the same ref since we
2057  		 * can run into issues with relocate dropping the implicit ref
2058  		 * and then it being added back again before the drop can
2059  		 * finish.  If we merged anything we need to re-loop so we can
2060  		 * get a good ref.
2061  		 * Or we can get node references of the same type that weren't
2062  		 * merged when created due to bumps in the tree mod seq, and
2063  		 * we need to merge them to prevent adding an inline extent
2064  		 * backref before dropping it (triggering a BUG_ON at
2065  		 * insert_inline_extent_backref()).
2066  		 */
2067  		spin_lock(&locked_ref->lock);
2068  		btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2069  
2070  		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
2071  		if (ret < 0 && ret != -EAGAIN) {
2072  			/*
2073  			 * Error, btrfs_run_delayed_refs_for_head already
2074  			 * unlocked everything so just bail out
2075  			 */
2076  			return ret;
2077  		} else if (!ret) {
2078  			/*
2079  			 * Success, perform the usual cleanup of a processed
2080  			 * head
2081  			 */
2082  			ret = cleanup_ref_head(trans, locked_ref);
2083  			if (ret > 0) {
2084  				/* We dropped our lock, we need to loop. */
2085  				ret = 0;
2086  				continue;
2087  			} else if (ret) {
2088  				return ret;
2089  			}
2090  		}
2091  
2092  		/*
2093  		 * Either success case or btrfs_run_delayed_refs_for_head
2094  		 * returned -EAGAIN, meaning we need to select another head
2095  		 */
2096  
2097  		locked_ref = NULL;
2098  		cond_resched();
2099  	} while ((nr != -1 && count < nr) || locked_ref);
2100  
2101  	return 0;
2102  }
2103  
2104  #ifdef SCRAMBLE_DELAYED_REFS
2105  /*
2106   * Normally delayed refs get processed in ascending bytenr order. This
2107   * correlates in most cases to the order added. To expose dependencies on this
2108   * order, we start to process the tree in the middle instead of the beginning
2109   */
2110  static u64 find_middle(struct rb_root *root)
2111  {
2112  	struct rb_node *n = root->rb_node;
2113  	struct btrfs_delayed_ref_node *entry;
2114  	int alt = 1;
2115  	u64 middle;
2116  	u64 first = 0, last = 0;
2117  
2118  	n = rb_first(root);
2119  	if (n) {
2120  		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2121  		first = entry->bytenr;
2122  	}
2123  	n = rb_last(root);
2124  	if (n) {
2125  		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2126  		last = entry->bytenr;
2127  	}
2128  	n = root->rb_node;
2129  
2130  	while (n) {
2131  		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2132  		WARN_ON(!entry->in_tree);
2133  
2134  		middle = entry->bytenr;
2135  
2136  		if (alt)
2137  			n = n->rb_left;
2138  		else
2139  			n = n->rb_right;
2140  
2141  		alt = 1 - alt;
2142  	}
2143  	return middle;
2144  }
2145  #endif
2146  
2147  /*
2148   * this starts processing the delayed reference count updates and
2149   * extent insertions we have queued up so far.  count can be
2150   * 0, which means to process everything in the tree at the start
2151   * of the run (but not newly added entries), or it can be some target
2152   * number you'd like to process.
2153   *
2154   * Returns 0 on success or if called with an aborted transaction
2155   * Returns <0 on error and aborts the transaction
2156   */
2157  int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2158  			   unsigned long count)
2159  {
2160  	struct btrfs_fs_info *fs_info = trans->fs_info;
2161  	struct rb_node *node;
2162  	struct btrfs_delayed_ref_root *delayed_refs;
2163  	struct btrfs_delayed_ref_head *head;
2164  	int ret;
2165  	int run_all = count == (unsigned long)-1;
2166  
2167  	/* We'll clean this up in btrfs_cleanup_transaction */
2168  	if (TRANS_ABORTED(trans))
2169  		return 0;
2170  
2171  	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2172  		return 0;
2173  
2174  	delayed_refs = &trans->transaction->delayed_refs;
2175  	if (count == 0)
2176  		count = delayed_refs->num_heads_ready;
2177  
2178  again:
2179  #ifdef SCRAMBLE_DELAYED_REFS
2180  	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2181  #endif
2182  	ret = __btrfs_run_delayed_refs(trans, count);
2183  	if (ret < 0) {
2184  		btrfs_abort_transaction(trans, ret);
2185  		return ret;
2186  	}
2187  
2188  	if (run_all) {
2189  		btrfs_create_pending_block_groups(trans);
2190  
2191  		spin_lock(&delayed_refs->lock);
2192  		node = rb_first_cached(&delayed_refs->href_root);
2193  		if (!node) {
2194  			spin_unlock(&delayed_refs->lock);
2195  			goto out;
2196  		}
2197  		head = rb_entry(node, struct btrfs_delayed_ref_head,
2198  				href_node);
2199  		refcount_inc(&head->refs);
2200  		spin_unlock(&delayed_refs->lock);
2201  
2202  		/* Mutex was contended, block until it's released and retry. */
2203  		mutex_lock(&head->mutex);
2204  		mutex_unlock(&head->mutex);
2205  
2206  		btrfs_put_delayed_ref_head(head);
2207  		cond_resched();
2208  		goto again;
2209  	}
2210  out:
2211  	return 0;
2212  }
2213  
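/*
 * Queue a delayed extent op that sets the given flags on the extent item of
 * tree block @eb.  The actual extent tree update happens when the delayed
 * refs are run.
 */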
2214  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2215  				struct extent_buffer *eb, u64 flags)
2216  {
2217  	struct btrfs_delayed_extent_op *extent_op;
2218  	int level = btrfs_header_level(eb);
2219  	int ret;
2220  
2221  	extent_op = btrfs_alloc_delayed_extent_op();
2222  	if (!extent_op)
2223  		return -ENOMEM;
2224  
2225  	extent_op->flags_to_set = flags;
2226  	extent_op->update_flags = true;
2227  	extent_op->update_key = false;
2228  	extent_op->level = level;
2229  
2230  	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2231  	if (ret)
2232  		btrfs_free_delayed_extent_op(extent_op);
2233  	return ret;
2234  }
2235  
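/*
 * Check the current transaction's delayed refs for references to the data
 * extent at @bytenr.  Returns 1 if any queued ref is shared or belongs to a
 * different (root, inode, offset) tuple, 0 if not, and -EAGAIN if the head
 * mutex is contended and the caller should retry.
 */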
2236  static noinline int check_delayed_ref(struct btrfs_root *root,
2237  				      struct btrfs_path *path,
2238  				      u64 objectid, u64 offset, u64 bytenr)
2239  {
2240  	struct btrfs_delayed_ref_head *head;
2241  	struct btrfs_delayed_ref_node *ref;
2242  	struct btrfs_delayed_data_ref *data_ref;
2243  	struct btrfs_delayed_ref_root *delayed_refs;
2244  	struct btrfs_transaction *cur_trans;
2245  	struct rb_node *node;
2246  	int ret = 0;
2247  
2248  	spin_lock(&root->fs_info->trans_lock);
2249  	cur_trans = root->fs_info->running_transaction;
2250  	if (cur_trans)
2251  		refcount_inc(&cur_trans->use_count);
2252  	spin_unlock(&root->fs_info->trans_lock);
2253  	if (!cur_trans)
2254  		return 0;
2255  
2256  	delayed_refs = &cur_trans->delayed_refs;
2257  	spin_lock(&delayed_refs->lock);
2258  	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2259  	if (!head) {
2260  		spin_unlock(&delayed_refs->lock);
2261  		btrfs_put_transaction(cur_trans);
2262  		return 0;
2263  	}
2264  
2265  	if (!mutex_trylock(&head->mutex)) {
2266  		if (path->nowait) {
2267  			spin_unlock(&delayed_refs->lock);
2268  			btrfs_put_transaction(cur_trans);
2269  			return -EAGAIN;
2270  		}
2271  
2272  		refcount_inc(&head->refs);
2273  		spin_unlock(&delayed_refs->lock);
2274  
2275  		btrfs_release_path(path);
2276  
2277  		/*
2278  		 * Mutex was contended, block until it's released and let
2279  		 * caller try again
2280  		 */
2281  		mutex_lock(&head->mutex);
2282  		mutex_unlock(&head->mutex);
2283  		btrfs_put_delayed_ref_head(head);
2284  		btrfs_put_transaction(cur_trans);
2285  		return -EAGAIN;
2286  	}
2287  	spin_unlock(&delayed_refs->lock);
2288  
2289  	spin_lock(&head->lock);
2290  	/*
2291  	 * XXX: We should replace this with a proper search function in the
2292  	 * future.
2293  	 */
2294  	for (node = rb_first_cached(&head->ref_tree); node;
2295  	     node = rb_next(node)) {
2296  		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2297  		/* If it's a shared ref we know a cross reference exists */
2298  		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2299  			ret = 1;
2300  			break;
2301  		}
2302  
2303  		data_ref = btrfs_delayed_node_to_data_ref(ref);
2304  
2305  		/*
2306  		 * If our ref doesn't match the one we're currently looking at
2307  		 * then we have a cross reference.
2308  		 */
2309  		if (data_ref->root != root->root_key.objectid ||
2310  		    data_ref->objectid != objectid ||
2311  		    data_ref->offset != offset) {
2312  			ret = 1;
2313  			break;
2314  		}
2315  	}
2316  	spin_unlock(&head->lock);
2317  	mutex_unlock(&head->mutex);
2318  	btrfs_put_transaction(cur_trans);
2319  	return ret;
2320  }
2321  
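/*
 * Check the committed extent tree for the data extent at @bytenr.  Returns
 * 0 only when the extent item holds a single inline data ref matching
 * (root, objectid, offset); returns 1 when it is (or may be) shared, and
 * -ENOENT if no matching extent item was found.
 */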
2322  static noinline int check_committed_ref(struct btrfs_root *root,
2323  					struct btrfs_path *path,
2324  					u64 objectid, u64 offset, u64 bytenr,
2325  					bool strict)
2326  {
2327  	struct btrfs_fs_info *fs_info = root->fs_info;
2328  	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2329  	struct extent_buffer *leaf;
2330  	struct btrfs_extent_data_ref *ref;
2331  	struct btrfs_extent_inline_ref *iref;
2332  	struct btrfs_extent_item *ei;
2333  	struct btrfs_key key;
2334  	u32 item_size;
2335  	int type;
2336  	int ret;
2337  
2338  	key.objectid = bytenr;
2339  	key.offset = (u64)-1;
2340  	key.type = BTRFS_EXTENT_ITEM_KEY;
2341  
2342  	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2343  	if (ret < 0)
2344  		goto out;
2345  	BUG_ON(ret == 0); /* Corruption */
2346  
2347  	ret = -ENOENT;
2348  	if (path->slots[0] == 0)
2349  		goto out;
2350  
2351  	path->slots[0]--;
2352  	leaf = path->nodes[0];
2353  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2354  
2355  	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2356  		goto out;
2357  
2358  	ret = 1;
2359  	item_size = btrfs_item_size(leaf, path->slots[0]);
2360  	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2361  
2362  	/* If extent item has more than 1 inline ref then it's shared */
2363  	if (item_size != sizeof(*ei) +
2364  	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2365  		goto out;
2366  
2367  	/*
2368  	 * If extent created before last snapshot => it's shared unless the
2369  	 * snapshot has been deleted. Use the heuristic if strict is false.
2370  	 */
2371  	if (!strict &&
2372  	    (btrfs_extent_generation(leaf, ei) <=
2373  	     btrfs_root_last_snapshot(&root->root_item)))
2374  		goto out;
2375  
2376  	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2377  
2378  	/* If this extent has SHARED_DATA_REF then it's shared */
2379  	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2380  	if (type != BTRFS_EXTENT_DATA_REF_KEY)
2381  		goto out;
2382  
2383  	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2384  	if (btrfs_extent_refs(leaf, ei) !=
2385  	    btrfs_extent_data_ref_count(leaf, ref) ||
2386  	    btrfs_extent_data_ref_root(leaf, ref) !=
2387  	    root->root_key.objectid ||
2388  	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2389  	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
2390  		goto out;
2391  
2392  	ret = 0;
2393  out:
2394  	return ret;
2395  }
2396  
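/*
 * Check whether anyone other than (root, objectid, offset) may hold a
 * reference to the data extent at @bytenr, consulting both the committed
 * extent tree and the current transaction's delayed refs.  Returns 0 when
 * the extent is exclusively owned, > 0 when a cross reference exists (or
 * may exist), and < 0 on error.
 */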
2397  int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2398  			  u64 bytenr, bool strict, struct btrfs_path *path)
2399  {
2400  	int ret;
2401  
2402  	do {
2403  		ret = check_committed_ref(root, path, objectid,
2404  					  offset, bytenr, strict);
2405  		if (ret && ret != -ENOENT)
2406  			goto out;
2407  
2408  		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2409  	} while (ret == -EAGAIN && !path->nowait);
2410  
2411  out:
2412  	btrfs_release_path(path);
2413  	if (btrfs_is_data_reloc_root(root))
2414  		WARN_ON(ret > 0);
2415  	return ret;
2416  }
2417  
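/*
 * Walk all items in the given tree block and add (inc == 1) or drop one
 * reference for every data extent and child tree block it points to.  With
 * full_backref set the references are shared ones keyed on this block's
 * bytenr instead of the owning root.
 */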
2418  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2419  			   struct btrfs_root *root,
2420  			   struct extent_buffer *buf,
2421  			   int full_backref, int inc)
2422  {
2423  	struct btrfs_fs_info *fs_info = root->fs_info;
2424  	u64 bytenr;
2425  	u64 num_bytes;
2426  	u64 parent;
2427  	u64 ref_root;
2428  	u32 nritems;
2429  	struct btrfs_key key;
2430  	struct btrfs_file_extent_item *fi;
2431  	struct btrfs_ref generic_ref = { 0 };
2432  	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2433  	int i;
2434  	int action;
2435  	int level;
2436  	int ret = 0;
2437  
2438  	if (btrfs_is_testing(fs_info))
2439  		return 0;
2440  
2441  	ref_root = btrfs_header_owner(buf);
2442  	nritems = btrfs_header_nritems(buf);
2443  	level = btrfs_header_level(buf);
2444  
2445  	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2446  		return 0;
2447  
2448  	if (full_backref)
2449  		parent = buf->start;
2450  	else
2451  		parent = 0;
2452  	if (inc)
2453  		action = BTRFS_ADD_DELAYED_REF;
2454  	else
2455  		action = BTRFS_DROP_DELAYED_REF;
2456  
2457  	for (i = 0; i < nritems; i++) {
2458  		if (level == 0) {
2459  			btrfs_item_key_to_cpu(buf, &key, i);
2460  			if (key.type != BTRFS_EXTENT_DATA_KEY)
2461  				continue;
2462  			fi = btrfs_item_ptr(buf, i,
2463  					    struct btrfs_file_extent_item);
2464  			if (btrfs_file_extent_type(buf, fi) ==
2465  			    BTRFS_FILE_EXTENT_INLINE)
2466  				continue;
2467  			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2468  			if (bytenr == 0)
2469  				continue;
2470  
2471  			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2472  			key.offset -= btrfs_file_extent_offset(buf, fi);
2473  			btrfs_init_generic_ref(&generic_ref, action, bytenr,
2474  					       num_bytes, parent);
2475  			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2476  					    key.offset, root->root_key.objectid,
2477  					    for_reloc);
2478  			if (inc)
2479  				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2480  			else
2481  				ret = btrfs_free_extent(trans, &generic_ref);
2482  			if (ret)
2483  				goto fail;
2484  		} else {
2485  			bytenr = btrfs_node_blockptr(buf, i);
2486  			num_bytes = fs_info->nodesize;
2487  			btrfs_init_generic_ref(&generic_ref, action, bytenr,
2488  					       num_bytes, parent);
2489  			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
2490  					    root->root_key.objectid, for_reloc);
2491  			if (inc)
2492  				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2493  			else
2494  				ret = btrfs_free_extent(trans, &generic_ref);
2495  			if (ret)
2496  				goto fail;
2497  		}
2498  	}
2499  	return 0;
2500  fail:
2501  	return ret;
2502  }
2503  
2504  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2505  		  struct extent_buffer *buf, int full_backref)
2506  {
2507  	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2508  }
2509  
2510  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2511  		  struct extent_buffer *buf, int full_backref)
2512  {
2513  	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2514  }
2515  
2516  static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2517  {
2518  	struct btrfs_fs_info *fs_info = root->fs_info;
2519  	u64 flags;
2520  	u64 ret;
2521  
2522  	if (data)
2523  		flags = BTRFS_BLOCK_GROUP_DATA;
2524  	else if (root == fs_info->chunk_root)
2525  		flags = BTRFS_BLOCK_GROUP_SYSTEM;
2526  	else
2527  		flags = BTRFS_BLOCK_GROUP_METADATA;
2528  
2529  	ret = btrfs_get_alloc_profile(fs_info, flags);
2530  	return ret;
2531  }
2532  
2533  static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2534  {
2535  	struct rb_node *leftmost;
2536  	u64 bytenr = 0;
2537  
2538  	read_lock(&fs_info->block_group_cache_lock);
2539  	/* Get the block group with the lowest logical start address. */
2540  	leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2541  	if (leftmost) {
2542  		struct btrfs_block_group *bg;
2543  
2544  		bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2545  		bytenr = bg->start;
2546  	}
2547  	read_unlock(&fs_info->block_group_cache_lock);
2548  
2549  	return bytenr;
2550  }
2551  
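/*
 * Account @num_bytes at @bytenr as pinned in the block group and its space
 * info (moving it out of the reserved counters when @reserved is set) and
 * mark the range EXTENT_DIRTY in the transaction's pinned_extents tree so
 * it is returned to the free space caches at commit time.
 */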
2552  static int pin_down_extent(struct btrfs_trans_handle *trans,
2553  			   struct btrfs_block_group *cache,
2554  			   u64 bytenr, u64 num_bytes, int reserved)
2555  {
2556  	struct btrfs_fs_info *fs_info = cache->fs_info;
2557  
2558  	spin_lock(&cache->space_info->lock);
2559  	spin_lock(&cache->lock);
2560  	cache->pinned += num_bytes;
2561  	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2562  					     num_bytes);
2563  	if (reserved) {
2564  		cache->reserved -= num_bytes;
2565  		cache->space_info->bytes_reserved -= num_bytes;
2566  	}
2567  	spin_unlock(&cache->lock);
2568  	spin_unlock(&cache->space_info->lock);
2569  
2570  	set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2571  		       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2572  	return 0;
2573  }
2574  
2575  int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2576  		     u64 bytenr, u64 num_bytes, int reserved)
2577  {
2578  	struct btrfs_block_group *cache;
2579  
2580  	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2581  	BUG_ON(!cache); /* Logic error */
2582  
2583  	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2584  
2585  	btrfs_put_block_group(cache);
2586  	return 0;
2587  }
2588  
2589  /*
2590   * this function must be called within a transaction
2591   */
2592  int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2593  				    u64 bytenr, u64 num_bytes)
2594  {
2595  	struct btrfs_block_group *cache;
2596  	int ret;
2597  
2598  	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2599  	if (!cache)
2600  		return -EINVAL;
2601  
2602  	/*
2603  	 * Fully cache the free space first so that our pin removes the free space
2604  	 * from the cache.
2605  	 */
2606  	ret = btrfs_cache_block_group(cache, true);
2607  	if (ret)
2608  		goto out;
2609  
2610  	pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2611  
2612  	/* remove us from the free space cache (if we're there at all) */
2613  	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2614  out:
2615  	btrfs_put_block_group(cache);
2616  	return ret;
2617  }
2618  
2619  static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2620  				   u64 start, u64 num_bytes)
2621  {
2622  	int ret;
2623  	struct btrfs_block_group *block_group;
2624  
2625  	block_group = btrfs_lookup_block_group(fs_info, start);
2626  	if (!block_group)
2627  		return -EINVAL;
2628  
2629  	ret = btrfs_cache_block_group(block_group, true);
2630  	if (ret)
2631  		goto out;
2632  
2633  	ret = btrfs_remove_free_space(block_group, start, num_bytes);
2634  out:
2635  	btrfs_put_block_group(block_group);
2636  	return ret;
2637  }
2638  
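/*
 * Walk the file extent items of a logged tree block and exclude every disk
 * range they reference from the free space cache.  Only needed on
 * MIXED_GROUPS filesystems, where data and metadata share block groups.
 */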
2639  int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2640  {
2641  	struct btrfs_fs_info *fs_info = eb->fs_info;
2642  	struct btrfs_file_extent_item *item;
2643  	struct btrfs_key key;
2644  	int found_type;
2645  	int i;
2646  	int ret = 0;
2647  
2648  	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2649  		return 0;
2650  
2651  	for (i = 0; i < btrfs_header_nritems(eb); i++) {
2652  		btrfs_item_key_to_cpu(eb, &key, i);
2653  		if (key.type != BTRFS_EXTENT_DATA_KEY)
2654  			continue;
2655  		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2656  		found_type = btrfs_file_extent_type(eb, item);
2657  		if (found_type == BTRFS_FILE_EXTENT_INLINE)
2658  			continue;
2659  		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2660  			continue;
2661  		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2662  		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2663  		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2664  		if (ret)
2665  			break;
2666  	}
2667  
2668  	return ret;
2669  }
2670  
2671  static void
2672  btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2673  {
2674  	atomic_inc(&bg->reservations);
2675  }
2676  
2677  /*
2678   * Returns the free cluster for the given space info and sets empty_cluster to
2679   * what it should be based on the mount options.
2680   */
2681  static struct btrfs_free_cluster *
2682  fetch_cluster_info(struct btrfs_fs_info *fs_info,
2683  		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2684  {
2685  	struct btrfs_free_cluster *ret = NULL;
2686  
2687  	*empty_cluster = 0;
2688  	if (btrfs_mixed_space_info(space_info))
2689  		return ret;
2690  
2691  	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2692  		ret = &fs_info->meta_alloc_cluster;
2693  		if (btrfs_test_opt(fs_info, SSD))
2694  			*empty_cluster = SZ_2M;
2695  		else
2696  			*empty_cluster = SZ_64K;
2697  	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2698  		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
2699  		*empty_cluster = SZ_2M;
2700  		ret = &fs_info->data_alloc_cluster;
2701  	}
2702  
2703  	return ret;
2704  }
2705  
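/*
 * Unpin a byte range, walking every block group it overlaps: drop the
 * pinned counters, optionally hand the space back to the free space cache
 * and the global block reserve, and wake any space tickets waiting on the
 * space info.
 */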
2706  static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2707  			      u64 start, u64 end,
2708  			      const bool return_free_space)
2709  {
2710  	struct btrfs_block_group *cache = NULL;
2711  	struct btrfs_space_info *space_info;
2712  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2713  	struct btrfs_free_cluster *cluster = NULL;
2714  	u64 len;
2715  	u64 total_unpinned = 0;
2716  	u64 empty_cluster = 0;
2717  	bool readonly;
2718  
2719  	while (start <= end) {
2720  		readonly = false;
2721  		if (!cache ||
2722  		    start >= cache->start + cache->length) {
2723  			if (cache)
2724  				btrfs_put_block_group(cache);
2725  			total_unpinned = 0;
2726  			cache = btrfs_lookup_block_group(fs_info, start);
2727  			BUG_ON(!cache); /* Logic error */
2728  
2729  			cluster = fetch_cluster_info(fs_info,
2730  						     cache->space_info,
2731  						     &empty_cluster);
2732  			empty_cluster <<= 1;
2733  		}
2734  
2735  		len = cache->start + cache->length - start;
2736  		len = min(len, end + 1 - start);
2737  
2738  		if (return_free_space)
2739  			btrfs_add_free_space(cache, start, len);
2740  
2741  		start += len;
2742  		total_unpinned += len;
2743  		space_info = cache->space_info;
2744  
2745  		/*
2746  		 * If this space cluster has been marked as fragmented and we've
2747  		 * unpinned enough in this block group to potentially allow a
2748  		 * cluster to be created inside of it, go ahead and clear the
2749  		 * fragmented flag.
2750  		 */
2751  		if (cluster && cluster->fragmented &&
2752  		    total_unpinned > empty_cluster) {
2753  			spin_lock(&cluster->lock);
2754  			cluster->fragmented = 0;
2755  			spin_unlock(&cluster->lock);
2756  		}
2757  
2758  		spin_lock(&space_info->lock);
2759  		spin_lock(&cache->lock);
2760  		cache->pinned -= len;
2761  		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2762  		space_info->max_extent_size = 0;
2763  		if (cache->ro) {
2764  			space_info->bytes_readonly += len;
2765  			readonly = true;
2766  		} else if (btrfs_is_zoned(fs_info)) {
2767  			/* Need reset before reusing in a zoned block group */
2768  			btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
2769  								    len);
2770  			readonly = true;
2771  		}
2772  		spin_unlock(&cache->lock);
2773  		if (!readonly && return_free_space &&
2774  		    global_rsv->space_info == space_info) {
2775  			spin_lock(&global_rsv->lock);
2776  			if (!global_rsv->full) {
2777  				u64 to_add = min(len, global_rsv->size -
2778  						      global_rsv->reserved);
2779  
2780  				global_rsv->reserved += to_add;
2781  				btrfs_space_info_update_bytes_may_use(fs_info,
2782  						space_info, to_add);
2783  				if (global_rsv->reserved >= global_rsv->size)
2784  					global_rsv->full = 1;
2785  				len -= to_add;
2786  			}
2787  			spin_unlock(&global_rsv->lock);
2788  		}
2789  		/* Add to any tickets we may have */
2790  		if (!readonly && return_free_space && len)
2791  			btrfs_try_granting_tickets(fs_info, space_info);
2792  		spin_unlock(&space_info->lock);
2793  	}
2794  
2795  	if (cache)
2796  		btrfs_put_block_group(cache);
2797  	return 0;
2798  }
2799  
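/*
 * Called at transaction commit time: unpin every range recorded in the
 * transaction's pinned_extents tree (issuing discards first when
 * DISCARD_SYNC is enabled), then discard and release the block groups that
 * were deleted during this transaction.
 */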
2800  int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2801  {
2802  	struct btrfs_fs_info *fs_info = trans->fs_info;
2803  	struct btrfs_block_group *block_group, *tmp;
2804  	struct list_head *deleted_bgs;
2805  	struct extent_io_tree *unpin;
2806  	u64 start;
2807  	u64 end;
2808  	int ret;
2809  
2810  	unpin = &trans->transaction->pinned_extents;
2811  
2812  	while (!TRANS_ABORTED(trans)) {
2813  		struct extent_state *cached_state = NULL;
2814  
2815  		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2816  		if (!find_first_extent_bit(unpin, 0, &start, &end,
2817  					   EXTENT_DIRTY, &cached_state)) {
2818  			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2819  			break;
2820  		}
2821  
2822  		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2823  			ret = btrfs_discard_extent(fs_info, start,
2824  						   end + 1 - start, NULL);
2825  
2826  		clear_extent_dirty(unpin, start, end, &cached_state);
2827  		unpin_extent_range(fs_info, start, end, true);
2828  		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2829  		free_extent_state(cached_state);
2830  		cond_resched();
2831  	}
2832  
2833  	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2834  		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2835  		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2836  	}
2837  
2838  	/*
2839  	 * Transaction is finished.  We don't need the lock anymore.  We
2840  	 * do need to clean up the block groups in case of a transaction
2841  	 * abort.
2842  	 */
2843  	deleted_bgs = &trans->transaction->deleted_bgs;
2844  	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2845  		u64 trimmed = 0;
2846  
2847  		ret = -EROFS;
2848  		if (!TRANS_ABORTED(trans))
2849  			ret = btrfs_discard_extent(fs_info,
2850  						   block_group->start,
2851  						   block_group->length,
2852  						   &trimmed);
2853  
2854  		list_del_init(&block_group->bg_list);
2855  		btrfs_unfreeze_block_group(block_group);
2856  		btrfs_put_block_group(block_group);
2857  
2858  		if (ret) {
2859  			const char *errstr = btrfs_decode_error(ret);
2860  			btrfs_warn(fs_info,
2861  			   "discard failed while removing blockgroup: errno=%d %s",
2862  				   ret, errstr);
2863  		}
2864  	}
2865  
2866  	return 0;
2867  }
2868  
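/*
 * Accounting done after the last reference to an extent has been dropped
 * and its extent item removed: delete the csums for data extents, add the
 * range back to the free space tree, and update the block group counters.
 */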
2869  static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2870  				     u64 bytenr, u64 num_bytes, bool is_data)
2871  {
2872  	int ret;
2873  
2874  	if (is_data) {
2875  		struct btrfs_root *csum_root;
2876  
2877  		csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2878  		ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2879  		if (ret) {
2880  			btrfs_abort_transaction(trans, ret);
2881  			return ret;
2882  		}
2883  	}
2884  
2885  	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2886  	if (ret) {
2887  		btrfs_abort_transaction(trans, ret);
2888  		return ret;
2889  	}
2890  
2891  	ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2892  	if (ret)
2893  		btrfs_abort_transaction(trans, ret);
2894  
2895  	return ret;
2896  }
2897  
2898  #define abort_and_dump(trans, path, fmt, args...)	\
2899  ({							\
2900  	btrfs_abort_transaction(trans, -EUCLEAN);	\
2901  	btrfs_print_leaf(path->nodes[0]);		\
2902  	btrfs_crit(trans->fs_info, fmt, ##args);	\
2903  })
2904  
2905  /*
2906   * Drop one or more refs of @node.
2907   *
2908   * 1. Locate the extent refs.
2909   *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2910   *    Locate it, then reduce the refs number or remove the ref line completely.
2911   *
2912   * 2. Update the refs count in EXTENT/METADATA_ITEM
2913   *
2914   * Inline backref case:
2915   *
2916   * in extent tree we have:
2917   *
2918   * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2919   *		refs 2 gen 6 flags DATA
2920   *		extent data backref root FS_TREE objectid 258 offset 0 count 1
2921   *		extent data backref root FS_TREE objectid 257 offset 0 count 1
2922   *
2923   * This function gets called with:
2924   *
2925   *    node->bytenr = 13631488
2926   *    node->num_bytes = 1048576
2927   *    root_objectid = FS_TREE
2928   *    owner_objectid = 257
2929   *    owner_offset = 0
2930   *    refs_to_drop = 1
2931   *
2932   * Then we should get something like:
2933   *
2934   * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2935   *		refs 1 gen 6 flags DATA
2936   *		extent data backref root FS_TREE objectid 258 offset 0 count 1
2937   *
2938   * Keyed backref case:
2939   *
2940   * in extent tree we have:
2941   *
2942   *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2943   *		refs 754 gen 6 flags DATA
2944   *	[...]
2945   *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2946   *		extent data backref root FS_TREE objectid 866 offset 0 count 1
2947   *
2948   * This function gets called with:
2949   *
2950   *    node->bytenr = 13631488
2951   *    node->num_bytes = 1048576
2952   *    root_objectid = FS_TREE
2953   *    owner_objectid = 866
2954   *    owner_offset = 0
2955   *    refs_to_drop = 1
2956   *
2957   * Then we should get something like:
2958   *
2959   *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2960   *		refs 753 gen 6 flags DATA
2961   *
2962   * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2963   */
2964  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2965  			       struct btrfs_delayed_ref_node *node, u64 parent,
2966  			       u64 root_objectid, u64 owner_objectid,
2967  			       u64 owner_offset, int refs_to_drop,
2968  			       struct btrfs_delayed_extent_op *extent_op)
2969  {
2970  	struct btrfs_fs_info *info = trans->fs_info;
2971  	struct btrfs_key key;
2972  	struct btrfs_path *path;
2973  	struct btrfs_root *extent_root;
2974  	struct extent_buffer *leaf;
2975  	struct btrfs_extent_item *ei;
2976  	struct btrfs_extent_inline_ref *iref;
2977  	int ret;
2978  	int is_data;
2979  	int extent_slot = 0;
2980  	int found_extent = 0;
2981  	int num_to_del = 1;
2982  	u32 item_size;
2983  	u64 refs;
2984  	u64 bytenr = node->bytenr;
2985  	u64 num_bytes = node->num_bytes;
2986  	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2987  
2988  	extent_root = btrfs_extent_root(info, bytenr);
2989  	ASSERT(extent_root);
2990  
2991  	path = btrfs_alloc_path();
2992  	if (!path)
2993  		return -ENOMEM;
2994  
2995  	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2996  
2997  	if (!is_data && refs_to_drop != 1) {
2998  		btrfs_crit(info,
2999  "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3000  			   node->bytenr, refs_to_drop);
3001  		ret = -EINVAL;
3002  		btrfs_abort_transaction(trans, ret);
3003  		goto out;
3004  	}
3005  
3006  	if (is_data)
3007  		skinny_metadata = false;
3008  
3009  	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3010  				    parent, root_objectid, owner_objectid,
3011  				    owner_offset);
3012  	if (ret == 0) {
3013  		/*
3014  		 * Either the inline backref or the SHARED_DATA_REF/
3015  		 * SHARED_BLOCK_REF is found
3016  		 *
3017  		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
3018  		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
3019  		 */
3020  		extent_slot = path->slots[0];
3021  		while (extent_slot >= 0) {
3022  			btrfs_item_key_to_cpu(path->nodes[0], &key,
3023  					      extent_slot);
3024  			if (key.objectid != bytenr)
3025  				break;
3026  			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3027  			    key.offset == num_bytes) {
3028  				found_extent = 1;
3029  				break;
3030  			}
3031  			if (key.type == BTRFS_METADATA_ITEM_KEY &&
3032  			    key.offset == owner_objectid) {
3033  				found_extent = 1;
3034  				break;
3035  			}
3036  
3037  			/* Quick path didn't find the EXTENT/METADATA_ITEM */
3038  			if (path->slots[0] - extent_slot > 5)
3039  				break;
3040  			extent_slot--;
3041  		}
3042  
3043  		if (!found_extent) {
3044  			if (iref) {
3045  				abort_and_dump(trans, path,
3046  "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3047  					   path->slots[0]);
3048  				ret = -EUCLEAN;
3049  				goto out;
3050  			}
3051  			/* Must be SHARED_* item, remove the backref first */
3052  			ret = remove_extent_backref(trans, extent_root, path,
3053  						    NULL, refs_to_drop, is_data);
3054  			if (ret) {
3055  				btrfs_abort_transaction(trans, ret);
3056  				goto out;
3057  			}
3058  			btrfs_release_path(path);
3059  
3060  			/* Slow path to locate EXTENT/METADATA_ITEM */
3061  			key.objectid = bytenr;
3062  			key.type = BTRFS_EXTENT_ITEM_KEY;
3063  			key.offset = num_bytes;
3064  
3065  			if (!is_data && skinny_metadata) {
3066  				key.type = BTRFS_METADATA_ITEM_KEY;
3067  				key.offset = owner_objectid;
3068  			}
3069  
3070  			ret = btrfs_search_slot(trans, extent_root,
3071  						&key, path, -1, 1);
3072  			if (ret > 0 && skinny_metadata && path->slots[0]) {
3073  				/*
3074  				 * Couldn't find our skinny metadata item,
3075  				 * see if we have ye olde extent item.
3076  				 */
3077  				path->slots[0]--;
3078  				btrfs_item_key_to_cpu(path->nodes[0], &key,
3079  						      path->slots[0]);
3080  				if (key.objectid == bytenr &&
3081  				    key.type == BTRFS_EXTENT_ITEM_KEY &&
3082  				    key.offset == num_bytes)
3083  					ret = 0;
3084  			}
3085  
3086  			if (ret > 0 && skinny_metadata) {
3087  				skinny_metadata = false;
3088  				key.objectid = bytenr;
3089  				key.type = BTRFS_EXTENT_ITEM_KEY;
3090  				key.offset = num_bytes;
3091  				btrfs_release_path(path);
3092  				ret = btrfs_search_slot(trans, extent_root,
3093  							&key, path, -1, 1);
3094  			}
3095  
3096  			if (ret) {
3097  				if (ret > 0)
3098  					btrfs_print_leaf(path->nodes[0]);
3099  				btrfs_err(info,
3100  			"umm, got %d back from search, was looking for %llu, slot %d",
3101  					  ret, bytenr, path->slots[0]);
3102  			}
3103  			if (ret < 0) {
3104  				btrfs_abort_transaction(trans, ret);
3105  				goto out;
3106  			}
3107  			extent_slot = path->slots[0];
3108  		}
3109  	} else if (WARN_ON(ret == -ENOENT)) {
3110  		abort_and_dump(trans, path,
3111  "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3112  			       bytenr, parent, root_objectid, owner_objectid,
3113  			       owner_offset, path->slots[0]);
3114  		goto out;
3115  	} else {
3116  		btrfs_abort_transaction(trans, ret);
3117  		goto out;
3118  	}
3119  
3120  	leaf = path->nodes[0];
3121  	item_size = btrfs_item_size(leaf, extent_slot);
3122  	if (unlikely(item_size < sizeof(*ei))) {
3123  		ret = -EUCLEAN;
3124  		btrfs_err(trans->fs_info,
3125  			  "unexpected extent item size, has %u expect >= %zu",
3126  			  item_size, sizeof(*ei));
3127  		btrfs_abort_transaction(trans, ret);
3128  		goto out;
3129  	}
3130  	ei = btrfs_item_ptr(leaf, extent_slot,
3131  			    struct btrfs_extent_item);
3132  	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3133  	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3134  		struct btrfs_tree_block_info *bi;
3135  
3136  		if (item_size < sizeof(*ei) + sizeof(*bi)) {
3137  			abort_and_dump(trans, path,
3138  "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3139  				       key.objectid, key.type, key.offset,
3140  				       path->slots[0], owner_objectid, item_size,
3141  				       sizeof(*ei) + sizeof(*bi));
3142  			ret = -EUCLEAN;
3143  			goto out;
3144  		}
3145  		bi = (struct btrfs_tree_block_info *)(ei + 1);
3146  		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3147  	}
3148  
3149  	refs = btrfs_extent_refs(leaf, ei);
3150  	if (refs < refs_to_drop) {
3151  		abort_and_dump(trans, path,
3152  		"trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3153  			       refs_to_drop, refs, bytenr, path->slots[0]);
3154  		ret = -EUCLEAN;
3155  		goto out;
3156  	}
3157  	refs -= refs_to_drop;
3158  
3159  	if (refs > 0) {
3160  		if (extent_op)
3161  			__run_delayed_extent_op(extent_op, leaf, ei);
3162  		/*
3163  		 * In the case of an inline back ref, the reference count will
3164  		 * be updated by remove_extent_backref
3165  		 */
3166  		if (iref) {
3167  			if (!found_extent) {
3168  				abort_and_dump(trans, path,
3169  "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3170  					       path->slots[0]);
3171  				ret = -EUCLEAN;
3172  				goto out;
3173  			}
3174  		} else {
3175  			btrfs_set_extent_refs(leaf, ei, refs);
3176  			btrfs_mark_buffer_dirty(trans, leaf);
3177  		}
3178  		if (found_extent) {
3179  			ret = remove_extent_backref(trans, extent_root, path,
3180  						    iref, refs_to_drop, is_data);
3181  			if (ret) {
3182  				btrfs_abort_transaction(trans, ret);
3183  				goto out;
3184  			}
3185  		}
3186  	} else {
3187  		/* In this branch refs == 1 */
3188  		if (found_extent) {
3189  			if (is_data && refs_to_drop !=
3190  			    extent_data_ref_count(path, iref)) {
3191  				abort_and_dump(trans, path,
3192  		"invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3193  					       extent_data_ref_count(path, iref),
3194  					       refs_to_drop, path->slots[0]);
3195  				ret = -EUCLEAN;
3196  				goto out;
3197  			}
3198  			if (iref) {
3199  				if (path->slots[0] != extent_slot) {
3200  					abort_and_dump(trans, path,
3201  "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3202  						       key.objectid, key.type,
3203  						       key.offset, path->slots[0]);
3204  					ret = -EUCLEAN;
3205  					goto out;
3206  				}
3207  			} else {
3208  				/*
3209  				 * No inline ref, we must be at SHARED_* item,
3210  				 * No inline ref, we must be at a SHARED_* item,
3211  				 * and since it's a single ref the layout must be:
3212  				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3213  				 */
3214  				if (path->slots[0] != extent_slot + 1) {
3215  					abort_and_dump(trans, path,
3216  	"invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3217  						       path->slots[0]);
3218  					ret = -EUCLEAN;
3219  					goto out;
3220  				}
3221  				path->slots[0] = extent_slot;
3222  				num_to_del = 2;
3223  			}
3224  		}
3225  
3226  		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3227  				      num_to_del);
3228  		if (ret) {
3229  			btrfs_abort_transaction(trans, ret);
3230  			goto out;
3231  		}
3232  		btrfs_release_path(path);
3233  
3234  		ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
3235  	}
3236  	btrfs_release_path(path);
3237  
3238  out:
3239  	btrfs_free_path(path);
3240  	return ret;
3241  }
3242  
3243  /*
3244   * when we free a block, it is possible (and likely) that we free the last
3245   * delayed ref for that extent as well.  This searches the delayed ref tree for
3246   * a given extent, and if there are no other delayed refs to be processed, it
3247   * removes it from the tree.
3248   */
3249  static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3250  				      u64 bytenr)
3251  {
3252  	struct btrfs_delayed_ref_head *head;
3253  	struct btrfs_delayed_ref_root *delayed_refs;
3254  	int ret = 0;
3255  
3256  	delayed_refs = &trans->transaction->delayed_refs;
3257  	spin_lock(&delayed_refs->lock);
3258  	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3259  	if (!head)
3260  		goto out_delayed_unlock;
3261  
3262  	spin_lock(&head->lock);
3263  	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3264  		goto out;
3265  
3266  	if (cleanup_extent_op(head) != NULL)
3267  		goto out;
3268  
3269  	/*
3270  	 * Waiting for the lock here would deadlock.  If someone else has it
3271  	 * locked, they are already in the process of dropping it anyway.
3272  	 */
3273  	if (!mutex_trylock(&head->mutex))
3274  		goto out;
3275  
3276  	btrfs_delete_ref_head(delayed_refs, head);
3277  	head->processing = false;
3278  
3279  	spin_unlock(&head->lock);
3280  	spin_unlock(&delayed_refs->lock);
3281  
3282  	BUG_ON(head->extent_op);
3283  	if (head->must_insert_reserved)
3284  		ret = 1;
3285  
3286  	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3287  	mutex_unlock(&head->mutex);
3288  	btrfs_put_delayed_ref_head(head);
3289  	return ret;
3290  out:
3291  	spin_unlock(&head->lock);
3292  
3293  out_delayed_unlock:
3294  	spin_unlock(&delayed_refs->lock);
3295  	return 0;
3296  }
3297  
3298  int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3299  			  u64 root_id,
3300  			  struct extent_buffer *buf,
3301  			  u64 parent, int last_ref)
3302  {
3303  	struct btrfs_fs_info *fs_info = trans->fs_info;
3304  	struct btrfs_ref generic_ref = { 0 };
3305  	int ret;
3306  
3307  	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3308  			       buf->start, buf->len, parent);
3309  	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3310  			    root_id, 0, false);
3311  
3312  	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3313  		btrfs_ref_tree_mod(fs_info, &generic_ref);
3314  		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3315  		if (ret < 0)
3316  			return ret;
3317  	}
3318  
3319  	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3320  		struct btrfs_block_group *cache;
3321  		bool must_pin = false;
3322  
3323  		if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3324  			ret = check_ref_cleanup(trans, buf->start);
3325  			if (!ret) {
3326  				btrfs_redirty_list_add(trans->transaction, buf);
3327  				goto out;
3328  			}
3329  		}
3330  
3331  		cache = btrfs_lookup_block_group(fs_info, buf->start);
3332  
3333  		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3334  			pin_down_extent(trans, cache, buf->start, buf->len, 1);
3335  			btrfs_put_block_group(cache);
3336  			goto out;
3337  		}
3338  
3339  		/*
3340  		 * If there are tree mod log users we may have recorded mod log
3341  		 * operations for this node.  If we re-allocate this node we
3342  		 * could replay operations on this node that happened when it
3343  		 * existed in a completely different root.  For example if it
3344  		 * was part of root A, then was reallocated to root B, and we
3345  		 * are doing a btrfs_old_search_slot(root b), we could replay
3346  		 * operations that happened when the block was part of root A,
3347  		 * giving us an inconsistent view of the btree.
3348  		 *
3349  		 * We are safe from races here because at this point no other
3350  		 * node or root points to this extent buffer, so if after this
3351  		 * check a new tree mod log user joins we will not have an
3352  		 * existing log of operations on this node that we have to
3353  		 * contend with.
3354  		 */
3355  		if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
3356  			must_pin = true;
3357  
3358  		if (must_pin || btrfs_is_zoned(fs_info)) {
3359  			btrfs_redirty_list_add(trans->transaction, buf);
3360  			pin_down_extent(trans, cache, buf->start, buf->len, 1);
3361  			btrfs_put_block_group(cache);
3362  			goto out;
3363  		}
3364  
3365  		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3366  
3367  		btrfs_add_free_space(cache, buf->start, buf->len);
3368  		btrfs_free_reserved_bytes(cache, buf->len, 0);
3369  		btrfs_put_block_group(cache);
3370  		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3371  	}
3372  out:
3373  	if (last_ref) {
3374  		/*
3375  		 * Deleting the buffer, clear the corrupt flag since it doesn't
3376  		 * matter anymore.
3377  		 */
3378  		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3379  	}
3380  	return 0;
3381  }
3382  
3383  /* Can return -ENOMEM */
3384  int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3385  {
3386  	struct btrfs_fs_info *fs_info = trans->fs_info;
3387  	int ret;
3388  
3389  	if (btrfs_is_testing(fs_info))
3390  		return 0;
3391  
3392  	/*
3393  	 * tree log blocks never actually go into the extent allocation
3394  	 * tree, just update pinning info and exit early.
3395  	 */
3396  	if ((ref->type == BTRFS_REF_METADATA &&
3397  	     ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3398  	    (ref->type == BTRFS_REF_DATA &&
3399  	     ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
3400  		/* unlocks the pinned mutex */
3401  		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3402  		ret = 0;
3403  	} else if (ref->type == BTRFS_REF_METADATA) {
3404  		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3405  	} else {
3406  		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3407  	}
3408  
3409  	if (!((ref->type == BTRFS_REF_METADATA &&
3410  	       ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3411  	      (ref->type == BTRFS_REF_DATA &&
3412  	       ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
3413  		btrfs_ref_tree_mod(fs_info, ref);
3414  
3415  	return ret;
3416  }
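
/*
 * Editor's note: a minimal sketch (not from the kernel sources) of how a
 * caller might drop a data extent reference through btrfs_free_extent().
 * bytenr, num_bytes, parent, root_objectid, owner and offset are placeholder
 * values; only helpers already used in this file are assumed.
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
 *			       num_bytes, parent);
 *	btrfs_init_data_ref(&ref, root_objectid, owner, offset, 0, false);
 *	ret = btrfs_free_extent(trans, &ref);
 */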
3417  
3418  enum btrfs_loop_type {
3419  	/*
3420  	 * Start caching block groups but do not wait for progress or for them
3421  	 * to be done.
3422  	 */
3423  	LOOP_CACHING_NOWAIT,
3424  
3425  	/*
3426  	 * Wait for the block group free_space >= the space we're waiting for if
3427  	 * the block group isn't cached.
3428  	 */
3429  	LOOP_CACHING_WAIT,
3430  
3431  	/*
3432  	 * Allow allocations to happen from block groups that do not yet have a
3433  	 * size classification.
3434  	 */
3435  	LOOP_UNSET_SIZE_CLASS,
3436  
3437  	/*
3438  	 * Allocate a chunk and then retry the allocation.
3439  	 */
3440  	LOOP_ALLOC_CHUNK,
3441  
3442  	/*
3443  	 * Ignore the size class restrictions for this allocation.
3444  	 */
3445  	LOOP_WRONG_SIZE_CLASS,
3446  
3447  	/*
3448  	 * Ignore the empty size, only try to allocate the number of bytes
3449  	 * needed for this allocation.
3450  	 */
3451  	LOOP_NO_EMPTY_SIZE,
3452  };
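
/*
 * Editor's note: an illustrative summary, inferred from the enum order above
 * and from find_free_extent_update_loop() further below, of how the allocator
 * falls back through these phases:
 *
 *	LOOP_CACHING_NOWAIT -> LOOP_CACHING_WAIT -> LOOP_UNSET_SIZE_CLASS ->
 *	LOOP_ALLOC_CHUNK -> LOOP_WRONG_SIZE_CLASS -> LOOP_NO_EMPTY_SIZE
 *
 * Every raid index is re-scanned at each phase; ffe_ctl->loop is only bumped
 * once all indexes at the current phase are exhausted.
 */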
3453  
3454  static inline void
3455  btrfs_lock_block_group(struct btrfs_block_group *cache,
3456  		       int delalloc)
3457  {
3458  	if (delalloc)
3459  		down_read(&cache->data_rwsem);
3460  }
3461  
3462  static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3463  		       int delalloc)
3464  {
3465  	btrfs_get_block_group(cache);
3466  	if (delalloc)
3467  		down_read(&cache->data_rwsem);
3468  }
3469  
3470  static struct btrfs_block_group *btrfs_lock_cluster(
3471  		   struct btrfs_block_group *block_group,
3472  		   struct btrfs_free_cluster *cluster,
3473  		   int delalloc)
3474  	__acquires(&cluster->refill_lock)
3475  {
3476  	struct btrfs_block_group *used_bg = NULL;
3477  
3478  	spin_lock(&cluster->refill_lock);
3479  	while (1) {
3480  		used_bg = cluster->block_group;
3481  		if (!used_bg)
3482  			return NULL;
3483  
3484  		if (used_bg == block_group)
3485  			return used_bg;
3486  
3487  		btrfs_get_block_group(used_bg);
3488  
3489  		if (!delalloc)
3490  			return used_bg;
3491  
3492  		if (down_read_trylock(&used_bg->data_rwsem))
3493  			return used_bg;
3494  
3495  		spin_unlock(&cluster->refill_lock);
3496  
3497  		/* We should only have one-level nested. */
3498  		/* We should only have one level of nesting. */
3499  
3500  		spin_lock(&cluster->refill_lock);
3501  		if (used_bg == cluster->block_group)
3502  			return used_bg;
3503  
3504  		up_read(&used_bg->data_rwsem);
3505  		btrfs_put_block_group(used_bg);
3506  	}
3507  }
3508  
3509  static inline void
3510  btrfs_release_block_group(struct btrfs_block_group *cache,
3511  			 int delalloc)
3512  {
3513  	if (delalloc)
3514  		up_read(&cache->data_rwsem);
3515  	btrfs_put_block_group(cache);
3516  }
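
/*
 * Editor's note: a hedged sketch of the intended pairing of the helpers
 * above; "bg" and the work in between are placeholders, not code from this
 * file.
 *
 *	btrfs_grab_block_group(bg, delalloc);		takes a ref, and data_rwsem if delalloc
 *	... try to allocate from bg ...
 *	btrfs_release_block_group(bg, delalloc);	drops both again
 */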
3517  
3518  /*
3519   * Helper function for find_free_extent().
3520   *
3521   * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
3522   * Return >0 to inform the caller that we found nothing.
3523   * Return 0 when we have found a location and set ffe_ctl->found_offset.
3524   */
3525  static int find_free_extent_clustered(struct btrfs_block_group *bg,
3526  				      struct find_free_extent_ctl *ffe_ctl,
3527  				      struct btrfs_block_group **cluster_bg_ret)
3528  {
3529  	struct btrfs_block_group *cluster_bg;
3530  	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3531  	u64 aligned_cluster;
3532  	u64 offset;
3533  	int ret;
3534  
3535  	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3536  	if (!cluster_bg)
3537  		goto refill_cluster;
3538  	if (cluster_bg != bg && (cluster_bg->ro ||
3539  	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
3540  		goto release_cluster;
3541  
3542  	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3543  			ffe_ctl->num_bytes, cluster_bg->start,
3544  			&ffe_ctl->max_extent_size);
3545  	if (offset) {
3546  		/* We have a block, we're done */
3547  		spin_unlock(&last_ptr->refill_lock);
3548  		trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3549  		*cluster_bg_ret = cluster_bg;
3550  		ffe_ctl->found_offset = offset;
3551  		return 0;
3552  	}
3553  	WARN_ON(last_ptr->block_group != cluster_bg);
3554  
3555  release_cluster:
3556  	/*
3557  	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3558  	 * let's just skip it and let the allocator find whatever block it can
3559  	 * find. If we reach this point, we will have tried the cluster
3560  	 * allocator plenty of times and not have found anything, so we are
3561  	 * likely way too fragmented for the clustering stuff to find anything.
3562  	 *
3563  	 * However, if the cluster is taken from the current block group,
3564  	 * release the cluster first, so that we stand a better chance of
3565  	 * succeeding in the unclustered allocation.
3566  	 */
3567  	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3568  		spin_unlock(&last_ptr->refill_lock);
3569  		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3570  		return -ENOENT;
3571  	}
3572  
3573  	/* This cluster didn't work out, free it and start over */
3574  	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3575  
3576  	if (cluster_bg != bg)
3577  		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3578  
3579  refill_cluster:
3580  	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3581  		spin_unlock(&last_ptr->refill_lock);
3582  		return -ENOENT;
3583  	}
3584  
3585  	aligned_cluster = max_t(u64,
3586  			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3587  			bg->full_stripe_len);
3588  	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3589  			ffe_ctl->num_bytes, aligned_cluster);
3590  	if (ret == 0) {
3591  		/* Now pull our allocation out of this cluster */
3592  		offset = btrfs_alloc_from_cluster(bg, last_ptr,
3593  				ffe_ctl->num_bytes, ffe_ctl->search_start,
3594  				&ffe_ctl->max_extent_size);
3595  		if (offset) {
3596  			/* We found one, proceed */
3597  			spin_unlock(&last_ptr->refill_lock);
3598  			ffe_ctl->found_offset = offset;
3599  			trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3600  			return 0;
3601  		}
3602  	}
3603  	/*
3604  	 * At this point we either didn't find a cluster or we weren't able to
3605  	 * allocate a block from our cluster.  Free the cluster we've been
3606  	 * trying to use, and go to the next block group.
3607  	 */
3608  	btrfs_return_cluster_to_free_space(NULL, last_ptr);
3609  	spin_unlock(&last_ptr->refill_lock);
3610  	return 1;
3611  }
3612  
3613  /*
3614   * Return >0 to inform the caller that we found nothing.
3615   * Return 0 when we found a free extent and set ffe_ctl->found_offset.
3616   */
3617  static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3618  					struct find_free_extent_ctl *ffe_ctl)
3619  {
3620  	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3621  	u64 offset;
3622  
3623  	/*
3624  	 * We are doing an unclustered allocation, set the fragmented flag so
3625  	 * we don't bother trying to setup a cluster again until we get more
3626  	 * space.
3627  	 */
3628  	if (unlikely(last_ptr)) {
3629  		spin_lock(&last_ptr->lock);
3630  		last_ptr->fragmented = 1;
3631  		spin_unlock(&last_ptr->lock);
3632  	}
3633  	if (ffe_ctl->cached) {
3634  		struct btrfs_free_space_ctl *free_space_ctl;
3635  
3636  		free_space_ctl = bg->free_space_ctl;
3637  		spin_lock(&free_space_ctl->tree_lock);
3638  		if (free_space_ctl->free_space <
3639  		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3640  		    ffe_ctl->empty_size) {
3641  			ffe_ctl->total_free_space = max_t(u64,
3642  					ffe_ctl->total_free_space,
3643  					free_space_ctl->free_space);
3644  			spin_unlock(&free_space_ctl->tree_lock);
3645  			return 1;
3646  		}
3647  		spin_unlock(&free_space_ctl->tree_lock);
3648  	}
3649  
3650  	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3651  			ffe_ctl->num_bytes, ffe_ctl->empty_size,
3652  			&ffe_ctl->max_extent_size);
3653  	if (!offset)
3654  		return 1;
3655  	ffe_ctl->found_offset = offset;
3656  	return 0;
3657  }
3658  
3659  static int do_allocation_clustered(struct btrfs_block_group *block_group,
3660  				   struct find_free_extent_ctl *ffe_ctl,
3661  				   struct btrfs_block_group **bg_ret)
3662  {
3663  	int ret;
3664  
3665  	/* We want to try and use the cluster allocator, so let's look there */
3666  	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3667  		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3668  		if (ret >= 0)
3669  			return ret;
3670  		/* ret == -ENOENT case falls through */
3671  	}
3672  
3673  	return find_free_extent_unclustered(block_group, ffe_ctl);
3674  }
3675  
3676  /*
3677   * Tree-log block group locking
3678   * ============================
3679   *
3680   * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3681   * indicates the starting address of a block group, which is reserved only
3682   * for tree-log metadata.
3683   *
3684   * Lock nesting
3685   * ============
3686   *
3687   * space_info::lock
3688   *   block_group::lock
3689   *     fs_info::treelog_bg_lock
3690   */
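
/*
 * Editor's note: a minimal sketch of the nesting order documented above, as
 * taken in do_allocation_zoned() below; only the locks named there are used.
 *
 *	spin_lock(&space_info->lock);
 *	spin_lock(&block_group->lock);
 *	spin_lock(&fs_info->treelog_bg_lock);
 *	spin_lock(&fs_info->relocation_bg_lock);
 *	...
 *	unlock in the reverse order
 */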
3691  
3692  /*
3693   * Simple allocator for a sequential-only block group. It only allows sequential
3694   * allocation. No need to play with trees. This function also reserves the
3695   * bytes as in btrfs_add_reserved_bytes.
3696   */
3697  static int do_allocation_zoned(struct btrfs_block_group *block_group,
3698  			       struct find_free_extent_ctl *ffe_ctl,
3699  			       struct btrfs_block_group **bg_ret)
3700  {
3701  	struct btrfs_fs_info *fs_info = block_group->fs_info;
3702  	struct btrfs_space_info *space_info = block_group->space_info;
3703  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3704  	u64 start = block_group->start;
3705  	u64 num_bytes = ffe_ctl->num_bytes;
3706  	u64 avail;
3707  	u64 bytenr = block_group->start;
3708  	u64 log_bytenr;
3709  	u64 data_reloc_bytenr;
3710  	int ret = 0;
3711  	bool skip = false;
3712  
3713  	ASSERT(btrfs_is_zoned(block_group->fs_info));
3714  
3715  	/*
3716  	 * Do not allow non-tree-log blocks in the dedicated tree-log block
3717  	 * group, and vice versa.
3718  	 */
3719  	spin_lock(&fs_info->treelog_bg_lock);
3720  	log_bytenr = fs_info->treelog_bg;
3721  	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3722  			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3723  		skip = true;
3724  	spin_unlock(&fs_info->treelog_bg_lock);
3725  	if (skip)
3726  		return 1;
3727  
3728  	/*
3729  	 * Do not allow non-relocation blocks in the dedicated relocation block
3730  	 * group, and vice versa.
3731  	 */
3732  	spin_lock(&fs_info->relocation_bg_lock);
3733  	data_reloc_bytenr = fs_info->data_reloc_bg;
3734  	if (data_reloc_bytenr &&
3735  	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3736  	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3737  		skip = true;
3738  	spin_unlock(&fs_info->relocation_bg_lock);
3739  	if (skip)
3740  		return 1;
3741  
3742  	/* Check RO and no space case before trying to activate it */
3743  	spin_lock(&block_group->lock);
3744  	if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3745  		ret = 1;
3746  		/*
3747  		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3748  		 * Return the error after taking the locks.
3749  		 */
3750  	}
3751  	spin_unlock(&block_group->lock);
3752  
3753  	/* Metadata block group is activated at write time. */
3754  	if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3755  	    !btrfs_zone_activate(block_group)) {
3756  		ret = 1;
3757  		/*
3758  		 * May need to clear fs_info->{treelog,data_reloc}_bg.
3759  		 * Return the error after taking the locks.
3760  		 */
3761  	}
3762  
3763  	spin_lock(&space_info->lock);
3764  	spin_lock(&block_group->lock);
3765  	spin_lock(&fs_info->treelog_bg_lock);
3766  	spin_lock(&fs_info->relocation_bg_lock);
3767  
3768  	if (ret)
3769  		goto out;
3770  
3771  	ASSERT(!ffe_ctl->for_treelog ||
3772  	       block_group->start == fs_info->treelog_bg ||
3773  	       fs_info->treelog_bg == 0);
3774  	ASSERT(!ffe_ctl->for_data_reloc ||
3775  	       block_group->start == fs_info->data_reloc_bg ||
3776  	       fs_info->data_reloc_bg == 0);
3777  
3778  	if (block_group->ro ||
3779  	    (!ffe_ctl->for_data_reloc &&
3780  	     test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3781  		ret = 1;
3782  		goto out;
3783  	}
3784  
3785  	/*
3786  	 * Do not allow a block group that is currently in use to become the
3787  	 * dedicated tree-log block group.
3788  	 */
3789  	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3790  	    (block_group->used || block_group->reserved)) {
3791  		ret = 1;
3792  		goto out;
3793  	}
3794  
3795  	/*
3796  	 * Do not allow a block group that is currently in use to become the
3797  	 * dedicated data relocation block group.
3798  	 */
3799  	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3800  	    (block_group->used || block_group->reserved)) {
3801  		ret = 1;
3802  		goto out;
3803  	}
3804  
3805  	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3806  	avail = block_group->zone_capacity - block_group->alloc_offset;
3807  	if (avail < num_bytes) {
3808  		if (ffe_ctl->max_extent_size < avail) {
3809  			/*
3810  			 * With sequential allocator, free space is always
3811  			 * contiguous
3812  			 */
3813  			ffe_ctl->max_extent_size = avail;
3814  			ffe_ctl->total_free_space = avail;
3815  		}
3816  		ret = 1;
3817  		goto out;
3818  	}
3819  
3820  	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3821  		fs_info->treelog_bg = block_group->start;
3822  
3823  	if (ffe_ctl->for_data_reloc) {
3824  		if (!fs_info->data_reloc_bg)
3825  			fs_info->data_reloc_bg = block_group->start;
3826  		/*
3827  		 * Do not allow allocations from this block group, unless it is
3828  		 * for data relocation. Compared to increasing the ->ro, setting
3829  		 * the ->zoned_data_reloc_ongoing flag still allows nocow
3830  		 * writers to come in. See btrfs_inc_nocow_writers().
3831  		 *
3832  		 * We need to disable allocations here to avoid allocating a
3833  		 * regular (non-relocation data) extent. With a mix of relocation
3834  		 * extents and regular extents, we can dispatch WRITE commands
3835  		 * (for relocation extents) and ZONE APPEND commands (for
3836  		 * regular extents) to the same zone at the same time, which
3837  		 * easily breaks the write pointer.
3838  		 *
3839  		 * Also, this flag prevents this block group from being zone finished.
3840  		 */
3841  		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3842  	}
3843  
3844  	ffe_ctl->found_offset = start + block_group->alloc_offset;
3845  	block_group->alloc_offset += num_bytes;
3846  	spin_lock(&ctl->tree_lock);
3847  	ctl->free_space -= num_bytes;
3848  	spin_unlock(&ctl->tree_lock);
3849  
3850  	/*
3851  	 * We do not check if found_offset is aligned to stripesize. The
3852  	 * address is anyway rewritten when using zone append writing.
3853  	 */
3854  
3855  	ffe_ctl->search_start = ffe_ctl->found_offset;
3856  
3857  out:
3858  	if (ret && ffe_ctl->for_treelog)
3859  		fs_info->treelog_bg = 0;
3860  	if (ret && ffe_ctl->for_data_reloc)
3861  		fs_info->data_reloc_bg = 0;
3862  	spin_unlock(&fs_info->relocation_bg_lock);
3863  	spin_unlock(&fs_info->treelog_bg_lock);
3864  	spin_unlock(&block_group->lock);
3865  	spin_unlock(&space_info->lock);
3866  	return ret;
3867  }
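
/*
 * Editor's note: the zoned allocation above boils down to bumping a write
 * pointer; a condensed sketch of the arithmetic, using only names from the
 * function above:
 *
 *	avail = block_group->zone_capacity - block_group->alloc_offset;
 *	if (avail >= num_bytes) {
 *		ffe_ctl->found_offset = block_group->start +
 *					block_group->alloc_offset;
 *		block_group->alloc_offset += num_bytes;
 *	}
 */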
3868  
3869  static int do_allocation(struct btrfs_block_group *block_group,
3870  			 struct find_free_extent_ctl *ffe_ctl,
3871  			 struct btrfs_block_group **bg_ret)
3872  {
3873  	switch (ffe_ctl->policy) {
3874  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3875  		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3876  	case BTRFS_EXTENT_ALLOC_ZONED:
3877  		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
3878  	default:
3879  		BUG();
3880  	}
3881  }
3882  
3883  static void release_block_group(struct btrfs_block_group *block_group,
3884  				struct find_free_extent_ctl *ffe_ctl,
3885  				int delalloc)
3886  {
3887  	switch (ffe_ctl->policy) {
3888  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3889  		ffe_ctl->retry_uncached = false;
3890  		break;
3891  	case BTRFS_EXTENT_ALLOC_ZONED:
3892  		/* Nothing to do */
3893  		break;
3894  	default:
3895  		BUG();
3896  	}
3897  
3898  	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3899  	       ffe_ctl->index);
3900  	btrfs_release_block_group(block_group, delalloc);
3901  }
3902  
3903  static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3904  				   struct btrfs_key *ins)
3905  {
3906  	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3907  
3908  	if (!ffe_ctl->use_cluster && last_ptr) {
3909  		spin_lock(&last_ptr->lock);
3910  		last_ptr->window_start = ins->objectid;
3911  		spin_unlock(&last_ptr->lock);
3912  	}
3913  }
3914  
3915  static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3916  			 struct btrfs_key *ins)
3917  {
3918  	switch (ffe_ctl->policy) {
3919  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3920  		found_extent_clustered(ffe_ctl, ins);
3921  		break;
3922  	case BTRFS_EXTENT_ALLOC_ZONED:
3923  		/* Nothing to do */
3924  		break;
3925  	default:
3926  		BUG();
3927  	}
3928  }
3929  
3930  static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
3931  				    struct find_free_extent_ctl *ffe_ctl)
3932  {
3933  	/* Block group's activeness is not a requirement for METADATA block groups. */
3934  	if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
3935  		return 0;
3936  
3937  	/* If we can activate new zone, just allocate a chunk and use it */
3938  	if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
3939  		return 0;
3940  
3941  	/*
3942  	 * We already reached the max active zones. Try to finish one block
3943  	 * group to make room for a new block group. This is only possible
3944  	 * for a data block group because btrfs_zone_finish() may need to wait
3945  	 * for a running transaction which can cause a deadlock for metadata
3946  	 * allocation.
3947  	 */
3948  	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
3949  		int ret = btrfs_zone_finish_one_bg(fs_info);
3950  
3951  		if (ret == 1)
3952  			return 0;
3953  		else if (ret < 0)
3954  			return ret;
3955  	}
3956  
3957  	/*
3958  	 * If we have enough free space left in an already active block group
3959  	 * and we can't activate any other zone now, do not allow allocating a
3960  	 * new chunk and let find_free_extent() retry with a smaller size.
3961  	 */
3962  	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
3963  		return -ENOSPC;
3964  
3965  	/*
3966  	 * Not even min_alloc_size is left in any block group. Since we cannot
3967  	 * activate a new block group, allocating one may not help. Let's tell
3968  	 * the caller to try again and hope it makes progress by writing out
3969  	 * some parts of the region. That is only possible for data block
3970  	 * groups, where a part of the region can be written.
3971  	 */
3972  	if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
3973  		return -EAGAIN;
3974  
3975  	/*
3976  	 * We cannot activate a new block group and there is not enough space
3977  	 * left in any block group. Allocating a new block group may not help,
3978  	 * but there is nothing else to do anyway, so let's go with it.
3979  	 */
3980  	return 0;
3981  }
3982  
3983  static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
3984  			      struct find_free_extent_ctl *ffe_ctl)
3985  {
3986  	switch (ffe_ctl->policy) {
3987  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
3988  		return 0;
3989  	case BTRFS_EXTENT_ALLOC_ZONED:
3990  		return can_allocate_chunk_zoned(fs_info, ffe_ctl);
3991  	default:
3992  		BUG();
3993  	}
3994  }
3995  
3996  /*
3997   * Return >0 means caller needs to re-search for free extent
3998   * Return 0 means we have the needed free extent.
3999   * Return <0 means we failed to locate any free extent.
4000   */
4001  static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
4002  					struct btrfs_key *ins,
4003  					struct find_free_extent_ctl *ffe_ctl,
4004  					bool full_search)
4005  {
4006  	struct btrfs_root *root = fs_info->chunk_root;
4007  	int ret;
4008  
4009  	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
4010  	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
4011  		ffe_ctl->orig_have_caching_bg = true;
4012  
4013  	if (ins->objectid) {
4014  		found_extent(ffe_ctl, ins);
4015  		return 0;
4016  	}
4017  
4018  	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4019  		return 1;
4020  
4021  	ffe_ctl->index++;
4022  	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4023  		return 1;
4024  
4025  	/* See the comments for btrfs_loop_type for an explanation of the phases. */
4026  	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4027  		ffe_ctl->index = 0;
4028  		/*
4029  		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
4030  		 * any uncached bgs and we've already done a full search
4031  		 * through.
4032  		 */
4033  		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4034  		    (!ffe_ctl->orig_have_caching_bg && full_search))
4035  			ffe_ctl->loop++;
4036  		ffe_ctl->loop++;
4037  
4038  		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4039  			struct btrfs_trans_handle *trans;
4040  			int exist = 0;
4041  
4042  			/* Check if allocation policy allows to create a new chunk */
4043  			ret = can_allocate_chunk(fs_info, ffe_ctl);
4044  			if (ret)
4045  				return ret;
4046  
4047  			trans = current->journal_info;
4048  			if (trans)
4049  				exist = 1;
4050  			else
4051  				trans = btrfs_join_transaction(root);
4052  
4053  			if (IS_ERR(trans)) {
4054  				ret = PTR_ERR(trans);
4055  				return ret;
4056  			}
4057  
4058  			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4059  						CHUNK_ALLOC_FORCE_FOR_EXTENT);
4060  
4061  			/* Do not bail out on ENOSPC since we can do more. */
4062  			if (ret == -ENOSPC) {
4063  				ret = 0;
4064  				ffe_ctl->loop++;
4065  			}
4066  			else if (ret < 0)
4067  				btrfs_abort_transaction(trans, ret);
4068  			else
4069  				ret = 0;
4070  			if (!exist)
4071  				btrfs_end_transaction(trans);
4072  			if (ret)
4073  				return ret;
4074  		}
4075  
4076  		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4077  			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4078  				return -ENOSPC;
4079  
4080  			/*
4081  			 * Don't loop again if we already have no empty_size and
4082  			 * no empty_cluster.
4083  			 */
4084  			if (ffe_ctl->empty_size == 0 &&
4085  			    ffe_ctl->empty_cluster == 0)
4086  				return -ENOSPC;
4087  			ffe_ctl->empty_size = 0;
4088  			ffe_ctl->empty_cluster = 0;
4089  		}
4090  		return 1;
4091  	}
4092  	return -ENOSPC;
4093  }
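
/*
 * Editor's note: a hedged sketch of how find_free_extent() below consumes the
 * return convention documented above (compare the call near the end of its
 * main loop):
 *
 *	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
 *	if (ret > 0)
 *		goto search;		re-scan the block groups
 *	ret == 0: ins is filled in; ret < 0: hard error such as -ENOSPC
 */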
4094  
4095  static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4096  					      struct btrfs_block_group *bg)
4097  {
4098  	if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4099  		return true;
4100  	if (!btrfs_block_group_should_use_size_class(bg))
4101  		return true;
4102  	if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4103  		return true;
4104  	if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4105  	    bg->size_class == BTRFS_BG_SZ_NONE)
4106  		return true;
4107  	return ffe_ctl->size_class == bg->size_class;
4108  }
4109  
4110  static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4111  					struct find_free_extent_ctl *ffe_ctl,
4112  					struct btrfs_space_info *space_info,
4113  					struct btrfs_key *ins)
4114  {
4115  	/*
4116  	 * If our free space is heavily fragmented we may not be able to make
4117  	 * big contiguous allocations, so instead of doing the expensive search
4118  	 * for free space, simply return ENOSPC with our max_extent_size so we
4119  	 * can go ahead and search for a more manageable chunk.
4120  	 *
4121  	 * If our max_extent_size is large enough for our allocation simply
4122  	 * disable clustering since we will likely not be able to find enough
4123  	 * space to create a cluster and induce latency trying.
4124  	 */
4125  	if (space_info->max_extent_size) {
4126  		spin_lock(&space_info->lock);
4127  		if (space_info->max_extent_size &&
4128  		    ffe_ctl->num_bytes > space_info->max_extent_size) {
4129  			ins->offset = space_info->max_extent_size;
4130  			spin_unlock(&space_info->lock);
4131  			return -ENOSPC;
4132  		} else if (space_info->max_extent_size) {
4133  			ffe_ctl->use_cluster = false;
4134  		}
4135  		spin_unlock(&space_info->lock);
4136  	}
4137  
4138  	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4139  					       &ffe_ctl->empty_cluster);
4140  	if (ffe_ctl->last_ptr) {
4141  		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4142  
4143  		spin_lock(&last_ptr->lock);
4144  		if (last_ptr->block_group)
4145  			ffe_ctl->hint_byte = last_ptr->window_start;
4146  		if (last_ptr->fragmented) {
4147  			/*
4148  			 * We still set window_start so we can keep track of the
4149  			 * last place we found an allocation to try and save
4150  			 * some time.
4151  			 */
4152  			ffe_ctl->hint_byte = last_ptr->window_start;
4153  			ffe_ctl->use_cluster = false;
4154  		}
4155  		spin_unlock(&last_ptr->lock);
4156  	}
4157  
4158  	return 0;
4159  }
4160  
4161  static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
4162  				    struct find_free_extent_ctl *ffe_ctl)
4163  {
4164  	if (ffe_ctl->for_treelog) {
4165  		spin_lock(&fs_info->treelog_bg_lock);
4166  		if (fs_info->treelog_bg)
4167  			ffe_ctl->hint_byte = fs_info->treelog_bg;
4168  		spin_unlock(&fs_info->treelog_bg_lock);
4169  	} else if (ffe_ctl->for_data_reloc) {
4170  		spin_lock(&fs_info->relocation_bg_lock);
4171  		if (fs_info->data_reloc_bg)
4172  			ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4173  		spin_unlock(&fs_info->relocation_bg_lock);
4174  	} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
4175  		struct btrfs_block_group *block_group;
4176  
4177  		spin_lock(&fs_info->zone_active_bgs_lock);
4178  		list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
4179  			/*
4180  			 * No lock is OK here because avail is monotonically
4181  			 * decreasing, and this is just a hint.
4182  			 */
4183  			u64 avail = block_group->zone_capacity - block_group->alloc_offset;
4184  
4185  			if (block_group_bits(block_group, ffe_ctl->flags) &&
4186  			    avail >= ffe_ctl->num_bytes) {
4187  				ffe_ctl->hint_byte = block_group->start;
4188  				break;
4189  			}
4190  		}
4191  		spin_unlock(&fs_info->zone_active_bgs_lock);
4192  	}
4193  
4194  	return 0;
4195  }
4196  
4197  static int prepare_allocation(struct btrfs_fs_info *fs_info,
4198  			      struct find_free_extent_ctl *ffe_ctl,
4199  			      struct btrfs_space_info *space_info,
4200  			      struct btrfs_key *ins)
4201  {
4202  	switch (ffe_ctl->policy) {
4203  	case BTRFS_EXTENT_ALLOC_CLUSTERED:
4204  		return prepare_allocation_clustered(fs_info, ffe_ctl,
4205  						    space_info, ins);
4206  	case BTRFS_EXTENT_ALLOC_ZONED:
4207  		return prepare_allocation_zoned(fs_info, ffe_ctl);
4208  	default:
4209  		BUG();
4210  	}
4211  }
4212  
4213  /*
4214   * walks the btree of allocated extents and finds a hole of a given size.
4215   * The key ins is changed to record the hole:
4216   * ins->objectid == start position
4217   * ins->flags = BTRFS_EXTENT_ITEM_KEY
4218   * ins->offset == the size of the hole.
4219   * Any available blocks before search_start are skipped.
4220   *
4221   * If there is no suitable free space, we will record the max size of
4222   * the free space extent currently.
4223   *
4224   * The overall logic and call chain:
4225   *
4226   * find_free_extent()
4227   * |- Iterate through all block groups
4228   * |  |- Get a valid block group
4229   * |  |- Try to do clustered allocation in that block group
4230   * |  |- Try to do unclustered allocation in that block group
4231   * |  |- Check if the result is valid
4232   * |  |  |- If valid, then exit
4233   * |  |- Jump to next block group
4234   * |
4235   * |- Push harder to find free extents
4236   *    |- If not found, re-iterate all block groups
4237   */
4238  static noinline int find_free_extent(struct btrfs_root *root,
4239  				     struct btrfs_key *ins,
4240  				     struct find_free_extent_ctl *ffe_ctl)
4241  {
4242  	struct btrfs_fs_info *fs_info = root->fs_info;
4243  	int ret = 0;
4244  	int cache_block_group_error = 0;
4245  	struct btrfs_block_group *block_group = NULL;
4246  	struct btrfs_space_info *space_info;
4247  	bool full_search = false;
4248  
4249  	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4250  
4251  	ffe_ctl->search_start = 0;
4252  	/* For clustered allocation */
4253  	ffe_ctl->empty_cluster = 0;
4254  	ffe_ctl->last_ptr = NULL;
4255  	ffe_ctl->use_cluster = true;
4256  	ffe_ctl->have_caching_bg = false;
4257  	ffe_ctl->orig_have_caching_bg = false;
4258  	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4259  	ffe_ctl->loop = 0;
4260  	ffe_ctl->retry_uncached = false;
4261  	ffe_ctl->cached = 0;
4262  	ffe_ctl->max_extent_size = 0;
4263  	ffe_ctl->total_free_space = 0;
4264  	ffe_ctl->found_offset = 0;
4265  	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4266  	ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4267  
4268  	if (btrfs_is_zoned(fs_info))
4269  		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4270  
4271  	ins->type = BTRFS_EXTENT_ITEM_KEY;
4272  	ins->objectid = 0;
4273  	ins->offset = 0;
4274  
4275  	trace_find_free_extent(root, ffe_ctl);
4276  
4277  	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4278  	if (!space_info) {
4279  		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4280  		return -ENOSPC;
4281  	}
4282  
4283  	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4284  	if (ret < 0)
4285  		return ret;
4286  
4287  	ffe_ctl->search_start = max(ffe_ctl->search_start,
4288  				    first_logical_byte(fs_info));
4289  	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4290  	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4291  		block_group = btrfs_lookup_block_group(fs_info,
4292  						       ffe_ctl->search_start);
4293  		/*
4294  		 * we don't want to use the block group if it doesn't match our
4295  		 * allocation bits, or if it's not cached.
4296  		 *
4297  		 * However if we are re-searching with an ideal block group
4298  		 * picked out then we don't care that the block group is cached.
4299  		 */
4300  		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4301  		    block_group->cached != BTRFS_CACHE_NO) {
4302  			down_read(&space_info->groups_sem);
4303  			if (list_empty(&block_group->list) ||
4304  			    block_group->ro) {
4305  				/*
4306  				 * someone is removing this block group,
4307  				 * we can't jump into the have_block_group
4308  				 * target because our list pointers are not
4309  				 * valid
4310  				 */
4311  				btrfs_put_block_group(block_group);
4312  				up_read(&space_info->groups_sem);
4313  			} else {
4314  				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4315  							block_group->flags);
4316  				btrfs_lock_block_group(block_group,
4317  						       ffe_ctl->delalloc);
4318  				ffe_ctl->hinted = true;
4319  				goto have_block_group;
4320  			}
4321  		} else if (block_group) {
4322  			btrfs_put_block_group(block_group);
4323  		}
4324  	}
4325  search:
4326  	trace_find_free_extent_search_loop(root, ffe_ctl);
4327  	ffe_ctl->have_caching_bg = false;
4328  	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4329  	    ffe_ctl->index == 0)
4330  		full_search = true;
4331  	down_read(&space_info->groups_sem);
4332  	list_for_each_entry(block_group,
4333  			    &space_info->block_groups[ffe_ctl->index], list) {
4334  		struct btrfs_block_group *bg_ret;
4335  
4336  		ffe_ctl->hinted = false;
4337  		/* If the block group is read-only, we can skip it entirely. */
4338  		if (unlikely(block_group->ro)) {
4339  			if (ffe_ctl->for_treelog)
4340  				btrfs_clear_treelog_bg(block_group);
4341  			if (ffe_ctl->for_data_reloc)
4342  				btrfs_clear_data_reloc_bg(block_group);
4343  			continue;
4344  		}
4345  
4346  		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4347  		ffe_ctl->search_start = block_group->start;
4348  
4349  		/*
4350  		 * this can happen if we end up cycling through all the
4351  		 * raid types, but we want to make sure we only allocate
4352  		 * for the proper type.
4353  		 */
4354  		if (!block_group_bits(block_group, ffe_ctl->flags)) {
4355  			u64 extra = BTRFS_BLOCK_GROUP_DUP |
4356  				BTRFS_BLOCK_GROUP_RAID1_MASK |
4357  				BTRFS_BLOCK_GROUP_RAID56_MASK |
4358  				BTRFS_BLOCK_GROUP_RAID10;
4359  
4360  			/*
4361  			 * if they asked for extra copies and this block group
4362  			 * doesn't provide them, bail.  This does allow us to
4363  			 * fill raid0 from raid1.
4364  			 */
4365  			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4366  				goto loop;
4367  
4368  			/*
4369  			 * This block group has different flags than we want.
4370  			 * It's possible that we have MIXED_GROUP flag but no
4371  			 * block group is mixed.  Just skip such block group.
4372  			 */
4373  			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4374  			continue;
4375  		}
4376  
4377  have_block_group:
4378  		trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4379  		ffe_ctl->cached = btrfs_block_group_done(block_group);
4380  		if (unlikely(!ffe_ctl->cached)) {
4381  			ffe_ctl->have_caching_bg = true;
4382  			ret = btrfs_cache_block_group(block_group, false);
4383  
4384  			/*
4385  			 * If we get ENOMEM here or something else we want to
4386  			 * try other block groups, because it may not be fatal.
4387  			 * However if we can't find anything else we need to
4388  			 * save our return here so that we return the actual
4389  			 * error that caused problems, not ENOSPC.
4390  			 */
4391  			if (ret < 0) {
4392  				if (!cache_block_group_error)
4393  					cache_block_group_error = ret;
4394  				ret = 0;
4395  				goto loop;
4396  			}
4397  			ret = 0;
4398  		}
4399  
4400  		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4401  			if (!cache_block_group_error)
4402  				cache_block_group_error = -EIO;
4403  			goto loop;
4404  		}
4405  
4406  		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4407  			goto loop;
4408  
4409  		bg_ret = NULL;
4410  		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4411  		if (ret > 0)
4412  			goto loop;
4413  
4414  		if (bg_ret && bg_ret != block_group) {
4415  			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4416  			block_group = bg_ret;
4417  		}
4418  
4419  		/* Checks */
4420  		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4421  						 fs_info->stripesize);
4422  
4423  		/* move on to the next group */
4424  		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4425  		    block_group->start + block_group->length) {
4426  			btrfs_add_free_space_unused(block_group,
4427  					    ffe_ctl->found_offset,
4428  					    ffe_ctl->num_bytes);
4429  			goto loop;
4430  		}
4431  
4432  		if (ffe_ctl->found_offset < ffe_ctl->search_start)
4433  			btrfs_add_free_space_unused(block_group,
4434  					ffe_ctl->found_offset,
4435  					ffe_ctl->search_start - ffe_ctl->found_offset);
4436  
4437  		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4438  					       ffe_ctl->num_bytes,
4439  					       ffe_ctl->delalloc,
4440  					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4441  		if (ret == -EAGAIN) {
4442  			btrfs_add_free_space_unused(block_group,
4443  					ffe_ctl->found_offset,
4444  					ffe_ctl->num_bytes);
4445  			goto loop;
4446  		}
4447  		btrfs_inc_block_group_reservations(block_group);
4448  
4449  		/* we are all good, let's return */
4450  		ins->objectid = ffe_ctl->search_start;
4451  		ins->offset = ffe_ctl->num_bytes;
4452  
4453  		trace_btrfs_reserve_extent(block_group, ffe_ctl);
4454  		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4455  		break;
4456  loop:
4457  		if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4458  		    !ffe_ctl->retry_uncached) {
4459  			ffe_ctl->retry_uncached = true;
4460  			btrfs_wait_block_group_cache_progress(block_group,
4461  						ffe_ctl->num_bytes +
4462  						ffe_ctl->empty_cluster +
4463  						ffe_ctl->empty_size);
4464  			goto have_block_group;
4465  		}
4466  		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4467  		cond_resched();
4468  	}
4469  	up_read(&space_info->groups_sem);
4470  
4471  	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4472  	if (ret > 0)
4473  		goto search;
4474  
4475  	if (ret == -ENOSPC && !cache_block_group_error) {
4476  		/*
4477  		 * Use ffe_ctl->total_free_space as fallback if we can't find
4478  		 * any contiguous hole.
4479  		 */
4480  		if (!ffe_ctl->max_extent_size)
4481  			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4482  		spin_lock(&space_info->lock);
4483  		space_info->max_extent_size = ffe_ctl->max_extent_size;
4484  		spin_unlock(&space_info->lock);
4485  		ins->offset = ffe_ctl->max_extent_size;
4486  	} else if (ret == -ENOSPC) {
4487  		ret = cache_block_group_error;
4488  	}
4489  	return ret;
4490  }
4491  
4492  /*
4493   * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4494   *			  hole that is at least as big as @num_bytes.
4495   *
4496   * @root           -	The root that will contain this extent
4497   *
4498   * @ram_bytes      -	The amount of space in RAM that @num_bytes takes. This
4499   *			is used for accounting purposes. This value differs
4500   *			from @num_bytes only in the case of compressed extents.
4501   *
4502   * @num_bytes      -	Number of bytes to allocate on-disk.
4503   *
4504   * @min_alloc_size -	Indicates the minimum amount of space that the
4505   *			allocator should try to satisfy. In some cases
4506   *			@num_bytes may be larger than what is required and if
4507   *			the filesystem is fragmented then allocation fails.
4508   *			However, the presence of @min_alloc_size gives a
4509   *			chance to try and satisfy the smaller allocation.
4510   *
4511   * @empty_size     -	A hint that you plan on doing more COW. This is the
4512   *			size in bytes the allocator should try to find free
4513   *			next to the block it returns.  This is just a hint and
4514   *			may be ignored by the allocator.
4515   *
4516   * @hint_byte      -	Hint to the allocator to start searching above the byte
4517   *			address passed. It might be ignored.
4518   *
4519   * @ins            -	This key is modified to record the found hole. It will
4520   *			have the following values:
4521   *			ins->objectid == start position
4522   *			ins->flags = BTRFS_EXTENT_ITEM_KEY
4523   *			ins->offset == the size of the hole.
4524   *
4525   * @is_data        -	Boolean flag indicating whether an extent is
4526   *			allocated for data (true) or metadata (false)
4527   *
4528   * @delalloc       -	Boolean flag indicating whether this allocation is for
4529   *			delalloc or not. If 'true' data_rwsem of block groups
4530   *			is going to be acquired.
4531   *
4532   *
4533   * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4534   * case -ENOSPC is returned then @ins->offset will contain the size of the
4535   * largest available hole the allocator managed to find.
4536   */
4537  int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4538  			 u64 num_bytes, u64 min_alloc_size,
4539  			 u64 empty_size, u64 hint_byte,
4540  			 struct btrfs_key *ins, int is_data, int delalloc)
4541  {
4542  	struct btrfs_fs_info *fs_info = root->fs_info;
4543  	struct find_free_extent_ctl ffe_ctl = {};
4544  	bool final_tried = num_bytes == min_alloc_size;
4545  	u64 flags;
4546  	int ret;
4547  	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4548  	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4549  
4550  	flags = get_alloc_profile_by_root(root, is_data);
4551  again:
4552  	WARN_ON(num_bytes < fs_info->sectorsize);
4553  
4554  	ffe_ctl.ram_bytes = ram_bytes;
4555  	ffe_ctl.num_bytes = num_bytes;
4556  	ffe_ctl.min_alloc_size = min_alloc_size;
4557  	ffe_ctl.empty_size = empty_size;
4558  	ffe_ctl.flags = flags;
4559  	ffe_ctl.delalloc = delalloc;
4560  	ffe_ctl.hint_byte = hint_byte;
4561  	ffe_ctl.for_treelog = for_treelog;
4562  	ffe_ctl.for_data_reloc = for_data_reloc;
4563  
4564  	ret = find_free_extent(root, ins, &ffe_ctl);
4565  	if (!ret && !is_data) {
4566  		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4567  	} else if (ret == -ENOSPC) {
4568  		if (!final_tried && ins->offset) {
4569  			num_bytes = min(num_bytes >> 1, ins->offset);
4570  			num_bytes = round_down(num_bytes,
4571  					       fs_info->sectorsize);
4572  			num_bytes = max(num_bytes, min_alloc_size);
4573  			ram_bytes = num_bytes;
4574  			if (num_bytes == min_alloc_size)
4575  				final_tried = true;
4576  			goto again;
4577  		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4578  			struct btrfs_space_info *sinfo;
4579  
4580  			sinfo = btrfs_find_space_info(fs_info, flags);
4581  			btrfs_err(fs_info,
4582  	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4583  				  flags, num_bytes, for_treelog, for_data_reloc);
4584  			if (sinfo)
4585  				btrfs_dump_space_info(fs_info, sinfo,
4586  						      num_bytes, 1);
4587  		}
4588  	}
4589  
4590  	return ret;
4591  }
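
/*
 * Editor's note: a minimal usage sketch of the reserve/free pairing, loosely
 * modeled on the data write path and not copied from this file; all sizes are
 * placeholders.
 *
 *	struct btrfs_key ins = {};
 *
 *	ret = btrfs_reserve_extent(root, ram_bytes, num_bytes, min_alloc_size,
 *				   0, 0, &ins, 1, 1);
 *	if (!ret) {
 *		ins.objectid is the start, ins.offset the allocated length
 *		if (some_later_error)
 *			btrfs_free_reserved_extent(fs_info, ins.objectid,
 *						   ins.offset, 1);
 *	}
 */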
4592  
4593  int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4594  			       u64 start, u64 len, int delalloc)
4595  {
4596  	struct btrfs_block_group *cache;
4597  
4598  	cache = btrfs_lookup_block_group(fs_info, start);
4599  	if (!cache) {
4600  		btrfs_err(fs_info, "Unable to find block group for %llu",
4601  			  start);
4602  		return -ENOSPC;
4603  	}
4604  
4605  	btrfs_add_free_space(cache, start, len);
4606  	btrfs_free_reserved_bytes(cache, len, delalloc);
4607  	trace_btrfs_reserved_extent_free(fs_info, start, len);
4608  
4609  	btrfs_put_block_group(cache);
4610  	return 0;
4611  }
4612  
4613  int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4614  			      u64 len)
4615  {
4616  	struct btrfs_block_group *cache;
4617  	int ret = 0;
4618  
4619  	cache = btrfs_lookup_block_group(trans->fs_info, start);
4620  	if (!cache) {
4621  		btrfs_err(trans->fs_info, "unable to find block group for %llu",
4622  			  start);
4623  		return -ENOSPC;
4624  	}
4625  
4626  	ret = pin_down_extent(trans, cache, start, len, 1);
4627  	btrfs_put_block_group(cache);
4628  	return ret;
4629  }
4630  
4631  static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4632  				 u64 num_bytes)
4633  {
4634  	struct btrfs_fs_info *fs_info = trans->fs_info;
4635  	int ret;
4636  
4637  	ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4638  	if (ret)
4639  		return ret;
4640  
4641  	ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4642  	if (ret) {
4643  		ASSERT(!ret);
4644  		btrfs_err(fs_info, "update block group failed for %llu %llu",
4645  			  bytenr, num_bytes);
4646  		return ret;
4647  	}
4648  
4649  	trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4650  	return 0;
4651  }
4652  
4653  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4654  				      u64 parent, u64 root_objectid,
4655  				      u64 flags, u64 owner, u64 offset,
4656  				      struct btrfs_key *ins, int ref_mod)
4657  {
4658  	struct btrfs_fs_info *fs_info = trans->fs_info;
4659  	struct btrfs_root *extent_root;
4660  	int ret;
4661  	struct btrfs_extent_item *extent_item;
4662  	struct btrfs_extent_inline_ref *iref;
4663  	struct btrfs_path *path;
4664  	struct extent_buffer *leaf;
4665  	int type;
4666  	u32 size;
4667  
4668  	if (parent > 0)
4669  		type = BTRFS_SHARED_DATA_REF_KEY;
4670  	else
4671  		type = BTRFS_EXTENT_DATA_REF_KEY;
4672  
4673  	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4674  
4675  	path = btrfs_alloc_path();
4676  	if (!path)
4677  		return -ENOMEM;
4678  
4679  	extent_root = btrfs_extent_root(fs_info, ins->objectid);
4680  	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4681  	if (ret) {
4682  		btrfs_free_path(path);
4683  		return ret;
4684  	}
4685  
4686  	leaf = path->nodes[0];
4687  	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4688  				     struct btrfs_extent_item);
4689  	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4690  	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4691  	btrfs_set_extent_flags(leaf, extent_item,
4692  			       flags | BTRFS_EXTENT_FLAG_DATA);
4693  
4694  	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4695  	btrfs_set_extent_inline_ref_type(leaf, iref, type);
4696  	if (parent > 0) {
4697  		struct btrfs_shared_data_ref *ref;
4698  		ref = (struct btrfs_shared_data_ref *)(iref + 1);
4699  		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4700  		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4701  	} else {
4702  		struct btrfs_extent_data_ref *ref;
4703  		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4704  		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4705  		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4706  		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4707  		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4708  	}
4709  
4710  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
4711  	btrfs_free_path(path);
4712  
4713  	return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4714  }
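
/*
 * Editor's note: the item built above has the following layout, as can be
 * read off the insertion code (sizes via btrfs_extent_inline_ref_size()):
 *
 *	no parent:  [ btrfs_extent_item | inline ref type | btrfs_extent_data_ref ]
 *	parent > 0: [ btrfs_extent_item | inline ref type | btrfs_shared_data_ref ]
 */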
4715  
4716  static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4717  				     struct btrfs_delayed_ref_node *node,
4718  				     struct btrfs_delayed_extent_op *extent_op)
4719  {
4720  	struct btrfs_fs_info *fs_info = trans->fs_info;
4721  	struct btrfs_root *extent_root;
4722  	int ret;
4723  	struct btrfs_extent_item *extent_item;
4724  	struct btrfs_key extent_key;
4725  	struct btrfs_tree_block_info *block_info;
4726  	struct btrfs_extent_inline_ref *iref;
4727  	struct btrfs_path *path;
4728  	struct extent_buffer *leaf;
4729  	struct btrfs_delayed_tree_ref *ref;
4730  	u32 size = sizeof(*extent_item) + sizeof(*iref);
4731  	u64 flags = extent_op->flags_to_set;
4732  	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4733  
4734  	ref = btrfs_delayed_node_to_tree_ref(node);
4735  
4736  	extent_key.objectid = node->bytenr;
4737  	if (skinny_metadata) {
4738  		extent_key.offset = ref->level;
4739  		extent_key.type = BTRFS_METADATA_ITEM_KEY;
4740  	} else {
4741  		extent_key.offset = node->num_bytes;
4742  		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4743  		size += sizeof(*block_info);
4744  	}
4745  
4746  	path = btrfs_alloc_path();
4747  	if (!path)
4748  		return -ENOMEM;
4749  
4750  	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4751  	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4752  				      size);
4753  	if (ret) {
4754  		btrfs_free_path(path);
4755  		return ret;
4756  	}
4757  
4758  	leaf = path->nodes[0];
4759  	extent_item = btrfs_item_ptr(leaf, path->slots[0],
4760  				     struct btrfs_extent_item);
4761  	btrfs_set_extent_refs(leaf, extent_item, 1);
4762  	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4763  	btrfs_set_extent_flags(leaf, extent_item,
4764  			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4765  
4766  	if (skinny_metadata) {
4767  		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4768  	} else {
4769  		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4770  		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4771  		btrfs_set_tree_block_level(leaf, block_info, ref->level);
4772  		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4773  	}
4774  
4775  	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4776  		btrfs_set_extent_inline_ref_type(leaf, iref,
4777  						 BTRFS_SHARED_BLOCK_REF_KEY);
4778  		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4779  	} else {
4780  		btrfs_set_extent_inline_ref_type(leaf, iref,
4781  						 BTRFS_TREE_BLOCK_REF_KEY);
4782  		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4783  	}
4784  
4785  	btrfs_mark_buffer_dirty(trans, leaf);
4786  	btrfs_free_path(path);
4787  
4788  	return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4789  }
4790  
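/*
 * Queue a delayed ref for a freshly allocated, reserved data extent.
 * The extent item itself is inserted later, when the delayed refs are
 * run and alloc_reserved_file_extent() is called for this ref.
 */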
4791  int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4792  				     struct btrfs_root *root, u64 owner,
4793  				     u64 offset, u64 ram_bytes,
4794  				     struct btrfs_key *ins)
4795  {
4796  	struct btrfs_ref generic_ref = { 0 };
4797  
4798  	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4799  
4800  	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4801  			       ins->objectid, ins->offset, 0);
4802  	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
4803  			    offset, 0, false);
4804  	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4805  
4806  	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4807  }
4808  
4809  /*
4810   * This is used by the tree logging recovery code.  It records that
4811   * an extent has been allocated and makes sure to clear the free
4812   * space cache bits as well.
4813   */
4814  int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4815  				   u64 root_objectid, u64 owner, u64 offset,
4816  				   struct btrfs_key *ins)
4817  {
4818  	struct btrfs_fs_info *fs_info = trans->fs_info;
4819  	int ret;
4820  	struct btrfs_block_group *block_group;
4821  	struct btrfs_space_info *space_info;
4822  
4823  	/*
4824  	 * Mixed block groups have their extents excluded before the log is
4825  	 * processed, so we only need to do the exclude dance if this fs isn't mixed.
4826  	 */
4827  	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4828  		ret = __exclude_logged_extent(fs_info, ins->objectid,
4829  					      ins->offset);
4830  		if (ret)
4831  			return ret;
4832  	}
4833  
4834  	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4835  	if (!block_group)
4836  		return -EINVAL;
4837  
4838  	space_info = block_group->space_info;
4839  	spin_lock(&space_info->lock);
4840  	spin_lock(&block_group->lock);
4841  	space_info->bytes_reserved += ins->offset;
4842  	block_group->reserved += ins->offset;
4843  	spin_unlock(&block_group->lock);
4844  	spin_unlock(&space_info->lock);
4845  
4846  	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4847  					 offset, ins, 1);
4848  	if (ret)
4849  		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4850  	btrfs_put_block_group(block_group);
4851  	return ret;
4852  }
4853  
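/*
 * Set up a newly allocated tree block: find or create the extent
 * buffer, lock it, clear stale state and the dirty bit, initialize the
 * header and mark the buffer dirty in the right io tree (log trees use
 * dirty_log_pages, everything else uses the transaction's dirty_pages).
 */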
4854  static struct extent_buffer *
4855  btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4856  		      u64 bytenr, int level, u64 owner,
4857  		      enum btrfs_lock_nesting nest)
4858  {
4859  	struct btrfs_fs_info *fs_info = root->fs_info;
4860  	struct extent_buffer *buf;
4861  	u64 lockdep_owner = owner;
4862  
4863  	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
4864  	if (IS_ERR(buf))
4865  		return buf;
4866  
4867  	/*
4868  	 * Extra safety check in case the extent tree is corrupted and extent
4869  	 * allocator chooses to use a tree block which is already used and
4870  	 * locked.
4871  	 */
4872  	if (buf->lock_owner == current->pid) {
4873  		btrfs_err_rl(fs_info,
4874  "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4875  			buf->start, btrfs_header_owner(buf), current->pid);
4876  		free_extent_buffer(buf);
4877  		return ERR_PTR(-EUCLEAN);
4878  	}
4879  
4880  	/*
4881  	 * The reloc trees are just snapshots, so we need them to appear to be
4882  	 * just like any other fs tree WRT lockdep.
4883  	 *
4884  	 * The exception however is in replace_path() in relocation, where we
4885  	 * hold the lock on the original fs root and then search for the reloc
4886  	 * root.  At that point we need to make sure any reloc root buffers are
4887  	 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
4888  	 * lockdep happy.
4889  	 */
4890  	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
4891  	    !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
4892  		lockdep_owner = BTRFS_FS_TREE_OBJECTID;
4893  
4894  	/* btrfs_clear_buffer_dirty() accesses generation field. */
4895  	btrfs_set_header_generation(buf, trans->transid);
4896  
4897  	/*
4898  	 * This needs to stay, because we could allocate a freed block from an
4899  	 * old tree into a new tree, so we need to make sure this new block is
4900  	 * set to the appropriate level and owner.
4901  	 */
4902  	btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
4903  
4904  	__btrfs_tree_lock(buf, nest);
4905  	btrfs_clear_buffer_dirty(trans, buf);
4906  	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4907  	clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
4908  
4909  	set_extent_buffer_uptodate(buf);
4910  
4911  	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4912  	btrfs_set_header_level(buf, level);
4913  	btrfs_set_header_bytenr(buf, buf->start);
4914  	btrfs_set_header_generation(buf, trans->transid);
4915  	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4916  	btrfs_set_header_owner(buf, owner);
4917  	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4918  	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4919  	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4920  		buf->log_index = root->log_transid % 2;
4921  		/*
4922  		 * We allow two log transactions at a time, so use different
4923  		 * EXTENT bits to differentiate dirty pages.
4924  		 */
4925  		if (buf->log_index == 0)
4926  			set_extent_bit(&root->dirty_log_pages, buf->start,
4927  				       buf->start + buf->len - 1,
4928  				       EXTENT_DIRTY, NULL);
4929  		else
4930  			set_extent_bit(&root->dirty_log_pages, buf->start,
4931  				       buf->start + buf->len - 1,
4932  				       EXTENT_NEW, NULL);
4933  	} else {
4934  		buf->log_index = -1;
4935  		set_extent_bit(&trans->transaction->dirty_pages, buf->start,
4936  			       buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
4937  	}
4938  	/* this returns a buffer locked for blocking */
4939  	return buf;
4940  }
4941  
4942  /*
4943   * Finds a free extent and does all the dirty work required for allocation.
4944   * Returns the tree buffer or an ERR_PTR on error.
4945   */
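/*
 * Rough flow: reserve metadata space from the block rsv, reserve a
 * nodesize extent, initialize the new buffer, and (except for log
 * trees) queue a delayed tree ref carrying the extent op so the extent
 * item is inserted when delayed refs are run.
 */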
4946  struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4947  					     struct btrfs_root *root,
4948  					     u64 parent, u64 root_objectid,
4949  					     const struct btrfs_disk_key *key,
4950  					     int level, u64 hint,
4951  					     u64 empty_size,
4952  					     enum btrfs_lock_nesting nest)
4953  {
4954  	struct btrfs_fs_info *fs_info = root->fs_info;
4955  	struct btrfs_key ins;
4956  	struct btrfs_block_rsv *block_rsv;
4957  	struct extent_buffer *buf;
4958  	struct btrfs_delayed_extent_op *extent_op;
4959  	struct btrfs_ref generic_ref = { 0 };
4960  	u64 flags = 0;
4961  	int ret;
4962  	u32 blocksize = fs_info->nodesize;
4963  	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4964  
4965  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4966  	if (btrfs_is_testing(fs_info)) {
4967  		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4968  					    level, root_objectid, nest);
4969  		if (!IS_ERR(buf))
4970  			root->alloc_bytenr += blocksize;
4971  		return buf;
4972  	}
4973  #endif
4974  
4975  	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4976  	if (IS_ERR(block_rsv))
4977  		return ERR_CAST(block_rsv);
4978  
4979  	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4980  				   empty_size, hint, &ins, 0, 0);
4981  	if (ret)
4982  		goto out_unuse;
4983  
4984  	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4985  				    root_objectid, nest);
4986  	if (IS_ERR(buf)) {
4987  		ret = PTR_ERR(buf);
4988  		goto out_free_reserved;
4989  	}
4990  
4991  	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4992  		if (parent == 0)
4993  			parent = ins.objectid;
4994  		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4995  	} else
4996  		BUG_ON(parent > 0);
4997  
4998  	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4999  		extent_op = btrfs_alloc_delayed_extent_op();
5000  		if (!extent_op) {
5001  			ret = -ENOMEM;
5002  			goto out_free_buf;
5003  		}
5004  		if (key)
5005  			memcpy(&extent_op->key, key, sizeof(extent_op->key));
5006  		else
5007  			memset(&extent_op->key, 0, sizeof(extent_op->key));
5008  		extent_op->flags_to_set = flags;
5009  		extent_op->update_key = skinny_metadata ? false : true;
5010  		extent_op->update_flags = true;
5011  		extent_op->level = level;
5012  
5013  		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
5014  				       ins.objectid, ins.offset, parent);
5015  		btrfs_init_tree_ref(&generic_ref, level, root_objectid,
5016  				    root->root_key.objectid, false);
5017  		btrfs_ref_tree_mod(fs_info, &generic_ref);
5018  		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
5019  		if (ret)
5020  			goto out_free_delayed;
5021  	}
5022  	return buf;
5023  
5024  out_free_delayed:
5025  	btrfs_free_delayed_extent_op(extent_op);
5026  out_free_buf:
5027  	btrfs_tree_unlock(buf);
5028  	free_extent_buffer(buf);
5029  out_free_reserved:
5030  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
5031  out_unuse:
5032  	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
5033  	return ERR_PTR(ret);
5034  }
5035  
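/*
 * State shared by the snapshot/subtree drop walkers below.  refs[] and
 * flags[] cache the extent refcount and flags for each level of the
 * path, drop_progress/drop_level record how far the drop has gotten so
 * it can be resumed later, and stage is either DROP_REFERENCE or
 * UPDATE_BACKREF (defined below).
 */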
5036  struct walk_control {
5037  	u64 refs[BTRFS_MAX_LEVEL];
5038  	u64 flags[BTRFS_MAX_LEVEL];
5039  	struct btrfs_key update_progress;
5040  	struct btrfs_key drop_progress;
5041  	int drop_level;
5042  	int stage;
5043  	int level;
5044  	int shared_level;
5045  	int update_ref;
5046  	int keep_locks;
5047  	int reada_slot;
5048  	int reada_count;
5049  	int restarted;
5050  };
5051  
5052  #define DROP_REFERENCE	1
5053  #define UPDATE_BACKREF	2
5054  
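/*
 * Issue readahead for the child blocks we are about to walk down into.
 * reada_count adapts based on how far the walk has advanced since the
 * last readahead pass, bounded by one full node worth of pointers.
 */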
5055  static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5056  				     struct btrfs_root *root,
5057  				     struct walk_control *wc,
5058  				     struct btrfs_path *path)
5059  {
5060  	struct btrfs_fs_info *fs_info = root->fs_info;
5061  	u64 bytenr;
5062  	u64 generation;
5063  	u64 refs;
5064  	u64 flags;
5065  	u32 nritems;
5066  	struct btrfs_key key;
5067  	struct extent_buffer *eb;
5068  	int ret;
5069  	int slot;
5070  	int nread = 0;
5071  
5072  	if (path->slots[wc->level] < wc->reada_slot) {
5073  		wc->reada_count = wc->reada_count * 2 / 3;
5074  		wc->reada_count = max(wc->reada_count, 2);
5075  	} else {
5076  		wc->reada_count = wc->reada_count * 3 / 2;
5077  		wc->reada_count = min_t(int, wc->reada_count,
5078  					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5079  	}
5080  
5081  	eb = path->nodes[wc->level];
5082  	nritems = btrfs_header_nritems(eb);
5083  
5084  	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5085  		if (nread >= wc->reada_count)
5086  			break;
5087  
5088  		cond_resched();
5089  		bytenr = btrfs_node_blockptr(eb, slot);
5090  		generation = btrfs_node_ptr_generation(eb, slot);
5091  
5092  		if (slot == path->slots[wc->level])
5093  			goto reada;
5094  
5095  		if (wc->stage == UPDATE_BACKREF &&
5096  		    generation <= root->root_key.offset)
5097  			continue;
5098  
5099  		/* We don't lock the tree block, it's OK to be racy here */
5100  		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5101  					       wc->level - 1, 1, &refs,
5102  					       &flags);
5103  		/* We don't care about errors in readahead. */
5104  		if (ret < 0)
5105  			continue;
5106  
5107  		/*
5108  		 * This could be racy; it's conceivable that we raced and ended
5109  		 * up with a bogus refs count.  If that's the case just skip it;
5110  		 * if we are actually corrupt we will notice when we look
5111  		 * everything up again with our locks held.
5112  		 */
5113  		if (refs == 0)
5114  			continue;
5115  
5116  		if (wc->stage == DROP_REFERENCE) {
5117  			if (refs == 1)
5118  				goto reada;
5119  
5120  			if (wc->level == 1 &&
5121  			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5122  				continue;
5123  			if (!wc->update_ref ||
5124  			    generation <= root->root_key.offset)
5125  				continue;
5126  			btrfs_node_key_to_cpu(eb, &key, slot);
5127  			ret = btrfs_comp_cpu_keys(&key,
5128  						  &wc->update_progress);
5129  			if (ret < 0)
5130  				continue;
5131  		} else {
5132  			if (wc->level == 1 &&
5133  			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5134  				continue;
5135  		}
5136  reada:
5137  		btrfs_readahead_node_child(eb, slot);
5138  		nread++;
5139  	}
5140  	wc->reada_slot = slot;
5141  }
5142  
5143  /*
5144   * helper to process tree block while walking down the tree.
5145   *
5146   * when wc->stage == UPDATE_BACKREF, this function updates
5147   * back refs for pointers in the block.
5148   *
5149   * NOTE: return value 1 means we should stop walking down.
5150   */
5151  static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5152  				   struct btrfs_root *root,
5153  				   struct btrfs_path *path,
5154  				   struct walk_control *wc, int lookup_info)
5155  {
5156  	struct btrfs_fs_info *fs_info = root->fs_info;
5157  	int level = wc->level;
5158  	struct extent_buffer *eb = path->nodes[level];
5159  	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5160  	int ret;
5161  
5162  	if (wc->stage == UPDATE_BACKREF &&
5163  	    btrfs_header_owner(eb) != root->root_key.objectid)
5164  		return 1;
5165  
5166  	/*
5167  	 * When the reference count of a tree block is 1, it won't increase
5168  	 * again.  Once the full backref flag is set, we never clear it.
5169  	 */
5170  	if (lookup_info &&
5171  	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5172  	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5173  		ASSERT(path->locks[level]);
5174  		ret = btrfs_lookup_extent_info(trans, fs_info,
5175  					       eb->start, level, 1,
5176  					       &wc->refs[level],
5177  					       &wc->flags[level]);
5178  		if (ret)
5179  			return ret;
5180  		if (unlikely(wc->refs[level] == 0)) {
5181  			btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5182  				  eb->start);
5183  			return -EUCLEAN;
5184  		}
5185  	}
5186  
5187  	if (wc->stage == DROP_REFERENCE) {
5188  		if (wc->refs[level] > 1)
5189  			return 1;
5190  
5191  		if (path->locks[level] && !wc->keep_locks) {
5192  			btrfs_tree_unlock_rw(eb, path->locks[level]);
5193  			path->locks[level] = 0;
5194  		}
5195  		return 0;
5196  	}
5197  
5198  	/* wc->stage == UPDATE_BACKREF */
5199  	if (!(wc->flags[level] & flag)) {
5200  		ASSERT(path->locks[level]);
5201  		ret = btrfs_inc_ref(trans, root, eb, 1);
5202  		BUG_ON(ret); /* -ENOMEM */
5203  		ret = btrfs_dec_ref(trans, root, eb, 0);
5204  		BUG_ON(ret); /* -ENOMEM */
5205  		ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5206  		BUG_ON(ret); /* -ENOMEM */
5207  		wc->flags[level] |= flag;
5208  	}
5209  
5210  	/*
5211  	 * the block is shared by multiple trees, so it's not good to
5212  	 * keep the tree lock
5213  	 */
5214  	if (path->locks[level] && level > 0) {
5215  		btrfs_tree_unlock_rw(eb, path->locks[level]);
5216  		path->locks[level] = 0;
5217  	}
5218  	return 0;
5219  }
5220  
5221  /*
5222   * This is used to verify a ref exists for this root to deal with a bug where we
5223   * would have a drop_progress key that hadn't been updated properly.
5224   */
5225  static int check_ref_exists(struct btrfs_trans_handle *trans,
5226  			    struct btrfs_root *root, u64 bytenr, u64 parent,
5227  			    int level)
5228  {
5229  	struct btrfs_path *path;
5230  	struct btrfs_extent_inline_ref *iref;
5231  	int ret;
5232  
5233  	path = btrfs_alloc_path();
5234  	if (!path)
5235  		return -ENOMEM;
5236  
5237  	ret = lookup_extent_backref(trans, path, &iref, bytenr,
5238  				    root->fs_info->nodesize, parent,
5239  				    root->root_key.objectid, level, 0);
5240  	btrfs_free_path(path);
5241  	if (ret == -ENOENT)
5242  		return 0;
5243  	if (ret < 0)
5244  		return ret;
5245  	return 1;
5246  }
5247  
5248  /*
5249   * helper to process tree block pointer.
5250   *
5251   * When wc->stage == DROP_REFERENCE, this function checks the
5252   * reference count of the block pointed to.  If the block is
5253   * shared and we need to update back refs for the subtree rooted
5254   * at the block, this function changes wc->stage to
5255   * UPDATE_BACKREF.  If the block is shared and there is no need
5256   * to update backrefs, this function drops the reference to the
5257   * block.
5258   *
5259   * NOTE: return value 1 means we should stop walking down.
5260   */
5261  static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5262  				 struct btrfs_root *root,
5263  				 struct btrfs_path *path,
5264  				 struct walk_control *wc, int *lookup_info)
5265  {
5266  	struct btrfs_fs_info *fs_info = root->fs_info;
5267  	u64 bytenr;
5268  	u64 generation;
5269  	u64 parent;
5270  	struct btrfs_tree_parent_check check = { 0 };
5271  	struct btrfs_key key;
5272  	struct btrfs_ref ref = { 0 };
5273  	struct extent_buffer *next;
5274  	int level = wc->level;
5275  	int reada = 0;
5276  	int ret = 0;
5277  	bool need_account = false;
5278  
5279  	generation = btrfs_node_ptr_generation(path->nodes[level],
5280  					       path->slots[level]);
5281  	/*
5282  	 * if the lower level block was created before the snapshot
5283  	 * was created, we know there is no need to update back refs
5284  	 * for the subtree
5285  	 */
5286  	if (wc->stage == UPDATE_BACKREF &&
5287  	    generation <= root->root_key.offset) {
5288  		*lookup_info = 1;
5289  		return 1;
5290  	}
5291  
5292  	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5293  
5294  	check.level = level - 1;
5295  	check.transid = generation;
5296  	check.owner_root = root->root_key.objectid;
5297  	check.has_first_key = true;
5298  	btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
5299  			      path->slots[level]);
5300  
5301  	next = find_extent_buffer(fs_info, bytenr);
5302  	if (!next) {
5303  		next = btrfs_find_create_tree_block(fs_info, bytenr,
5304  				root->root_key.objectid, level - 1);
5305  		if (IS_ERR(next))
5306  			return PTR_ERR(next);
5307  		reada = 1;
5308  	}
5309  	btrfs_tree_lock(next);
5310  
5311  	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5312  				       &wc->refs[level - 1],
5313  				       &wc->flags[level - 1]);
5314  	if (ret < 0)
5315  		goto out_unlock;
5316  
5317  	if (unlikely(wc->refs[level - 1] == 0)) {
5318  		btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5319  			  bytenr);
5320  		ret = -EUCLEAN;
5321  		goto out_unlock;
5322  	}
5323  	*lookup_info = 0;
5324  
5325  	if (wc->stage == DROP_REFERENCE) {
5326  		if (wc->refs[level - 1] > 1) {
5327  			need_account = true;
5328  			if (level == 1 &&
5329  			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5330  				goto skip;
5331  
5332  			if (!wc->update_ref ||
5333  			    generation <= root->root_key.offset)
5334  				goto skip;
5335  
5336  			btrfs_node_key_to_cpu(path->nodes[level], &key,
5337  					      path->slots[level]);
5338  			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5339  			if (ret < 0)
5340  				goto skip;
5341  
5342  			wc->stage = UPDATE_BACKREF;
5343  			wc->shared_level = level - 1;
5344  		}
5345  	} else {
5346  		if (level == 1 &&
5347  		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5348  			goto skip;
5349  	}
5350  
5351  	if (!btrfs_buffer_uptodate(next, generation, 0)) {
5352  		btrfs_tree_unlock(next);
5353  		free_extent_buffer(next);
5354  		next = NULL;
5355  		*lookup_info = 1;
5356  	}
5357  
5358  	if (!next) {
5359  		if (reada && level == 1)
5360  			reada_walk_down(trans, root, wc, path);
5361  		next = read_tree_block(fs_info, bytenr, &check);
5362  		if (IS_ERR(next)) {
5363  			return PTR_ERR(next);
5364  		} else if (!extent_buffer_uptodate(next)) {
5365  			free_extent_buffer(next);
5366  			return -EIO;
5367  		}
5368  		btrfs_tree_lock(next);
5369  	}
5370  
5371  	level--;
5372  	ASSERT(level == btrfs_header_level(next));
5373  	if (level != btrfs_header_level(next)) {
5374  		btrfs_err(root->fs_info, "mismatched level");
5375  		ret = -EIO;
5376  		goto out_unlock;
5377  	}
5378  	path->nodes[level] = next;
5379  	path->slots[level] = 0;
5380  	path->locks[level] = BTRFS_WRITE_LOCK;
5381  	wc->level = level;
5382  	if (wc->level == 1)
5383  		wc->reada_slot = 0;
5384  	return 0;
5385  skip:
5386  	wc->refs[level - 1] = 0;
5387  	wc->flags[level - 1] = 0;
5388  	if (wc->stage == DROP_REFERENCE) {
5389  		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5390  			parent = path->nodes[level]->start;
5391  		} else {
5392  			ASSERT(root->root_key.objectid ==
5393  			       btrfs_header_owner(path->nodes[level]));
5394  			if (root->root_key.objectid !=
5395  			    btrfs_header_owner(path->nodes[level])) {
5396  				btrfs_err(root->fs_info,
5397  						"mismatched block owner");
5398  				ret = -EIO;
5399  				goto out_unlock;
5400  			}
5401  			parent = 0;
5402  		}
5403  
5404  		/*
5405  		 * If we had a drop_progress we need to verify the refs are set
5406  		 * as expected.  If we find our ref then we know that from here
5407  		 * on out everything should be correct, and we can clear the
5408  		 * ->restarted flag.
5409  		 */
5410  		if (wc->restarted) {
5411  			ret = check_ref_exists(trans, root, bytenr, parent,
5412  					       level - 1);
5413  			if (ret < 0)
5414  				goto out_unlock;
5415  			if (ret == 0)
5416  				goto no_delete;
5417  			ret = 0;
5418  			wc->restarted = 0;
5419  		}
5420  
5421  		/*
5422  		 * Reloc tree doesn't contribute to qgroup numbers, and we have
5423  		 * already accounted them at merge time (replace_path),
5424  		 * thus we could skip expensive subtree trace here.
5425  		 */
5426  		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5427  		    need_account) {
5428  			ret = btrfs_qgroup_trace_subtree(trans, next,
5429  							 generation, level - 1);
5430  			if (ret) {
5431  				btrfs_err_rl(fs_info,
5432  					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5433  					     ret);
5434  			}
5435  		}
5436  
5437  		/*
5438  		 * We need to update the next key in our walk control so we can
5439  		 * update the drop_progress key accordingly.  We don't care if
5440  		 * find_next_key doesn't find a key because that means we're at
5441  		 * the end and are going to clean up now.
5442  		 */
5443  		wc->drop_level = level;
5444  		find_next_key(path, level, &wc->drop_progress);
5445  
5446  		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5447  				       fs_info->nodesize, parent);
5448  		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
5449  				    0, false);
5450  		ret = btrfs_free_extent(trans, &ref);
5451  		if (ret)
5452  			goto out_unlock;
5453  	}
5454  no_delete:
5455  	*lookup_info = 1;
5456  	ret = 1;
5457  
5458  out_unlock:
5459  	btrfs_tree_unlock(next);
5460  	free_extent_buffer(next);
5461  
5462  	return ret;
5463  }
5464  
5465  /*
5466   * helper to process tree block while walking up the tree.
5467   *
5468   * When wc->stage == DROP_REFERENCE, this function drops the
5469   * reference count on the block.
5470   *
5471   * When wc->stage == UPDATE_BACKREF, this function changes
5472   * wc->stage back to DROP_REFERENCE if we changed wc->stage
5473   * to UPDATE_BACKREF previously while processing the block.
5474   *
5475   * NOTE: return value 1 means we should stop walking up.
5476   */
5477  static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5478  				 struct btrfs_root *root,
5479  				 struct btrfs_path *path,
5480  				 struct walk_control *wc)
5481  {
5482  	struct btrfs_fs_info *fs_info = root->fs_info;
5483  	int ret = 0;
5484  	int level = wc->level;
5485  	struct extent_buffer *eb = path->nodes[level];
5486  	u64 parent = 0;
5487  
5488  	if (wc->stage == UPDATE_BACKREF) {
5489  		BUG_ON(wc->shared_level < level);
5490  		if (level < wc->shared_level)
5491  			goto out;
5492  
5493  		ret = find_next_key(path, level + 1, &wc->update_progress);
5494  		if (ret > 0)
5495  			wc->update_ref = 0;
5496  
5497  		wc->stage = DROP_REFERENCE;
5498  		wc->shared_level = -1;
5499  		path->slots[level] = 0;
5500  
5501  		/*
5502  		 * check reference count again if the block isn't locked.
5503  		 * we should start walking down the tree again if reference
5504  		 * count is one.
5505  		 */
5506  		if (!path->locks[level]) {
5507  			BUG_ON(level == 0);
5508  			btrfs_tree_lock(eb);
5509  			path->locks[level] = BTRFS_WRITE_LOCK;
5510  
5511  			ret = btrfs_lookup_extent_info(trans, fs_info,
5512  						       eb->start, level, 1,
5513  						       &wc->refs[level],
5514  						       &wc->flags[level]);
5515  			if (ret < 0) {
5516  				btrfs_tree_unlock_rw(eb, path->locks[level]);
5517  				path->locks[level] = 0;
5518  				return ret;
5519  			}
5520  			if (unlikely(wc->refs[level] == 0)) {
5521  				btrfs_tree_unlock_rw(eb, path->locks[level]);
5522  				btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
5523  					  eb->start);
5524  				return -EUCLEAN;
5525  			}
5526  			if (wc->refs[level] == 1) {
5527  				btrfs_tree_unlock_rw(eb, path->locks[level]);
5528  				path->locks[level] = 0;
5529  				return 1;
5530  			}
5531  		}
5532  	}
5533  
5534  	/* wc->stage == DROP_REFERENCE */
5535  	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5536  
5537  	if (wc->refs[level] == 1) {
5538  		if (level == 0) {
5539  			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5540  				ret = btrfs_dec_ref(trans, root, eb, 1);
5541  			else
5542  				ret = btrfs_dec_ref(trans, root, eb, 0);
5543  			BUG_ON(ret); /* -ENOMEM */
5544  			if (is_fstree(root->root_key.objectid)) {
5545  				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5546  				if (ret) {
5547  					btrfs_err_rl(fs_info,
5548  	"error %d accounting leaf items, quota is out of sync, rescan required",
5549  					     ret);
5550  				}
5551  			}
5552  		}
5553  		/* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5554  		if (!path->locks[level]) {
5555  			btrfs_tree_lock(eb);
5556  			path->locks[level] = BTRFS_WRITE_LOCK;
5557  		}
5558  		btrfs_clear_buffer_dirty(trans, eb);
5559  	}
5560  
5561  	if (eb == root->node) {
5562  		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5563  			parent = eb->start;
5564  		else if (root->root_key.objectid != btrfs_header_owner(eb))
5565  			goto owner_mismatch;
5566  	} else {
5567  		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5568  			parent = path->nodes[level + 1]->start;
5569  		else if (root->root_key.objectid !=
5570  			 btrfs_header_owner(path->nodes[level + 1]))
5571  			goto owner_mismatch;
5572  	}
5573  
5574  	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5575  				    wc->refs[level] == 1);
5576  	if (ret < 0)
5577  		btrfs_abort_transaction(trans, ret);
5578  out:
5579  	wc->refs[level] = 0;
5580  	wc->flags[level] = 0;
5581  	return ret;
5582  
5583  owner_mismatch:
5584  	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5585  		     btrfs_header_owner(eb), root->root_key.objectid);
5586  	return -EUCLEAN;
5587  }
5588  
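/*
 * Walk down from wc->level towards the leaves, calling walk_down_proc()
 * on each block and do_walk_down() to descend one level.  Returns 0
 * when the caller should continue with walk_up_tree(), or a negative
 * errno on error.
 */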
5589  static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5590  				   struct btrfs_root *root,
5591  				   struct btrfs_path *path,
5592  				   struct walk_control *wc)
5593  {
5594  	int level = wc->level;
5595  	int lookup_info = 1;
5596  	int ret = 0;
5597  
5598  	while (level >= 0) {
5599  		ret = walk_down_proc(trans, root, path, wc, lookup_info);
5600  		if (ret)
5601  			break;
5602  
5603  		if (level == 0)
5604  			break;
5605  
5606  		if (path->slots[level] >=
5607  		    btrfs_header_nritems(path->nodes[level]))
5608  			break;
5609  
5610  		ret = do_walk_down(trans, root, path, wc, &lookup_info);
5611  		if (ret > 0) {
5612  			path->slots[level]++;
5613  			continue;
5614  		} else if (ret < 0)
5615  			break;
5616  		level = wc->level;
5617  	}
5618  	return (ret == 1) ? 0 : ret;
5619  }
5620  
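/*
 * Walk back up the tree, finishing each fully processed block with
 * walk_up_proc().  Returns 0 when there is another slot to walk down
 * into, 1 once everything up to max_level has been processed, or a
 * negative errno on error.
 */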
5621  static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5622  				 struct btrfs_root *root,
5623  				 struct btrfs_path *path,
5624  				 struct walk_control *wc, int max_level)
5625  {
5626  	int level = wc->level;
5627  	int ret;
5628  
5629  	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5630  	while (level < max_level && path->nodes[level]) {
5631  		wc->level = level;
5632  		if (path->slots[level] + 1 <
5633  		    btrfs_header_nritems(path->nodes[level])) {
5634  			path->slots[level]++;
5635  			return 0;
5636  		} else {
5637  			ret = walk_up_proc(trans, root, path, wc);
5638  			if (ret > 0)
5639  				return 0;
5640  			if (ret < 0)
5641  				return ret;
5642  
5643  			if (path->locks[level]) {
5644  				btrfs_tree_unlock_rw(path->nodes[level],
5645  						     path->locks[level]);
5646  				path->locks[level] = 0;
5647  			}
5648  			free_extent_buffer(path->nodes[level]);
5649  			path->nodes[level] = NULL;
5650  			level++;
5651  		}
5652  	}
5653  	return 1;
5654  }
5655  
5656  /*
5657   * drop a subvolume tree.
5658   *
5659   * This function traverses the tree, freeing any blocks that are only
5660   * referenced by the tree.
5661   *
5662   * When a shared tree block is found, this function decreases its
5663   * reference count by one.  If update_ref is true, this function
5664   * also makes sure backrefs for the shared block and all lower level
5665   * blocks are properly updated.
5666   *
5667   * If called with for_reloc == 0, may exit early with -EAGAIN
5668   */
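/*
 * Illustrative call sites (simplified, for orientation only): the
 * cleaner thread drops dead subvolumes with
 * btrfs_drop_snapshot(root, 0, 0), and relocation drops merged reloc
 * trees with update_ref == 0 and for_reloc == 1.
 */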
5669  int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5670  {
5671  	const bool is_reloc_root = (root->root_key.objectid ==
5672  				    BTRFS_TREE_RELOC_OBJECTID);
5673  	struct btrfs_fs_info *fs_info = root->fs_info;
5674  	struct btrfs_path *path;
5675  	struct btrfs_trans_handle *trans;
5676  	struct btrfs_root *tree_root = fs_info->tree_root;
5677  	struct btrfs_root_item *root_item = &root->root_item;
5678  	struct walk_control *wc;
5679  	struct btrfs_key key;
5680  	int err = 0;
5681  	int ret;
5682  	int level;
5683  	bool root_dropped = false;
5684  	bool unfinished_drop = false;
5685  
5686  	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5687  
5688  	path = btrfs_alloc_path();
5689  	if (!path) {
5690  		err = -ENOMEM;
5691  		goto out;
5692  	}
5693  
5694  	wc = kzalloc(sizeof(*wc), GFP_NOFS);
5695  	if (!wc) {
5696  		btrfs_free_path(path);
5697  		err = -ENOMEM;
5698  		goto out;
5699  	}
5700  
5701  	/*
5702  	 * Use join to avoid potential EINTR from transaction start. See
5703  	 * wait_reserve_ticket and the whole reservation callchain.
5704  	 */
5705  	if (for_reloc)
5706  		trans = btrfs_join_transaction(tree_root);
5707  	else
5708  		trans = btrfs_start_transaction(tree_root, 0);
5709  	if (IS_ERR(trans)) {
5710  		err = PTR_ERR(trans);
5711  		goto out_free;
5712  	}
5713  
5714  	err = btrfs_run_delayed_items(trans);
5715  	if (err)
5716  		goto out_end_trans;
5717  
5718  	/*
5719  	 * This will help us catch people modifying the fs tree while we're
5720  	 * dropping it.  It is unsafe to mess with the fs tree while it's being
5721  	 * dropped as we unlock the root node and parent nodes as we walk down
5722  	 * the tree, assuming nothing will change.  If something does change
5723  	 * then we'll have stale information and drop references to blocks we've
5724  	 * already dropped.
5725  	 */
5726  	set_bit(BTRFS_ROOT_DELETING, &root->state);
5727  	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5728  
5729  	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5730  		level = btrfs_header_level(root->node);
5731  		path->nodes[level] = btrfs_lock_root_node(root);
5732  		path->slots[level] = 0;
5733  		path->locks[level] = BTRFS_WRITE_LOCK;
5734  		memset(&wc->update_progress, 0,
5735  		       sizeof(wc->update_progress));
5736  	} else {
5737  		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5738  		memcpy(&wc->update_progress, &key,
5739  		       sizeof(wc->update_progress));
5740  
5741  		level = btrfs_root_drop_level(root_item);
5742  		BUG_ON(level == 0);
5743  		path->lowest_level = level;
5744  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5745  		path->lowest_level = 0;
5746  		if (ret < 0) {
5747  			err = ret;
5748  			goto out_end_trans;
5749  		}
5750  		WARN_ON(ret > 0);
5751  
5752  		/*
5753  		 * Unlock our path; this is safe because only this
5754  		 * function is allowed to delete this snapshot.
5755  		 */
5756  		btrfs_unlock_up_safe(path, 0);
5757  
5758  		level = btrfs_header_level(root->node);
5759  		while (1) {
5760  			btrfs_tree_lock(path->nodes[level]);
5761  			path->locks[level] = BTRFS_WRITE_LOCK;
5762  
5763  			ret = btrfs_lookup_extent_info(trans, fs_info,
5764  						path->nodes[level]->start,
5765  						level, 1, &wc->refs[level],
5766  						&wc->flags[level]);
5767  			if (ret < 0) {
5768  				err = ret;
5769  				goto out_end_trans;
5770  			}
5771  			BUG_ON(wc->refs[level] == 0);
5772  
5773  			if (level == btrfs_root_drop_level(root_item))
5774  				break;
5775  
5776  			btrfs_tree_unlock(path->nodes[level]);
5777  			path->locks[level] = 0;
5778  			WARN_ON(wc->refs[level] != 1);
5779  			level--;
5780  		}
5781  	}
5782  
5783  	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5784  	wc->level = level;
5785  	wc->shared_level = -1;
5786  	wc->stage = DROP_REFERENCE;
5787  	wc->update_ref = update_ref;
5788  	wc->keep_locks = 0;
5789  	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5790  
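	/*
	 * Main drop loop: walk down freeing blocks, walk back up, record
	 * drop_progress/drop_level in the root item, and periodically end
	 * and restart the transaction so an interrupted drop can resume
	 * where it left off.
	 */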
5791  	while (1) {
5792  
5793  		ret = walk_down_tree(trans, root, path, wc);
5794  		if (ret < 0) {
5795  			btrfs_abort_transaction(trans, ret);
5796  			err = ret;
5797  			break;
5798  		}
5799  
5800  		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5801  		if (ret < 0) {
5802  			btrfs_abort_transaction(trans, ret);
5803  			err = ret;
5804  			break;
5805  		}
5806  
5807  		if (ret > 0) {
5808  			BUG_ON(wc->stage != DROP_REFERENCE);
5809  			break;
5810  		}
5811  
5812  		if (wc->stage == DROP_REFERENCE) {
5813  			wc->drop_level = wc->level;
5814  			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5815  					      &wc->drop_progress,
5816  					      path->slots[wc->drop_level]);
5817  		}
5818  		btrfs_cpu_key_to_disk(&root_item->drop_progress,
5819  				      &wc->drop_progress);
5820  		btrfs_set_root_drop_level(root_item, wc->drop_level);
5821  
5822  		BUG_ON(wc->level == 0);
5823  		if (btrfs_should_end_transaction(trans) ||
5824  		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5825  			ret = btrfs_update_root(trans, tree_root,
5826  						&root->root_key,
5827  						root_item);
5828  			if (ret) {
5829  				btrfs_abort_transaction(trans, ret);
5830  				err = ret;
5831  				goto out_end_trans;
5832  			}
5833  
5834  			if (!is_reloc_root)
5835  				btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5836  
5837  			btrfs_end_transaction_throttle(trans);
5838  			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5839  				btrfs_debug(fs_info,
5840  					    "drop snapshot early exit");
5841  				err = -EAGAIN;
5842  				goto out_free;
5843  			}
5844  
5845  		       /*
5846  			* Use join to avoid potential EINTR from transaction
5847  			* start. See wait_reserve_ticket and the whole
5848  			* reservation callchain.
5849  			*/
5850  			if (for_reloc)
5851  				trans = btrfs_join_transaction(tree_root);
5852  			else
5853  				trans = btrfs_start_transaction(tree_root, 0);
5854  			if (IS_ERR(trans)) {
5855  				err = PTR_ERR(trans);
5856  				goto out_free;
5857  			}
5858  		}
5859  	}
5860  	btrfs_release_path(path);
5861  	if (err)
5862  		goto out_end_trans;
5863  
5864  	ret = btrfs_del_root(trans, &root->root_key);
5865  	if (ret) {
5866  		btrfs_abort_transaction(trans, ret);
5867  		err = ret;
5868  		goto out_end_trans;
5869  	}
5870  
5871  	if (!is_reloc_root) {
5872  		ret = btrfs_find_root(tree_root, &root->root_key, path,
5873  				      NULL, NULL);
5874  		if (ret < 0) {
5875  			btrfs_abort_transaction(trans, ret);
5876  			err = ret;
5877  			goto out_end_trans;
5878  		} else if (ret > 0) {
5879  			/* if we fail to delete the orphan item this time
5880  			 * around, it'll get picked up the next time.
5881  			 *
5882  			 * The most common failure here is just -ENOENT.
5883  			 */
5884  			btrfs_del_orphan_item(trans, tree_root,
5885  					      root->root_key.objectid);
5886  		}
5887  	}
5888  
5889  	/*
5890  	 * This subvolume is going to be completely dropped, and won't be
5891  	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5892  	 * commit transaction time.  So free it here manually.
5893  	 */
5894  	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5895  	btrfs_qgroup_free_meta_all_pertrans(root);
5896  
5897  	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5898  		btrfs_add_dropped_root(trans, root);
5899  	else
5900  		btrfs_put_root(root);
5901  	root_dropped = true;
5902  out_end_trans:
5903  	if (!is_reloc_root)
5904  		btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5905  
5906  	btrfs_end_transaction_throttle(trans);
5907  out_free:
5908  	kfree(wc);
5909  	btrfs_free_path(path);
5910  out:
5911  	/*
5912  	 * We were an unfinished drop root; check to see if there are any
5913  	 * others pending, and if not, clear the flag and wake up any waiters.
5914  	 */
5915  	if (!err && unfinished_drop)
5916  		btrfs_maybe_wake_unfinished_drop(fs_info);
5917  
5918  	/*
5919  	 * So if we need to stop dropping the snapshot for whatever reason we
5920  	 * need to make sure to add it back to the dead root list so that we
5921  	 * keep trying to do the work later.  This also cleans up roots if we
5922  	 * don't have it in the radix (like when we recover after a power fail
5923  	 * or unmount) so we don't leak memory.
5924  	 */
5925  	if (!for_reloc && !root_dropped)
5926  		btrfs_add_dead_root(root);
5927  	return err;
5928  }
5929  
5930  /*
5931   * drop subtree rooted at tree block 'node'.
5932   *
5933   * NOTE: this function will unlock and release tree block 'node'
5934   * only used by relocation code
5935   */
5936  int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5937  			struct btrfs_root *root,
5938  			struct extent_buffer *node,
5939  			struct extent_buffer *parent)
5940  {
5941  	struct btrfs_fs_info *fs_info = root->fs_info;
5942  	struct btrfs_path *path;
5943  	struct walk_control *wc;
5944  	int level;
5945  	int parent_level;
5946  	int ret = 0;
5947  	int wret;
5948  
5949  	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5950  
5951  	path = btrfs_alloc_path();
5952  	if (!path)
5953  		return -ENOMEM;
5954  
5955  	wc = kzalloc(sizeof(*wc), GFP_NOFS);
5956  	if (!wc) {
5957  		btrfs_free_path(path);
5958  		return -ENOMEM;
5959  	}
5960  
5961  	btrfs_assert_tree_write_locked(parent);
5962  	parent_level = btrfs_header_level(parent);
5963  	atomic_inc(&parent->refs);
5964  	path->nodes[parent_level] = parent;
5965  	path->slots[parent_level] = btrfs_header_nritems(parent);
5966  
5967  	btrfs_assert_tree_write_locked(node);
5968  	level = btrfs_header_level(node);
5969  	path->nodes[level] = node;
5970  	path->slots[level] = 0;
5971  	path->locks[level] = BTRFS_WRITE_LOCK;
5972  
5973  	wc->refs[parent_level] = 1;
5974  	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5975  	wc->level = level;
5976  	wc->shared_level = -1;
5977  	wc->stage = DROP_REFERENCE;
5978  	wc->update_ref = 0;
5979  	wc->keep_locks = 1;
5980  	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5981  
5982  	while (1) {
5983  		wret = walk_down_tree(trans, root, path, wc);
5984  		if (wret < 0) {
5985  			ret = wret;
5986  			break;
5987  		}
5988  
5989  		wret = walk_up_tree(trans, root, path, wc, parent_level);
5990  		if (wret < 0)
5991  			ret = wret;
5992  		if (wret != 0)
5993  			break;
5994  	}
5995  
5996  	kfree(wc);
5997  	btrfs_free_path(path);
5998  	return ret;
5999  }
6000  
6001  int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
6002  				   u64 start, u64 end)
6003  {
6004  	return unpin_extent_range(fs_info, start, end, false);
6005  }
6006  
6007  /*
6008   * It used to be that old block groups would be left around forever.
6009   * Iterating over them would be enough to trim unused space.  Since we
6010   * now automatically remove them, we also need to iterate over unallocated
6011   * space.
6012   *
6013   * We don't want a transaction for this since the discard may take a
6014   * substantial amount of time.  We don't require that a transaction be
6015   * running, but we do need to take a running transaction into account
6016   * to ensure that we're not discarding chunks that were released or
6017   * allocated in the current transaction.
6018   *
6019   * Holding the chunks lock will prevent other threads from allocating
6020   * or releasing chunks, but it won't prevent a running transaction
6021   * from committing and releasing the memory that the pending chunks
6022   * list head uses.  For that, we need to take a reference to the
6023   * transaction and hold the commit root sem.  We only need to hold
6024   * it while performing the free space search since we have already
6025   * held back allocations.
6026   */
6027  static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
6028  {
6029  	u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
6030  	int ret;
6031  
6032  	*trimmed = 0;
6033  
6034  	/* Discard not supported = nothing to do. */
6035  	if (!bdev_max_discard_sectors(device->bdev))
6036  		return 0;
6037  
6038  	/* Not writable = nothing to do. */
6039  	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
6040  		return 0;
6041  
6042  	/* No free space = nothing to do. */
6043  	if (device->total_bytes <= device->bytes_used)
6044  		return 0;
6045  
6046  	ret = 0;
6047  
6048  	while (1) {
6049  		struct btrfs_fs_info *fs_info = device->fs_info;
6050  		u64 bytes;
6051  
6052  		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
6053  		if (ret)
6054  			break;
6055  
6056  		find_first_clear_extent_bit(&device->alloc_state, start,
6057  					    &start, &end,
6058  					    CHUNK_TRIMMED | CHUNK_ALLOCATED);
6059  
6060  		/* Check if there are any CHUNK_* bits left */
6061  		if (start > device->total_bytes) {
6062  			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6063  			btrfs_warn_in_rcu(fs_info,
6064  "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6065  					  start, end - start + 1,
6066  					  btrfs_dev_name(device),
6067  					  device->total_bytes);
6068  			mutex_unlock(&fs_info->chunk_mutex);
6069  			ret = 0;
6070  			break;
6071  		}
6072  
6073  		/* Ensure we skip the reserved space on each device. */
6074  		start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6075  
6076  		/*
6077  		 * If find_first_clear_extent_bit() finds a range that spans the
6078  		 * end of the device it will set end to -1; in that case it's up
6079  		 * to the caller to trim the value to the size of the device.
6080  		 */
6081  		end = min(end, device->total_bytes - 1);
6082  
6083  		len = end - start + 1;
6084  
6085  		/* We didn't find any extents */
6086  		if (!len) {
6087  			mutex_unlock(&fs_info->chunk_mutex);
6088  			ret = 0;
6089  			break;
6090  		}
6091  
6092  		ret = btrfs_issue_discard(device->bdev, start, len,
6093  					  &bytes);
6094  		if (!ret)
6095  			set_extent_bit(&device->alloc_state, start,
6096  				       start + bytes - 1, CHUNK_TRIMMED, NULL);
6097  		mutex_unlock(&fs_info->chunk_mutex);
6098  
6099  		if (ret)
6100  			break;
6101  
6102  		start += len;
6103  		*trimmed += bytes;
6104  
6105  		if (btrfs_trim_interrupted()) {
6106  			ret = -ERESTARTSYS;
6107  			break;
6108  		}
6109  
6110  		cond_resched();
6111  	}
6112  
6113  	return ret;
6114  }
6115  
6116  /*
6117   * Trim the whole filesystem by:
6118   * 1) trimming the free space in each block group
6119   * 2) trimming the unallocated space on each device
6120   *
6121   * This will also continue trimming even if a block group or device encounters
6122   * an error.  The return value will be the last error, or 0 if nothing bad
6123   * happens.
6124   */
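/*
 * Illustrative caller (simplified): the FITRIM ioctl handler copies a
 * struct fstrim_range in from userspace, calls
 * btrfs_trim_fs(fs_info, &range) and copies the range back so that
 * userspace sees the number of bytes actually trimmed in range->len.
 */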
6125  int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6126  {
6127  	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6128  	struct btrfs_block_group *cache = NULL;
6129  	struct btrfs_device *device;
6130  	u64 group_trimmed;
6131  	u64 range_end = U64_MAX;
6132  	u64 start;
6133  	u64 end;
6134  	u64 trimmed = 0;
6135  	u64 bg_failed = 0;
6136  	u64 dev_failed = 0;
6137  	int bg_ret = 0;
6138  	int dev_ret = 0;
6139  	int ret = 0;
6140  
6141  	if (range->start == U64_MAX)
6142  		return -EINVAL;
6143  
6144  	/*
6145  	 * Check range overflow if range->len is set.
6146  	 * The default range->len is U64_MAX.
6147  	 */
6148  	if (range->len != U64_MAX &&
6149  	    check_add_overflow(range->start, range->len, &range_end))
6150  		return -EINVAL;
6151  
6152  	cache = btrfs_lookup_first_block_group(fs_info, range->start);
6153  	for (; cache; cache = btrfs_next_block_group(cache)) {
6154  		if (cache->start >= range_end) {
6155  			btrfs_put_block_group(cache);
6156  			break;
6157  		}
6158  
6159  		start = max(range->start, cache->start);
6160  		end = min(range_end, cache->start + cache->length);
6161  
6162  		if (end - start >= range->minlen) {
6163  			if (!btrfs_block_group_done(cache)) {
6164  				ret = btrfs_cache_block_group(cache, true);
6165  				if (ret) {
6166  					bg_failed++;
6167  					bg_ret = ret;
6168  					continue;
6169  				}
6170  			}
6171  			ret = btrfs_trim_block_group(cache,
6172  						     &group_trimmed,
6173  						     start,
6174  						     end,
6175  						     range->minlen);
6176  
6177  			trimmed += group_trimmed;
6178  			if (ret) {
6179  				bg_failed++;
6180  				bg_ret = ret;
6181  				continue;
6182  			}
6183  		}
6184  	}
6185  
6186  	if (bg_failed)
6187  		btrfs_warn(fs_info,
6188  			"failed to trim %llu block group(s), last error %d",
6189  			bg_failed, bg_ret);
6190  
6191  	mutex_lock(&fs_devices->device_list_mutex);
6192  	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6193  		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6194  			continue;
6195  
6196  		ret = btrfs_trim_free_extents(device, &group_trimmed);
6197  
6198  		trimmed += group_trimmed;
6199  		if (ret) {
6200  			dev_failed++;
6201  			dev_ret = ret;
6202  			break;
6203  		}
6204  	}
6205  	mutex_unlock(&fs_devices->device_list_mutex);
6206  
6207  	if (dev_failed)
6208  		btrfs_warn(fs_info,
6209  			"failed to trim %llu device(s), last error %d",
6210  			dev_failed, dev_ret);
6211  	range->len = trimmed;
6212  	if (bg_ret)
6213  		return bg_ret;
6214  	return dev_ret;
6215  }
6216