xref: /openbmc/linux/fs/btrfs/ctree.c (revision 474be445555ba8f2e776b4b6458c310bc215f76b)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/sched.h>
7  #include <linux/slab.h>
8  #include <linux/rbtree.h>
9  #include <linux/mm.h>
10  #include <linux/error-injection.h>
11  #include "ctree.h"
12  #include "disk-io.h"
13  #include "transaction.h"
14  #include "print-tree.h"
15  #include "locking.h"
16  #include "volumes.h"
17  #include "qgroup.h"
18  #include "tree-mod-log.h"
19  #include "tree-checker.h"
20  
/*
 * Forward declarations for helpers defined later in this file; they are
 * used by the search/insert/delete paths that appear before their
 * definitions.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
34  
/*
 * Table of supported checksum algorithms, indexed by the on-disk
 * BTRFS_CSUM_TYPE_* value.  @size is the checksum size in bytes, @name the
 * user-visible algorithm name and @driver, when non-empty, an alternative
 * name returned by btrfs_super_csum_driver() (otherwise @name is used).
 */
static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};
46  
/*
 * Return the checksum size in bytes for the checksum type recorded in the
 * super block @s.  No bounds checking is needed on the table lookup.
 */
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}
55  
/* Return the user-visible name of checksum algorithm @csum_type. */
const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}
61  
62  /*
63   * Return driver name if defined, otherwise the name that's also a valid driver
64   * name
65   */
66  const char *btrfs_super_csum_driver(u16 csum_type)
67  {
68  	/* csum type is validated at mount time */
69  	return btrfs_csums[csum_type].driver[0] ?
70  		btrfs_csums[csum_type].driver :
71  		btrfs_csums[csum_type].name;
72  }
73  
/* Return the number of entries in the checksum algorithm table. */
size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
78  
/*
 * Allocate a zeroed btrfs_path from the dedicated slab cache.  Returns NULL
 * on allocation failure.  Freed with btrfs_free_path().
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
83  
84  /* this also releases the path */
85  void btrfs_free_path(struct btrfs_path *p)
86  {
87  	if (!p)
88  		return;
89  	btrfs_release_path(p);
90  	kmem_cache_free(btrfs_path_cachep, p);
91  }
92  
93  /*
94   * path release drops references on the extent buffers in the path
95   * and it drops any locks held by this path
96   *
97   * It is safe to call this on paths that no locks or extent buffers held.
98   */
99  noinline void btrfs_release_path(struct btrfs_path *p)
100  {
101  	int i;
102  
103  	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
104  		p->slots[i] = 0;
105  		if (!p->nodes[i])
106  			continue;
107  		if (p->locks[i]) {
108  			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
109  			p->locks[i] = 0;
110  		}
111  		free_extent_buffer(p->nodes[i]);
112  		p->nodes[i] = NULL;
113  	}
114  }
115  
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* Got a reference; eb cannot be freed under us now. */
			rcu_read_unlock();
			break;
		}
		/* Refcount hit zero meanwhile; wait for the grace period and retry. */
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
149  
/*
 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 * just get put onto a simple dirty list.  Transaction walks this list to make
 * sure they get properly updated on disk.
 *
 * No-op if the root is already dirty or is not tracked for dirtying.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* Cheap unlocked check first; re-checked under trans_lock below. */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
175  
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 *
 * @trans:		transaction handle
 * @root:		root whose node @buf belongs to
 * @buf:		the root node to copy
 * @cow_ret:		output, the newly allocated copy (locked, dirty)
 * @new_root_objectid:	objectid the copy will be owned by
 *
 * On error after the copy was allocated, the transaction is aborted.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Copy the contents, then fix up the header of the new block. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Add references for everything the new root points to. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
238  
239  /*
240   * check if the tree block can be shared by multiple trees
241   */
242  int btrfs_block_can_be_shared(struct btrfs_root *root,
243  			      struct extent_buffer *buf)
244  {
245  	/*
246  	 * Tree blocks not in shareable trees and tree roots are never shared.
247  	 * If a block was allocated after the last snapshot and the block was
248  	 * not allocated by tree relocation, we know the block is not shared.
249  	 */
250  	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
251  	    buf != root->node && buf != root->commit_root &&
252  	    (btrfs_header_generation(buf) <=
253  	     btrfs_root_last_snapshot(&root->root_item) ||
254  	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
255  		return 1;
256  
257  	return 0;
258  }
259  
/*
 * Update extent backrefs when a tree block is about to be COWed.
 *
 * @trans:    transaction handle
 * @root:     root that @buf belongs to
 * @buf:      the block being copied
 * @cow:      the new copy of @buf
 * @last_ref: output, set to 1 when @buf drops its last reference and can
 *            be freed by the caller
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Possibly shared: read the real refcount and flags from disk. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* A live block must have at least one reference. */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shared: synthesize refcount/flags without a lookup. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* @buf stays alive in another tree; fix up references. */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			/* Upgrade @buf to full backrefs (written out below). */
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level);
			if (ret)
				return ret;
		}
	} else {
		/* Last reference: @buf will be freed after the COW. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}
368  
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success with *cow_ret pointing at the new (locked) copy,
 * or a negative errno; errors after allocation abort the transaction.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/* Caller wants the result in place of @buf: unlock @buf at the end. */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWed the root node: swap the new copy in as the root. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Point the parent slot at the new copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
499  
500  static inline int should_cow_block(struct btrfs_trans_handle *trans,
501  				   struct btrfs_root *root,
502  				   struct extent_buffer *buf)
503  {
504  	if (btrfs_is_testing(root->fs_info))
505  		return 0;
506  
507  	/* Ensure we can see the FORCE_COW bit */
508  	smp_mb__before_atomic();
509  
510  	/*
511  	 * We do not need to cow a block if
512  	 * 1) this block is not created or changed in this transaction;
513  	 * 2) this block does not belong to TREE_RELOC tree;
514  	 * 3) the root is not forced COW.
515  	 *
516  	 * What is forced COW:
517  	 *    when we create snapshot during committing the transaction,
518  	 *    after we've finished copying src root, we must COW the shared
519  	 *    block to ensure the metadata consistency.
520  	 */
521  	if (btrfs_header_generation(buf) == trans->transid &&
522  	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
523  	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
524  	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
525  	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
526  		return 0;
527  	return 1;
528  }
529  
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 *
 * Returns 0 on success; *cow_ret is either the existing block (when no COW
 * was needed) or the new copy.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* Sanity checks: COW must happen inside the running transaction. */
	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	/* Hint the allocator towards the 1GiB chunk containing @buf. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also We don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
580  
581  /*
582   * helper function for defrag to decide if two blocks pointed to by a
583   * node are actually close by
584   */
585  static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
586  {
587  	if (blocknr < other && other - (blocknr + blocksize) < 32768)
588  		return 1;
589  	if (blocknr > other && blocknr - (other + blocksize) < 32768)
590  		return 1;
591  	return 0;
592  }
593  
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * Compare a little-endian on-disk key against a CPU key: convert the disk
 * key to CPU byte order first, then compare in a memcmp fashion.
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif
623  
624  /*
625   * same as comp_keys only with two btrfs_key's
626   */
627  int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
628  {
629  	if (k1->objectid > k2->objectid)
630  		return 1;
631  	if (k1->objectid < k2->objectid)
632  		return -1;
633  	if (k1->type > k2->type)
634  		return 1;
635  	if (k1->type < k2->type)
636  		return -1;
637  	if (k1->offset > k2->offset)
638  		return 1;
639  	if (k1->offset < k2->offset)
640  		return -1;
641  	return 0;
642  }
643  
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @parent:	the node whose children are re-COWed
 * @start_slot:	first slot to consider
 * @last_ret:	in/out allocation hint, updated to the last block written
 * @progress:	skip children whose key is below this (resume point)
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		/* Skip children before the @progress resume point. */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* Leave blocks that already sit near a neighbor alone. */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		/* Re-COW the child near search_start to improve locality. */
		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
728  
/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @low. Use a
 * value of 0 to search over the whole extent buffer.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 *
 * Returns 0 when the key was found, 1 when it was not, -EINVAL on a bogus
 * @low boundary.
 */
static noinline int generic_bin_search(struct extent_buffer *eb, int low,
				       const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	int high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	/* Keys live at different offsets/strides in leaves vs nodes. */
	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	/* Classic binary search over slots [low, high). */
	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			/* Key fits within one page: read it in place. */
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			/* Key straddles a page boundary: copy it out first. */
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	/* Not found; @low is the insertion position. */
	*slot = low;
	return 1;
}
803  
/*
 * Simple binary search on an extent buffer. Works for both leaves and nodes, and
 * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
 *
 * Returns 0 when @key was found (with *slot set to its position), 1 when it
 * was not (with *slot set to the insertion position).
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	return generic_bin_search(eb, 0, key, slot);
}
813  
/* Add @size bytes to the root item's used-bytes accounting. */
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}
821  
/* Subtract @size bytes from the root item's used-bytes accounting. */
static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}
829  
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 *
 * Returns the child buffer on success, ERR_PTR(-ENOENT) for an out of
 * range slot, ERR_PTR(-EIO) when the block failed to read back uptodate,
 * or the error from read_tree_block().
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	/* Only nodes have children; leaves must not end up here. */
	BUG_ON(level == 0);

	/* Pass the expected first key/generation so the read is verified. */
	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}
859  
860  /*
861   * node level balancing, used to make sure nodes are in proper order for
862   * item deletion.  We balance from the top down, so we have to make sure
863   * that a deletion won't leave an node completely empty later on.
864   */
865  static noinline int balance_level(struct btrfs_trans_handle *trans,
866  			 struct btrfs_root *root,
867  			 struct btrfs_path *path, int level)
868  {
869  	struct btrfs_fs_info *fs_info = root->fs_info;
870  	struct extent_buffer *right = NULL;
871  	struct extent_buffer *mid;
872  	struct extent_buffer *left = NULL;
873  	struct extent_buffer *parent = NULL;
874  	int ret = 0;
875  	int wret;
876  	int pslot;
877  	int orig_slot = path->slots[level];
878  	u64 orig_ptr;
879  
880  	ASSERT(level > 0);
881  
882  	mid = path->nodes[level];
883  
884  	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
885  	WARN_ON(btrfs_header_generation(mid) != trans->transid);
886  
887  	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
888  
889  	if (level < BTRFS_MAX_LEVEL - 1) {
890  		parent = path->nodes[level + 1];
891  		pslot = path->slots[level + 1];
892  	}
893  
894  	/*
895  	 * deal with the case where there is only one pointer in the root
896  	 * by promoting the node below to a root
897  	 */
898  	if (!parent) {
899  		struct extent_buffer *child;
900  
901  		if (btrfs_header_nritems(mid) != 1)
902  			return 0;
903  
904  		/* promote the child to a root */
905  		child = btrfs_read_node_slot(mid, 0);
906  		if (IS_ERR(child)) {
907  			ret = PTR_ERR(child);
908  			btrfs_handle_fs_error(fs_info, ret, NULL);
909  			goto enospc;
910  		}
911  
912  		btrfs_tree_lock(child);
913  		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
914  				      BTRFS_NESTING_COW);
915  		if (ret) {
916  			btrfs_tree_unlock(child);
917  			free_extent_buffer(child);
918  			goto enospc;
919  		}
920  
921  		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
922  		BUG_ON(ret < 0);
923  		rcu_assign_pointer(root->node, child);
924  
925  		add_root_to_dirty_list(root);
926  		btrfs_tree_unlock(child);
927  
928  		path->locks[level] = 0;
929  		path->nodes[level] = NULL;
930  		btrfs_clean_tree_block(mid);
931  		btrfs_tree_unlock(mid);
932  		/* once for the path */
933  		free_extent_buffer(mid);
934  
935  		root_sub_used(root, mid->len);
936  		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
937  		/* once for the root ptr */
938  		free_extent_buffer_stale(mid);
939  		return 0;
940  	}
941  	if (btrfs_header_nritems(mid) >
942  	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
943  		return 0;
944  
945  	left = btrfs_read_node_slot(parent, pslot - 1);
946  	if (IS_ERR(left))
947  		left = NULL;
948  
949  	if (left) {
950  		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
951  		wret = btrfs_cow_block(trans, root, left,
952  				       parent, pslot - 1, &left,
953  				       BTRFS_NESTING_LEFT_COW);
954  		if (wret) {
955  			ret = wret;
956  			goto enospc;
957  		}
958  	}
959  
960  	right = btrfs_read_node_slot(parent, pslot + 1);
961  	if (IS_ERR(right))
962  		right = NULL;
963  
964  	if (right) {
965  		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
966  		wret = btrfs_cow_block(trans, root, right,
967  				       parent, pslot + 1, &right,
968  				       BTRFS_NESTING_RIGHT_COW);
969  		if (wret) {
970  			ret = wret;
971  			goto enospc;
972  		}
973  	}
974  
975  	/* first, try to make some room in the middle buffer */
976  	if (left) {
977  		orig_slot += btrfs_header_nritems(left);
978  		wret = push_node_left(trans, left, mid, 1);
979  		if (wret < 0)
980  			ret = wret;
981  	}
982  
983  	/*
984  	 * then try to empty the right most buffer into the middle
985  	 */
986  	if (right) {
987  		wret = push_node_left(trans, mid, right, 1);
988  		if (wret < 0 && wret != -ENOSPC)
989  			ret = wret;
990  		if (btrfs_header_nritems(right) == 0) {
991  			btrfs_clean_tree_block(right);
992  			btrfs_tree_unlock(right);
993  			del_ptr(root, path, level + 1, pslot + 1);
994  			root_sub_used(root, right->len);
995  			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
996  					      0, 1);
997  			free_extent_buffer_stale(right);
998  			right = NULL;
999  		} else {
1000  			struct btrfs_disk_key right_key;
1001  			btrfs_node_key(right, &right_key, 0);
1002  			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1003  					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1004  			BUG_ON(ret < 0);
1005  			btrfs_set_node_key(parent, &right_key, pslot + 1);
1006  			btrfs_mark_buffer_dirty(parent);
1007  		}
1008  	}
1009  	if (btrfs_header_nritems(mid) == 1) {
1010  		/*
1011  		 * we're not allowed to leave a node with one item in the
1012  		 * tree during a delete.  A deletion from lower in the tree
1013  		 * could try to delete the only pointer in this node.
1014  		 * So, pull some keys from the left.
1015  		 * There has to be a left pointer at this point because
1016  		 * otherwise we would have pulled some pointers from the
1017  		 * right
1018  		 */
1019  		if (!left) {
1020  			ret = -EROFS;
1021  			btrfs_handle_fs_error(fs_info, ret, NULL);
1022  			goto enospc;
1023  		}
1024  		wret = balance_node_right(trans, mid, left);
1025  		if (wret < 0) {
1026  			ret = wret;
1027  			goto enospc;
1028  		}
1029  		if (wret == 1) {
1030  			wret = push_node_left(trans, left, mid, 1);
1031  			if (wret < 0)
1032  				ret = wret;
1033  		}
1034  		BUG_ON(wret == 1);
1035  	}
1036  	if (btrfs_header_nritems(mid) == 0) {
1037  		btrfs_clean_tree_block(mid);
1038  		btrfs_tree_unlock(mid);
1039  		del_ptr(root, path, level + 1, pslot);
1040  		root_sub_used(root, mid->len);
1041  		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1042  		free_extent_buffer_stale(mid);
1043  		mid = NULL;
1044  	} else {
1045  		/* update the parent key to reflect our changes */
1046  		struct btrfs_disk_key mid_key;
1047  		btrfs_node_key(mid, &mid_key, 0);
1048  		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1049  				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1050  		BUG_ON(ret < 0);
1051  		btrfs_set_node_key(parent, &mid_key, pslot);
1052  		btrfs_mark_buffer_dirty(parent);
1053  	}
1054  
1055  	/* update the path */
1056  	if (left) {
1057  		if (btrfs_header_nritems(left) > orig_slot) {
1058  			atomic_inc(&left->refs);
1059  			/* left was locked after cow */
1060  			path->nodes[level] = left;
1061  			path->slots[level + 1] -= 1;
1062  			path->slots[level] = orig_slot;
1063  			if (mid) {
1064  				btrfs_tree_unlock(mid);
1065  				free_extent_buffer(mid);
1066  			}
1067  		} else {
1068  			orig_slot -= btrfs_header_nritems(left);
1069  			path->slots[level] = orig_slot;
1070  		}
1071  	}
1072  	/* double check we haven't messed things up */
1073  	if (orig_ptr !=
1074  	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1075  		BUG();
1076  enospc:
1077  	if (right) {
1078  		btrfs_tree_unlock(right);
1079  		free_extent_buffer(right);
1080  	}
1081  	if (left) {
1082  		if (path->nodes[level] != left)
1083  			btrfs_tree_unlock(left);
1084  		free_extent_buffer(left);
1085  	}
1086  	return ret;
1087  }
1088  
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if items were pushed into a sibling and the path now points
 * at the slot holding the original target; returns 1 if nothing could be
 * pushed, in which case the caller must split the node instead.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	/* Leaves are not balanced here (see split_leaf()). */
	if (level == 0)
		return 1;

	mid = path->nodes[level];
	/* The node must already be COWed in the running transaction. */
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* No parent means 'mid' is the root: there is no sibling to push to. */
	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* The left sibling is (nearly) full, can not push. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			/* mid's first key changed, update it in the parent. */
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* The target slot moved into the left node. */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				if (mid) {
					btrfs_tree_unlock(mid);
					free_extent_buffer(mid);
				}
			} else {
				/* Target slot stays in mid, just shifted. */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * Then try to push some of mid's items into the right sibling
	 * (balance_node_right() moves items from its src into its dst).
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* The right sibling is (nearly) full, can not push. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			/* right's first key changed, update it in the parent. */
			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* The target slot moved into the right node. */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	/* Could not push in either direction, the caller must split. */
	return 1;
}
1228  
/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 *
 * Nothing is done unless we are at the level right above the leaves
 * (level 1), or the path requests READA_FORWARD_ALWAYS.  @objectid, when
 * non-zero, bounds backwards readahead to children whose keys still have
 * that objectid.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		/* The target block is already cached, skip readahead. */
		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	/* Walk the node's slots away from 'slot' in the requested direction. */
	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			/* Stop once the keys leave the requested objectid. */
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		/*
		 * Only read ahead blocks within 64K of the target block's
		 * address, to keep the IO reasonably sequential (except for
		 * READA_FORWARD_ALWAYS, which reads ahead unconditionally).
		 */
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		/* Bound both the bytes submitted and the slots scanned. */
		if (nread > nread_max || nscan > 32)
			break;
	}
}
1315  
1316  static noinline void reada_for_balance(struct btrfs_path *path, int level)
1317  {
1318  	struct extent_buffer *parent;
1319  	int slot;
1320  	int nritems;
1321  
1322  	parent = path->nodes[level + 1];
1323  	if (!parent)
1324  		return;
1325  
1326  	nritems = btrfs_header_nritems(parent);
1327  	slot = path->slots[level + 1];
1328  
1329  	if (slot > 0)
1330  		btrfs_readahead_node_child(parent, slot - 1);
1331  	if (slot + 1 < nritems)
1332  		btrfs_readahead_node_child(parent, slot + 1);
1333  }
1334  
1335  
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* Levels at or below skip_level must keep their locks. */
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			/* Slot 0: a key update here may propagate upwards. */
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				/* Last slot: keep the lock (see above). */
				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			/*
			 * Once we unlock one level, every level above it can
			 * be unlocked too, so stop evaluating skip rules.
			 */
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			/* Track the highest level still write locked. */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
1392  
/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;
	bool unlock_up;

	/* True if we hold locks above the level of the child we read. */
	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* Reading would block; nowait searches must bail out here. */
		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		/* We dropped upper locks, so the caller must restart. */
		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	/* Not cached at all, read the block from disk. */
	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
			      gen, parent_level - 1, &first_key);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}
1514  
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	/* Inserting and the node is nearly full: split it. */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		/* Splitting modifies the parent, need it write locked too. */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	/* Deleting and the node is less than half full: rebalance it. */
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		/* Balancing modifies the parent, need it write locked too. */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		/* Balancing may have merged our node away entirely. */
		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}
1569  
1570  int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1571  		u64 iobjectid, u64 ioff, u8 key_type,
1572  		struct btrfs_key *found_key)
1573  {
1574  	int ret;
1575  	struct btrfs_key key;
1576  	struct extent_buffer *eb;
1577  
1578  	ASSERT(path);
1579  	ASSERT(found_key);
1580  
1581  	key.type = key_type;
1582  	key.objectid = iobjectid;
1583  	key.offset = ioff;
1584  
1585  	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1586  	if (ret < 0)
1587  		return ret;
1588  
1589  	eb = path->nodes[0];
1590  	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1591  		ret = btrfs_next_leaf(fs_root, path);
1592  		if (ret)
1593  			return ret;
1594  		eb = path->nodes[0];
1595  	}
1596  
1597  	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1598  	if (found_key->type != key.type ||
1599  			found_key->objectid != key.objectid)
1600  		return 1;
1601  
1602  	return 0;
1603  }
1604  
/*
 * Get the root node of @root for a search, locked as required by
 * @write_lock_level, and store it (and its lock) in the path.
 *
 * For commit root or skip_locking searches no lock is taken.  Otherwise a
 * read lock is preferred, and is traded for a write lock only when the
 * root's level is at or below @write_lock_level.  Returns the root extent
 * buffer, or an ERR_PTR on failure.  Callers are responsible for dropping
 * the returned reference.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
1686  
1687  /*
1688   * Replace the extent buffer at the lowest level of the path with a cloned
1689   * version. The purpose is to be able to use it safely, after releasing the
1690   * commit root semaphore, even if relocation is happening in parallel, the
1691   * transaction used for relocation is committed and the extent buffer is
1692   * reallocated in the next transaction.
1693   *
1694   * This is used in a context where the caller does not prevent transaction
1695   * commits from happening, either by holding a transaction handle or holding
1696   * some lock, while it's doing searches through a commit root.
1697   * At the moment it's only used for send operations.
1698   */
1699  static int finish_need_commit_sem_search(struct btrfs_path *path)
1700  {
1701  	const int i = path->lowest_level;
1702  	const int slot = path->slots[i];
1703  	struct extent_buffer *lowest = path->nodes[i];
1704  	struct extent_buffer *clone;
1705  
1706  	ASSERT(path->need_commit_sem);
1707  
1708  	if (!lowest)
1709  		return 0;
1710  
1711  	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1712  
1713  	clone = btrfs_clone_extent_buffer(lowest);
1714  	if (!clone)
1715  		return -ENOMEM;
1716  
1717  	btrfs_release_path(path);
1718  	path->nodes[i] = clone;
1719  	path->slots[i] = slot;
1720  
1721  	return 0;
1722  }
1723  
/*
 * Find the slot for @key in @eb, doing a binary search starting at
 * @search_low_slot.
 *
 * If a previous call to btrfs_bin_search() on a parent node returned an
 * exact match (@prev_cmp == 0), the target key is guaranteed to be at
 * slot 0 of every lower node, since each key pointer
 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
 * subtree it points to.  In that case we skip the binary search entirely.
 */
static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	if (prev_cmp != 0)
		return generic_bin_search(eb, search_low_slot, key, slot);

	*slot = 0;
	return 0;
}
1744  
/*
 * Final leaf-level step of btrfs_search_slot(): find the slot for @key in
 * the leaf at path->nodes[0], unlocking upper nodes early when it is safe
 * to do so, and splitting the leaf if an insertion (@ins_len > 0) does not
 * fit.  Returns 0 if the key was found, 1 if not (slot points at the
 * insert position), or a negative errno on error.
 */
static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing the
	 * binary search on the leaf (with search_for_key_slot()), allowing other
	 * tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and it
		 * will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf is
		 * the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is cheap,
			 * taking into account that the first key is very likely
			 * already in a cache line because it immediately follows
			 * the extent buffer's header and we have recently accessed
			 * the header's level field.
			 */
			ret = comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we want
				 * to insert, so we are safe to unlock all upper
				 * nodes and we have to do the binary search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the latter does not unlock
				 * nodes with a slot of 0 - we can safely unlock
				 * any node even if its slot is 0 since in this
				 * case the key does not end up at slot 0 of the
				 * leaf and there's no need to split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is >= than the key we want to
				 * insert, so we can skip the binary search as
				 * the target key will be at slot 0.
				 *
				 * We can not unlock upper nodes when the key is
				 * less than the first key, because we will need
				 * to update the key at slot 0 of the parent node
				 * and possibly of other upper nodes too.
				 * If the key matches the first key, then we can
				 * unlock all the upper nodes, using
				 * btrfs_unlock_up_safe() instead of unlock_up()
				 * as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result of
				 * a btrfs_bin_search() call, so there is no need
				 * to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists. In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
		 * item. Only the item size grows, no new btrfs item will be
		 * added. If search_for_extension is not set, ins_len already
		 * accounts the size btrfs_item, deduct it here so leaf space
		 * check will be correct.
		 */
		if (ret == 0 && !path->search_for_extension) {
			ASSERT(ins_len >= sizeof(struct btrfs_item));
			ins_len -= sizeof(struct btrfs_item);
		}

		ASSERT(leaf_free_space >= 0);

		/* Not enough room for the insertion, split the leaf. */
		if (leaf_free_space < ins_len) {
			int err;

			err = split_leaf(trans, root, key, path, ins_len,
					 (ret == 0));
			ASSERT(err <= 0);
			if (WARN_ON(err > 0))
				err = -EUCLEAN;
			if (err)
				ret = err;
		}
	}

	return ret;
}
1872  
1873  /*
1874   * btrfs_search_slot - look for a key in a tree and perform necessary
1875   * modifications to preserve tree invariants.
1876   *
1877   * @trans:	Handle of transaction, used when modifying the tree
1878   * @p:		Holds all btree nodes along the search path
1879   * @root:	The root node of the tree
1880   * @key:	The key we are looking for
1881   * @ins_len:	Indicates purpose of search:
1882   *              >0  for inserts it's size of item inserted (*)
1883   *              <0  for deletions
1884   *               0  for plain searches, not modifying the tree
1885   *
1886   *              (*) If size of item inserted doesn't include
1887   *              sizeof(struct btrfs_item), then p->search_for_extension must
1888   *              be set.
1889   * @cow:	boolean should CoW operations be performed. Must always be 1
1890   *		when modifying the tree.
1891   *
1892   * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1893   * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1894   *
1895   * If @key is found, 0 is returned and you can find the item in the leaf level
1896   * of the path (level 0)
1897   *
1898   * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1899   * points to the slot where it should be inserted
1900   *
1901   * If an error is encountered while searching the tree a negative error number
1902   * is returned
1903   */
1904  int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1905  		      const struct btrfs_key *key, struct btrfs_path *p,
1906  		      int ins_len, int cow)
1907  {
1908  	struct btrfs_fs_info *fs_info = root->fs_info;
1909  	struct extent_buffer *b;
1910  	int slot;
1911  	int ret;
1912  	int err;
1913  	int level;
1914  	int lowest_unlock = 1;
1915  	/* everything at write_lock_level or lower must be write locked */
1916  	int write_lock_level = 0;
1917  	u8 lowest_level = 0;
1918  	int min_write_lock_level;
1919  	int prev_cmp;
1920  
1921  	lowest_level = p->lowest_level;
1922  	WARN_ON(lowest_level && ins_len > 0);
1923  	WARN_ON(p->nodes[0] != NULL);
1924  	BUG_ON(!cow && ins_len);
1925  
1926  	/*
1927  	 * For now only allow nowait for read only operations.  There's no
1928  	 * strict reason why we can't, we just only need it for reads so it's
1929  	 * only implemented for reads.
1930  	 */
1931  	ASSERT(!p->nowait || !cow);
1932  
1933  	if (ins_len < 0) {
1934  		lowest_unlock = 2;
1935  
1936  		/* when we are removing items, we might have to go up to level
1937  		 * two as we update tree pointers  Make sure we keep write
1938  		 * for those levels as well
1939  		 */
1940  		write_lock_level = 2;
1941  	} else if (ins_len > 0) {
1942  		/*
1943  		 * for inserting items, make sure we have a write lock on
1944  		 * level 1 so we can update keys
1945  		 */
1946  		write_lock_level = 1;
1947  	}
1948  
1949  	if (!cow)
1950  		write_lock_level = -1;
1951  
1952  	if (cow && (p->keep_locks || p->lowest_level))
1953  		write_lock_level = BTRFS_MAX_LEVEL;
1954  
1955  	min_write_lock_level = write_lock_level;
1956  
1957  	if (p->need_commit_sem) {
1958  		ASSERT(p->search_commit_root);
1959  		if (p->nowait) {
1960  			if (!down_read_trylock(&fs_info->commit_root_sem))
1961  				return -EAGAIN;
1962  		} else {
1963  			down_read(&fs_info->commit_root_sem);
1964  		}
1965  	}
1966  
1967  again:
1968  	prev_cmp = -1;
1969  	b = btrfs_search_slot_get_root(root, p, write_lock_level);
1970  	if (IS_ERR(b)) {
1971  		ret = PTR_ERR(b);
1972  		goto done;
1973  	}
1974  
1975  	while (b) {
1976  		int dec = 0;
1977  
1978  		level = btrfs_header_level(b);
1979  
1980  		if (cow) {
1981  			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
1982  
1983  			/*
1984  			 * if we don't really need to cow this block
1985  			 * then we don't want to set the path blocking,
1986  			 * so we test it here
1987  			 */
1988  			if (!should_cow_block(trans, root, b))
1989  				goto cow_done;
1990  
1991  			/*
1992  			 * must have write locks on this node and the
1993  			 * parent
1994  			 */
1995  			if (level > write_lock_level ||
1996  			    (level + 1 > write_lock_level &&
1997  			    level + 1 < BTRFS_MAX_LEVEL &&
1998  			    p->nodes[level + 1])) {
1999  				write_lock_level = level + 1;
2000  				btrfs_release_path(p);
2001  				goto again;
2002  			}
2003  
2004  			if (last_level)
2005  				err = btrfs_cow_block(trans, root, b, NULL, 0,
2006  						      &b,
2007  						      BTRFS_NESTING_COW);
2008  			else
2009  				err = btrfs_cow_block(trans, root, b,
2010  						      p->nodes[level + 1],
2011  						      p->slots[level + 1], &b,
2012  						      BTRFS_NESTING_COW);
2013  			if (err) {
2014  				ret = err;
2015  				goto done;
2016  			}
2017  		}
2018  cow_done:
2019  		p->nodes[level] = b;
2020  
2021  		/*
2022  		 * we have a lock on b and as long as we aren't changing
2023  		 * the tree, there is no way to for the items in b to change.
2024  		 * It is safe to drop the lock on our parent before we
2025  		 * go through the expensive btree search on b.
2026  		 *
2027  		 * If we're inserting or deleting (ins_len != 0), then we might
2028  		 * be changing slot zero, which may require changing the parent.
2029  		 * So, we can't drop the lock until after we know which slot
2030  		 * we're operating on.
2031  		 */
2032  		if (!ins_len && !p->keep_locks) {
2033  			int u = level + 1;
2034  
2035  			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2036  				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2037  				p->locks[u] = 0;
2038  			}
2039  		}
2040  
2041  		if (level == 0) {
2042  			if (ins_len > 0)
2043  				ASSERT(write_lock_level >= 1);
2044  
2045  			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2046  			if (!p->search_for_split)
2047  				unlock_up(p, level, lowest_unlock,
2048  					  min_write_lock_level, NULL);
2049  			goto done;
2050  		}
2051  
2052  		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2053  		if (ret < 0)
2054  			goto done;
2055  		prev_cmp = ret;
2056  
2057  		if (ret && slot > 0) {
2058  			dec = 1;
2059  			slot--;
2060  		}
2061  		p->slots[level] = slot;
2062  		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2063  					     &write_lock_level);
2064  		if (err == -EAGAIN)
2065  			goto again;
2066  		if (err) {
2067  			ret = err;
2068  			goto done;
2069  		}
2070  		b = p->nodes[level];
2071  		slot = p->slots[level];
2072  
2073  		/*
2074  		 * Slot 0 is special, if we change the key we have to update
2075  		 * the parent pointer which means we must have a write lock on
2076  		 * the parent
2077  		 */
2078  		if (slot == 0 && ins_len && write_lock_level < level + 1) {
2079  			write_lock_level = level + 1;
2080  			btrfs_release_path(p);
2081  			goto again;
2082  		}
2083  
2084  		unlock_up(p, level, lowest_unlock, min_write_lock_level,
2085  			  &write_lock_level);
2086  
2087  		if (level == lowest_level) {
2088  			if (dec)
2089  				p->slots[level]++;
2090  			goto done;
2091  		}
2092  
2093  		err = read_block_for_search(root, p, &b, level, slot, key);
2094  		if (err == -EAGAIN)
2095  			goto again;
2096  		if (err) {
2097  			ret = err;
2098  			goto done;
2099  		}
2100  
2101  		if (!p->skip_locking) {
2102  			level = btrfs_header_level(b);
2103  
2104  			btrfs_maybe_reset_lockdep_class(root, b);
2105  
2106  			if (level <= write_lock_level) {
2107  				btrfs_tree_lock(b);
2108  				p->locks[level] = BTRFS_WRITE_LOCK;
2109  			} else {
2110  				if (p->nowait) {
2111  					if (!btrfs_try_tree_read_lock(b)) {
2112  						free_extent_buffer(b);
2113  						ret = -EAGAIN;
2114  						goto done;
2115  					}
2116  				} else {
2117  					btrfs_tree_read_lock(b);
2118  				}
2119  				p->locks[level] = BTRFS_READ_LOCK;
2120  			}
2121  			p->nodes[level] = b;
2122  		}
2123  	}
2124  	ret = 1;
2125  done:
2126  	if (ret < 0 && !p->skip_release_on_error)
2127  		btrfs_release_path(p);
2128  
2129  	if (p->need_commit_sem) {
2130  		int ret2;
2131  
2132  		ret2 = finish_need_commit_sem_search(p);
2133  		up_read(&fs_info->commit_root_sem);
2134  		if (ret2)
2135  			ret = ret2;
2136  	}
2137  
2138  	return ret;
2139  }
2140  ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2141  
2142  /*
2143   * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2144   * current state of the tree together with the operations recorded in the tree
2145   * modification log to search for the key in a previous version of this tree, as
2146   * denoted by the time_seq parameter.
2147   *
2148   * Naturally, there is no support for insert, delete or cow operations.
2149   *
2150   * The resulting path and return value will be set up as if we called
2151   * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2152   */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);
	/* Nowait semantics are only implemented for btrfs_search_slot(). */
	ASSERT(!p->nowait);

	if (p->search_commit_root) {
		/*
		 * Commit root searches don't go through the tree mod log, so
		 * time_seq must be zero here; fall back to a regular search.
		 */
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	/* Get the root node as it looked at time_seq. */
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		/*
		 * No exact match in an internal node: step back one slot so
		 * we descend into the subtree that covers the search key.
		 */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			/* Undo the decrement for the caller, as in btrfs_search_slot(). */
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		/* Rewind the freshly read block back to time_seq via the mod log. */
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
2245  
2246  /*
2247   * helper to use instead of search slot if no exact match is needed but
2248   * instead the next or previous item should be returned.
2249   * When find_higher is true, the next higher item is returned, the next lower
2250   * otherwise.
2251   * When return_any and find_higher are both true, and no higher item is found,
2252   * return the next lower instead.
2253   * When return_any is true and find_higher is false, and no lower item is found,
2254   * return the next higher instead.
2255   * It returns 0 if any item is found, 1 if none is found (tree empty), and
2256   * < 0 on error
2257   */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	/* Exact match (0) or error (< 0): nothing more to do. */
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			/* Slot is past the last item: the next higher item
			 * lives in the next leaf (if any).
			 */
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/* If the slot points one past the last item,
				 * step back onto the last valid item.
				 */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			/* Next lower item is simply the previous slot. */
			--p->slots[0];
		}
	}
	return 0;
}
2322  
2323  /*
2324   * Execute search and call btrfs_previous_item to traverse backwards if the item
2325   * was not found.
2326   *
2327   * Return 0 if found, 1 if not found and < 0 if error.
2328   */
2329  int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2330  			   struct btrfs_path *path)
2331  {
2332  	int ret;
2333  
2334  	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2335  	if (ret > 0)
2336  		ret = btrfs_previous_item(root, path, key->objectid, key->type);
2337  
2338  	if (ret == 0)
2339  		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2340  
2341  	return ret;
2342  }
2343  
2344  /**
2345   * Search for a valid slot for the given path.
2346   *
2347   * @root:	The root node of the tree.
2348   * @key:	Will contain a valid item if found.
2349   * @path:	The starting point to validate the slot.
2350   *
2351   * Return: 0  if the item is valid
2352   *         1  if not found
2353   *         <0 if error.
2354   */
2355  int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2356  			      struct btrfs_path *path)
2357  {
2358  	while (1) {
2359  		int ret;
2360  		const int slot = path->slots[0];
2361  		const struct extent_buffer *leaf = path->nodes[0];
2362  
2363  		/* This is where we start walking the path. */
2364  		if (slot >= btrfs_header_nritems(leaf)) {
2365  			/*
2366  			 * If we've reached the last slot in this leaf we need
2367  			 * to go to the next leaf and reset the path.
2368  			 */
2369  			ret = btrfs_next_leaf(root, path);
2370  			if (ret)
2371  				return ret;
2372  			continue;
2373  		}
2374  		/* Store the found, valid item in @key. */
2375  		btrfs_item_key_to_cpu(leaf, key, slot);
2376  		break;
2377  	}
2378  	return 0;
2379  }
2380  
2381  /*
2382   * adjust the pointers going up the tree, starting at level
2383   * making sure the right key of each node is points to 'key'.
2384   * This is used after shifting pointers to the left, so it stops
2385   * fixing up pointers when a given leaf/node is not in slot 0 of the
2386   * higher levels
2387   *
2388   */
static void fixup_low_keys(struct btrfs_path *path,
			   struct btrfs_disk_key *key, int level)
{
	int i;
	struct extent_buffer *t;
	int ret;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		int tslot = path->slots[i];

		/* Stop once we run past the top of the path. */
		if (!path->nodes[i])
			break;
		t = path->nodes[i];
		/* Log the key replacement before modifying the node. */
		ret = btrfs_tree_mod_log_insert_key(t, tslot,
				BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
		BUG_ON(ret < 0);
		btrfs_set_node_key(t, key, tslot);
		btrfs_mark_buffer_dirty(path->nodes[i]);
		/*
		 * Only slot 0 of a node is mirrored in its parent's key, so
		 * the fixup stops propagating at the first non-zero slot.
		 */
		if (tslot != 0)
			break;
	}
}
2411  
2412  /*
2413   * update item key.
2414   *
2415   * This function isn't completely safe. It's the caller's responsibility
2416   * that the new key won't break the order
2417   */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		/* The new key must still sort strictly after the previous item. */
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		/* ... and still sort strictly before the next item. */
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* Changing slot 0 also changes the key the parent node points at. */
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
2463  
2464  /*
2465   * Check key order of two sibling extent buffers.
2466   *
2467   * Return true if something is wrong.
2468   * Return false if everything is fine.
2469   *
2470   * Tree-checker only works inside one tree block, thus the following
2471   * corruption can not be detected by tree-checker:
2472   *
2473   * Leaf @left			| Leaf @right
2474   * --------------------------------------------------------------
2475   * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2476   *
2477   * Key f6 in leaf @left itself is valid, but not valid when the next
2478   * key in leaf @right is 7.
2479   * This can only be checked at tree block merge time.
2480   * And since tree checker has ensured all key order in each tree block
2481   * is correct, we only need to bother the last key of @left and the first
2482   * key of @right.
2483   */
2484  static bool check_sibling_keys(struct extent_buffer *left,
2485  			       struct extent_buffer *right)
2486  {
2487  	struct btrfs_key left_last;
2488  	struct btrfs_key right_first;
2489  	int level = btrfs_header_level(left);
2490  	int nr_left = btrfs_header_nritems(left);
2491  	int nr_right = btrfs_header_nritems(right);
2492  
2493  	/* No key to check in one of the tree blocks */
2494  	if (!nr_left || !nr_right)
2495  		return false;
2496  
2497  	if (level) {
2498  		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2499  		btrfs_node_key_to_cpu(right, &right_first, 0);
2500  	} else {
2501  		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2502  		btrfs_item_key_to_cpu(right, &right_first, 0);
2503  	}
2504  
2505  	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2506  		btrfs_crit(left->fs_info,
2507  "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2508  			   left_last.objectid, left_last.type,
2509  			   left_last.offset, right_first.objectid,
2510  			   right_first.type, right_first.offset);
2511  		return true;
2512  	}
2513  	return false;
2514  }
2515  
2516  /*
2517   * try to push data from one node into the next node left in the
2518   * tree.
2519   *
2520   * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2521   * error, and > 0 if there was no room in the left hand block.
2522   */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Start with however many free pointer slots dst has. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both blocks are expected to belong to the current transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* Unless emptying src, don't bother with nearly-empty nodes. */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Log the copy in the tree mod log before touching the blocks. */
	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Append the first push_items pointers of src after dst's last slot. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
		 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2593  
2594  /*
2595   * try to push data from one node into the next node right in the
2596   * tree.
2597   *
2598   * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2599   * error, and > 0 if there was no room in the right hand block.
2600   *
2601   * this will  only push up to 1/2 the contents of the left node over
2602   */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* Both blocks are expected to belong to the current transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Start with however many free pointer slots dst has. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* Not worth balancing a nearly-empty node. */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* dst is the right eb, src is the middle eb */
	if (check_sibling_keys(src, dst)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Shift dst's existing pointers right to open room at the front. */
	ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
					 push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Copy the tail of src into the freed space at the head of dst. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2666  
2667  /*
2668   * helper function to insert a new root level in the tree.
2669   * A new node is allocated, and a single item is inserted to
2670   * point to the existing root
2671   *
2672   * returns zero on success or < 0 on failure.
2673   */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* The first key of the old root becomes the key of its pointer. */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* The new root holds a single pointer: the old root node. */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* Log the root swap, then publish the new root under RCU. */
	old = root->node;
	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* Take a ref for the reference the path now holds on c. */
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
2728  
2729  /*
2730   * worker function to insert a single pointer in a node.
2731   * the node should have enough room for the pointer already
2732   *
2733   * slot and level indicate where you want the key to go, and
2734   * blocknr is the block the key points to.
2735   */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_write_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* The caller must have ensured there is room for one more pointer. */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		/* Shift the following pointers right to open up the slot. */
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
					slot, nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		/* Log the key addition in the tree mod log. */
		ret = btrfs_tree_mod_log_insert_key(lower, slot,
					    BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
2774  
2775  /*
2776   * split the node at the specified level in path in two.
2777   * The path is corrected to point to the appropriate node after the split
2778   *
2779   * Before splitting this tries to make some room in the node by pushing
2780   * left and right, if either one works, it returns right away.
2781   *
2782   * returns 0 on success and < 0 on failure
2783   */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
		 * holding a tree lock on the buffer, which is why we cannot
		 * race with other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* Try pushing into siblings first; that may avoid the split. */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* Split point: upper half of c moves into a new sibling block. */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* Log the copy in the tree mod log before touching the blocks. */
	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new block into the parent, right after c. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* Fix up the path if our slot moved into the new block. */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return 0;
}
2865  
2866  /*
2867   * how many bytes are required to store the items in a leaf.  start
2868   * and nr indicate which items in the leaf to check.  This totals up the
2869   * space used both by the item structs and the item data
2870   */
2871  static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2872  {
2873  	int data_len;
2874  	int nritems = btrfs_header_nritems(l);
2875  	int end = min(nritems, start + nr) - 1;
2876  
2877  	if (!nr)
2878  		return 0;
2879  	data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
2880  	data_len = data_len - btrfs_item_offset(l, end);
2881  	data_len += sizeof(struct btrfs_item) * nr;
2882  	WARN_ON(data_len < 0);
2883  	return data_len;
2884  }
2885  
2886  /*
2887   * The space between the end of the leaf items and
2888   * the start of the leaf data.  IOW, how much room
2889   * the leaf has left for both items and data
2890   */
2891  noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2892  {
2893  	struct btrfs_fs_info *fs_info = leaf->fs_info;
2894  	int nritems = btrfs_header_nritems(leaf);
2895  	int ret;
2896  
2897  	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2898  	if (ret < 0) {
2899  		btrfs_crit(fs_info,
2900  			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2901  			   ret,
2902  			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2903  			   leaf_space_used(leaf, 0, nritems), nritems);
2904  	}
2905  	return ret;
2906  }
2907  
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * Returns 0 if at least one item was pushed (in which case the path may
 * have been switched over to 'right'), or 1 if nothing could be pushed.
 * Unless the path is switched to it, 'right' is unlocked and released
 * before returning.
 */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	/*
	 * When emptying the leaf we may push everything; otherwise stop at
	 * min_slot, but always leave at least item 0 behind.
	 */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/*
	 * If the path points past the last item, reserve room in 'right'
	 * for the pending insertion of data_size bytes.
	 */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/*
	 * Walk from the last item down to 'nr', counting how many items
	 * (headers plus data) fit into the right leaf's free space.
	 */
	while (i >= nr) {
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				/*
				 * Stop early if pushing more would leave the
				 * slot we're inserting into short on space.
				 */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* account for the pending insertion at this slot */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, i);
		if (this_item_size + sizeof(struct btrfs_item) +
		    push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total bytes of item data being moved out of 'left' */
	push_space = btrfs_item_data_end(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	/* shift right's existing item headers to open up the first slots */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	/* recompute every data offset in 'right' from the end backwards */
	for (i = 0; i < right_nritems; i++) {
		push_space -= btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; fix the parent's pointer to it */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		/* our slot moved into 'right'; switch the path over to it */
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3046  
3047  /*
3048   * push some data in the path leaf to the right, trying to free up at
3049   * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3050   *
3051   * returns 1 if the push failed because the other node didn't have enough
3052   * room, 0 if everything worked out and < 0 if there were major errors.
3053   *
3054   * this will push starting from min_slot to the end of the leaf.  It won't
3055   * push any slot lower than min_slot
3056   */
3057  static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3058  			   *root, struct btrfs_path *path,
3059  			   int min_data_size, int data_size,
3060  			   int empty, u32 min_slot)
3061  {
3062  	struct extent_buffer *left = path->nodes[0];
3063  	struct extent_buffer *right;
3064  	struct extent_buffer *upper;
3065  	int slot;
3066  	int free_space;
3067  	u32 left_nritems;
3068  	int ret;
3069  
3070  	if (!path->nodes[1])
3071  		return 1;
3072  
3073  	slot = path->slots[1];
3074  	upper = path->nodes[1];
3075  	if (slot >= btrfs_header_nritems(upper) - 1)
3076  		return 1;
3077  
3078  	btrfs_assert_tree_write_locked(path->nodes[1]);
3079  
3080  	right = btrfs_read_node_slot(upper, slot + 1);
3081  	/*
3082  	 * slot + 1 is not valid or we fail to read the right node,
3083  	 * no big deal, just return.
3084  	 */
3085  	if (IS_ERR(right))
3086  		return 1;
3087  
3088  	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3089  
3090  	free_space = btrfs_leaf_free_space(right);
3091  	if (free_space < data_size)
3092  		goto out_unlock;
3093  
3094  	ret = btrfs_cow_block(trans, root, right, upper,
3095  			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3096  	if (ret)
3097  		goto out_unlock;
3098  
3099  	left_nritems = btrfs_header_nritems(left);
3100  	if (left_nritems == 0)
3101  		goto out_unlock;
3102  
3103  	if (check_sibling_keys(left, right)) {
3104  		ret = -EUCLEAN;
3105  		btrfs_tree_unlock(right);
3106  		free_extent_buffer(right);
3107  		return ret;
3108  	}
3109  	if (path->slots[0] == left_nritems && !empty) {
3110  		/* Key greater than all keys in the leaf, right neighbor has
3111  		 * enough room for it and we're not emptying our leaf to delete
3112  		 * it, therefore use right neighbor to insert the new item and
3113  		 * no need to touch/dirty our left leaf. */
3114  		btrfs_tree_unlock(left);
3115  		free_extent_buffer(left);
3116  		path->nodes[0] = right;
3117  		path->slots[0] = 0;
3118  		path->slots[1]++;
3119  		return 0;
3120  	}
3121  
3122  	return __push_leaf_right(path, min_data_size, empty,
3123  				right, free_space, left_nritems, min_slot);
3124  out_unlock:
3125  	btrfs_tree_unlock(right);
3126  	free_extent_buffer(right);
3127  	return 1;
3128  }
3129  
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * Returns 0 if at least one item was pushed (the path may have been
 * switched over to 'left'), 1 otherwise.  Unless the path is switched
 * to it, 'left' is unlocked and released before returning.
 */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	/* when not emptying the leaf, always keep the last item in place */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* count, starting from slot 0, how many items fit into 'left' */
	for (i = 0; i < nr; i++) {
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				/*
				 * Stop early if pushing more would leave the
				 * slot we're inserting into short on space.
				 */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* account for the pending insertion at this slot */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, i);
		if (this_item_size + sizeof(struct btrfs_item) + push_space >
		    free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* bytes of item data being moved into 'left' */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the data offsets of the items that just landed in 'left' */
	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* slide the remaining data and item headers to the front */
		push_space = btrfs_item_offset(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	/* recompute every data offset in the shrunken right leaf */
	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		push_space = push_space - btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clean_tree_block(right);

	/* right's first key changed; propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* our slot moved into 'left'; switch the path over to it */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3275  
3276  /*
3277   * push some data in the path leaf to the left, trying to free up at
3278   * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3279   *
3280   * max_slot can put a limit on how far into the leaf we'll push items.  The
3281   * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3282   * items
3283   */
3284  static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3285  			  *root, struct btrfs_path *path, int min_data_size,
3286  			  int data_size, int empty, u32 max_slot)
3287  {
3288  	struct extent_buffer *right = path->nodes[0];
3289  	struct extent_buffer *left;
3290  	int slot;
3291  	int free_space;
3292  	u32 right_nritems;
3293  	int ret = 0;
3294  
3295  	slot = path->slots[1];
3296  	if (slot == 0)
3297  		return 1;
3298  	if (!path->nodes[1])
3299  		return 1;
3300  
3301  	right_nritems = btrfs_header_nritems(right);
3302  	if (right_nritems == 0)
3303  		return 1;
3304  
3305  	btrfs_assert_tree_write_locked(path->nodes[1]);
3306  
3307  	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3308  	/*
3309  	 * slot - 1 is not valid or we fail to read the left node,
3310  	 * no big deal, just return.
3311  	 */
3312  	if (IS_ERR(left))
3313  		return 1;
3314  
3315  	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3316  
3317  	free_space = btrfs_leaf_free_space(left);
3318  	if (free_space < data_size) {
3319  		ret = 1;
3320  		goto out;
3321  	}
3322  
3323  	ret = btrfs_cow_block(trans, root, left,
3324  			      path->nodes[1], slot - 1, &left,
3325  			      BTRFS_NESTING_LEFT_COW);
3326  	if (ret) {
3327  		/* we hit -ENOSPC, but it isn't fatal here */
3328  		if (ret == -ENOSPC)
3329  			ret = 1;
3330  		goto out;
3331  	}
3332  
3333  	if (check_sibling_keys(left, right)) {
3334  		ret = -EUCLEAN;
3335  		goto out;
3336  	}
3337  	return __push_leaf_left(path, min_data_size,
3338  			       empty, left, free_space, right_nritems,
3339  			       max_slot);
3340  out:
3341  	btrfs_tree_unlock(left);
3342  	free_extent_buffer(left);
3343  	return ret;
3344  }
3345  
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * Moves items [mid, nritems) and their data from leaf 'l' into the new
 * (empty) leaf 'right', inserts a pointer to 'right' into the parent,
 * and leaves the path pointing at whichever leaf now holds the original
 * slot.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* from here on, nritems is the number of items moving to 'right' */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);

	/* copy the item headers for [mid, end) into right's slot 0.. */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy their data to the tail of right's data area */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	/* how much every moved data offset shifts in the new leaf */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);

	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
	}

	/* shrink the original leaf and link 'right' into the parent */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	if (mid <= slot) {
		/* our slot moved into 'right'; switch the path over to it */
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
3407  
3408  /*
3409   * double splits happen when we need to insert a big item in the middle
3410   * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3411   * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3412   *          A                 B                 C
3413   *
3414   * We avoid this by trying to push the items on either side of our target
3415   * into the adjacent leaves.  If all goes well we can avoid the double split
3416   * completely.
3417   */
3418  static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3419  					  struct btrfs_root *root,
3420  					  struct btrfs_path *path,
3421  					  int data_size)
3422  {
3423  	int ret;
3424  	int progress = 0;
3425  	int slot;
3426  	u32 nritems;
3427  	int space_needed = data_size;
3428  
3429  	slot = path->slots[0];
3430  	if (slot < btrfs_header_nritems(path->nodes[0]))
3431  		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3432  
3433  	/*
3434  	 * try to push all the items after our slot into the
3435  	 * right leaf
3436  	 */
3437  	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3438  	if (ret < 0)
3439  		return ret;
3440  
3441  	if (ret == 0)
3442  		progress++;
3443  
3444  	nritems = btrfs_header_nritems(path->nodes[0]);
3445  	/*
3446  	 * our goal is to get our slot at the start or end of a leaf.  If
3447  	 * we've done so we're done
3448  	 */
3449  	if (path->slots[0] == 0 || path->slots[0] == nritems)
3450  		return 0;
3451  
3452  	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3453  		return 0;
3454  
3455  	/* try to push all the items before our slot into the next leaf */
3456  	slot = path->slots[0];
3457  	space_needed = data_size;
3458  	if (slot > 0)
3459  		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3460  	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3461  	if (ret < 0)
3462  		return ret;
3463  
3464  	if (ret == 0)
3465  		progress++;
3466  
3467  	if (progress)
3468  		return 0;
3469  	return 1;
3470  }
3471  
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 *
 * 'split' below takes one of three values:
 *   0 - no items move; a brand new empty leaf is linked in for the
 *       pending insertion
 *   1 - a normal split at 'mid'
 *   2 - a split that will still not leave enough room, so a second
 *       (double) split pass is required
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	/* an extended item can never exceed a whole empty leaf */
	l = path->nodes[0];
	slot = path->slots[0];
	if (extend && data_size + btrfs_item_size(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	/* splitting the root leaf: grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		/* target slot falls in the second half */
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				/* inserting past the end: new empty leaf */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		/* target slot falls in the first half */
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				/* inserting before everything: new empty leaf */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* the new leaf's first key: the pending key, or the item at mid */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	/*
	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
	 * subclasses, which is 8 at the time of this patch, and we've maxed it
	 * out.  In the future we could add a
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, 0, l->start, 0,
				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* link the new empty leaf in and point the path at it */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/*
	 * NOTE(review): the return value of push_for_double_split() is
	 * ignored here (including < 0 errors); we fall back to retrying
	 * the full split below either way - confirm this is intentional.
	 */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
3647  
3648  static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3649  					 struct btrfs_root *root,
3650  					 struct btrfs_path *path, int ins_len)
3651  {
3652  	struct btrfs_key key;
3653  	struct extent_buffer *leaf;
3654  	struct btrfs_file_extent_item *fi;
3655  	u64 extent_len = 0;
3656  	u32 item_size;
3657  	int ret;
3658  
3659  	leaf = path->nodes[0];
3660  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3661  
3662  	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3663  	       key.type != BTRFS_EXTENT_CSUM_KEY);
3664  
3665  	if (btrfs_leaf_free_space(leaf) >= ins_len)
3666  		return 0;
3667  
3668  	item_size = btrfs_item_size(leaf, path->slots[0]);
3669  	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3670  		fi = btrfs_item_ptr(leaf, path->slots[0],
3671  				    struct btrfs_file_extent_item);
3672  		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3673  	}
3674  	btrfs_release_path(path);
3675  
3676  	path->keep_locks = 1;
3677  	path->search_for_split = 1;
3678  	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3679  	path->search_for_split = 0;
3680  	if (ret > 0)
3681  		ret = -EAGAIN;
3682  	if (ret < 0)
3683  		goto err;
3684  
3685  	ret = -EAGAIN;
3686  	leaf = path->nodes[0];
3687  	/* if our item isn't there, return now */
3688  	if (item_size != btrfs_item_size(leaf, path->slots[0]))
3689  		goto err;
3690  
3691  	/* the leaf has  changed, it now has room.  return now */
3692  	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3693  		goto err;
3694  
3695  	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3696  		fi = btrfs_item_ptr(leaf, path->slots[0],
3697  				    struct btrfs_file_extent_item);
3698  		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3699  			goto err;
3700  	}
3701  
3702  	ret = split_leaf(trans, root, &key, path, ins_len, 1);
3703  	if (ret)
3704  		goto err;
3705  
3706  	path->keep_locks = 0;
3707  	btrfs_unlock_up_safe(path, 1);
3708  	return 0;
3709  err:
3710  	path->keep_locks = 0;
3711  	return ret;
3712  }
3713  
/*
 * Split the item at path->slots[0] in two, 'split_offset' bytes from its
 * start.  The original slot keeps its key and the first split_offset
 * bytes of data; the remaining bytes become a new item under 'new_key'
 * inserted right after it.  The leaf must already have room for one
 * extra item header (guaranteed by setup_leaf_for_split()).
 */
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	int orig_slot, slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	orig_slot = path->slots[0];
	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
	item_size = btrfs_item_size(leaf, path->slots[0]);

	/* stash the item's data so it can be rewritten in two pieces */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	/* new item: the tail of the original data, at the original offset */
	btrfs_set_item_offset(leaf, slot, orig_offset);
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	/* original item: shrinks to the first split_offset bytes */
	btrfs_set_item_offset(leaf, orig_slot,
				 orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
3776  
3777  /*
3778   * This function splits a single item into two items,
3779   * giving 'new_key' to the new item and splitting the
3780   * old one at split_offset (from the start of the item).
3781   *
3782   * The path may be released by this operation.  After
3783   * the split, the path is pointing to the old item.  The
3784   * new item is going to be in the same node as the old one.
3785   *
3786   * Note, the item being split must be smaller enough to live alone on
3787   * a tree block with room for one extra struct btrfs_item
3788   *
3789   * This allows us to split the item in place, keeping a lock on the
3790   * leaf the entire time.
3791   */
3792  int btrfs_split_item(struct btrfs_trans_handle *trans,
3793  		     struct btrfs_root *root,
3794  		     struct btrfs_path *path,
3795  		     const struct btrfs_key *new_key,
3796  		     unsigned long split_offset)
3797  {
3798  	int ret;
3799  	ret = setup_leaf_for_split(trans, root, path,
3800  				   sizeof(struct btrfs_item));
3801  	if (ret)
3802  		return ret;
3803  
3804  	ret = split_item(path, new_key, split_offset);
3805  	return ret;
3806  }
3807  
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	/* nothing to do when the item is already the requested size */
	old_size = btrfs_item_size(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	/* items at or after 'slot' all move size_diff bytes toward the end */
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		/* chop off the tail: slide earlier data toward the end */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			/*
			 * An inline file extent keeps its header at the item
			 * start; preserve it before the front bytes go away.
			 */
			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		/* chop off the front: slide this and earlier data along */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* front truncation moves the item's logical start forward */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	btrfs_set_item_size(leaf, slot, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
3903  
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 *
 * The leaf must already have data_size bytes free; the caller is
 * expected to have reserved the space (e.g. via btrfs_search_slot with
 * ins_len).  The new bytes are added at the end of the item's data and
 * are left uninitialized.
 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* growing past the free space is a caller bug */
	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_data_end(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	/* items at or after 'slot' all move data_size bytes toward the front */
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff - data_size);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
3965  
/**
 * setup_items_for_insert - Helper called before inserting one or more items
 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
 * in a function that doesn't call btrfs_search_slot
 *
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @batch:      information about the batch of items to insert
 *
 * The caller must have reserved enough leaf free space for the whole batch
 * (data plus one struct btrfs_item per item); running out of space here is
 * treated as a fatal logic error.
 */
static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	/* Space needed: all the item data plus one item header per item. */
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	/* Inserting in the middle: make room by shifting items and data. */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
		"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i,
						       ioff - batch->total_data_size);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
				      data_end - batch->total_data_size,
				      BTRFS_LEAF_DATA_OFFSET + data_end,
				      old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < batch->nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		data_end -= batch->data_sizes[i];
		btrfs_set_token_item_offset(&token, slot + i, data_end);
		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + batch->nr);
	btrfs_mark_buffer_dirty(leaf);

	/* Negative free space here means the shifts above corrupted the leaf. */
	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4065  
4066  /*
4067   * Insert a new item into a leaf.
4068   *
4069   * @root:      The root of the btree.
4070   * @path:      A path pointing to the target leaf and slot.
4071   * @key:       The key of the new item.
4072   * @data_size: The size of the data associated with the new key.
4073   */
4074  void btrfs_setup_item_for_insert(struct btrfs_root *root,
4075  				 struct btrfs_path *path,
4076  				 const struct btrfs_key *key,
4077  				 u32 data_size)
4078  {
4079  	struct btrfs_item_batch batch;
4080  
4081  	batch.keys = key;
4082  	batch.data_sizes = &data_size;
4083  	batch.total_data_size = data_size;
4084  	batch.nr = 1;
4085  
4086  	setup_items_for_insert(root, path, &batch);
4087  }
4088  
4089  /*
4090   * Given a key and some data, insert items into the tree.
4091   * This does all the path init required, making room in the tree if needed.
4092   */
4093  int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4094  			    struct btrfs_root *root,
4095  			    struct btrfs_path *path,
4096  			    const struct btrfs_item_batch *batch)
4097  {
4098  	int ret = 0;
4099  	int slot;
4100  	u32 total_size;
4101  
4102  	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4103  	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4104  	if (ret == 0)
4105  		return -EEXIST;
4106  	if (ret < 0)
4107  		return ret;
4108  
4109  	slot = path->slots[0];
4110  	BUG_ON(slot < 0);
4111  
4112  	setup_items_for_insert(root, path, batch);
4113  	return 0;
4114  }
4115  
4116  /*
4117   * Given a key and some data, insert an item into the tree.
4118   * This does all the path init required, making room in the tree if needed.
4119   */
4120  int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4121  		      const struct btrfs_key *cpu_key, void *data,
4122  		      u32 data_size)
4123  {
4124  	int ret = 0;
4125  	struct btrfs_path *path;
4126  	struct extent_buffer *leaf;
4127  	unsigned long ptr;
4128  
4129  	path = btrfs_alloc_path();
4130  	if (!path)
4131  		return -ENOMEM;
4132  	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4133  	if (!ret) {
4134  		leaf = path->nodes[0];
4135  		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4136  		write_extent_buffer(leaf, data, ptr, data_size);
4137  		btrfs_mark_buffer_dirty(leaf);
4138  	}
4139  	btrfs_free_path(path);
4140  	return ret;
4141  }
4142  
/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item is
 * contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the leaf
 * the entire time.
 *
 * Returns 0 on success or the error from setup_leaf_for_split().
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	/* Make sure the leaf can hold a second copy of the item. */
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	/* Insert the new item right after the original one. */
	path->slots[0]++;
	btrfs_setup_item_for_insert(root, path, new_key, item_size);
	/* Re-read the leaf from the path before copying the data over. */
	leaf = path->nodes[0];
	/* Copy the original item's data into the new item (previous slot). */
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
4176  
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * @level is the level of @path holding the node, @slot the key pointer to
 * remove.  Removals from non-leaf levels are recorded in the tree mod log.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/* Removing from the middle: log the move of the tail pointers. */
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* Removing the last pointer: only a key-remove record needed. */
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* The node's smallest key changed, propagate it upwards. */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4222  
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* Pin an extra reference so the buffer stays valid across the free. */
	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 *
 * @slot is the first slot to delete and @nr the number of contiguous items
 * to remove.  Returns 0 on success or a negative error from the push
 * helpers used when trying to dissolve a mostly-empty leaf.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	int ret = 0;
	int wret;
	u32 nritems;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	/* Unless deleting the tail items, shift the rest to close the gap. */
	if (slot + nr != nritems) {
		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
		const int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;
		u32 dsize = 0;
		int i;

		/* Total data size of the items being removed. */
		for (i = 0; i < nr; i++)
			dsize += btrfs_item_size(leaf, slot + i);

		/* Close the hole in the data area ... */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* ... and adjust the offsets of the data that was moved. */
		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i, ioff + dsize);
		}

		/* Close the hole in the item header array. */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			/* Deleted the first item, propagate the new low key. */
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/*
		 * Try to delete the leaf if it is mostly empty. We do this by
		 * trying to move all its items into its left and right neighbours.
		 * If we can't move all the items, then we don't delete it - it's
		 * not ideal, but future insertions might fill the leaf with more
		 * items, or items from other leaves might be moved later into our
		 * leaf due to deletions on those leaves.
		 */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			u32 min_push_space;

			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);
			/*
			 * We want to be able to at least push one item to the
			 * left neighbour leaf, and that's the first item.
			 */
			min_push_space = sizeof(struct btrfs_item) +
				btrfs_item_size(leaf, 0);
			wret = push_leaf_left(trans, root, path, 0,
					      min_push_space, 1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				/*
				 * If we were not able to push all items from our
				 * leaf to its left neighbour, then attempt to
				 * either push all the remaining items to the
				 * right neighbour or none. There's no advantage
				 * in pushing only some items, instead of all, as
				 * it's pointless to end up with a leaf having
				 * too few items while the neighbours can be full
				 * or nearly full.
				 */
				nritems = btrfs_header_nritems(leaf);
				min_push_space = leaf_space_used(leaf, 0, nritems);
				wret = push_leaf_right(trans, root, path, 0,
						       min_push_space, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
4386  
/*
 * search the tree again to find a leaf with lesser keys
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	/*
	 * Compute the largest key strictly smaller than the first key of the
	 * current leaf, borrowing from type/objectid when offset underflows.
	 */
	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		/* (0, 0, 0) is the smallest possible key, nothing precedes it. */
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}
4436  
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	/* Force keep_locks for the walk, restore the caller's value at out. */
	int keep_locks = path->keep_locks;

	ASSERT(!path->nowait);
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* The whole tree is older than min_trans, nothing to find. */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* Restart the walk from the root at the new key. */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		/* Descend one level into the child that passed the gen check. */
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
4559  
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	/* Climb towards the root until a node with a next slot is found. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			/*
			 * The parent is not locked: re-search from the last key
			 * of this node with lowest_level set to this level.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match: step past the key we searched for. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip child pointers older than min_trans. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
4635  
4636  int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4637  			u64 time_seq)
4638  {
4639  	int slot;
4640  	int level;
4641  	struct extent_buffer *c;
4642  	struct extent_buffer *next;
4643  	struct btrfs_fs_info *fs_info = root->fs_info;
4644  	struct btrfs_key key;
4645  	bool need_commit_sem = false;
4646  	u32 nritems;
4647  	int ret;
4648  	int i;
4649  
4650  	ASSERT(!path->nowait);
4651  
4652  	nritems = btrfs_header_nritems(path->nodes[0]);
4653  	if (nritems == 0)
4654  		return 1;
4655  
4656  	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4657  again:
4658  	level = 1;
4659  	next = NULL;
4660  	btrfs_release_path(path);
4661  
4662  	path->keep_locks = 1;
4663  
4664  	if (time_seq) {
4665  		ret = btrfs_search_old_slot(root, &key, path, time_seq);
4666  	} else {
4667  		if (path->need_commit_sem) {
4668  			path->need_commit_sem = 0;
4669  			need_commit_sem = true;
4670  			down_read(&fs_info->commit_root_sem);
4671  		}
4672  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4673  	}
4674  	path->keep_locks = 0;
4675  
4676  	if (ret < 0)
4677  		goto done;
4678  
4679  	nritems = btrfs_header_nritems(path->nodes[0]);
4680  	/*
4681  	 * by releasing the path above we dropped all our locks.  A balance
4682  	 * could have added more items next to the key that used to be
4683  	 * at the very end of the block.  So, check again here and
4684  	 * advance the path if there are now more items available.
4685  	 */
4686  	if (nritems > 0 && path->slots[0] < nritems - 1) {
4687  		if (ret == 0)
4688  			path->slots[0]++;
4689  		ret = 0;
4690  		goto done;
4691  	}
4692  	/*
4693  	 * So the above check misses one case:
4694  	 * - after releasing the path above, someone has removed the item that
4695  	 *   used to be at the very end of the block, and balance between leafs
4696  	 *   gets another one with bigger key.offset to replace it.
4697  	 *
4698  	 * This one should be returned as well, or we can get leaf corruption
4699  	 * later(esp. in __btrfs_drop_extents()).
4700  	 *
4701  	 * And a bit more explanation about this check,
4702  	 * with ret > 0, the key isn't found, the path points to the slot
4703  	 * where it should be inserted, so the path->slots[0] item must be the
4704  	 * bigger one.
4705  	 */
4706  	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4707  		ret = 0;
4708  		goto done;
4709  	}
4710  
4711  	while (level < BTRFS_MAX_LEVEL) {
4712  		if (!path->nodes[level]) {
4713  			ret = 1;
4714  			goto done;
4715  		}
4716  
4717  		slot = path->slots[level] + 1;
4718  		c = path->nodes[level];
4719  		if (slot >= btrfs_header_nritems(c)) {
4720  			level++;
4721  			if (level == BTRFS_MAX_LEVEL) {
4722  				ret = 1;
4723  				goto done;
4724  			}
4725  			continue;
4726  		}
4727  
4728  
4729  		/*
4730  		 * Our current level is where we're going to start from, and to
4731  		 * make sure lockdep doesn't complain we need to drop our locks
4732  		 * and nodes from 0 to our current level.
4733  		 */
4734  		for (i = 0; i < level; i++) {
4735  			if (path->locks[level]) {
4736  				btrfs_tree_read_unlock(path->nodes[i]);
4737  				path->locks[i] = 0;
4738  			}
4739  			free_extent_buffer(path->nodes[i]);
4740  			path->nodes[i] = NULL;
4741  		}
4742  
4743  		next = c;
4744  		ret = read_block_for_search(root, path, &next, level,
4745  					    slot, &key);
4746  		if (ret == -EAGAIN)
4747  			goto again;
4748  
4749  		if (ret < 0) {
4750  			btrfs_release_path(path);
4751  			goto done;
4752  		}
4753  
4754  		if (!path->skip_locking) {
4755  			ret = btrfs_try_tree_read_lock(next);
4756  			if (!ret && time_seq) {
4757  				/*
4758  				 * If we don't get the lock, we may be racing
4759  				 * with push_leaf_left, holding that lock while
4760  				 * itself waiting for the leaf we've currently
4761  				 * locked. To solve this situation, we give up
4762  				 * on our lock and cycle.
4763  				 */
4764  				free_extent_buffer(next);
4765  				btrfs_release_path(path);
4766  				cond_resched();
4767  				goto again;
4768  			}
4769  			if (!ret)
4770  				btrfs_tree_read_lock(next);
4771  		}
4772  		break;
4773  	}
4774  	path->slots[level] = slot;
4775  	while (1) {
4776  		level--;
4777  		path->nodes[level] = next;
4778  		path->slots[level] = 0;
4779  		if (!path->skip_locking)
4780  			path->locks[level] = BTRFS_READ_LOCK;
4781  		if (!level)
4782  			break;
4783  
4784  		ret = read_block_for_search(root, path, &next, level,
4785  					    0, &key);
4786  		if (ret == -EAGAIN)
4787  			goto again;
4788  
4789  		if (ret < 0) {
4790  			btrfs_release_path(path);
4791  			goto done;
4792  		}
4793  
4794  		if (!path->skip_locking)
4795  			btrfs_tree_read_lock(next);
4796  	}
4797  	ret = 0;
4798  done:
4799  	unlock_up(path, 0, 1, 0, NULL);
4800  	if (need_commit_sem) {
4801  		int ret2;
4802  
4803  		path->need_commit_sem = 1;
4804  		ret2 = finish_need_commit_sem_search(path);
4805  		up_read(&fs_info->commit_root_sem);
4806  		if (ret2)
4807  			ret = ret2;
4808  	}
4809  
4810  	return ret;
4811  }
4812  
4813  /*
4814   * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4815   * searching until it gets past min_objectid or finds an item of 'type'
4816   *
4817   * returns 0 if something is found, 1 if nothing was found and < 0 on error
4818   */
4819  int btrfs_previous_item(struct btrfs_root *root,
4820  			struct btrfs_path *path, u64 min_objectid,
4821  			int type)
4822  {
4823  	struct btrfs_key found_key;
4824  	struct extent_buffer *leaf;
4825  	u32 nritems;
4826  	int ret;
4827  
4828  	while (1) {
4829  		if (path->slots[0] == 0) {
4830  			ret = btrfs_prev_leaf(root, path);
4831  			if (ret != 0)
4832  				return ret;
4833  		} else {
4834  			path->slots[0]--;
4835  		}
4836  		leaf = path->nodes[0];
4837  		nritems = btrfs_header_nritems(leaf);
4838  		if (nritems == 0)
4839  			return 1;
4840  		if (path->slots[0] == nritems)
4841  			path->slots[0]--;
4842  
4843  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4844  		if (found_key.objectid < min_objectid)
4845  			break;
4846  		if (found_key.type == type)
4847  			return 0;
4848  		if (found_key.objectid == min_objectid &&
4849  		    found_key.type < type)
4850  			break;
4851  	}
4852  	return 1;
4853  }
4854  
4855  /*
4856   * search in extent tree to find a previous Metadata/Data extent item with
4857   * min objecitd.
4858   *
4859   * returns 0 if something is found, 1 if nothing was found and < 0 on error
4860   */
4861  int btrfs_previous_extent_item(struct btrfs_root *root,
4862  			struct btrfs_path *path, u64 min_objectid)
4863  {
4864  	struct btrfs_key found_key;
4865  	struct extent_buffer *leaf;
4866  	u32 nritems;
4867  	int ret;
4868  
4869  	while (1) {
4870  		if (path->slots[0] == 0) {
4871  			ret = btrfs_prev_leaf(root, path);
4872  			if (ret != 0)
4873  				return ret;
4874  		} else {
4875  			path->slots[0]--;
4876  		}
4877  		leaf = path->nodes[0];
4878  		nritems = btrfs_header_nritems(leaf);
4879  		if (nritems == 0)
4880  			return 1;
4881  		if (path->slots[0] == nritems)
4882  			path->slots[0]--;
4883  
4884  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4885  		if (found_key.objectid < min_objectid)
4886  			break;
4887  		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4888  		    found_key.type == BTRFS_METADATA_ITEM_KEY)
4889  			return 0;
4890  		if (found_key.objectid == min_objectid &&
4891  		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
4892  			break;
4893  	}
4894  	return 1;
4895  }
4896