xref: /openbmc/linux/fs/btrfs/ctree.c (revision 8dda2eac)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17 #include "tree-mod-log.h"
18 
19 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
20 		      *root, struct btrfs_path *path, int level);
21 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
22 		      const struct btrfs_key *ins_key, struct btrfs_path *path,
23 		      int data_size, int extend);
24 static int push_node_left(struct btrfs_trans_handle *trans,
25 			  struct extent_buffer *dst,
26 			  struct extent_buffer *src, int empty);
27 static int balance_node_right(struct btrfs_trans_handle *trans,
28 			      struct extent_buffer *dst_buf,
29 			      struct extent_buffer *src_buf);
30 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
31 		    int level, int slot);
32 
33 static const struct btrfs_csums {
34 	u16		size;
35 	const char	name[10];
36 	const char	driver[12];
37 } btrfs_csums[] = {
38 	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
39 	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
40 	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
41 	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
42 				     .driver = "blake2b-256" },
43 };
44 
45 int btrfs_super_csum_size(const struct btrfs_super_block *s)
46 {
47 	u16 t = btrfs_super_csum_type(s);
48 	/*
49 	 * csum type is validated at mount time
50 	 */
51 	return btrfs_csums[t].size;
52 }
53 
54 const char *btrfs_super_csum_name(u16 csum_type)
55 {
56 	/* csum type is validated at mount time */
57 	return btrfs_csums[csum_type].name;
58 }
59 
60 /*
61  * Return driver name if defined, otherwise the name that's also a valid driver
62  * name
63  */
64 const char *btrfs_super_csum_driver(u16 csum_type)
65 {
66 	/* csum type is validated at mount time */
67 	return btrfs_csums[csum_type].driver[0] ?
68 		btrfs_csums[csum_type].driver :
69 		btrfs_csums[csum_type].name;
70 }
71 
72 size_t __attribute_const__ btrfs_get_num_csums(void)
73 {
74 	return ARRAY_SIZE(btrfs_csums);
75 }
76 
77 struct btrfs_path *btrfs_alloc_path(void)
78 {
79 	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
80 }
81 
82 /* this also releases the path */
83 void btrfs_free_path(struct btrfs_path *p)
84 {
85 	if (!p)
86 		return;
87 	btrfs_release_path(p);
88 	kmem_cache_free(btrfs_path_cachep, p);
89 }
90 
91 /*
92  * path release drops references on the extent buffers in the path
93  * and it drops any locks held by this path
94  *
95  * It is safe to call this on paths that no locks or extent buffers held.
96  */
97 noinline void btrfs_release_path(struct btrfs_path *p)
98 {
99 	int i;
100 
101 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
102 		p->slots[i] = 0;
103 		if (!p->nodes[i])
104 			continue;
105 		if (p->locks[i]) {
106 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
107 			p->locks[i] = 0;
108 		}
109 		free_extent_buffer(p->nodes[i]);
110 		p->nodes[i] = NULL;
111 	}
112 }
113 
114 /*
115  * safely gets a reference on the root node of a tree.  A lock
116  * is not taken, so a concurrent writer may put a different node
117  * at the root of the tree.  See btrfs_lock_root_node for the
118  * looping required.
119  *
120  * The extent buffer returned by this has a reference taken, so
121  * it won't disappear.  It may stop being the root of the tree
122  * at any time because there are no locks held.
123  */
124 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
125 {
126 	struct extent_buffer *eb;
127 
128 	while (1) {
129 		rcu_read_lock();
130 		eb = rcu_dereference(root->node);
131 
132 		/*
133 		 * RCU really hurts here, we could free up the root node because
134 		 * it was COWed but we may not get the new root node yet so do
135 		 * the inc_not_zero dance and if it doesn't work then
136 		 * synchronize_rcu and try again.
137 		 */
138 		if (atomic_inc_not_zero(&eb->refs)) {
139 			rcu_read_unlock();
140 			break;
141 		}
142 		rcu_read_unlock();
143 		synchronize_rcu();
144 	}
145 	return eb;
146 }
147 
148 /*
149  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
150  * just get put onto a simple dirty list.  Transaction walks this list to make
151  * sure they get properly updated on disk.
152  */
153 static void add_root_to_dirty_list(struct btrfs_root *root)
154 {
155 	struct btrfs_fs_info *fs_info = root->fs_info;
156 
157 	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
158 	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
159 		return;
160 
161 	spin_lock(&fs_info->trans_lock);
162 	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
163 		/* Want the extent tree to be the last on the list */
164 		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
165 			list_move_tail(&root->dirty_list,
166 				       &fs_info->dirty_cowonly_roots);
167 		else
168 			list_move(&root->dirty_list,
169 				  &fs_info->dirty_cowonly_roots);
170 	}
171 	spin_unlock(&fs_info->trans_lock);
172 }
173 
174 /*
175  * used by snapshot creation to make a copy of a root for a tree with
176  * a given objectid.  The buffer with the new root node is returned in
177  * cow_ret, and this func returns zero on success or a negative error code.
178  */
179 int btrfs_copy_root(struct btrfs_trans_handle *trans,
180 		      struct btrfs_root *root,
181 		      struct extent_buffer *buf,
182 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
183 {
184 	struct btrfs_fs_info *fs_info = root->fs_info;
185 	struct extent_buffer *cow;
186 	int ret = 0;
187 	int level;
188 	struct btrfs_disk_key disk_key;
189 
190 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
191 		trans->transid != fs_info->running_transaction->transid);
192 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
193 		trans->transid != root->last_trans);
194 
195 	level = btrfs_header_level(buf);
196 	if (level == 0)
197 		btrfs_item_key(buf, &disk_key, 0);
198 	else
199 		btrfs_node_key(buf, &disk_key, 0);
200 
201 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
202 				     &disk_key, level, buf->start, 0,
203 				     BTRFS_NESTING_NEW_ROOT);
204 	if (IS_ERR(cow))
205 		return PTR_ERR(cow);
206 
207 	copy_extent_buffer_full(cow, buf);
208 	btrfs_set_header_bytenr(cow, cow->start);
209 	btrfs_set_header_generation(cow, trans->transid);
210 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
211 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
212 				     BTRFS_HEADER_FLAG_RELOC);
213 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
214 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
215 	else
216 		btrfs_set_header_owner(cow, new_root_objectid);
217 
218 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
219 
220 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
221 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
222 		ret = btrfs_inc_ref(trans, root, cow, 1);
223 	else
224 		ret = btrfs_inc_ref(trans, root, cow, 0);
225 	if (ret) {
226 		btrfs_tree_unlock(cow);
227 		free_extent_buffer(cow);
228 		btrfs_abort_transaction(trans, ret);
229 		return ret;
230 	}
231 
232 	btrfs_mark_buffer_dirty(cow);
233 	*cow_ret = cow;
234 	return 0;
235 }
236 
237 /*
238  * check if the tree block can be shared by multiple trees
239  */
240 int btrfs_block_can_be_shared(struct btrfs_root *root,
241 			      struct extent_buffer *buf)
242 {
243 	/*
244 	 * Tree blocks not in shareable trees and tree roots are never shared.
245 	 * If a block was allocated after the last snapshot and the block was
246 	 * not allocated by tree relocation, we know the block is not shared.
247 	 */
248 	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
249 	    buf != root->node && buf != root->commit_root &&
250 	    (btrfs_header_generation(buf) <=
251 	     btrfs_root_last_snapshot(&root->root_item) ||
252 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
253 		return 1;
254 
255 	return 0;
256 }
257 
258 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
259 				       struct btrfs_root *root,
260 				       struct extent_buffer *buf,
261 				       struct extent_buffer *cow,
262 				       int *last_ref)
263 {
264 	struct btrfs_fs_info *fs_info = root->fs_info;
265 	u64 refs;
266 	u64 owner;
267 	u64 flags;
268 	u64 new_flags = 0;
269 	int ret;
270 
271 	/*
272 	 * Backrefs update rules:
273 	 *
274 	 * Always use full backrefs for extent pointers in tree block
275 	 * allocated by tree relocation.
276 	 *
277 	 * If a shared tree block is no longer referenced by its owner
278 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
279 	 * use full backrefs for extent pointers in tree block.
280 	 *
281 	 * If a tree block is been relocating
282 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
283 	 * use full backrefs for extent pointers in tree block.
284 	 * The reason for this is some operations (such as drop tree)
285 	 * are only allowed for blocks use full backrefs.
286 	 */
287 
288 	if (btrfs_block_can_be_shared(root, buf)) {
289 		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
290 					       btrfs_header_level(buf), 1,
291 					       &refs, &flags);
292 		if (ret)
293 			return ret;
294 		if (refs == 0) {
295 			ret = -EROFS;
296 			btrfs_handle_fs_error(fs_info, ret, NULL);
297 			return ret;
298 		}
299 	} else {
300 		refs = 1;
301 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
302 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
303 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
304 		else
305 			flags = 0;
306 	}
307 
308 	owner = btrfs_header_owner(buf);
309 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
310 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
311 
312 	if (refs > 1) {
313 		if ((owner == root->root_key.objectid ||
314 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
315 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
316 			ret = btrfs_inc_ref(trans, root, buf, 1);
317 			if (ret)
318 				return ret;
319 
320 			if (root->root_key.objectid ==
321 			    BTRFS_TREE_RELOC_OBJECTID) {
322 				ret = btrfs_dec_ref(trans, root, buf, 0);
323 				if (ret)
324 					return ret;
325 				ret = btrfs_inc_ref(trans, root, cow, 1);
326 				if (ret)
327 					return ret;
328 			}
329 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
330 		} else {
331 
332 			if (root->root_key.objectid ==
333 			    BTRFS_TREE_RELOC_OBJECTID)
334 				ret = btrfs_inc_ref(trans, root, cow, 1);
335 			else
336 				ret = btrfs_inc_ref(trans, root, cow, 0);
337 			if (ret)
338 				return ret;
339 		}
340 		if (new_flags != 0) {
341 			int level = btrfs_header_level(buf);
342 
343 			ret = btrfs_set_disk_extent_flags(trans, buf,
344 							  new_flags, level, 0);
345 			if (ret)
346 				return ret;
347 		}
348 	} else {
349 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
350 			if (root->root_key.objectid ==
351 			    BTRFS_TREE_RELOC_OBJECTID)
352 				ret = btrfs_inc_ref(trans, root, cow, 1);
353 			else
354 				ret = btrfs_inc_ref(trans, root, cow, 0);
355 			if (ret)
356 				return ret;
357 			ret = btrfs_dec_ref(trans, root, buf, 1);
358 			if (ret)
359 				return ret;
360 		}
361 		btrfs_clean_tree_block(buf);
362 		*last_ref = 1;
363 	}
364 	return 0;
365 }
366 
367 static struct extent_buffer *alloc_tree_block_no_bg_flush(
368 					  struct btrfs_trans_handle *trans,
369 					  struct btrfs_root *root,
370 					  u64 parent_start,
371 					  const struct btrfs_disk_key *disk_key,
372 					  int level,
373 					  u64 hint,
374 					  u64 empty_size,
375 					  enum btrfs_lock_nesting nest)
376 {
377 	struct btrfs_fs_info *fs_info = root->fs_info;
378 	struct extent_buffer *ret;
379 
380 	/*
381 	 * If we are COWing a node/leaf from the extent, chunk, device or free
382 	 * space trees, make sure that we do not finish block group creation of
383 	 * pending block groups. We do this to avoid a deadlock.
384 	 * COWing can result in allocation of a new chunk, and flushing pending
385 	 * block groups (btrfs_create_pending_block_groups()) can be triggered
386 	 * when finishing allocation of a new chunk. Creation of a pending block
387 	 * group modifies the extent, chunk, device and free space trees,
388 	 * therefore we could deadlock with ourselves since we are holding a
389 	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
390 	 * try to COW later.
391 	 * For similar reasons, we also need to delay flushing pending block
392 	 * groups when splitting a leaf or node, from one of those trees, since
393 	 * we are holding a write lock on it and its parent or when inserting a
394 	 * new root node for one of those trees.
395 	 */
396 	if (root == fs_info->extent_root ||
397 	    root == fs_info->chunk_root ||
398 	    root == fs_info->dev_root ||
399 	    root == fs_info->free_space_root)
400 		trans->can_flush_pending_bgs = false;
401 
402 	ret = btrfs_alloc_tree_block(trans, root, parent_start,
403 				     root->root_key.objectid, disk_key, level,
404 				     hint, empty_size, nest);
405 	trans->can_flush_pending_bgs = true;
406 
407 	return ret;
408 }
409 
410 /*
411  * does the dirty work in cow of a single block.  The parent block (if
412  * supplied) is updated to point to the new cow copy.  The new buffer is marked
413  * dirty and returned locked.  If you modify the block it needs to be marked
414  * dirty again.
415  *
416  * search_start -- an allocation hint for the new block
417  *
418  * empty_size -- a hint that you plan on doing more cow.  This is the size in
419  * bytes the allocator should try to find free next to the block it returns.
420  * This is just a hint and may be ignored by the allocator.
421  */
422 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
423 			     struct btrfs_root *root,
424 			     struct extent_buffer *buf,
425 			     struct extent_buffer *parent, int parent_slot,
426 			     struct extent_buffer **cow_ret,
427 			     u64 search_start, u64 empty_size,
428 			     enum btrfs_lock_nesting nest)
429 {
430 	struct btrfs_fs_info *fs_info = root->fs_info;
431 	struct btrfs_disk_key disk_key;
432 	struct extent_buffer *cow;
433 	int level, ret;
434 	int last_ref = 0;
435 	int unlock_orig = 0;
436 	u64 parent_start = 0;
437 
438 	if (*cow_ret == buf)
439 		unlock_orig = 1;
440 
441 	btrfs_assert_tree_locked(buf);
442 
443 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
444 		trans->transid != fs_info->running_transaction->transid);
445 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
446 		trans->transid != root->last_trans);
447 
448 	level = btrfs_header_level(buf);
449 
450 	if (level == 0)
451 		btrfs_item_key(buf, &disk_key, 0);
452 	else
453 		btrfs_node_key(buf, &disk_key, 0);
454 
455 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
456 		parent_start = parent->start;
457 
458 	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
459 					   level, search_start, empty_size, nest);
460 	if (IS_ERR(cow))
461 		return PTR_ERR(cow);
462 
463 	/* cow is set to blocking by btrfs_init_new_buffer */
464 
465 	copy_extent_buffer_full(cow, buf);
466 	btrfs_set_header_bytenr(cow, cow->start);
467 	btrfs_set_header_generation(cow, trans->transid);
468 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
469 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
470 				     BTRFS_HEADER_FLAG_RELOC);
471 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
472 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
473 	else
474 		btrfs_set_header_owner(cow, root->root_key.objectid);
475 
476 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
477 
478 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
479 	if (ret) {
480 		btrfs_tree_unlock(cow);
481 		free_extent_buffer(cow);
482 		btrfs_abort_transaction(trans, ret);
483 		return ret;
484 	}
485 
486 	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
487 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
488 		if (ret) {
489 			btrfs_tree_unlock(cow);
490 			free_extent_buffer(cow);
491 			btrfs_abort_transaction(trans, ret);
492 			return ret;
493 		}
494 	}
495 
496 	if (buf == root->node) {
497 		WARN_ON(parent && parent != buf);
498 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
499 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
500 			parent_start = buf->start;
501 
502 		atomic_inc(&cow->refs);
503 		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
504 		BUG_ON(ret < 0);
505 		rcu_assign_pointer(root->node, cow);
506 
507 		btrfs_free_tree_block(trans, root, buf, parent_start,
508 				      last_ref);
509 		free_extent_buffer(buf);
510 		add_root_to_dirty_list(root);
511 	} else {
512 		WARN_ON(trans->transid != btrfs_header_generation(parent));
513 		btrfs_tree_mod_log_insert_key(parent, parent_slot,
514 					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
515 		btrfs_set_node_blockptr(parent, parent_slot,
516 					cow->start);
517 		btrfs_set_node_ptr_generation(parent, parent_slot,
518 					      trans->transid);
519 		btrfs_mark_buffer_dirty(parent);
520 		if (last_ref) {
521 			ret = btrfs_tree_mod_log_free_eb(buf);
522 			if (ret) {
523 				btrfs_tree_unlock(cow);
524 				free_extent_buffer(cow);
525 				btrfs_abort_transaction(trans, ret);
526 				return ret;
527 			}
528 		}
529 		btrfs_free_tree_block(trans, root, buf, parent_start,
530 				      last_ref);
531 	}
532 	if (unlock_orig)
533 		btrfs_tree_unlock(buf);
534 	free_extent_buffer_stale(buf);
535 	btrfs_mark_buffer_dirty(cow);
536 	*cow_ret = cow;
537 	return 0;
538 }
539 
540 static inline int should_cow_block(struct btrfs_trans_handle *trans,
541 				   struct btrfs_root *root,
542 				   struct extent_buffer *buf)
543 {
544 	if (btrfs_is_testing(root->fs_info))
545 		return 0;
546 
547 	/* Ensure we can see the FORCE_COW bit */
548 	smp_mb__before_atomic();
549 
550 	/*
551 	 * We do not need to cow a block if
552 	 * 1) this block is not created or changed in this transaction;
553 	 * 2) this block does not belong to TREE_RELOC tree;
554 	 * 3) the root is not forced COW.
555 	 *
556 	 * What is forced COW:
557 	 *    when we create snapshot during committing the transaction,
558 	 *    after we've finished copying src root, we must COW the shared
559 	 *    block to ensure the metadata consistency.
560 	 */
561 	if (btrfs_header_generation(buf) == trans->transid &&
562 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
563 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
564 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
565 	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
566 		return 0;
567 	return 1;
568 }
569 
570 /*
571  * cows a single block, see __btrfs_cow_block for the real work.
572  * This version of it has extra checks so that a block isn't COWed more than
573  * once per transaction, as long as it hasn't been written yet
574  */
575 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
576 		    struct btrfs_root *root, struct extent_buffer *buf,
577 		    struct extent_buffer *parent, int parent_slot,
578 		    struct extent_buffer **cow_ret,
579 		    enum btrfs_lock_nesting nest)
580 {
581 	struct btrfs_fs_info *fs_info = root->fs_info;
582 	u64 search_start;
583 	int ret;
584 
585 	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
586 		btrfs_err(fs_info,
587 			"COW'ing blocks on a fs root that's being dropped");
588 
589 	if (trans->transaction != fs_info->running_transaction)
590 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
591 		       trans->transid,
592 		       fs_info->running_transaction->transid);
593 
594 	if (trans->transid != fs_info->generation)
595 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
596 		       trans->transid, fs_info->generation);
597 
598 	if (!should_cow_block(trans, root, buf)) {
599 		*cow_ret = buf;
600 		return 0;
601 	}
602 
603 	search_start = buf->start & ~((u64)SZ_1G - 1);
604 
605 	/*
606 	 * Before CoWing this block for later modification, check if it's
607 	 * the subtree root and do the delayed subtree trace if needed.
608 	 *
609 	 * Also We don't care about the error, as it's handled internally.
610 	 */
611 	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
612 	ret = __btrfs_cow_block(trans, root, buf, parent,
613 				 parent_slot, cow_ret, search_start, 0, nest);
614 
615 	trace_btrfs_cow_block(root, buf, *cow_ret);
616 
617 	return ret;
618 }
619 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
620 
621 /*
622  * helper function for defrag to decide if two blocks pointed to by a
623  * node are actually close by
624  */
625 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
626 {
627 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
628 		return 1;
629 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
630 		return 1;
631 	return 0;
632 }
633 
634 #ifdef __LITTLE_ENDIAN
635 
636 /*
637  * Compare two keys, on little-endian the disk order is same as CPU order and
638  * we can avoid the conversion.
639  */
640 static int comp_keys(const struct btrfs_disk_key *disk_key,
641 		     const struct btrfs_key *k2)
642 {
643 	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
644 
645 	return btrfs_comp_cpu_keys(k1, k2);
646 }
647 
648 #else
649 
650 /*
651  * compare two keys in a memcmp fashion
652  */
653 static int comp_keys(const struct btrfs_disk_key *disk,
654 		     const struct btrfs_key *k2)
655 {
656 	struct btrfs_key k1;
657 
658 	btrfs_disk_key_to_cpu(&k1, disk);
659 
660 	return btrfs_comp_cpu_keys(&k1, k2);
661 }
662 #endif
663 
664 /*
665  * same as comp_keys only with two btrfs_key's
666  */
667 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
668 {
669 	if (k1->objectid > k2->objectid)
670 		return 1;
671 	if (k1->objectid < k2->objectid)
672 		return -1;
673 	if (k1->type > k2->type)
674 		return 1;
675 	if (k1->type < k2->type)
676 		return -1;
677 	if (k1->offset > k2->offset)
678 		return 1;
679 	if (k1->offset < k2->offset)
680 		return -1;
681 	return 0;
682 }
683 
684 /*
685  * this is used by the defrag code to go through all the
686  * leaves pointed to by a node and reallocate them so that
687  * disk order is close to key order
688  */
689 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
690 		       struct btrfs_root *root, struct extent_buffer *parent,
691 		       int start_slot, u64 *last_ret,
692 		       struct btrfs_key *progress)
693 {
694 	struct btrfs_fs_info *fs_info = root->fs_info;
695 	struct extent_buffer *cur;
696 	u64 blocknr;
697 	u64 search_start = *last_ret;
698 	u64 last_block = 0;
699 	u64 other;
700 	u32 parent_nritems;
701 	int end_slot;
702 	int i;
703 	int err = 0;
704 	u32 blocksize;
705 	int progress_passed = 0;
706 	struct btrfs_disk_key disk_key;
707 
708 	WARN_ON(trans->transaction != fs_info->running_transaction);
709 	WARN_ON(trans->transid != fs_info->generation);
710 
711 	parent_nritems = btrfs_header_nritems(parent);
712 	blocksize = fs_info->nodesize;
713 	end_slot = parent_nritems - 1;
714 
715 	if (parent_nritems <= 1)
716 		return 0;
717 
718 	for (i = start_slot; i <= end_slot; i++) {
719 		int close = 1;
720 
721 		btrfs_node_key(parent, &disk_key, i);
722 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
723 			continue;
724 
725 		progress_passed = 1;
726 		blocknr = btrfs_node_blockptr(parent, i);
727 		if (last_block == 0)
728 			last_block = blocknr;
729 
730 		if (i > 0) {
731 			other = btrfs_node_blockptr(parent, i - 1);
732 			close = close_blocks(blocknr, other, blocksize);
733 		}
734 		if (!close && i < end_slot) {
735 			other = btrfs_node_blockptr(parent, i + 1);
736 			close = close_blocks(blocknr, other, blocksize);
737 		}
738 		if (close) {
739 			last_block = blocknr;
740 			continue;
741 		}
742 
743 		cur = btrfs_read_node_slot(parent, i);
744 		if (IS_ERR(cur))
745 			return PTR_ERR(cur);
746 		if (search_start == 0)
747 			search_start = last_block;
748 
749 		btrfs_tree_lock(cur);
750 		err = __btrfs_cow_block(trans, root, cur, parent, i,
751 					&cur, search_start,
752 					min(16 * blocksize,
753 					    (end_slot - i) * blocksize),
754 					BTRFS_NESTING_COW);
755 		if (err) {
756 			btrfs_tree_unlock(cur);
757 			free_extent_buffer(cur);
758 			break;
759 		}
760 		search_start = cur->start;
761 		last_block = cur->start;
762 		*last_ret = search_start;
763 		btrfs_tree_unlock(cur);
764 		free_extent_buffer(cur);
765 	}
766 	return err;
767 }
768 
769 /*
770  * search for key in the extent_buffer.  The items start at offset p,
771  * and they are item_size apart.  There are 'max' items in p.
772  *
773  * the slot in the array is returned via slot, and it points to
774  * the place where you would insert key if it is not found in
775  * the array.
776  *
777  * slot may point to max if the key is bigger than all of the keys
778  */
779 static noinline int generic_bin_search(struct extent_buffer *eb,
780 				       unsigned long p, int item_size,
781 				       const struct btrfs_key *key,
782 				       int max, int *slot)
783 {
784 	int low = 0;
785 	int high = max;
786 	int ret;
787 	const int key_size = sizeof(struct btrfs_disk_key);
788 
789 	if (low > high) {
790 		btrfs_err(eb->fs_info,
791 		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
792 			  __func__, low, high, eb->start,
793 			  btrfs_header_owner(eb), btrfs_header_level(eb));
794 		return -EINVAL;
795 	}
796 
797 	while (low < high) {
798 		unsigned long oip;
799 		unsigned long offset;
800 		struct btrfs_disk_key *tmp;
801 		struct btrfs_disk_key unaligned;
802 		int mid;
803 
804 		mid = (low + high) / 2;
805 		offset = p + mid * item_size;
806 		oip = offset_in_page(offset);
807 
808 		if (oip + key_size <= PAGE_SIZE) {
809 			const unsigned long idx = get_eb_page_index(offset);
810 			char *kaddr = page_address(eb->pages[idx]);
811 
812 			oip = get_eb_offset_in_page(eb, offset);
813 			tmp = (struct btrfs_disk_key *)(kaddr + oip);
814 		} else {
815 			read_extent_buffer(eb, &unaligned, offset, key_size);
816 			tmp = &unaligned;
817 		}
818 
819 		ret = comp_keys(tmp, key);
820 
821 		if (ret < 0)
822 			low = mid + 1;
823 		else if (ret > 0)
824 			high = mid;
825 		else {
826 			*slot = mid;
827 			return 0;
828 		}
829 	}
830 	*slot = low;
831 	return 1;
832 }
833 
834 /*
835  * simple bin_search frontend that does the right thing for
836  * leaves vs nodes
837  */
838 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
839 		     int *slot)
840 {
841 	if (btrfs_header_level(eb) == 0)
842 		return generic_bin_search(eb,
843 					  offsetof(struct btrfs_leaf, items),
844 					  sizeof(struct btrfs_item),
845 					  key, btrfs_header_nritems(eb),
846 					  slot);
847 	else
848 		return generic_bin_search(eb,
849 					  offsetof(struct btrfs_node, ptrs),
850 					  sizeof(struct btrfs_key_ptr),
851 					  key, btrfs_header_nritems(eb),
852 					  slot);
853 }
854 
855 static void root_add_used(struct btrfs_root *root, u32 size)
856 {
857 	spin_lock(&root->accounting_lock);
858 	btrfs_set_root_used(&root->root_item,
859 			    btrfs_root_used(&root->root_item) + size);
860 	spin_unlock(&root->accounting_lock);
861 }
862 
863 static void root_sub_used(struct btrfs_root *root, u32 size)
864 {
865 	spin_lock(&root->accounting_lock);
866 	btrfs_set_root_used(&root->root_item,
867 			    btrfs_root_used(&root->root_item) - size);
868 	spin_unlock(&root->accounting_lock);
869 }
870 
871 /* given a node and slot number, this reads the blocks it points to.  The
872  * extent buffer is returned with a reference taken (but unlocked).
873  */
874 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
875 					   int slot)
876 {
877 	int level = btrfs_header_level(parent);
878 	struct extent_buffer *eb;
879 	struct btrfs_key first_key;
880 
881 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
882 		return ERR_PTR(-ENOENT);
883 
884 	BUG_ON(level == 0);
885 
886 	btrfs_node_key_to_cpu(parent, &first_key, slot);
887 	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
888 			     btrfs_header_owner(parent),
889 			     btrfs_node_ptr_generation(parent, slot),
890 			     level - 1, &first_key);
891 	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
892 		free_extent_buffer(eb);
893 		eb = ERR_PTR(-EIO);
894 	}
895 
896 	return eb;
897 }
898 
899 /*
900  * node level balancing, used to make sure nodes are in proper order for
901  * item deletion.  We balance from the top down, so we have to make sure
902  * that a deletion won't leave an node completely empty later on.
903  */
904 static noinline int balance_level(struct btrfs_trans_handle *trans,
905 			 struct btrfs_root *root,
906 			 struct btrfs_path *path, int level)
907 {
908 	struct btrfs_fs_info *fs_info = root->fs_info;
909 	struct extent_buffer *right = NULL;
910 	struct extent_buffer *mid;
911 	struct extent_buffer *left = NULL;
912 	struct extent_buffer *parent = NULL;
913 	int ret = 0;
914 	int wret;
915 	int pslot;
916 	int orig_slot = path->slots[level];
917 	u64 orig_ptr;
918 
919 	ASSERT(level > 0);
920 
921 	mid = path->nodes[level];
922 
923 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
924 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
925 
926 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
927 
928 	if (level < BTRFS_MAX_LEVEL - 1) {
929 		parent = path->nodes[level + 1];
930 		pslot = path->slots[level + 1];
931 	}
932 
933 	/*
934 	 * deal with the case where there is only one pointer in the root
935 	 * by promoting the node below to a root
936 	 */
937 	if (!parent) {
938 		struct extent_buffer *child;
939 
940 		if (btrfs_header_nritems(mid) != 1)
941 			return 0;
942 
943 		/* promote the child to a root */
944 		child = btrfs_read_node_slot(mid, 0);
945 		if (IS_ERR(child)) {
946 			ret = PTR_ERR(child);
947 			btrfs_handle_fs_error(fs_info, ret, NULL);
948 			goto enospc;
949 		}
950 
951 		btrfs_tree_lock(child);
952 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
953 				      BTRFS_NESTING_COW);
954 		if (ret) {
955 			btrfs_tree_unlock(child);
956 			free_extent_buffer(child);
957 			goto enospc;
958 		}
959 
960 		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
961 		BUG_ON(ret < 0);
962 		rcu_assign_pointer(root->node, child);
963 
964 		add_root_to_dirty_list(root);
965 		btrfs_tree_unlock(child);
966 
967 		path->locks[level] = 0;
968 		path->nodes[level] = NULL;
969 		btrfs_clean_tree_block(mid);
970 		btrfs_tree_unlock(mid);
971 		/* once for the path */
972 		free_extent_buffer(mid);
973 
974 		root_sub_used(root, mid->len);
975 		btrfs_free_tree_block(trans, root, mid, 0, 1);
976 		/* once for the root ptr */
977 		free_extent_buffer_stale(mid);
978 		return 0;
979 	}
980 	if (btrfs_header_nritems(mid) >
981 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
982 		return 0;
983 
984 	left = btrfs_read_node_slot(parent, pslot - 1);
985 	if (IS_ERR(left))
986 		left = NULL;
987 
988 	if (left) {
989 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
990 		wret = btrfs_cow_block(trans, root, left,
991 				       parent, pslot - 1, &left,
992 				       BTRFS_NESTING_LEFT_COW);
993 		if (wret) {
994 			ret = wret;
995 			goto enospc;
996 		}
997 	}
998 
999 	right = btrfs_read_node_slot(parent, pslot + 1);
1000 	if (IS_ERR(right))
1001 		right = NULL;
1002 
1003 	if (right) {
1004 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1005 		wret = btrfs_cow_block(trans, root, right,
1006 				       parent, pslot + 1, &right,
1007 				       BTRFS_NESTING_RIGHT_COW);
1008 		if (wret) {
1009 			ret = wret;
1010 			goto enospc;
1011 		}
1012 	}
1013 
1014 	/* first, try to make some room in the middle buffer */
1015 	if (left) {
1016 		orig_slot += btrfs_header_nritems(left);
1017 		wret = push_node_left(trans, left, mid, 1);
1018 		if (wret < 0)
1019 			ret = wret;
1020 	}
1021 
1022 	/*
1023 	 * then try to empty the right most buffer into the middle
1024 	 */
1025 	if (right) {
1026 		wret = push_node_left(trans, mid, right, 1);
1027 		if (wret < 0 && wret != -ENOSPC)
1028 			ret = wret;
1029 		if (btrfs_header_nritems(right) == 0) {
1030 			btrfs_clean_tree_block(right);
1031 			btrfs_tree_unlock(right);
1032 			del_ptr(root, path, level + 1, pslot + 1);
1033 			root_sub_used(root, right->len);
1034 			btrfs_free_tree_block(trans, root, right, 0, 1);
1035 			free_extent_buffer_stale(right);
1036 			right = NULL;
1037 		} else {
1038 			struct btrfs_disk_key right_key;
1039 			btrfs_node_key(right, &right_key, 0);
1040 			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1041 					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1042 			BUG_ON(ret < 0);
1043 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1044 			btrfs_mark_buffer_dirty(parent);
1045 		}
1046 	}
1047 	if (btrfs_header_nritems(mid) == 1) {
1048 		/*
1049 		 * we're not allowed to leave a node with one item in the
1050 		 * tree during a delete.  A deletion from lower in the tree
1051 		 * could try to delete the only pointer in this node.
1052 		 * So, pull some keys from the left.
1053 		 * There has to be a left pointer at this point because
1054 		 * otherwise we would have pulled some pointers from the
1055 		 * right
1056 		 */
1057 		if (!left) {
1058 			ret = -EROFS;
1059 			btrfs_handle_fs_error(fs_info, ret, NULL);
1060 			goto enospc;
1061 		}
1062 		wret = balance_node_right(trans, mid, left);
1063 		if (wret < 0) {
1064 			ret = wret;
1065 			goto enospc;
1066 		}
1067 		if (wret == 1) {
1068 			wret = push_node_left(trans, left, mid, 1);
1069 			if (wret < 0)
1070 				ret = wret;
1071 		}
1072 		BUG_ON(wret == 1);
1073 	}
1074 	if (btrfs_header_nritems(mid) == 0) {
1075 		btrfs_clean_tree_block(mid);
1076 		btrfs_tree_unlock(mid);
1077 		del_ptr(root, path, level + 1, pslot);
1078 		root_sub_used(root, mid->len);
1079 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1080 		free_extent_buffer_stale(mid);
1081 		mid = NULL;
1082 	} else {
1083 		/* update the parent key to reflect our changes */
1084 		struct btrfs_disk_key mid_key;
1085 		btrfs_node_key(mid, &mid_key, 0);
1086 		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1087 				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1088 		BUG_ON(ret < 0);
1089 		btrfs_set_node_key(parent, &mid_key, pslot);
1090 		btrfs_mark_buffer_dirty(parent);
1091 	}
1092 
1093 	/* update the path */
1094 	if (left) {
1095 		if (btrfs_header_nritems(left) > orig_slot) {
1096 			atomic_inc(&left->refs);
1097 			/* left was locked after cow */
1098 			path->nodes[level] = left;
1099 			path->slots[level + 1] -= 1;
1100 			path->slots[level] = orig_slot;
1101 			if (mid) {
1102 				btrfs_tree_unlock(mid);
1103 				free_extent_buffer(mid);
1104 			}
1105 		} else {
1106 			orig_slot -= btrfs_header_nritems(left);
1107 			path->slots[level] = orig_slot;
1108 		}
1109 	}
1110 	/* double check we haven't messed things up */
1111 	if (orig_ptr !=
1112 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1113 		BUG();
1114 enospc:
1115 	if (right) {
1116 		btrfs_tree_unlock(right);
1117 		free_extent_buffer(right);
1118 	}
1119 	if (left) {
1120 		if (path->nodes[level] != left)
1121 			btrfs_tree_unlock(left);
1122 		free_extent_buffer(left);
1123 	}
1124 	return ret;
1125 }
1126 
1127 /* Node balancing for insertion.  Here we only split or push nodes around
1128  * when they are completely full.  This is also done top down, so we
1129  * have to be pessimistic.
1130  */
1131 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1132 					  struct btrfs_root *root,
1133 					  struct btrfs_path *path, int level)
1134 {
1135 	struct btrfs_fs_info *fs_info = root->fs_info;
1136 	struct extent_buffer *right = NULL;
1137 	struct extent_buffer *mid;
1138 	struct extent_buffer *left = NULL;
1139 	struct extent_buffer *parent = NULL;
1140 	int ret = 0;
1141 	int wret;
1142 	int pslot;
1143 	int orig_slot = path->slots[level];
1144 
1145 	if (level == 0)
1146 		return 1;
1147 
1148 	mid = path->nodes[level];
1149 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1150 
1151 	if (level < BTRFS_MAX_LEVEL - 1) {
1152 		parent = path->nodes[level + 1];
1153 		pslot = path->slots[level + 1];
1154 	}
1155 
1156 	if (!parent)
1157 		return 1;
1158 
1159 	left = btrfs_read_node_slot(parent, pslot - 1);
1160 	if (IS_ERR(left))
1161 		left = NULL;
1162 
1163 	/* first, try to make some room in the middle buffer */
1164 	if (left) {
1165 		u32 left_nr;
1166 
1167 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1168 
1169 		left_nr = btrfs_header_nritems(left);
1170 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1171 			wret = 1;
1172 		} else {
1173 			ret = btrfs_cow_block(trans, root, left, parent,
1174 					      pslot - 1, &left,
1175 					      BTRFS_NESTING_LEFT_COW);
1176 			if (ret)
1177 				wret = 1;
1178 			else {
1179 				wret = push_node_left(trans, left, mid, 0);
1180 			}
1181 		}
1182 		if (wret < 0)
1183 			ret = wret;
1184 		if (wret == 0) {
1185 			struct btrfs_disk_key disk_key;
1186 			orig_slot += left_nr;
1187 			btrfs_node_key(mid, &disk_key, 0);
1188 			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1189 					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1190 			BUG_ON(ret < 0);
1191 			btrfs_set_node_key(parent, &disk_key, pslot);
1192 			btrfs_mark_buffer_dirty(parent);
1193 			if (btrfs_header_nritems(left) > orig_slot) {
1194 				path->nodes[level] = left;
1195 				path->slots[level + 1] -= 1;
1196 				path->slots[level] = orig_slot;
1197 				btrfs_tree_unlock(mid);
1198 				free_extent_buffer(mid);
1199 			} else {
1200 				orig_slot -=
1201 					btrfs_header_nritems(left);
1202 				path->slots[level] = orig_slot;
1203 				btrfs_tree_unlock(left);
1204 				free_extent_buffer(left);
1205 			}
1206 			return 0;
1207 		}
1208 		btrfs_tree_unlock(left);
1209 		free_extent_buffer(left);
1210 	}
1211 	right = btrfs_read_node_slot(parent, pslot + 1);
1212 	if (IS_ERR(right))
1213 		right = NULL;
1214 
1215 	/*
1216 	 * then try to empty the right most buffer into the middle
1217 	 */
1218 	if (right) {
1219 		u32 right_nr;
1220 
1221 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1222 
1223 		right_nr = btrfs_header_nritems(right);
1224 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1225 			wret = 1;
1226 		} else {
1227 			ret = btrfs_cow_block(trans, root, right,
1228 					      parent, pslot + 1,
1229 					      &right, BTRFS_NESTING_RIGHT_COW);
1230 			if (ret)
1231 				wret = 1;
1232 			else {
1233 				wret = balance_node_right(trans, right, mid);
1234 			}
1235 		}
1236 		if (wret < 0)
1237 			ret = wret;
1238 		if (wret == 0) {
1239 			struct btrfs_disk_key disk_key;
1240 
1241 			btrfs_node_key(right, &disk_key, 0);
1242 			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1243 					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1244 			BUG_ON(ret < 0);
1245 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
1246 			btrfs_mark_buffer_dirty(parent);
1247 
1248 			if (btrfs_header_nritems(mid) <= orig_slot) {
1249 				path->nodes[level] = right;
1250 				path->slots[level + 1] += 1;
1251 				path->slots[level] = orig_slot -
1252 					btrfs_header_nritems(mid);
1253 				btrfs_tree_unlock(mid);
1254 				free_extent_buffer(mid);
1255 			} else {
1256 				btrfs_tree_unlock(right);
1257 				free_extent_buffer(right);
1258 			}
1259 			return 0;
1260 		}
1261 		btrfs_tree_unlock(right);
1262 		free_extent_buffer(right);
1263 	}
1264 	return 1;
1265 }
1266 
1267 /*
1268  * readahead one full node of leaves, finding things that are close
1269  * to the block in 'slot', and triggering ra on them.
1270  */
1271 static void reada_for_search(struct btrfs_fs_info *fs_info,
1272 			     struct btrfs_path *path,
1273 			     int level, int slot, u64 objectid)
1274 {
1275 	struct extent_buffer *node;
1276 	struct btrfs_disk_key disk_key;
1277 	u32 nritems;
1278 	u64 search;
1279 	u64 target;
1280 	u64 nread = 0;
1281 	u64 nread_max;
1282 	struct extent_buffer *eb;
1283 	u32 nr;
1284 	u32 blocksize;
1285 	u32 nscan = 0;
1286 
1287 	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1288 		return;
1289 
1290 	if (!path->nodes[level])
1291 		return;
1292 
1293 	node = path->nodes[level];
1294 
1295 	/*
1296 	 * Since the time between visiting leaves is much shorter than the time
1297 	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1298 	 * much IO at once (possibly random).
1299 	 */
1300 	if (path->reada == READA_FORWARD_ALWAYS) {
1301 		if (level > 1)
1302 			nread_max = node->fs_info->nodesize;
1303 		else
1304 			nread_max = SZ_128K;
1305 	} else {
1306 		nread_max = SZ_64K;
1307 	}
1308 
1309 	search = btrfs_node_blockptr(node, slot);
1310 	blocksize = fs_info->nodesize;
1311 	eb = find_extent_buffer(fs_info, search);
1312 	if (eb) {
1313 		free_extent_buffer(eb);
1314 		return;
1315 	}
1316 
1317 	target = search;
1318 
1319 	nritems = btrfs_header_nritems(node);
1320 	nr = slot;
1321 
1322 	while (1) {
1323 		if (path->reada == READA_BACK) {
1324 			if (nr == 0)
1325 				break;
1326 			nr--;
1327 		} else if (path->reada == READA_FORWARD ||
1328 			   path->reada == READA_FORWARD_ALWAYS) {
1329 			nr++;
1330 			if (nr >= nritems)
1331 				break;
1332 		}
1333 		if (path->reada == READA_BACK && objectid) {
1334 			btrfs_node_key(node, &disk_key, nr);
1335 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
1336 				break;
1337 		}
1338 		search = btrfs_node_blockptr(node, nr);
1339 		if (path->reada == READA_FORWARD_ALWAYS ||
1340 		    (search <= target && target - search <= 65536) ||
1341 		    (search > target && search - target <= 65536)) {
1342 			btrfs_readahead_node_child(node, nr);
1343 			nread += blocksize;
1344 		}
1345 		nscan++;
1346 		if (nread > nread_max || nscan > 32)
1347 			break;
1348 	}
1349 }
1350 
1351 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1352 {
1353 	struct extent_buffer *parent;
1354 	int slot;
1355 	int nritems;
1356 
1357 	parent = path->nodes[level + 1];
1358 	if (!parent)
1359 		return;
1360 
1361 	nritems = btrfs_header_nritems(parent);
1362 	slot = path->slots[level + 1];
1363 
1364 	if (slot > 0)
1365 		btrfs_readahead_node_child(parent, slot - 1);
1366 	if (slot + 1 < nritems)
1367 		btrfs_readahead_node_child(parent, slot + 1);
1368 }
1369 
1370 
1371 /*
1372  * when we walk down the tree, it is usually safe to unlock the higher layers
1373  * in the tree.  The exceptions are when our path goes through slot 0, because
1374  * operations on the tree might require changing key pointers higher up in the
1375  * tree.
1376  *
1377  * callers might also have set path->keep_locks, which tells this code to keep
1378  * the lock if the path points to the last slot in the block.  This is part of
1379  * walking through the tree, and selecting the next slot in the higher block.
1380  *
1381  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1382  * if lowest_unlock is 1, level 0 won't be unlocked
1383  */
1384 static noinline void unlock_up(struct btrfs_path *path, int level,
1385 			       int lowest_unlock, int min_write_lock_level,
1386 			       int *write_lock_level)
1387 {
1388 	int i;
1389 	int skip_level = level;
1390 	int no_skips = 0;
1391 	struct extent_buffer *t;
1392 
1393 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1394 		if (!path->nodes[i])
1395 			break;
1396 		if (!path->locks[i])
1397 			break;
1398 		if (!no_skips && path->slots[i] == 0) {
1399 			skip_level = i + 1;
1400 			continue;
1401 		}
1402 		if (!no_skips && path->keep_locks) {
1403 			u32 nritems;
1404 			t = path->nodes[i];
1405 			nritems = btrfs_header_nritems(t);
1406 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1407 				skip_level = i + 1;
1408 				continue;
1409 			}
1410 		}
1411 		if (skip_level < i && i >= lowest_unlock)
1412 			no_skips = 1;
1413 
1414 		t = path->nodes[i];
1415 		if (i >= lowest_unlock && i > skip_level) {
1416 			btrfs_tree_unlock_rw(t, path->locks[i]);
1417 			path->locks[i] = 0;
1418 			if (write_lock_level &&
1419 			    i > min_write_lock_level &&
1420 			    i <= *write_lock_level) {
1421 				*write_lock_level = i - 1;
1422 			}
1423 		}
1424 	}
1425 }
1426 
1427 /*
1428  * helper function for btrfs_search_slot.  The goal is to find a block
1429  * in cache without setting the path to blocking.  If we find the block
1430  * we return zero and the path is unchanged.
1431  *
1432  * If we can't find the block, we set the path blocking and do some
1433  * reada.  -EAGAIN is returned and the search must be repeated.
1434  */
1435 static int
1436 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1437 		      struct extent_buffer **eb_ret, int level, int slot,
1438 		      const struct btrfs_key *key)
1439 {
1440 	struct btrfs_fs_info *fs_info = root->fs_info;
1441 	u64 blocknr;
1442 	u64 gen;
1443 	struct extent_buffer *tmp;
1444 	struct btrfs_key first_key;
1445 	int ret;
1446 	int parent_level;
1447 
1448 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
1449 	gen = btrfs_node_ptr_generation(*eb_ret, slot);
1450 	parent_level = btrfs_header_level(*eb_ret);
1451 	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
1452 
1453 	tmp = find_extent_buffer(fs_info, blocknr);
1454 	if (tmp) {
1455 		if (p->reada == READA_FORWARD_ALWAYS)
1456 			reada_for_search(fs_info, p, level, slot, key->objectid);
1457 
1458 		/* first we do an atomic uptodate check */
1459 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1460 			/*
1461 			 * Do extra check for first_key, eb can be stale due to
1462 			 * being cached, read from scrub, or have multiple
1463 			 * parents (shared tree blocks).
1464 			 */
1465 			if (btrfs_verify_level_key(tmp,
1466 					parent_level - 1, &first_key, gen)) {
1467 				free_extent_buffer(tmp);
1468 				return -EUCLEAN;
1469 			}
1470 			*eb_ret = tmp;
1471 			return 0;
1472 		}
1473 
1474 		/* now we're allowed to do a blocking uptodate check */
1475 		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
1476 		if (!ret) {
1477 			*eb_ret = tmp;
1478 			return 0;
1479 		}
1480 		free_extent_buffer(tmp);
1481 		btrfs_release_path(p);
1482 		return -EIO;
1483 	}
1484 
1485 	/*
1486 	 * reduce lock contention at high levels
1487 	 * of the btree by dropping locks before
1488 	 * we read.  Don't release the lock on the current
1489 	 * level because we need to walk this node to figure
1490 	 * out which blocks to read.
1491 	 */
1492 	btrfs_unlock_up_safe(p, level + 1);
1493 
1494 	if (p->reada != READA_NONE)
1495 		reada_for_search(fs_info, p, level, slot, key->objectid);
1496 
1497 	ret = -EAGAIN;
1498 	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
1499 			      gen, parent_level - 1, &first_key);
1500 	if (!IS_ERR(tmp)) {
1501 		/*
1502 		 * If the read above didn't mark this buffer up to date,
1503 		 * it will never end up being up to date.  Set ret to EIO now
1504 		 * and give up so that our caller doesn't loop forever
1505 		 * on our EAGAINs.
1506 		 */
1507 		if (!extent_buffer_uptodate(tmp))
1508 			ret = -EIO;
1509 		free_extent_buffer(tmp);
1510 	} else {
1511 		ret = PTR_ERR(tmp);
1512 	}
1513 
1514 	btrfs_release_path(p);
1515 	return ret;
1516 }
1517 
1518 /*
1519  * helper function for btrfs_search_slot.  This does all of the checks
1520  * for node-level blocks and does any balancing required based on
1521  * the ins_len.
1522  *
1523  * If no extra work was required, zero is returned.  If we had to
1524  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1525  * start over
1526  */
1527 static int
1528 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1529 		       struct btrfs_root *root, struct btrfs_path *p,
1530 		       struct extent_buffer *b, int level, int ins_len,
1531 		       int *write_lock_level)
1532 {
1533 	struct btrfs_fs_info *fs_info = root->fs_info;
1534 	int ret = 0;
1535 
1536 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1537 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1538 
1539 		if (*write_lock_level < level + 1) {
1540 			*write_lock_level = level + 1;
1541 			btrfs_release_path(p);
1542 			return -EAGAIN;
1543 		}
1544 
1545 		reada_for_balance(p, level);
1546 		ret = split_node(trans, root, p, level);
1547 
1548 		b = p->nodes[level];
1549 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
1550 		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1551 
1552 		if (*write_lock_level < level + 1) {
1553 			*write_lock_level = level + 1;
1554 			btrfs_release_path(p);
1555 			return -EAGAIN;
1556 		}
1557 
1558 		reada_for_balance(p, level);
1559 		ret = balance_level(trans, root, p, level);
1560 		if (ret)
1561 			return ret;
1562 
1563 		b = p->nodes[level];
1564 		if (!b) {
1565 			btrfs_release_path(p);
1566 			return -EAGAIN;
1567 		}
1568 		BUG_ON(btrfs_header_nritems(b) == 1);
1569 	}
1570 	return ret;
1571 }
1572 
1573 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1574 		u64 iobjectid, u64 ioff, u8 key_type,
1575 		struct btrfs_key *found_key)
1576 {
1577 	int ret;
1578 	struct btrfs_key key;
1579 	struct extent_buffer *eb;
1580 
1581 	ASSERT(path);
1582 	ASSERT(found_key);
1583 
1584 	key.type = key_type;
1585 	key.objectid = iobjectid;
1586 	key.offset = ioff;
1587 
1588 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1589 	if (ret < 0)
1590 		return ret;
1591 
1592 	eb = path->nodes[0];
1593 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1594 		ret = btrfs_next_leaf(fs_root, path);
1595 		if (ret)
1596 			return ret;
1597 		eb = path->nodes[0];
1598 	}
1599 
1600 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1601 	if (found_key->type != key.type ||
1602 			found_key->objectid != key.objectid)
1603 		return 1;
1604 
1605 	return 0;
1606 }
1607 
1608 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1609 							struct btrfs_path *p,
1610 							int write_lock_level)
1611 {
1612 	struct btrfs_fs_info *fs_info = root->fs_info;
1613 	struct extent_buffer *b;
1614 	int root_lock;
1615 	int level = 0;
1616 
1617 	/* We try very hard to do read locks on the root */
1618 	root_lock = BTRFS_READ_LOCK;
1619 
1620 	if (p->search_commit_root) {
1621 		/*
1622 		 * The commit roots are read only so we always do read locks,
1623 		 * and we always must hold the commit_root_sem when doing
1624 		 * searches on them, the only exception is send where we don't
1625 		 * want to block transaction commits for a long time, so
1626 		 * we need to clone the commit root in order to avoid races
1627 		 * with transaction commits that create a snapshot of one of
1628 		 * the roots used by a send operation.
1629 		 */
1630 		if (p->need_commit_sem) {
1631 			down_read(&fs_info->commit_root_sem);
1632 			b = btrfs_clone_extent_buffer(root->commit_root);
1633 			up_read(&fs_info->commit_root_sem);
1634 			if (!b)
1635 				return ERR_PTR(-ENOMEM);
1636 
1637 		} else {
1638 			b = root->commit_root;
1639 			atomic_inc(&b->refs);
1640 		}
1641 		level = btrfs_header_level(b);
1642 		/*
1643 		 * Ensure that all callers have set skip_locking when
1644 		 * p->search_commit_root = 1.
1645 		 */
1646 		ASSERT(p->skip_locking == 1);
1647 
1648 		goto out;
1649 	}
1650 
1651 	if (p->skip_locking) {
1652 		b = btrfs_root_node(root);
1653 		level = btrfs_header_level(b);
1654 		goto out;
1655 	}
1656 
1657 	/*
1658 	 * If the level is set to maximum, we can skip trying to get the read
1659 	 * lock.
1660 	 */
1661 	if (write_lock_level < BTRFS_MAX_LEVEL) {
1662 		/*
1663 		 * We don't know the level of the root node until we actually
1664 		 * have it read locked
1665 		 */
1666 		b = btrfs_read_lock_root_node(root);
1667 		level = btrfs_header_level(b);
1668 		if (level > write_lock_level)
1669 			goto out;
1670 
1671 		/* Whoops, must trade for write lock */
1672 		btrfs_tree_read_unlock(b);
1673 		free_extent_buffer(b);
1674 	}
1675 
1676 	b = btrfs_lock_root_node(root);
1677 	root_lock = BTRFS_WRITE_LOCK;
1678 
1679 	/* The level might have changed, check again */
1680 	level = btrfs_header_level(b);
1681 
1682 out:
1683 	p->nodes[level] = b;
1684 	if (!p->skip_locking)
1685 		p->locks[level] = root_lock;
1686 	/*
1687 	 * Callers are responsible for dropping b's references.
1688 	 */
1689 	return b;
1690 }
1691 
1692 
1693 /*
1694  * btrfs_search_slot - look for a key in a tree and perform necessary
1695  * modifications to preserve tree invariants.
1696  *
1697  * @trans:	Handle of transaction, used when modifying the tree
1698  * @p:		Holds all btree nodes along the search path
1699  * @root:	The root node of the tree
1700  * @key:	The key we are looking for
1701  * @ins_len:	Indicates purpose of search:
1702  *              >0  for inserts it's size of item inserted (*)
1703  *              <0  for deletions
1704  *               0  for plain searches, not modifying the tree
1705  *
1706  *              (*) If size of item inserted doesn't include
1707  *              sizeof(struct btrfs_item), then p->search_for_extension must
1708  *              be set.
1709  * @cow:	boolean should CoW operations be performed. Must always be 1
1710  *		when modifying the tree.
1711  *
1712  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1713  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1714  *
1715  * If @key is found, 0 is returned and you can find the item in the leaf level
1716  * of the path (level 0)
1717  *
1718  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1719  * points to the slot where it should be inserted
1720  *
1721  * If an error is encountered while searching the tree a negative error number
1722  * is returned
1723  */
1724 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1725 		      const struct btrfs_key *key, struct btrfs_path *p,
1726 		      int ins_len, int cow)
1727 {
1728 	struct extent_buffer *b;
1729 	int slot;
1730 	int ret;
1731 	int err;
1732 	int level;
1733 	int lowest_unlock = 1;
1734 	/* everything at write_lock_level or lower must be write locked */
1735 	int write_lock_level = 0;
1736 	u8 lowest_level = 0;
1737 	int min_write_lock_level;
1738 	int prev_cmp;
1739 
1740 	lowest_level = p->lowest_level;
1741 	WARN_ON(lowest_level && ins_len > 0);
1742 	WARN_ON(p->nodes[0] != NULL);
1743 	BUG_ON(!cow && ins_len);
1744 
1745 	if (ins_len < 0) {
1746 		lowest_unlock = 2;
1747 
1748 		/* when we are removing items, we might have to go up to level
1749 		 * two as we update tree pointers  Make sure we keep write
1750 		 * for those levels as well
1751 		 */
1752 		write_lock_level = 2;
1753 	} else if (ins_len > 0) {
1754 		/*
1755 		 * for inserting items, make sure we have a write lock on
1756 		 * level 1 so we can update keys
1757 		 */
1758 		write_lock_level = 1;
1759 	}
1760 
1761 	if (!cow)
1762 		write_lock_level = -1;
1763 
1764 	if (cow && (p->keep_locks || p->lowest_level))
1765 		write_lock_level = BTRFS_MAX_LEVEL;
1766 
1767 	min_write_lock_level = write_lock_level;
1768 
1769 again:
1770 	prev_cmp = -1;
1771 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
1772 	if (IS_ERR(b)) {
1773 		ret = PTR_ERR(b);
1774 		goto done;
1775 	}
1776 
1777 	while (b) {
1778 		int dec = 0;
1779 
1780 		level = btrfs_header_level(b);
1781 
1782 		if (cow) {
1783 			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
1784 
1785 			/*
1786 			 * if we don't really need to cow this block
1787 			 * then we don't want to set the path blocking,
1788 			 * so we test it here
1789 			 */
1790 			if (!should_cow_block(trans, root, b))
1791 				goto cow_done;
1792 
1793 			/*
1794 			 * must have write locks on this node and the
1795 			 * parent
1796 			 */
1797 			if (level > write_lock_level ||
1798 			    (level + 1 > write_lock_level &&
1799 			    level + 1 < BTRFS_MAX_LEVEL &&
1800 			    p->nodes[level + 1])) {
1801 				write_lock_level = level + 1;
1802 				btrfs_release_path(p);
1803 				goto again;
1804 			}
1805 
1806 			if (last_level)
1807 				err = btrfs_cow_block(trans, root, b, NULL, 0,
1808 						      &b,
1809 						      BTRFS_NESTING_COW);
1810 			else
1811 				err = btrfs_cow_block(trans, root, b,
1812 						      p->nodes[level + 1],
1813 						      p->slots[level + 1], &b,
1814 						      BTRFS_NESTING_COW);
1815 			if (err) {
1816 				ret = err;
1817 				goto done;
1818 			}
1819 		}
1820 cow_done:
1821 		p->nodes[level] = b;
1822 		/*
1823 		 * Leave path with blocking locks to avoid massive
1824 		 * lock context switch, this is made on purpose.
1825 		 */
1826 
1827 		/*
1828 		 * we have a lock on b and as long as we aren't changing
1829 		 * the tree, there is no way to for the items in b to change.
1830 		 * It is safe to drop the lock on our parent before we
1831 		 * go through the expensive btree search on b.
1832 		 *
1833 		 * If we're inserting or deleting (ins_len != 0), then we might
1834 		 * be changing slot zero, which may require changing the parent.
1835 		 * So, we can't drop the lock until after we know which slot
1836 		 * we're operating on.
1837 		 */
1838 		if (!ins_len && !p->keep_locks) {
1839 			int u = level + 1;
1840 
1841 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
1842 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
1843 				p->locks[u] = 0;
1844 			}
1845 		}
1846 
1847 		/*
1848 		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
1849 		 * we can safely assume the target key will always be in slot 0
1850 		 * on lower levels due to the invariants BTRFS' btree provides,
1851 		 * namely that a btrfs_key_ptr entry always points to the
1852 		 * lowest key in the child node, thus we can skip searching
1853 		 * lower levels
1854 		 */
1855 		if (prev_cmp == 0) {
1856 			slot = 0;
1857 			ret = 0;
1858 		} else {
1859 			ret = btrfs_bin_search(b, key, &slot);
1860 			prev_cmp = ret;
1861 			if (ret < 0)
1862 				goto done;
1863 		}
1864 
1865 		if (level == 0) {
1866 			p->slots[level] = slot;
1867 			/*
1868 			 * Item key already exists. In this case, if we are
1869 			 * allowed to insert the item (for example, in dir_item
1870 			 * case, item key collision is allowed), it will be
1871 			 * merged with the original item. Only the item size
1872 			 * grows, no new btrfs item will be added. If
1873 			 * search_for_extension is not set, ins_len already
1874 			 * accounts the size btrfs_item, deduct it here so leaf
1875 			 * space check will be correct.
1876 			 */
1877 			if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
1878 				ASSERT(ins_len >= sizeof(struct btrfs_item));
1879 				ins_len -= sizeof(struct btrfs_item);
1880 			}
1881 			if (ins_len > 0 &&
1882 			    btrfs_leaf_free_space(b) < ins_len) {
1883 				if (write_lock_level < 1) {
1884 					write_lock_level = 1;
1885 					btrfs_release_path(p);
1886 					goto again;
1887 				}
1888 
1889 				err = split_leaf(trans, root, key,
1890 						 p, ins_len, ret == 0);
1891 
1892 				BUG_ON(err > 0);
1893 				if (err) {
1894 					ret = err;
1895 					goto done;
1896 				}
1897 			}
1898 			if (!p->search_for_split)
1899 				unlock_up(p, level, lowest_unlock,
1900 					  min_write_lock_level, NULL);
1901 			goto done;
1902 		}
1903 		if (ret && slot > 0) {
1904 			dec = 1;
1905 			slot--;
1906 		}
1907 		p->slots[level] = slot;
1908 		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
1909 					     &write_lock_level);
1910 		if (err == -EAGAIN)
1911 			goto again;
1912 		if (err) {
1913 			ret = err;
1914 			goto done;
1915 		}
1916 		b = p->nodes[level];
1917 		slot = p->slots[level];
1918 
1919 		/*
1920 		 * Slot 0 is special, if we change the key we have to update
1921 		 * the parent pointer which means we must have a write lock on
1922 		 * the parent
1923 		 */
1924 		if (slot == 0 && ins_len && write_lock_level < level + 1) {
1925 			write_lock_level = level + 1;
1926 			btrfs_release_path(p);
1927 			goto again;
1928 		}
1929 
1930 		unlock_up(p, level, lowest_unlock, min_write_lock_level,
1931 			  &write_lock_level);
1932 
1933 		if (level == lowest_level) {
1934 			if (dec)
1935 				p->slots[level]++;
1936 			goto done;
1937 		}
1938 
1939 		err = read_block_for_search(root, p, &b, level, slot, key);
1940 		if (err == -EAGAIN)
1941 			goto again;
1942 		if (err) {
1943 			ret = err;
1944 			goto done;
1945 		}
1946 
1947 		if (!p->skip_locking) {
1948 			level = btrfs_header_level(b);
1949 			if (level <= write_lock_level) {
1950 				btrfs_tree_lock(b);
1951 				p->locks[level] = BTRFS_WRITE_LOCK;
1952 			} else {
1953 				btrfs_tree_read_lock(b);
1954 				p->locks[level] = BTRFS_READ_LOCK;
1955 			}
1956 			p->nodes[level] = b;
1957 		}
1958 	}
1959 	ret = 1;
1960 done:
1961 	if (ret < 0 && !p->skip_release_on_error)
1962 		btrfs_release_path(p);
1963 	return ret;
1964 }
1965 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
1966 
1967 /*
1968  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
1969  * current state of the tree together with the operations recorded in the tree
1970  * modification log to search for the key in a previous version of this tree, as
1971  * denoted by the time_seq parameter.
1972  *
1973  * Naturally, there is no support for insert, delete or cow operations.
1974  *
1975  * The resulting path and return value will be set up as if we called
1976  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1977  */
1978 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
1979 			  struct btrfs_path *p, u64 time_seq)
1980 {
1981 	struct btrfs_fs_info *fs_info = root->fs_info;
1982 	struct extent_buffer *b;
1983 	int slot;
1984 	int ret;
1985 	int err;
1986 	int level;
1987 	int lowest_unlock = 1;
1988 	u8 lowest_level = 0;
1989 
1990 	lowest_level = p->lowest_level;
1991 	WARN_ON(p->nodes[0] != NULL);
1992 
1993 	if (p->search_commit_root) {
1994 		BUG_ON(time_seq);
1995 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
1996 	}
1997 
1998 again:
1999 	b = btrfs_get_old_root(root, time_seq);
2000 	if (!b) {
2001 		ret = -EIO;
2002 		goto done;
2003 	}
2004 	level = btrfs_header_level(b);
2005 	p->locks[level] = BTRFS_READ_LOCK;
2006 
2007 	while (b) {
2008 		int dec = 0;
2009 
2010 		level = btrfs_header_level(b);
2011 		p->nodes[level] = b;
2012 
2013 		/*
2014 		 * we have a lock on b and as long as we aren't changing
2015 		 * the tree, there is no way to for the items in b to change.
2016 		 * It is safe to drop the lock on our parent before we
2017 		 * go through the expensive btree search on b.
2018 		 */
2019 		btrfs_unlock_up_safe(p, level + 1);
2020 
2021 		ret = btrfs_bin_search(b, key, &slot);
2022 		if (ret < 0)
2023 			goto done;
2024 
2025 		if (level == 0) {
2026 			p->slots[level] = slot;
2027 			unlock_up(p, level, lowest_unlock, 0, NULL);
2028 			goto done;
2029 		}
2030 
2031 		if (ret && slot > 0) {
2032 			dec = 1;
2033 			slot--;
2034 		}
2035 		p->slots[level] = slot;
2036 		unlock_up(p, level, lowest_unlock, 0, NULL);
2037 
2038 		if (level == lowest_level) {
2039 			if (dec)
2040 				p->slots[level]++;
2041 			goto done;
2042 		}
2043 
2044 		err = read_block_for_search(root, p, &b, level, slot, key);
2045 		if (err == -EAGAIN)
2046 			goto again;
2047 		if (err) {
2048 			ret = err;
2049 			goto done;
2050 		}
2051 
2052 		level = btrfs_header_level(b);
2053 		btrfs_tree_read_lock(b);
2054 		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2055 		if (!b) {
2056 			ret = -ENOMEM;
2057 			goto done;
2058 		}
2059 		p->locks[level] = BTRFS_READ_LOCK;
2060 		p->nodes[level] = b;
2061 	}
2062 	ret = 1;
2063 done:
2064 	if (ret < 0)
2065 		btrfs_release_path(p);
2066 
2067 	return ret;
2068 }
2069 
2070 /*
2071  * helper to use instead of search slot if no exact match is needed but
2072  * instead the next or previous item should be returned.
2073  * When find_higher is true, the next higher item is returned, the next lower
2074  * otherwise.
2075  * When return_any and find_higher are both true, and no higher item is found,
2076  * return the next lower instead.
2077  * When return_any is true and find_higher is false, and no lower item is found,
2078  * return the next higher instead.
2079  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2080  * < 0 on error
2081  */
2082 int btrfs_search_slot_for_read(struct btrfs_root *root,
2083 			       const struct btrfs_key *key,
2084 			       struct btrfs_path *p, int find_higher,
2085 			       int return_any)
2086 {
2087 	int ret;
2088 	struct extent_buffer *leaf;
2089 
2090 again:
2091 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2092 	if (ret <= 0)
2093 		return ret;
2094 	/*
2095 	 * a return value of 1 means the path is at the position where the
2096 	 * item should be inserted. Normally this is the next bigger item,
2097 	 * but in case the previous item is the last in a leaf, path points
2098 	 * to the first free slot in the previous leaf, i.e. at an invalid
2099 	 * item.
2100 	 */
2101 	leaf = p->nodes[0];
2102 
2103 	if (find_higher) {
2104 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2105 			ret = btrfs_next_leaf(root, p);
2106 			if (ret <= 0)
2107 				return ret;
2108 			if (!return_any)
2109 				return 1;
2110 			/*
2111 			 * no higher item found, return the next
2112 			 * lower instead
2113 			 */
2114 			return_any = 0;
2115 			find_higher = 0;
2116 			btrfs_release_path(p);
2117 			goto again;
2118 		}
2119 	} else {
2120 		if (p->slots[0] == 0) {
2121 			ret = btrfs_prev_leaf(root, p);
2122 			if (ret < 0)
2123 				return ret;
2124 			if (!ret) {
2125 				leaf = p->nodes[0];
2126 				if (p->slots[0] == btrfs_header_nritems(leaf))
2127 					p->slots[0]--;
2128 				return 0;
2129 			}
2130 			if (!return_any)
2131 				return 1;
2132 			/*
2133 			 * no lower item found, return the next
2134 			 * higher instead
2135 			 */
2136 			return_any = 0;
2137 			find_higher = 1;
2138 			btrfs_release_path(p);
2139 			goto again;
2140 		} else {
2141 			--p->slots[0];
2142 		}
2143 	}
2144 	return 0;
2145 }
2146 
2147 /*
2148  * adjust the pointers going up the tree, starting at level
2149  * making sure the right key of each node is points to 'key'.
2150  * This is used after shifting pointers to the left, so it stops
2151  * fixing up pointers when a given leaf/node is not in slot 0 of the
2152  * higher levels
2153  *
2154  */
2155 static void fixup_low_keys(struct btrfs_path *path,
2156 			   struct btrfs_disk_key *key, int level)
2157 {
2158 	int i;
2159 	struct extent_buffer *t;
2160 	int ret;
2161 
2162 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2163 		int tslot = path->slots[i];
2164 
2165 		if (!path->nodes[i])
2166 			break;
2167 		t = path->nodes[i];
2168 		ret = btrfs_tree_mod_log_insert_key(t, tslot,
2169 				BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2170 		BUG_ON(ret < 0);
2171 		btrfs_set_node_key(t, key, tslot);
2172 		btrfs_mark_buffer_dirty(path->nodes[i]);
2173 		if (tslot != 0)
2174 			break;
2175 	}
2176 }
2177 
2178 /*
2179  * update item key.
2180  *
2181  * This function isn't completely safe. It's the caller's responsibility
2182  * that the new key won't break the order
2183  */
2184 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2185 			     struct btrfs_path *path,
2186 			     const struct btrfs_key *new_key)
2187 {
2188 	struct btrfs_disk_key disk_key;
2189 	struct extent_buffer *eb;
2190 	int slot;
2191 
2192 	eb = path->nodes[0];
2193 	slot = path->slots[0];
2194 	if (slot > 0) {
2195 		btrfs_item_key(eb, &disk_key, slot - 1);
2196 		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2197 			btrfs_crit(fs_info,
2198 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2199 				   slot, btrfs_disk_key_objectid(&disk_key),
2200 				   btrfs_disk_key_type(&disk_key),
2201 				   btrfs_disk_key_offset(&disk_key),
2202 				   new_key->objectid, new_key->type,
2203 				   new_key->offset);
2204 			btrfs_print_leaf(eb);
2205 			BUG();
2206 		}
2207 	}
2208 	if (slot < btrfs_header_nritems(eb) - 1) {
2209 		btrfs_item_key(eb, &disk_key, slot + 1);
2210 		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2211 			btrfs_crit(fs_info,
2212 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2213 				   slot, btrfs_disk_key_objectid(&disk_key),
2214 				   btrfs_disk_key_type(&disk_key),
2215 				   btrfs_disk_key_offset(&disk_key),
2216 				   new_key->objectid, new_key->type,
2217 				   new_key->offset);
2218 			btrfs_print_leaf(eb);
2219 			BUG();
2220 		}
2221 	}
2222 
2223 	btrfs_cpu_key_to_disk(&disk_key, new_key);
2224 	btrfs_set_item_key(eb, &disk_key, slot);
2225 	btrfs_mark_buffer_dirty(eb);
2226 	if (slot == 0)
2227 		fixup_low_keys(path, &disk_key, 1);
2228 }
2229 
2230 /*
2231  * Check key order of two sibling extent buffers.
2232  *
2233  * Return true if something is wrong.
2234  * Return false if everything is fine.
2235  *
2236  * Tree-checker only works inside one tree block, thus the following
2237  * corruption can not be detected by tree-checker:
2238  *
2239  * Leaf @left			| Leaf @right
2240  * --------------------------------------------------------------
2241  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2242  *
2243  * Key f6 in leaf @left itself is valid, but not valid when the next
2244  * key in leaf @right is 7.
2245  * This can only be checked at tree block merge time.
2246  * And since tree checker has ensured all key order in each tree block
2247  * is correct, we only need to bother the last key of @left and the first
2248  * key of @right.
2249  */
2250 static bool check_sibling_keys(struct extent_buffer *left,
2251 			       struct extent_buffer *right)
2252 {
2253 	struct btrfs_key left_last;
2254 	struct btrfs_key right_first;
2255 	int level = btrfs_header_level(left);
2256 	int nr_left = btrfs_header_nritems(left);
2257 	int nr_right = btrfs_header_nritems(right);
2258 
2259 	/* No key to check in one of the tree blocks */
2260 	if (!nr_left || !nr_right)
2261 		return false;
2262 
2263 	if (level) {
2264 		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2265 		btrfs_node_key_to_cpu(right, &right_first, 0);
2266 	} else {
2267 		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2268 		btrfs_item_key_to_cpu(right, &right_first, 0);
2269 	}
2270 
2271 	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2272 		btrfs_crit(left->fs_info,
2273 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2274 			   left_last.objectid, left_last.type,
2275 			   left_last.offset, right_first.objectid,
2276 			   right_first.type, right_first.offset);
2277 		return true;
2278 	}
2279 	return false;
2280 }
2281 
2282 /*
2283  * try to push data from one node into the next node left in the
2284  * tree.
2285  *
2286  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2287  * error, and > 0 if there was no room in the left hand block.
2288  */
2289 static int push_node_left(struct btrfs_trans_handle *trans,
2290 			  struct extent_buffer *dst,
2291 			  struct extent_buffer *src, int empty)
2292 {
2293 	struct btrfs_fs_info *fs_info = trans->fs_info;
2294 	int push_items = 0;
2295 	int src_nritems;
2296 	int dst_nritems;
2297 	int ret = 0;
2298 
2299 	src_nritems = btrfs_header_nritems(src);
2300 	dst_nritems = btrfs_header_nritems(dst);
2301 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2302 	WARN_ON(btrfs_header_generation(src) != trans->transid);
2303 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
2304 
2305 	if (!empty && src_nritems <= 8)
2306 		return 1;
2307 
2308 	if (push_items <= 0)
2309 		return 1;
2310 
2311 	if (empty) {
2312 		push_items = min(src_nritems, push_items);
2313 		if (push_items < src_nritems) {
2314 			/* leave at least 8 pointers in the node if
2315 			 * we aren't going to empty it
2316 			 */
2317 			if (src_nritems - push_items < 8) {
2318 				if (push_items <= 8)
2319 					return 1;
2320 				push_items -= 8;
2321 			}
2322 		}
2323 	} else
2324 		push_items = min(src_nritems - 8, push_items);
2325 
2326 	/* dst is the left eb, src is the middle eb */
2327 	if (check_sibling_keys(dst, src)) {
2328 		ret = -EUCLEAN;
2329 		btrfs_abort_transaction(trans, ret);
2330 		return ret;
2331 	}
2332 	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2333 	if (ret) {
2334 		btrfs_abort_transaction(trans, ret);
2335 		return ret;
2336 	}
2337 	copy_extent_buffer(dst, src,
2338 			   btrfs_node_key_ptr_offset(dst_nritems),
2339 			   btrfs_node_key_ptr_offset(0),
2340 			   push_items * sizeof(struct btrfs_key_ptr));
2341 
2342 	if (push_items < src_nritems) {
2343 		/*
2344 		 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2345 		 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
2346 		 */
2347 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2348 				      btrfs_node_key_ptr_offset(push_items),
2349 				      (src_nritems - push_items) *
2350 				      sizeof(struct btrfs_key_ptr));
2351 	}
2352 	btrfs_set_header_nritems(src, src_nritems - push_items);
2353 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2354 	btrfs_mark_buffer_dirty(src);
2355 	btrfs_mark_buffer_dirty(dst);
2356 
2357 	return ret;
2358 }
2359 
2360 /*
2361  * try to push data from one node into the next node right in the
2362  * tree.
2363  *
2364  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2365  * error, and > 0 if there was no room in the right hand block.
2366  *
2367  * this will  only push up to 1/2 the contents of the left node over
2368  */
2369 static int balance_node_right(struct btrfs_trans_handle *trans,
2370 			      struct extent_buffer *dst,
2371 			      struct extent_buffer *src)
2372 {
2373 	struct btrfs_fs_info *fs_info = trans->fs_info;
2374 	int push_items = 0;
2375 	int max_push;
2376 	int src_nritems;
2377 	int dst_nritems;
2378 	int ret = 0;
2379 
2380 	WARN_ON(btrfs_header_generation(src) != trans->transid);
2381 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
2382 
2383 	src_nritems = btrfs_header_nritems(src);
2384 	dst_nritems = btrfs_header_nritems(dst);
2385 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2386 	if (push_items <= 0)
2387 		return 1;
2388 
2389 	if (src_nritems < 4)
2390 		return 1;
2391 
2392 	max_push = src_nritems / 2 + 1;
2393 	/* don't try to empty the node */
2394 	if (max_push >= src_nritems)
2395 		return 1;
2396 
2397 	if (max_push < push_items)
2398 		push_items = max_push;
2399 
2400 	/* dst is the right eb, src is the middle eb */
2401 	if (check_sibling_keys(src, dst)) {
2402 		ret = -EUCLEAN;
2403 		btrfs_abort_transaction(trans, ret);
2404 		return ret;
2405 	}
2406 	ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2407 	BUG_ON(ret < 0);
2408 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2409 				      btrfs_node_key_ptr_offset(0),
2410 				      (dst_nritems) *
2411 				      sizeof(struct btrfs_key_ptr));
2412 
2413 	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2414 					 push_items);
2415 	if (ret) {
2416 		btrfs_abort_transaction(trans, ret);
2417 		return ret;
2418 	}
2419 	copy_extent_buffer(dst, src,
2420 			   btrfs_node_key_ptr_offset(0),
2421 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
2422 			   push_items * sizeof(struct btrfs_key_ptr));
2423 
2424 	btrfs_set_header_nritems(src, src_nritems - push_items);
2425 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2426 
2427 	btrfs_mark_buffer_dirty(src);
2428 	btrfs_mark_buffer_dirty(dst);
2429 
2430 	return ret;
2431 }
2432 
2433 /*
2434  * helper function to insert a new root level in the tree.
2435  * A new node is allocated, and a single item is inserted to
2436  * point to the existing root
2437  *
2438  * returns zero on success or < 0 on failure.
2439  */
2440 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2441 			   struct btrfs_root *root,
2442 			   struct btrfs_path *path, int level)
2443 {
2444 	struct btrfs_fs_info *fs_info = root->fs_info;
2445 	u64 lower_gen;
2446 	struct extent_buffer *lower;
2447 	struct extent_buffer *c;
2448 	struct extent_buffer *old;
2449 	struct btrfs_disk_key lower_key;
2450 	int ret;
2451 
2452 	BUG_ON(path->nodes[level]);
2453 	BUG_ON(path->nodes[level-1] != root->node);
2454 
2455 	lower = path->nodes[level-1];
2456 	if (level == 1)
2457 		btrfs_item_key(lower, &lower_key, 0);
2458 	else
2459 		btrfs_node_key(lower, &lower_key, 0);
2460 
2461 	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
2462 					 root->node->start, 0,
2463 					 BTRFS_NESTING_NEW_ROOT);
2464 	if (IS_ERR(c))
2465 		return PTR_ERR(c);
2466 
2467 	root_add_used(root, fs_info->nodesize);
2468 
2469 	btrfs_set_header_nritems(c, 1);
2470 	btrfs_set_node_key(c, &lower_key, 0);
2471 	btrfs_set_node_blockptr(c, 0, lower->start);
2472 	lower_gen = btrfs_header_generation(lower);
2473 	WARN_ON(lower_gen != trans->transid);
2474 
2475 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
2476 
2477 	btrfs_mark_buffer_dirty(c);
2478 
2479 	old = root->node;
2480 	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2481 	BUG_ON(ret < 0);
2482 	rcu_assign_pointer(root->node, c);
2483 
2484 	/* the super has an extra ref to root->node */
2485 	free_extent_buffer(old);
2486 
2487 	add_root_to_dirty_list(root);
2488 	atomic_inc(&c->refs);
2489 	path->nodes[level] = c;
2490 	path->locks[level] = BTRFS_WRITE_LOCK;
2491 	path->slots[level] = 0;
2492 	return 0;
2493 }
2494 
2495 /*
2496  * worker function to insert a single pointer in a node.
2497  * the node should have enough room for the pointer already
2498  *
2499  * slot and level indicate where you want the key to go, and
2500  * blocknr is the block the key points to.
2501  */
2502 static void insert_ptr(struct btrfs_trans_handle *trans,
2503 		       struct btrfs_path *path,
2504 		       struct btrfs_disk_key *key, u64 bytenr,
2505 		       int slot, int level)
2506 {
2507 	struct extent_buffer *lower;
2508 	int nritems;
2509 	int ret;
2510 
2511 	BUG_ON(!path->nodes[level]);
2512 	btrfs_assert_tree_locked(path->nodes[level]);
2513 	lower = path->nodes[level];
2514 	nritems = btrfs_header_nritems(lower);
2515 	BUG_ON(slot > nritems);
2516 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2517 	if (slot != nritems) {
2518 		if (level) {
2519 			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2520 					slot, nritems - slot);
2521 			BUG_ON(ret < 0);
2522 		}
2523 		memmove_extent_buffer(lower,
2524 			      btrfs_node_key_ptr_offset(slot + 1),
2525 			      btrfs_node_key_ptr_offset(slot),
2526 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
2527 	}
2528 	if (level) {
2529 		ret = btrfs_tree_mod_log_insert_key(lower, slot,
2530 					    BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
2531 		BUG_ON(ret < 0);
2532 	}
2533 	btrfs_set_node_key(lower, key, slot);
2534 	btrfs_set_node_blockptr(lower, slot, bytenr);
2535 	WARN_ON(trans->transid == 0);
2536 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2537 	btrfs_set_header_nritems(lower, nritems + 1);
2538 	btrfs_mark_buffer_dirty(lower);
2539 }
2540 
2541 /*
2542  * split the node at the specified level in path in two.
2543  * The path is corrected to point to the appropriate node after the split
2544  *
2545  * Before splitting this tries to make some room in the node by pushing
2546  * left and right, if either one works, it returns right away.
2547  *
2548  * returns 0 on success and < 0 on failure
2549  */
2550 static noinline int split_node(struct btrfs_trans_handle *trans,
2551 			       struct btrfs_root *root,
2552 			       struct btrfs_path *path, int level)
2553 {
2554 	struct btrfs_fs_info *fs_info = root->fs_info;
2555 	struct extent_buffer *c;
2556 	struct extent_buffer *split;
2557 	struct btrfs_disk_key disk_key;
2558 	int mid;
2559 	int ret;
2560 	u32 c_nritems;
2561 
2562 	c = path->nodes[level];
2563 	WARN_ON(btrfs_header_generation(c) != trans->transid);
2564 	if (c == root->node) {
2565 		/*
2566 		 * trying to split the root, lets make a new one
2567 		 *
2568 		 * tree mod log: We don't log_removal old root in
2569 		 * insert_new_root, because that root buffer will be kept as a
2570 		 * normal node. We are going to log removal of half of the
2571 		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2572 		 * holding a tree lock on the buffer, which is why we cannot
2573 		 * race with other tree_mod_log users.
2574 		 */
2575 		ret = insert_new_root(trans, root, path, level + 1);
2576 		if (ret)
2577 			return ret;
2578 	} else {
2579 		ret = push_nodes_for_insert(trans, root, path, level);
2580 		c = path->nodes[level];
2581 		if (!ret && btrfs_header_nritems(c) <
2582 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2583 			return 0;
2584 		if (ret < 0)
2585 			return ret;
2586 	}
2587 
2588 	c_nritems = btrfs_header_nritems(c);
2589 	mid = (c_nritems + 1) / 2;
2590 	btrfs_node_key(c, &disk_key, mid);
2591 
2592 	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
2593 					     c->start, 0, BTRFS_NESTING_SPLIT);
2594 	if (IS_ERR(split))
2595 		return PTR_ERR(split);
2596 
2597 	root_add_used(root, fs_info->nodesize);
2598 	ASSERT(btrfs_header_level(c) == level);
2599 
2600 	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2601 	if (ret) {
2602 		btrfs_abort_transaction(trans, ret);
2603 		return ret;
2604 	}
2605 	copy_extent_buffer(split, c,
2606 			   btrfs_node_key_ptr_offset(0),
2607 			   btrfs_node_key_ptr_offset(mid),
2608 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2609 	btrfs_set_header_nritems(split, c_nritems - mid);
2610 	btrfs_set_header_nritems(c, mid);
2611 
2612 	btrfs_mark_buffer_dirty(c);
2613 	btrfs_mark_buffer_dirty(split);
2614 
2615 	insert_ptr(trans, path, &disk_key, split->start,
2616 		   path->slots[level + 1] + 1, level + 1);
2617 
2618 	if (path->slots[level] >= mid) {
2619 		path->slots[level] -= mid;
2620 		btrfs_tree_unlock(c);
2621 		free_extent_buffer(c);
2622 		path->nodes[level] = split;
2623 		path->slots[level + 1] += 1;
2624 	} else {
2625 		btrfs_tree_unlock(split);
2626 		free_extent_buffer(split);
2627 	}
2628 	return 0;
2629 }
2630 
2631 /*
2632  * how many bytes are required to store the items in a leaf.  start
2633  * and nr indicate which items in the leaf to check.  This totals up the
2634  * space used both by the item structs and the item data
2635  */
2636 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2637 {
2638 	struct btrfs_item *start_item;
2639 	struct btrfs_item *end_item;
2640 	int data_len;
2641 	int nritems = btrfs_header_nritems(l);
2642 	int end = min(nritems, start + nr) - 1;
2643 
2644 	if (!nr)
2645 		return 0;
2646 	start_item = btrfs_item_nr(start);
2647 	end_item = btrfs_item_nr(end);
2648 	data_len = btrfs_item_offset(l, start_item) +
2649 		   btrfs_item_size(l, start_item);
2650 	data_len = data_len - btrfs_item_offset(l, end_item);
2651 	data_len += sizeof(struct btrfs_item) * nr;
2652 	WARN_ON(data_len < 0);
2653 	return data_len;
2654 }
2655 
2656 /*
2657  * The space between the end of the leaf items and
2658  * the start of the leaf data.  IOW, how much room
2659  * the leaf has left for both items and data
2660  */
2661 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2662 {
2663 	struct btrfs_fs_info *fs_info = leaf->fs_info;
2664 	int nritems = btrfs_header_nritems(leaf);
2665 	int ret;
2666 
2667 	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2668 	if (ret < 0) {
2669 		btrfs_crit(fs_info,
2670 			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2671 			   ret,
2672 			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2673 			   leaf_space_used(leaf, 0, nritems), nritems);
2674 	}
2675 	return ret;
2676 }
2677 
2678 /*
2679  * min slot controls the lowest index we're willing to push to the
2680  * right.  We'll push up to and including min_slot, but no lower
2681  */
2682 static noinline int __push_leaf_right(struct btrfs_path *path,
2683 				      int data_size, int empty,
2684 				      struct extent_buffer *right,
2685 				      int free_space, u32 left_nritems,
2686 				      u32 min_slot)
2687 {
2688 	struct btrfs_fs_info *fs_info = right->fs_info;
2689 	struct extent_buffer *left = path->nodes[0];
2690 	struct extent_buffer *upper = path->nodes[1];
2691 	struct btrfs_map_token token;
2692 	struct btrfs_disk_key disk_key;
2693 	int slot;
2694 	u32 i;
2695 	int push_space = 0;
2696 	int push_items = 0;
2697 	struct btrfs_item *item;
2698 	u32 nr;
2699 	u32 right_nritems;
2700 	u32 data_end;
2701 	u32 this_item_size;
2702 
2703 	if (empty)
2704 		nr = 0;
2705 	else
2706 		nr = max_t(u32, 1, min_slot);
2707 
2708 	if (path->slots[0] >= left_nritems)
2709 		push_space += data_size;
2710 
2711 	slot = path->slots[1];
2712 	i = left_nritems - 1;
2713 	while (i >= nr) {
2714 		item = btrfs_item_nr(i);
2715 
2716 		if (!empty && push_items > 0) {
2717 			if (path->slots[0] > i)
2718 				break;
2719 			if (path->slots[0] == i) {
2720 				int space = btrfs_leaf_free_space(left);
2721 
2722 				if (space + push_space * 2 > free_space)
2723 					break;
2724 			}
2725 		}
2726 
2727 		if (path->slots[0] == i)
2728 			push_space += data_size;
2729 
2730 		this_item_size = btrfs_item_size(left, item);
2731 		if (this_item_size + sizeof(*item) + push_space > free_space)
2732 			break;
2733 
2734 		push_items++;
2735 		push_space += this_item_size + sizeof(*item);
2736 		if (i == 0)
2737 			break;
2738 		i--;
2739 	}
2740 
2741 	if (push_items == 0)
2742 		goto out_unlock;
2743 
2744 	WARN_ON(!empty && push_items == left_nritems);
2745 
2746 	/* push left to right */
2747 	right_nritems = btrfs_header_nritems(right);
2748 
2749 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2750 	push_space -= leaf_data_end(left);
2751 
2752 	/* make room in the right data area */
2753 	data_end = leaf_data_end(right);
2754 	memmove_extent_buffer(right,
2755 			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
2756 			      BTRFS_LEAF_DATA_OFFSET + data_end,
2757 			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
2758 
2759 	/* copy from the left data area */
2760 	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
2761 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
2762 		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
2763 		     push_space);
2764 
2765 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2766 			      btrfs_item_nr_offset(0),
2767 			      right_nritems * sizeof(struct btrfs_item));
2768 
2769 	/* copy the items from left to right */
2770 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2771 		   btrfs_item_nr_offset(left_nritems - push_items),
2772 		   push_items * sizeof(struct btrfs_item));
2773 
2774 	/* update the item pointers */
2775 	btrfs_init_map_token(&token, right);
2776 	right_nritems += push_items;
2777 	btrfs_set_header_nritems(right, right_nritems);
2778 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
2779 	for (i = 0; i < right_nritems; i++) {
2780 		item = btrfs_item_nr(i);
2781 		push_space -= btrfs_token_item_size(&token, item);
2782 		btrfs_set_token_item_offset(&token, item, push_space);
2783 	}
2784 
2785 	left_nritems -= push_items;
2786 	btrfs_set_header_nritems(left, left_nritems);
2787 
2788 	if (left_nritems)
2789 		btrfs_mark_buffer_dirty(left);
2790 	else
2791 		btrfs_clean_tree_block(left);
2792 
2793 	btrfs_mark_buffer_dirty(right);
2794 
2795 	btrfs_item_key(right, &disk_key, 0);
2796 	btrfs_set_node_key(upper, &disk_key, slot + 1);
2797 	btrfs_mark_buffer_dirty(upper);
2798 
2799 	/* then fixup the leaf pointer in the path */
2800 	if (path->slots[0] >= left_nritems) {
2801 		path->slots[0] -= left_nritems;
2802 		if (btrfs_header_nritems(path->nodes[0]) == 0)
2803 			btrfs_clean_tree_block(path->nodes[0]);
2804 		btrfs_tree_unlock(path->nodes[0]);
2805 		free_extent_buffer(path->nodes[0]);
2806 		path->nodes[0] = right;
2807 		path->slots[1] += 1;
2808 	} else {
2809 		btrfs_tree_unlock(right);
2810 		free_extent_buffer(right);
2811 	}
2812 	return 0;
2813 
2814 out_unlock:
2815 	btrfs_tree_unlock(right);
2816 	free_extent_buffer(right);
2817 	return 1;
2818 }
2819 
2820 /*
2821  * push some data in the path leaf to the right, trying to free up at
2822  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2823  *
2824  * returns 1 if the push failed because the other node didn't have enough
2825  * room, 0 if everything worked out and < 0 if there were major errors.
2826  *
2827  * this will push starting from min_slot to the end of the leaf.  It won't
2828  * push any slot lower than min_slot
2829  */
2830 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2831 			   *root, struct btrfs_path *path,
2832 			   int min_data_size, int data_size,
2833 			   int empty, u32 min_slot)
2834 {
2835 	struct extent_buffer *left = path->nodes[0];
2836 	struct extent_buffer *right;
2837 	struct extent_buffer *upper;
2838 	int slot;
2839 	int free_space;
2840 	u32 left_nritems;
2841 	int ret;
2842 
2843 	if (!path->nodes[1])
2844 		return 1;
2845 
2846 	slot = path->slots[1];
2847 	upper = path->nodes[1];
2848 	if (slot >= btrfs_header_nritems(upper) - 1)
2849 		return 1;
2850 
2851 	btrfs_assert_tree_locked(path->nodes[1]);
2852 
2853 	right = btrfs_read_node_slot(upper, slot + 1);
2854 	/*
2855 	 * slot + 1 is not valid or we fail to read the right node,
2856 	 * no big deal, just return.
2857 	 */
2858 	if (IS_ERR(right))
2859 		return 1;
2860 
2861 	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2862 
2863 	free_space = btrfs_leaf_free_space(right);
2864 	if (free_space < data_size)
2865 		goto out_unlock;
2866 
2867 	/* cow and double check */
2868 	ret = btrfs_cow_block(trans, root, right, upper,
2869 			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
2870 	if (ret)
2871 		goto out_unlock;
2872 
2873 	free_space = btrfs_leaf_free_space(right);
2874 	if (free_space < data_size)
2875 		goto out_unlock;
2876 
2877 	left_nritems = btrfs_header_nritems(left);
2878 	if (left_nritems == 0)
2879 		goto out_unlock;
2880 
2881 	if (check_sibling_keys(left, right)) {
2882 		ret = -EUCLEAN;
2883 		btrfs_tree_unlock(right);
2884 		free_extent_buffer(right);
2885 		return ret;
2886 	}
2887 	if (path->slots[0] == left_nritems && !empty) {
2888 		/* Key greater than all keys in the leaf, right neighbor has
2889 		 * enough room for it and we're not emptying our leaf to delete
2890 		 * it, therefore use right neighbor to insert the new item and
2891 		 * no need to touch/dirty our left leaf. */
2892 		btrfs_tree_unlock(left);
2893 		free_extent_buffer(left);
2894 		path->nodes[0] = right;
2895 		path->slots[0] = 0;
2896 		path->slots[1]++;
2897 		return 0;
2898 	}
2899 
2900 	return __push_leaf_right(path, min_data_size, empty,
2901 				right, free_space, left_nritems, min_slot);
2902 out_unlock:
2903 	btrfs_tree_unlock(right);
2904 	free_extent_buffer(right);
2905 	return 1;
2906 }
2907 
2908 /*
2909  * push some data in the path leaf to the left, trying to free up at
2910  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2911  *
2912  * max_slot can put a limit on how far into the leaf we'll push items.  The
2913  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2914  * items
2915  */
2916 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
2917 				     int empty, struct extent_buffer *left,
2918 				     int free_space, u32 right_nritems,
2919 				     u32 max_slot)
2920 {
2921 	struct btrfs_fs_info *fs_info = left->fs_info;
2922 	struct btrfs_disk_key disk_key;
2923 	struct extent_buffer *right = path->nodes[0];
2924 	int i;
2925 	int push_space = 0;
2926 	int push_items = 0;
2927 	struct btrfs_item *item;
2928 	u32 old_left_nritems;
2929 	u32 nr;
2930 	int ret = 0;
2931 	u32 this_item_size;
2932 	u32 old_left_item_size;
2933 	struct btrfs_map_token token;
2934 
2935 	if (empty)
2936 		nr = min(right_nritems, max_slot);
2937 	else
2938 		nr = min(right_nritems - 1, max_slot);
2939 
2940 	for (i = 0; i < nr; i++) {
2941 		item = btrfs_item_nr(i);
2942 
2943 		if (!empty && push_items > 0) {
2944 			if (path->slots[0] < i)
2945 				break;
2946 			if (path->slots[0] == i) {
2947 				int space = btrfs_leaf_free_space(right);
2948 
2949 				if (space + push_space * 2 > free_space)
2950 					break;
2951 			}
2952 		}
2953 
2954 		if (path->slots[0] == i)
2955 			push_space += data_size;
2956 
2957 		this_item_size = btrfs_item_size(right, item);
2958 		if (this_item_size + sizeof(*item) + push_space > free_space)
2959 			break;
2960 
2961 		push_items++;
2962 		push_space += this_item_size + sizeof(*item);
2963 	}
2964 
2965 	if (push_items == 0) {
2966 		ret = 1;
2967 		goto out;
2968 	}
2969 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
2970 
2971 	/* push data from right to left */
2972 	copy_extent_buffer(left, right,
2973 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
2974 			   btrfs_item_nr_offset(0),
2975 			   push_items * sizeof(struct btrfs_item));
2976 
2977 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
2978 		     btrfs_item_offset_nr(right, push_items - 1);
2979 
2980 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
2981 		     leaf_data_end(left) - push_space,
2982 		     BTRFS_LEAF_DATA_OFFSET +
2983 		     btrfs_item_offset_nr(right, push_items - 1),
2984 		     push_space);
2985 	old_left_nritems = btrfs_header_nritems(left);
2986 	BUG_ON(old_left_nritems <= 0);
2987 
2988 	btrfs_init_map_token(&token, left);
2989 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2990 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2991 		u32 ioff;
2992 
2993 		item = btrfs_item_nr(i);
2994 
2995 		ioff = btrfs_token_item_offset(&token, item);
2996 		btrfs_set_token_item_offset(&token, item,
2997 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
2998 	}
2999 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3000 
3001 	/* fixup right node */
3002 	if (push_items > right_nritems)
3003 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3004 		       right_nritems);
3005 
3006 	if (push_items < right_nritems) {
3007 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3008 						  leaf_data_end(right);
3009 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3010 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3011 				      BTRFS_LEAF_DATA_OFFSET +
3012 				      leaf_data_end(right), push_space);
3013 
3014 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3015 			      btrfs_item_nr_offset(push_items),
3016 			     (btrfs_header_nritems(right) - push_items) *
3017 			     sizeof(struct btrfs_item));
3018 	}
3019 
3020 	btrfs_init_map_token(&token, right);
3021 	right_nritems -= push_items;
3022 	btrfs_set_header_nritems(right, right_nritems);
3023 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3024 	for (i = 0; i < right_nritems; i++) {
3025 		item = btrfs_item_nr(i);
3026 
3027 		push_space = push_space - btrfs_token_item_size(&token, item);
3028 		btrfs_set_token_item_offset(&token, item, push_space);
3029 	}
3030 
3031 	btrfs_mark_buffer_dirty(left);
3032 	if (right_nritems)
3033 		btrfs_mark_buffer_dirty(right);
3034 	else
3035 		btrfs_clean_tree_block(right);
3036 
3037 	btrfs_item_key(right, &disk_key, 0);
3038 	fixup_low_keys(path, &disk_key, 1);
3039 
3040 	/* then fixup the leaf pointer in the path */
3041 	if (path->slots[0] < push_items) {
3042 		path->slots[0] += old_left_nritems;
3043 		btrfs_tree_unlock(path->nodes[0]);
3044 		free_extent_buffer(path->nodes[0]);
3045 		path->nodes[0] = left;
3046 		path->slots[1] -= 1;
3047 	} else {
3048 		btrfs_tree_unlock(left);
3049 		free_extent_buffer(left);
3050 		path->slots[0] -= push_items;
3051 	}
3052 	BUG_ON(path->slots[0] < 0);
3053 	return ret;
3054 out:
3055 	btrfs_tree_unlock(left);
3056 	free_extent_buffer(left);
3057 	return ret;
3058 }
3059 
3060 /*
3061  * push some data in the path leaf to the left, trying to free up at
3062  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3063  *
3064  * max_slot can put a limit on how far into the leaf we'll push items.  The
3065  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3066  * items
3067  */
3068 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3069 			  *root, struct btrfs_path *path, int min_data_size,
3070 			  int data_size, int empty, u32 max_slot)
3071 {
3072 	struct extent_buffer *right = path->nodes[0];
3073 	struct extent_buffer *left;
3074 	int slot;
3075 	int free_space;
3076 	u32 right_nritems;
3077 	int ret = 0;
3078 
3079 	slot = path->slots[1];
3080 	if (slot == 0)
3081 		return 1;
3082 	if (!path->nodes[1])
3083 		return 1;
3084 
3085 	right_nritems = btrfs_header_nritems(right);
3086 	if (right_nritems == 0)
3087 		return 1;
3088 
3089 	btrfs_assert_tree_locked(path->nodes[1]);
3090 
3091 	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3092 	/*
3093 	 * slot - 1 is not valid or we fail to read the left node,
3094 	 * no big deal, just return.
3095 	 */
3096 	if (IS_ERR(left))
3097 		return 1;
3098 
3099 	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3100 
3101 	free_space = btrfs_leaf_free_space(left);
3102 	if (free_space < data_size) {
3103 		ret = 1;
3104 		goto out;
3105 	}
3106 
3107 	/* cow and double check */
3108 	ret = btrfs_cow_block(trans, root, left,
3109 			      path->nodes[1], slot - 1, &left,
3110 			      BTRFS_NESTING_LEFT_COW);
3111 	if (ret) {
3112 		/* we hit -ENOSPC, but it isn't fatal here */
3113 		if (ret == -ENOSPC)
3114 			ret = 1;
3115 		goto out;
3116 	}
3117 
3118 	free_space = btrfs_leaf_free_space(left);
3119 	if (free_space < data_size) {
3120 		ret = 1;
3121 		goto out;
3122 	}
3123 
3124 	if (check_sibling_keys(left, right)) {
3125 		ret = -EUCLEAN;
3126 		goto out;
3127 	}
3128 	return __push_leaf_left(path, min_data_size,
3129 			       empty, left, free_space, right_nritems,
3130 			       max_slot);
3131 out:
3132 	btrfs_tree_unlock(left);
3133 	free_extent_buffer(left);
3134 	return ret;
3135 }
3136 
3137 /*
3138  * split the path's leaf in two, making sure there is at least data_size
3139  * available for the resulting leaf level of the path.
3140  */
3141 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3142 				    struct btrfs_path *path,
3143 				    struct extent_buffer *l,
3144 				    struct extent_buffer *right,
3145 				    int slot, int mid, int nritems)
3146 {
3147 	struct btrfs_fs_info *fs_info = trans->fs_info;
3148 	int data_copy_size;
3149 	int rt_data_off;
3150 	int i;
3151 	struct btrfs_disk_key disk_key;
3152 	struct btrfs_map_token token;
3153 
3154 	nritems = nritems - mid;
3155 	btrfs_set_header_nritems(right, nritems);
3156 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
3157 
3158 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3159 			   btrfs_item_nr_offset(mid),
3160 			   nritems * sizeof(struct btrfs_item));
3161 
3162 	copy_extent_buffer(right, l,
3163 		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3164 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
3165 		     leaf_data_end(l), data_copy_size);
3166 
3167 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
3168 
3169 	btrfs_init_map_token(&token, right);
3170 	for (i = 0; i < nritems; i++) {
3171 		struct btrfs_item *item = btrfs_item_nr(i);
3172 		u32 ioff;
3173 
3174 		ioff = btrfs_token_item_offset(&token, item);
3175 		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
3176 	}
3177 
3178 	btrfs_set_header_nritems(l, mid);
3179 	btrfs_item_key(right, &disk_key, 0);
3180 	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3181 
3182 	btrfs_mark_buffer_dirty(right);
3183 	btrfs_mark_buffer_dirty(l);
3184 	BUG_ON(path->slots[0] != slot);
3185 
3186 	if (mid <= slot) {
3187 		btrfs_tree_unlock(path->nodes[0]);
3188 		free_extent_buffer(path->nodes[0]);
3189 		path->nodes[0] = right;
3190 		path->slots[0] -= mid;
3191 		path->slots[1] += 1;
3192 	} else {
3193 		btrfs_tree_unlock(right);
3194 		free_extent_buffer(right);
3195 	}
3196 
3197 	BUG_ON(path->slots[0] < 0);
3198 }
3199 
3200 /*
3201  * double splits happen when we need to insert a big item in the middle
3202  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3203  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3204  *          A                 B                 C
3205  *
3206  * We avoid this by trying to push the items on either side of our target
3207  * into the adjacent leaves.  If all goes well we can avoid the double split
3208  * completely.
3209  */
3210 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3211 					  struct btrfs_root *root,
3212 					  struct btrfs_path *path,
3213 					  int data_size)
3214 {
3215 	int ret;
3216 	int progress = 0;
3217 	int slot;
3218 	u32 nritems;
3219 	int space_needed = data_size;
3220 
3221 	slot = path->slots[0];
3222 	if (slot < btrfs_header_nritems(path->nodes[0]))
3223 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3224 
3225 	/*
3226 	 * try to push all the items after our slot into the
3227 	 * right leaf
3228 	 */
3229 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3230 	if (ret < 0)
3231 		return ret;
3232 
3233 	if (ret == 0)
3234 		progress++;
3235 
3236 	nritems = btrfs_header_nritems(path->nodes[0]);
3237 	/*
3238 	 * our goal is to get our slot at the start or end of a leaf.  If
3239 	 * we've done so we're done
3240 	 */
3241 	if (path->slots[0] == 0 || path->slots[0] == nritems)
3242 		return 0;
3243 
3244 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3245 		return 0;
3246 
3247 	/* try to push all the items before our slot into the next leaf */
3248 	slot = path->slots[0];
3249 	space_needed = data_size;
3250 	if (slot > 0)
3251 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3252 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3253 	if (ret < 0)
3254 		return ret;
3255 
3256 	if (ret == 0)
3257 		progress++;
3258 
3259 	if (progress)
3260 		return 0;
3261 	return 1;
3262 }
3263 
3264 /*
3265  * split the path's leaf in two, making sure there is at least data_size
3266  * available for the resulting leaf level of the path.
3267  *
3268  * returns 0 if all went well and < 0 on failure.
3269  */
3270 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3271 			       struct btrfs_root *root,
3272 			       const struct btrfs_key *ins_key,
3273 			       struct btrfs_path *path, int data_size,
3274 			       int extend)
3275 {
3276 	struct btrfs_disk_key disk_key;
3277 	struct extent_buffer *l;
3278 	u32 nritems;
3279 	int mid;
3280 	int slot;
3281 	struct extent_buffer *right;
3282 	struct btrfs_fs_info *fs_info = root->fs_info;
3283 	int ret = 0;
3284 	int wret;
3285 	int split;
3286 	int num_doubles = 0;
3287 	int tried_avoid_double = 0;
3288 
3289 	l = path->nodes[0];
3290 	slot = path->slots[0];
3291 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
3292 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3293 		return -EOVERFLOW;
3294 
3295 	/* first try to make some room by pushing left and right */
3296 	if (data_size && path->nodes[1]) {
3297 		int space_needed = data_size;
3298 
3299 		if (slot < btrfs_header_nritems(l))
3300 			space_needed -= btrfs_leaf_free_space(l);
3301 
3302 		wret = push_leaf_right(trans, root, path, space_needed,
3303 				       space_needed, 0, 0);
3304 		if (wret < 0)
3305 			return wret;
3306 		if (wret) {
3307 			space_needed = data_size;
3308 			if (slot > 0)
3309 				space_needed -= btrfs_leaf_free_space(l);
3310 			wret = push_leaf_left(trans, root, path, space_needed,
3311 					      space_needed, 0, (u32)-1);
3312 			if (wret < 0)
3313 				return wret;
3314 		}
3315 		l = path->nodes[0];
3316 
3317 		/* did the pushes work? */
3318 		if (btrfs_leaf_free_space(l) >= data_size)
3319 			return 0;
3320 	}
3321 
3322 	if (!path->nodes[1]) {
3323 		ret = insert_new_root(trans, root, path, 1);
3324 		if (ret)
3325 			return ret;
3326 	}
3327 again:
3328 	split = 1;
3329 	l = path->nodes[0];
3330 	slot = path->slots[0];
3331 	nritems = btrfs_header_nritems(l);
3332 	mid = (nritems + 1) / 2;
3333 
3334 	if (mid <= slot) {
3335 		if (nritems == 1 ||
3336 		    leaf_space_used(l, mid, nritems - mid) + data_size >
3337 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
3338 			if (slot >= nritems) {
3339 				split = 0;
3340 			} else {
3341 				mid = slot;
3342 				if (mid != nritems &&
3343 				    leaf_space_used(l, mid, nritems - mid) +
3344 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3345 					if (data_size && !tried_avoid_double)
3346 						goto push_for_double;
3347 					split = 2;
3348 				}
3349 			}
3350 		}
3351 	} else {
3352 		if (leaf_space_used(l, 0, mid) + data_size >
3353 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
3354 			if (!extend && data_size && slot == 0) {
3355 				split = 0;
3356 			} else if ((extend || !data_size) && slot == 0) {
3357 				mid = 1;
3358 			} else {
3359 				mid = slot;
3360 				if (mid != nritems &&
3361 				    leaf_space_used(l, mid, nritems - mid) +
3362 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3363 					if (data_size && !tried_avoid_double)
3364 						goto push_for_double;
3365 					split = 2;
3366 				}
3367 			}
3368 		}
3369 	}
3370 
3371 	if (split == 0)
3372 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
3373 	else
3374 		btrfs_item_key(l, &disk_key, mid);
3375 
3376 	/*
3377 	 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3378 	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3379 	 * subclasses, which is 8 at the time of this patch, and we've maxed it
3380 	 * out.  In the future we could add a
3381 	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3382 	 * use BTRFS_NESTING_NEW_ROOT.
3383 	 */
3384 	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
3385 					     l->start, 0, num_doubles ?
3386 					     BTRFS_NESTING_NEW_ROOT :
3387 					     BTRFS_NESTING_SPLIT);
3388 	if (IS_ERR(right))
3389 		return PTR_ERR(right);
3390 
3391 	root_add_used(root, fs_info->nodesize);
3392 
3393 	if (split == 0) {
3394 		if (mid <= slot) {
3395 			btrfs_set_header_nritems(right, 0);
3396 			insert_ptr(trans, path, &disk_key,
3397 				   right->start, path->slots[1] + 1, 1);
3398 			btrfs_tree_unlock(path->nodes[0]);
3399 			free_extent_buffer(path->nodes[0]);
3400 			path->nodes[0] = right;
3401 			path->slots[0] = 0;
3402 			path->slots[1] += 1;
3403 		} else {
3404 			btrfs_set_header_nritems(right, 0);
3405 			insert_ptr(trans, path, &disk_key,
3406 				   right->start, path->slots[1], 1);
3407 			btrfs_tree_unlock(path->nodes[0]);
3408 			free_extent_buffer(path->nodes[0]);
3409 			path->nodes[0] = right;
3410 			path->slots[0] = 0;
3411 			if (path->slots[1] == 0)
3412 				fixup_low_keys(path, &disk_key, 1);
3413 		}
3414 		/*
3415 		 * We create a new leaf 'right' for the required ins_len and
3416 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3417 		 * the content of ins_len to 'right'.
3418 		 */
3419 		return ret;
3420 	}
3421 
3422 	copy_for_split(trans, path, l, right, slot, mid, nritems);
3423 
3424 	if (split == 2) {
3425 		BUG_ON(num_doubles != 0);
3426 		num_doubles++;
3427 		goto again;
3428 	}
3429 
3430 	return 0;
3431 
3432 push_for_double:
3433 	push_for_double_split(trans, root, path, data_size);
3434 	tried_avoid_double = 1;
3435 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3436 		return 0;
3437 	goto again;
3438 }
3439 
3440 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3441 					 struct btrfs_root *root,
3442 					 struct btrfs_path *path, int ins_len)
3443 {
3444 	struct btrfs_key key;
3445 	struct extent_buffer *leaf;
3446 	struct btrfs_file_extent_item *fi;
3447 	u64 extent_len = 0;
3448 	u32 item_size;
3449 	int ret;
3450 
3451 	leaf = path->nodes[0];
3452 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3453 
3454 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3455 	       key.type != BTRFS_EXTENT_CSUM_KEY);
3456 
3457 	if (btrfs_leaf_free_space(leaf) >= ins_len)
3458 		return 0;
3459 
3460 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3461 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3462 		fi = btrfs_item_ptr(leaf, path->slots[0],
3463 				    struct btrfs_file_extent_item);
3464 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3465 	}
3466 	btrfs_release_path(path);
3467 
3468 	path->keep_locks = 1;
3469 	path->search_for_split = 1;
3470 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3471 	path->search_for_split = 0;
3472 	if (ret > 0)
3473 		ret = -EAGAIN;
3474 	if (ret < 0)
3475 		goto err;
3476 
3477 	ret = -EAGAIN;
3478 	leaf = path->nodes[0];
3479 	/* if our item isn't there, return now */
3480 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3481 		goto err;
3482 
3483 	/* the leaf has  changed, it now has room.  return now */
3484 	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3485 		goto err;
3486 
3487 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3488 		fi = btrfs_item_ptr(leaf, path->slots[0],
3489 				    struct btrfs_file_extent_item);
3490 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3491 			goto err;
3492 	}
3493 
3494 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
3495 	if (ret)
3496 		goto err;
3497 
3498 	path->keep_locks = 0;
3499 	btrfs_unlock_up_safe(path, 1);
3500 	return 0;
3501 err:
3502 	path->keep_locks = 0;
3503 	return ret;
3504 }
3505 
3506 static noinline int split_item(struct btrfs_path *path,
3507 			       const struct btrfs_key *new_key,
3508 			       unsigned long split_offset)
3509 {
3510 	struct extent_buffer *leaf;
3511 	struct btrfs_item *item;
3512 	struct btrfs_item *new_item;
3513 	int slot;
3514 	char *buf;
3515 	u32 nritems;
3516 	u32 item_size;
3517 	u32 orig_offset;
3518 	struct btrfs_disk_key disk_key;
3519 
3520 	leaf = path->nodes[0];
3521 	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
3522 
3523 	item = btrfs_item_nr(path->slots[0]);
3524 	orig_offset = btrfs_item_offset(leaf, item);
3525 	item_size = btrfs_item_size(leaf, item);
3526 
3527 	buf = kmalloc(item_size, GFP_NOFS);
3528 	if (!buf)
3529 		return -ENOMEM;
3530 
3531 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3532 			    path->slots[0]), item_size);
3533 
3534 	slot = path->slots[0] + 1;
3535 	nritems = btrfs_header_nritems(leaf);
3536 	if (slot != nritems) {
3537 		/* shift the items */
3538 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3539 				btrfs_item_nr_offset(slot),
3540 				(nritems - slot) * sizeof(struct btrfs_item));
3541 	}
3542 
3543 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3544 	btrfs_set_item_key(leaf, &disk_key, slot);
3545 
3546 	new_item = btrfs_item_nr(slot);
3547 
3548 	btrfs_set_item_offset(leaf, new_item, orig_offset);
3549 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3550 
3551 	btrfs_set_item_offset(leaf, item,
3552 			      orig_offset + item_size - split_offset);
3553 	btrfs_set_item_size(leaf, item, split_offset);
3554 
3555 	btrfs_set_header_nritems(leaf, nritems + 1);
3556 
3557 	/* write the data for the start of the original item */
3558 	write_extent_buffer(leaf, buf,
3559 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
3560 			    split_offset);
3561 
3562 	/* write the data for the new item */
3563 	write_extent_buffer(leaf, buf + split_offset,
3564 			    btrfs_item_ptr_offset(leaf, slot),
3565 			    item_size - split_offset);
3566 	btrfs_mark_buffer_dirty(leaf);
3567 
3568 	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3569 	kfree(buf);
3570 	return 0;
3571 }
3572 
3573 /*
3574  * This function splits a single item into two items,
3575  * giving 'new_key' to the new item and splitting the
3576  * old one at split_offset (from the start of the item).
3577  *
3578  * The path may be released by this operation.  After
3579  * the split, the path is pointing to the old item.  The
3580  * new item is going to be in the same node as the old one.
3581  *
3582  * Note, the item being split must be smaller enough to live alone on
3583  * a tree block with room for one extra struct btrfs_item
3584  *
3585  * This allows us to split the item in place, keeping a lock on the
3586  * leaf the entire time.
3587  */
3588 int btrfs_split_item(struct btrfs_trans_handle *trans,
3589 		     struct btrfs_root *root,
3590 		     struct btrfs_path *path,
3591 		     const struct btrfs_key *new_key,
3592 		     unsigned long split_offset)
3593 {
3594 	int ret;
3595 	ret = setup_leaf_for_split(trans, root, path,
3596 				   sizeof(struct btrfs_item));
3597 	if (ret)
3598 		return ret;
3599 
3600 	ret = split_item(path, new_key, split_offset);
3601 	return ret;
3602 }
3603 
3604 /*
3605  * This function duplicate a item, giving 'new_key' to the new item.
3606  * It guarantees both items live in the same tree leaf and the new item
3607  * is contiguous with the original item.
3608  *
3609  * This allows us to split file extent in place, keeping a lock on the
3610  * leaf the entire time.
3611  */
3612 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3613 			 struct btrfs_root *root,
3614 			 struct btrfs_path *path,
3615 			 const struct btrfs_key *new_key)
3616 {
3617 	struct extent_buffer *leaf;
3618 	int ret;
3619 	u32 item_size;
3620 
3621 	leaf = path->nodes[0];
3622 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3623 	ret = setup_leaf_for_split(trans, root, path,
3624 				   item_size + sizeof(struct btrfs_item));
3625 	if (ret)
3626 		return ret;
3627 
3628 	path->slots[0]++;
3629 	setup_items_for_insert(root, path, new_key, &item_size, 1);
3630 	leaf = path->nodes[0];
3631 	memcpy_extent_buffer(leaf,
3632 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
3633 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3634 			     item_size);
3635 	return 0;
3636 }
3637 
3638 /*
3639  * make the item pointed to by the path smaller.  new_size indicates
3640  * how small to make it, and from_end tells us if we just chop bytes
3641  * off the end of the item or if we shift the item to chop bytes off
3642  * the front.
3643  */
3644 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3645 {
3646 	int slot;
3647 	struct extent_buffer *leaf;
3648 	struct btrfs_item *item;
3649 	u32 nritems;
3650 	unsigned int data_end;
3651 	unsigned int old_data_start;
3652 	unsigned int old_size;
3653 	unsigned int size_diff;
3654 	int i;
3655 	struct btrfs_map_token token;
3656 
3657 	leaf = path->nodes[0];
3658 	slot = path->slots[0];
3659 
3660 	old_size = btrfs_item_size_nr(leaf, slot);
3661 	if (old_size == new_size)
3662 		return;
3663 
3664 	nritems = btrfs_header_nritems(leaf);
3665 	data_end = leaf_data_end(leaf);
3666 
3667 	old_data_start = btrfs_item_offset_nr(leaf, slot);
3668 
3669 	size_diff = old_size - new_size;
3670 
3671 	BUG_ON(slot < 0);
3672 	BUG_ON(slot >= nritems);
3673 
3674 	/*
3675 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3676 	 */
3677 	/* first correct the data pointers */
3678 	btrfs_init_map_token(&token, leaf);
3679 	for (i = slot; i < nritems; i++) {
3680 		u32 ioff;
3681 		item = btrfs_item_nr(i);
3682 
3683 		ioff = btrfs_token_item_offset(&token, item);
3684 		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
3685 	}
3686 
3687 	/* shift the data */
3688 	if (from_end) {
3689 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3690 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3691 			      data_end, old_data_start + new_size - data_end);
3692 	} else {
3693 		struct btrfs_disk_key disk_key;
3694 		u64 offset;
3695 
3696 		btrfs_item_key(leaf, &disk_key, slot);
3697 
3698 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3699 			unsigned long ptr;
3700 			struct btrfs_file_extent_item *fi;
3701 
3702 			fi = btrfs_item_ptr(leaf, slot,
3703 					    struct btrfs_file_extent_item);
3704 			fi = (struct btrfs_file_extent_item *)(
3705 			     (unsigned long)fi - size_diff);
3706 
3707 			if (btrfs_file_extent_type(leaf, fi) ==
3708 			    BTRFS_FILE_EXTENT_INLINE) {
3709 				ptr = btrfs_item_ptr_offset(leaf, slot);
3710 				memmove_extent_buffer(leaf, ptr,
3711 				      (unsigned long)fi,
3712 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
3713 			}
3714 		}
3715 
3716 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3717 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3718 			      data_end, old_data_start - data_end);
3719 
3720 		offset = btrfs_disk_key_offset(&disk_key);
3721 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3722 		btrfs_set_item_key(leaf, &disk_key, slot);
3723 		if (slot == 0)
3724 			fixup_low_keys(path, &disk_key, 1);
3725 	}
3726 
3727 	item = btrfs_item_nr(slot);
3728 	btrfs_set_item_size(leaf, item, new_size);
3729 	btrfs_mark_buffer_dirty(leaf);
3730 
3731 	if (btrfs_leaf_free_space(leaf) < 0) {
3732 		btrfs_print_leaf(leaf);
3733 		BUG();
3734 	}
3735 }
3736 
3737 /*
3738  * make the item pointed to by the path bigger, data_size is the added size.
3739  */
3740 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
3741 {
3742 	int slot;
3743 	struct extent_buffer *leaf;
3744 	struct btrfs_item *item;
3745 	u32 nritems;
3746 	unsigned int data_end;
3747 	unsigned int old_data;
3748 	unsigned int old_size;
3749 	int i;
3750 	struct btrfs_map_token token;
3751 
3752 	leaf = path->nodes[0];
3753 
3754 	nritems = btrfs_header_nritems(leaf);
3755 	data_end = leaf_data_end(leaf);
3756 
3757 	if (btrfs_leaf_free_space(leaf) < data_size) {
3758 		btrfs_print_leaf(leaf);
3759 		BUG();
3760 	}
3761 	slot = path->slots[0];
3762 	old_data = btrfs_item_end_nr(leaf, slot);
3763 
3764 	BUG_ON(slot < 0);
3765 	if (slot >= nritems) {
3766 		btrfs_print_leaf(leaf);
3767 		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
3768 			   slot, nritems);
3769 		BUG();
3770 	}
3771 
3772 	/*
3773 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3774 	 */
3775 	/* first correct the data pointers */
3776 	btrfs_init_map_token(&token, leaf);
3777 	for (i = slot; i < nritems; i++) {
3778 		u32 ioff;
3779 		item = btrfs_item_nr(i);
3780 
3781 		ioff = btrfs_token_item_offset(&token, item);
3782 		btrfs_set_token_item_offset(&token, item, ioff - data_size);
3783 	}
3784 
3785 	/* shift the data */
3786 	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3787 		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
3788 		      data_end, old_data - data_end);
3789 
3790 	data_end = old_data;
3791 	old_size = btrfs_item_size_nr(leaf, slot);
3792 	item = btrfs_item_nr(slot);
3793 	btrfs_set_item_size(leaf, item, old_size + data_size);
3794 	btrfs_mark_buffer_dirty(leaf);
3795 
3796 	if (btrfs_leaf_free_space(leaf) < 0) {
3797 		btrfs_print_leaf(leaf);
3798 		BUG();
3799 	}
3800 }
3801 
3802 /**
3803  * setup_items_for_insert - Helper called before inserting one or more items
3804  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3805  * in a function that doesn't call btrfs_search_slot
3806  *
3807  * @root:	root we are inserting items to
3808  * @path:	points to the leaf/slot where we are going to insert new items
3809  * @cpu_key:	array of keys for items to be inserted
3810  * @data_size:	size of the body of each item we are going to insert
3811  * @nr:		size of @cpu_key/@data_size arrays
3812  */
3813 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3814 			    const struct btrfs_key *cpu_key, u32 *data_size,
3815 			    int nr)
3816 {
3817 	struct btrfs_fs_info *fs_info = root->fs_info;
3818 	struct btrfs_item *item;
3819 	int i;
3820 	u32 nritems;
3821 	unsigned int data_end;
3822 	struct btrfs_disk_key disk_key;
3823 	struct extent_buffer *leaf;
3824 	int slot;
3825 	struct btrfs_map_token token;
3826 	u32 total_size;
3827 	u32 total_data = 0;
3828 
3829 	for (i = 0; i < nr; i++)
3830 		total_data += data_size[i];
3831 	total_size = total_data + (nr * sizeof(struct btrfs_item));
3832 
3833 	if (path->slots[0] == 0) {
3834 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3835 		fixup_low_keys(path, &disk_key, 1);
3836 	}
3837 	btrfs_unlock_up_safe(path, 1);
3838 
3839 	leaf = path->nodes[0];
3840 	slot = path->slots[0];
3841 
3842 	nritems = btrfs_header_nritems(leaf);
3843 	data_end = leaf_data_end(leaf);
3844 
3845 	if (btrfs_leaf_free_space(leaf) < total_size) {
3846 		btrfs_print_leaf(leaf);
3847 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
3848 			   total_size, btrfs_leaf_free_space(leaf));
3849 		BUG();
3850 	}
3851 
3852 	btrfs_init_map_token(&token, leaf);
3853 	if (slot != nritems) {
3854 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3855 
3856 		if (old_data < data_end) {
3857 			btrfs_print_leaf(leaf);
3858 			btrfs_crit(fs_info,
3859 		"item at slot %d with data offset %u beyond data end of leaf %u",
3860 				   slot, old_data, data_end);
3861 			BUG();
3862 		}
3863 		/*
3864 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3865 		 */
3866 		/* first correct the data pointers */
3867 		for (i = slot; i < nritems; i++) {
3868 			u32 ioff;
3869 
3870 			item = btrfs_item_nr(i);
3871 			ioff = btrfs_token_item_offset(&token, item);
3872 			btrfs_set_token_item_offset(&token, item,
3873 						    ioff - total_data);
3874 		}
3875 		/* shift the items */
3876 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3877 			      btrfs_item_nr_offset(slot),
3878 			      (nritems - slot) * sizeof(struct btrfs_item));
3879 
3880 		/* shift the data */
3881 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3882 			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
3883 			      data_end, old_data - data_end);
3884 		data_end = old_data;
3885 	}
3886 
3887 	/* setup the item for the new data */
3888 	for (i = 0; i < nr; i++) {
3889 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3890 		btrfs_set_item_key(leaf, &disk_key, slot + i);
3891 		item = btrfs_item_nr(slot + i);
3892 		data_end -= data_size[i];
3893 		btrfs_set_token_item_offset(&token, item, data_end);
3894 		btrfs_set_token_item_size(&token, item, data_size[i]);
3895 	}
3896 
3897 	btrfs_set_header_nritems(leaf, nritems + nr);
3898 	btrfs_mark_buffer_dirty(leaf);
3899 
3900 	if (btrfs_leaf_free_space(leaf) < 0) {
3901 		btrfs_print_leaf(leaf);
3902 		BUG();
3903 	}
3904 }
3905 
3906 /*
3907  * Given a key and some data, insert items into the tree.
3908  * This does all the path init required, making room in the tree if needed.
3909  */
3910 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3911 			    struct btrfs_root *root,
3912 			    struct btrfs_path *path,
3913 			    const struct btrfs_key *cpu_key, u32 *data_size,
3914 			    int nr)
3915 {
3916 	int ret = 0;
3917 	int slot;
3918 	int i;
3919 	u32 total_size = 0;
3920 	u32 total_data = 0;
3921 
3922 	for (i = 0; i < nr; i++)
3923 		total_data += data_size[i];
3924 
3925 	total_size = total_data + (nr * sizeof(struct btrfs_item));
3926 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3927 	if (ret == 0)
3928 		return -EEXIST;
3929 	if (ret < 0)
3930 		return ret;
3931 
3932 	slot = path->slots[0];
3933 	BUG_ON(slot < 0);
3934 
3935 	setup_items_for_insert(root, path, cpu_key, data_size, nr);
3936 	return 0;
3937 }
3938 
3939 /*
3940  * Given a key and some data, insert an item into the tree.
3941  * This does all the path init required, making room in the tree if needed.
3942  */
3943 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3944 		      const struct btrfs_key *cpu_key, void *data,
3945 		      u32 data_size)
3946 {
3947 	int ret = 0;
3948 	struct btrfs_path *path;
3949 	struct extent_buffer *leaf;
3950 	unsigned long ptr;
3951 
3952 	path = btrfs_alloc_path();
3953 	if (!path)
3954 		return -ENOMEM;
3955 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3956 	if (!ret) {
3957 		leaf = path->nodes[0];
3958 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3959 		write_extent_buffer(leaf, data, ptr, data_size);
3960 		btrfs_mark_buffer_dirty(leaf);
3961 	}
3962 	btrfs_free_path(path);
3963 	return ret;
3964 }
3965 
3966 /*
3967  * delete the pointer from a given node.
3968  *
3969  * the tree should have been previously balanced so the deletion does not
3970  * empty a node.
3971  */
3972 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
3973 		    int level, int slot)
3974 {
3975 	struct extent_buffer *parent = path->nodes[level];
3976 	u32 nritems;
3977 	int ret;
3978 
3979 	nritems = btrfs_header_nritems(parent);
3980 	if (slot != nritems - 1) {
3981 		if (level) {
3982 			ret = btrfs_tree_mod_log_insert_move(parent, slot,
3983 					slot + 1, nritems - slot - 1);
3984 			BUG_ON(ret < 0);
3985 		}
3986 		memmove_extent_buffer(parent,
3987 			      btrfs_node_key_ptr_offset(slot),
3988 			      btrfs_node_key_ptr_offset(slot + 1),
3989 			      sizeof(struct btrfs_key_ptr) *
3990 			      (nritems - slot - 1));
3991 	} else if (level) {
3992 		ret = btrfs_tree_mod_log_insert_key(parent, slot,
3993 				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
3994 		BUG_ON(ret < 0);
3995 	}
3996 
3997 	nritems--;
3998 	btrfs_set_header_nritems(parent, nritems);
3999 	if (nritems == 0 && parent == root->node) {
4000 		BUG_ON(btrfs_header_level(root->node) != 1);
4001 		/* just turn the root into a leaf and break */
4002 		btrfs_set_header_level(root->node, 0);
4003 	} else if (slot == 0) {
4004 		struct btrfs_disk_key disk_key;
4005 
4006 		btrfs_node_key(parent, &disk_key, 0);
4007 		fixup_low_keys(path, &disk_key, level + 1);
4008 	}
4009 	btrfs_mark_buffer_dirty(parent);
4010 }
4011 
4012 /*
4013  * a helper function to delete the leaf pointed to by path->slots[1] and
4014  * path->nodes[1].
4015  *
4016  * This deletes the pointer in path->nodes[1] and frees the leaf
4017  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4018  *
4019  * The path must have already been setup for deleting the leaf, including
4020  * all the proper balancing.  path->nodes[1] must be locked.
4021  */
4022 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4023 				    struct btrfs_root *root,
4024 				    struct btrfs_path *path,
4025 				    struct extent_buffer *leaf)
4026 {
4027 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4028 	del_ptr(root, path, 1, path->slots[1]);
4029 
4030 	/*
4031 	 * btrfs_free_extent is expensive, we want to make sure we
4032 	 * aren't holding any locks when we call it
4033 	 */
4034 	btrfs_unlock_up_safe(path, 0);
4035 
4036 	root_sub_used(root, leaf->len);
4037 
4038 	atomic_inc(&leaf->refs);
4039 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4040 	free_extent_buffer_stale(leaf);
4041 }
4042 /*
4043  * delete the item at the leaf level in path.  If that empties
4044  * the leaf, remove it from the tree
4045  */
4046 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4047 		    struct btrfs_path *path, int slot, int nr)
4048 {
4049 	struct btrfs_fs_info *fs_info = root->fs_info;
4050 	struct extent_buffer *leaf;
4051 	struct btrfs_item *item;
4052 	u32 last_off;
4053 	u32 dsize = 0;
4054 	int ret = 0;
4055 	int wret;
4056 	int i;
4057 	u32 nritems;
4058 
4059 	leaf = path->nodes[0];
4060 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4061 
4062 	for (i = 0; i < nr; i++)
4063 		dsize += btrfs_item_size_nr(leaf, slot + i);
4064 
4065 	nritems = btrfs_header_nritems(leaf);
4066 
4067 	if (slot + nr != nritems) {
4068 		int data_end = leaf_data_end(leaf);
4069 		struct btrfs_map_token token;
4070 
4071 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4072 			      data_end + dsize,
4073 			      BTRFS_LEAF_DATA_OFFSET + data_end,
4074 			      last_off - data_end);
4075 
4076 		btrfs_init_map_token(&token, leaf);
4077 		for (i = slot + nr; i < nritems; i++) {
4078 			u32 ioff;
4079 
4080 			item = btrfs_item_nr(i);
4081 			ioff = btrfs_token_item_offset(&token, item);
4082 			btrfs_set_token_item_offset(&token, item, ioff + dsize);
4083 		}
4084 
4085 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4086 			      btrfs_item_nr_offset(slot + nr),
4087 			      sizeof(struct btrfs_item) *
4088 			      (nritems - slot - nr));
4089 	}
4090 	btrfs_set_header_nritems(leaf, nritems - nr);
4091 	nritems -= nr;
4092 
4093 	/* delete the leaf if we've emptied it */
4094 	if (nritems == 0) {
4095 		if (leaf == root->node) {
4096 			btrfs_set_header_level(leaf, 0);
4097 		} else {
4098 			btrfs_clean_tree_block(leaf);
4099 			btrfs_del_leaf(trans, root, path, leaf);
4100 		}
4101 	} else {
4102 		int used = leaf_space_used(leaf, 0, nritems);
4103 		if (slot == 0) {
4104 			struct btrfs_disk_key disk_key;
4105 
4106 			btrfs_item_key(leaf, &disk_key, 0);
4107 			fixup_low_keys(path, &disk_key, 1);
4108 		}
4109 
4110 		/* delete the leaf if it is mostly empty */
4111 		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4112 			/* push_leaf_left fixes the path.
4113 			 * make sure the path still points to our leaf
4114 			 * for possible call to del_ptr below
4115 			 */
4116 			slot = path->slots[1];
4117 			atomic_inc(&leaf->refs);
4118 
4119 			wret = push_leaf_left(trans, root, path, 1, 1,
4120 					      1, (u32)-1);
4121 			if (wret < 0 && wret != -ENOSPC)
4122 				ret = wret;
4123 
4124 			if (path->nodes[0] == leaf &&
4125 			    btrfs_header_nritems(leaf)) {
4126 				wret = push_leaf_right(trans, root, path, 1,
4127 						       1, 1, 0);
4128 				if (wret < 0 && wret != -ENOSPC)
4129 					ret = wret;
4130 			}
4131 
4132 			if (btrfs_header_nritems(leaf) == 0) {
4133 				path->slots[1] = slot;
4134 				btrfs_del_leaf(trans, root, path, leaf);
4135 				free_extent_buffer(leaf);
4136 				ret = 0;
4137 			} else {
4138 				/* if we're still in the path, make sure
4139 				 * we're dirty.  Otherwise, one of the
4140 				 * push_leaf functions must have already
4141 				 * dirtied this buffer
4142 				 */
4143 				if (path->nodes[0] == leaf)
4144 					btrfs_mark_buffer_dirty(leaf);
4145 				free_extent_buffer(leaf);
4146 			}
4147 		} else {
4148 			btrfs_mark_buffer_dirty(leaf);
4149 		}
4150 	}
4151 	return ret;
4152 }
4153 
4154 /*
4155  * search the tree again to find a leaf with lesser keys
4156  * returns 0 if it found something or 1 if there are no lesser leaves.
4157  * returns < 0 on io errors.
4158  *
4159  * This may release the path, and so you may lose any locks held at the
4160  * time you call it.
4161  */
4162 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4163 {
4164 	struct btrfs_key key;
4165 	struct btrfs_disk_key found_key;
4166 	int ret;
4167 
4168 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4169 
4170 	if (key.offset > 0) {
4171 		key.offset--;
4172 	} else if (key.type > 0) {
4173 		key.type--;
4174 		key.offset = (u64)-1;
4175 	} else if (key.objectid > 0) {
4176 		key.objectid--;
4177 		key.type = (u8)-1;
4178 		key.offset = (u64)-1;
4179 	} else {
4180 		return 1;
4181 	}
4182 
4183 	btrfs_release_path(path);
4184 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4185 	if (ret < 0)
4186 		return ret;
4187 	btrfs_item_key(path->nodes[0], &found_key, 0);
4188 	ret = comp_keys(&found_key, &key);
4189 	/*
4190 	 * We might have had an item with the previous key in the tree right
4191 	 * before we released our path. And after we released our path, that
4192 	 * item might have been pushed to the first slot (0) of the leaf we
4193 	 * were holding due to a tree balance. Alternatively, an item with the
4194 	 * previous key can exist as the only element of a leaf (big fat item).
4195 	 * Therefore account for these 2 cases, so that our callers (like
4196 	 * btrfs_previous_item) don't miss an existing item with a key matching
4197 	 * the previous key we computed above.
4198 	 */
4199 	if (ret <= 0)
4200 		return 0;
4201 	return 1;
4202 }
4203 
4204 /*
4205  * A helper function to walk down the tree starting at min_key, and looking
4206  * for nodes or leaves that are have a minimum transaction id.
4207  * This is used by the btree defrag code, and tree logging
4208  *
4209  * This does not cow, but it does stuff the starting key it finds back
4210  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4211  * key and get a writable path.
4212  *
4213  * This honors path->lowest_level to prevent descent past a given level
4214  * of the tree.
4215  *
4216  * min_trans indicates the oldest transaction that you are interested
4217  * in walking through.  Any nodes or leaves older than min_trans are
4218  * skipped over (without reading them).
4219  *
4220  * returns zero if something useful was found, < 0 on error and 1 if there
4221  * was nothing in the tree that matched the search criteria.
4222  */
4223 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4224 			 struct btrfs_path *path,
4225 			 u64 min_trans)
4226 {
4227 	struct extent_buffer *cur;
4228 	struct btrfs_key found_key;
4229 	int slot;
4230 	int sret;
4231 	u32 nritems;
4232 	int level;
4233 	int ret = 1;
4234 	int keep_locks = path->keep_locks;
4235 
4236 	path->keep_locks = 1;
4237 again:
4238 	cur = btrfs_read_lock_root_node(root);
4239 	level = btrfs_header_level(cur);
4240 	WARN_ON(path->nodes[level]);
4241 	path->nodes[level] = cur;
4242 	path->locks[level] = BTRFS_READ_LOCK;
4243 
4244 	if (btrfs_header_generation(cur) < min_trans) {
4245 		ret = 1;
4246 		goto out;
4247 	}
4248 	while (1) {
4249 		nritems = btrfs_header_nritems(cur);
4250 		level = btrfs_header_level(cur);
4251 		sret = btrfs_bin_search(cur, min_key, &slot);
4252 		if (sret < 0) {
4253 			ret = sret;
4254 			goto out;
4255 		}
4256 
4257 		/* at the lowest level, we're done, setup the path and exit */
4258 		if (level == path->lowest_level) {
4259 			if (slot >= nritems)
4260 				goto find_next_key;
4261 			ret = 0;
4262 			path->slots[level] = slot;
4263 			btrfs_item_key_to_cpu(cur, &found_key, slot);
4264 			goto out;
4265 		}
4266 		if (sret && slot > 0)
4267 			slot--;
4268 		/*
4269 		 * check this node pointer against the min_trans parameters.
4270 		 * If it is too old, skip to the next one.
4271 		 */
4272 		while (slot < nritems) {
4273 			u64 gen;
4274 
4275 			gen = btrfs_node_ptr_generation(cur, slot);
4276 			if (gen < min_trans) {
4277 				slot++;
4278 				continue;
4279 			}
4280 			break;
4281 		}
4282 find_next_key:
4283 		/*
4284 		 * we didn't find a candidate key in this node, walk forward
4285 		 * and find another one
4286 		 */
4287 		if (slot >= nritems) {
4288 			path->slots[level] = slot;
4289 			sret = btrfs_find_next_key(root, path, min_key, level,
4290 						  min_trans);
4291 			if (sret == 0) {
4292 				btrfs_release_path(path);
4293 				goto again;
4294 			} else {
4295 				goto out;
4296 			}
4297 		}
4298 		/* save our key for returning back */
4299 		btrfs_node_key_to_cpu(cur, &found_key, slot);
4300 		path->slots[level] = slot;
4301 		if (level == path->lowest_level) {
4302 			ret = 0;
4303 			goto out;
4304 		}
4305 		cur = btrfs_read_node_slot(cur, slot);
4306 		if (IS_ERR(cur)) {
4307 			ret = PTR_ERR(cur);
4308 			goto out;
4309 		}
4310 
4311 		btrfs_tree_read_lock(cur);
4312 
4313 		path->locks[level - 1] = BTRFS_READ_LOCK;
4314 		path->nodes[level - 1] = cur;
4315 		unlock_up(path, level, 1, 0, NULL);
4316 	}
4317 out:
4318 	path->keep_locks = keep_locks;
4319 	if (ret == 0) {
4320 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
4321 		memcpy(min_key, &found_key, sizeof(found_key));
4322 	}
4323 	return ret;
4324 }
4325 
4326 /*
4327  * this is similar to btrfs_next_leaf, but does not try to preserve
4328  * and fixup the path.  It looks for and returns the next key in the
4329  * tree based on the current path and the min_trans parameters.
4330  *
4331  * 0 is returned if another key is found, < 0 if there are any errors
4332  * and 1 is returned if there are no higher keys in the tree
4333  *
4334  * path->keep_locks should be set to 1 on the search made before
4335  * calling this function.
4336  */
4337 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4338 			struct btrfs_key *key, int level, u64 min_trans)
4339 {
4340 	int slot;
4341 	struct extent_buffer *c;
4342 
4343 	WARN_ON(!path->keep_locks && !path->skip_locking);
4344 	while (level < BTRFS_MAX_LEVEL) {
4345 		if (!path->nodes[level])
4346 			return 1;
4347 
4348 		slot = path->slots[level] + 1;
4349 		c = path->nodes[level];
4350 next:
4351 		if (slot >= btrfs_header_nritems(c)) {
4352 			int ret;
4353 			int orig_lowest;
4354 			struct btrfs_key cur_key;
4355 			if (level + 1 >= BTRFS_MAX_LEVEL ||
4356 			    !path->nodes[level + 1])
4357 				return 1;
4358 
4359 			if (path->locks[level + 1] || path->skip_locking) {
4360 				level++;
4361 				continue;
4362 			}
4363 
4364 			slot = btrfs_header_nritems(c) - 1;
4365 			if (level == 0)
4366 				btrfs_item_key_to_cpu(c, &cur_key, slot);
4367 			else
4368 				btrfs_node_key_to_cpu(c, &cur_key, slot);
4369 
4370 			orig_lowest = path->lowest_level;
4371 			btrfs_release_path(path);
4372 			path->lowest_level = level;
4373 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
4374 						0, 0);
4375 			path->lowest_level = orig_lowest;
4376 			if (ret < 0)
4377 				return ret;
4378 
4379 			c = path->nodes[level];
4380 			slot = path->slots[level];
4381 			if (ret == 0)
4382 				slot++;
4383 			goto next;
4384 		}
4385 
4386 		if (level == 0)
4387 			btrfs_item_key_to_cpu(c, key, slot);
4388 		else {
4389 			u64 gen = btrfs_node_ptr_generation(c, slot);
4390 
4391 			if (gen < min_trans) {
4392 				slot++;
4393 				goto next;
4394 			}
4395 			btrfs_node_key_to_cpu(c, key, slot);
4396 		}
4397 		return 0;
4398 	}
4399 	return 1;
4400 }
4401 
4402 /*
4403  * search the tree again to find a leaf with greater keys
4404  * returns 0 if it found something or 1 if there are no greater leaves.
4405  * returns < 0 on io errors.
4406  */
4407 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4408 {
4409 	return btrfs_next_old_leaf(root, path, 0);
4410 }
4411 
4412 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4413 			u64 time_seq)
4414 {
4415 	int slot;
4416 	int level;
4417 	struct extent_buffer *c;
4418 	struct extent_buffer *next;
4419 	struct btrfs_key key;
4420 	u32 nritems;
4421 	int ret;
4422 	int i;
4423 
4424 	nritems = btrfs_header_nritems(path->nodes[0]);
4425 	if (nritems == 0)
4426 		return 1;
4427 
4428 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4429 again:
4430 	level = 1;
4431 	next = NULL;
4432 	btrfs_release_path(path);
4433 
4434 	path->keep_locks = 1;
4435 
4436 	if (time_seq)
4437 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
4438 	else
4439 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4440 	path->keep_locks = 0;
4441 
4442 	if (ret < 0)
4443 		return ret;
4444 
4445 	nritems = btrfs_header_nritems(path->nodes[0]);
4446 	/*
4447 	 * by releasing the path above we dropped all our locks.  A balance
4448 	 * could have added more items next to the key that used to be
4449 	 * at the very end of the block.  So, check again here and
4450 	 * advance the path if there are now more items available.
4451 	 */
4452 	if (nritems > 0 && path->slots[0] < nritems - 1) {
4453 		if (ret == 0)
4454 			path->slots[0]++;
4455 		ret = 0;
4456 		goto done;
4457 	}
4458 	/*
4459 	 * So the above check misses one case:
4460 	 * - after releasing the path above, someone has removed the item that
4461 	 *   used to be at the very end of the block, and balance between leafs
4462 	 *   gets another one with bigger key.offset to replace it.
4463 	 *
4464 	 * This one should be returned as well, or we can get leaf corruption
4465 	 * later(esp. in __btrfs_drop_extents()).
4466 	 *
4467 	 * And a bit more explanation about this check,
4468 	 * with ret > 0, the key isn't found, the path points to the slot
4469 	 * where it should be inserted, so the path->slots[0] item must be the
4470 	 * bigger one.
4471 	 */
4472 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4473 		ret = 0;
4474 		goto done;
4475 	}
4476 
4477 	while (level < BTRFS_MAX_LEVEL) {
4478 		if (!path->nodes[level]) {
4479 			ret = 1;
4480 			goto done;
4481 		}
4482 
4483 		slot = path->slots[level] + 1;
4484 		c = path->nodes[level];
4485 		if (slot >= btrfs_header_nritems(c)) {
4486 			level++;
4487 			if (level == BTRFS_MAX_LEVEL) {
4488 				ret = 1;
4489 				goto done;
4490 			}
4491 			continue;
4492 		}
4493 
4494 
4495 		/*
4496 		 * Our current level is where we're going to start from, and to
4497 		 * make sure lockdep doesn't complain we need to drop our locks
4498 		 * and nodes from 0 to our current level.
4499 		 */
4500 		for (i = 0; i < level; i++) {
4501 			if (path->locks[level]) {
4502 				btrfs_tree_read_unlock(path->nodes[i]);
4503 				path->locks[i] = 0;
4504 			}
4505 			free_extent_buffer(path->nodes[i]);
4506 			path->nodes[i] = NULL;
4507 		}
4508 
4509 		next = c;
4510 		ret = read_block_for_search(root, path, &next, level,
4511 					    slot, &key);
4512 		if (ret == -EAGAIN)
4513 			goto again;
4514 
4515 		if (ret < 0) {
4516 			btrfs_release_path(path);
4517 			goto done;
4518 		}
4519 
4520 		if (!path->skip_locking) {
4521 			ret = btrfs_try_tree_read_lock(next);
4522 			if (!ret && time_seq) {
4523 				/*
4524 				 * If we don't get the lock, we may be racing
4525 				 * with push_leaf_left, holding that lock while
4526 				 * itself waiting for the leaf we've currently
4527 				 * locked. To solve this situation, we give up
4528 				 * on our lock and cycle.
4529 				 */
4530 				free_extent_buffer(next);
4531 				btrfs_release_path(path);
4532 				cond_resched();
4533 				goto again;
4534 			}
4535 			if (!ret)
4536 				btrfs_tree_read_lock(next);
4537 		}
4538 		break;
4539 	}
4540 	path->slots[level] = slot;
4541 	while (1) {
4542 		level--;
4543 		path->nodes[level] = next;
4544 		path->slots[level] = 0;
4545 		if (!path->skip_locking)
4546 			path->locks[level] = BTRFS_READ_LOCK;
4547 		if (!level)
4548 			break;
4549 
4550 		ret = read_block_for_search(root, path, &next, level,
4551 					    0, &key);
4552 		if (ret == -EAGAIN)
4553 			goto again;
4554 
4555 		if (ret < 0) {
4556 			btrfs_release_path(path);
4557 			goto done;
4558 		}
4559 
4560 		if (!path->skip_locking)
4561 			btrfs_tree_read_lock(next);
4562 	}
4563 	ret = 0;
4564 done:
4565 	unlock_up(path, 0, 1, 0, NULL);
4566 
4567 	return ret;
4568 }
4569 
4570 /*
4571  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4572  * searching until it gets past min_objectid or finds an item of 'type'
4573  *
4574  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4575  */
4576 int btrfs_previous_item(struct btrfs_root *root,
4577 			struct btrfs_path *path, u64 min_objectid,
4578 			int type)
4579 {
4580 	struct btrfs_key found_key;
4581 	struct extent_buffer *leaf;
4582 	u32 nritems;
4583 	int ret;
4584 
4585 	while (1) {
4586 		if (path->slots[0] == 0) {
4587 			ret = btrfs_prev_leaf(root, path);
4588 			if (ret != 0)
4589 				return ret;
4590 		} else {
4591 			path->slots[0]--;
4592 		}
4593 		leaf = path->nodes[0];
4594 		nritems = btrfs_header_nritems(leaf);
4595 		if (nritems == 0)
4596 			return 1;
4597 		if (path->slots[0] == nritems)
4598 			path->slots[0]--;
4599 
4600 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4601 		if (found_key.objectid < min_objectid)
4602 			break;
4603 		if (found_key.type == type)
4604 			return 0;
4605 		if (found_key.objectid == min_objectid &&
4606 		    found_key.type < type)
4607 			break;
4608 	}
4609 	return 1;
4610 }
4611 
4612 /*
4613  * search in extent tree to find a previous Metadata/Data extent item with
4614  * min objecitd.
4615  *
4616  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4617  */
4618 int btrfs_previous_extent_item(struct btrfs_root *root,
4619 			struct btrfs_path *path, u64 min_objectid)
4620 {
4621 	struct btrfs_key found_key;
4622 	struct extent_buffer *leaf;
4623 	u32 nritems;
4624 	int ret;
4625 
4626 	while (1) {
4627 		if (path->slots[0] == 0) {
4628 			ret = btrfs_prev_leaf(root, path);
4629 			if (ret != 0)
4630 				return ret;
4631 		} else {
4632 			path->slots[0]--;
4633 		}
4634 		leaf = path->nodes[0];
4635 		nritems = btrfs_header_nritems(leaf);
4636 		if (nritems == 0)
4637 			return 1;
4638 		if (path->slots[0] == nritems)
4639 			path->slots[0]--;
4640 
4641 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4642 		if (found_key.objectid < min_objectid)
4643 			break;
4644 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4645 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
4646 			return 0;
4647 		if (found_key.objectid == min_objectid &&
4648 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
4649 			break;
4650 	}
4651 	return 1;
4652 }
4653