1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17 
18 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19 		      *root, struct btrfs_path *path, int level);
20 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21 		      const struct btrfs_key *ins_key, struct btrfs_path *path,
22 		      int data_size, int extend);
23 static int push_node_left(struct btrfs_trans_handle *trans,
24 			  struct btrfs_fs_info *fs_info,
25 			  struct extent_buffer *dst,
26 			  struct extent_buffer *src, int empty);
27 static int balance_node_right(struct btrfs_trans_handle *trans,
28 			      struct btrfs_fs_info *fs_info,
29 			      struct extent_buffer *dst_buf,
30 			      struct extent_buffer *src_buf);
31 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
32 		    int level, int slot);
33 
34 struct btrfs_path *btrfs_alloc_path(void)
35 {
36 	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
37 }
38 
39 /*
40  * Set all locked nodes in the path to blocking locks.  This must be
41  * done before scheduling, since spinning locks can't be held across it.
42  */
43 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
44 {
45 	int i;
46 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
47 		if (!p->nodes[i] || !p->locks[i])
48 			continue;
49 		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
50 		if (p->locks[i] == BTRFS_READ_LOCK)
51 			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
52 		else if (p->locks[i] == BTRFS_WRITE_LOCK)
53 			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
54 	}
55 }
56 
57 /* this also releases the path */
58 void btrfs_free_path(struct btrfs_path *p)
59 {
60 	if (!p)
61 		return;
62 	btrfs_release_path(p);
63 	kmem_cache_free(btrfs_path_cachep, p);
64 }
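/*
 * A minimal usage sketch of the path API above (assuming a @root and a
 * search @key are already in hand; error handling trimmed):
 *
 *	struct btrfs_path *path;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	... examine path->nodes[0] / path->slots[0] ...
 *	btrfs_free_path(path);	releases locks and buffer refs, then frees
 */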
65 
66 /*
67  * path release drops references on the extent buffers in the path
68  * and it drops any locks held by this path
69  *
70  * It is safe to call this on paths that hold no locks or extent buffers.
71  */
72 noinline void btrfs_release_path(struct btrfs_path *p)
73 {
74 	int i;
75 
76 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
77 		p->slots[i] = 0;
78 		if (!p->nodes[i])
79 			continue;
80 		if (p->locks[i]) {
81 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
82 			p->locks[i] = 0;
83 		}
84 		free_extent_buffer(p->nodes[i]);
85 		p->nodes[i] = NULL;
86 	}
87 }
88 
89 /*
90  * safely gets a reference on the root node of a tree.  A lock
91  * is not taken, so a concurrent writer may put a different node
92  * at the root of the tree.  See btrfs_lock_root_node for the
93  * looping required.
94  *
95  * The extent buffer returned by this has a reference taken, so
96  * it won't disappear.  It may stop being the root of the tree
97  * at any time because there are no locks held.
98  */
99 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
100 {
101 	struct extent_buffer *eb;
102 
103 	while (1) {
104 		rcu_read_lock();
105 		eb = rcu_dereference(root->node);
106 
107 		/*
108 		 * RCU really hurts here, we could free up the root node because
109 		 * it was COWed but we may not get the new root node yet so do
110 		 * the inc_not_zero dance and if it doesn't work then
111 		 * synchronize_rcu and try again.
112 		 */
113 		if (atomic_inc_not_zero(&eb->refs)) {
114 			rcu_read_unlock();
115 			break;
116 		}
117 		rcu_read_unlock();
118 		synchronize_rcu();
119 	}
120 	return eb;
121 }
122 
123 /* loop around taking references on and locking the root node of the
124  * tree until you end up with a lock on the root.  A locked buffer
125  * is returned, with a reference held.
126  */
127 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
128 {
129 	struct extent_buffer *eb;
130 
131 	while (1) {
132 		eb = btrfs_root_node(root);
133 		btrfs_tree_lock(eb);
134 		if (eb == root->node)
135 			break;
136 		btrfs_tree_unlock(eb);
137 		free_extent_buffer(eb);
138 	}
139 	return eb;
140 }
141 
142 /* loop around taking references on and read locking the root node of the
143  * tree until you end up with a read lock on the root.  A read locked
144  * buffer is returned, with a reference held.
145  */
146 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
147 {
148 	struct extent_buffer *eb;
149 
150 	while (1) {
151 		eb = btrfs_root_node(root);
152 		btrfs_tree_read_lock(eb);
153 		if (eb == root->node)
154 			break;
155 		btrfs_tree_read_unlock(eb);
156 		free_extent_buffer(eb);
157 	}
158 	return eb;
159 }
160 
161 /* cowonly roots (everything that is not a reference counted cow subvolume)
162  * just get put onto a simple dirty list.  transaction.c walks this list to
163  * make sure they get properly updated on disk.
164  */
165 static void add_root_to_dirty_list(struct btrfs_root *root)
166 {
167 	struct btrfs_fs_info *fs_info = root->fs_info;
168 
169 	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
170 	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
171 		return;
172 
173 	spin_lock(&fs_info->trans_lock);
174 	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
175 		/* Want the extent tree to be the last on the list */
176 		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
177 			list_move_tail(&root->dirty_list,
178 				       &fs_info->dirty_cowonly_roots);
179 		else
180 			list_move(&root->dirty_list,
181 				  &fs_info->dirty_cowonly_roots);
182 	}
183 	spin_unlock(&fs_info->trans_lock);
184 }
185 
186 /*
187  * used by snapshot creation to make a copy of a root for a tree with
188  * a given objectid.  The buffer with the new root node is returned in
189  * cow_ret, and this function returns zero on success or a negative error code.
190  */
191 int btrfs_copy_root(struct btrfs_trans_handle *trans,
192 		      struct btrfs_root *root,
193 		      struct extent_buffer *buf,
194 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
195 {
196 	struct btrfs_fs_info *fs_info = root->fs_info;
197 	struct extent_buffer *cow;
198 	int ret = 0;
199 	int level;
200 	struct btrfs_disk_key disk_key;
201 
202 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
203 		trans->transid != fs_info->running_transaction->transid);
204 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
205 		trans->transid != root->last_trans);
206 
207 	level = btrfs_header_level(buf);
208 	if (level == 0)
209 		btrfs_item_key(buf, &disk_key, 0);
210 	else
211 		btrfs_node_key(buf, &disk_key, 0);
212 
213 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
214 			&disk_key, level, buf->start, 0);
215 	if (IS_ERR(cow))
216 		return PTR_ERR(cow);
217 
218 	copy_extent_buffer_full(cow, buf);
219 	btrfs_set_header_bytenr(cow, cow->start);
220 	btrfs_set_header_generation(cow, trans->transid);
221 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
222 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
223 				     BTRFS_HEADER_FLAG_RELOC);
224 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
225 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
226 	else
227 		btrfs_set_header_owner(cow, new_root_objectid);
228 
229 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
230 
231 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
232 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
233 		ret = btrfs_inc_ref(trans, root, cow, 1);
234 	else
235 		ret = btrfs_inc_ref(trans, root, cow, 0);
236 
237 	if (ret)
238 		return ret;
239 
240 	btrfs_mark_buffer_dirty(cow);
241 	*cow_ret = cow;
242 	return 0;
243 }
244 
245 enum mod_log_op {
246 	MOD_LOG_KEY_REPLACE,
247 	MOD_LOG_KEY_ADD,
248 	MOD_LOG_KEY_REMOVE,
249 	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
250 	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
251 	MOD_LOG_MOVE_KEYS,
252 	MOD_LOG_ROOT_REPLACE,
253 };
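/*
 * Rewinding applies the inverse of each logged operation (see
 * __tree_mod_log_rewind()): MOD_LOG_KEY_REMOVE* entries re-insert the saved
 * key/blockptr, MOD_LOG_KEY_ADD drops the added item, MOD_LOG_MOVE_KEYS
 * moves the pointers back, and MOD_LOG_ROOT_REPLACE is handled separately
 * when old roots are looked up.
 */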
254 
255 struct tree_mod_root {
256 	u64 logical;
257 	u8 level;
258 };
259 
260 struct tree_mod_elem {
261 	struct rb_node node;
262 	u64 logical;
263 	u64 seq;
264 	enum mod_log_op op;
265 
266 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
267 	int slot;
268 
269 	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
270 	u64 generation;
271 
272 	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
273 	struct btrfs_disk_key key;
274 	u64 blockptr;
275 
276 	/* this is used for op == MOD_LOG_MOVE_KEYS */
277 	struct {
278 		int dst_slot;
279 		int nr_items;
280 	} move;
281 
282 	/* this is used for op == MOD_LOG_ROOT_REPLACE */
283 	struct tree_mod_root old_root;
284 };
285 
286 /*
287  * Pull a new tree mod seq number for our operation.
288  */
289 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
290 {
291 	return atomic64_inc_return(&fs_info->tree_mod_seq);
292 }
293 
294 /*
295  * This adds a new blocker to the tree mod log's blocker list if the @elem
296  * passed does not already have a sequence number set. So when a caller expects
297  * to record tree modifications, it must make sure elem->seq is set to zero
298  * before calling btrfs_get_tree_mod_seq.
299  * Returns a fresh, unused tree mod log sequence number, even if no new
300  * blocker was added.
301  */
302 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
303 			   struct seq_list *elem)
304 {
305 	write_lock(&fs_info->tree_mod_log_lock);
306 	spin_lock(&fs_info->tree_mod_seq_lock);
307 	if (!elem->seq) {
308 		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
309 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
310 	}
311 	spin_unlock(&fs_info->tree_mod_seq_lock);
312 	write_unlock(&fs_info->tree_mod_log_lock);
313 
314 	return elem->seq;
315 }
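/*
 * A minimal sketch of the intended pairing (assuming the SEQ_LIST_INIT
 * initializer from ctree.h and a caller that walks rewound tree state,
 * e.g. via btrfs_search_old_slot()):
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk the tree as it looked at elem.seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */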
316 
317 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
318 			    struct seq_list *elem)
319 {
320 	struct rb_root *tm_root;
321 	struct rb_node *node;
322 	struct rb_node *next;
323 	struct seq_list *cur_elem;
324 	struct tree_mod_elem *tm;
325 	u64 min_seq = (u64)-1;
326 	u64 seq_putting = elem->seq;
327 
328 	if (!seq_putting)
329 		return;
330 
331 	spin_lock(&fs_info->tree_mod_seq_lock);
332 	list_del(&elem->list);
333 	elem->seq = 0;
334 
335 	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
336 		if (cur_elem->seq < min_seq) {
337 			if (seq_putting > cur_elem->seq) {
338 				/*
339 				 * a blocker with a lower sequence number exists; we
340 				 * cannot remove anything from the log
341 				 */
342 				spin_unlock(&fs_info->tree_mod_seq_lock);
343 				return;
344 			}
345 			min_seq = cur_elem->seq;
346 		}
347 	}
348 	spin_unlock(&fs_info->tree_mod_seq_lock);
349 
350 	/*
351 	 * anything that's lower than the lowest existing (read: blocked)
352 	 * sequence number can be removed from the tree.
353 	 */
354 	write_lock(&fs_info->tree_mod_log_lock);
355 	tm_root = &fs_info->tree_mod_log;
356 	for (node = rb_first(tm_root); node; node = next) {
357 		next = rb_next(node);
358 		tm = rb_entry(node, struct tree_mod_elem, node);
359 		if (tm->seq > min_seq)
360 			continue;
361 		rb_erase(node, tm_root);
362 		kfree(tm);
363 	}
364 	write_unlock(&fs_info->tree_mod_log_lock);
365 }
366 
367 /*
368  * key order of the log:
369  *       node/leaf start address -> sequence
370  *
371  * The 'start address' is the logical address of the *new* root node
372  * for root replace operations, or the logical address of the affected
373  * block for all other operations.
374  *
375  * Note: must be called with write lock for fs_info::tree_mod_log_lock.
376  */
377 static noinline int
378 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
379 {
380 	struct rb_root *tm_root;
381 	struct rb_node **new;
382 	struct rb_node *parent = NULL;
383 	struct tree_mod_elem *cur;
384 
385 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
386 
387 	tm_root = &fs_info->tree_mod_log;
388 	new = &tm_root->rb_node;
389 	while (*new) {
390 		cur = rb_entry(*new, struct tree_mod_elem, node);
391 		parent = *new;
392 		if (cur->logical < tm->logical)
393 			new = &((*new)->rb_left);
394 		else if (cur->logical > tm->logical)
395 			new = &((*new)->rb_right);
396 		else if (cur->seq < tm->seq)
397 			new = &((*new)->rb_left);
398 		else if (cur->seq > tm->seq)
399 			new = &((*new)->rb_right);
400 		else
401 			return -EEXIST;
402 	}
403 
404 	rb_link_node(&tm->node, parent, new);
405 	rb_insert_color(&tm->node, tm_root);
406 	return 0;
407 }
408 
409 /*
410  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
411  * returns zero with the tree_mod_log_lock write locked. The caller must hold
412  * the lock until all tree mod log insertions have been recorded in the rb
413  * tree, and must then write unlock fs_info::tree_mod_log_lock.
414  */
415 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
416 				    struct extent_buffer *eb)
{
417 	smp_mb();
418 	if (list_empty(&fs_info->tree_mod_seq_list))
419 		return 1;
420 	if (eb && btrfs_header_level(eb) == 0)
421 		return 1;
422 
423 	write_lock(&fs_info->tree_mod_log_lock);
424 	if (list_empty(&fs_info->tree_mod_seq_list)) {
425 		write_unlock(&fs_info->tree_mod_log_lock);
426 		return 1;
427 	}
428 
429 	return 0;
430 }
431 
432 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
433 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
434 				    struct extent_buffer *eb)
435 {
436 	smp_mb();
437 	if (list_empty(&fs_info->tree_mod_seq_list))
438 		return 0;
439 	if (eb && btrfs_header_level(eb) == 0)
440 		return 0;
441 
442 	return 1;
443 }
444 
445 static struct tree_mod_elem *
446 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
447 		    enum mod_log_op op, gfp_t flags)
448 {
449 	struct tree_mod_elem *tm;
450 
451 	tm = kzalloc(sizeof(*tm), flags);
452 	if (!tm)
453 		return NULL;
454 
455 	tm->logical = eb->start;
456 	if (op != MOD_LOG_KEY_ADD) {
457 		btrfs_node_key(eb, &tm->key, slot);
458 		tm->blockptr = btrfs_node_blockptr(eb, slot);
459 	}
460 	tm->op = op;
461 	tm->slot = slot;
462 	tm->generation = btrfs_node_ptr_generation(eb, slot);
463 	RB_CLEAR_NODE(&tm->node);
464 
465 	return tm;
466 }
467 
468 static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
469 		enum mod_log_op op, gfp_t flags)
470 {
471 	struct tree_mod_elem *tm;
472 	int ret;
473 
474 	if (!tree_mod_need_log(eb->fs_info, eb))
475 		return 0;
476 
477 	tm = alloc_tree_mod_elem(eb, slot, op, flags);
478 	if (!tm)
479 		return -ENOMEM;
480 
481 	if (tree_mod_dont_log(eb->fs_info, eb)) {
482 		kfree(tm);
483 		return 0;
484 	}
485 
486 	ret = __tree_mod_log_insert(eb->fs_info, tm);
487 	write_unlock(&eb->fs_info->tree_mod_log_lock);
488 	if (ret)
489 		kfree(tm);
490 
491 	return ret;
492 }
493 
494 static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
495 		int dst_slot, int src_slot, int nr_items)
496 {
497 	struct tree_mod_elem *tm = NULL;
498 	struct tree_mod_elem **tm_list = NULL;
499 	int ret = 0;
500 	int i;
501 	int locked = 0;
502 
503 	if (!tree_mod_need_log(eb->fs_info, eb))
504 		return 0;
505 
506 	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
507 	if (!tm_list)
508 		return -ENOMEM;
509 
510 	tm = kzalloc(sizeof(*tm), GFP_NOFS);
511 	if (!tm) {
512 		ret = -ENOMEM;
513 		goto free_tms;
514 	}
515 
516 	tm->logical = eb->start;
517 	tm->slot = src_slot;
518 	tm->move.dst_slot = dst_slot;
519 	tm->move.nr_items = nr_items;
520 	tm->op = MOD_LOG_MOVE_KEYS;
521 
522 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
523 		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
524 		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
525 		if (!tm_list[i]) {
526 			ret = -ENOMEM;
527 			goto free_tms;
528 		}
529 	}
530 
531 	if (tree_mod_dont_log(eb->fs_info, eb))
532 		goto free_tms;
533 	locked = 1;
534 
535 	/*
536 	 * When we overwrite something during the move, we log these removals.
537 	 * This can only happen when we move towards the beginning of the
538 	 * buffer, i.e. dst_slot < src_slot.
539 	 */
540 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
541 		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
542 		if (ret)
543 			goto free_tms;
544 	}
545 
546 	ret = __tree_mod_log_insert(eb->fs_info, tm);
547 	if (ret)
548 		goto free_tms;
549 	write_unlock(&eb->fs_info->tree_mod_log_lock);
550 	kfree(tm_list);
551 
552 	return 0;
553 free_tms:
554 	for (i = 0; i < nr_items; i++) {
555 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
556 			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
557 		kfree(tm_list[i]);
558 	}
559 	if (locked)
560 		write_unlock(&eb->fs_info->tree_mod_log_lock);
561 	kfree(tm_list);
562 	kfree(tm);
563 
564 	return ret;
565 }
566 
567 static inline int
568 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
569 		       struct tree_mod_elem **tm_list,
570 		       int nritems)
571 {
572 	int i, j;
573 	int ret;
574 
575 	for (i = nritems - 1; i >= 0; i--) {
576 		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
577 		if (ret) {
578 			for (j = nritems - 1; j > i; j--)
579 				rb_erase(&tm_list[j]->node,
580 					 &fs_info->tree_mod_log);
581 			return ret;
582 		}
583 	}
584 
585 	return 0;
586 }
587 
588 static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
589 			 struct extent_buffer *new_root, int log_removal)
590 {
591 	struct btrfs_fs_info *fs_info = old_root->fs_info;
592 	struct tree_mod_elem *tm = NULL;
593 	struct tree_mod_elem **tm_list = NULL;
594 	int nritems = 0;
595 	int ret = 0;
596 	int i;
597 
598 	if (!tree_mod_need_log(fs_info, NULL))
599 		return 0;
600 
601 	if (log_removal && btrfs_header_level(old_root) > 0) {
602 		nritems = btrfs_header_nritems(old_root);
603 		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
604 				  GFP_NOFS);
605 		if (!tm_list) {
606 			ret = -ENOMEM;
607 			goto free_tms;
608 		}
609 		for (i = 0; i < nritems; i++) {
610 			tm_list[i] = alloc_tree_mod_elem(old_root, i,
611 			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
612 			if (!tm_list[i]) {
613 				ret = -ENOMEM;
614 				goto free_tms;
615 			}
616 		}
617 	}
618 
619 	tm = kzalloc(sizeof(*tm), GFP_NOFS);
620 	if (!tm) {
621 		ret = -ENOMEM;
622 		goto free_tms;
623 	}
624 
625 	tm->logical = new_root->start;
626 	tm->old_root.logical = old_root->start;
627 	tm->old_root.level = btrfs_header_level(old_root);
628 	tm->generation = btrfs_header_generation(old_root);
629 	tm->op = MOD_LOG_ROOT_REPLACE;
630 
631 	if (tree_mod_dont_log(fs_info, NULL))
632 		goto free_tms;
633 
634 	if (tm_list)
635 		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
636 	if (!ret)
637 		ret = __tree_mod_log_insert(fs_info, tm);
638 
639 	write_unlock(&fs_info->tree_mod_log_lock);
640 	if (ret)
641 		goto free_tms;
642 	kfree(tm_list);
643 
644 	return ret;
645 
646 free_tms:
647 	if (tm_list) {
648 		for (i = 0; i < nritems; i++)
649 			kfree(tm_list[i]);
650 		kfree(tm_list);
651 	}
652 	kfree(tm);
653 
654 	return ret;
655 }
656 
657 static struct tree_mod_elem *
658 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
659 		      int smallest)
660 {
661 	struct rb_root *tm_root;
662 	struct rb_node *node;
663 	struct tree_mod_elem *cur = NULL;
664 	struct tree_mod_elem *found = NULL;
665 
666 	read_lock(&fs_info->tree_mod_log_lock);
667 	tm_root = &fs_info->tree_mod_log;
668 	node = tm_root->rb_node;
669 	while (node) {
670 		cur = rb_entry(node, struct tree_mod_elem, node);
671 		if (cur->logical < start) {
672 			node = node->rb_left;
673 		} else if (cur->logical > start) {
674 			node = node->rb_right;
675 		} else if (cur->seq < min_seq) {
676 			node = node->rb_left;
677 		} else if (!smallest) {
678 			/* we want the node with the highest seq */
679 			if (found)
680 				BUG_ON(found->seq > cur->seq);
681 			found = cur;
682 			node = node->rb_left;
683 		} else if (cur->seq > min_seq) {
684 			/* we want the node with the smallest seq */
685 			if (found)
686 				BUG_ON(found->seq < cur->seq);
687 			found = cur;
688 			node = node->rb_right;
689 		} else {
690 			found = cur;
691 			break;
692 		}
693 	}
694 	read_unlock(&fs_info->tree_mod_log_lock);
695 
696 	return found;
697 }
698 
699 /*
700  * this returns the element from the log with the smallest time sequence
701  * value (the oldest log item).  Any element with a time
702  * sequence lower than min_seq will be ignored.
703  */
704 static struct tree_mod_elem *
705 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
706 			   u64 min_seq)
707 {
708 	return __tree_mod_log_search(fs_info, start, min_seq, 1);
709 }
710 
711 /*
712  * this returns the element from the log with the largest time sequence
713  * value (the most recent log item).  Any element with
714  * a time sequence lower than min_seq will be ignored.
715  */
716 static struct tree_mod_elem *
717 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
718 {
719 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
720 }
721 
722 static noinline int
723 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
724 		     struct extent_buffer *src, unsigned long dst_offset,
725 		     unsigned long src_offset, int nr_items)
726 {
727 	int ret = 0;
728 	struct tree_mod_elem **tm_list = NULL;
729 	struct tree_mod_elem **tm_list_add, **tm_list_rem;
730 	int i;
731 	int locked = 0;
732 
733 	if (!tree_mod_need_log(fs_info, NULL))
734 		return 0;
735 
736 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
737 		return 0;
738 
739 	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
740 			  GFP_NOFS);
741 	if (!tm_list)
742 		return -ENOMEM;
743 
744 	tm_list_add = tm_list;
745 	tm_list_rem = tm_list + nr_items;
746 	for (i = 0; i < nr_items; i++) {
747 		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
748 		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
749 		if (!tm_list_rem[i]) {
750 			ret = -ENOMEM;
751 			goto free_tms;
752 		}
753 
754 		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
755 		    MOD_LOG_KEY_ADD, GFP_NOFS);
756 		if (!tm_list_add[i]) {
757 			ret = -ENOMEM;
758 			goto free_tms;
759 		}
760 	}
761 
762 	if (tree_mod_dont_log(fs_info, NULL))
763 		goto free_tms;
764 	locked = 1;
765 
766 	for (i = 0; i < nr_items; i++) {
767 		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
768 		if (ret)
769 			goto free_tms;
770 		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
771 		if (ret)
772 			goto free_tms;
773 	}
774 
775 	write_unlock(&fs_info->tree_mod_log_lock);
776 	kfree(tm_list);
777 
778 	return 0;
779 
780 free_tms:
781 	for (i = 0; i < nr_items * 2; i++) {
782 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
783 			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
784 		kfree(tm_list[i]);
785 	}
786 	if (locked)
787 		write_unlock(&fs_info->tree_mod_log_lock);
788 	kfree(tm_list);
789 
790 	return ret;
791 }
792 
793 static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
794 {
795 	struct tree_mod_elem **tm_list = NULL;
796 	int nritems = 0;
797 	int i;
798 	int ret = 0;
799 
800 	if (btrfs_header_level(eb) == 0)
801 		return 0;
802 
803 	if (!tree_mod_need_log(eb->fs_info, NULL))
804 		return 0;
805 
806 	nritems = btrfs_header_nritems(eb);
807 	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
808 	if (!tm_list)
809 		return -ENOMEM;
810 
811 	for (i = 0; i < nritems; i++) {
812 		tm_list[i] = alloc_tree_mod_elem(eb, i,
813 		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
814 		if (!tm_list[i]) {
815 			ret = -ENOMEM;
816 			goto free_tms;
817 		}
818 	}
819 
820 	if (tree_mod_dont_log(eb->fs_info, eb))
821 		goto free_tms;
822 
823 	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
824 	write_unlock(&eb->fs_info->tree_mod_log_lock);
825 	if (ret)
826 		goto free_tms;
827 	kfree(tm_list);
828 
829 	return 0;
830 
831 free_tms:
832 	for (i = 0; i < nritems; i++)
833 		kfree(tm_list[i]);
834 	kfree(tm_list);
835 
836 	return ret;
837 }
838 
839 /*
840  * check if the tree block can be shared by multiple trees
841  */
842 int btrfs_block_can_be_shared(struct btrfs_root *root,
843 			      struct extent_buffer *buf)
844 {
845 	/*
846 	 * Tree blocks not in reference counted trees and tree roots
847 	 * are never shared. If a block was allocated after the last
848 	 * snapshot and the block was not allocated by tree relocation,
849 	 * we know the block is not shared.
850 	 */
851 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
852 	    buf != root->node && buf != root->commit_root &&
853 	    (btrfs_header_generation(buf) <=
854 	     btrfs_root_last_snapshot(&root->root_item) ||
855 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
856 		return 1;
857 
858 	return 0;
859 }
860 
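/*
 * Update backrefs when COWing @buf into @cow.  If @buf is still shared we
 * add references for the new copy (switching to full backrefs where the
 * rules below require it); if ours was the last reference we transfer the
 * references to @cow, clean @buf and report that via @last_ref so the
 * caller can free the old block.
 */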
861 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
862 				       struct btrfs_root *root,
863 				       struct extent_buffer *buf,
864 				       struct extent_buffer *cow,
865 				       int *last_ref)
866 {
867 	struct btrfs_fs_info *fs_info = root->fs_info;
868 	u64 refs;
869 	u64 owner;
870 	u64 flags;
871 	u64 new_flags = 0;
872 	int ret;
873 
874 	/*
875 	 * Backrefs update rules:
876 	 *
877 	 * Always use full backrefs for extent pointers in tree block
878 	 * allocated by tree relocation.
879 	 *
880 	 * If a shared tree block is no longer referenced by its owner
881 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
882 	 * use full backrefs for extent pointers in tree block.
883 	 *
884 	 * If a tree block is being relocated
885 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
886 	 * use full backrefs for extent pointers in tree block.
887 	 * The reason for this is that some operations (such as dropping a tree)
888 	 * are only allowed on blocks that use full backrefs.
889 	 */
890 
891 	if (btrfs_block_can_be_shared(root, buf)) {
892 		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
893 					       btrfs_header_level(buf), 1,
894 					       &refs, &flags);
895 		if (ret)
896 			return ret;
897 		if (refs == 0) {
898 			ret = -EROFS;
899 			btrfs_handle_fs_error(fs_info, ret, NULL);
900 			return ret;
901 		}
902 	} else {
903 		refs = 1;
904 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
905 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
906 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
907 		else
908 			flags = 0;
909 	}
910 
911 	owner = btrfs_header_owner(buf);
912 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
913 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
914 
915 	if (refs > 1) {
916 		if ((owner == root->root_key.objectid ||
917 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
918 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
919 			ret = btrfs_inc_ref(trans, root, buf, 1);
920 			if (ret)
921 				return ret;
922 
923 			if (root->root_key.objectid ==
924 			    BTRFS_TREE_RELOC_OBJECTID) {
925 				ret = btrfs_dec_ref(trans, root, buf, 0);
926 				if (ret)
927 					return ret;
928 				ret = btrfs_inc_ref(trans, root, cow, 1);
929 				if (ret)
930 					return ret;
931 			}
932 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
933 		} else {
934 
935 			if (root->root_key.objectid ==
936 			    BTRFS_TREE_RELOC_OBJECTID)
937 				ret = btrfs_inc_ref(trans, root, cow, 1);
938 			else
939 				ret = btrfs_inc_ref(trans, root, cow, 0);
940 			if (ret)
941 				return ret;
942 		}
943 		if (new_flags != 0) {
944 			int level = btrfs_header_level(buf);
945 
946 			ret = btrfs_set_disk_extent_flags(trans, fs_info,
947 							  buf->start,
948 							  buf->len,
949 							  new_flags, level, 0);
950 			if (ret)
951 				return ret;
952 		}
953 	} else {
954 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
955 			if (root->root_key.objectid ==
956 			    BTRFS_TREE_RELOC_OBJECTID)
957 				ret = btrfs_inc_ref(trans, root, cow, 1);
958 			else
959 				ret = btrfs_inc_ref(trans, root, cow, 0);
960 			if (ret)
961 				return ret;
962 			ret = btrfs_dec_ref(trans, root, buf, 1);
963 			if (ret)
964 				return ret;
965 		}
966 		clean_tree_block(fs_info, buf);
967 		*last_ref = 1;
968 	}
969 	return 0;
970 }
971 
972 static struct extent_buffer *alloc_tree_block_no_bg_flush(
973 					  struct btrfs_trans_handle *trans,
974 					  struct btrfs_root *root,
975 					  u64 parent_start,
976 					  const struct btrfs_disk_key *disk_key,
977 					  int level,
978 					  u64 hint,
979 					  u64 empty_size)
980 {
981 	struct btrfs_fs_info *fs_info = root->fs_info;
982 	struct extent_buffer *ret;
983 
984 	/*
985 	 * If we are COWing a node/leaf from the extent, chunk, device or free
986 	 * space trees, make sure that we do not finish block group creation of
987 	 * pending block groups. We do this to avoid a deadlock.
988 	 * COWing can result in allocation of a new chunk, and flushing pending
989 	 * block groups (btrfs_create_pending_block_groups()) can be triggered
990 	 * when finishing allocation of a new chunk. Creation of a pending block
991 	 * group modifies the extent, chunk, device and free space trees,
992 	 * therefore we could deadlock with ourselves since we are holding a
993 	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
994 	 * try to COW later.
995 	 * For similar reasons, we also need to delay flushing pending block
996 	 * groups when splitting a leaf or node, from one of those trees, since
997 	 * we are holding a write lock on it and its parent or when inserting a
998 	 * new root node for one of those trees.
999 	 */
1000 	if (root == fs_info->extent_root ||
1001 	    root == fs_info->chunk_root ||
1002 	    root == fs_info->dev_root ||
1003 	    root == fs_info->free_space_root)
1004 		trans->can_flush_pending_bgs = false;
1005 
1006 	ret = btrfs_alloc_tree_block(trans, root, parent_start,
1007 				     root->root_key.objectid, disk_key, level,
1008 				     hint, empty_size);
1009 	trans->can_flush_pending_bgs = true;
1010 
1011 	return ret;
1012 }
1013 
1014 /*
1015  * does the dirty work in cow of a single block.  The parent block (if
1016  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1017  * dirty and returned locked.  If you modify the block it needs to be marked
1018  * dirty again.
1019  *
1020  * search_start -- an allocation hint for the new block
1021  *
1022  * empty_size -- a hint that you plan on doing more cow.  This is the amount
1023  * of free space, in bytes, the allocator should try to find next to the
1024  * block it returns.  This is just a hint and may be ignored by the allocator.
1025  */
1026 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1027 			     struct btrfs_root *root,
1028 			     struct extent_buffer *buf,
1029 			     struct extent_buffer *parent, int parent_slot,
1030 			     struct extent_buffer **cow_ret,
1031 			     u64 search_start, u64 empty_size)
1032 {
1033 	struct btrfs_fs_info *fs_info = root->fs_info;
1034 	struct btrfs_disk_key disk_key;
1035 	struct extent_buffer *cow;
1036 	int level, ret;
1037 	int last_ref = 0;
1038 	int unlock_orig = 0;
1039 	u64 parent_start = 0;
1040 
1041 	if (*cow_ret == buf)
1042 		unlock_orig = 1;
1043 
1044 	btrfs_assert_tree_locked(buf);
1045 
1046 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1047 		trans->transid != fs_info->running_transaction->transid);
1048 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1049 		trans->transid != root->last_trans);
1050 
1051 	level = btrfs_header_level(buf);
1052 
1053 	if (level == 0)
1054 		btrfs_item_key(buf, &disk_key, 0);
1055 	else
1056 		btrfs_node_key(buf, &disk_key, 0);
1057 
1058 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1059 		parent_start = parent->start;
1060 
1061 	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
1062 					   level, search_start, empty_size);
1063 	if (IS_ERR(cow))
1064 		return PTR_ERR(cow);
1065 
1066 	/* cow is set to blocking by btrfs_init_new_buffer */
1067 
1068 	copy_extent_buffer_full(cow, buf);
1069 	btrfs_set_header_bytenr(cow, cow->start);
1070 	btrfs_set_header_generation(cow, trans->transid);
1071 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1072 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1073 				     BTRFS_HEADER_FLAG_RELOC);
1074 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1075 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1076 	else
1077 		btrfs_set_header_owner(cow, root->root_key.objectid);
1078 
1079 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
1080 
1081 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1082 	if (ret) {
1083 		btrfs_abort_transaction(trans, ret);
1084 		return ret;
1085 	}
1086 
1087 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1088 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1089 		if (ret) {
1090 			btrfs_abort_transaction(trans, ret);
1091 			return ret;
1092 		}
1093 	}
1094 
1095 	if (buf == root->node) {
1096 		WARN_ON(parent && parent != buf);
1097 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1098 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1099 			parent_start = buf->start;
1100 
1101 		extent_buffer_get(cow);
1102 		ret = tree_mod_log_insert_root(root->node, cow, 1);
1103 		BUG_ON(ret < 0);
1104 		rcu_assign_pointer(root->node, cow);
1105 
1106 		btrfs_free_tree_block(trans, root, buf, parent_start,
1107 				      last_ref);
1108 		free_extent_buffer(buf);
1109 		add_root_to_dirty_list(root);
1110 	} else {
1111 		WARN_ON(trans->transid != btrfs_header_generation(parent));
1112 		tree_mod_log_insert_key(parent, parent_slot,
1113 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1114 		btrfs_set_node_blockptr(parent, parent_slot,
1115 					cow->start);
1116 		btrfs_set_node_ptr_generation(parent, parent_slot,
1117 					      trans->transid);
1118 		btrfs_mark_buffer_dirty(parent);
1119 		if (last_ref) {
1120 			ret = tree_mod_log_free_eb(buf);
1121 			if (ret) {
1122 				btrfs_abort_transaction(trans, ret);
1123 				return ret;
1124 			}
1125 		}
1126 		btrfs_free_tree_block(trans, root, buf, parent_start,
1127 				      last_ref);
1128 	}
1129 	if (unlock_orig)
1130 		btrfs_tree_unlock(buf);
1131 	free_extent_buffer_stale(buf);
1132 	btrfs_mark_buffer_dirty(cow);
1133 	*cow_ret = cow;
1134 	return 0;
1135 }
1136 
1137 /*
1138  * returns the logical address of the oldest predecessor of the given root.
1139  * entries older than time_seq are ignored.
1140  */
1141 static struct tree_mod_elem *__tree_mod_log_oldest_root(
1142 		struct extent_buffer *eb_root, u64 time_seq)
1143 {
1144 	struct tree_mod_elem *tm;
1145 	struct tree_mod_elem *found = NULL;
1146 	u64 root_logical = eb_root->start;
1147 	int looped = 0;
1148 
1149 	if (!time_seq)
1150 		return NULL;
1151 
1152 	/*
1153 	 * the very last operation that's logged for a root is the
1154 	 * replacement operation (if it is replaced at all). it is keyed on
1155 	 * the logical address of the *new* root, making it the very
1156 	 * first operation that's logged for the new root.
1157 	 */
1158 	while (1) {
1159 		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1160 						time_seq);
1161 		if (!looped && !tm)
1162 			return NULL;
1163 		/*
1164 		 * if there are no tree operations for the oldest root, we simply
1165 		 * return it. this should only happen if that (old) root is at
1166 		 * level 0.
1167 		 */
1168 		if (!tm)
1169 			break;
1170 
1171 		/*
1172 		 * if there's an operation that's not a root replacement, we
1173 		 * found the oldest version of our root. normally, we'll find a
1174 		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1175 		 */
1176 		if (tm->op != MOD_LOG_ROOT_REPLACE)
1177 			break;
1178 
1179 		found = tm;
1180 		root_logical = tm->old_root.logical;
1181 		looped = 1;
1182 	}
1183 
1184 	/* if there's no old root to return, return what we found instead */
1185 	if (!found)
1186 		found = tm;
1187 
1188 	return found;
1189 }
1190 
1191 /*
1192  * tm is a pointer to the first operation to rewind within eb. then, all
1193  * previous operations will be rewound (until we reach something older than
1194  * time_seq).
1195  */
1196 static void
1197 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1198 		      u64 time_seq, struct tree_mod_elem *first_tm)
1199 {
1200 	u32 n;
1201 	struct rb_node *next;
1202 	struct tree_mod_elem *tm = first_tm;
1203 	unsigned long o_dst;
1204 	unsigned long o_src;
1205 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
1206 
1207 	n = btrfs_header_nritems(eb);
1208 	read_lock(&fs_info->tree_mod_log_lock);
1209 	while (tm && tm->seq >= time_seq) {
1210 		/*
1211 		 * all the operations are recorded with the operator used for
1212 		 * the modification. as we're going backwards, we do the
1213 		 * opposite of each operation here.
1214 		 */
1215 		switch (tm->op) {
1216 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1217 			BUG_ON(tm->slot < n);
1218 			/* Fallthrough */
1219 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1220 		case MOD_LOG_KEY_REMOVE:
1221 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1222 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1223 			btrfs_set_node_ptr_generation(eb, tm->slot,
1224 						      tm->generation);
1225 			n++;
1226 			break;
1227 		case MOD_LOG_KEY_REPLACE:
1228 			BUG_ON(tm->slot >= n);
1229 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1230 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1231 			btrfs_set_node_ptr_generation(eb, tm->slot,
1232 						      tm->generation);
1233 			break;
1234 		case MOD_LOG_KEY_ADD:
1235 			/* if a move operation is needed it's in the log */
1236 			n--;
1237 			break;
1238 		case MOD_LOG_MOVE_KEYS:
1239 			o_dst = btrfs_node_key_ptr_offset(tm->slot);
1240 			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1241 			memmove_extent_buffer(eb, o_dst, o_src,
1242 					      tm->move.nr_items * p_size);
1243 			break;
1244 		case MOD_LOG_ROOT_REPLACE:
1245 			/*
1246 			 * this operation is special. for roots, this must be
1247 			 * handled explicitly before rewinding.
1248 			 * for non-roots, this operation may exist if the node
1249 			 * was a root: root A -> child B; then A gets empty and
1250 			 * B is promoted to the new root. in the mod log, we'll
1251 			 * have a root-replace operation for B, a tree block
1252 			 * that is not a root. we simply ignore that operation.
1253 			 */
1254 			break;
1255 		}
1256 		next = rb_next(&tm->node);
1257 		if (!next)
1258 			break;
1259 		tm = rb_entry(next, struct tree_mod_elem, node);
1260 		if (tm->logical != first_tm->logical)
1261 			break;
1262 	}
1263 	read_unlock(&fs_info->tree_mod_log_lock);
1264 	btrfs_set_header_nritems(eb, n);
1265 }
1266 
1267 /*
1268  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1269  * is returned. If rewind operations happen, a fresh buffer is returned. The
1270  * returned buffer is always read-locked. If the returned buffer is not the
1271  * input buffer, the lock on the input buffer is released and the input buffer
1272  * is freed (its refcount is decremented).
1273  */
1274 static struct extent_buffer *
1275 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1276 		    struct extent_buffer *eb, u64 time_seq)
1277 {
1278 	struct extent_buffer *eb_rewin;
1279 	struct tree_mod_elem *tm;
1280 
1281 	if (!time_seq)
1282 		return eb;
1283 
1284 	if (btrfs_header_level(eb) == 0)
1285 		return eb;
1286 
1287 	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1288 	if (!tm)
1289 		return eb;
1290 
1291 	btrfs_set_path_blocking(path);
1292 	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1293 
1294 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1295 		BUG_ON(tm->slot != 0);
1296 		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1297 		if (!eb_rewin) {
1298 			btrfs_tree_read_unlock_blocking(eb);
1299 			free_extent_buffer(eb);
1300 			return NULL;
1301 		}
1302 		btrfs_set_header_bytenr(eb_rewin, eb->start);
1303 		btrfs_set_header_backref_rev(eb_rewin,
1304 					     btrfs_header_backref_rev(eb));
1305 		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1306 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1307 	} else {
1308 		eb_rewin = btrfs_clone_extent_buffer(eb);
1309 		if (!eb_rewin) {
1310 			btrfs_tree_read_unlock_blocking(eb);
1311 			free_extent_buffer(eb);
1312 			return NULL;
1313 		}
1314 	}
1315 
1316 	btrfs_tree_read_unlock_blocking(eb);
1317 	free_extent_buffer(eb);
1318 
1319 	btrfs_tree_read_lock(eb_rewin);
1320 	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1321 	WARN_ON(btrfs_header_nritems(eb_rewin) >
1322 		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1323 
1324 	return eb_rewin;
1325 }
1326 
1327 /*
1328  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1329  * value. If there are no changes, the current root->root_node is returned. If
1330  * anything changed in between, there's a fresh buffer allocated on which the
1331  * rewind operations are done. In any case, the returned buffer is read locked.
1332  * Returns NULL on error (with no locks held).
1333  */
1334 static inline struct extent_buffer *
1335 get_old_root(struct btrfs_root *root, u64 time_seq)
1336 {
1337 	struct btrfs_fs_info *fs_info = root->fs_info;
1338 	struct tree_mod_elem *tm;
1339 	struct extent_buffer *eb = NULL;
1340 	struct extent_buffer *eb_root;
1341 	struct extent_buffer *old;
1342 	struct tree_mod_root *old_root = NULL;
1343 	u64 old_generation = 0;
1344 	u64 logical;
1345 	int level;
1346 
1347 	eb_root = btrfs_read_lock_root_node(root);
1348 	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1349 	if (!tm)
1350 		return eb_root;
1351 
1352 	if (tm->op == MOD_LOG_ROOT_REPLACE) {
1353 		old_root = &tm->old_root;
1354 		old_generation = tm->generation;
1355 		logical = old_root->logical;
1356 		level = old_root->level;
1357 	} else {
1358 		logical = eb_root->start;
1359 		level = btrfs_header_level(eb_root);
1360 	}
1361 
1362 	tm = tree_mod_log_search(fs_info, logical, time_seq);
1363 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1364 		btrfs_tree_read_unlock(eb_root);
1365 		free_extent_buffer(eb_root);
1366 		old = read_tree_block(fs_info, logical, 0, level, NULL);
1367 		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1368 			if (!IS_ERR(old))
1369 				free_extent_buffer(old);
1370 			btrfs_warn(fs_info,
1371 				   "failed to read tree block %llu from get_old_root",
1372 				   logical);
1373 		} else {
1374 			eb = btrfs_clone_extent_buffer(old);
1375 			free_extent_buffer(old);
1376 		}
1377 	} else if (old_root) {
1378 		btrfs_tree_read_unlock(eb_root);
1379 		free_extent_buffer(eb_root);
1380 		eb = alloc_dummy_extent_buffer(fs_info, logical);
1381 	} else {
1382 		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1383 		eb = btrfs_clone_extent_buffer(eb_root);
1384 		btrfs_tree_read_unlock_blocking(eb_root);
1385 		free_extent_buffer(eb_root);
1386 	}
1387 
1388 	if (!eb)
1389 		return NULL;
1390 	btrfs_tree_read_lock(eb);
1391 	if (old_root) {
1392 		btrfs_set_header_bytenr(eb, eb->start);
1393 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1394 		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1395 		btrfs_set_header_level(eb, old_root->level);
1396 		btrfs_set_header_generation(eb, old_generation);
1397 	}
1398 	if (tm)
1399 		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1400 	else
1401 		WARN_ON(btrfs_header_level(eb) != 0);
1402 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1403 
1404 	return eb;
1405 }
1406 
1407 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1408 {
1409 	struct tree_mod_elem *tm;
1410 	int level;
1411 	struct extent_buffer *eb_root = btrfs_root_node(root);
1412 
1413 	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1414 	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1415 		level = tm->old_root.level;
1416 	} else {
1417 		level = btrfs_header_level(eb_root);
1418 	}
1419 	free_extent_buffer(eb_root);
1420 
1421 	return level;
1422 }
1423 
1424 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1425 				   struct btrfs_root *root,
1426 				   struct extent_buffer *buf)
1427 {
1428 	if (btrfs_is_testing(root->fs_info))
1429 		return 0;
1430 
1431 	/* Ensure we can see the FORCE_COW bit */
1432 	smp_mb__before_atomic();
1433 
1434 	/*
1435 	 * We do not need to cow a block if all of the following hold:
1436 	 * 1) this block was created or changed in this transaction and has not
1437 	 *    been written yet; 2) this block does not carry the RELOC flag
1438 	 *    outside the TREE_RELOC tree; 3) the root is not forced COW.
1439 	 *
1440 	 * What is forced COW:
1441 	 *    when we create snapshot during committing the transaction,
1442 	 *    after we've finished copying src root, we must COW the shared
1443 	 *    block to ensure the metadata consistency.
1444 	 */
1445 	if (btrfs_header_generation(buf) == trans->transid &&
1446 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1447 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1448 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1449 	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1450 		return 0;
1451 	return 1;
1452 }
1453 
1454 /*
1455  * cows a single block, see __btrfs_cow_block for the real work.
1456  * This version of it has extra checks so that a block isn't COWed more than
1457  * once per transaction, as long as it hasn't been written yet.
1458  */
1459 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1460 		    struct btrfs_root *root, struct extent_buffer *buf,
1461 		    struct extent_buffer *parent, int parent_slot,
1462 		    struct extent_buffer **cow_ret)
1463 {
1464 	struct btrfs_fs_info *fs_info = root->fs_info;
1465 	u64 search_start;
1466 	int ret;
1467 
1468 	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1469 		btrfs_err(fs_info,
1470 			"COW'ing blocks on a fs root that's being dropped");
1471 
1472 	if (trans->transaction != fs_info->running_transaction)
1473 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1474 		       trans->transid,
1475 		       fs_info->running_transaction->transid);
1476 
1477 	if (trans->transid != fs_info->generation)
1478 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1479 		       trans->transid, fs_info->generation);
1480 
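	/*
	 * COW can be skipped, e.g. because the block was already COWed in
	 * this transaction; hand back the buffer unchanged.
	 */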
1481 	if (!should_cow_block(trans, root, buf)) {
1482 		trans->dirty = true;
1483 		*cow_ret = buf;
1484 		return 0;
1485 	}
1486 
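	/* Round the allocation hint down to the 1GiB boundary containing @buf. */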
1487 	search_start = buf->start & ~((u64)SZ_1G - 1);
1488 
1489 	if (parent)
1490 		btrfs_set_lock_blocking(parent);
1491 	btrfs_set_lock_blocking(buf);
1492 
1493 	/*
1494 	 * Before CoWing this block for later modification, check if it's
1495 	 * the subtree root and do the delayed subtree trace if needed.
1496 	 *
1497 	 * Also, we don't care about the error, as it's handled internally.
1498 	 */
1499 	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1500 	ret = __btrfs_cow_block(trans, root, buf, parent,
1501 				 parent_slot, cow_ret, search_start, 0);
1502 
1503 	trace_btrfs_cow_block(root, buf, *cow_ret);
1504 
1505 	return ret;
1506 }
1507 
1508 /*
1509  * helper function for defrag to decide if two blocks pointed to by a
1510  * node are actually close by
1511  */
1512 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1513 {
1514 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1515 		return 1;
1516 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1517 		return 1;
1518 	return 0;
1519 }
1520 
1521 /*
1522  * compare two keys in a memcmp fashion
1523  */
1524 static int comp_keys(const struct btrfs_disk_key *disk,
1525 		     const struct btrfs_key *k2)
1526 {
1527 	struct btrfs_key k1;
1528 
1529 	btrfs_disk_key_to_cpu(&k1, disk);
1530 
1531 	return btrfs_comp_cpu_keys(&k1, k2);
1532 }
1533 
1534 /*
1535  * same as comp_keys only with two btrfs_key's
1536  */
1537 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1538 {
1539 	if (k1->objectid > k2->objectid)
1540 		return 1;
1541 	if (k1->objectid < k2->objectid)
1542 		return -1;
1543 	if (k1->type > k2->type)
1544 		return 1;
1545 	if (k1->type < k2->type)
1546 		return -1;
1547 	if (k1->offset > k2->offset)
1548 		return 1;
1549 	if (k1->offset < k2->offset)
1550 		return -1;
1551 	return 0;
1552 }
1553 
1554 /*
1555  * this is used by the defrag code to go through all the
1556  * leaves pointed to by a node and reallocate them so that
1557  * disk order is close to key order
1558  */
1559 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1560 		       struct btrfs_root *root, struct extent_buffer *parent,
1561 		       int start_slot, u64 *last_ret,
1562 		       struct btrfs_key *progress)
1563 {
1564 	struct btrfs_fs_info *fs_info = root->fs_info;
1565 	struct extent_buffer *cur;
1566 	u64 blocknr;
1567 	u64 gen;
1568 	u64 search_start = *last_ret;
1569 	u64 last_block = 0;
1570 	u64 other;
1571 	u32 parent_nritems;
1572 	int end_slot;
1573 	int i;
1574 	int err = 0;
1575 	int parent_level;
1576 	int uptodate;
1577 	u32 blocksize;
1578 	int progress_passed = 0;
1579 	struct btrfs_disk_key disk_key;
1580 
1581 	parent_level = btrfs_header_level(parent);
1582 
1583 	WARN_ON(trans->transaction != fs_info->running_transaction);
1584 	WARN_ON(trans->transid != fs_info->generation);
1585 
1586 	parent_nritems = btrfs_header_nritems(parent);
1587 	blocksize = fs_info->nodesize;
1588 	end_slot = parent_nritems - 1;
1589 
1590 	if (parent_nritems <= 1)
1591 		return 0;
1592 
1593 	btrfs_set_lock_blocking(parent);
1594 
1595 	for (i = start_slot; i <= end_slot; i++) {
1596 		struct btrfs_key first_key;
1597 		int close = 1;
1598 
1599 		btrfs_node_key(parent, &disk_key, i);
1600 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1601 			continue;
1602 
1603 		progress_passed = 1;
1604 		blocknr = btrfs_node_blockptr(parent, i);
1605 		gen = btrfs_node_ptr_generation(parent, i);
1606 		btrfs_node_key_to_cpu(parent, &first_key, i);
1607 		if (last_block == 0)
1608 			last_block = blocknr;
1609 
1610 		if (i > 0) {
1611 			other = btrfs_node_blockptr(parent, i - 1);
1612 			close = close_blocks(blocknr, other, blocksize);
1613 		}
1614 		if (!close && i < end_slot) {
1615 			other = btrfs_node_blockptr(parent, i + 1);
1616 			close = close_blocks(blocknr, other, blocksize);
1617 		}
1618 		if (close) {
1619 			last_block = blocknr;
1620 			continue;
1621 		}
1622 
1623 		cur = find_extent_buffer(fs_info, blocknr);
1624 		if (cur)
1625 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1626 		else
1627 			uptodate = 0;
1628 		if (!cur || !uptodate) {
1629 			if (!cur) {
1630 				cur = read_tree_block(fs_info, blocknr, gen,
1631 						      parent_level - 1,
1632 						      &first_key);
1633 				if (IS_ERR(cur)) {
1634 					return PTR_ERR(cur);
1635 				} else if (!extent_buffer_uptodate(cur)) {
1636 					free_extent_buffer(cur);
1637 					return -EIO;
1638 				}
1639 			} else if (!uptodate) {
1640 				err = btrfs_read_buffer(cur, gen,
1641 						parent_level - 1, &first_key);
1642 				if (err) {
1643 					free_extent_buffer(cur);
1644 					return err;
1645 				}
1646 			}
1647 		}
1648 		if (search_start == 0)
1649 			search_start = last_block;
1650 
1651 		btrfs_tree_lock(cur);
1652 		btrfs_set_lock_blocking(cur);
1653 		err = __btrfs_cow_block(trans, root, cur, parent, i,
1654 					&cur, search_start,
1655 					min(16 * blocksize,
1656 					    (end_slot - i) * blocksize));
1657 		if (err) {
1658 			btrfs_tree_unlock(cur);
1659 			free_extent_buffer(cur);
1660 			break;
1661 		}
1662 		search_start = cur->start;
1663 		last_block = cur->start;
1664 		*last_ret = search_start;
1665 		btrfs_tree_unlock(cur);
1666 		free_extent_buffer(cur);
1667 	}
1668 	return err;
1669 }
1670 
1671 /*
1672  * search for key in the extent_buffer.  The items start at offset p,
1673  * and they are item_size apart.  There are 'max' items in the array.
1674  *
1675  * the slot in the array is returned via slot, and it points to
1676  * the place where you would insert key if it is not found in
1677  * the array.
1678  *
1679  * slot may point to max if the key is bigger than all of the keys
1680  */
1681 static noinline int generic_bin_search(struct extent_buffer *eb,
1682 				       unsigned long p, int item_size,
1683 				       const struct btrfs_key *key,
1684 				       int max, int *slot)
1685 {
1686 	int low = 0;
1687 	int high = max;
1688 	int mid;
1689 	int ret;
1690 	struct btrfs_disk_key *tmp = NULL;
1691 	struct btrfs_disk_key unaligned;
1692 	unsigned long offset;
1693 	char *kaddr = NULL;
1694 	unsigned long map_start = 0;
1695 	unsigned long map_len = 0;
1696 	int err;
1697 
1698 	if (low > high) {
1699 		btrfs_err(eb->fs_info,
1700 		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1701 			  __func__, low, high, eb->start,
1702 			  btrfs_header_owner(eb), btrfs_header_level(eb));
1703 		return -EINVAL;
1704 	}
1705 
1706 	while (low < high) {
1707 		mid = (low + high) / 2;
1708 		offset = p + mid * item_size;
1709 
1710 		if (!kaddr || offset < map_start ||
1711 		    (offset + sizeof(struct btrfs_disk_key)) >
1712 		    map_start + map_len) {
1713 
1714 			err = map_private_extent_buffer(eb, offset,
1715 						sizeof(struct btrfs_disk_key),
1716 						&kaddr, &map_start, &map_len);
1717 
1718 			if (!err) {
1719 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
1720 							map_start);
1721 			} else if (err == 1) {
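				/*
				 * The key straddles a page boundary and
				 * cannot be mapped contiguously, so copy it
				 * out to the aligned stack buffer instead.
				 */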
1722 				read_extent_buffer(eb, &unaligned,
1723 						   offset, sizeof(unaligned));
1724 				tmp = &unaligned;
1725 			} else {
1726 				return err;
1727 			}
1728 
1729 		} else {
1730 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
1731 							map_start);
1732 		}
1733 		ret = comp_keys(tmp, key);
1734 
1735 		if (ret < 0)
1736 			low = mid + 1;
1737 		else if (ret > 0)
1738 			high = mid;
1739 		else {
1740 			*slot = mid;
1741 			return 0;
1742 		}
1743 	}
1744 	*slot = low;
1745 	return 1;
1746 }
1747 
1748 /*
1749  * simple bin_search frontend that does the right thing for
1750  * leaves vs nodes
1751  */
1752 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1753 		     int level, int *slot)
1754 {
1755 	if (level == 0)
1756 		return generic_bin_search(eb,
1757 					  offsetof(struct btrfs_leaf, items),
1758 					  sizeof(struct btrfs_item),
1759 					  key, btrfs_header_nritems(eb),
1760 					  slot);
1761 	else
1762 		return generic_bin_search(eb,
1763 					  offsetof(struct btrfs_node, ptrs),
1764 					  sizeof(struct btrfs_key_ptr),
1765 					  key, btrfs_header_nritems(eb),
1766 					  slot);
1767 }
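/*
 * A small sketch of how callers typically consume the result (assuming a
 * locked leaf):
 *
 *	ret = btrfs_bin_search(leaf, &key, 0, &slot);
 *	if (ret == 0)
 *		... exact match at @slot ...
 *	else if (ret > 0)
 *		... @slot is where @key would be inserted ...
 */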
1768 
1769 static void root_add_used(struct btrfs_root *root, u32 size)
1770 {
1771 	spin_lock(&root->accounting_lock);
1772 	btrfs_set_root_used(&root->root_item,
1773 			    btrfs_root_used(&root->root_item) + size);
1774 	spin_unlock(&root->accounting_lock);
1775 }
1776 
1777 static void root_sub_used(struct btrfs_root *root, u32 size)
1778 {
1779 	spin_lock(&root->accounting_lock);
1780 	btrfs_set_root_used(&root->root_item,
1781 			    btrfs_root_used(&root->root_item) - size);
1782 	spin_unlock(&root->accounting_lock);
1783 }
1784 
1785 /* given a node and slot number, this reads the block it points to.  The
1786  * extent buffer is returned with a reference taken (but unlocked).
1787  */
1788 static noinline struct extent_buffer *
1789 read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1790 	       int slot)
1791 {
1792 	int level = btrfs_header_level(parent);
1793 	struct extent_buffer *eb;
1794 	struct btrfs_key first_key;
1795 
1796 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
1797 		return ERR_PTR(-ENOENT);
1798 
1799 	BUG_ON(level == 0);
1800 
1801 	btrfs_node_key_to_cpu(parent, &first_key, slot);
1802 	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1803 			     btrfs_node_ptr_generation(parent, slot),
1804 			     level - 1, &first_key);
1805 	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1806 		free_extent_buffer(eb);
1807 		eb = ERR_PTR(-EIO);
1808 	}
1809 
1810 	return eb;
1811 }
1812 
1813 /*
1814  * node level balancing, used to make sure nodes are in proper order for
1815  * item deletion.  We balance from the top down, so we have to make sure
1816  * that a deletion won't leave a node completely empty later on.
1817  */
1818 static noinline int balance_level(struct btrfs_trans_handle *trans,
1819 			 struct btrfs_root *root,
1820 			 struct btrfs_path *path, int level)
1821 {
1822 	struct btrfs_fs_info *fs_info = root->fs_info;
1823 	struct extent_buffer *right = NULL;
1824 	struct extent_buffer *mid;
1825 	struct extent_buffer *left = NULL;
1826 	struct extent_buffer *parent = NULL;
1827 	int ret = 0;
1828 	int wret;
1829 	int pslot;
1830 	int orig_slot = path->slots[level];
1831 	u64 orig_ptr;
1832 
1833 	ASSERT(level > 0);
1834 
1835 	mid = path->nodes[level];
1836 
1837 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1838 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1839 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1840 
1841 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1842 
1843 	if (level < BTRFS_MAX_LEVEL - 1) {
1844 		parent = path->nodes[level + 1];
1845 		pslot = path->slots[level + 1];
1846 	}
1847 
1848 	/*
1849 	 * deal with the case where there is only one pointer in the root
1850 	 * by promoting the node below to a root
1851 	 */
1852 	if (!parent) {
1853 		struct extent_buffer *child;
1854 
1855 		if (btrfs_header_nritems(mid) != 1)
1856 			return 0;
1857 
1858 		/* promote the child to a root */
1859 		child = read_node_slot(fs_info, mid, 0);
1860 		if (IS_ERR(child)) {
1861 			ret = PTR_ERR(child);
1862 			btrfs_handle_fs_error(fs_info, ret, NULL);
1863 			goto enospc;
1864 		}
1865 
1866 		btrfs_tree_lock(child);
1867 		btrfs_set_lock_blocking(child);
1868 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1869 		if (ret) {
1870 			btrfs_tree_unlock(child);
1871 			free_extent_buffer(child);
1872 			goto enospc;
1873 		}
1874 
1875 		ret = tree_mod_log_insert_root(root->node, child, 1);
1876 		BUG_ON(ret < 0);
1877 		rcu_assign_pointer(root->node, child);
1878 
1879 		add_root_to_dirty_list(root);
1880 		btrfs_tree_unlock(child);
1881 
1882 		path->locks[level] = 0;
1883 		path->nodes[level] = NULL;
1884 		clean_tree_block(fs_info, mid);
1885 		btrfs_tree_unlock(mid);
1886 		/* once for the path */
1887 		free_extent_buffer(mid);
1888 
1889 		root_sub_used(root, mid->len);
1890 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1891 		/* once for the root ptr */
1892 		free_extent_buffer_stale(mid);
1893 		return 0;
1894 	}
1895 	if (btrfs_header_nritems(mid) >
1896 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1897 		return 0;
1898 
1899 	left = read_node_slot(fs_info, parent, pslot - 1);
1900 	if (IS_ERR(left))
1901 		left = NULL;
1902 
1903 	if (left) {
1904 		btrfs_tree_lock(left);
1905 		btrfs_set_lock_blocking(left);
1906 		wret = btrfs_cow_block(trans, root, left,
1907 				       parent, pslot - 1, &left);
1908 		if (wret) {
1909 			ret = wret;
1910 			goto enospc;
1911 		}
1912 	}
1913 
1914 	right = read_node_slot(fs_info, parent, pslot + 1);
1915 	if (IS_ERR(right))
1916 		right = NULL;
1917 
1918 	if (right) {
1919 		btrfs_tree_lock(right);
1920 		btrfs_set_lock_blocking(right);
1921 		wret = btrfs_cow_block(trans, root, right,
1922 				       parent, pslot + 1, &right);
1923 		if (wret) {
1924 			ret = wret;
1925 			goto enospc;
1926 		}
1927 	}
1928 
1929 	/* first, try to make some room in the middle buffer */
1930 	if (left) {
1931 		orig_slot += btrfs_header_nritems(left);
1932 		wret = push_node_left(trans, fs_info, left, mid, 1);
1933 		if (wret < 0)
1934 			ret = wret;
1935 	}
1936 
1937 	/*
1938 	 * then try to empty the right most buffer into the middle
1939 	 */
1940 	if (right) {
1941 		wret = push_node_left(trans, fs_info, mid, right, 1);
1942 		if (wret < 0 && wret != -ENOSPC)
1943 			ret = wret;
1944 		if (btrfs_header_nritems(right) == 0) {
1945 			clean_tree_block(fs_info, right);
1946 			btrfs_tree_unlock(right);
1947 			del_ptr(root, path, level + 1, pslot + 1);
1948 			root_sub_used(root, right->len);
1949 			btrfs_free_tree_block(trans, root, right, 0, 1);
1950 			free_extent_buffer_stale(right);
1951 			right = NULL;
1952 		} else {
1953 			struct btrfs_disk_key right_key;
1954 			btrfs_node_key(right, &right_key, 0);
1955 			ret = tree_mod_log_insert_key(parent, pslot + 1,
1956 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1957 			BUG_ON(ret < 0);
1958 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1959 			btrfs_mark_buffer_dirty(parent);
1960 		}
1961 	}
1962 	if (btrfs_header_nritems(mid) == 1) {
1963 		/*
1964 		 * we're not allowed to leave a node with one item in the
1965 		 * tree during a delete.  A deletion from lower in the tree
1966 		 * could try to delete the only pointer in this node.
1967 		 * So, pull some keys from the left.
1968 		 * There has to be a left pointer at this point because
1969 		 * otherwise we would have pulled some pointers from the
1970 		 * right
1971 		 */
1972 		if (!left) {
1973 			ret = -EROFS;
1974 			btrfs_handle_fs_error(fs_info, ret, NULL);
1975 			goto enospc;
1976 		}
1977 		wret = balance_node_right(trans, fs_info, mid, left);
1978 		if (wret < 0) {
1979 			ret = wret;
1980 			goto enospc;
1981 		}
1982 		if (wret == 1) {
1983 			wret = push_node_left(trans, fs_info, left, mid, 1);
1984 			if (wret < 0)
1985 				ret = wret;
1986 		}
1987 		BUG_ON(wret == 1);
1988 	}
1989 	if (btrfs_header_nritems(mid) == 0) {
1990 		clean_tree_block(fs_info, mid);
1991 		btrfs_tree_unlock(mid);
1992 		del_ptr(root, path, level + 1, pslot);
1993 		root_sub_used(root, mid->len);
1994 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1995 		free_extent_buffer_stale(mid);
1996 		mid = NULL;
1997 	} else {
1998 		/* update the parent key to reflect our changes */
1999 		struct btrfs_disk_key mid_key;
2000 		btrfs_node_key(mid, &mid_key, 0);
2001 		ret = tree_mod_log_insert_key(parent, pslot,
2002 				MOD_LOG_KEY_REPLACE, GFP_NOFS);
2003 		BUG_ON(ret < 0);
2004 		btrfs_set_node_key(parent, &mid_key, pslot);
2005 		btrfs_mark_buffer_dirty(parent);
2006 	}
2007 
2008 	/* update the path */
2009 	if (left) {
2010 		if (btrfs_header_nritems(left) > orig_slot) {
2011 			extent_buffer_get(left);
2012 			/* left was locked after cow */
2013 			path->nodes[level] = left;
2014 			path->slots[level + 1] -= 1;
2015 			path->slots[level] = orig_slot;
2016 			if (mid) {
2017 				btrfs_tree_unlock(mid);
2018 				free_extent_buffer(mid);
2019 			}
2020 		} else {
2021 			orig_slot -= btrfs_header_nritems(left);
2022 			path->slots[level] = orig_slot;
2023 		}
2024 	}
2025 	/* double check we haven't messed things up */
2026 	if (orig_ptr !=
2027 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2028 		BUG();
2029 enospc:
2030 	if (right) {
2031 		btrfs_tree_unlock(right);
2032 		free_extent_buffer(right);
2033 	}
2034 	if (left) {
2035 		if (path->nodes[level] != left)
2036 			btrfs_tree_unlock(left);
2037 		free_extent_buffer(left);
2038 	}
2039 	return ret;
2040 }
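
/*
 * A worked example of the thresholds above, assuming a 16KiB nodesize
 * (the on-disk struct sizes quoted here are illustrative assumptions):
 * BTRFS_NODEPTRS_PER_BLOCK = (16384 - 101 bytes of header) / 33 bytes per
 * key_ptr, i.e. about 493 pointers.  balance_level() therefore returns
 * early once a node holds more than 493 / 4 = 123 pointers, and only the
 * single-pointer root case collapses a level of the tree.
 */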
2041 
2042 /* Node balancing for insertion.  Here we only split or push nodes around
2043  * when they are completely full.  This is also done top down, so we
2044  * have to be pessimistic.
2045  */
2046 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2047 					  struct btrfs_root *root,
2048 					  struct btrfs_path *path, int level)
2049 {
2050 	struct btrfs_fs_info *fs_info = root->fs_info;
2051 	struct extent_buffer *right = NULL;
2052 	struct extent_buffer *mid;
2053 	struct extent_buffer *left = NULL;
2054 	struct extent_buffer *parent = NULL;
2055 	int ret = 0;
2056 	int wret;
2057 	int pslot;
2058 	int orig_slot = path->slots[level];
2059 
2060 	if (level == 0)
2061 		return 1;
2062 
2063 	mid = path->nodes[level];
2064 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2065 
2066 	if (level < BTRFS_MAX_LEVEL - 1) {
2067 		parent = path->nodes[level + 1];
2068 		pslot = path->slots[level + 1];
2069 	}
2070 
2071 	if (!parent)
2072 		return 1;
2073 
2074 	left = read_node_slot(fs_info, parent, pslot - 1);
2075 	if (IS_ERR(left))
2076 		left = NULL;
2077 
2078 	/* first, try to make some room in the middle buffer */
2079 	if (left) {
2080 		u32 left_nr;
2081 
2082 		btrfs_tree_lock(left);
2083 		btrfs_set_lock_blocking(left);
2084 
2085 		left_nr = btrfs_header_nritems(left);
2086 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2087 			wret = 1;
2088 		} else {
2089 			ret = btrfs_cow_block(trans, root, left, parent,
2090 					      pslot - 1, &left);
2091 			if (ret)
2092 				wret = 1;
2093 			else {
2094 				wret = push_node_left(trans, fs_info,
2095 						      left, mid, 0);
2096 			}
2097 		}
2098 		if (wret < 0)
2099 			ret = wret;
2100 		if (wret == 0) {
2101 			struct btrfs_disk_key disk_key;
2102 			orig_slot += left_nr;
2103 			btrfs_node_key(mid, &disk_key, 0);
2104 			ret = tree_mod_log_insert_key(parent, pslot,
2105 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2106 			BUG_ON(ret < 0);
2107 			btrfs_set_node_key(parent, &disk_key, pslot);
2108 			btrfs_mark_buffer_dirty(parent);
2109 			if (btrfs_header_nritems(left) > orig_slot) {
2110 				path->nodes[level] = left;
2111 				path->slots[level + 1] -= 1;
2112 				path->slots[level] = orig_slot;
2113 				btrfs_tree_unlock(mid);
2114 				free_extent_buffer(mid);
2115 			} else {
2116 				orig_slot -=
2117 					btrfs_header_nritems(left);
2118 				path->slots[level] = orig_slot;
2119 				btrfs_tree_unlock(left);
2120 				free_extent_buffer(left);
2121 			}
2122 			return 0;
2123 		}
2124 		btrfs_tree_unlock(left);
2125 		free_extent_buffer(left);
2126 	}
2127 	right = read_node_slot(fs_info, parent, pslot + 1);
2128 	if (IS_ERR(right))
2129 		right = NULL;
2130 
2131 	/*
2132 	 * then try to empty the right most buffer into the middle
2133 	 */
2134 	if (right) {
2135 		u32 right_nr;
2136 
2137 		btrfs_tree_lock(right);
2138 		btrfs_set_lock_blocking(right);
2139 
2140 		right_nr = btrfs_header_nritems(right);
2141 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2142 			wret = 1;
2143 		} else {
2144 			ret = btrfs_cow_block(trans, root, right,
2145 					      parent, pslot + 1,
2146 					      &right);
2147 			if (ret)
2148 				wret = 1;
2149 			else {
2150 				wret = balance_node_right(trans, fs_info,
2151 							  right, mid);
2152 			}
2153 		}
2154 		if (wret < 0)
2155 			ret = wret;
2156 		if (wret == 0) {
2157 			struct btrfs_disk_key disk_key;
2158 
2159 			btrfs_node_key(right, &disk_key, 0);
2160 			ret = tree_mod_log_insert_key(parent, pslot + 1,
2161 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2162 			BUG_ON(ret < 0);
2163 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2164 			btrfs_mark_buffer_dirty(parent);
2165 
2166 			if (btrfs_header_nritems(mid) <= orig_slot) {
2167 				path->nodes[level] = right;
2168 				path->slots[level + 1] += 1;
2169 				path->slots[level] = orig_slot -
2170 					btrfs_header_nritems(mid);
2171 				btrfs_tree_unlock(mid);
2172 				free_extent_buffer(mid);
2173 			} else {
2174 				btrfs_tree_unlock(right);
2175 				free_extent_buffer(right);
2176 			}
2177 			return 0;
2178 		}
2179 		btrfs_tree_unlock(right);
2180 		free_extent_buffer(right);
2181 	}
2182 	return 1;
2183 }
2184 
2185 /*
2186  * readahead one full node of leaves, finding things that are close
2187  * to the block in 'slot', and triggering readahead on them.
2188  */
2189 static void reada_for_search(struct btrfs_fs_info *fs_info,
2190 			     struct btrfs_path *path,
2191 			     int level, int slot, u64 objectid)
2192 {
2193 	struct extent_buffer *node;
2194 	struct btrfs_disk_key disk_key;
2195 	u32 nritems;
2196 	u64 search;
2197 	u64 target;
2198 	u64 nread = 0;
2199 	struct extent_buffer *eb;
2200 	u32 nr;
2201 	u32 blocksize;
2202 	u32 nscan = 0;
2203 
2204 	if (level != 1)
2205 		return;
2206 
2207 	if (!path->nodes[level])
2208 		return;
2209 
2210 	node = path->nodes[level];
2211 
2212 	search = btrfs_node_blockptr(node, slot);
2213 	blocksize = fs_info->nodesize;
2214 	eb = find_extent_buffer(fs_info, search);
2215 	if (eb) {
2216 		free_extent_buffer(eb);
2217 		return;
2218 	}
2219 
2220 	target = search;
2221 
2222 	nritems = btrfs_header_nritems(node);
2223 	nr = slot;
2224 
2225 	while (1) {
2226 		if (path->reada == READA_BACK) {
2227 			if (nr == 0)
2228 				break;
2229 			nr--;
2230 		} else if (path->reada == READA_FORWARD) {
2231 			nr++;
2232 			if (nr >= nritems)
2233 				break;
2234 		}
2235 		if (path->reada == READA_BACK && objectid) {
2236 			btrfs_node_key(node, &disk_key, nr);
2237 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2238 				break;
2239 		}
2240 		search = btrfs_node_blockptr(node, nr);
2241 		if ((search <= target && target - search <= 65536) ||
2242 		    (search > target && search - target <= 65536)) {
2243 			readahead_tree_block(fs_info, search);
2244 			nread += blocksize;
2245 		}
2246 		nscan++;
2247 		if ((nread > 65536 || nscan > 32))
2248 			break;
2249 	}
2250 }
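
/*
 * Readahead is opt-in per path.  A hedged sketch of how a caller turns it
 * on before a search ('root' and 'key' are placeholders from the caller):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	path->reada = READA_FORWARD;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 */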
2251 
2252 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2253 				       struct btrfs_path *path, int level)
2254 {
2255 	int slot;
2256 	int nritems;
2257 	struct extent_buffer *parent;
2258 	struct extent_buffer *eb;
2259 	u64 gen;
2260 	u64 block1 = 0;
2261 	u64 block2 = 0;
2262 
2263 	parent = path->nodes[level + 1];
2264 	if (!parent)
2265 		return;
2266 
2267 	nritems = btrfs_header_nritems(parent);
2268 	slot = path->slots[level + 1];
2269 
2270 	if (slot > 0) {
2271 		block1 = btrfs_node_blockptr(parent, slot - 1);
2272 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2273 		eb = find_extent_buffer(fs_info, block1);
2274 		/*
2275 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2276 		 * don't want to return -EAGAIN here.  That would loop
2277 		 * forever
2278 		 */
2279 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2280 			block1 = 0;
2281 		free_extent_buffer(eb);
2282 	}
2283 	if (slot + 1 < nritems) {
2284 		block2 = btrfs_node_blockptr(parent, slot + 1);
2285 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2286 		eb = find_extent_buffer(fs_info, block2);
2287 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2288 			block2 = 0;
2289 		free_extent_buffer(eb);
2290 	}
2291 
2292 	if (block1)
2293 		readahead_tree_block(fs_info, block1);
2294 	if (block2)
2295 		readahead_tree_block(fs_info, block2);
2296 }
2297 
2298 
2299 /*
2300  * when we walk down the tree, it is usually safe to unlock the higher layers
2301  * in the tree.  The exception is when our path goes through slot 0, because
2302  * operations on the tree might require changing key pointers higher up in the
2303  * tree.
2304  *
2305  * callers might also have set path->keep_locks, which tells this code to keep
2306  * the lock if the path points to the last slot in the block.  This is part of
2307  * walking through the tree, and selecting the next slot in the higher block.
2308  *
2309  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2310  * if lowest_unlock is 1, level 0 won't be unlocked
2311  */
2312 static noinline void unlock_up(struct btrfs_path *path, int level,
2313 			       int lowest_unlock, int min_write_lock_level,
2314 			       int *write_lock_level)
2315 {
2316 	int i;
2317 	int skip_level = level;
2318 	int no_skips = 0;
2319 	struct extent_buffer *t;
2320 
2321 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2322 		if (!path->nodes[i])
2323 			break;
2324 		if (!path->locks[i])
2325 			break;
2326 		if (!no_skips && path->slots[i] == 0) {
2327 			skip_level = i + 1;
2328 			continue;
2329 		}
2330 		if (!no_skips && path->keep_locks) {
2331 			u32 nritems;
2332 			t = path->nodes[i];
2333 			nritems = btrfs_header_nritems(t);
2334 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2335 				skip_level = i + 1;
2336 				continue;
2337 			}
2338 		}
2339 		if (skip_level < i && i >= lowest_unlock)
2340 			no_skips = 1;
2341 
2342 		t = path->nodes[i];
2343 		if (i >= lowest_unlock && i > skip_level) {
2344 			btrfs_tree_unlock_rw(t, path->locks[i]);
2345 			path->locks[i] = 0;
2346 			if (write_lock_level &&
2347 			    i > min_write_lock_level &&
2348 			    i <= *write_lock_level) {
2349 				*write_lock_level = i - 1;
2350 			}
2351 		}
2352 	}
2353 }
2354 
2355 /*
2356  * This releases any locks held in the path starting at level and
2357  * going all the way up to the root.
2358  *
2359  * btrfs_search_slot will keep the lock held on higher nodes in a few
2360  * corner cases, such as COW of the block at slot zero in the node.  This
2361  * ignores those rules, and it should only be called when there are no
2362  * more updates to be done higher up in the tree.
2363  */
2364 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2365 {
2366 	int i;
2367 
2368 	if (path->keep_locks)
2369 		return;
2370 
2371 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2372 		if (!path->nodes[i])
2373 			continue;
2374 		if (!path->locks[i])
2375 			continue;
2376 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2377 		path->locks[i] = 0;
2378 	}
2379 }
2380 
2381 /*
2382  * helper function for btrfs_search_slot.  The goal is to find a block
2383  * in cache without setting the path to blocking.  If we find the block
2384  * we return zero and the path is unchanged.
2385  *
2386  * If we can't find the block, we set the path blocking and do some
2387  * readahead.  -EAGAIN is returned and the search must be repeated.
2388  */
2389 static int
2390 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2391 		      struct extent_buffer **eb_ret, int level, int slot,
2392 		      const struct btrfs_key *key)
2393 {
2394 	struct btrfs_fs_info *fs_info = root->fs_info;
2395 	u64 blocknr;
2396 	u64 gen;
2397 	struct extent_buffer *b = *eb_ret;
2398 	struct extent_buffer *tmp;
2399 	struct btrfs_key first_key;
2400 	int ret;
2401 	int parent_level;
2402 
2403 	blocknr = btrfs_node_blockptr(b, slot);
2404 	gen = btrfs_node_ptr_generation(b, slot);
2405 	parent_level = btrfs_header_level(b);
2406 	btrfs_node_key_to_cpu(b, &first_key, slot);
2407 
2408 	tmp = find_extent_buffer(fs_info, blocknr);
2409 	if (tmp) {
2410 		/* first we do an atomic uptodate check */
2411 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2412 			*eb_ret = tmp;
2413 			return 0;
2414 		}
2415 
2416 		/* the pages were up to date, but we failed
2417 		 * the generation number check.  Do a full
2418 		 * read for the generation number that is correct.
2419 		 * We must do this without dropping locks so
2420 		 * we can trust our generation number
2421 		 */
2422 		btrfs_set_path_blocking(p);
2423 
2424 		/* now we're allowed to do a blocking uptodate check */
2425 		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2426 		if (!ret) {
2427 			*eb_ret = tmp;
2428 			return 0;
2429 		}
2430 		free_extent_buffer(tmp);
2431 		btrfs_release_path(p);
2432 		return -EIO;
2433 	}
2434 
2435 	/*
2436 	 * reduce lock contention at high levels
2437 	 * of the btree by dropping locks before
2438 	 * we read.  Don't release the lock on the current
2439 	 * level because we need to walk this node to figure
2440 	 * out which blocks to read.
2441 	 */
2442 	btrfs_unlock_up_safe(p, level + 1);
2443 	btrfs_set_path_blocking(p);
2444 
2445 	if (p->reada != READA_NONE)
2446 		reada_for_search(fs_info, p, level, slot, key->objectid);
2447 
2448 	ret = -EAGAIN;
2449 	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2450 			      &first_key);
2451 	if (!IS_ERR(tmp)) {
2452 		/*
2453 		 * If the read above didn't mark this buffer up to date,
2454 		 * it will never end up being up to date.  Set ret to -EIO now
2455 		 * and give up so that our caller doesn't loop forever
2456 		 * on our EAGAINs.
2457 		 */
2458 		if (!extent_buffer_uptodate(tmp))
2459 			ret = -EIO;
2460 		free_extent_buffer(tmp);
2461 	} else {
2462 		ret = PTR_ERR(tmp);
2463 	}
2464 
2465 	btrfs_release_path(p);
2466 	return ret;
2467 }
2468 
2469 /*
2470  * helper function for btrfs_search_slot.  This does all of the checks
2471  * for node-level blocks and does any balancing required based on
2472  * the ins_len.
2473  *
2474  * If no extra work was required, zero is returned.  If we had to
2475  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2476  * start over
2477  */
2478 static int
2479 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2480 		       struct btrfs_root *root, struct btrfs_path *p,
2481 		       struct extent_buffer *b, int level, int ins_len,
2482 		       int *write_lock_level)
2483 {
2484 	struct btrfs_fs_info *fs_info = root->fs_info;
2485 	int ret;
2486 
2487 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2488 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2489 		int sret;
2490 
2491 		if (*write_lock_level < level + 1) {
2492 			*write_lock_level = level + 1;
2493 			btrfs_release_path(p);
2494 			goto again;
2495 		}
2496 
2497 		btrfs_set_path_blocking(p);
2498 		reada_for_balance(fs_info, p, level);
2499 		sret = split_node(trans, root, p, level);
2500 
2501 		BUG_ON(sret > 0);
2502 		if (sret) {
2503 			ret = sret;
2504 			goto done;
2505 		}
2506 		b = p->nodes[level];
2507 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2508 		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2509 		int sret;
2510 
2511 		if (*write_lock_level < level + 1) {
2512 			*write_lock_level = level + 1;
2513 			btrfs_release_path(p);
2514 			goto again;
2515 		}
2516 
2517 		btrfs_set_path_blocking(p);
2518 		reada_for_balance(fs_info, p, level);
2519 		sret = balance_level(trans, root, p, level);
2520 
2521 		if (sret) {
2522 			ret = sret;
2523 			goto done;
2524 		}
2525 		b = p->nodes[level];
2526 		if (!b) {
2527 			btrfs_release_path(p);
2528 			goto again;
2529 		}
2530 		BUG_ON(btrfs_header_nritems(b) == 1);
2531 	}
2532 	return 0;
2533 
2534 again:
2535 	ret = -EAGAIN;
2536 done:
2537 	return ret;
2538 }
2539 
2540 static void key_search_validate(struct extent_buffer *b,
2541 				const struct btrfs_key *key,
2542 				int level)
2543 {
2544 #ifdef CONFIG_BTRFS_ASSERT
2545 	struct btrfs_disk_key disk_key;
2546 
2547 	btrfs_cpu_key_to_disk(&disk_key, key);
2548 
2549 	if (level == 0)
2550 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2551 		    offsetof(struct btrfs_leaf, items[0].key),
2552 		    sizeof(disk_key)));
2553 	else
2554 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2555 		    offsetof(struct btrfs_node, ptrs[0].key),
2556 		    sizeof(disk_key)));
2557 #endif
2558 }
2559 
2560 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2561 		      int level, int *prev_cmp, int *slot)
2562 {
2563 	if (*prev_cmp != 0) {
2564 		*prev_cmp = btrfs_bin_search(b, key, level, slot);
2565 		return *prev_cmp;
2566 	}
2567 
2568 	key_search_validate(b, key, level);
2569 	*slot = 0;
2570 
2571 	return 0;
2572 }
2573 
2574 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2575 		u64 iobjectid, u64 ioff, u8 key_type,
2576 		struct btrfs_key *found_key)
2577 {
2578 	int ret;
2579 	struct btrfs_key key;
2580 	struct extent_buffer *eb;
2581 
2582 	ASSERT(path);
2583 	ASSERT(found_key);
2584 
2585 	key.type = key_type;
2586 	key.objectid = iobjectid;
2587 	key.offset = ioff;
2588 
2589 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2590 	if (ret < 0)
2591 		return ret;
2592 
2593 	eb = path->nodes[0];
2594 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2595 		ret = btrfs_next_leaf(fs_root, path);
2596 		if (ret)
2597 			return ret;
2598 		eb = path->nodes[0];
2599 	}
2600 
2601 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2602 	if (found_key->type != key.type ||
2603 			found_key->objectid != key.objectid)
2604 		return 1;
2605 
2606 	return 0;
2607 }
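
/*
 * A hedged usage sketch for btrfs_find_item() ('fs_root', 'path' and
 * 'objectid' are placeholders supplied by the caller): look up an item of
 * the given type, accepting any offset.
 *
 *	struct btrfs_key found_key;
 *	int ret;
 *
 *	ret = btrfs_find_item(fs_root, path, objectid, 0,
 *			      BTRFS_ROOT_ITEM_KEY, &found_key);
 *	btrfs_release_path(path);
 *	(ret == 0: found_key describes the match; ret == 1: no such item;
 *	 ret < 0: error)
 */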
2608 
2609 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2610 							struct btrfs_path *p,
2611 							int write_lock_level)
2612 {
2613 	struct btrfs_fs_info *fs_info = root->fs_info;
2614 	struct extent_buffer *b;
2615 	int root_lock;
2616 	int level = 0;
2617 
2618 	/* We try very hard to do read locks on the root */
2619 	root_lock = BTRFS_READ_LOCK;
2620 
2621 	if (p->search_commit_root) {
2622 		/*
2623 		 * The commit roots are read only so we always do read locks,
2624 		 * and we always must hold the commit_root_sem when doing
2625 		 * searches on them, the only exception is send where we don't
2626 		 * want to block transaction commits for a long time, so
2627 		 * we need to clone the commit root in order to avoid races
2628 		 * with transaction commits that create a snapshot of one of
2629 		 * the roots used by a send operation.
2630 		 */
2631 		if (p->need_commit_sem) {
2632 			down_read(&fs_info->commit_root_sem);
2633 			b = btrfs_clone_extent_buffer(root->commit_root);
2634 			up_read(&fs_info->commit_root_sem);
2635 			if (!b)
2636 				return ERR_PTR(-ENOMEM);
2637 
2638 		} else {
2639 			b = root->commit_root;
2640 			extent_buffer_get(b);
2641 		}
2642 		level = btrfs_header_level(b);
2643 		/*
2644 		 * Ensure that all callers have set skip_locking when
2645 		 * p->search_commit_root = 1.
2646 		 */
2647 		ASSERT(p->skip_locking == 1);
2648 
2649 		goto out;
2650 	}
2651 
2652 	if (p->skip_locking) {
2653 		b = btrfs_root_node(root);
2654 		level = btrfs_header_level(b);
2655 		goto out;
2656 	}
2657 
2658 	/*
2659 	 * If the level is set to maximum, we can skip trying to get the read
2660 	 * lock.
2661 	 */
2662 	if (write_lock_level < BTRFS_MAX_LEVEL) {
2663 		/*
2664 		 * We don't know the level of the root node until we actually
2665 		 * have it read locked
2666 		 */
2667 		b = btrfs_read_lock_root_node(root);
2668 		level = btrfs_header_level(b);
2669 		if (level > write_lock_level)
2670 			goto out;
2671 
2672 		/* Whoops, must trade for write lock */
2673 		btrfs_tree_read_unlock(b);
2674 		free_extent_buffer(b);
2675 	}
2676 
2677 	b = btrfs_lock_root_node(root);
2678 	root_lock = BTRFS_WRITE_LOCK;
2679 
2680 	/* The level might have changed, check again */
2681 	level = btrfs_header_level(b);
2682 
2683 out:
2684 	p->nodes[level] = b;
2685 	if (!p->skip_locking)
2686 		p->locks[level] = root_lock;
2687 	/*
2688 	 * Callers are responsible for dropping b's references.
2689 	 */
2690 	return b;
2691 }
2692 
2693 
2694 /*
2695  * btrfs_search_slot - look for a key in a tree and perform necessary
2696  * modifications to preserve tree invariants.
2697  *
2698  * @trans:	Handle of transaction, used when modifying the tree
2699  * @p:		Holds all btree nodes along the search path
2700  * @root:	The root node of the tree
2701  * @key:	The key we are looking for
2702  * @ins_len:	Indicates purpose of search: > 0 for inserts (the space
2703  *		the new item needs), < 0 for deletions, 0 for plain searches
2704  * @cow:	whether CoW operations should be performed. Must always be 1
2705  *		when modifying the tree.
2706  *
2707  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2708  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2709  *
2710  * If @key is found, 0 is returned and you can find the item in the leaf level
2711  * of the path (level 0)
2712  *
2713  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2714  * points to the slot where it should be inserted
2715  *
2716  * If an error is encountered while searching the tree a negative error number
2717  * is returned
2718  */
2719 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2720 		      const struct btrfs_key *key, struct btrfs_path *p,
2721 		      int ins_len, int cow)
2722 {
2723 	struct btrfs_fs_info *fs_info = root->fs_info;
2724 	struct extent_buffer *b;
2725 	int slot;
2726 	int ret;
2727 	int err;
2728 	int level;
2729 	int lowest_unlock = 1;
2730 	/* everything at write_lock_level or lower must be write locked */
2731 	int write_lock_level = 0;
2732 	u8 lowest_level = 0;
2733 	int min_write_lock_level;
2734 	int prev_cmp;
2735 
2736 	lowest_level = p->lowest_level;
2737 	WARN_ON(lowest_level && ins_len > 0);
2738 	WARN_ON(p->nodes[0] != NULL);
2739 	BUG_ON(!cow && ins_len);
2740 
2741 	if (ins_len < 0) {
2742 		lowest_unlock = 2;
2743 
2744 		/* when we are removing items, we might have to go up to level
2745 		 * two as we update tree pointers.  Make sure we keep write
2746 		 * locks for those levels as well
2747 		 */
2748 		write_lock_level = 2;
2749 	} else if (ins_len > 0) {
2750 		/*
2751 		 * for inserting items, make sure we have a write lock on
2752 		 * level 1 so we can update keys
2753 		 */
2754 		write_lock_level = 1;
2755 	}
2756 
2757 	if (!cow)
2758 		write_lock_level = -1;
2759 
2760 	if (cow && (p->keep_locks || p->lowest_level))
2761 		write_lock_level = BTRFS_MAX_LEVEL;
2762 
2763 	min_write_lock_level = write_lock_level;
2764 
2765 again:
2766 	prev_cmp = -1;
2767 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
2768 	if (IS_ERR(b)) {
2769 		ret = PTR_ERR(b);
2770 		goto done;
2771 	}
2772 
2773 	while (b) {
2774 		level = btrfs_header_level(b);
2775 
2776 		/*
2777 		 * setup the path here so we can release it under lock
2778 		 * contention with the cow code
2779 		 */
2780 		if (cow) {
2781 			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2782 
2783 			/*
2784 			 * if we don't really need to cow this block
2785 			 * then we don't want to set the path blocking,
2786 			 * so we test it here
2787 			 */
2788 			if (!should_cow_block(trans, root, b)) {
2789 				trans->dirty = true;
2790 				goto cow_done;
2791 			}
2792 
2793 			/*
2794 			 * must have write locks on this node and the
2795 			 * parent
2796 			 */
2797 			if (level > write_lock_level ||
2798 			    (level + 1 > write_lock_level &&
2799 			    level + 1 < BTRFS_MAX_LEVEL &&
2800 			    p->nodes[level + 1])) {
2801 				write_lock_level = level + 1;
2802 				btrfs_release_path(p);
2803 				goto again;
2804 			}
2805 
2806 			btrfs_set_path_blocking(p);
2807 			if (last_level)
2808 				err = btrfs_cow_block(trans, root, b, NULL, 0,
2809 						      &b);
2810 			else
2811 				err = btrfs_cow_block(trans, root, b,
2812 						      p->nodes[level + 1],
2813 						      p->slots[level + 1], &b);
2814 			if (err) {
2815 				ret = err;
2816 				goto done;
2817 			}
2818 		}
2819 cow_done:
2820 		p->nodes[level] = b;
2821 		/*
2822 		 * Leave path with blocking locks to avoid massive
2823 		 * lock context switch; this is done on purpose.
2824 		 */
2825 
2826 		/*
2827 		 * we have a lock on b and as long as we aren't changing
2828 		 * the tree, there is no way for the items in b to change.
2829 		 * It is safe to drop the lock on our parent before we
2830 		 * go through the expensive btree search on b.
2831 		 *
2832 		 * If we're inserting or deleting (ins_len != 0), then we might
2833 		 * be changing slot zero, which may require changing the parent.
2834 		 * So, we can't drop the lock until after we know which slot
2835 		 * we're operating on.
2836 		 */
2837 		if (!ins_len && !p->keep_locks) {
2838 			int u = level + 1;
2839 
2840 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2841 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2842 				p->locks[u] = 0;
2843 			}
2844 		}
2845 
2846 		ret = key_search(b, key, level, &prev_cmp, &slot);
2847 		if (ret < 0)
2848 			goto done;
2849 
2850 		if (level != 0) {
2851 			int dec = 0;
2852 			if (ret && slot > 0) {
2853 				dec = 1;
2854 				slot -= 1;
2855 			}
2856 			p->slots[level] = slot;
2857 			err = setup_nodes_for_search(trans, root, p, b, level,
2858 					     ins_len, &write_lock_level);
2859 			if (err == -EAGAIN)
2860 				goto again;
2861 			if (err) {
2862 				ret = err;
2863 				goto done;
2864 			}
2865 			b = p->nodes[level];
2866 			slot = p->slots[level];
2867 
2868 			/*
2869 			 * slot 0 is special, if we change the key
2870 			 * we have to update the parent pointer
2871 			 * which means we must have a write lock
2872 			 * on the parent
2873 			 */
2874 			if (slot == 0 && ins_len &&
2875 			    write_lock_level < level + 1) {
2876 				write_lock_level = level + 1;
2877 				btrfs_release_path(p);
2878 				goto again;
2879 			}
2880 
2881 			unlock_up(p, level, lowest_unlock,
2882 				  min_write_lock_level, &write_lock_level);
2883 
2884 			if (level == lowest_level) {
2885 				if (dec)
2886 					p->slots[level]++;
2887 				goto done;
2888 			}
2889 
2890 			err = read_block_for_search(root, p, &b, level,
2891 						    slot, key);
2892 			if (err == -EAGAIN)
2893 				goto again;
2894 			if (err) {
2895 				ret = err;
2896 				goto done;
2897 			}
2898 
2899 			if (!p->skip_locking) {
2900 				level = btrfs_header_level(b);
2901 				if (level <= write_lock_level) {
2902 					err = btrfs_try_tree_write_lock(b);
2903 					if (!err) {
2904 						btrfs_set_path_blocking(p);
2905 						btrfs_tree_lock(b);
2906 					}
2907 					p->locks[level] = BTRFS_WRITE_LOCK;
2908 				} else {
2909 					err = btrfs_tree_read_lock_atomic(b);
2910 					if (!err) {
2911 						btrfs_set_path_blocking(p);
2912 						btrfs_tree_read_lock(b);
2913 					}
2914 					p->locks[level] = BTRFS_READ_LOCK;
2915 				}
2916 				p->nodes[level] = b;
2917 			}
2918 		} else {
2919 			p->slots[level] = slot;
2920 			if (ins_len > 0 &&
2921 			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
2922 				if (write_lock_level < 1) {
2923 					write_lock_level = 1;
2924 					btrfs_release_path(p);
2925 					goto again;
2926 				}
2927 
2928 				btrfs_set_path_blocking(p);
2929 				err = split_leaf(trans, root, key,
2930 						 p, ins_len, ret == 0);
2931 
2932 				BUG_ON(err > 0);
2933 				if (err) {
2934 					ret = err;
2935 					goto done;
2936 				}
2937 			}
2938 			if (!p->search_for_split)
2939 				unlock_up(p, level, lowest_unlock,
2940 					  min_write_lock_level, NULL);
2941 			goto done;
2942 		}
2943 	}
2944 	ret = 1;
2945 done:
2946 	/*
2947 	 * we don't really know what they plan on doing with the path
2948 	 * from here on, so for now just mark it as blocking
2949 	 */
2950 	if (!p->leave_spinning)
2951 		btrfs_set_path_blocking(p);
2952 	if (ret < 0 && !p->skip_release_on_error)
2953 		btrfs_release_path(p);
2954 	return ret;
2955 }
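
/*
 * A minimal, hedged example of the common read-only calling convention
 * for btrfs_search_slot() (the key fields are placeholders a real caller
 * fills in from context):
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	btrfs_free_path(path);
 */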
2956 
2957 /*
2958  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2959  * current state of the tree together with the operations recorded in the tree
2960  * modification log to search for the key in a previous version of this tree, as
2961  * denoted by the time_seq parameter.
2962  *
2963  * Naturally, there is no support for insert, delete or cow operations.
2964  *
2965  * The resulting path and return value will be set up as if we called
2966  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2967  */
2968 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2969 			  struct btrfs_path *p, u64 time_seq)
2970 {
2971 	struct btrfs_fs_info *fs_info = root->fs_info;
2972 	struct extent_buffer *b;
2973 	int slot;
2974 	int ret;
2975 	int err;
2976 	int level;
2977 	int lowest_unlock = 1;
2978 	u8 lowest_level = 0;
2979 	int prev_cmp = -1;
2980 
2981 	lowest_level = p->lowest_level;
2982 	WARN_ON(p->nodes[0] != NULL);
2983 
2984 	if (p->search_commit_root) {
2985 		BUG_ON(time_seq);
2986 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2987 	}
2988 
2989 again:
2990 	b = get_old_root(root, time_seq);
2991 	if (!b) {
2992 		ret = -EIO;
2993 		goto done;
2994 	}
2995 	level = btrfs_header_level(b);
2996 	p->locks[level] = BTRFS_READ_LOCK;
2997 
2998 	while (b) {
2999 		level = btrfs_header_level(b);
3000 		p->nodes[level] = b;
3001 
3002 		/*
3003 		 * we have a lock on b and as long as we aren't changing
3004 		 * the tree, there is no way for the items in b to change.
3005 		 * It is safe to drop the lock on our parent before we
3006 		 * go through the expensive btree search on b.
3007 		 */
3008 		btrfs_unlock_up_safe(p, level + 1);
3009 
3010 		/*
3011 		 * Since we can unwind extent buffers we want to do a real
3012 		 * search every time.
3013 		 */
3014 		prev_cmp = -1;
3015 		ret = key_search(b, key, level, &prev_cmp, &slot);
3016 
3017 		if (level != 0) {
3018 			int dec = 0;
3019 			if (ret && slot > 0) {
3020 				dec = 1;
3021 				slot -= 1;
3022 			}
3023 			p->slots[level] = slot;
3024 			unlock_up(p, level, lowest_unlock, 0, NULL);
3025 
3026 			if (level == lowest_level) {
3027 				if (dec)
3028 					p->slots[level]++;
3029 				goto done;
3030 			}
3031 
3032 			err = read_block_for_search(root, p, &b, level,
3033 						    slot, key);
3034 			if (err == -EAGAIN)
3035 				goto again;
3036 			if (err) {
3037 				ret = err;
3038 				goto done;
3039 			}
3040 
3041 			level = btrfs_header_level(b);
3042 			err = btrfs_tree_read_lock_atomic(b);
3043 			if (!err) {
3044 				btrfs_set_path_blocking(p);
3045 				btrfs_tree_read_lock(b);
3046 			}
3047 			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3048 			if (!b) {
3049 				ret = -ENOMEM;
3050 				goto done;
3051 			}
3052 			p->locks[level] = BTRFS_READ_LOCK;
3053 			p->nodes[level] = b;
3054 		} else {
3055 			p->slots[level] = slot;
3056 			unlock_up(p, level, lowest_unlock, 0, NULL);
3057 			goto done;
3058 		}
3059 	}
3060 	ret = 1;
3061 done:
3062 	if (!p->leave_spinning)
3063 		btrfs_set_path_blocking(p);
3064 	if (ret < 0)
3065 		btrfs_release_path(p);
3066 
3067 	return ret;
3068 }
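
/*
 * Hedged sketch: callers such as backref walking obtain a 'time_seq' from
 * the tree mod log before the tree is further modified, then replay the
 * search against that version ('key', 'path' and 'time_seq' are assumed
 * to be set up by the caller):
 *
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *
 * On return the path is positioned just as btrfs_search_slot() would have
 * left it at that point in time, read-locked and strictly read-only.
 */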
3069 
3070 /*
3071  * helper to use instead of search slot if no exact match is needed but
3072  * instead the next or previous item should be returned.
3073  * When find_higher is true, the next higher item is returned, the next lower
3074  * otherwise.
3075  * When return_any and find_higher are both true, and no higher item is found,
3076  * return the next lower instead.
3077  * When return_any is true and find_higher is false, and no lower item is found,
3078  * return the next higher instead.
3079  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3080  * < 0 on error
3081  */
3082 int btrfs_search_slot_for_read(struct btrfs_root *root,
3083 			       const struct btrfs_key *key,
3084 			       struct btrfs_path *p, int find_higher,
3085 			       int return_any)
3086 {
3087 	int ret;
3088 	struct extent_buffer *leaf;
3089 
3090 again:
3091 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3092 	if (ret <= 0)
3093 		return ret;
3094 	/*
3095 	 * a return value of 1 means the path is at the position where the
3096 	 * item should be inserted. Normally this is the next bigger item,
3097 	 * but in case the previous item is the last in a leaf, path points
3098 	 * to the first free slot in the previous leaf, i.e. at an invalid
3099 	 * item.
3100 	 */
3101 	leaf = p->nodes[0];
3102 
3103 	if (find_higher) {
3104 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3105 			ret = btrfs_next_leaf(root, p);
3106 			if (ret <= 0)
3107 				return ret;
3108 			if (!return_any)
3109 				return 1;
3110 			/*
3111 			 * no higher item found, return the next
3112 			 * lower instead
3113 			 */
3114 			return_any = 0;
3115 			find_higher = 0;
3116 			btrfs_release_path(p);
3117 			goto again;
3118 		}
3119 	} else {
3120 		if (p->slots[0] == 0) {
3121 			ret = btrfs_prev_leaf(root, p);
3122 			if (ret < 0)
3123 				return ret;
3124 			if (!ret) {
3125 				leaf = p->nodes[0];
3126 				if (p->slots[0] == btrfs_header_nritems(leaf))
3127 					p->slots[0]--;
3128 				return 0;
3129 			}
3130 			if (!return_any)
3131 				return 1;
3132 			/*
3133 			 * no lower item found, return the next
3134 			 * higher instead
3135 			 */
3136 			return_any = 0;
3137 			find_higher = 1;
3138 			btrfs_release_path(p);
3139 			goto again;
3140 		} else {
3141 			--p->slots[0];
3142 		}
3143 	}
3144 	return 0;
3145 }
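
/*
 * Hedged usage sketch: find the first item at or after 'key', falling
 * back to the last item before it when nothing higher exists ('root',
 * 'key' and 'path' come from the caller):
 *
 *	struct btrfs_key found_key;
 *	int ret;
 *
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */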
3146 
3147 /*
3148  * adjust the pointers going up the tree, starting at level
3149  * making sure the right key of each node points to 'key'.
3150  * This is used after shifting pointers to the left, so it stops
3151  * fixing up pointers when a given leaf/node is not in slot 0 of the
3152  * higher levels
3153  *
3154  */
3155 static void fixup_low_keys(struct btrfs_path *path,
3156 			   struct btrfs_disk_key *key, int level)
3157 {
3158 	int i;
3159 	struct extent_buffer *t;
3160 	int ret;
3161 
3162 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3163 		int tslot = path->slots[i];
3164 
3165 		if (!path->nodes[i])
3166 			break;
3167 		t = path->nodes[i];
3168 		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3169 				GFP_ATOMIC);
3170 		BUG_ON(ret < 0);
3171 		btrfs_set_node_key(t, key, tslot);
3172 		btrfs_mark_buffer_dirty(path->nodes[i]);
3173 		if (tslot != 0)
3174 			break;
3175 	}
3176 }
3177 
3178 /*
3179  * update item key.
3180  *
3181  * This function isn't completely safe. It's the caller's responsibility
3182  * that the new key won't break the order
3183  */
3184 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3185 			     struct btrfs_path *path,
3186 			     const struct btrfs_key *new_key)
3187 {
3188 	struct btrfs_disk_key disk_key;
3189 	struct extent_buffer *eb;
3190 	int slot;
3191 
3192 	eb = path->nodes[0];
3193 	slot = path->slots[0];
3194 	if (slot > 0) {
3195 		btrfs_item_key(eb, &disk_key, slot - 1);
3196 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3197 	}
3198 	if (slot < btrfs_header_nritems(eb) - 1) {
3199 		btrfs_item_key(eb, &disk_key, slot + 1);
3200 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3201 	}
3202 
3203 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3204 	btrfs_set_item_key(eb, &disk_key, slot);
3205 	btrfs_mark_buffer_dirty(eb);
3206 	if (slot == 0)
3207 		fixup_low_keys(path, &disk_key, 1);
3208 }
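
/*
 * A hedged sketch of the intended use of btrfs_set_item_key_safe(): after
 * trimming the front of an item, move its key offset forward.  The caller
 * must hold the leaf in 'path' write-locked and must already know the new
 * key preserves leaf ordering; 'new_offset' is a placeholder.
 *
 *	struct btrfs_key new_key;
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *	new_key.offset = new_offset;
 *	btrfs_set_item_key_safe(fs_info, path, &new_key);
 */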
3209 
3210 /*
3211  * try to push data from one node into the next node left in the
3212  * tree.
3213  *
3214  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3215  * error, and > 0 if there was no room in the left hand block.
3216  */
3217 static int push_node_left(struct btrfs_trans_handle *trans,
3218 			  struct btrfs_fs_info *fs_info,
3219 			  struct extent_buffer *dst,
3220 			  struct extent_buffer *src, int empty)
3221 {
3222 	int push_items = 0;
3223 	int src_nritems;
3224 	int dst_nritems;
3225 	int ret = 0;
3226 
3227 	src_nritems = btrfs_header_nritems(src);
3228 	dst_nritems = btrfs_header_nritems(dst);
3229 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3230 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3231 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3232 
3233 	if (!empty && src_nritems <= 8)
3234 		return 1;
3235 
3236 	if (push_items <= 0)
3237 		return 1;
3238 
3239 	if (empty) {
3240 		push_items = min(src_nritems, push_items);
3241 		if (push_items < src_nritems) {
3242 			/* leave at least 8 pointers in the node if
3243 			 * we aren't going to empty it
3244 			 */
3245 			if (src_nritems - push_items < 8) {
3246 				if (push_items <= 8)
3247 					return 1;
3248 				push_items -= 8;
3249 			}
3250 		}
3251 	} else
3252 		push_items = min(src_nritems - 8, push_items);
3253 
3254 	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3255 				   push_items);
3256 	if (ret) {
3257 		btrfs_abort_transaction(trans, ret);
3258 		return ret;
3259 	}
3260 	copy_extent_buffer(dst, src,
3261 			   btrfs_node_key_ptr_offset(dst_nritems),
3262 			   btrfs_node_key_ptr_offset(0),
3263 			   push_items * sizeof(struct btrfs_key_ptr));
3264 
3265 	if (push_items < src_nritems) {
3266 		/*
3267 		 * Don't call tree_mod_log_insert_move here, key removal was
3268 		 * already fully logged by tree_mod_log_eb_copy above.
3269 		 */
3270 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3271 				      btrfs_node_key_ptr_offset(push_items),
3272 				      (src_nritems - push_items) *
3273 				      sizeof(struct btrfs_key_ptr));
3274 	}
3275 	btrfs_set_header_nritems(src, src_nritems - push_items);
3276 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3277 	btrfs_mark_buffer_dirty(src);
3278 	btrfs_mark_buffer_dirty(dst);
3279 
3280 	return ret;
3281 }
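
/*
 * A worked example of the "leave at least 8 pointers" rule above
 * (illustrative numbers): with src_nritems = 12 and room for 10 more
 * pointers in dst, a non-empty push is capped at
 * min(src_nritems - 8, push_items) = 4 items, so the source node always
 * keeps at least 8 pointers.
 */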
3282 
3283 /*
3284  * try to push data from one node into the next node right in the
3285  * tree.
3286  *
3287  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3288  * error, and > 0 if there was no room in the right hand block.
3289  *
3290  * this will only push up to 1/2 the contents of the left node over
3291  */
3292 static int balance_node_right(struct btrfs_trans_handle *trans,
3293 			      struct btrfs_fs_info *fs_info,
3294 			      struct extent_buffer *dst,
3295 			      struct extent_buffer *src)
3296 {
3297 	int push_items = 0;
3298 	int max_push;
3299 	int src_nritems;
3300 	int dst_nritems;
3301 	int ret = 0;
3302 
3303 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3304 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3305 
3306 	src_nritems = btrfs_header_nritems(src);
3307 	dst_nritems = btrfs_header_nritems(dst);
3308 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3309 	if (push_items <= 0)
3310 		return 1;
3311 
3312 	if (src_nritems < 4)
3313 		return 1;
3314 
3315 	max_push = src_nritems / 2 + 1;
3316 	/* don't try to empty the node */
3317 	if (max_push >= src_nritems)
3318 		return 1;
3319 
3320 	if (max_push < push_items)
3321 		push_items = max_push;
3322 
3323 	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3324 	BUG_ON(ret < 0);
3325 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3326 				      btrfs_node_key_ptr_offset(0),
3327 				      (dst_nritems) *
3328 				      sizeof(struct btrfs_key_ptr));
3329 
3330 	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3331 				   src_nritems - push_items, push_items);
3332 	if (ret) {
3333 		btrfs_abort_transaction(trans, ret);
3334 		return ret;
3335 	}
3336 	copy_extent_buffer(dst, src,
3337 			   btrfs_node_key_ptr_offset(0),
3338 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3339 			   push_items * sizeof(struct btrfs_key_ptr));
3340 
3341 	btrfs_set_header_nritems(src, src_nritems - push_items);
3342 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3343 
3344 	btrfs_mark_buffer_dirty(src);
3345 	btrfs_mark_buffer_dirty(dst);
3346 
3347 	return ret;
3348 }
3349 
3350 /*
3351  * helper function to insert a new root level in the tree.
3352  * A new node is allocated, and a single item is inserted to
3353  * point to the existing root
3354  *
3355  * returns zero on success or < 0 on failure.
3356  */
3357 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3358 			   struct btrfs_root *root,
3359 			   struct btrfs_path *path, int level)
3360 {
3361 	struct btrfs_fs_info *fs_info = root->fs_info;
3362 	u64 lower_gen;
3363 	struct extent_buffer *lower;
3364 	struct extent_buffer *c;
3365 	struct extent_buffer *old;
3366 	struct btrfs_disk_key lower_key;
3367 	int ret;
3368 
3369 	BUG_ON(path->nodes[level]);
3370 	BUG_ON(path->nodes[level-1] != root->node);
3371 
3372 	lower = path->nodes[level-1];
3373 	if (level == 1)
3374 		btrfs_item_key(lower, &lower_key, 0);
3375 	else
3376 		btrfs_node_key(lower, &lower_key, 0);
3377 
3378 	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3379 					 root->node->start, 0);
3380 	if (IS_ERR(c))
3381 		return PTR_ERR(c);
3382 
3383 	root_add_used(root, fs_info->nodesize);
3384 
3385 	btrfs_set_header_nritems(c, 1);
3386 	btrfs_set_node_key(c, &lower_key, 0);
3387 	btrfs_set_node_blockptr(c, 0, lower->start);
3388 	lower_gen = btrfs_header_generation(lower);
3389 	WARN_ON(lower_gen != trans->transid);
3390 
3391 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3392 
3393 	btrfs_mark_buffer_dirty(c);
3394 
3395 	old = root->node;
3396 	ret = tree_mod_log_insert_root(root->node, c, 0);
3397 	BUG_ON(ret < 0);
3398 	rcu_assign_pointer(root->node, c);
3399 
3400 	/* the super has an extra ref to root->node */
3401 	free_extent_buffer(old);
3402 
3403 	add_root_to_dirty_list(root);
3404 	extent_buffer_get(c);
3405 	path->nodes[level] = c;
3406 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3407 	path->slots[level] = 0;
3408 	return 0;
3409 }
3410 
3411 /*
3412  * worker function to insert a single pointer in a node.
3413  * the node should have enough room for the pointer already
3414  *
3415  * slot and level indicate where you want the key to go, and
3416  * blocknr is the block the key points to.
3417  */
3418 static void insert_ptr(struct btrfs_trans_handle *trans,
3419 		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3420 		       struct btrfs_disk_key *key, u64 bytenr,
3421 		       int slot, int level)
3422 {
3423 	struct extent_buffer *lower;
3424 	int nritems;
3425 	int ret;
3426 
3427 	BUG_ON(!path->nodes[level]);
3428 	btrfs_assert_tree_locked(path->nodes[level]);
3429 	lower = path->nodes[level];
3430 	nritems = btrfs_header_nritems(lower);
3431 	BUG_ON(slot > nritems);
3432 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3433 	if (slot != nritems) {
3434 		if (level) {
3435 			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3436 					nritems - slot);
3437 			BUG_ON(ret < 0);
3438 		}
3439 		memmove_extent_buffer(lower,
3440 			      btrfs_node_key_ptr_offset(slot + 1),
3441 			      btrfs_node_key_ptr_offset(slot),
3442 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3443 	}
3444 	if (level) {
3445 		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3446 				GFP_NOFS);
3447 		BUG_ON(ret < 0);
3448 	}
3449 	btrfs_set_node_key(lower, key, slot);
3450 	btrfs_set_node_blockptr(lower, slot, bytenr);
3451 	WARN_ON(trans->transid == 0);
3452 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3453 	btrfs_set_header_nritems(lower, nritems + 1);
3454 	btrfs_mark_buffer_dirty(lower);
3455 }
3456 
3457 /*
3458  * split the node at the specified level in path in two.
3459  * The path is corrected to point to the appropriate node after the split
3460  *
3461  * Before splitting this tries to make some room in the node by pushing
3462  * left and right, if either one works, it returns right away.
3463  *
3464  * returns 0 on success and < 0 on failure
3465  */
3466 static noinline int split_node(struct btrfs_trans_handle *trans,
3467 			       struct btrfs_root *root,
3468 			       struct btrfs_path *path, int level)
3469 {
3470 	struct btrfs_fs_info *fs_info = root->fs_info;
3471 	struct extent_buffer *c;
3472 	struct extent_buffer *split;
3473 	struct btrfs_disk_key disk_key;
3474 	int mid;
3475 	int ret;
3476 	u32 c_nritems;
3477 
3478 	c = path->nodes[level];
3479 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3480 	if (c == root->node) {
3481 		/*
3482 		 * trying to split the root, lets make a new one
3483 		 *
3484 		 * tree mod log: We don't log removal of the old root in
3485 		 * insert_new_root, because that root buffer will be kept as a
3486 		 * normal node. We are going to log removal of half of the
3487 		 * elements below with tree_mod_log_eb_copy. We're holding a
3488 		 * tree lock on the buffer, which is why we cannot race with
3489 		 * other tree_mod_log users.
3490 		 */
3491 		ret = insert_new_root(trans, root, path, level + 1);
3492 		if (ret)
3493 			return ret;
3494 	} else {
3495 		ret = push_nodes_for_insert(trans, root, path, level);
3496 		c = path->nodes[level];
3497 		if (!ret && btrfs_header_nritems(c) <
3498 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3499 			return 0;
3500 		if (ret < 0)
3501 			return ret;
3502 	}
3503 
3504 	c_nritems = btrfs_header_nritems(c);
3505 	mid = (c_nritems + 1) / 2;
3506 	btrfs_node_key(c, &disk_key, mid);
3507 
3508 	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3509 					     c->start, 0);
3510 	if (IS_ERR(split))
3511 		return PTR_ERR(split);
3512 
3513 	root_add_used(root, fs_info->nodesize);
3514 	ASSERT(btrfs_header_level(c) == level);
3515 
3516 	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3517 	if (ret) {
3518 		btrfs_abort_transaction(trans, ret);
3519 		return ret;
3520 	}
3521 	copy_extent_buffer(split, c,
3522 			   btrfs_node_key_ptr_offset(0),
3523 			   btrfs_node_key_ptr_offset(mid),
3524 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3525 	btrfs_set_header_nritems(split, c_nritems - mid);
3526 	btrfs_set_header_nritems(c, mid);
3527 	ret = 0;
3528 
3529 	btrfs_mark_buffer_dirty(c);
3530 	btrfs_mark_buffer_dirty(split);
3531 
3532 	insert_ptr(trans, fs_info, path, &disk_key, split->start,
3533 		   path->slots[level + 1] + 1, level + 1);
3534 
3535 	if (path->slots[level] >= mid) {
3536 		path->slots[level] -= mid;
3537 		btrfs_tree_unlock(c);
3538 		free_extent_buffer(c);
3539 		path->nodes[level] = split;
3540 		path->slots[level + 1] += 1;
3541 	} else {
3542 		btrfs_tree_unlock(split);
3543 		free_extent_buffer(split);
3544 	}
3545 	return ret;
3546 }
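
/*
 * Worked example of the split point above (illustrative numbers): for a
 * node with c_nritems = 493, mid = (493 + 1) / 2 = 247, so 'c' keeps
 * slots 0..246 (247 pointers) and 'split' receives the remaining
 * 493 - 247 = 246 pointers.
 */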
3547 
3548 /*
3549  * how many bytes are required to store the items in a leaf.  start
3550  * and nr indicate which items in the leaf to check.  This totals up the
3551  * space used both by the item structs and the item data
3552  */
3553 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3554 {
3555 	struct btrfs_item *start_item;
3556 	struct btrfs_item *end_item;
3557 	struct btrfs_map_token token;
3558 	int data_len;
3559 	int nritems = btrfs_header_nritems(l);
3560 	int end = min(nritems, start + nr) - 1;
3561 
3562 	if (!nr)
3563 		return 0;
3564 	btrfs_init_map_token(&token);
3565 	start_item = btrfs_item_nr(start);
3566 	end_item = btrfs_item_nr(end);
3567 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3568 		btrfs_token_item_size(l, start_item, &token);
3569 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3570 	data_len += sizeof(struct btrfs_item) * nr;
3571 	WARN_ON(data_len < 0);
3572 	return data_len;
3573 }
3574 
3575 /*
3576  * The space between the end of the leaf items and
3577  * the start of the leaf data.  IOW, how much room
3578  * the leaf has left for both items and data
3579  */
3580 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3581 				   struct extent_buffer *leaf)
3582 {
3583 	int nritems = btrfs_header_nritems(leaf);
3584 	int ret;
3585 
3586 	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3587 	if (ret < 0) {
3588 		btrfs_crit(fs_info,
3589 			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3590 			   ret,
3591 			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3592 			   leaf_space_used(leaf, 0, nritems), nritems);
3593 	}
3594 	return ret;
3595 }
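
/*
 * Hedged sketch: the typical check before inserting 'ins_len' bytes of
 * item header plus data into a candidate leaf, mirroring what
 * btrfs_search_slot() does above:
 *
 *	if (btrfs_leaf_free_space(fs_info, leaf) < ins_len)
 *		... push into a neighbour or split the leaf first ...
 */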
3596 
3597 /*
3598  * min_slot controls the lowest index we're willing to push to the
3599  * right.  We'll push up to and including min_slot, but no lower
3600  */
3601 static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3602 				      struct btrfs_path *path,
3603 				      int data_size, int empty,
3604 				      struct extent_buffer *right,
3605 				      int free_space, u32 left_nritems,
3606 				      u32 min_slot)
3607 {
3608 	struct extent_buffer *left = path->nodes[0];
3609 	struct extent_buffer *upper = path->nodes[1];
3610 	struct btrfs_map_token token;
3611 	struct btrfs_disk_key disk_key;
3612 	int slot;
3613 	u32 i;
3614 	int push_space = 0;
3615 	int push_items = 0;
3616 	struct btrfs_item *item;
3617 	u32 nr;
3618 	u32 right_nritems;
3619 	u32 data_end;
3620 	u32 this_item_size;
3621 
3622 	btrfs_init_map_token(&token);
3623 
3624 	if (empty)
3625 		nr = 0;
3626 	else
3627 		nr = max_t(u32, 1, min_slot);
3628 
3629 	if (path->slots[0] >= left_nritems)
3630 		push_space += data_size;
3631 
3632 	slot = path->slots[1];
3633 	i = left_nritems - 1;
3634 	while (i >= nr) {
3635 		item = btrfs_item_nr(i);
3636 
3637 		if (!empty && push_items > 0) {
3638 			if (path->slots[0] > i)
3639 				break;
3640 			if (path->slots[0] == i) {
3641 				int space = btrfs_leaf_free_space(fs_info, left);
3642 				if (space + push_space * 2 > free_space)
3643 					break;
3644 			}
3645 		}
3646 
3647 		if (path->slots[0] == i)
3648 			push_space += data_size;
3649 
3650 		this_item_size = btrfs_item_size(left, item);
3651 		if (this_item_size + sizeof(*item) + push_space > free_space)
3652 			break;
3653 
3654 		push_items++;
3655 		push_space += this_item_size + sizeof(*item);
3656 		if (i == 0)
3657 			break;
3658 		i--;
3659 	}
3660 
3661 	if (push_items == 0)
3662 		goto out_unlock;
3663 
3664 	WARN_ON(!empty && push_items == left_nritems);
3665 
3666 	/* push left to right */
3667 	right_nritems = btrfs_header_nritems(right);
3668 
3669 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3670 	push_space -= leaf_data_end(fs_info, left);
3671 
3672 	/* make room in the right data area */
3673 	data_end = leaf_data_end(fs_info, right);
3674 	memmove_extent_buffer(right,
3675 			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3676 			      BTRFS_LEAF_DATA_OFFSET + data_end,
3677 			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3678 
3679 	/* copy from the left data area */
3680 	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3681 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3682 		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3683 		     push_space);
3684 
3685 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3686 			      btrfs_item_nr_offset(0),
3687 			      right_nritems * sizeof(struct btrfs_item));
3688 
3689 	/* copy the items from left to right */
3690 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3691 		   btrfs_item_nr_offset(left_nritems - push_items),
3692 		   push_items * sizeof(struct btrfs_item));
3693 
3694 	/* update the item pointers */
3695 	right_nritems += push_items;
3696 	btrfs_set_header_nritems(right, right_nritems);
3697 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3698 	for (i = 0; i < right_nritems; i++) {
3699 		item = btrfs_item_nr(i);
3700 		push_space -= btrfs_token_item_size(right, item, &token);
3701 		btrfs_set_token_item_offset(right, item, push_space, &token);
3702 	}
3703 
3704 	left_nritems -= push_items;
3705 	btrfs_set_header_nritems(left, left_nritems);
3706 
3707 	if (left_nritems)
3708 		btrfs_mark_buffer_dirty(left);
3709 	else
3710 		clean_tree_block(fs_info, left);
3711 
3712 	btrfs_mark_buffer_dirty(right);
3713 
3714 	btrfs_item_key(right, &disk_key, 0);
3715 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3716 	btrfs_mark_buffer_dirty(upper);
3717 
3718 	/* then fixup the leaf pointer in the path */
3719 	if (path->slots[0] >= left_nritems) {
3720 		path->slots[0] -= left_nritems;
3721 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3722 			clean_tree_block(fs_info, path->nodes[0]);
3723 		btrfs_tree_unlock(path->nodes[0]);
3724 		free_extent_buffer(path->nodes[0]);
3725 		path->nodes[0] = right;
3726 		path->slots[1] += 1;
3727 	} else {
3728 		btrfs_tree_unlock(right);
3729 		free_extent_buffer(right);
3730 	}
3731 	return 0;
3732 
3733 out_unlock:
3734 	btrfs_tree_unlock(right);
3735 	free_extent_buffer(right);
3736 	return 1;
3737 }
3738 
3739 /*
3740  * push some data in the path leaf to the right, trying to free up at
3741  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3742  *
3743  * returns 1 if the push failed because the other node didn't have enough
3744  * room, 0 if everything worked out and < 0 if there were major errors.
3745  *
3746  * this will push starting from min_slot to the end of the leaf.  It won't
3747  * push any slot lower than min_slot
3748  */
3749 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3750 			   *root, struct btrfs_path *path,
3751 			   int min_data_size, int data_size,
3752 			   int empty, u32 min_slot)
3753 {
3754 	struct btrfs_fs_info *fs_info = root->fs_info;
3755 	struct extent_buffer *left = path->nodes[0];
3756 	struct extent_buffer *right;
3757 	struct extent_buffer *upper;
3758 	int slot;
3759 	int free_space;
3760 	u32 left_nritems;
3761 	int ret;
3762 
3763 	if (!path->nodes[1])
3764 		return 1;
3765 
3766 	slot = path->slots[1];
3767 	upper = path->nodes[1];
3768 	if (slot >= btrfs_header_nritems(upper) - 1)
3769 		return 1;
3770 
3771 	btrfs_assert_tree_locked(path->nodes[1]);
3772 
3773 	right = read_node_slot(fs_info, upper, slot + 1);
3774 	/*
3775 	 * slot + 1 is not valid or we fail to read the right node,
3776 	 * no big deal, just return.
3777 	 */
3778 	if (IS_ERR(right))
3779 		return 1;
3780 
3781 	btrfs_tree_lock(right);
3782 	btrfs_set_lock_blocking(right);
3783 
3784 	free_space = btrfs_leaf_free_space(fs_info, right);
3785 	if (free_space < data_size)
3786 		goto out_unlock;
3787 
3788 	/* cow and double check */
3789 	ret = btrfs_cow_block(trans, root, right, upper,
3790 			      slot + 1, &right);
3791 	if (ret)
3792 		goto out_unlock;
3793 
3794 	free_space = btrfs_leaf_free_space(fs_info, right);
3795 	if (free_space < data_size)
3796 		goto out_unlock;
3797 
3798 	left_nritems = btrfs_header_nritems(left);
3799 	if (left_nritems == 0)
3800 		goto out_unlock;
3801 
3802 	if (path->slots[0] == left_nritems && !empty) {
3803 		/* Key greater than all keys in the leaf, right neighbor has
3804 		 * enough room for it and we're not emptying our leaf to delete
3805 		 * it, therefore use right neighbor to insert the new item and
3806 		 * no need to touch/dirty our left leaf. */
3807 		btrfs_tree_unlock(left);
3808 		free_extent_buffer(left);
3809 		path->nodes[0] = right;
3810 		path->slots[0] = 0;
3811 		path->slots[1]++;
3812 		return 0;
3813 	}
3814 
3815 	return __push_leaf_right(fs_info, path, min_data_size, empty,
3816 				right, free_space, left_nritems, min_slot);
3817 out_unlock:
3818 	btrfs_tree_unlock(right);
3819 	free_extent_buffer(right);
3820 	return 1;
3821 }
3822 
3823 /*
3824  * push some data in the path leaf to the left, trying to free up at
3825  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3826  *
3827  * max_slot can put a limit on how far into the leaf we'll push items.  The
3828  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3829  * items
3830  */
3831 static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
3832 				     struct btrfs_path *path, int data_size,
3833 				     int empty, struct extent_buffer *left,
3834 				     int free_space, u32 right_nritems,
3835 				     u32 max_slot)
3836 {
3837 	struct btrfs_disk_key disk_key;
3838 	struct extent_buffer *right = path->nodes[0];
3839 	int i;
3840 	int push_space = 0;
3841 	int push_items = 0;
3842 	struct btrfs_item *item;
3843 	u32 old_left_nritems;
3844 	u32 nr;
3845 	int ret = 0;
3846 	u32 this_item_size;
3847 	u32 old_left_item_size;
3848 	struct btrfs_map_token token;
3849 
3850 	btrfs_init_map_token(&token);
3851 
3852 	if (empty)
3853 		nr = min(right_nritems, max_slot);
3854 	else
3855 		nr = min(right_nritems - 1, max_slot);
3856 
3857 	for (i = 0; i < nr; i++) {
3858 		item = btrfs_item_nr(i);
3859 
3860 		if (!empty && push_items > 0) {
3861 			if (path->slots[0] < i)
3862 				break;
3863 			if (path->slots[0] == i) {
3864 				int space = btrfs_leaf_free_space(fs_info, right);
3865 				if (space + push_space * 2 > free_space)
3866 					break;
3867 			}
3868 		}
3869 
3870 		if (path->slots[0] == i)
3871 			push_space += data_size;
3872 
3873 		this_item_size = btrfs_item_size(right, item);
3874 		if (this_item_size + sizeof(*item) + push_space > free_space)
3875 			break;
3876 
3877 		push_items++;
3878 		push_space += this_item_size + sizeof(*item);
3879 	}
3880 
3881 	if (push_items == 0) {
3882 		ret = 1;
3883 		goto out;
3884 	}
3885 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3886 
3887 	/* push data from right to left */
3888 	copy_extent_buffer(left, right,
3889 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3890 			   btrfs_item_nr_offset(0),
3891 			   push_items * sizeof(struct btrfs_item));
3892 
3893 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3894 		     btrfs_item_offset_nr(right, push_items - 1);
3895 
3896 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3897 		     leaf_data_end(fs_info, left) - push_space,
3898 		     BTRFS_LEAF_DATA_OFFSET +
3899 		     btrfs_item_offset_nr(right, push_items - 1),
3900 		     push_space);
3901 	old_left_nritems = btrfs_header_nritems(left);
3902 	BUG_ON(old_left_nritems <= 0);
3903 
3904 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3905 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3906 		u32 ioff;
3907 
3908 		item = btrfs_item_nr(i);
3909 
3910 		ioff = btrfs_token_item_offset(left, item, &token);
3911 		btrfs_set_token_item_offset(left, item,
3912 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3913 		      &token);
3914 	}
3915 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3916 
3917 	/* fixup right node */
3918 	if (push_items > right_nritems)
3919 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3920 		       right_nritems);
3921 
3922 	if (push_items < right_nritems) {
3923 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3924 						  leaf_data_end(fs_info, right);
3925 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3926 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3927 				      BTRFS_LEAF_DATA_OFFSET +
3928 				      leaf_data_end(fs_info, right), push_space);
3929 
3930 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3931 			      btrfs_item_nr_offset(push_items),
3932 			     (btrfs_header_nritems(right) - push_items) *
3933 			     sizeof(struct btrfs_item));
3934 	}
3935 	right_nritems -= push_items;
3936 	btrfs_set_header_nritems(right, right_nritems);
3937 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3938 	for (i = 0; i < right_nritems; i++) {
3939 		item = btrfs_item_nr(i);
3940 
3941 		push_space = push_space - btrfs_token_item_size(right,
3942 								item, &token);
3943 		btrfs_set_token_item_offset(right, item, push_space, &token);
3944 	}
3945 
3946 	btrfs_mark_buffer_dirty(left);
3947 	if (right_nritems)
3948 		btrfs_mark_buffer_dirty(right);
3949 	else
3950 		clean_tree_block(fs_info, right);
3951 
3952 	btrfs_item_key(right, &disk_key, 0);
3953 	fixup_low_keys(path, &disk_key, 1);
3954 
3955 	/* then fixup the leaf pointer in the path */
3956 	if (path->slots[0] < push_items) {
3957 		path->slots[0] += old_left_nritems;
3958 		btrfs_tree_unlock(path->nodes[0]);
3959 		free_extent_buffer(path->nodes[0]);
3960 		path->nodes[0] = left;
3961 		path->slots[1] -= 1;
3962 	} else {
3963 		btrfs_tree_unlock(left);
3964 		free_extent_buffer(left);
3965 		path->slots[0] -= push_items;
3966 	}
3967 	BUG_ON(path->slots[0] < 0);
3968 	return ret;
3969 out:
3970 	btrfs_tree_unlock(left);
3971 	free_extent_buffer(left);
3972 	return ret;
3973 }
3974 
3975 /*
3976  * push some data in the path leaf to the left, trying to free up at
3977  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3978  *
3979  * max_slot can put a limit on how far into the leaf we'll push items.  The
3980  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3981  * items
3982  */
3983 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3984 			  *root, struct btrfs_path *path, int min_data_size,
3985 			  int data_size, int empty, u32 max_slot)
3986 {
3987 	struct btrfs_fs_info *fs_info = root->fs_info;
3988 	struct extent_buffer *right = path->nodes[0];
3989 	struct extent_buffer *left;
3990 	int slot;
3991 	int free_space;
3992 	u32 right_nritems;
3993 	int ret = 0;
3994 
3995 	slot = path->slots[1];
3996 	if (slot == 0)
3997 		return 1;
3998 	if (!path->nodes[1])
3999 		return 1;
4000 
4001 	right_nritems = btrfs_header_nritems(right);
4002 	if (right_nritems == 0)
4003 		return 1;
4004 
4005 	btrfs_assert_tree_locked(path->nodes[1]);
4006 
4007 	left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4008 	/*
4009 	 * slot - 1 is not valid or we fail to read the left node,
4010 	 * no big deal, just return.
4011 	 */
4012 	if (IS_ERR(left))
4013 		return 1;
4014 
4015 	btrfs_tree_lock(left);
4016 	btrfs_set_lock_blocking(left);
4017 
4018 	free_space = btrfs_leaf_free_space(fs_info, left);
4019 	if (free_space < data_size) {
4020 		ret = 1;
4021 		goto out;
4022 	}
4023 
4024 	/* cow and double check */
4025 	ret = btrfs_cow_block(trans, root, left,
4026 			      path->nodes[1], slot - 1, &left);
4027 	if (ret) {
4028 		/* we hit -ENOSPC, but it isn't fatal here */
4029 		if (ret == -ENOSPC)
4030 			ret = 1;
4031 		goto out;
4032 	}
4033 
4034 	free_space = btrfs_leaf_free_space(fs_info, left);
4035 	if (free_space < data_size) {
4036 		ret = 1;
4037 		goto out;
4038 	}
4039 
4040 	return __push_leaf_left(fs_info, path, min_data_size,
4041 			       empty, left, free_space, right_nritems,
4042 			       max_slot);
4043 out:
4044 	btrfs_tree_unlock(left);
4045 	free_extent_buffer(left);
4046 	return ret;
4047 }
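
/*
 * Hedged usage sketch, not from the kernel: how a caller consumes the
 * 0 / 1 / < 0 convention shared by push_leaf_right() and push_leaf_left().
 * The helper name and the fallback policy are invented for illustration.
 */
static int example_push_for_room(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int data_size)
{
	int ret;

	ret = push_leaf_right(trans, root, path, data_size, data_size, 0, 0);
	if (ret <= 0)
		return ret;	/* 0: enough room freed, < 0: hard error */

	/* the right neighbor was too full, try the left one instead */
	return push_leaf_left(trans, root, path, data_size, data_size,
			      0, (u32)-1);
}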
4048 
4049 /*
4050  * copy items from leaf 'l' starting at slot 'mid' into the new leaf
4051  * 'right', then update the parent node and the path for the split.
4052  */
4053 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4054 				    struct btrfs_fs_info *fs_info,
4055 				    struct btrfs_path *path,
4056 				    struct extent_buffer *l,
4057 				    struct extent_buffer *right,
4058 				    int slot, int mid, int nritems)
4059 {
4060 	int data_copy_size;
4061 	int rt_data_off;
4062 	int i;
4063 	struct btrfs_disk_key disk_key;
4064 	struct btrfs_map_token token;
4065 
4066 	btrfs_init_map_token(&token);
4067 
4068 	nritems = nritems - mid;
4069 	btrfs_set_header_nritems(right, nritems);
4070 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4071 
4072 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4073 			   btrfs_item_nr_offset(mid),
4074 			   nritems * sizeof(struct btrfs_item));
4075 
4076 	copy_extent_buffer(right, l,
4077 		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4078 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4079 		     leaf_data_end(fs_info, l), data_copy_size);
4080 
4081 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4082 
4083 	for (i = 0; i < nritems; i++) {
4084 		struct btrfs_item *item = btrfs_item_nr(i);
4085 		u32 ioff;
4086 
4087 		ioff = btrfs_token_item_offset(right, item, &token);
4088 		btrfs_set_token_item_offset(right, item,
4089 					    ioff + rt_data_off, &token);
4090 	}
4091 
4092 	btrfs_set_header_nritems(l, mid);
4093 	btrfs_item_key(right, &disk_key, 0);
4094 	insert_ptr(trans, fs_info, path, &disk_key, right->start,
4095 		   path->slots[1] + 1, 1);
4096 
4097 	btrfs_mark_buffer_dirty(right);
4098 	btrfs_mark_buffer_dirty(l);
4099 	BUG_ON(path->slots[0] != slot);
4100 
4101 	if (mid <= slot) {
4102 		btrfs_tree_unlock(path->nodes[0]);
4103 		free_extent_buffer(path->nodes[0]);
4104 		path->nodes[0] = right;
4105 		path->slots[0] -= mid;
4106 		path->slots[1] += 1;
4107 	} else {
4108 		btrfs_tree_unlock(right);
4109 		free_extent_buffer(right);
4110 	}
4111 
4112 	BUG_ON(path->slots[0] < 0);
4113 }
4114 
4115 /*
4116  * double splits happen when we need to insert a big item in the middle
4117  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4118  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4119  *          A                 B                 C
4120  *
4121  * We avoid this by trying to push the items on either side of our target
4122  * into the adjacent leaves.  If all goes well we can avoid the double split
4123  * completely.
4124  */
4125 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4126 					  struct btrfs_root *root,
4127 					  struct btrfs_path *path,
4128 					  int data_size)
4129 {
4130 	struct btrfs_fs_info *fs_info = root->fs_info;
4131 	int ret;
4132 	int progress = 0;
4133 	int slot;
4134 	u32 nritems;
4135 	int space_needed = data_size;
4136 
4137 	slot = path->slots[0];
4138 	if (slot < btrfs_header_nritems(path->nodes[0]))
4139 		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4140 
4141 	/*
4142 	 * try to push all the items after our slot into the
4143 	 * right leaf
4144 	 */
4145 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4146 	if (ret < 0)
4147 		return ret;
4148 
4149 	if (ret == 0)
4150 		progress++;
4151 
4152 	nritems = btrfs_header_nritems(path->nodes[0]);
4153 	/*
4154 	 * our goal is to get our slot at the start or end of a leaf.  If
4155 	 * we've done so we're done
4156 	 */
4157 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4158 		return 0;
4159 
4160 	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4161 		return 0;
4162 
4163 	/* try to push all the items before our slot into the left leaf */
4164 	slot = path->slots[0];
4165 	space_needed = data_size;
4166 	if (slot > 0)
4167 		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4168 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4169 	if (ret < 0)
4170 		return ret;
4171 
4172 	if (ret == 0)
4173 		progress++;
4174 
4175 	if (progress)
4176 		return 0;
4177 	return 1;
4178 }
4179 
4180 /*
4181  * split the path's leaf in two, making sure there is at least data_size
4182  * available for the resulting leaf level of the path.
4183  *
4184  * returns 0 if all went well and < 0 on failure.
4185  */
4186 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4187 			       struct btrfs_root *root,
4188 			       const struct btrfs_key *ins_key,
4189 			       struct btrfs_path *path, int data_size,
4190 			       int extend)
4191 {
4192 	struct btrfs_disk_key disk_key;
4193 	struct extent_buffer *l;
4194 	u32 nritems;
4195 	int mid;
4196 	int slot;
4197 	struct extent_buffer *right;
4198 	struct btrfs_fs_info *fs_info = root->fs_info;
4199 	int ret = 0;
4200 	int wret;
4201 	int split;
4202 	int num_doubles = 0;
4203 	int tried_avoid_double = 0;
4204 
4205 	l = path->nodes[0];
4206 	slot = path->slots[0];
4207 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4208 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4209 		return -EOVERFLOW;
4210 
4211 	/* first try to make some room by pushing left and right */
4212 	if (data_size && path->nodes[1]) {
4213 		int space_needed = data_size;
4214 
4215 		if (slot < btrfs_header_nritems(l))
4216 			space_needed -= btrfs_leaf_free_space(fs_info, l);
4217 
4218 		wret = push_leaf_right(trans, root, path, space_needed,
4219 				       space_needed, 0, 0);
4220 		if (wret < 0)
4221 			return wret;
4222 		if (wret) {
4223 			space_needed = data_size;
4224 			if (slot > 0)
4225 				space_needed -= btrfs_leaf_free_space(fs_info,
4226 								      l);
4227 			wret = push_leaf_left(trans, root, path, space_needed,
4228 					      space_needed, 0, (u32)-1);
4229 			if (wret < 0)
4230 				return wret;
4231 		}
4232 		l = path->nodes[0];
4233 
4234 		/* did the pushes work? */
4235 		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4236 			return 0;
4237 	}
4238 
4239 	if (!path->nodes[1]) {
4240 		ret = insert_new_root(trans, root, path, 1);
4241 		if (ret)
4242 			return ret;
4243 	}
4244 again:
4245 	split = 1;
4246 	l = path->nodes[0];
4247 	slot = path->slots[0];
4248 	nritems = btrfs_header_nritems(l);
4249 	mid = (nritems + 1) / 2;
4250 
4251 	if (mid <= slot) {
4252 		if (nritems == 1 ||
4253 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4254 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4255 			if (slot >= nritems) {
4256 				split = 0;
4257 			} else {
4258 				mid = slot;
4259 				if (mid != nritems &&
4260 				    leaf_space_used(l, mid, nritems - mid) +
4261 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4262 					if (data_size && !tried_avoid_double)
4263 						goto push_for_double;
4264 					split = 2;
4265 				}
4266 			}
4267 		}
4268 	} else {
4269 		if (leaf_space_used(l, 0, mid) + data_size >
4270 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4271 			if (!extend && data_size && slot == 0) {
4272 				split = 0;
4273 			} else if ((extend || !data_size) && slot == 0) {
4274 				mid = 1;
4275 			} else {
4276 				mid = slot;
4277 				if (mid != nritems &&
4278 				    leaf_space_used(l, mid, nritems - mid) +
4279 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4280 					if (data_size && !tried_avoid_double)
4281 						goto push_for_double;
4282 					split = 2;
4283 				}
4284 			}
4285 		}
4286 	}
4287 
4288 	if (split == 0)
4289 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4290 	else
4291 		btrfs_item_key(l, &disk_key, mid);
4292 
4293 	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4294 					     l->start, 0);
4295 	if (IS_ERR(right))
4296 		return PTR_ERR(right);
4297 
4298 	root_add_used(root, fs_info->nodesize);
4299 
4300 	if (split == 0) {
4301 		if (mid <= slot) {
4302 			btrfs_set_header_nritems(right, 0);
4303 			insert_ptr(trans, fs_info, path, &disk_key,
4304 				   right->start, path->slots[1] + 1, 1);
4305 			btrfs_tree_unlock(path->nodes[0]);
4306 			free_extent_buffer(path->nodes[0]);
4307 			path->nodes[0] = right;
4308 			path->slots[0] = 0;
4309 			path->slots[1] += 1;
4310 		} else {
4311 			btrfs_set_header_nritems(right, 0);
4312 			insert_ptr(trans, fs_info, path, &disk_key,
4313 				   right->start, path->slots[1], 1);
4314 			btrfs_tree_unlock(path->nodes[0]);
4315 			free_extent_buffer(path->nodes[0]);
4316 			path->nodes[0] = right;
4317 			path->slots[0] = 0;
4318 			if (path->slots[1] == 0)
4319 				fixup_low_keys(path, &disk_key, 1);
4320 		}
4321 		/*
4322 		 * We created a new empty leaf 'right' with room for ins_len; the
4323 		 * caller will do btrfs_mark_buffer_dirty() on it after copying
4324 		 * the new item's content into it.
4325 		 */
4326 		return ret;
4327 	}
4328 
4329 	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4330 
4331 	if (split == 2) {
4332 		BUG_ON(num_doubles != 0);
4333 		num_doubles++;
4334 		goto again;
4335 	}
4336 
4337 	return 0;
4338 
4339 push_for_double:
4340 	push_for_double_split(trans, root, path, data_size);
4341 	tried_avoid_double = 1;
4342 	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4343 		return 0;
4344 	goto again;
4345 }
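
/*
 * Worked example for the split point selection above, with made up
 * numbers: a leaf with nritems = 7 defaults to mid = (7 + 1) / 2 = 4,
 * so items 4..6 move to the new right leaf.  Only when the half holding
 * the insertion slot would still overflow do we split exactly at the
 * slot (split = 2, tried at most once more) or hand back a brand new
 * empty leaf (split = 0) for an item landing at the very start or end.
 */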
4346 
4347 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4348 					 struct btrfs_root *root,
4349 					 struct btrfs_path *path, int ins_len)
4350 {
4351 	struct btrfs_fs_info *fs_info = root->fs_info;
4352 	struct btrfs_key key;
4353 	struct extent_buffer *leaf;
4354 	struct btrfs_file_extent_item *fi;
4355 	u64 extent_len = 0;
4356 	u32 item_size;
4357 	int ret;
4358 
4359 	leaf = path->nodes[0];
4360 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4361 
4362 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4363 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4364 
4365 	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4366 		return 0;
4367 
4368 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4369 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4370 		fi = btrfs_item_ptr(leaf, path->slots[0],
4371 				    struct btrfs_file_extent_item);
4372 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4373 	}
4374 	btrfs_release_path(path);
4375 
4376 	path->keep_locks = 1;
4377 	path->search_for_split = 1;
4378 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4379 	path->search_for_split = 0;
4380 	if (ret > 0)
4381 		ret = -EAGAIN;
4382 	if (ret < 0)
4383 		goto err;
4384 
4385 	ret = -EAGAIN;
4386 	leaf = path->nodes[0];
4387 	/* if our item isn't there, return now */
4388 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4389 		goto err;
4390 
4391 	/* the leaf has changed, it now has room.  return now */
4392 	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4393 		goto err;
4394 
4395 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4396 		fi = btrfs_item_ptr(leaf, path->slots[0],
4397 				    struct btrfs_file_extent_item);
4398 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4399 			goto err;
4400 	}
4401 
4402 	btrfs_set_path_blocking(path);
4403 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4404 	if (ret)
4405 		goto err;
4406 
4407 	path->keep_locks = 0;
4408 	btrfs_unlock_up_safe(path, 1);
4409 	return 0;
4410 err:
4411 	path->keep_locks = 0;
4412 	return ret;
4413 }
4414 
4415 static noinline int split_item(struct btrfs_fs_info *fs_info,
4416 			       struct btrfs_path *path,
4417 			       const struct btrfs_key *new_key,
4418 			       unsigned long split_offset)
4419 {
4420 	struct extent_buffer *leaf;
4421 	struct btrfs_item *item;
4422 	struct btrfs_item *new_item;
4423 	int slot;
4424 	char *buf;
4425 	u32 nritems;
4426 	u32 item_size;
4427 	u32 orig_offset;
4428 	struct btrfs_disk_key disk_key;
4429 
4430 	leaf = path->nodes[0];
4431 	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4432 
4433 	btrfs_set_path_blocking(path);
4434 
4435 	item = btrfs_item_nr(path->slots[0]);
4436 	orig_offset = btrfs_item_offset(leaf, item);
4437 	item_size = btrfs_item_size(leaf, item);
4438 
4439 	buf = kmalloc(item_size, GFP_NOFS);
4440 	if (!buf)
4441 		return -ENOMEM;
4442 
4443 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4444 			    path->slots[0]), item_size);
4445 
4446 	slot = path->slots[0] + 1;
4447 	nritems = btrfs_header_nritems(leaf);
4448 	if (slot != nritems) {
4449 		/* shift the items */
4450 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4451 				btrfs_item_nr_offset(slot),
4452 				(nritems - slot) * sizeof(struct btrfs_item));
4453 	}
4454 
4455 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4456 	btrfs_set_item_key(leaf, &disk_key, slot);
4457 
4458 	new_item = btrfs_item_nr(slot);
4459 
4460 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4461 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4462 
4463 	btrfs_set_item_offset(leaf, item,
4464 			      orig_offset + item_size - split_offset);
4465 	btrfs_set_item_size(leaf, item, split_offset);
4466 
4467 	btrfs_set_header_nritems(leaf, nritems + 1);
4468 
4469 	/* write the data for the start of the original item */
4470 	write_extent_buffer(leaf, buf,
4471 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4472 			    split_offset);
4473 
4474 	/* write the data for the new item */
4475 	write_extent_buffer(leaf, buf + split_offset,
4476 			    btrfs_item_ptr_offset(leaf, slot),
4477 			    item_size - split_offset);
4478 	btrfs_mark_buffer_dirty(leaf);
4479 
4480 	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4481 	kfree(buf);
4482 	return 0;
4483 }
4484 
4485 /*
4486  * This function splits a single item into two items,
4487  * giving 'new_key' to the new item and splitting the
4488  * old one at split_offset (from the start of the item).
4489  *
4490  * The path may be released by this operation.  After
4491  * the split, the path is pointing to the old item.  The
4492  * new item is going to be in the same node as the old one.
4493  *
4494  * Note, the item being split must be small enough to live alone on
4495  * a tree block with room for one extra struct btrfs_item
4496  *
4497  * This allows us to split the item in place, keeping a lock on the
4498  * leaf the entire time.
4499  */
4500 int btrfs_split_item(struct btrfs_trans_handle *trans,
4501 		     struct btrfs_root *root,
4502 		     struct btrfs_path *path,
4503 		     const struct btrfs_key *new_key,
4504 		     unsigned long split_offset)
4505 {
4506 	int ret;
4507 	ret = setup_leaf_for_split(trans, root, path,
4508 				   sizeof(struct btrfs_item));
4509 	if (ret)
4510 		return ret;
4511 
4512 	ret = split_item(root->fs_info, path, new_key, split_offset);
4513 	return ret;
4514 }
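
/*
 * Hedged usage sketch, not from the kernel: splitting the item the path
 * already points at (found e.g. via btrfs_search_slot) into a head of
 * split_offset bytes plus a tail carrying new_key.  The wrapper name is
 * invented; the -EAGAIN case mirrors setup_leaf_for_split() above.
 */
static int example_split_at(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    const struct btrfs_key *new_key,
			    unsigned long split_offset)
{
	int ret;

	ret = btrfs_split_item(trans, root, path, new_key, split_offset);
	if (ret == -EAGAIN) {
		/* the item moved or shrank; re-search before retrying */
		btrfs_release_path(path);
	}
	return ret;
}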
4515 
4516 /*
4517  * This function duplicates an item, giving 'new_key' to the new item.
4518  * It guarantees both items live in the same tree leaf and the new item
4519  * is contiguous with the original item.
4520  *
4521  * This allows us to split a file extent in place, keeping a lock on the
4522  * leaf the entire time.
4523  */
4524 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4525 			 struct btrfs_root *root,
4526 			 struct btrfs_path *path,
4527 			 const struct btrfs_key *new_key)
4528 {
4529 	struct extent_buffer *leaf;
4530 	int ret;
4531 	u32 item_size;
4532 
4533 	leaf = path->nodes[0];
4534 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4535 	ret = setup_leaf_for_split(trans, root, path,
4536 				   item_size + sizeof(struct btrfs_item));
4537 	if (ret)
4538 		return ret;
4539 
4540 	path->slots[0]++;
4541 	setup_items_for_insert(root, path, new_key, &item_size,
4542 			       item_size, item_size +
4543 			       sizeof(struct btrfs_item), 1);
4544 	leaf = path->nodes[0];
4545 	memcpy_extent_buffer(leaf,
4546 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4547 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4548 			     item_size);
4549 	return 0;
4550 }
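
/*
 * Hedged sketch of the pattern this enables, loosely modeled on file
 * extent splitting: duplicate the current item under a key that only
 * differs in its offset.  The key arithmetic is invented for
 * illustration only.
 */
static int example_dup_at_offset(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;	/* same objectid and type */

	ret = btrfs_duplicate_item(trans, root, path, &key);
	/* on success path->slots[0] points at the new, identical copy */
	return ret;
}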
4551 
4552 /*
4553  * make the item pointed to by the path smaller.  new_size indicates
4554  * how small to make it, and from_end tells us if we just chop bytes
4555  * off the end of the item or if we shift the item to chop bytes off
4556  * the front.
4557  */
4558 void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4559 			 struct btrfs_path *path, u32 new_size, int from_end)
4560 {
4561 	int slot;
4562 	struct extent_buffer *leaf;
4563 	struct btrfs_item *item;
4564 	u32 nritems;
4565 	unsigned int data_end;
4566 	unsigned int old_data_start;
4567 	unsigned int old_size;
4568 	unsigned int size_diff;
4569 	int i;
4570 	struct btrfs_map_token token;
4571 
4572 	btrfs_init_map_token(&token);
4573 
4574 	leaf = path->nodes[0];
4575 	slot = path->slots[0];
4576 
4577 	old_size = btrfs_item_size_nr(leaf, slot);
4578 	if (old_size == new_size)
4579 		return;
4580 
4581 	nritems = btrfs_header_nritems(leaf);
4582 	data_end = leaf_data_end(fs_info, leaf);
4583 
4584 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4585 
4586 	size_diff = old_size - new_size;
4587 
4588 	BUG_ON(slot < 0);
4589 	BUG_ON(slot >= nritems);
4590 
4591 	/*
4592 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4593 	 */
4594 	/* first correct the data pointers */
4595 	for (i = slot; i < nritems; i++) {
4596 		u32 ioff;
4597 		item = btrfs_item_nr(i);
4598 
4599 		ioff = btrfs_token_item_offset(leaf, item, &token);
4600 		btrfs_set_token_item_offset(leaf, item,
4601 					    ioff + size_diff, &token);
4602 	}
4603 
4604 	/* shift the data */
4605 	if (from_end) {
4606 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4607 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4608 			      data_end, old_data_start + new_size - data_end);
4609 	} else {
4610 		struct btrfs_disk_key disk_key;
4611 		u64 offset;
4612 
4613 		btrfs_item_key(leaf, &disk_key, slot);
4614 
4615 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4616 			unsigned long ptr;
4617 			struct btrfs_file_extent_item *fi;
4618 
4619 			fi = btrfs_item_ptr(leaf, slot,
4620 					    struct btrfs_file_extent_item);
4621 			fi = (struct btrfs_file_extent_item *)(
4622 			     (unsigned long)fi - size_diff);
4623 
4624 			if (btrfs_file_extent_type(leaf, fi) ==
4625 			    BTRFS_FILE_EXTENT_INLINE) {
4626 				ptr = btrfs_item_ptr_offset(leaf, slot);
4627 				memmove_extent_buffer(leaf, ptr,
4628 				      (unsigned long)fi,
4629 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4630 			}
4631 		}
4632 
4633 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4634 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4635 			      data_end, old_data_start - data_end);
4636 
4637 		offset = btrfs_disk_key_offset(&disk_key);
4638 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4639 		btrfs_set_item_key(leaf, &disk_key, slot);
4640 		if (slot == 0)
4641 			fixup_low_keys(path, &disk_key, 1);
4642 	}
4643 
4644 	item = btrfs_item_nr(slot);
4645 	btrfs_set_item_size(leaf, item, new_size);
4646 	btrfs_mark_buffer_dirty(leaf);
4647 
4648 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4649 		btrfs_print_leaf(leaf);
4650 		BUG();
4651 	}
4652 }
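
/*
 * Hedged usage sketch: shrink the item at path->slots[0] to half its
 * size, chopping from the end.  The size is made up; from_end = 0 would
 * instead shift the data and bump the key offset as shown above.
 */
static void example_truncate_tail(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path)
{
	u32 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

	btrfs_truncate_item(fs_info, path, old_size / 2, 1);
}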
4653 
4654 /*
4655  * make the item pointed to by the path bigger, data_size is the added size.
4656  */
4657 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4658 		       u32 data_size)
4659 {
4660 	int slot;
4661 	struct extent_buffer *leaf;
4662 	struct btrfs_item *item;
4663 	u32 nritems;
4664 	unsigned int data_end;
4665 	unsigned int old_data;
4666 	unsigned int old_size;
4667 	int i;
4668 	struct btrfs_map_token token;
4669 
4670 	btrfs_init_map_token(&token);
4671 
4672 	leaf = path->nodes[0];
4673 
4674 	nritems = btrfs_header_nritems(leaf);
4675 	data_end = leaf_data_end(fs_info, leaf);
4676 
4677 	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4678 		btrfs_print_leaf(leaf);
4679 		BUG();
4680 	}
4681 	slot = path->slots[0];
4682 	old_data = btrfs_item_end_nr(leaf, slot);
4683 
4684 	BUG_ON(slot < 0);
4685 	if (slot >= nritems) {
4686 		btrfs_print_leaf(leaf);
4687 		btrfs_crit(fs_info, "slot %d too large, nritems %d",
4688 			   slot, nritems);
4689 		BUG_ON(1);
4690 	}
4691 
4692 	/*
4693 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4694 	 */
4695 	/* first correct the data pointers */
4696 	for (i = slot; i < nritems; i++) {
4697 		u32 ioff;
4698 		item = btrfs_item_nr(i);
4699 
4700 		ioff = btrfs_token_item_offset(leaf, item, &token);
4701 		btrfs_set_token_item_offset(leaf, item,
4702 					    ioff - data_size, &token);
4703 	}
4704 
4705 	/* shift the data */
4706 	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4707 		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4708 		      data_end, old_data - data_end);
4709 
4710 	data_end = old_data;
4711 	old_size = btrfs_item_size_nr(leaf, slot);
4712 	item = btrfs_item_nr(slot);
4713 	btrfs_set_item_size(leaf, item, old_size + data_size);
4714 	btrfs_mark_buffer_dirty(leaf);
4715 
4716 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4717 		btrfs_print_leaf(leaf);
4718 		BUG();
4719 	}
4720 }
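
/*
 * Hedged usage sketch: grow the current item by 'extra' bytes and fill
 * the new tail, using only helpers that appear in this file.  'data'
 * and 'extra' are assumed to be validated by the caller.
 */
static void example_extend_tail(struct btrfs_fs_info *fs_info,
				struct btrfs_path *path,
				const void *data, u32 extra)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);

	/* the existing bytes stay put; new space appears after them */
	btrfs_extend_item(fs_info, path, extra);
	write_extent_buffer(leaf, data,
			    btrfs_item_ptr_offset(leaf, path->slots[0]) +
			    old_size, extra);
}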
4721 
4722 /*
4723  * this is a helper for btrfs_insert_empty_items, the main goal here is
4724  * to save stack depth by doing the bulk of the work in a function
4725  * that doesn't call btrfs_search_slot
4726  */
4727 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4728 			    const struct btrfs_key *cpu_key, u32 *data_size,
4729 			    u32 total_data, u32 total_size, int nr)
4730 {
4731 	struct btrfs_fs_info *fs_info = root->fs_info;
4732 	struct btrfs_item *item;
4733 	int i;
4734 	u32 nritems;
4735 	unsigned int data_end;
4736 	struct btrfs_disk_key disk_key;
4737 	struct extent_buffer *leaf;
4738 	int slot;
4739 	struct btrfs_map_token token;
4740 
4741 	if (path->slots[0] == 0) {
4742 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4743 		fixup_low_keys(path, &disk_key, 1);
4744 	}
4745 	btrfs_unlock_up_safe(path, 1);
4746 
4747 	btrfs_init_map_token(&token);
4748 
4749 	leaf = path->nodes[0];
4750 	slot = path->slots[0];
4751 
4752 	nritems = btrfs_header_nritems(leaf);
4753 	data_end = leaf_data_end(fs_info, leaf);
4754 
4755 	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4756 		btrfs_print_leaf(leaf);
4757 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
4758 			   total_size, btrfs_leaf_free_space(fs_info, leaf));
4759 		BUG();
4760 	}
4761 
4762 	if (slot != nritems) {
4763 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4764 
4765 		if (old_data < data_end) {
4766 			btrfs_print_leaf(leaf);
4767 			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4768 				   slot, old_data, data_end);
4769 			BUG_ON(1);
4770 		}
4771 		/*
4772 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4773 		 */
4774 		/* first correct the data pointers */
4775 		for (i = slot; i < nritems; i++) {
4776 			u32 ioff;
4777 
4778 			item = btrfs_item_nr(i);
4779 			ioff = btrfs_token_item_offset(leaf, item, &token);
4780 			btrfs_set_token_item_offset(leaf, item,
4781 						    ioff - total_data, &token);
4782 		}
4783 		/* shift the items */
4784 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4785 			      btrfs_item_nr_offset(slot),
4786 			      (nritems - slot) * sizeof(struct btrfs_item));
4787 
4788 		/* shift the data */
4789 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4790 			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4791 			      data_end, old_data - data_end);
4792 		data_end = old_data;
4793 	}
4794 
4795 	/* setup the item for the new data */
4796 	for (i = 0; i < nr; i++) {
4797 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4798 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4799 		item = btrfs_item_nr(slot + i);
4800 		btrfs_set_token_item_offset(leaf, item,
4801 					    data_end - data_size[i], &token);
4802 		data_end -= data_size[i];
4803 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4804 	}
4805 
4806 	btrfs_set_header_nritems(leaf, nritems + nr);
4807 	btrfs_mark_buffer_dirty(leaf);
4808 
4809 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4810 		btrfs_print_leaf(leaf);
4811 		BUG();
4812 	}
4813 }
4814 
4815 /*
4816  * Given a key and some data, insert items into the tree.
4817  * This does all the path init required, making room in the tree if needed.
4818  */
4819 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4820 			    struct btrfs_root *root,
4821 			    struct btrfs_path *path,
4822 			    const struct btrfs_key *cpu_key, u32 *data_size,
4823 			    int nr)
4824 {
4825 	int ret = 0;
4826 	int slot;
4827 	int i;
4828 	u32 total_size = 0;
4829 	u32 total_data = 0;
4830 
4831 	for (i = 0; i < nr; i++)
4832 		total_data += data_size[i];
4833 
4834 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4835 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4836 	if (ret == 0)
4837 		return -EEXIST;
4838 	if (ret < 0)
4839 		return ret;
4840 
4841 	slot = path->slots[0];
4842 	BUG_ON(slot < 0);
4843 
4844 	setup_items_for_insert(root, path, cpu_key, data_size,
4845 			       total_data, total_size, nr);
4846 	return 0;
4847 }
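
/*
 * Hedged usage sketch: reserve room for two adjacent items with a single
 * search, then fill them.  Keys, sizes and the payload handling are
 * invented for illustration; the keys must be passed in sorted order.
 */
static int example_insert_pair(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *first_key)
{
	struct btrfs_path *path;
	struct btrfs_key keys[2];
	u32 sizes[2] = { 8, 8 };	/* hypothetical payload sizes */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	keys[0] = *first_key;
	keys[1] = *first_key;
	keys[1].offset++;	/* hypothetical second key, sorts after */

	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (!ret) {
		/* path->slots[0] points at keys[0]; write payloads, then: */
		btrfs_mark_buffer_dirty(path->nodes[0]);
	}
	btrfs_free_path(path);
	return ret;
}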
4848 
4849 /*
4850  * Given a key and some data, insert an item into the tree.
4851  * This does all the path init required, making room in the tree if needed.
4852  */
4853 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4854 		      const struct btrfs_key *cpu_key, void *data,
4855 		      u32 data_size)
4856 {
4857 	int ret = 0;
4858 	struct btrfs_path *path;
4859 	struct extent_buffer *leaf;
4860 	unsigned long ptr;
4861 
4862 	path = btrfs_alloc_path();
4863 	if (!path)
4864 		return -ENOMEM;
4865 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4866 	if (!ret) {
4867 		leaf = path->nodes[0];
4868 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4869 		write_extent_buffer(leaf, data, ptr, data_size);
4870 		btrfs_mark_buffer_dirty(leaf);
4871 	}
4872 	btrfs_free_path(path);
4873 	return ret;
4874 }
4875 
4876 /*
4877  * delete the pointer from a given node.
4878  *
4879  * the tree should have been previously balanced so the deletion does not
4880  * empty a node.
4881  */
4882 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4883 		    int level, int slot)
4884 {
4885 	struct extent_buffer *parent = path->nodes[level];
4886 	u32 nritems;
4887 	int ret;
4888 
4889 	nritems = btrfs_header_nritems(parent);
4890 	if (slot != nritems - 1) {
4891 		if (level) {
4892 			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4893 					nritems - slot - 1);
4894 			BUG_ON(ret < 0);
4895 		}
4896 		memmove_extent_buffer(parent,
4897 			      btrfs_node_key_ptr_offset(slot),
4898 			      btrfs_node_key_ptr_offset(slot + 1),
4899 			      sizeof(struct btrfs_key_ptr) *
4900 			      (nritems - slot - 1));
4901 	} else if (level) {
4902 		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4903 				GFP_NOFS);
4904 		BUG_ON(ret < 0);
4905 	}
4906 
4907 	nritems--;
4908 	btrfs_set_header_nritems(parent, nritems);
4909 	if (nritems == 0 && parent == root->node) {
4910 		BUG_ON(btrfs_header_level(root->node) != 1);
4911 		/* just turn the root into a leaf and break */
4912 		btrfs_set_header_level(root->node, 0);
4913 	} else if (slot == 0) {
4914 		struct btrfs_disk_key disk_key;
4915 
4916 		btrfs_node_key(parent, &disk_key, 0);
4917 		fixup_low_keys(path, &disk_key, level + 1);
4918 	}
4919 	btrfs_mark_buffer_dirty(parent);
4920 }
4921 
4922 /*
4923  * a helper function to delete the leaf pointed to by path->slots[1] and
4924  * path->nodes[1].
4925  *
4926  * This deletes the pointer in path->nodes[1] and frees the leaf
4927  * block extent.
4928  *
4929  * The path must have already been setup for deleting the leaf, including
4930  * all the proper balancing.  path->nodes[1] must be locked.
4931  */
4932 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4933 				    struct btrfs_root *root,
4934 				    struct btrfs_path *path,
4935 				    struct extent_buffer *leaf)
4936 {
4937 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4938 	del_ptr(root, path, 1, path->slots[1]);
4939 
4940 	/*
4941 	 * btrfs_free_extent is expensive, we want to make sure we
4942 	 * aren't holding any locks when we call it
4943 	 */
4944 	btrfs_unlock_up_safe(path, 0);
4945 
4946 	root_sub_used(root, leaf->len);
4947 
4948 	extent_buffer_get(leaf);
4949 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4950 	free_extent_buffer_stale(leaf);
4951 }
4952 /*
4953  * delete the item at the leaf level in path.  If that empties
4954  * the leaf, remove it from the tree
4955  */
4956 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4957 		    struct btrfs_path *path, int slot, int nr)
4958 {
4959 	struct btrfs_fs_info *fs_info = root->fs_info;
4960 	struct extent_buffer *leaf;
4961 	struct btrfs_item *item;
4962 	u32 last_off;
4963 	u32 dsize = 0;
4964 	int ret = 0;
4965 	int wret;
4966 	int i;
4967 	u32 nritems;
4968 	struct btrfs_map_token token;
4969 
4970 	btrfs_init_map_token(&token);
4971 
4972 	leaf = path->nodes[0];
4973 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4974 
4975 	for (i = 0; i < nr; i++)
4976 		dsize += btrfs_item_size_nr(leaf, slot + i);
4977 
4978 	nritems = btrfs_header_nritems(leaf);
4979 
4980 	if (slot + nr != nritems) {
4981 		int data_end = leaf_data_end(fs_info, leaf);
4982 
4983 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4984 			      data_end + dsize,
4985 			      BTRFS_LEAF_DATA_OFFSET + data_end,
4986 			      last_off - data_end);
4987 
4988 		for (i = slot + nr; i < nritems; i++) {
4989 			u32 ioff;
4990 
4991 			item = btrfs_item_nr(i);
4992 			ioff = btrfs_token_item_offset(leaf, item, &token);
4993 			btrfs_set_token_item_offset(leaf, item,
4994 						    ioff + dsize, &token);
4995 		}
4996 
4997 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4998 			      btrfs_item_nr_offset(slot + nr),
4999 			      sizeof(struct btrfs_item) *
5000 			      (nritems - slot - nr));
5001 	}
5002 	btrfs_set_header_nritems(leaf, nritems - nr);
5003 	nritems -= nr;
5004 
5005 	/* delete the leaf if we've emptied it */
5006 	if (nritems == 0) {
5007 		if (leaf == root->node) {
5008 			btrfs_set_header_level(leaf, 0);
5009 		} else {
5010 			btrfs_set_path_blocking(path);
5011 			clean_tree_block(fs_info, leaf);
5012 			btrfs_del_leaf(trans, root, path, leaf);
5013 		}
5014 	} else {
5015 		int used = leaf_space_used(leaf, 0, nritems);
5016 		if (slot == 0) {
5017 			struct btrfs_disk_key disk_key;
5018 
5019 			btrfs_item_key(leaf, &disk_key, 0);
5020 			fixup_low_keys(path, &disk_key, 1);
5021 		}
5022 
5023 		/* delete the leaf if it is mostly empty */
5024 		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5025 			/* push_leaf_left fixes the path.
5026 			 * make sure the path still points to our leaf
5027 			 * for possible call to del_ptr below
5028 			 */
5029 			slot = path->slots[1];
5030 			extent_buffer_get(leaf);
5031 
5032 			btrfs_set_path_blocking(path);
5033 			wret = push_leaf_left(trans, root, path, 1, 1,
5034 					      1, (u32)-1);
5035 			if (wret < 0 && wret != -ENOSPC)
5036 				ret = wret;
5037 
5038 			if (path->nodes[0] == leaf &&
5039 			    btrfs_header_nritems(leaf)) {
5040 				wret = push_leaf_right(trans, root, path, 1,
5041 						       1, 1, 0);
5042 				if (wret < 0 && wret != -ENOSPC)
5043 					ret = wret;
5044 			}
5045 
5046 			if (btrfs_header_nritems(leaf) == 0) {
5047 				path->slots[1] = slot;
5048 				btrfs_del_leaf(trans, root, path, leaf);
5049 				free_extent_buffer(leaf);
5050 				ret = 0;
5051 			} else {
5052 				/* if we're still in the path, make sure
5053 				 * we're dirty.  Otherwise, one of the
5054 				 * push_leaf functions must have already
5055 				 * dirtied this buffer
5056 				 */
5057 				if (path->nodes[0] == leaf)
5058 					btrfs_mark_buffer_dirty(leaf);
5059 				free_extent_buffer(leaf);
5060 			}
5061 		} else {
5062 			btrfs_mark_buffer_dirty(leaf);
5063 		}
5064 	}
5065 	return ret;
5066 }
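
/*
 * Hedged usage sketch: look up an exact key and delete the single item
 * found there.  ins_len = -1 tells btrfs_search_slot() we intend to
 * delete; the helper name and the -ENOENT policy are invented.
 */
static int example_delete_one(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret == 0)
		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
	else if (ret > 0)
		ret = -ENOENT;	/* exact key not present */
	btrfs_free_path(path);
	return ret;
}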
5067 
5068 /*
5069  * search the tree again to find a leaf with lesser keys
5070  * returns 0 if it found something or 1 if there are no lesser leaves.
5071  * returns < 0 on io errors.
5072  *
5073  * This may release the path, and so you may lose any locks held at the
5074  * time you call it.
5075  */
5076 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5077 {
5078 	struct btrfs_key key;
5079 	struct btrfs_disk_key found_key;
5080 	int ret;
5081 
5082 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5083 
5084 	if (key.offset > 0) {
5085 		key.offset--;
5086 	} else if (key.type > 0) {
5087 		key.type--;
5088 		key.offset = (u64)-1;
5089 	} else if (key.objectid > 0) {
5090 		key.objectid--;
5091 		key.type = (u8)-1;
5092 		key.offset = (u64)-1;
5093 	} else {
5094 		return 1;
5095 	}
5096 
5097 	btrfs_release_path(path);
5098 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5099 	if (ret < 0)
5100 		return ret;
5101 	btrfs_item_key(path->nodes[0], &found_key, 0);
5102 	ret = comp_keys(&found_key, &key);
5103 	/*
5104 	 * We might have had an item with the previous key in the tree right
5105 	 * before we released our path. And after we released our path, that
5106 	 * item might have been pushed to the first slot (0) of the leaf we
5107 	 * were holding due to a tree balance. Alternatively, an item with the
5108 	 * previous key can exist as the only element of a leaf (big fat item).
5109 	 * Therefore account for these 2 cases, so that our callers (like
5110 	 * btrfs_previous_item) don't miss an existing item with a key matching
5111 	 * the previous key we computed above.
5112 	 */
5113 	if (ret <= 0)
5114 		return 0;
5115 	return 1;
5116 }
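
/*
 * Worked example of the key stepping above, with made up values: from
 * (objectid 256, type 1, offset 0) there is no smaller offset, so we
 * step to (objectid 256, type 0, offset (u64)-1); only when objectid,
 * type and offset are all zero is there no lesser key at all.
 */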
5117 
5118 /*
5119  * A helper function to walk down the tree starting at min_key, and looking
5120  * for nodes or leaves that have a minimum transaction id.
5121  * This is used by the btree defrag code, and tree logging
5122  *
5123  * This does not cow, but it does stuff the starting key it finds back
5124  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5125  * key and get a writable path.
5126  *
5127  * This honors path->lowest_level to prevent descent past a given level
5128  * of the tree.
5129  *
5130  * min_trans indicates the oldest transaction that you are interested
5131  * in walking through.  Any nodes or leaves older than min_trans are
5132  * skipped over (without reading them).
5133  *
5134  * returns zero if something useful was found, < 0 on error and 1 if there
5135  * was nothing in the tree that matched the search criteria.
5136  */
5137 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5138 			 struct btrfs_path *path,
5139 			 u64 min_trans)
5140 {
5141 	struct btrfs_fs_info *fs_info = root->fs_info;
5142 	struct extent_buffer *cur;
5143 	struct btrfs_key found_key;
5144 	int slot;
5145 	int sret;
5146 	u32 nritems;
5147 	int level;
5148 	int ret = 1;
5149 	int keep_locks = path->keep_locks;
5150 
5151 	path->keep_locks = 1;
5152 again:
5153 	cur = btrfs_read_lock_root_node(root);
5154 	level = btrfs_header_level(cur);
5155 	WARN_ON(path->nodes[level]);
5156 	path->nodes[level] = cur;
5157 	path->locks[level] = BTRFS_READ_LOCK;
5158 
5159 	if (btrfs_header_generation(cur) < min_trans) {
5160 		ret = 1;
5161 		goto out;
5162 	}
5163 	while (1) {
5164 		nritems = btrfs_header_nritems(cur);
5165 		level = btrfs_header_level(cur);
5166 		sret = btrfs_bin_search(cur, min_key, level, &slot);
5167 
5168 		/* at the lowest level, we're done, setup the path and exit */
5169 		if (level == path->lowest_level) {
5170 			if (slot >= nritems)
5171 				goto find_next_key;
5172 			ret = 0;
5173 			path->slots[level] = slot;
5174 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5175 			goto out;
5176 		}
5177 		if (sret && slot > 0)
5178 			slot--;
5179 		/*
5180 		 * check this node pointer against the min_trans parameter.
5181 		 * If it is too old, skip to the next one.
5182 		 */
5183 		while (slot < nritems) {
5184 			u64 gen;
5185 
5186 			gen = btrfs_node_ptr_generation(cur, slot);
5187 			if (gen < min_trans) {
5188 				slot++;
5189 				continue;
5190 			}
5191 			break;
5192 		}
5193 find_next_key:
5194 		/*
5195 		 * we didn't find a candidate key in this node, walk forward
5196 		 * and find another one
5197 		 */
5198 		if (slot >= nritems) {
5199 			path->slots[level] = slot;
5200 			btrfs_set_path_blocking(path);
5201 			sret = btrfs_find_next_key(root, path, min_key, level,
5202 						  min_trans);
5203 			if (sret == 0) {
5204 				btrfs_release_path(path);
5205 				goto again;
5206 			} else {
5207 				goto out;
5208 			}
5209 		}
5210 		/* save our key for returning back */
5211 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5212 		path->slots[level] = slot;
5213 		if (level == path->lowest_level) {
5214 			ret = 0;
5215 			goto out;
5216 		}
5217 		btrfs_set_path_blocking(path);
5218 		cur = read_node_slot(fs_info, cur, slot);
5219 		if (IS_ERR(cur)) {
5220 			ret = PTR_ERR(cur);
5221 			goto out;
5222 		}
5223 
5224 		btrfs_tree_read_lock(cur);
5225 
5226 		path->locks[level - 1] = BTRFS_READ_LOCK;
5227 		path->nodes[level - 1] = cur;
5228 		unlock_up(path, level, 1, 0, NULL);
5229 	}
5230 out:
5231 	path->keep_locks = keep_locks;
5232 	if (ret == 0) {
5233 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5234 		btrfs_set_path_blocking(path);
5235 		memcpy(min_key, &found_key, sizeof(found_key));
5236 	}
5237 	return ret;
5238 }
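
/*
 * Hedged usage sketch: iterate over everything modified since min_trans,
 * in the spirit of the defrag and tree-log callers.  The processing hook
 * is elided and the key stepping is simplified for illustration.
 */
static int example_walk_since(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	min_key.objectid = 0;
	min_key.type = 0;
	min_key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, path, min_trans);
		if (ret)	/* 1: nothing newer left, < 0: error */
			break;

		/* process path->nodes[path->lowest_level] here */

		btrfs_release_path(path);
		if (min_key.offset == (u64)-1)
			break;
		min_key.offset++;	/* step past the key just returned */
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}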
5239 
5240 static int tree_move_down(struct btrfs_fs_info *fs_info,
5241 			   struct btrfs_path *path,
5242 			   int *level)
5243 {
5244 	struct extent_buffer *eb;
5245 
5246 	BUG_ON(*level == 0);
5247 	eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5248 	if (IS_ERR(eb))
5249 		return PTR_ERR(eb);
5250 
5251 	path->nodes[*level - 1] = eb;
5252 	path->slots[*level - 1] = 0;
5253 	(*level)--;
5254 	return 0;
5255 }
5256 
5257 static int tree_move_next_or_upnext(struct btrfs_path *path,
5258 				    int *level, int root_level)
5259 {
5260 	int ret = 0;
5261 	int nritems;
5262 	nritems = btrfs_header_nritems(path->nodes[*level]);
5263 
5264 	path->slots[*level]++;
5265 
5266 	while (path->slots[*level] >= nritems) {
5267 		if (*level == root_level)
5268 			return -1;
5269 
5270 		/* move upnext */
5271 		path->slots[*level] = 0;
5272 		free_extent_buffer(path->nodes[*level]);
5273 		path->nodes[*level] = NULL;
5274 		(*level)++;
5275 		path->slots[*level]++;
5276 
5277 		nritems = btrfs_header_nritems(path->nodes[*level]);
5278 		ret = 1;
5279 	}
5280 	return ret;
5281 }
5282 
5283 /*
5284  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5285  * or down.
5286  */
5287 static int tree_advance(struct btrfs_fs_info *fs_info,
5288 			struct btrfs_path *path,
5289 			int *level, int root_level,
5290 			int allow_down,
5291 			struct btrfs_key *key)
5292 {
5293 	int ret;
5294 
5295 	if (*level == 0 || !allow_down) {
5296 		ret = tree_move_next_or_upnext(path, level, root_level);
5297 	} else {
5298 		ret = tree_move_down(fs_info, path, level);
5299 	}
5300 	if (ret >= 0) {
5301 		if (*level == 0)
5302 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5303 					path->slots[*level]);
5304 		else
5305 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5306 					path->slots[*level]);
5307 	}
5308 	return ret;
5309 }
5310 
5311 static int tree_compare_item(struct btrfs_path *left_path,
5312 			     struct btrfs_path *right_path,
5313 			     char *tmp_buf)
5314 {
5315 	int cmp;
5316 	int len1, len2;
5317 	unsigned long off1, off2;
5318 
5319 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5320 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5321 	if (len1 != len2)
5322 		return 1;
5323 
5324 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5325 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5326 				right_path->slots[0]);
5327 
5328 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5329 
5330 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5331 	if (cmp)
5332 		return 1;
5333 	return 0;
5334 }
5335 
5336 #define ADVANCE 1
5337 #define ADVANCE_ONLY_NEXT -1
5338 
5339 /*
5340  * This function compares two trees and calls the provided callback for
5341  * every changed/new/deleted item it finds.
5342  * If shared tree blocks are encountered, whole subtrees are skipped, making
5343  * the compare pretty fast on snapshotted subvolumes.
5344  *
5345  * This currently works on commit roots only. As commit roots are read only,
5346  * we don't do any locking. The commit roots are protected with transactions.
5347  * Transactions are ended and rejoined when a commit is tried in between.
5348  *
5349  * This function checks for modifications done to the trees while comparing.
5350  * If it detects a change, it aborts immediately.
5351  */
5352 int btrfs_compare_trees(struct btrfs_root *left_root,
5353 			struct btrfs_root *right_root,
5354 			btrfs_changed_cb_t changed_cb, void *ctx)
5355 {
5356 	struct btrfs_fs_info *fs_info = left_root->fs_info;
5357 	int ret;
5358 	int cmp;
5359 	struct btrfs_path *left_path = NULL;
5360 	struct btrfs_path *right_path = NULL;
5361 	struct btrfs_key left_key;
5362 	struct btrfs_key right_key;
5363 	char *tmp_buf = NULL;
5364 	int left_root_level;
5365 	int right_root_level;
5366 	int left_level;
5367 	int right_level;
5368 	int left_end_reached;
5369 	int right_end_reached;
5370 	int advance_left;
5371 	int advance_right;
5372 	u64 left_blockptr;
5373 	u64 right_blockptr;
5374 	u64 left_gen;
5375 	u64 right_gen;
5376 
5377 	left_path = btrfs_alloc_path();
5378 	if (!left_path) {
5379 		ret = -ENOMEM;
5380 		goto out;
5381 	}
5382 	right_path = btrfs_alloc_path();
5383 	if (!right_path) {
5384 		ret = -ENOMEM;
5385 		goto out;
5386 	}
5387 
5388 	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5389 	if (!tmp_buf) {
5390 		ret = -ENOMEM;
5391 		goto out;
5392 	}
5393 
5394 	left_path->search_commit_root = 1;
5395 	left_path->skip_locking = 1;
5396 	right_path->search_commit_root = 1;
5397 	right_path->skip_locking = 1;
5398 
5399 	/*
5400 	 * Strategy: Go to the first items of both trees. Then do
5401 	 *
5402 	 * If both trees are at level 0
5403 	 *   Compare keys of current items
5404 	 *     If left < right treat left item as new, advance left tree
5405 	 *       and repeat
5406 	 *     If left > right treat right item as deleted, advance right tree
5407 	 *       and repeat
5408 	 *     If left == right do deep compare of items, treat as changed if
5409 	 *       needed, advance both trees and repeat
5410 	 * If both trees are at the same level but not at level 0
5411 	 * Compare keys of current nodes/leaves
5412 	 *     If left < right advance left tree and repeat
5413 	 *     If left > right advance right tree and repeat
5414 	 *     If left == right compare blockptrs of the next nodes/leafs
5415 	 *       If they match advance both trees but stay at the same level
5416 	 *         and repeat
5417 	 *       If they don't match advance both trees while allowing to go
5418 	 *         deeper and repeat
5419 	 * If tree levels are different
5420 	 *   Advance the tree that needs it and repeat
5421 	 *
5422 	 * Advancing a tree means:
5423 	 *   If we are at level 0, try to go to the next slot. If that's not
5424 	 *   possible, go one level up and repeat. Stop when we found a level
5425 	 *   where we could go to the next slot. We may at this point be on a
5426 	 *   node or a leaf.
5427 	 *
5428 	 *   If we are not at level 0 and not on shared tree blocks, go one
5429 	 *   level deeper.
5430 	 *
5431 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5432 	 *   the right if possible or go up and right.
5433 	 */
5434 
5435 	down_read(&fs_info->commit_root_sem);
5436 	left_level = btrfs_header_level(left_root->commit_root);
5437 	left_root_level = left_level;
5438 	left_path->nodes[left_level] =
5439 			btrfs_clone_extent_buffer(left_root->commit_root);
5440 	if (!left_path->nodes[left_level]) {
5441 		up_read(&fs_info->commit_root_sem);
5442 		ret = -ENOMEM;
5443 		goto out;
5444 	}
5445 
5446 	right_level = btrfs_header_level(right_root->commit_root);
5447 	right_root_level = right_level;
5448 	right_path->nodes[right_level] =
5449 			btrfs_clone_extent_buffer(right_root->commit_root);
5450 	if (!right_path->nodes[right_level]) {
5451 		up_read(&fs_info->commit_root_sem);
5452 		ret = -ENOMEM;
5453 		goto out;
5454 	}
5455 	up_read(&fs_info->commit_root_sem);

	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	left_end_reached = right_end_reached = 0;
	advance_left = advance_right = 0;

	while (1) {
		if (advance_left && !left_end_reached) {
			ret = tree_advance(fs_info, left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(fs_info, right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out;
		} else if (left_end_reached) {
			if (right_level == 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			if (left_level == 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * go any deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}
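
/*
 * Illustrative usage sketch (not part of the original file): a minimal
 * changed_cb and a call into btrfs_compare_trees(), roughly the shape in
 * which the send code consumes this API.  The callback signature is
 * assumed to match the btrfs_changed_cb_t typedef declared in ctree.h for
 * this kernel version; struct example_ctx, example_changed_cb and
 * example_diff_roots are hypothetical names.
 */
struct example_ctx {
	u64 new_items;		/* items only present in the left tree */
	u64 deleted_items;	/* items only present in the right tree */
	u64 changed_items;	/* same key, different item data */
};

static int __maybe_unused example_changed_cb(struct btrfs_path *left_path,
					     struct btrfs_path *right_path,
					     struct btrfs_key *key,
					     enum btrfs_compare_tree_result result,
					     void *ctx)
{
	struct example_ctx *ec = ctx;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		ec->new_items++;
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		ec->deleted_items++;
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		ec->changed_items++;
		break;
	case BTRFS_COMPARE_TREE_SAME:
		break;
	}
	/* returning < 0 aborts the whole comparison */
	return 0;
}

static int __maybe_unused example_diff_roots(struct btrfs_root *left_root,
					     struct btrfs_root *right_root)
{
	struct example_ctx ec = {};

	return btrfs_compare_trees(left_root, right_root,
				   example_changed_cb, &ec);
}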

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fix up the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameter.
 *
 * 0 is returned if another key is found, < 0 on error, and 1 is
 * returned if there are no higher keys in the tree.
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
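
/*
 * Illustrative usage sketch (not part of the original file): the calling
 * convention described above.  Search with keep_locks set, then ask for
 * the key following the search position at level 0, skipping nothing
 * (min_trans == 0).  example_peek_next_key is a hypothetical helper.
 */
static int __maybe_unused example_peek_next_key(struct btrfs_root *root,
						const struct btrfs_key *search_key,
						struct btrfs_key *next_key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->keep_locks = 1;
	ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* 0: next key stored in *next_key, 1: no higher key, < 0: error */
	ret = btrfs_find_next_key(root, path, next_key, 0, 0);
out:
	btrfs_free_path(path);
	return ret;
}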

/*
 * search the tree again to find a leaf with greater keys.
 * returns 0 if it found something, 1 if there are no greater leaves
 * and < 0 on I/O errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	return btrfs_next_old_leaf(root, path, 0);
}
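
/*
 * Illustrative usage sketch (not part of the original file): the canonical
 * full-tree iteration built on top of btrfs_next_leaf().  Start from the
 * smallest possible key and visit every item; example_walk_tree is a
 * hypothetical helper and the item processing is left as a comment.
 */
static int __maybe_unused example_walk_tree(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key = { 0 };	/* objectid/type/offset all zero */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;	/* no more leaves */
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* ... look at the item behind 'key' here ... */
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}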

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * By releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * The above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between
	 *   leaves gets another one with a bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (especially in __btrfs_drop_extents()).
	 *
	 * A bit more explanation of this check: with ret > 0 the key wasn't
	 * found and the path points to the slot where it would be inserted,
	 * so the item at path->slots[0] must be the bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, which holds that lock
				 * while itself waiting for the leaf we have
				 * currently locked.  To resolve this, we give
				 * up our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
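
/*
 * Illustrative usage sketch (not part of the original file): iterating a
 * past version of a tree with btrfs_next_old_leaf().  The caller is
 * assumed to have obtained 'time_seq' from the tree mod log (see
 * btrfs_get_tree_mod_seq()) and to have positioned 'path' with
 * btrfs_search_old_slot(); example_count_old_items is a hypothetical
 * helper.
 */
static int __maybe_unused example_count_old_items(struct btrfs_root *root,
						  struct btrfs_path *path,
						  u64 time_seq, u64 *nr_items)
{
	int ret;

	*nr_items = 0;
	while (1) {
		u32 nritems = btrfs_header_nritems(path->nodes[0]);

		/* count the remaining items of the current leaf */
		*nr_items += nritems - path->slots[0];
		ret = btrfs_next_old_leaf(root, path, time_seq);
		if (ret)	/* 1: walked past the last leaf, < 0: error */
			break;
	}
	return ret < 0 ? ret : 0;
}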

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}
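
/*
 * Illustrative usage sketch (not part of the original file): the usual
 * pairing of a forward search with btrfs_previous_item().  Searching for
 * offset (u64)-1 positions the path just past the last candidate, and the
 * backwards walk then lands on the last item of the wanted type, if any.
 * example_find_last_item is a hypothetical helper.
 */
static int __maybe_unused example_find_last_item(struct btrfs_root *root,
						 struct btrfs_path *path,
						 u64 objectid, int type)
{
	struct btrfs_key key;
	int ret;

	key.objectid = objectid;
	key.type = type;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* 0: path now points at a matching item, 1: none exists */
	return btrfs_previous_item(root, path, objectid, type);
}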

/*
 * search the extent tree to find a previous Metadata/Data extent item
 * with min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
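
/*
 * Illustrative usage sketch (not part of the original file): locating the
 * extent item at or before a logical address, similar in spirit to how
 * the scrub code walks the extent tree.  BTRFS_METADATA_ITEM_KEY sorts
 * right after BTRFS_EXTENT_ITEM_KEY, so searching for it with offset
 * (u64)-1 lands just past any extent item for 'logical'.
 * example_find_extent_before is a hypothetical helper.
 */
static int __maybe_unused example_find_extent_before(struct btrfs_root *extent_root,
						     struct btrfs_path *path,
						     u64 logical)
{
	struct btrfs_key key;
	int ret;

	key.objectid = logical;
	key.type = BTRFS_METADATA_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* walk backwards to the previous Metadata/Data extent item */
	return btrfs_previous_extent_item(extent_root, path, 0);
}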