xref: /openbmc/linux/fs/btrfs/ctree.c (revision bbde9fc1824aab58bc78c084163007dd6c03fe5b)
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27 
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29 		      *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31 		      *root, struct btrfs_key *ins_key,
32 		      struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34 			  struct btrfs_root *root, struct extent_buffer *dst,
35 			  struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37 			      struct btrfs_root *root,
38 			      struct extent_buffer *dst_buf,
39 			      struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41 		    int level, int slot);
42 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43 				 struct extent_buffer *eb);
44 
45 struct btrfs_path *btrfs_alloc_path(void)
46 {
47 	struct btrfs_path *path;
48 	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
49 	return path;
50 }
51 
52 /*
53  * set all locked nodes in the path to blocking locks.  This should
54  * be done before scheduling
55  */
56 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
57 {
58 	int i;
59 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
60 		if (!p->nodes[i] || !p->locks[i])
61 			continue;
62 		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
63 		if (p->locks[i] == BTRFS_READ_LOCK)
64 			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
65 		else if (p->locks[i] == BTRFS_WRITE_LOCK)
66 			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
67 	}
68 }
69 
70 /*
71  * reset all the locked nodes in the patch to spinning locks.
72  *
73  * held is used to keep lockdep happy, when lockdep is enabled
74  * we set held to a blocking lock before we go around and
75  * retake all the spinlocks in the path.  You can safely use NULL
76  * for held
77  */
78 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
79 					struct extent_buffer *held, int held_rw)
80 {
81 	int i;
82 
83 	if (held) {
84 		btrfs_set_lock_blocking_rw(held, held_rw);
85 		if (held_rw == BTRFS_WRITE_LOCK)
86 			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
87 		else if (held_rw == BTRFS_READ_LOCK)
88 			held_rw = BTRFS_READ_LOCK_BLOCKING;
89 	}
90 	btrfs_set_path_blocking(p);
91 
92 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
93 		if (p->nodes[i] && p->locks[i]) {
94 			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
95 			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
96 				p->locks[i] = BTRFS_WRITE_LOCK;
97 			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
98 				p->locks[i] = BTRFS_READ_LOCK;
99 		}
100 	}
101 
102 	if (held)
103 		btrfs_clear_lock_blocking_rw(held, held_rw);
104 }
105 
106 /* this also releases the path */
107 void btrfs_free_path(struct btrfs_path *p)
108 {
109 	if (!p)
110 		return;
111 	btrfs_release_path(p);
112 	kmem_cache_free(btrfs_path_cachep, p);
113 }
114 
115 /*
116  * path release drops references on the extent buffers in the path
117  * and it drops any locks held by this path
118  *
119  * It is safe to call this on paths that no locks or extent buffers held.
120  */
121 noinline void btrfs_release_path(struct btrfs_path *p)
122 {
123 	int i;
124 
125 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
126 		p->slots[i] = 0;
127 		if (!p->nodes[i])
128 			continue;
129 		if (p->locks[i]) {
130 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
131 			p->locks[i] = 0;
132 		}
133 		free_extent_buffer(p->nodes[i]);
134 		p->nodes[i] = NULL;
135 	}
136 }
137 
138 /*
139  * safely gets a reference on the root node of a tree.  A lock
140  * is not taken, so a concurrent writer may put a different node
141  * at the root of the tree.  See btrfs_lock_root_node for the
142  * looping required.
143  *
144  * The extent buffer returned by this has a reference taken, so
145  * it won't disappear.  It may stop being the root of the tree
146  * at any time because there are no locks held.
147  */
148 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
149 {
150 	struct extent_buffer *eb;
151 
152 	while (1) {
153 		rcu_read_lock();
154 		eb = rcu_dereference(root->node);
155 
156 		/*
157 		 * RCU really hurts here, we could free up the root node because
158 		 * it was cow'ed but we may not get the new root node yet so do
159 		 * the inc_not_zero dance and if it doesn't work then
160 		 * synchronize_rcu and try again.
161 		 */
162 		if (atomic_inc_not_zero(&eb->refs)) {
163 			rcu_read_unlock();
164 			break;
165 		}
166 		rcu_read_unlock();
167 		synchronize_rcu();
168 	}
169 	return eb;
170 }
171 
172 /* loop around taking references on and locking the root node of the
173  * tree until you end up with a lock on the root.  A locked buffer
174  * is returned, with a reference held.
175  */
176 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
177 {
178 	struct extent_buffer *eb;
179 
180 	while (1) {
181 		eb = btrfs_root_node(root);
182 		btrfs_tree_lock(eb);
183 		if (eb == root->node)
184 			break;
185 		btrfs_tree_unlock(eb);
186 		free_extent_buffer(eb);
187 	}
188 	return eb;
189 }
190 
191 /* loop around taking references on and locking the root node of the
192  * tree until you end up with a lock on the root.  A locked buffer
193  * is returned, with a reference held.
194  */
195 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
196 {
197 	struct extent_buffer *eb;
198 
199 	while (1) {
200 		eb = btrfs_root_node(root);
201 		btrfs_tree_read_lock(eb);
202 		if (eb == root->node)
203 			break;
204 		btrfs_tree_read_unlock(eb);
205 		free_extent_buffer(eb);
206 	}
207 	return eb;
208 }
209 
210 /* cowonly root (everything not a reference counted cow subvolume), just get
211  * put onto a simple dirty list.  transaction.c walks this to make sure they
212  * get properly updated on disk.
213  */
214 static void add_root_to_dirty_list(struct btrfs_root *root)
215 {
216 	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
217 	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
218 		return;
219 
220 	spin_lock(&root->fs_info->trans_lock);
221 	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
222 		/* Want the extent tree to be the last on the list */
223 		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
224 			list_move_tail(&root->dirty_list,
225 				       &root->fs_info->dirty_cowonly_roots);
226 		else
227 			list_move(&root->dirty_list,
228 				  &root->fs_info->dirty_cowonly_roots);
229 	}
230 	spin_unlock(&root->fs_info->trans_lock);
231 }
232 
233 /*
234  * used by snapshot creation to make a copy of a root for a tree with
235  * a given objectid.  The buffer with the new root node is returned in
236  * cow_ret, and this func returns zero on success or a negative error code.
237  */
238 int btrfs_copy_root(struct btrfs_trans_handle *trans,
239 		      struct btrfs_root *root,
240 		      struct extent_buffer *buf,
241 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
242 {
243 	struct extent_buffer *cow;
244 	int ret = 0;
245 	int level;
246 	struct btrfs_disk_key disk_key;
247 
248 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
249 		trans->transid != root->fs_info->running_transaction->transid);
250 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
251 		trans->transid != root->last_trans);
252 
253 	level = btrfs_header_level(buf);
254 	if (level == 0)
255 		btrfs_item_key(buf, &disk_key, 0);
256 	else
257 		btrfs_node_key(buf, &disk_key, 0);
258 
259 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
260 			&disk_key, level, buf->start, 0);
261 	if (IS_ERR(cow))
262 		return PTR_ERR(cow);
263 
264 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
265 	btrfs_set_header_bytenr(cow, cow->start);
266 	btrfs_set_header_generation(cow, trans->transid);
267 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
268 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
269 				     BTRFS_HEADER_FLAG_RELOC);
270 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
271 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
272 	else
273 		btrfs_set_header_owner(cow, new_root_objectid);
274 
275 	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
276 			    BTRFS_FSID_SIZE);
277 
278 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
279 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
280 		ret = btrfs_inc_ref(trans, root, cow, 1);
281 	else
282 		ret = btrfs_inc_ref(trans, root, cow, 0);
283 
284 	if (ret)
285 		return ret;
286 
287 	btrfs_mark_buffer_dirty(cow);
288 	*cow_ret = cow;
289 	return 0;
290 }
291 
292 enum mod_log_op {
293 	MOD_LOG_KEY_REPLACE,
294 	MOD_LOG_KEY_ADD,
295 	MOD_LOG_KEY_REMOVE,
296 	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
297 	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
298 	MOD_LOG_MOVE_KEYS,
299 	MOD_LOG_ROOT_REPLACE,
300 };
301 
302 struct tree_mod_move {
303 	int dst_slot;
304 	int nr_items;
305 };
306 
307 struct tree_mod_root {
308 	u64 logical;
309 	u8 level;
310 };
311 
312 struct tree_mod_elem {
313 	struct rb_node node;
314 	u64 index;		/* shifted logical */
315 	u64 seq;
316 	enum mod_log_op op;
317 
318 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
319 	int slot;
320 
321 	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
322 	u64 generation;
323 
324 	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
325 	struct btrfs_disk_key key;
326 	u64 blockptr;
327 
328 	/* this is used for op == MOD_LOG_MOVE_KEYS */
329 	struct tree_mod_move move;
330 
331 	/* this is used for op == MOD_LOG_ROOT_REPLACE */
332 	struct tree_mod_root old_root;
333 };
334 
335 static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
336 {
337 	read_lock(&fs_info->tree_mod_log_lock);
338 }
339 
340 static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
341 {
342 	read_unlock(&fs_info->tree_mod_log_lock);
343 }
344 
345 static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
346 {
347 	write_lock(&fs_info->tree_mod_log_lock);
348 }
349 
350 static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
351 {
352 	write_unlock(&fs_info->tree_mod_log_lock);
353 }
354 
355 /*
356  * Pull a new tree mod seq number for our operation.
357  */
358 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
359 {
360 	return atomic64_inc_return(&fs_info->tree_mod_seq);
361 }
362 
363 /*
364  * This adds a new blocker to the tree mod log's blocker list if the @elem
365  * passed does not already have a sequence number set. So when a caller expects
366  * to record tree modifications, it should ensure to set elem->seq to zero
367  * before calling btrfs_get_tree_mod_seq.
368  * Returns a fresh, unused tree log modification sequence number, even if no new
369  * blocker was added.
370  */
371 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
372 			   struct seq_list *elem)
373 {
374 	tree_mod_log_write_lock(fs_info);
375 	spin_lock(&fs_info->tree_mod_seq_lock);
376 	if (!elem->seq) {
377 		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
378 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
379 	}
380 	spin_unlock(&fs_info->tree_mod_seq_lock);
381 	tree_mod_log_write_unlock(fs_info);
382 
383 	return elem->seq;
384 }
385 
386 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
387 			    struct seq_list *elem)
388 {
389 	struct rb_root *tm_root;
390 	struct rb_node *node;
391 	struct rb_node *next;
392 	struct seq_list *cur_elem;
393 	struct tree_mod_elem *tm;
394 	u64 min_seq = (u64)-1;
395 	u64 seq_putting = elem->seq;
396 
397 	if (!seq_putting)
398 		return;
399 
400 	spin_lock(&fs_info->tree_mod_seq_lock);
401 	list_del(&elem->list);
402 	elem->seq = 0;
403 
404 	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
405 		if (cur_elem->seq < min_seq) {
406 			if (seq_putting > cur_elem->seq) {
407 				/*
408 				 * blocker with lower sequence number exists, we
409 				 * cannot remove anything from the log
410 				 */
411 				spin_unlock(&fs_info->tree_mod_seq_lock);
412 				return;
413 			}
414 			min_seq = cur_elem->seq;
415 		}
416 	}
417 	spin_unlock(&fs_info->tree_mod_seq_lock);
418 
419 	/*
420 	 * anything that's lower than the lowest existing (read: blocked)
421 	 * sequence number can be removed from the tree.
422 	 */
423 	tree_mod_log_write_lock(fs_info);
424 	tm_root = &fs_info->tree_mod_log;
425 	for (node = rb_first(tm_root); node; node = next) {
426 		next = rb_next(node);
427 		tm = container_of(node, struct tree_mod_elem, node);
428 		if (tm->seq > min_seq)
429 			continue;
430 		rb_erase(node, tm_root);
431 		kfree(tm);
432 	}
433 	tree_mod_log_write_unlock(fs_info);
434 }
435 
436 /*
437  * key order of the log:
438  *       index -> sequence
439  *
440  * the index is the shifted logical of the *new* root node for root replace
441  * operations, or the shifted logical of the affected block for all other
442  * operations.
443  *
444  * Note: must be called with write lock (tree_mod_log_write_lock).
445  */
446 static noinline int
447 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
448 {
449 	struct rb_root *tm_root;
450 	struct rb_node **new;
451 	struct rb_node *parent = NULL;
452 	struct tree_mod_elem *cur;
453 
454 	BUG_ON(!tm);
455 
456 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
457 
458 	tm_root = &fs_info->tree_mod_log;
459 	new = &tm_root->rb_node;
460 	while (*new) {
461 		cur = container_of(*new, struct tree_mod_elem, node);
462 		parent = *new;
463 		if (cur->index < tm->index)
464 			new = &((*new)->rb_left);
465 		else if (cur->index > tm->index)
466 			new = &((*new)->rb_right);
467 		else if (cur->seq < tm->seq)
468 			new = &((*new)->rb_left);
469 		else if (cur->seq > tm->seq)
470 			new = &((*new)->rb_right);
471 		else
472 			return -EEXIST;
473 	}
474 
475 	rb_link_node(&tm->node, parent, new);
476 	rb_insert_color(&tm->node, tm_root);
477 	return 0;
478 }
479 
480 /*
481  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
482  * returns zero with the tree_mod_log_lock acquired. The caller must hold
483  * this until all tree mod log insertions are recorded in the rb tree and then
484  * call tree_mod_log_write_unlock() to release.
485  */
486 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
487 				    struct extent_buffer *eb) {
488 	smp_mb();
489 	if (list_empty(&(fs_info)->tree_mod_seq_list))
490 		return 1;
491 	if (eb && btrfs_header_level(eb) == 0)
492 		return 1;
493 
494 	tree_mod_log_write_lock(fs_info);
495 	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
496 		tree_mod_log_write_unlock(fs_info);
497 		return 1;
498 	}
499 
500 	return 0;
501 }
502 
503 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
504 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
505 				    struct extent_buffer *eb)
506 {
507 	smp_mb();
508 	if (list_empty(&(fs_info)->tree_mod_seq_list))
509 		return 0;
510 	if (eb && btrfs_header_level(eb) == 0)
511 		return 0;
512 
513 	return 1;
514 }
515 
516 static struct tree_mod_elem *
517 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
518 		    enum mod_log_op op, gfp_t flags)
519 {
520 	struct tree_mod_elem *tm;
521 
522 	tm = kzalloc(sizeof(*tm), flags);
523 	if (!tm)
524 		return NULL;
525 
526 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
527 	if (op != MOD_LOG_KEY_ADD) {
528 		btrfs_node_key(eb, &tm->key, slot);
529 		tm->blockptr = btrfs_node_blockptr(eb, slot);
530 	}
531 	tm->op = op;
532 	tm->slot = slot;
533 	tm->generation = btrfs_node_ptr_generation(eb, slot);
534 	RB_CLEAR_NODE(&tm->node);
535 
536 	return tm;
537 }
538 
539 static noinline int
540 tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
541 			struct extent_buffer *eb, int slot,
542 			enum mod_log_op op, gfp_t flags)
543 {
544 	struct tree_mod_elem *tm;
545 	int ret;
546 
547 	if (!tree_mod_need_log(fs_info, eb))
548 		return 0;
549 
550 	tm = alloc_tree_mod_elem(eb, slot, op, flags);
551 	if (!tm)
552 		return -ENOMEM;
553 
554 	if (tree_mod_dont_log(fs_info, eb)) {
555 		kfree(tm);
556 		return 0;
557 	}
558 
559 	ret = __tree_mod_log_insert(fs_info, tm);
560 	tree_mod_log_write_unlock(fs_info);
561 	if (ret)
562 		kfree(tm);
563 
564 	return ret;
565 }
566 
567 static noinline int
568 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
569 			 struct extent_buffer *eb, int dst_slot, int src_slot,
570 			 int nr_items, gfp_t flags)
571 {
572 	struct tree_mod_elem *tm = NULL;
573 	struct tree_mod_elem **tm_list = NULL;
574 	int ret = 0;
575 	int i;
576 	int locked = 0;
577 
578 	if (!tree_mod_need_log(fs_info, eb))
579 		return 0;
580 
581 	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
582 	if (!tm_list)
583 		return -ENOMEM;
584 
585 	tm = kzalloc(sizeof(*tm), flags);
586 	if (!tm) {
587 		ret = -ENOMEM;
588 		goto free_tms;
589 	}
590 
591 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
592 	tm->slot = src_slot;
593 	tm->move.dst_slot = dst_slot;
594 	tm->move.nr_items = nr_items;
595 	tm->op = MOD_LOG_MOVE_KEYS;
596 
597 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
598 		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
599 		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
600 		if (!tm_list[i]) {
601 			ret = -ENOMEM;
602 			goto free_tms;
603 		}
604 	}
605 
606 	if (tree_mod_dont_log(fs_info, eb))
607 		goto free_tms;
608 	locked = 1;
609 
610 	/*
611 	 * When we override something during the move, we log these removals.
612 	 * This can only happen when we move towards the beginning of the
613 	 * buffer, i.e. dst_slot < src_slot.
614 	 */
615 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
616 		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
617 		if (ret)
618 			goto free_tms;
619 	}
620 
621 	ret = __tree_mod_log_insert(fs_info, tm);
622 	if (ret)
623 		goto free_tms;
624 	tree_mod_log_write_unlock(fs_info);
625 	kfree(tm_list);
626 
627 	return 0;
628 free_tms:
629 	for (i = 0; i < nr_items; i++) {
630 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
631 			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
632 		kfree(tm_list[i]);
633 	}
634 	if (locked)
635 		tree_mod_log_write_unlock(fs_info);
636 	kfree(tm_list);
637 	kfree(tm);
638 
639 	return ret;
640 }
641 
642 static inline int
643 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
644 		       struct tree_mod_elem **tm_list,
645 		       int nritems)
646 {
647 	int i, j;
648 	int ret;
649 
650 	for (i = nritems - 1; i >= 0; i--) {
651 		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
652 		if (ret) {
653 			for (j = nritems - 1; j > i; j--)
654 				rb_erase(&tm_list[j]->node,
655 					 &fs_info->tree_mod_log);
656 			return ret;
657 		}
658 	}
659 
660 	return 0;
661 }
662 
663 static noinline int
664 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
665 			 struct extent_buffer *old_root,
666 			 struct extent_buffer *new_root, gfp_t flags,
667 			 int log_removal)
668 {
669 	struct tree_mod_elem *tm = NULL;
670 	struct tree_mod_elem **tm_list = NULL;
671 	int nritems = 0;
672 	int ret = 0;
673 	int i;
674 
675 	if (!tree_mod_need_log(fs_info, NULL))
676 		return 0;
677 
678 	if (log_removal && btrfs_header_level(old_root) > 0) {
679 		nritems = btrfs_header_nritems(old_root);
680 		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
681 				  flags);
682 		if (!tm_list) {
683 			ret = -ENOMEM;
684 			goto free_tms;
685 		}
686 		for (i = 0; i < nritems; i++) {
687 			tm_list[i] = alloc_tree_mod_elem(old_root, i,
688 			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
689 			if (!tm_list[i]) {
690 				ret = -ENOMEM;
691 				goto free_tms;
692 			}
693 		}
694 	}
695 
696 	tm = kzalloc(sizeof(*tm), flags);
697 	if (!tm) {
698 		ret = -ENOMEM;
699 		goto free_tms;
700 	}
701 
702 	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
703 	tm->old_root.logical = old_root->start;
704 	tm->old_root.level = btrfs_header_level(old_root);
705 	tm->generation = btrfs_header_generation(old_root);
706 	tm->op = MOD_LOG_ROOT_REPLACE;
707 
708 	if (tree_mod_dont_log(fs_info, NULL))
709 		goto free_tms;
710 
711 	if (tm_list)
712 		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
713 	if (!ret)
714 		ret = __tree_mod_log_insert(fs_info, tm);
715 
716 	tree_mod_log_write_unlock(fs_info);
717 	if (ret)
718 		goto free_tms;
719 	kfree(tm_list);
720 
721 	return ret;
722 
723 free_tms:
724 	if (tm_list) {
725 		for (i = 0; i < nritems; i++)
726 			kfree(tm_list[i]);
727 		kfree(tm_list);
728 	}
729 	kfree(tm);
730 
731 	return ret;
732 }
733 
734 static struct tree_mod_elem *
735 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
736 		      int smallest)
737 {
738 	struct rb_root *tm_root;
739 	struct rb_node *node;
740 	struct tree_mod_elem *cur = NULL;
741 	struct tree_mod_elem *found = NULL;
742 	u64 index = start >> PAGE_CACHE_SHIFT;
743 
744 	tree_mod_log_read_lock(fs_info);
745 	tm_root = &fs_info->tree_mod_log;
746 	node = tm_root->rb_node;
747 	while (node) {
748 		cur = container_of(node, struct tree_mod_elem, node);
749 		if (cur->index < index) {
750 			node = node->rb_left;
751 		} else if (cur->index > index) {
752 			node = node->rb_right;
753 		} else if (cur->seq < min_seq) {
754 			node = node->rb_left;
755 		} else if (!smallest) {
756 			/* we want the node with the highest seq */
757 			if (found)
758 				BUG_ON(found->seq > cur->seq);
759 			found = cur;
760 			node = node->rb_left;
761 		} else if (cur->seq > min_seq) {
762 			/* we want the node with the smallest seq */
763 			if (found)
764 				BUG_ON(found->seq < cur->seq);
765 			found = cur;
766 			node = node->rb_right;
767 		} else {
768 			found = cur;
769 			break;
770 		}
771 	}
772 	tree_mod_log_read_unlock(fs_info);
773 
774 	return found;
775 }
776 
777 /*
778  * this returns the element from the log with the smallest time sequence
779  * value that's in the log (the oldest log item). any element with a time
780  * sequence lower than min_seq will be ignored.
781  */
782 static struct tree_mod_elem *
783 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
784 			   u64 min_seq)
785 {
786 	return __tree_mod_log_search(fs_info, start, min_seq, 1);
787 }
788 
789 /*
790  * this returns the element from the log with the largest time sequence
791  * value that's in the log (the most recent log item). any element with
792  * a time sequence lower than min_seq will be ignored.
793  */
794 static struct tree_mod_elem *
795 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
796 {
797 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
798 }
799 
800 static noinline int
801 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
802 		     struct extent_buffer *src, unsigned long dst_offset,
803 		     unsigned long src_offset, int nr_items)
804 {
805 	int ret = 0;
806 	struct tree_mod_elem **tm_list = NULL;
807 	struct tree_mod_elem **tm_list_add, **tm_list_rem;
808 	int i;
809 	int locked = 0;
810 
811 	if (!tree_mod_need_log(fs_info, NULL))
812 		return 0;
813 
814 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
815 		return 0;
816 
817 	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
818 			  GFP_NOFS);
819 	if (!tm_list)
820 		return -ENOMEM;
821 
822 	tm_list_add = tm_list;
823 	tm_list_rem = tm_list + nr_items;
824 	for (i = 0; i < nr_items; i++) {
825 		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
826 		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
827 		if (!tm_list_rem[i]) {
828 			ret = -ENOMEM;
829 			goto free_tms;
830 		}
831 
832 		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
833 		    MOD_LOG_KEY_ADD, GFP_NOFS);
834 		if (!tm_list_add[i]) {
835 			ret = -ENOMEM;
836 			goto free_tms;
837 		}
838 	}
839 
840 	if (tree_mod_dont_log(fs_info, NULL))
841 		goto free_tms;
842 	locked = 1;
843 
844 	for (i = 0; i < nr_items; i++) {
845 		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
846 		if (ret)
847 			goto free_tms;
848 		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
849 		if (ret)
850 			goto free_tms;
851 	}
852 
853 	tree_mod_log_write_unlock(fs_info);
854 	kfree(tm_list);
855 
856 	return 0;
857 
858 free_tms:
859 	for (i = 0; i < nr_items * 2; i++) {
860 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
861 			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
862 		kfree(tm_list[i]);
863 	}
864 	if (locked)
865 		tree_mod_log_write_unlock(fs_info);
866 	kfree(tm_list);
867 
868 	return ret;
869 }
870 
871 static inline void
872 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
873 		     int dst_offset, int src_offset, int nr_items)
874 {
875 	int ret;
876 	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
877 				       nr_items, GFP_NOFS);
878 	BUG_ON(ret < 0);
879 }
880 
881 static noinline void
882 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
883 			  struct extent_buffer *eb, int slot, int atomic)
884 {
885 	int ret;
886 
887 	ret = tree_mod_log_insert_key(fs_info, eb, slot,
888 					MOD_LOG_KEY_REPLACE,
889 					atomic ? GFP_ATOMIC : GFP_NOFS);
890 	BUG_ON(ret < 0);
891 }
892 
893 static noinline int
894 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
895 {
896 	struct tree_mod_elem **tm_list = NULL;
897 	int nritems = 0;
898 	int i;
899 	int ret = 0;
900 
901 	if (btrfs_header_level(eb) == 0)
902 		return 0;
903 
904 	if (!tree_mod_need_log(fs_info, NULL))
905 		return 0;
906 
907 	nritems = btrfs_header_nritems(eb);
908 	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
909 	if (!tm_list)
910 		return -ENOMEM;
911 
912 	for (i = 0; i < nritems; i++) {
913 		tm_list[i] = alloc_tree_mod_elem(eb, i,
914 		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
915 		if (!tm_list[i]) {
916 			ret = -ENOMEM;
917 			goto free_tms;
918 		}
919 	}
920 
921 	if (tree_mod_dont_log(fs_info, eb))
922 		goto free_tms;
923 
924 	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
925 	tree_mod_log_write_unlock(fs_info);
926 	if (ret)
927 		goto free_tms;
928 	kfree(tm_list);
929 
930 	return 0;
931 
932 free_tms:
933 	for (i = 0; i < nritems; i++)
934 		kfree(tm_list[i]);
935 	kfree(tm_list);
936 
937 	return ret;
938 }
939 
940 static noinline void
941 tree_mod_log_set_root_pointer(struct btrfs_root *root,
942 			      struct extent_buffer *new_root_node,
943 			      int log_removal)
944 {
945 	int ret;
946 	ret = tree_mod_log_insert_root(root->fs_info, root->node,
947 				       new_root_node, GFP_NOFS, log_removal);
948 	BUG_ON(ret < 0);
949 }
950 
951 /*
952  * check if the tree block can be shared by multiple trees
953  */
954 int btrfs_block_can_be_shared(struct btrfs_root *root,
955 			      struct extent_buffer *buf)
956 {
957 	/*
958 	 * Tree blocks not in refernece counted trees and tree roots
959 	 * are never shared. If a block was allocated after the last
960 	 * snapshot and the block was not allocated by tree relocation,
961 	 * we know the block is not shared.
962 	 */
963 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
964 	    buf != root->node && buf != root->commit_root &&
965 	    (btrfs_header_generation(buf) <=
966 	     btrfs_root_last_snapshot(&root->root_item) ||
967 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
968 		return 1;
969 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
970 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
971 	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
972 		return 1;
973 #endif
974 	return 0;
975 }
976 
977 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
978 				       struct btrfs_root *root,
979 				       struct extent_buffer *buf,
980 				       struct extent_buffer *cow,
981 				       int *last_ref)
982 {
983 	u64 refs;
984 	u64 owner;
985 	u64 flags;
986 	u64 new_flags = 0;
987 	int ret;
988 
989 	/*
990 	 * Backrefs update rules:
991 	 *
992 	 * Always use full backrefs for extent pointers in tree block
993 	 * allocated by tree relocation.
994 	 *
995 	 * If a shared tree block is no longer referenced by its owner
996 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
997 	 * use full backrefs for extent pointers in tree block.
998 	 *
999 	 * If a tree block is been relocating
1000 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
1001 	 * use full backrefs for extent pointers in tree block.
1002 	 * The reason for this is some operations (such as drop tree)
1003 	 * are only allowed for blocks use full backrefs.
1004 	 */
1005 
1006 	if (btrfs_block_can_be_shared(root, buf)) {
1007 		ret = btrfs_lookup_extent_info(trans, root, buf->start,
1008 					       btrfs_header_level(buf), 1,
1009 					       &refs, &flags);
1010 		if (ret)
1011 			return ret;
1012 		if (refs == 0) {
1013 			ret = -EROFS;
1014 			btrfs_std_error(root->fs_info, ret);
1015 			return ret;
1016 		}
1017 	} else {
1018 		refs = 1;
1019 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1020 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1021 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
1022 		else
1023 			flags = 0;
1024 	}
1025 
1026 	owner = btrfs_header_owner(buf);
1027 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
1028 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
1029 
1030 	if (refs > 1) {
1031 		if ((owner == root->root_key.objectid ||
1032 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1033 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1034 			ret = btrfs_inc_ref(trans, root, buf, 1);
1035 			BUG_ON(ret); /* -ENOMEM */
1036 
1037 			if (root->root_key.objectid ==
1038 			    BTRFS_TREE_RELOC_OBJECTID) {
1039 				ret = btrfs_dec_ref(trans, root, buf, 0);
1040 				BUG_ON(ret); /* -ENOMEM */
1041 				ret = btrfs_inc_ref(trans, root, cow, 1);
1042 				BUG_ON(ret); /* -ENOMEM */
1043 			}
1044 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
1045 		} else {
1046 
1047 			if (root->root_key.objectid ==
1048 			    BTRFS_TREE_RELOC_OBJECTID)
1049 				ret = btrfs_inc_ref(trans, root, cow, 1);
1050 			else
1051 				ret = btrfs_inc_ref(trans, root, cow, 0);
1052 			BUG_ON(ret); /* -ENOMEM */
1053 		}
1054 		if (new_flags != 0) {
1055 			int level = btrfs_header_level(buf);
1056 
1057 			ret = btrfs_set_disk_extent_flags(trans, root,
1058 							  buf->start,
1059 							  buf->len,
1060 							  new_flags, level, 0);
1061 			if (ret)
1062 				return ret;
1063 		}
1064 	} else {
1065 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1066 			if (root->root_key.objectid ==
1067 			    BTRFS_TREE_RELOC_OBJECTID)
1068 				ret = btrfs_inc_ref(trans, root, cow, 1);
1069 			else
1070 				ret = btrfs_inc_ref(trans, root, cow, 0);
1071 			BUG_ON(ret); /* -ENOMEM */
1072 			ret = btrfs_dec_ref(trans, root, buf, 1);
1073 			BUG_ON(ret); /* -ENOMEM */
1074 		}
1075 		clean_tree_block(trans, root->fs_info, buf);
1076 		*last_ref = 1;
1077 	}
1078 	return 0;
1079 }
1080 
1081 /*
1082  * does the dirty work in cow of a single block.  The parent block (if
1083  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1084  * dirty and returned locked.  If you modify the block it needs to be marked
1085  * dirty again.
1086  *
1087  * search_start -- an allocation hint for the new block
1088  *
1089  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1090  * bytes the allocator should try to find free next to the block it returns.
1091  * This is just a hint and may be ignored by the allocator.
1092  */
1093 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1094 			     struct btrfs_root *root,
1095 			     struct extent_buffer *buf,
1096 			     struct extent_buffer *parent, int parent_slot,
1097 			     struct extent_buffer **cow_ret,
1098 			     u64 search_start, u64 empty_size)
1099 {
1100 	struct btrfs_disk_key disk_key;
1101 	struct extent_buffer *cow;
1102 	int level, ret;
1103 	int last_ref = 0;
1104 	int unlock_orig = 0;
1105 	u64 parent_start;
1106 
1107 	if (*cow_ret == buf)
1108 		unlock_orig = 1;
1109 
1110 	btrfs_assert_tree_locked(buf);
1111 
1112 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1113 		trans->transid != root->fs_info->running_transaction->transid);
1114 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1115 		trans->transid != root->last_trans);
1116 
1117 	level = btrfs_header_level(buf);
1118 
1119 	if (level == 0)
1120 		btrfs_item_key(buf, &disk_key, 0);
1121 	else
1122 		btrfs_node_key(buf, &disk_key, 0);
1123 
1124 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1125 		if (parent)
1126 			parent_start = parent->start;
1127 		else
1128 			parent_start = 0;
1129 	} else
1130 		parent_start = 0;
1131 
1132 	cow = btrfs_alloc_tree_block(trans, root, parent_start,
1133 			root->root_key.objectid, &disk_key, level,
1134 			search_start, empty_size);
1135 	if (IS_ERR(cow))
1136 		return PTR_ERR(cow);
1137 
1138 	/* cow is set to blocking by btrfs_init_new_buffer */
1139 
1140 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
1141 	btrfs_set_header_bytenr(cow, cow->start);
1142 	btrfs_set_header_generation(cow, trans->transid);
1143 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1144 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1145 				     BTRFS_HEADER_FLAG_RELOC);
1146 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1147 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1148 	else
1149 		btrfs_set_header_owner(cow, root->root_key.objectid);
1150 
1151 	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
1152 			    BTRFS_FSID_SIZE);
1153 
1154 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1155 	if (ret) {
1156 		btrfs_abort_transaction(trans, root, ret);
1157 		return ret;
1158 	}
1159 
1160 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1161 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1162 		if (ret)
1163 			return ret;
1164 	}
1165 
1166 	if (buf == root->node) {
1167 		WARN_ON(parent && parent != buf);
1168 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1169 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1170 			parent_start = buf->start;
1171 		else
1172 			parent_start = 0;
1173 
1174 		extent_buffer_get(cow);
1175 		tree_mod_log_set_root_pointer(root, cow, 1);
1176 		rcu_assign_pointer(root->node, cow);
1177 
1178 		btrfs_free_tree_block(trans, root, buf, parent_start,
1179 				      last_ref);
1180 		free_extent_buffer(buf);
1181 		add_root_to_dirty_list(root);
1182 	} else {
1183 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1184 			parent_start = parent->start;
1185 		else
1186 			parent_start = 0;
1187 
1188 		WARN_ON(trans->transid != btrfs_header_generation(parent));
1189 		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1190 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1191 		btrfs_set_node_blockptr(parent, parent_slot,
1192 					cow->start);
1193 		btrfs_set_node_ptr_generation(parent, parent_slot,
1194 					      trans->transid);
1195 		btrfs_mark_buffer_dirty(parent);
1196 		if (last_ref) {
1197 			ret = tree_mod_log_free_eb(root->fs_info, buf);
1198 			if (ret) {
1199 				btrfs_abort_transaction(trans, root, ret);
1200 				return ret;
1201 			}
1202 		}
1203 		btrfs_free_tree_block(trans, root, buf, parent_start,
1204 				      last_ref);
1205 	}
1206 	if (unlock_orig)
1207 		btrfs_tree_unlock(buf);
1208 	free_extent_buffer_stale(buf);
1209 	btrfs_mark_buffer_dirty(cow);
1210 	*cow_ret = cow;
1211 	return 0;
1212 }
1213 
1214 /*
1215  * returns the logical address of the oldest predecessor of the given root.
1216  * entries older than time_seq are ignored.
1217  */
1218 static struct tree_mod_elem *
1219 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1220 			   struct extent_buffer *eb_root, u64 time_seq)
1221 {
1222 	struct tree_mod_elem *tm;
1223 	struct tree_mod_elem *found = NULL;
1224 	u64 root_logical = eb_root->start;
1225 	int looped = 0;
1226 
1227 	if (!time_seq)
1228 		return NULL;
1229 
1230 	/*
1231 	 * the very last operation that's logged for a root is the replacement
1232 	 * operation (if it is replaced at all). this has the index of the *new*
1233 	 * root, making it the very first operation that's logged for this root.
1234 	 */
1235 	while (1) {
1236 		tm = tree_mod_log_search_oldest(fs_info, root_logical,
1237 						time_seq);
1238 		if (!looped && !tm)
1239 			return NULL;
1240 		/*
1241 		 * if there are no tree operation for the oldest root, we simply
1242 		 * return it. this should only happen if that (old) root is at
1243 		 * level 0.
1244 		 */
1245 		if (!tm)
1246 			break;
1247 
1248 		/*
1249 		 * if there's an operation that's not a root replacement, we
1250 		 * found the oldest version of our root. normally, we'll find a
1251 		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1252 		 */
1253 		if (tm->op != MOD_LOG_ROOT_REPLACE)
1254 			break;
1255 
1256 		found = tm;
1257 		root_logical = tm->old_root.logical;
1258 		looped = 1;
1259 	}
1260 
1261 	/* if there's no old root to return, return what we found instead */
1262 	if (!found)
1263 		found = tm;
1264 
1265 	return found;
1266 }
1267 
1268 /*
1269  * tm is a pointer to the first operation to rewind within eb. then, all
1270  * previous operations will be rewinded (until we reach something older than
1271  * time_seq).
1272  */
1273 static void
1274 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1275 		      u64 time_seq, struct tree_mod_elem *first_tm)
1276 {
1277 	u32 n;
1278 	struct rb_node *next;
1279 	struct tree_mod_elem *tm = first_tm;
1280 	unsigned long o_dst;
1281 	unsigned long o_src;
1282 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
1283 
1284 	n = btrfs_header_nritems(eb);
1285 	tree_mod_log_read_lock(fs_info);
1286 	while (tm && tm->seq >= time_seq) {
1287 		/*
1288 		 * all the operations are recorded with the operator used for
1289 		 * the modification. as we're going backwards, we do the
1290 		 * opposite of each operation here.
1291 		 */
1292 		switch (tm->op) {
1293 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1294 			BUG_ON(tm->slot < n);
1295 			/* Fallthrough */
1296 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1297 		case MOD_LOG_KEY_REMOVE:
1298 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1299 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1300 			btrfs_set_node_ptr_generation(eb, tm->slot,
1301 						      tm->generation);
1302 			n++;
1303 			break;
1304 		case MOD_LOG_KEY_REPLACE:
1305 			BUG_ON(tm->slot >= n);
1306 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1307 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1308 			btrfs_set_node_ptr_generation(eb, tm->slot,
1309 						      tm->generation);
1310 			break;
1311 		case MOD_LOG_KEY_ADD:
1312 			/* if a move operation is needed it's in the log */
1313 			n--;
1314 			break;
1315 		case MOD_LOG_MOVE_KEYS:
1316 			o_dst = btrfs_node_key_ptr_offset(tm->slot);
1317 			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1318 			memmove_extent_buffer(eb, o_dst, o_src,
1319 					      tm->move.nr_items * p_size);
1320 			break;
1321 		case MOD_LOG_ROOT_REPLACE:
1322 			/*
1323 			 * this operation is special. for roots, this must be
1324 			 * handled explicitly before rewinding.
1325 			 * for non-roots, this operation may exist if the node
1326 			 * was a root: root A -> child B; then A gets empty and
1327 			 * B is promoted to the new root. in the mod log, we'll
1328 			 * have a root-replace operation for B, a tree block
1329 			 * that is no root. we simply ignore that operation.
1330 			 */
1331 			break;
1332 		}
1333 		next = rb_next(&tm->node);
1334 		if (!next)
1335 			break;
1336 		tm = container_of(next, struct tree_mod_elem, node);
1337 		if (tm->index != first_tm->index)
1338 			break;
1339 	}
1340 	tree_mod_log_read_unlock(fs_info);
1341 	btrfs_set_header_nritems(eb, n);
1342 }
1343 
1344 /*
1345  * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1346  * is returned. If rewind operations happen, a fresh buffer is returned. The
1347  * returned buffer is always read-locked. If the returned buffer is not the
1348  * input buffer, the lock on the input buffer is released and the input buffer
1349  * is freed (its refcount is decremented).
1350  */
1351 static struct extent_buffer *
1352 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1353 		    struct extent_buffer *eb, u64 time_seq)
1354 {
1355 	struct extent_buffer *eb_rewin;
1356 	struct tree_mod_elem *tm;
1357 
1358 	if (!time_seq)
1359 		return eb;
1360 
1361 	if (btrfs_header_level(eb) == 0)
1362 		return eb;
1363 
1364 	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1365 	if (!tm)
1366 		return eb;
1367 
1368 	btrfs_set_path_blocking(path);
1369 	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1370 
1371 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1372 		BUG_ON(tm->slot != 0);
1373 		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1374 		if (!eb_rewin) {
1375 			btrfs_tree_read_unlock_blocking(eb);
1376 			free_extent_buffer(eb);
1377 			return NULL;
1378 		}
1379 		btrfs_set_header_bytenr(eb_rewin, eb->start);
1380 		btrfs_set_header_backref_rev(eb_rewin,
1381 					     btrfs_header_backref_rev(eb));
1382 		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1383 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1384 	} else {
1385 		eb_rewin = btrfs_clone_extent_buffer(eb);
1386 		if (!eb_rewin) {
1387 			btrfs_tree_read_unlock_blocking(eb);
1388 			free_extent_buffer(eb);
1389 			return NULL;
1390 		}
1391 	}
1392 
1393 	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1394 	btrfs_tree_read_unlock_blocking(eb);
1395 	free_extent_buffer(eb);
1396 
1397 	extent_buffer_get(eb_rewin);
1398 	btrfs_tree_read_lock(eb_rewin);
1399 	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1400 	WARN_ON(btrfs_header_nritems(eb_rewin) >
1401 		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1402 
1403 	return eb_rewin;
1404 }
1405 
1406 /*
1407  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1408  * value. If there are no changes, the current root->root_node is returned. If
1409  * anything changed in between, there's a fresh buffer allocated on which the
1410  * rewind operations are done. In any case, the returned buffer is read locked.
1411  * Returns NULL on error (with no locks held).
1412  */
1413 static inline struct extent_buffer *
1414 get_old_root(struct btrfs_root *root, u64 time_seq)
1415 {
1416 	struct tree_mod_elem *tm;
1417 	struct extent_buffer *eb = NULL;
1418 	struct extent_buffer *eb_root;
1419 	struct extent_buffer *old;
1420 	struct tree_mod_root *old_root = NULL;
1421 	u64 old_generation = 0;
1422 	u64 logical;
1423 
1424 	eb_root = btrfs_read_lock_root_node(root);
1425 	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1426 	if (!tm)
1427 		return eb_root;
1428 
1429 	if (tm->op == MOD_LOG_ROOT_REPLACE) {
1430 		old_root = &tm->old_root;
1431 		old_generation = tm->generation;
1432 		logical = old_root->logical;
1433 	} else {
1434 		logical = eb_root->start;
1435 	}
1436 
1437 	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1438 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1439 		btrfs_tree_read_unlock(eb_root);
1440 		free_extent_buffer(eb_root);
1441 		old = read_tree_block(root, logical, 0);
1442 		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1443 			if (!IS_ERR(old))
1444 				free_extent_buffer(old);
1445 			btrfs_warn(root->fs_info,
1446 				"failed to read tree block %llu from get_old_root", logical);
1447 		} else {
1448 			eb = btrfs_clone_extent_buffer(old);
1449 			free_extent_buffer(old);
1450 		}
1451 	} else if (old_root) {
1452 		btrfs_tree_read_unlock(eb_root);
1453 		free_extent_buffer(eb_root);
1454 		eb = alloc_dummy_extent_buffer(root->fs_info, logical);
1455 	} else {
1456 		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1457 		eb = btrfs_clone_extent_buffer(eb_root);
1458 		btrfs_tree_read_unlock_blocking(eb_root);
1459 		free_extent_buffer(eb_root);
1460 	}
1461 
1462 	if (!eb)
1463 		return NULL;
1464 	extent_buffer_get(eb);
1465 	btrfs_tree_read_lock(eb);
1466 	if (old_root) {
1467 		btrfs_set_header_bytenr(eb, eb->start);
1468 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1469 		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1470 		btrfs_set_header_level(eb, old_root->level);
1471 		btrfs_set_header_generation(eb, old_generation);
1472 	}
1473 	if (tm)
1474 		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1475 	else
1476 		WARN_ON(btrfs_header_level(eb) != 0);
1477 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1478 
1479 	return eb;
1480 }
1481 
1482 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1483 {
1484 	struct tree_mod_elem *tm;
1485 	int level;
1486 	struct extent_buffer *eb_root = btrfs_root_node(root);
1487 
1488 	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1489 	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1490 		level = tm->old_root.level;
1491 	} else {
1492 		level = btrfs_header_level(eb_root);
1493 	}
1494 	free_extent_buffer(eb_root);
1495 
1496 	return level;
1497 }
1498 
1499 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1500 				   struct btrfs_root *root,
1501 				   struct extent_buffer *buf)
1502 {
1503 	if (btrfs_test_is_dummy_root(root))
1504 		return 0;
1505 
1506 	/* ensure we can see the force_cow */
1507 	smp_rmb();
1508 
1509 	/*
1510 	 * We do not need to cow a block if
1511 	 * 1) this block is not created or changed in this transaction;
1512 	 * 2) this block does not belong to TREE_RELOC tree;
1513 	 * 3) the root is not forced COW.
1514 	 *
1515 	 * What is forced COW:
1516 	 *    when we create snapshot during commiting the transaction,
1517 	 *    after we've finished coping src root, we must COW the shared
1518 	 *    block to ensure the metadata consistency.
1519 	 */
1520 	if (btrfs_header_generation(buf) == trans->transid &&
1521 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1522 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1523 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1524 	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1525 		return 0;
1526 	return 1;
1527 }
1528 
1529 /*
1530  * cows a single block, see __btrfs_cow_block for the real work.
1531  * This version of it has extra checks so that a block isn't cow'd more than
1532  * once per transaction, as long as it hasn't been written yet
1533  */
1534 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1535 		    struct btrfs_root *root, struct extent_buffer *buf,
1536 		    struct extent_buffer *parent, int parent_slot,
1537 		    struct extent_buffer **cow_ret)
1538 {
1539 	u64 search_start;
1540 	int ret;
1541 
1542 	if (trans->transaction != root->fs_info->running_transaction)
1543 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1544 		       trans->transid,
1545 		       root->fs_info->running_transaction->transid);
1546 
1547 	if (trans->transid != root->fs_info->generation)
1548 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1549 		       trans->transid, root->fs_info->generation);
1550 
1551 	if (!should_cow_block(trans, root, buf)) {
1552 		*cow_ret = buf;
1553 		return 0;
1554 	}
1555 
1556 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1557 
1558 	if (parent)
1559 		btrfs_set_lock_blocking(parent);
1560 	btrfs_set_lock_blocking(buf);
1561 
1562 	ret = __btrfs_cow_block(trans, root, buf, parent,
1563 				 parent_slot, cow_ret, search_start, 0);
1564 
1565 	trace_btrfs_cow_block(root, buf, *cow_ret);
1566 
1567 	return ret;
1568 }
1569 
1570 /*
1571  * helper function for defrag to decide if two blocks pointed to by a
1572  * node are actually close by
1573  */
1574 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1575 {
1576 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1577 		return 1;
1578 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1579 		return 1;
1580 	return 0;
1581 }
1582 
1583 /*
1584  * compare two keys in a memcmp fashion
1585  */
1586 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1587 {
1588 	struct btrfs_key k1;
1589 
1590 	btrfs_disk_key_to_cpu(&k1, disk);
1591 
1592 	return btrfs_comp_cpu_keys(&k1, k2);
1593 }
1594 
1595 /*
1596  * same as comp_keys only with two btrfs_key's
1597  */
1598 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1599 {
1600 	if (k1->objectid > k2->objectid)
1601 		return 1;
1602 	if (k1->objectid < k2->objectid)
1603 		return -1;
1604 	if (k1->type > k2->type)
1605 		return 1;
1606 	if (k1->type < k2->type)
1607 		return -1;
1608 	if (k1->offset > k2->offset)
1609 		return 1;
1610 	if (k1->offset < k2->offset)
1611 		return -1;
1612 	return 0;
1613 }
1614 
1615 /*
1616  * this is used by the defrag code to go through all the
1617  * leaves pointed to by a node and reallocate them so that
1618  * disk order is close to key order
1619  */
1620 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1621 		       struct btrfs_root *root, struct extent_buffer *parent,
1622 		       int start_slot, u64 *last_ret,
1623 		       struct btrfs_key *progress)
1624 {
1625 	struct extent_buffer *cur;
1626 	u64 blocknr;
1627 	u64 gen;
1628 	u64 search_start = *last_ret;
1629 	u64 last_block = 0;
1630 	u64 other;
1631 	u32 parent_nritems;
1632 	int end_slot;
1633 	int i;
1634 	int err = 0;
1635 	int parent_level;
1636 	int uptodate;
1637 	u32 blocksize;
1638 	int progress_passed = 0;
1639 	struct btrfs_disk_key disk_key;
1640 
1641 	parent_level = btrfs_header_level(parent);
1642 
1643 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
1644 	WARN_ON(trans->transid != root->fs_info->generation);
1645 
1646 	parent_nritems = btrfs_header_nritems(parent);
1647 	blocksize = root->nodesize;
1648 	end_slot = parent_nritems - 1;
1649 
1650 	if (parent_nritems <= 1)
1651 		return 0;
1652 
1653 	btrfs_set_lock_blocking(parent);
1654 
1655 	for (i = start_slot; i <= end_slot; i++) {
1656 		int close = 1;
1657 
1658 		btrfs_node_key(parent, &disk_key, i);
1659 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1660 			continue;
1661 
1662 		progress_passed = 1;
1663 		blocknr = btrfs_node_blockptr(parent, i);
1664 		gen = btrfs_node_ptr_generation(parent, i);
1665 		if (last_block == 0)
1666 			last_block = blocknr;
1667 
1668 		if (i > 0) {
1669 			other = btrfs_node_blockptr(parent, i - 1);
1670 			close = close_blocks(blocknr, other, blocksize);
1671 		}
1672 		if (!close && i < end_slot) {
1673 			other = btrfs_node_blockptr(parent, i + 1);
1674 			close = close_blocks(blocknr, other, blocksize);
1675 		}
1676 		if (close) {
1677 			last_block = blocknr;
1678 			continue;
1679 		}
1680 
1681 		cur = btrfs_find_tree_block(root->fs_info, blocknr);
1682 		if (cur)
1683 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1684 		else
1685 			uptodate = 0;
1686 		if (!cur || !uptodate) {
1687 			if (!cur) {
1688 				cur = read_tree_block(root, blocknr, gen);
1689 				if (IS_ERR(cur)) {
1690 					return PTR_ERR(cur);
1691 				} else if (!extent_buffer_uptodate(cur)) {
1692 					free_extent_buffer(cur);
1693 					return -EIO;
1694 				}
1695 			} else if (!uptodate) {
1696 				err = btrfs_read_buffer(cur, gen);
1697 				if (err) {
1698 					free_extent_buffer(cur);
1699 					return err;
1700 				}
1701 			}
1702 		}
1703 		if (search_start == 0)
1704 			search_start = last_block;
1705 
1706 		btrfs_tree_lock(cur);
1707 		btrfs_set_lock_blocking(cur);
1708 		err = __btrfs_cow_block(trans, root, cur, parent, i,
1709 					&cur, search_start,
1710 					min(16 * blocksize,
1711 					    (end_slot - i) * blocksize));
1712 		if (err) {
1713 			btrfs_tree_unlock(cur);
1714 			free_extent_buffer(cur);
1715 			break;
1716 		}
1717 		search_start = cur->start;
1718 		last_block = cur->start;
1719 		*last_ret = search_start;
1720 		btrfs_tree_unlock(cur);
1721 		free_extent_buffer(cur);
1722 	}
1723 	return err;
1724 }
1725 
1726 /*
1727  * The leaf data grows from end-to-front in the node.
1728  * this returns the address of the start of the last item,
1729  * which is the stop of the leaf data stack
1730  */
1731 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1732 					 struct extent_buffer *leaf)
1733 {
1734 	u32 nr = btrfs_header_nritems(leaf);
1735 	if (nr == 0)
1736 		return BTRFS_LEAF_DATA_SIZE(root);
1737 	return btrfs_item_offset_nr(leaf, nr - 1);
1738 }
1739 
1740 
1741 /*
1742  * search for key in the extent_buffer.  The items start at offset p,
1743  * and they are item_size apart.  There are 'max' items in p.
1744  *
1745  * the slot in the array is returned via slot, and it points to
1746  * the place where you would insert key if it is not found in
1747  * the array.
1748  *
1749  * slot may point to max if the key is bigger than all of the keys
1750  */
1751 static noinline int generic_bin_search(struct extent_buffer *eb,
1752 				       unsigned long p,
1753 				       int item_size, struct btrfs_key *key,
1754 				       int max, int *slot)
1755 {
1756 	int low = 0;
1757 	int high = max;
1758 	int mid;
1759 	int ret;
1760 	struct btrfs_disk_key *tmp = NULL;
1761 	struct btrfs_disk_key unaligned;
1762 	unsigned long offset;
1763 	char *kaddr = NULL;
1764 	unsigned long map_start = 0;
1765 	unsigned long map_len = 0;
1766 	int err;
1767 
1768 	while (low < high) {
1769 		mid = (low + high) / 2;
1770 		offset = p + mid * item_size;
1771 
1772 		if (!kaddr || offset < map_start ||
1773 		    (offset + sizeof(struct btrfs_disk_key)) >
1774 		    map_start + map_len) {
1775 
1776 			err = map_private_extent_buffer(eb, offset,
1777 						sizeof(struct btrfs_disk_key),
1778 						&kaddr, &map_start, &map_len);
1779 
1780 			if (!err) {
1781 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
1782 							map_start);
1783 			} else {
1784 				read_extent_buffer(eb, &unaligned,
1785 						   offset, sizeof(unaligned));
1786 				tmp = &unaligned;
1787 			}
1788 
1789 		} else {
1790 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
1791 							map_start);
1792 		}
1793 		ret = comp_keys(tmp, key);
1794 
1795 		if (ret < 0)
1796 			low = mid + 1;
1797 		else if (ret > 0)
1798 			high = mid;
1799 		else {
1800 			*slot = mid;
1801 			return 0;
1802 		}
1803 	}
1804 	*slot = low;
1805 	return 1;
1806 }
1807 
1808 /*
1809  * simple bin_search frontend that does the right thing for
1810  * leaves vs nodes
1811  */
1812 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1813 		      int level, int *slot)
1814 {
1815 	if (level == 0)
1816 		return generic_bin_search(eb,
1817 					  offsetof(struct btrfs_leaf, items),
1818 					  sizeof(struct btrfs_item),
1819 					  key, btrfs_header_nritems(eb),
1820 					  slot);
1821 	else
1822 		return generic_bin_search(eb,
1823 					  offsetof(struct btrfs_node, ptrs),
1824 					  sizeof(struct btrfs_key_ptr),
1825 					  key, btrfs_header_nritems(eb),
1826 					  slot);
1827 }
1828 
1829 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1830 		     int level, int *slot)
1831 {
1832 	return bin_search(eb, key, level, slot);
1833 }
1834 
1835 static void root_add_used(struct btrfs_root *root, u32 size)
1836 {
1837 	spin_lock(&root->accounting_lock);
1838 	btrfs_set_root_used(&root->root_item,
1839 			    btrfs_root_used(&root->root_item) + size);
1840 	spin_unlock(&root->accounting_lock);
1841 }
1842 
1843 static void root_sub_used(struct btrfs_root *root, u32 size)
1844 {
1845 	spin_lock(&root->accounting_lock);
1846 	btrfs_set_root_used(&root->root_item,
1847 			    btrfs_root_used(&root->root_item) - size);
1848 	spin_unlock(&root->accounting_lock);
1849 }
1850 
1851 /* given a node and slot number, this reads the blocks it points to.  The
1852  * extent buffer is returned with a reference taken (but unlocked).
1853  * NULL is returned on error.
1854  */
1855 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1856 				   struct extent_buffer *parent, int slot)
1857 {
1858 	int level = btrfs_header_level(parent);
1859 	struct extent_buffer *eb;
1860 
1861 	if (slot < 0)
1862 		return NULL;
1863 	if (slot >= btrfs_header_nritems(parent))
1864 		return NULL;
1865 
1866 	BUG_ON(level == 0);
1867 
1868 	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1869 			     btrfs_node_ptr_generation(parent, slot));
1870 	if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
1871 		if (!IS_ERR(eb))
1872 			free_extent_buffer(eb);
1873 		eb = NULL;
1874 	}
1875 
1876 	return eb;
1877 }
1878 
1879 /*
1880  * node level balancing, used to make sure nodes are in proper order for
1881  * item deletion.  We balance from the top down, so we have to make sure
1882  * that a deletion won't leave an node completely empty later on.
1883  */
1884 static noinline int balance_level(struct btrfs_trans_handle *trans,
1885 			 struct btrfs_root *root,
1886 			 struct btrfs_path *path, int level)
1887 {
1888 	struct extent_buffer *right = NULL;
1889 	struct extent_buffer *mid;
1890 	struct extent_buffer *left = NULL;
1891 	struct extent_buffer *parent = NULL;
1892 	int ret = 0;
1893 	int wret;
1894 	int pslot;
1895 	int orig_slot = path->slots[level];
1896 	u64 orig_ptr;
1897 
1898 	if (level == 0)
1899 		return 0;
1900 
1901 	mid = path->nodes[level];
1902 
1903 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1904 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1905 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1906 
1907 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1908 
1909 	if (level < BTRFS_MAX_LEVEL - 1) {
1910 		parent = path->nodes[level + 1];
1911 		pslot = path->slots[level + 1];
1912 	}
1913 
1914 	/*
1915 	 * deal with the case where there is only one pointer in the root
1916 	 * by promoting the node below to a root
1917 	 */
1918 	if (!parent) {
1919 		struct extent_buffer *child;
1920 
1921 		if (btrfs_header_nritems(mid) != 1)
1922 			return 0;
1923 
1924 		/* promote the child to a root */
1925 		child = read_node_slot(root, mid, 0);
1926 		if (!child) {
1927 			ret = -EROFS;
1928 			btrfs_std_error(root->fs_info, ret);
1929 			goto enospc;
1930 		}
1931 
1932 		btrfs_tree_lock(child);
1933 		btrfs_set_lock_blocking(child);
1934 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1935 		if (ret) {
1936 			btrfs_tree_unlock(child);
1937 			free_extent_buffer(child);
1938 			goto enospc;
1939 		}
1940 
1941 		tree_mod_log_set_root_pointer(root, child, 1);
1942 		rcu_assign_pointer(root->node, child);
1943 
1944 		add_root_to_dirty_list(root);
1945 		btrfs_tree_unlock(child);
1946 
1947 		path->locks[level] = 0;
1948 		path->nodes[level] = NULL;
1949 		clean_tree_block(trans, root->fs_info, mid);
1950 		btrfs_tree_unlock(mid);
1951 		/* once for the path */
1952 		free_extent_buffer(mid);
1953 
1954 		root_sub_used(root, mid->len);
1955 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1956 		/* once for the root ptr */
1957 		free_extent_buffer_stale(mid);
1958 		return 0;
1959 	}
1960 	if (btrfs_header_nritems(mid) >
1961 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1962 		return 0;
1963 
1964 	left = read_node_slot(root, parent, pslot - 1);
1965 	if (left) {
1966 		btrfs_tree_lock(left);
1967 		btrfs_set_lock_blocking(left);
1968 		wret = btrfs_cow_block(trans, root, left,
1969 				       parent, pslot - 1, &left);
1970 		if (wret) {
1971 			ret = wret;
1972 			goto enospc;
1973 		}
1974 	}
1975 	right = read_node_slot(root, parent, pslot + 1);
1976 	if (right) {
1977 		btrfs_tree_lock(right);
1978 		btrfs_set_lock_blocking(right);
1979 		wret = btrfs_cow_block(trans, root, right,
1980 				       parent, pslot + 1, &right);
1981 		if (wret) {
1982 			ret = wret;
1983 			goto enospc;
1984 		}
1985 	}
1986 
1987 	/* first, try to make some room in the middle buffer */
1988 	if (left) {
1989 		orig_slot += btrfs_header_nritems(left);
1990 		wret = push_node_left(trans, root, left, mid, 1);
1991 		if (wret < 0)
1992 			ret = wret;
1993 	}
1994 
1995 	/*
1996 	 * then try to empty the right most buffer into the middle
1997 	 */
1998 	if (right) {
1999 		wret = push_node_left(trans, root, mid, right, 1);
2000 		if (wret < 0 && wret != -ENOSPC)
2001 			ret = wret;
2002 		if (btrfs_header_nritems(right) == 0) {
2003 			clean_tree_block(trans, root->fs_info, right);
2004 			btrfs_tree_unlock(right);
2005 			del_ptr(root, path, level + 1, pslot + 1);
2006 			root_sub_used(root, right->len);
2007 			btrfs_free_tree_block(trans, root, right, 0, 1);
2008 			free_extent_buffer_stale(right);
2009 			right = NULL;
2010 		} else {
2011 			struct btrfs_disk_key right_key;
2012 			btrfs_node_key(right, &right_key, 0);
2013 			tree_mod_log_set_node_key(root->fs_info, parent,
2014 						  pslot + 1, 0);
2015 			btrfs_set_node_key(parent, &right_key, pslot + 1);
2016 			btrfs_mark_buffer_dirty(parent);
2017 		}
2018 	}
2019 	if (btrfs_header_nritems(mid) == 1) {
2020 		/*
2021 		 * we're not allowed to leave a node with one item in the
2022 		 * tree during a delete.  A deletion from lower in the tree
2023 		 * could try to delete the only pointer in this node.
2024 		 * So, pull some keys from the left.
2025 		 * There has to be a left pointer at this point because
2026 		 * otherwise we would have pulled some pointers from the
2027 		 * right
2028 		 */
2029 		if (!left) {
2030 			ret = -EROFS;
2031 			btrfs_std_error(root->fs_info, ret);
2032 			goto enospc;
2033 		}
2034 		wret = balance_node_right(trans, root, mid, left);
2035 		if (wret < 0) {
2036 			ret = wret;
2037 			goto enospc;
2038 		}
2039 		if (wret == 1) {
2040 			wret = push_node_left(trans, root, left, mid, 1);
2041 			if (wret < 0)
2042 				ret = wret;
2043 		}
2044 		BUG_ON(wret == 1);
2045 	}
2046 	if (btrfs_header_nritems(mid) == 0) {
2047 		clean_tree_block(trans, root->fs_info, mid);
2048 		btrfs_tree_unlock(mid);
2049 		del_ptr(root, path, level + 1, pslot);
2050 		root_sub_used(root, mid->len);
2051 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2052 		free_extent_buffer_stale(mid);
2053 		mid = NULL;
2054 	} else {
2055 		/* update the parent key to reflect our changes */
2056 		struct btrfs_disk_key mid_key;
2057 		btrfs_node_key(mid, &mid_key, 0);
2058 		tree_mod_log_set_node_key(root->fs_info, parent,
2059 					  pslot, 0);
2060 		btrfs_set_node_key(parent, &mid_key, pslot);
2061 		btrfs_mark_buffer_dirty(parent);
2062 	}
2063 
2064 	/* update the path */
2065 	if (left) {
2066 		if (btrfs_header_nritems(left) > orig_slot) {
2067 			extent_buffer_get(left);
2068 			/* left was locked after cow */
2069 			path->nodes[level] = left;
2070 			path->slots[level + 1] -= 1;
2071 			path->slots[level] = orig_slot;
2072 			if (mid) {
2073 				btrfs_tree_unlock(mid);
2074 				free_extent_buffer(mid);
2075 			}
2076 		} else {
2077 			orig_slot -= btrfs_header_nritems(left);
2078 			path->slots[level] = orig_slot;
2079 		}
2080 	}
2081 	/* double check we haven't messed things up */
2082 	if (orig_ptr !=
2083 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2084 		BUG();
2085 enospc:
2086 	if (right) {
2087 		btrfs_tree_unlock(right);
2088 		free_extent_buffer(right);
2089 	}
2090 	if (left) {
2091 		if (path->nodes[level] != left)
2092 			btrfs_tree_unlock(left);
2093 		free_extent_buffer(left);
2094 	}
2095 	return ret;
2096 }
2097 
2098 /* Node balancing for insertion.  Here we only split or push nodes around
2099  * when they are completely full.  This is also done top down, so we
2100  * have to be pessimistic.
2101  */
2102 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2103 					  struct btrfs_root *root,
2104 					  struct btrfs_path *path, int level)
2105 {
2106 	struct extent_buffer *right = NULL;
2107 	struct extent_buffer *mid;
2108 	struct extent_buffer *left = NULL;
2109 	struct extent_buffer *parent = NULL;
2110 	int ret = 0;
2111 	int wret;
2112 	int pslot;
2113 	int orig_slot = path->slots[level];
2114 
2115 	if (level == 0)
2116 		return 1;
2117 
2118 	mid = path->nodes[level];
2119 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2120 
2121 	if (level < BTRFS_MAX_LEVEL - 1) {
2122 		parent = path->nodes[level + 1];
2123 		pslot = path->slots[level + 1];
2124 	}
2125 
2126 	if (!parent)
2127 		return 1;
2128 
2129 	left = read_node_slot(root, parent, pslot - 1);
2130 
2131 	/* first, try to make some room in the middle buffer */
2132 	if (left) {
2133 		u32 left_nr;
2134 
2135 		btrfs_tree_lock(left);
2136 		btrfs_set_lock_blocking(left);
2137 
2138 		left_nr = btrfs_header_nritems(left);
2139 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2140 			wret = 1;
2141 		} else {
2142 			ret = btrfs_cow_block(trans, root, left, parent,
2143 					      pslot - 1, &left);
2144 			if (ret)
2145 				wret = 1;
2146 			else {
2147 				wret = push_node_left(trans, root,
2148 						      left, mid, 0);
2149 			}
2150 		}
2151 		if (wret < 0)
2152 			ret = wret;
2153 		if (wret == 0) {
2154 			struct btrfs_disk_key disk_key;
2155 			orig_slot += left_nr;
2156 			btrfs_node_key(mid, &disk_key, 0);
2157 			tree_mod_log_set_node_key(root->fs_info, parent,
2158 						  pslot, 0);
2159 			btrfs_set_node_key(parent, &disk_key, pslot);
2160 			btrfs_mark_buffer_dirty(parent);
2161 			if (btrfs_header_nritems(left) > orig_slot) {
2162 				path->nodes[level] = left;
2163 				path->slots[level + 1] -= 1;
2164 				path->slots[level] = orig_slot;
2165 				btrfs_tree_unlock(mid);
2166 				free_extent_buffer(mid);
2167 			} else {
2168 				orig_slot -=
2169 					btrfs_header_nritems(left);
2170 				path->slots[level] = orig_slot;
2171 				btrfs_tree_unlock(left);
2172 				free_extent_buffer(left);
2173 			}
2174 			return 0;
2175 		}
2176 		btrfs_tree_unlock(left);
2177 		free_extent_buffer(left);
2178 	}
2179 	right = read_node_slot(root, parent, pslot + 1);
2180 
2181 	/*
2182 	 * then try to empty the right most buffer into the middle
2183 	 */
2184 	if (right) {
2185 		u32 right_nr;
2186 
2187 		btrfs_tree_lock(right);
2188 		btrfs_set_lock_blocking(right);
2189 
2190 		right_nr = btrfs_header_nritems(right);
2191 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2192 			wret = 1;
2193 		} else {
2194 			ret = btrfs_cow_block(trans, root, right,
2195 					      parent, pslot + 1,
2196 					      &right);
2197 			if (ret)
2198 				wret = 1;
2199 			else {
2200 				wret = balance_node_right(trans, root,
2201 							  right, mid);
2202 			}
2203 		}
2204 		if (wret < 0)
2205 			ret = wret;
2206 		if (wret == 0) {
2207 			struct btrfs_disk_key disk_key;
2208 
2209 			btrfs_node_key(right, &disk_key, 0);
2210 			tree_mod_log_set_node_key(root->fs_info, parent,
2211 						  pslot + 1, 0);
2212 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2213 			btrfs_mark_buffer_dirty(parent);
2214 
2215 			if (btrfs_header_nritems(mid) <= orig_slot) {
2216 				path->nodes[level] = right;
2217 				path->slots[level + 1] += 1;
2218 				path->slots[level] = orig_slot -
2219 					btrfs_header_nritems(mid);
2220 				btrfs_tree_unlock(mid);
2221 				free_extent_buffer(mid);
2222 			} else {
2223 				btrfs_tree_unlock(right);
2224 				free_extent_buffer(right);
2225 			}
2226 			return 0;
2227 		}
2228 		btrfs_tree_unlock(right);
2229 		free_extent_buffer(right);
2230 	}
2231 	return 1;
2232 }
2233 
2234 /*
2235  * readahead one full node of leaves, finding things that are close
2236  * to the block in 'slot', and triggering ra on them.
2237  */
2238 static void reada_for_search(struct btrfs_root *root,
2239 			     struct btrfs_path *path,
2240 			     int level, int slot, u64 objectid)
2241 {
2242 	struct extent_buffer *node;
2243 	struct btrfs_disk_key disk_key;
2244 	u32 nritems;
2245 	u64 search;
2246 	u64 target;
2247 	u64 nread = 0;
2248 	u64 gen;
2249 	int direction = path->reada;
2250 	struct extent_buffer *eb;
2251 	u32 nr;
2252 	u32 blocksize;
2253 	u32 nscan = 0;
2254 
2255 	if (level != 1)
2256 		return;
2257 
2258 	if (!path->nodes[level])
2259 		return;
2260 
2261 	node = path->nodes[level];
2262 
2263 	search = btrfs_node_blockptr(node, slot);
2264 	blocksize = root->nodesize;
2265 	eb = btrfs_find_tree_block(root->fs_info, search);
2266 	if (eb) {
2267 		free_extent_buffer(eb);
2268 		return;
2269 	}
2270 
2271 	target = search;
2272 
2273 	nritems = btrfs_header_nritems(node);
2274 	nr = slot;
2275 
2276 	while (1) {
2277 		if (direction < 0) {
2278 			if (nr == 0)
2279 				break;
2280 			nr--;
2281 		} else if (direction > 0) {
2282 			nr++;
2283 			if (nr >= nritems)
2284 				break;
2285 		}
2286 		if (path->reada < 0 && objectid) {
2287 			btrfs_node_key(node, &disk_key, nr);
2288 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2289 				break;
2290 		}
2291 		search = btrfs_node_blockptr(node, nr);
2292 		if ((search <= target && target - search <= 65536) ||
2293 		    (search > target && search - target <= 65536)) {
2294 			gen = btrfs_node_ptr_generation(node, nr);
2295 			readahead_tree_block(root, search);
2296 			nread += blocksize;
2297 		}
2298 		nscan++;
2299 		if ((nread > 65536 || nscan > 32))
2300 			break;
2301 	}
2302 }
2303 
2304 static noinline void reada_for_balance(struct btrfs_root *root,
2305 				       struct btrfs_path *path, int level)
2306 {
2307 	int slot;
2308 	int nritems;
2309 	struct extent_buffer *parent;
2310 	struct extent_buffer *eb;
2311 	u64 gen;
2312 	u64 block1 = 0;
2313 	u64 block2 = 0;
2314 
2315 	parent = path->nodes[level + 1];
2316 	if (!parent)
2317 		return;
2318 
2319 	nritems = btrfs_header_nritems(parent);
2320 	slot = path->slots[level + 1];
2321 
2322 	if (slot > 0) {
2323 		block1 = btrfs_node_blockptr(parent, slot - 1);
2324 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2325 		eb = btrfs_find_tree_block(root->fs_info, block1);
2326 		/*
2327 		 * if we get -eagain from btrfs_buffer_uptodate, we
2328 		 * don't want to return eagain here.  That will loop
2329 		 * forever
2330 		 */
2331 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2332 			block1 = 0;
2333 		free_extent_buffer(eb);
2334 	}
2335 	if (slot + 1 < nritems) {
2336 		block2 = btrfs_node_blockptr(parent, slot + 1);
2337 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2338 		eb = btrfs_find_tree_block(root->fs_info, block2);
2339 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2340 			block2 = 0;
2341 		free_extent_buffer(eb);
2342 	}
2343 
2344 	if (block1)
2345 		readahead_tree_block(root, block1);
2346 	if (block2)
2347 		readahead_tree_block(root, block2);
2348 }
2349 
2350 
2351 /*
2352  * when we walk down the tree, it is usually safe to unlock the higher layers
2353  * in the tree.  The exceptions are when our path goes through slot 0, because
2354  * operations on the tree might require changing key pointers higher up in the
2355  * tree.
2356  *
2357  * callers might also have set path->keep_locks, which tells this code to keep
2358  * the lock if the path points to the last slot in the block.  This is part of
2359  * walking through the tree, and selecting the next slot in the higher block.
2360  *
2361  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2362  * if lowest_unlock is 1, level 0 won't be unlocked
2363  */
2364 static noinline void unlock_up(struct btrfs_path *path, int level,
2365 			       int lowest_unlock, int min_write_lock_level,
2366 			       int *write_lock_level)
2367 {
2368 	int i;
2369 	int skip_level = level;
2370 	int no_skips = 0;
2371 	struct extent_buffer *t;
2372 
2373 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2374 		if (!path->nodes[i])
2375 			break;
2376 		if (!path->locks[i])
2377 			break;
2378 		if (!no_skips && path->slots[i] == 0) {
2379 			skip_level = i + 1;
2380 			continue;
2381 		}
2382 		if (!no_skips && path->keep_locks) {
2383 			u32 nritems;
2384 			t = path->nodes[i];
2385 			nritems = btrfs_header_nritems(t);
2386 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2387 				skip_level = i + 1;
2388 				continue;
2389 			}
2390 		}
2391 		if (skip_level < i && i >= lowest_unlock)
2392 			no_skips = 1;
2393 
2394 		t = path->nodes[i];
2395 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2396 			btrfs_tree_unlock_rw(t, path->locks[i]);
2397 			path->locks[i] = 0;
2398 			if (write_lock_level &&
2399 			    i > min_write_lock_level &&
2400 			    i <= *write_lock_level) {
2401 				*write_lock_level = i - 1;
2402 			}
2403 		}
2404 	}
2405 }
2406 
2407 /*
2408  * This releases any locks held in the path starting at level and
2409  * going all the way up to the root.
2410  *
2411  * btrfs_search_slot will keep the lock held on higher nodes in a few
2412  * corner cases, such as COW of the block at slot zero in the node.  This
2413  * ignores those rules, and it should only be called when there are no
2414  * more updates to be done higher up in the tree.
2415  */
2416 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2417 {
2418 	int i;
2419 
2420 	if (path->keep_locks)
2421 		return;
2422 
2423 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2424 		if (!path->nodes[i])
2425 			continue;
2426 		if (!path->locks[i])
2427 			continue;
2428 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2429 		path->locks[i] = 0;
2430 	}
2431 }
2432 
2433 /*
2434  * helper function for btrfs_search_slot.  The goal is to find a block
2435  * in cache without setting the path to blocking.  If we find the block
2436  * we return zero and the path is unchanged.
2437  *
2438  * If we can't find the block, we set the path blocking and do some
2439  * reada.  -EAGAIN is returned and the search must be repeated.
2440  */
2441 static int
2442 read_block_for_search(struct btrfs_trans_handle *trans,
2443 		       struct btrfs_root *root, struct btrfs_path *p,
2444 		       struct extent_buffer **eb_ret, int level, int slot,
2445 		       struct btrfs_key *key, u64 time_seq)
2446 {
2447 	u64 blocknr;
2448 	u64 gen;
2449 	struct extent_buffer *b = *eb_ret;
2450 	struct extent_buffer *tmp;
2451 	int ret;
2452 
2453 	blocknr = btrfs_node_blockptr(b, slot);
2454 	gen = btrfs_node_ptr_generation(b, slot);
2455 
2456 	tmp = btrfs_find_tree_block(root->fs_info, blocknr);
2457 	if (tmp) {
2458 		/* first we do an atomic uptodate check */
2459 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2460 			*eb_ret = tmp;
2461 			return 0;
2462 		}
2463 
2464 		/* the pages were up to date, but we failed
2465 		 * the generation number check.  Do a full
2466 		 * read for the generation number that is correct.
2467 		 * We must do this without dropping locks so
2468 		 * we can trust our generation number
2469 		 */
2470 		btrfs_set_path_blocking(p);
2471 
2472 		/* now we're allowed to do a blocking uptodate check */
2473 		ret = btrfs_read_buffer(tmp, gen);
2474 		if (!ret) {
2475 			*eb_ret = tmp;
2476 			return 0;
2477 		}
2478 		free_extent_buffer(tmp);
2479 		btrfs_release_path(p);
2480 		return -EIO;
2481 	}
2482 
2483 	/*
2484 	 * reduce lock contention at high levels
2485 	 * of the btree by dropping locks before
2486 	 * we read.  Don't release the lock on the current
2487 	 * level because we need to walk this node to figure
2488 	 * out which blocks to read.
2489 	 */
2490 	btrfs_unlock_up_safe(p, level + 1);
2491 	btrfs_set_path_blocking(p);
2492 
2493 	free_extent_buffer(tmp);
2494 	if (p->reada)
2495 		reada_for_search(root, p, level, slot, key->objectid);
2496 
2497 	btrfs_release_path(p);
2498 
2499 	ret = -EAGAIN;
2500 	tmp = read_tree_block(root, blocknr, 0);
2501 	if (!IS_ERR(tmp)) {
2502 		/*
2503 		 * If the read above didn't mark this buffer up to date,
2504 		 * it will never end up being up to date.  Set ret to EIO now
2505 		 * and give up so that our caller doesn't loop forever
2506 		 * on our EAGAINs.
2507 		 */
2508 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2509 			ret = -EIO;
2510 		free_extent_buffer(tmp);
2511 	}
2512 	return ret;
2513 }
2514 
2515 /*
2516  * helper function for btrfs_search_slot.  This does all of the checks
2517  * for node-level blocks and does any balancing required based on
2518  * the ins_len.
2519  *
2520  * If no extra work was required, zero is returned.  If we had to
2521  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2522  * start over
2523  */
2524 static int
2525 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2526 		       struct btrfs_root *root, struct btrfs_path *p,
2527 		       struct extent_buffer *b, int level, int ins_len,
2528 		       int *write_lock_level)
2529 {
2530 	int ret;
2531 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2532 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2533 		int sret;
2534 
2535 		if (*write_lock_level < level + 1) {
2536 			*write_lock_level = level + 1;
2537 			btrfs_release_path(p);
2538 			goto again;
2539 		}
2540 
2541 		btrfs_set_path_blocking(p);
2542 		reada_for_balance(root, p, level);
2543 		sret = split_node(trans, root, p, level);
2544 		btrfs_clear_path_blocking(p, NULL, 0);
2545 
2546 		BUG_ON(sret > 0);
2547 		if (sret) {
2548 			ret = sret;
2549 			goto done;
2550 		}
2551 		b = p->nodes[level];
2552 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2553 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2554 		int sret;
2555 
2556 		if (*write_lock_level < level + 1) {
2557 			*write_lock_level = level + 1;
2558 			btrfs_release_path(p);
2559 			goto again;
2560 		}
2561 
2562 		btrfs_set_path_blocking(p);
2563 		reada_for_balance(root, p, level);
2564 		sret = balance_level(trans, root, p, level);
2565 		btrfs_clear_path_blocking(p, NULL, 0);
2566 
2567 		if (sret) {
2568 			ret = sret;
2569 			goto done;
2570 		}
2571 		b = p->nodes[level];
2572 		if (!b) {
2573 			btrfs_release_path(p);
2574 			goto again;
2575 		}
2576 		BUG_ON(btrfs_header_nritems(b) == 1);
2577 	}
2578 	return 0;
2579 
2580 again:
2581 	ret = -EAGAIN;
2582 done:
2583 	return ret;
2584 }
2585 
2586 static void key_search_validate(struct extent_buffer *b,
2587 				struct btrfs_key *key,
2588 				int level)
2589 {
2590 #ifdef CONFIG_BTRFS_ASSERT
2591 	struct btrfs_disk_key disk_key;
2592 
2593 	btrfs_cpu_key_to_disk(&disk_key, key);
2594 
2595 	if (level == 0)
2596 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2597 		    offsetof(struct btrfs_leaf, items[0].key),
2598 		    sizeof(disk_key)));
2599 	else
2600 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2601 		    offsetof(struct btrfs_node, ptrs[0].key),
2602 		    sizeof(disk_key)));
2603 #endif
2604 }
2605 
2606 static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2607 		      int level, int *prev_cmp, int *slot)
2608 {
2609 	if (*prev_cmp != 0) {
2610 		*prev_cmp = bin_search(b, key, level, slot);
2611 		return *prev_cmp;
2612 	}
2613 
2614 	key_search_validate(b, key, level);
2615 	*slot = 0;
2616 
2617 	return 0;
2618 }
2619 
2620 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2621 		u64 iobjectid, u64 ioff, u8 key_type,
2622 		struct btrfs_key *found_key)
2623 {
2624 	int ret;
2625 	struct btrfs_key key;
2626 	struct extent_buffer *eb;
2627 
2628 	ASSERT(path);
2629 	ASSERT(found_key);
2630 
2631 	key.type = key_type;
2632 	key.objectid = iobjectid;
2633 	key.offset = ioff;
2634 
2635 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2636 	if (ret < 0)
2637 		return ret;
2638 
2639 	eb = path->nodes[0];
2640 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2641 		ret = btrfs_next_leaf(fs_root, path);
2642 		if (ret)
2643 			return ret;
2644 		eb = path->nodes[0];
2645 	}
2646 
2647 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2648 	if (found_key->type != key.type ||
2649 			found_key->objectid != key.objectid)
2650 		return 1;
2651 
2652 	return 0;
2653 }
2654 
2655 /*
2656  * look for key in the tree.  path is filled in with nodes along the way
2657  * if key is found, we return zero and you can find the item in the leaf
2658  * level of the path (level 0)
2659  *
2660  * If the key isn't found, the path points to the slot where it should
2661  * be inserted, and 1 is returned.  If there are other errors during the
2662  * search a negative error number is returned.
2663  *
2664  * if ins_len > 0, nodes and leaves will be split as we walk down the
2665  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2666  * possible)
2667  */
2668 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2669 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
2670 		      ins_len, int cow)
2671 {
2672 	struct extent_buffer *b;
2673 	int slot;
2674 	int ret;
2675 	int err;
2676 	int level;
2677 	int lowest_unlock = 1;
2678 	int root_lock;
2679 	/* everything at write_lock_level or lower must be write locked */
2680 	int write_lock_level = 0;
2681 	u8 lowest_level = 0;
2682 	int min_write_lock_level;
2683 	int prev_cmp;
2684 
2685 	lowest_level = p->lowest_level;
2686 	WARN_ON(lowest_level && ins_len > 0);
2687 	WARN_ON(p->nodes[0] != NULL);
2688 	BUG_ON(!cow && ins_len);
2689 
2690 	if (ins_len < 0) {
2691 		lowest_unlock = 2;
2692 
2693 		/* when we are removing items, we might have to go up to level
2694 		 * two as we update tree pointers  Make sure we keep write
2695 		 * for those levels as well
2696 		 */
2697 		write_lock_level = 2;
2698 	} else if (ins_len > 0) {
2699 		/*
2700 		 * for inserting items, make sure we have a write lock on
2701 		 * level 1 so we can update keys
2702 		 */
2703 		write_lock_level = 1;
2704 	}
2705 
2706 	if (!cow)
2707 		write_lock_level = -1;
2708 
2709 	if (cow && (p->keep_locks || p->lowest_level))
2710 		write_lock_level = BTRFS_MAX_LEVEL;
2711 
2712 	min_write_lock_level = write_lock_level;
2713 
2714 again:
2715 	prev_cmp = -1;
2716 	/*
2717 	 * we try very hard to do read locks on the root
2718 	 */
2719 	root_lock = BTRFS_READ_LOCK;
2720 	level = 0;
2721 	if (p->search_commit_root) {
2722 		/*
2723 		 * the commit roots are read only
2724 		 * so we always do read locks
2725 		 */
2726 		if (p->need_commit_sem)
2727 			down_read(&root->fs_info->commit_root_sem);
2728 		b = root->commit_root;
2729 		extent_buffer_get(b);
2730 		level = btrfs_header_level(b);
2731 		if (p->need_commit_sem)
2732 			up_read(&root->fs_info->commit_root_sem);
2733 		if (!p->skip_locking)
2734 			btrfs_tree_read_lock(b);
2735 	} else {
2736 		if (p->skip_locking) {
2737 			b = btrfs_root_node(root);
2738 			level = btrfs_header_level(b);
2739 		} else {
2740 			/* we don't know the level of the root node
2741 			 * until we actually have it read locked
2742 			 */
2743 			b = btrfs_read_lock_root_node(root);
2744 			level = btrfs_header_level(b);
2745 			if (level <= write_lock_level) {
2746 				/* whoops, must trade for write lock */
2747 				btrfs_tree_read_unlock(b);
2748 				free_extent_buffer(b);
2749 				b = btrfs_lock_root_node(root);
2750 				root_lock = BTRFS_WRITE_LOCK;
2751 
2752 				/* the level might have changed, check again */
2753 				level = btrfs_header_level(b);
2754 			}
2755 		}
2756 	}
2757 	p->nodes[level] = b;
2758 	if (!p->skip_locking)
2759 		p->locks[level] = root_lock;
2760 
2761 	while (b) {
2762 		level = btrfs_header_level(b);
2763 
2764 		/*
2765 		 * setup the path here so we can release it under lock
2766 		 * contention with the cow code
2767 		 */
2768 		if (cow) {
2769 			/*
2770 			 * if we don't really need to cow this block
2771 			 * then we don't want to set the path blocking,
2772 			 * so we test it here
2773 			 */
2774 			if (!should_cow_block(trans, root, b))
2775 				goto cow_done;
2776 
2777 			/*
2778 			 * must have write locks on this node and the
2779 			 * parent
2780 			 */
2781 			if (level > write_lock_level ||
2782 			    (level + 1 > write_lock_level &&
2783 			    level + 1 < BTRFS_MAX_LEVEL &&
2784 			    p->nodes[level + 1])) {
2785 				write_lock_level = level + 1;
2786 				btrfs_release_path(p);
2787 				goto again;
2788 			}
2789 
2790 			btrfs_set_path_blocking(p);
2791 			err = btrfs_cow_block(trans, root, b,
2792 					      p->nodes[level + 1],
2793 					      p->slots[level + 1], &b);
2794 			if (err) {
2795 				ret = err;
2796 				goto done;
2797 			}
2798 		}
2799 cow_done:
2800 		p->nodes[level] = b;
2801 		btrfs_clear_path_blocking(p, NULL, 0);
2802 
2803 		/*
2804 		 * we have a lock on b and as long as we aren't changing
2805 		 * the tree, there is no way to for the items in b to change.
2806 		 * It is safe to drop the lock on our parent before we
2807 		 * go through the expensive btree search on b.
2808 		 *
2809 		 * If we're inserting or deleting (ins_len != 0), then we might
2810 		 * be changing slot zero, which may require changing the parent.
2811 		 * So, we can't drop the lock until after we know which slot
2812 		 * we're operating on.
2813 		 */
2814 		if (!ins_len && !p->keep_locks) {
2815 			int u = level + 1;
2816 
2817 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2818 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2819 				p->locks[u] = 0;
2820 			}
2821 		}
2822 
2823 		ret = key_search(b, key, level, &prev_cmp, &slot);
2824 
2825 		if (level != 0) {
2826 			int dec = 0;
2827 			if (ret && slot > 0) {
2828 				dec = 1;
2829 				slot -= 1;
2830 			}
2831 			p->slots[level] = slot;
2832 			err = setup_nodes_for_search(trans, root, p, b, level,
2833 					     ins_len, &write_lock_level);
2834 			if (err == -EAGAIN)
2835 				goto again;
2836 			if (err) {
2837 				ret = err;
2838 				goto done;
2839 			}
2840 			b = p->nodes[level];
2841 			slot = p->slots[level];
2842 
2843 			/*
2844 			 * slot 0 is special, if we change the key
2845 			 * we have to update the parent pointer
2846 			 * which means we must have a write lock
2847 			 * on the parent
2848 			 */
2849 			if (slot == 0 && ins_len &&
2850 			    write_lock_level < level + 1) {
2851 				write_lock_level = level + 1;
2852 				btrfs_release_path(p);
2853 				goto again;
2854 			}
2855 
2856 			unlock_up(p, level, lowest_unlock,
2857 				  min_write_lock_level, &write_lock_level);
2858 
2859 			if (level == lowest_level) {
2860 				if (dec)
2861 					p->slots[level]++;
2862 				goto done;
2863 			}
2864 
2865 			err = read_block_for_search(trans, root, p,
2866 						    &b, level, slot, key, 0);
2867 			if (err == -EAGAIN)
2868 				goto again;
2869 			if (err) {
2870 				ret = err;
2871 				goto done;
2872 			}
2873 
2874 			if (!p->skip_locking) {
2875 				level = btrfs_header_level(b);
2876 				if (level <= write_lock_level) {
2877 					err = btrfs_try_tree_write_lock(b);
2878 					if (!err) {
2879 						btrfs_set_path_blocking(p);
2880 						btrfs_tree_lock(b);
2881 						btrfs_clear_path_blocking(p, b,
2882 								  BTRFS_WRITE_LOCK);
2883 					}
2884 					p->locks[level] = BTRFS_WRITE_LOCK;
2885 				} else {
2886 					err = btrfs_tree_read_lock_atomic(b);
2887 					if (!err) {
2888 						btrfs_set_path_blocking(p);
2889 						btrfs_tree_read_lock(b);
2890 						btrfs_clear_path_blocking(p, b,
2891 								  BTRFS_READ_LOCK);
2892 					}
2893 					p->locks[level] = BTRFS_READ_LOCK;
2894 				}
2895 				p->nodes[level] = b;
2896 			}
2897 		} else {
2898 			p->slots[level] = slot;
2899 			if (ins_len > 0 &&
2900 			    btrfs_leaf_free_space(root, b) < ins_len) {
2901 				if (write_lock_level < 1) {
2902 					write_lock_level = 1;
2903 					btrfs_release_path(p);
2904 					goto again;
2905 				}
2906 
2907 				btrfs_set_path_blocking(p);
2908 				err = split_leaf(trans, root, key,
2909 						 p, ins_len, ret == 0);
2910 				btrfs_clear_path_blocking(p, NULL, 0);
2911 
2912 				BUG_ON(err > 0);
2913 				if (err) {
2914 					ret = err;
2915 					goto done;
2916 				}
2917 			}
2918 			if (!p->search_for_split)
2919 				unlock_up(p, level, lowest_unlock,
2920 					  min_write_lock_level, &write_lock_level);
2921 			goto done;
2922 		}
2923 	}
2924 	ret = 1;
2925 done:
2926 	/*
2927 	 * we don't really know what they plan on doing with the path
2928 	 * from here on, so for now just mark it as blocking
2929 	 */
2930 	if (!p->leave_spinning)
2931 		btrfs_set_path_blocking(p);
2932 	if (ret < 0 && !p->skip_release_on_error)
2933 		btrfs_release_path(p);
2934 	return ret;
2935 }
2936 
2937 /*
2938  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2939  * current state of the tree together with the operations recorded in the tree
2940  * modification log to search for the key in a previous version of this tree, as
2941  * denoted by the time_seq parameter.
2942  *
2943  * Naturally, there is no support for insert, delete or cow operations.
2944  *
2945  * The resulting path and return value will be set up as if we called
2946  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2947  */
2948 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2949 			  struct btrfs_path *p, u64 time_seq)
2950 {
2951 	struct extent_buffer *b;
2952 	int slot;
2953 	int ret;
2954 	int err;
2955 	int level;
2956 	int lowest_unlock = 1;
2957 	u8 lowest_level = 0;
2958 	int prev_cmp = -1;
2959 
2960 	lowest_level = p->lowest_level;
2961 	WARN_ON(p->nodes[0] != NULL);
2962 
2963 	if (p->search_commit_root) {
2964 		BUG_ON(time_seq);
2965 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2966 	}
2967 
2968 again:
2969 	b = get_old_root(root, time_seq);
2970 	level = btrfs_header_level(b);
2971 	p->locks[level] = BTRFS_READ_LOCK;
2972 
2973 	while (b) {
2974 		level = btrfs_header_level(b);
2975 		p->nodes[level] = b;
2976 		btrfs_clear_path_blocking(p, NULL, 0);
2977 
2978 		/*
2979 		 * we have a lock on b and as long as we aren't changing
2980 		 * the tree, there is no way to for the items in b to change.
2981 		 * It is safe to drop the lock on our parent before we
2982 		 * go through the expensive btree search on b.
2983 		 */
2984 		btrfs_unlock_up_safe(p, level + 1);
2985 
2986 		/*
2987 		 * Since we can unwind eb's we want to do a real search every
2988 		 * time.
2989 		 */
2990 		prev_cmp = -1;
2991 		ret = key_search(b, key, level, &prev_cmp, &slot);
2992 
2993 		if (level != 0) {
2994 			int dec = 0;
2995 			if (ret && slot > 0) {
2996 				dec = 1;
2997 				slot -= 1;
2998 			}
2999 			p->slots[level] = slot;
3000 			unlock_up(p, level, lowest_unlock, 0, NULL);
3001 
3002 			if (level == lowest_level) {
3003 				if (dec)
3004 					p->slots[level]++;
3005 				goto done;
3006 			}
3007 
3008 			err = read_block_for_search(NULL, root, p, &b, level,
3009 						    slot, key, time_seq);
3010 			if (err == -EAGAIN)
3011 				goto again;
3012 			if (err) {
3013 				ret = err;
3014 				goto done;
3015 			}
3016 
3017 			level = btrfs_header_level(b);
3018 			err = btrfs_tree_read_lock_atomic(b);
3019 			if (!err) {
3020 				btrfs_set_path_blocking(p);
3021 				btrfs_tree_read_lock(b);
3022 				btrfs_clear_path_blocking(p, b,
3023 							  BTRFS_READ_LOCK);
3024 			}
3025 			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3026 			if (!b) {
3027 				ret = -ENOMEM;
3028 				goto done;
3029 			}
3030 			p->locks[level] = BTRFS_READ_LOCK;
3031 			p->nodes[level] = b;
3032 		} else {
3033 			p->slots[level] = slot;
3034 			unlock_up(p, level, lowest_unlock, 0, NULL);
3035 			goto done;
3036 		}
3037 	}
3038 	ret = 1;
3039 done:
3040 	if (!p->leave_spinning)
3041 		btrfs_set_path_blocking(p);
3042 	if (ret < 0)
3043 		btrfs_release_path(p);
3044 
3045 	return ret;
3046 }
3047 
3048 /*
3049  * helper to use instead of search slot if no exact match is needed but
3050  * instead the next or previous item should be returned.
3051  * When find_higher is true, the next higher item is returned, the next lower
3052  * otherwise.
3053  * When return_any and find_higher are both true, and no higher item is found,
3054  * return the next lower instead.
3055  * When return_any is true and find_higher is false, and no lower item is found,
3056  * return the next higher instead.
3057  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3058  * < 0 on error
3059  */
3060 int btrfs_search_slot_for_read(struct btrfs_root *root,
3061 			       struct btrfs_key *key, struct btrfs_path *p,
3062 			       int find_higher, int return_any)
3063 {
3064 	int ret;
3065 	struct extent_buffer *leaf;
3066 
3067 again:
3068 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3069 	if (ret <= 0)
3070 		return ret;
3071 	/*
3072 	 * a return value of 1 means the path is at the position where the
3073 	 * item should be inserted. Normally this is the next bigger item,
3074 	 * but in case the previous item is the last in a leaf, path points
3075 	 * to the first free slot in the previous leaf, i.e. at an invalid
3076 	 * item.
3077 	 */
3078 	leaf = p->nodes[0];
3079 
3080 	if (find_higher) {
3081 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3082 			ret = btrfs_next_leaf(root, p);
3083 			if (ret <= 0)
3084 				return ret;
3085 			if (!return_any)
3086 				return 1;
3087 			/*
3088 			 * no higher item found, return the next
3089 			 * lower instead
3090 			 */
3091 			return_any = 0;
3092 			find_higher = 0;
3093 			btrfs_release_path(p);
3094 			goto again;
3095 		}
3096 	} else {
3097 		if (p->slots[0] == 0) {
3098 			ret = btrfs_prev_leaf(root, p);
3099 			if (ret < 0)
3100 				return ret;
3101 			if (!ret) {
3102 				leaf = p->nodes[0];
3103 				if (p->slots[0] == btrfs_header_nritems(leaf))
3104 					p->slots[0]--;
3105 				return 0;
3106 			}
3107 			if (!return_any)
3108 				return 1;
3109 			/*
3110 			 * no lower item found, return the next
3111 			 * higher instead
3112 			 */
3113 			return_any = 0;
3114 			find_higher = 1;
3115 			btrfs_release_path(p);
3116 			goto again;
3117 		} else {
3118 			--p->slots[0];
3119 		}
3120 	}
3121 	return 0;
3122 }
3123 
3124 /*
3125  * adjust the pointers going up the tree, starting at level
3126  * making sure the right key of each node is points to 'key'.
3127  * This is used after shifting pointers to the left, so it stops
3128  * fixing up pointers when a given leaf/node is not in slot 0 of the
3129  * higher levels
3130  *
3131  */
3132 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3133 			   struct btrfs_path *path,
3134 			   struct btrfs_disk_key *key, int level)
3135 {
3136 	int i;
3137 	struct extent_buffer *t;
3138 
3139 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3140 		int tslot = path->slots[i];
3141 		if (!path->nodes[i])
3142 			break;
3143 		t = path->nodes[i];
3144 		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3145 		btrfs_set_node_key(t, key, tslot);
3146 		btrfs_mark_buffer_dirty(path->nodes[i]);
3147 		if (tslot != 0)
3148 			break;
3149 	}
3150 }
3151 
3152 /*
3153  * update item key.
3154  *
3155  * This function isn't completely safe. It's the caller's responsibility
3156  * that the new key won't break the order
3157  */
3158 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3159 			     struct btrfs_path *path,
3160 			     struct btrfs_key *new_key)
3161 {
3162 	struct btrfs_disk_key disk_key;
3163 	struct extent_buffer *eb;
3164 	int slot;
3165 
3166 	eb = path->nodes[0];
3167 	slot = path->slots[0];
3168 	if (slot > 0) {
3169 		btrfs_item_key(eb, &disk_key, slot - 1);
3170 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3171 	}
3172 	if (slot < btrfs_header_nritems(eb) - 1) {
3173 		btrfs_item_key(eb, &disk_key, slot + 1);
3174 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3175 	}
3176 
3177 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3178 	btrfs_set_item_key(eb, &disk_key, slot);
3179 	btrfs_mark_buffer_dirty(eb);
3180 	if (slot == 0)
3181 		fixup_low_keys(fs_info, path, &disk_key, 1);
3182 }
3183 
3184 /*
3185  * try to push data from one node into the next node left in the
3186  * tree.
3187  *
3188  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3189  * error, and > 0 if there was no room in the left hand block.
3190  */
3191 static int push_node_left(struct btrfs_trans_handle *trans,
3192 			  struct btrfs_root *root, struct extent_buffer *dst,
3193 			  struct extent_buffer *src, int empty)
3194 {
3195 	int push_items = 0;
3196 	int src_nritems;
3197 	int dst_nritems;
3198 	int ret = 0;
3199 
3200 	src_nritems = btrfs_header_nritems(src);
3201 	dst_nritems = btrfs_header_nritems(dst);
3202 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3203 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3204 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3205 
3206 	if (!empty && src_nritems <= 8)
3207 		return 1;
3208 
3209 	if (push_items <= 0)
3210 		return 1;
3211 
3212 	if (empty) {
3213 		push_items = min(src_nritems, push_items);
3214 		if (push_items < src_nritems) {
3215 			/* leave at least 8 pointers in the node if
3216 			 * we aren't going to empty it
3217 			 */
3218 			if (src_nritems - push_items < 8) {
3219 				if (push_items <= 8)
3220 					return 1;
3221 				push_items -= 8;
3222 			}
3223 		}
3224 	} else
3225 		push_items = min(src_nritems - 8, push_items);
3226 
3227 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3228 				   push_items);
3229 	if (ret) {
3230 		btrfs_abort_transaction(trans, root, ret);
3231 		return ret;
3232 	}
3233 	copy_extent_buffer(dst, src,
3234 			   btrfs_node_key_ptr_offset(dst_nritems),
3235 			   btrfs_node_key_ptr_offset(0),
3236 			   push_items * sizeof(struct btrfs_key_ptr));
3237 
3238 	if (push_items < src_nritems) {
3239 		/*
3240 		 * don't call tree_mod_log_eb_move here, key removal was already
3241 		 * fully logged by tree_mod_log_eb_copy above.
3242 		 */
3243 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3244 				      btrfs_node_key_ptr_offset(push_items),
3245 				      (src_nritems - push_items) *
3246 				      sizeof(struct btrfs_key_ptr));
3247 	}
3248 	btrfs_set_header_nritems(src, src_nritems - push_items);
3249 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3250 	btrfs_mark_buffer_dirty(src);
3251 	btrfs_mark_buffer_dirty(dst);
3252 
3253 	return ret;
3254 }
3255 
3256 /*
3257  * try to push data from one node into the next node right in the
3258  * tree.
3259  *
3260  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3261  * error, and > 0 if there was no room in the right hand block.
3262  *
3263  * this will  only push up to 1/2 the contents of the left node over
3264  */
3265 static int balance_node_right(struct btrfs_trans_handle *trans,
3266 			      struct btrfs_root *root,
3267 			      struct extent_buffer *dst,
3268 			      struct extent_buffer *src)
3269 {
3270 	int push_items = 0;
3271 	int max_push;
3272 	int src_nritems;
3273 	int dst_nritems;
3274 	int ret = 0;
3275 
3276 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3277 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3278 
3279 	src_nritems = btrfs_header_nritems(src);
3280 	dst_nritems = btrfs_header_nritems(dst);
3281 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3282 	if (push_items <= 0)
3283 		return 1;
3284 
3285 	if (src_nritems < 4)
3286 		return 1;
3287 
3288 	max_push = src_nritems / 2 + 1;
3289 	/* don't try to empty the node */
3290 	if (max_push >= src_nritems)
3291 		return 1;
3292 
3293 	if (max_push < push_items)
3294 		push_items = max_push;
3295 
3296 	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3297 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3298 				      btrfs_node_key_ptr_offset(0),
3299 				      (dst_nritems) *
3300 				      sizeof(struct btrfs_key_ptr));
3301 
3302 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3303 				   src_nritems - push_items, push_items);
3304 	if (ret) {
3305 		btrfs_abort_transaction(trans, root, ret);
3306 		return ret;
3307 	}
3308 	copy_extent_buffer(dst, src,
3309 			   btrfs_node_key_ptr_offset(0),
3310 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3311 			   push_items * sizeof(struct btrfs_key_ptr));
3312 
3313 	btrfs_set_header_nritems(src, src_nritems - push_items);
3314 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3315 
3316 	btrfs_mark_buffer_dirty(src);
3317 	btrfs_mark_buffer_dirty(dst);
3318 
3319 	return ret;
3320 }
3321 
3322 /*
3323  * helper function to insert a new root level in the tree.
3324  * A new node is allocated, and a single item is inserted to
3325  * point to the existing root
3326  *
3327  * returns zero on success or < 0 on failure.
3328  */
3329 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3330 			   struct btrfs_root *root,
3331 			   struct btrfs_path *path, int level)
3332 {
3333 	u64 lower_gen;
3334 	struct extent_buffer *lower;
3335 	struct extent_buffer *c;
3336 	struct extent_buffer *old;
3337 	struct btrfs_disk_key lower_key;
3338 
3339 	BUG_ON(path->nodes[level]);
3340 	BUG_ON(path->nodes[level-1] != root->node);
3341 
3342 	lower = path->nodes[level-1];
3343 	if (level == 1)
3344 		btrfs_item_key(lower, &lower_key, 0);
3345 	else
3346 		btrfs_node_key(lower, &lower_key, 0);
3347 
3348 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3349 				   &lower_key, level, root->node->start, 0);
3350 	if (IS_ERR(c))
3351 		return PTR_ERR(c);
3352 
3353 	root_add_used(root, root->nodesize);
3354 
3355 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3356 	btrfs_set_header_nritems(c, 1);
3357 	btrfs_set_header_level(c, level);
3358 	btrfs_set_header_bytenr(c, c->start);
3359 	btrfs_set_header_generation(c, trans->transid);
3360 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3361 	btrfs_set_header_owner(c, root->root_key.objectid);
3362 
3363 	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3364 			    BTRFS_FSID_SIZE);
3365 
3366 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3367 			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3368 
3369 	btrfs_set_node_key(c, &lower_key, 0);
3370 	btrfs_set_node_blockptr(c, 0, lower->start);
3371 	lower_gen = btrfs_header_generation(lower);
3372 	WARN_ON(lower_gen != trans->transid);
3373 
3374 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3375 
3376 	btrfs_mark_buffer_dirty(c);
3377 
3378 	old = root->node;
3379 	tree_mod_log_set_root_pointer(root, c, 0);
3380 	rcu_assign_pointer(root->node, c);
3381 
3382 	/* the super has an extra ref to root->node */
3383 	free_extent_buffer(old);
3384 
3385 	add_root_to_dirty_list(root);
3386 	extent_buffer_get(c);
3387 	path->nodes[level] = c;
3388 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3389 	path->slots[level] = 0;
3390 	return 0;
3391 }
3392 
3393 /*
3394  * worker function to insert a single pointer in a node.
3395  * the node should have enough room for the pointer already
3396  *
3397  * slot and level indicate where you want the key to go, and
3398  * blocknr is the block the key points to.
3399  */
3400 static void insert_ptr(struct btrfs_trans_handle *trans,
3401 		       struct btrfs_root *root, struct btrfs_path *path,
3402 		       struct btrfs_disk_key *key, u64 bytenr,
3403 		       int slot, int level)
3404 {
3405 	struct extent_buffer *lower;
3406 	int nritems;
3407 	int ret;
3408 
3409 	BUG_ON(!path->nodes[level]);
3410 	btrfs_assert_tree_locked(path->nodes[level]);
3411 	lower = path->nodes[level];
3412 	nritems = btrfs_header_nritems(lower);
3413 	BUG_ON(slot > nritems);
3414 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3415 	if (slot != nritems) {
3416 		if (level)
3417 			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3418 					     slot, nritems - slot);
3419 		memmove_extent_buffer(lower,
3420 			      btrfs_node_key_ptr_offset(slot + 1),
3421 			      btrfs_node_key_ptr_offset(slot),
3422 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3423 	}
3424 	if (level) {
3425 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3426 					      MOD_LOG_KEY_ADD, GFP_NOFS);
3427 		BUG_ON(ret < 0);
3428 	}
3429 	btrfs_set_node_key(lower, key, slot);
3430 	btrfs_set_node_blockptr(lower, slot, bytenr);
3431 	WARN_ON(trans->transid == 0);
3432 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3433 	btrfs_set_header_nritems(lower, nritems + 1);
3434 	btrfs_mark_buffer_dirty(lower);
3435 }
3436 
3437 /*
3438  * split the node at the specified level in path in two.
3439  * The path is corrected to point to the appropriate node after the split
3440  *
3441  * Before splitting this tries to make some room in the node by pushing
3442  * left and right, if either one works, it returns right away.
3443  *
3444  * returns 0 on success and < 0 on failure
3445  */
3446 static noinline int split_node(struct btrfs_trans_handle *trans,
3447 			       struct btrfs_root *root,
3448 			       struct btrfs_path *path, int level)
3449 {
3450 	struct extent_buffer *c;
3451 	struct extent_buffer *split;
3452 	struct btrfs_disk_key disk_key;
3453 	int mid;
3454 	int ret;
3455 	u32 c_nritems;
3456 
3457 	c = path->nodes[level];
3458 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3459 	if (c == root->node) {
3460 		/*
3461 		 * trying to split the root, lets make a new one
3462 		 *
3463 		 * tree mod log: We don't log_removal old root in
3464 		 * insert_new_root, because that root buffer will be kept as a
3465 		 * normal node. We are going to log removal of half of the
3466 		 * elements below with tree_mod_log_eb_copy. We're holding a
3467 		 * tree lock on the buffer, which is why we cannot race with
3468 		 * other tree_mod_log users.
3469 		 */
3470 		ret = insert_new_root(trans, root, path, level + 1);
3471 		if (ret)
3472 			return ret;
3473 	} else {
3474 		ret = push_nodes_for_insert(trans, root, path, level);
3475 		c = path->nodes[level];
3476 		if (!ret && btrfs_header_nritems(c) <
3477 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3478 			return 0;
3479 		if (ret < 0)
3480 			return ret;
3481 	}
3482 
3483 	c_nritems = btrfs_header_nritems(c);
3484 	mid = (c_nritems + 1) / 2;
3485 	btrfs_node_key(c, &disk_key, mid);
3486 
3487 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3488 			&disk_key, level, c->start, 0);
3489 	if (IS_ERR(split))
3490 		return PTR_ERR(split);
3491 
3492 	root_add_used(root, root->nodesize);
3493 
3494 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3495 	btrfs_set_header_level(split, btrfs_header_level(c));
3496 	btrfs_set_header_bytenr(split, split->start);
3497 	btrfs_set_header_generation(split, trans->transid);
3498 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3499 	btrfs_set_header_owner(split, root->root_key.objectid);
3500 	write_extent_buffer(split, root->fs_info->fsid,
3501 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
3502 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3503 			    btrfs_header_chunk_tree_uuid(split),
3504 			    BTRFS_UUID_SIZE);
3505 
3506 	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3507 				   mid, c_nritems - mid);
3508 	if (ret) {
3509 		btrfs_abort_transaction(trans, root, ret);
3510 		return ret;
3511 	}
3512 	copy_extent_buffer(split, c,
3513 			   btrfs_node_key_ptr_offset(0),
3514 			   btrfs_node_key_ptr_offset(mid),
3515 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3516 	btrfs_set_header_nritems(split, c_nritems - mid);
3517 	btrfs_set_header_nritems(c, mid);
3518 	ret = 0;
3519 
3520 	btrfs_mark_buffer_dirty(c);
3521 	btrfs_mark_buffer_dirty(split);
3522 
3523 	insert_ptr(trans, root, path, &disk_key, split->start,
3524 		   path->slots[level + 1] + 1, level + 1);
3525 
3526 	if (path->slots[level] >= mid) {
3527 		path->slots[level] -= mid;
3528 		btrfs_tree_unlock(c);
3529 		free_extent_buffer(c);
3530 		path->nodes[level] = split;
3531 		path->slots[level + 1] += 1;
3532 	} else {
3533 		btrfs_tree_unlock(split);
3534 		free_extent_buffer(split);
3535 	}
3536 	return ret;
3537 }
3538 
3539 /*
3540  * how many bytes are required to store the items in a leaf.  start
3541  * and nr indicate which items in the leaf to check.  This totals up the
3542  * space used both by the item structs and the item data
3543  */
3544 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3545 {
3546 	struct btrfs_item *start_item;
3547 	struct btrfs_item *end_item;
3548 	struct btrfs_map_token token;
3549 	int data_len;
3550 	int nritems = btrfs_header_nritems(l);
3551 	int end = min(nritems, start + nr) - 1;
3552 
3553 	if (!nr)
3554 		return 0;
3555 	btrfs_init_map_token(&token);
3556 	start_item = btrfs_item_nr(start);
3557 	end_item = btrfs_item_nr(end);
3558 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3559 		btrfs_token_item_size(l, start_item, &token);
3560 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3561 	data_len += sizeof(struct btrfs_item) * nr;
3562 	WARN_ON(data_len < 0);
3563 	return data_len;
3564 }
3565 
3566 /*
3567  * The space between the end of the leaf items and
3568  * the start of the leaf data.  IOW, how much room
3569  * the leaf has left for both items and data
3570  */
3571 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3572 				   struct extent_buffer *leaf)
3573 {
3574 	int nritems = btrfs_header_nritems(leaf);
3575 	int ret;
3576 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3577 	if (ret < 0) {
3578 		btrfs_crit(root->fs_info,
3579 			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3580 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3581 		       leaf_space_used(leaf, 0, nritems), nritems);
3582 	}
3583 	return ret;
3584 }
3585 
3586 /*
3587  * min slot controls the lowest index we're willing to push to the
3588  * right.  We'll push up to and including min_slot, but no lower
3589  */
3590 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3591 				      struct btrfs_root *root,
3592 				      struct btrfs_path *path,
3593 				      int data_size, int empty,
3594 				      struct extent_buffer *right,
3595 				      int free_space, u32 left_nritems,
3596 				      u32 min_slot)
3597 {
3598 	struct extent_buffer *left = path->nodes[0];
3599 	struct extent_buffer *upper = path->nodes[1];
3600 	struct btrfs_map_token token;
3601 	struct btrfs_disk_key disk_key;
3602 	int slot;
3603 	u32 i;
3604 	int push_space = 0;
3605 	int push_items = 0;
3606 	struct btrfs_item *item;
3607 	u32 nr;
3608 	u32 right_nritems;
3609 	u32 data_end;
3610 	u32 this_item_size;
3611 
3612 	btrfs_init_map_token(&token);
3613 
3614 	if (empty)
3615 		nr = 0;
3616 	else
3617 		nr = max_t(u32, 1, min_slot);
3618 
3619 	if (path->slots[0] >= left_nritems)
3620 		push_space += data_size;
3621 
3622 	slot = path->slots[1];
3623 	i = left_nritems - 1;
3624 	while (i >= nr) {
3625 		item = btrfs_item_nr(i);
3626 
3627 		if (!empty && push_items > 0) {
3628 			if (path->slots[0] > i)
3629 				break;
3630 			if (path->slots[0] == i) {
3631 				int space = btrfs_leaf_free_space(root, left);
3632 				if (space + push_space * 2 > free_space)
3633 					break;
3634 			}
3635 		}
3636 
3637 		if (path->slots[0] == i)
3638 			push_space += data_size;
3639 
3640 		this_item_size = btrfs_item_size(left, item);
3641 		if (this_item_size + sizeof(*item) + push_space > free_space)
3642 			break;
3643 
3644 		push_items++;
3645 		push_space += this_item_size + sizeof(*item);
3646 		if (i == 0)
3647 			break;
3648 		i--;
3649 	}
3650 
3651 	if (push_items == 0)
3652 		goto out_unlock;
3653 
3654 	WARN_ON(!empty && push_items == left_nritems);
3655 
3656 	/* push left to right */
3657 	right_nritems = btrfs_header_nritems(right);
3658 
3659 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3660 	push_space -= leaf_data_end(root, left);
3661 
3662 	/* make room in the right data area */
3663 	data_end = leaf_data_end(root, right);
3664 	memmove_extent_buffer(right,
3665 			      btrfs_leaf_data(right) + data_end - push_space,
3666 			      btrfs_leaf_data(right) + data_end,
3667 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
3668 
3669 	/* copy from the left data area */
3670 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3671 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3672 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
3673 		     push_space);
3674 
3675 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3676 			      btrfs_item_nr_offset(0),
3677 			      right_nritems * sizeof(struct btrfs_item));
3678 
3679 	/* copy the items from left to right */
3680 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3681 		   btrfs_item_nr_offset(left_nritems - push_items),
3682 		   push_items * sizeof(struct btrfs_item));
3683 
3684 	/* update the item pointers */
3685 	right_nritems += push_items;
3686 	btrfs_set_header_nritems(right, right_nritems);
3687 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3688 	for (i = 0; i < right_nritems; i++) {
3689 		item = btrfs_item_nr(i);
3690 		push_space -= btrfs_token_item_size(right, item, &token);
3691 		btrfs_set_token_item_offset(right, item, push_space, &token);
3692 	}
3693 
3694 	left_nritems -= push_items;
3695 	btrfs_set_header_nritems(left, left_nritems);
3696 
3697 	if (left_nritems)
3698 		btrfs_mark_buffer_dirty(left);
3699 	else
3700 		clean_tree_block(trans, root->fs_info, left);
3701 
3702 	btrfs_mark_buffer_dirty(right);
3703 
3704 	btrfs_item_key(right, &disk_key, 0);
3705 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3706 	btrfs_mark_buffer_dirty(upper);
3707 
3708 	/* then fixup the leaf pointer in the path */
3709 	if (path->slots[0] >= left_nritems) {
3710 		path->slots[0] -= left_nritems;
3711 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3712 			clean_tree_block(trans, root->fs_info, path->nodes[0]);
3713 		btrfs_tree_unlock(path->nodes[0]);
3714 		free_extent_buffer(path->nodes[0]);
3715 		path->nodes[0] = right;
3716 		path->slots[1] += 1;
3717 	} else {
3718 		btrfs_tree_unlock(right);
3719 		free_extent_buffer(right);
3720 	}
3721 	return 0;
3722 
3723 out_unlock:
3724 	btrfs_tree_unlock(right);
3725 	free_extent_buffer(right);
3726 	return 1;
3727 }
3728 
3729 /*
3730  * push some data in the path leaf to the right, trying to free up at
3731  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3732  *
3733  * returns 1 if the push failed because the other node didn't have enough
3734  * room, 0 if everything worked out and < 0 if there were major errors.
3735  *
3736  * this will push starting from min_slot to the end of the leaf.  It won't
3737  * push any slot lower than min_slot
3738  */
3739 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3740 			   *root, struct btrfs_path *path,
3741 			   int min_data_size, int data_size,
3742 			   int empty, u32 min_slot)
3743 {
3744 	struct extent_buffer *left = path->nodes[0];
3745 	struct extent_buffer *right;
3746 	struct extent_buffer *upper;
3747 	int slot;
3748 	int free_space;
3749 	u32 left_nritems;
3750 	int ret;
3751 
3752 	if (!path->nodes[1])
3753 		return 1;
3754 
3755 	slot = path->slots[1];
3756 	upper = path->nodes[1];
3757 	if (slot >= btrfs_header_nritems(upper) - 1)
3758 		return 1;
3759 
3760 	btrfs_assert_tree_locked(path->nodes[1]);
3761 
3762 	right = read_node_slot(root, upper, slot + 1);
3763 	if (right == NULL)
3764 		return 1;
3765 
3766 	btrfs_tree_lock(right);
3767 	btrfs_set_lock_blocking(right);
3768 
3769 	free_space = btrfs_leaf_free_space(root, right);
3770 	if (free_space < data_size)
3771 		goto out_unlock;
3772 
3773 	/* cow and double check */
3774 	ret = btrfs_cow_block(trans, root, right, upper,
3775 			      slot + 1, &right);
3776 	if (ret)
3777 		goto out_unlock;
3778 
3779 	free_space = btrfs_leaf_free_space(root, right);
3780 	if (free_space < data_size)
3781 		goto out_unlock;
3782 
3783 	left_nritems = btrfs_header_nritems(left);
3784 	if (left_nritems == 0)
3785 		goto out_unlock;
3786 
3787 	if (path->slots[0] == left_nritems && !empty) {
3788 		/* Key greater than all keys in the leaf, right neighbor has
3789 		 * enough room for it and we're not emptying our leaf to delete
3790 		 * it, therefore use right neighbor to insert the new item and
3791 		 * no need to touch/dirty our left leaft. */
3792 		btrfs_tree_unlock(left);
3793 		free_extent_buffer(left);
3794 		path->nodes[0] = right;
3795 		path->slots[0] = 0;
3796 		path->slots[1]++;
3797 		return 0;
3798 	}
3799 
3800 	return __push_leaf_right(trans, root, path, min_data_size, empty,
3801 				right, free_space, left_nritems, min_slot);
3802 out_unlock:
3803 	btrfs_tree_unlock(right);
3804 	free_extent_buffer(right);
3805 	return 1;
3806 }
3807 
3808 /*
3809  * push some data in the path leaf to the left, trying to free up at
3810  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3811  *
3812  * max_slot can put a limit on how far into the leaf we'll push items.  The
3813  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3814  * items
3815  */
3816 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3817 				     struct btrfs_root *root,
3818 				     struct btrfs_path *path, int data_size,
3819 				     int empty, struct extent_buffer *left,
3820 				     int free_space, u32 right_nritems,
3821 				     u32 max_slot)
3822 {
3823 	struct btrfs_disk_key disk_key;
3824 	struct extent_buffer *right = path->nodes[0];
3825 	int i;
3826 	int push_space = 0;
3827 	int push_items = 0;
3828 	struct btrfs_item *item;
3829 	u32 old_left_nritems;
3830 	u32 nr;
3831 	int ret = 0;
3832 	u32 this_item_size;
3833 	u32 old_left_item_size;
3834 	struct btrfs_map_token token;
3835 
3836 	btrfs_init_map_token(&token);
3837 
3838 	if (empty)
3839 		nr = min(right_nritems, max_slot);
3840 	else
3841 		nr = min(right_nritems - 1, max_slot);
3842 
3843 	for (i = 0; i < nr; i++) {
3844 		item = btrfs_item_nr(i);
3845 
3846 		if (!empty && push_items > 0) {
3847 			if (path->slots[0] < i)
3848 				break;
3849 			if (path->slots[0] == i) {
3850 				int space = btrfs_leaf_free_space(root, right);
3851 				if (space + push_space * 2 > free_space)
3852 					break;
3853 			}
3854 		}
3855 
3856 		if (path->slots[0] == i)
3857 			push_space += data_size;
3858 
3859 		this_item_size = btrfs_item_size(right, item);
3860 		if (this_item_size + sizeof(*item) + push_space > free_space)
3861 			break;
3862 
3863 		push_items++;
3864 		push_space += this_item_size + sizeof(*item);
3865 	}
3866 
3867 	if (push_items == 0) {
3868 		ret = 1;
3869 		goto out;
3870 	}
3871 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3872 
3873 	/* push data from right to left */
3874 	copy_extent_buffer(left, right,
3875 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3876 			   btrfs_item_nr_offset(0),
3877 			   push_items * sizeof(struct btrfs_item));
3878 
3879 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
3880 		     btrfs_item_offset_nr(right, push_items - 1);
3881 
3882 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3883 		     leaf_data_end(root, left) - push_space,
3884 		     btrfs_leaf_data(right) +
3885 		     btrfs_item_offset_nr(right, push_items - 1),
3886 		     push_space);
3887 	old_left_nritems = btrfs_header_nritems(left);
3888 	BUG_ON(old_left_nritems <= 0);
3889 
3890 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3891 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3892 		u32 ioff;
3893 
3894 		item = btrfs_item_nr(i);
3895 
3896 		ioff = btrfs_token_item_offset(left, item, &token);
3897 		btrfs_set_token_item_offset(left, item,
3898 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3899 		      &token);
3900 	}
3901 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3902 
3903 	/* fixup right node */
3904 	if (push_items > right_nritems)
3905 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3906 		       right_nritems);
3907 
3908 	if (push_items < right_nritems) {
3909 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3910 						  leaf_data_end(root, right);
3911 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
3912 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3913 				      btrfs_leaf_data(right) +
3914 				      leaf_data_end(root, right), push_space);
3915 
3916 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3917 			      btrfs_item_nr_offset(push_items),
3918 			     (btrfs_header_nritems(right) - push_items) *
3919 			     sizeof(struct btrfs_item));
3920 	}
3921 	right_nritems -= push_items;
3922 	btrfs_set_header_nritems(right, right_nritems);
3923 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3924 	for (i = 0; i < right_nritems; i++) {
3925 		item = btrfs_item_nr(i);
3926 
3927 		push_space = push_space - btrfs_token_item_size(right,
3928 								item, &token);
3929 		btrfs_set_token_item_offset(right, item, push_space, &token);
3930 	}
3931 
3932 	btrfs_mark_buffer_dirty(left);
3933 	if (right_nritems)
3934 		btrfs_mark_buffer_dirty(right);
3935 	else
3936 		clean_tree_block(trans, root->fs_info, right);
3937 
3938 	btrfs_item_key(right, &disk_key, 0);
3939 	fixup_low_keys(root->fs_info, path, &disk_key, 1);
3940 
3941 	/* then fixup the leaf pointer in the path */
3942 	if (path->slots[0] < push_items) {
3943 		path->slots[0] += old_left_nritems;
3944 		btrfs_tree_unlock(path->nodes[0]);
3945 		free_extent_buffer(path->nodes[0]);
3946 		path->nodes[0] = left;
3947 		path->slots[1] -= 1;
3948 	} else {
3949 		btrfs_tree_unlock(left);
3950 		free_extent_buffer(left);
3951 		path->slots[0] -= push_items;
3952 	}
3953 	BUG_ON(path->slots[0] < 0);
3954 	return ret;
3955 out:
3956 	btrfs_tree_unlock(left);
3957 	free_extent_buffer(left);
3958 	return ret;
3959 }
3960 
3961 /*
3962  * push some data in the path leaf to the left, trying to free up at
3963  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3964  *
3965  * max_slot can put a limit on how far into the leaf we'll push items.  The
3966  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3967  * items
3968  */
3969 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3970 			  *root, struct btrfs_path *path, int min_data_size,
3971 			  int data_size, int empty, u32 max_slot)
3972 {
3973 	struct extent_buffer *right = path->nodes[0];
3974 	struct extent_buffer *left;
3975 	int slot;
3976 	int free_space;
3977 	u32 right_nritems;
3978 	int ret = 0;
3979 
3980 	slot = path->slots[1];
3981 	if (slot == 0)
3982 		return 1;
3983 	if (!path->nodes[1])
3984 		return 1;
3985 
3986 	right_nritems = btrfs_header_nritems(right);
3987 	if (right_nritems == 0)
3988 		return 1;
3989 
3990 	btrfs_assert_tree_locked(path->nodes[1]);
3991 
3992 	left = read_node_slot(root, path->nodes[1], slot - 1);
3993 	if (left == NULL)
3994 		return 1;
3995 
3996 	btrfs_tree_lock(left);
3997 	btrfs_set_lock_blocking(left);
3998 
3999 	free_space = btrfs_leaf_free_space(root, left);
4000 	if (free_space < data_size) {
4001 		ret = 1;
4002 		goto out;
4003 	}
4004 
4005 	/* cow and double check */
4006 	ret = btrfs_cow_block(trans, root, left,
4007 			      path->nodes[1], slot - 1, &left);
4008 	if (ret) {
4009 		/* we hit -ENOSPC, but it isn't fatal here */
4010 		if (ret == -ENOSPC)
4011 			ret = 1;
4012 		goto out;
4013 	}
4014 
4015 	free_space = btrfs_leaf_free_space(root, left);
4016 	if (free_space < data_size) {
4017 		ret = 1;
4018 		goto out;
4019 	}
4020 
4021 	return __push_leaf_left(trans, root, path, min_data_size,
4022 			       empty, left, free_space, right_nritems,
4023 			       max_slot);
4024 out:
4025 	btrfs_tree_unlock(left);
4026 	free_extent_buffer(left);
4027 	return ret;
4028 }
4029 
4030 /*
4031  * split the path's leaf in two, making sure there is at least data_size
4032  * available for the resulting leaf level of the path.
4033  */
4034 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4035 				    struct btrfs_root *root,
4036 				    struct btrfs_path *path,
4037 				    struct extent_buffer *l,
4038 				    struct extent_buffer *right,
4039 				    int slot, int mid, int nritems)
4040 {
4041 	int data_copy_size;
4042 	int rt_data_off;
4043 	int i;
4044 	struct btrfs_disk_key disk_key;
4045 	struct btrfs_map_token token;
4046 
4047 	btrfs_init_map_token(&token);
4048 
4049 	nritems = nritems - mid;
4050 	btrfs_set_header_nritems(right, nritems);
4051 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4052 
4053 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4054 			   btrfs_item_nr_offset(mid),
4055 			   nritems * sizeof(struct btrfs_item));
4056 
4057 	copy_extent_buffer(right, l,
4058 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4059 		     data_copy_size, btrfs_leaf_data(l) +
4060 		     leaf_data_end(root, l), data_copy_size);
4061 
4062 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4063 		      btrfs_item_end_nr(l, mid);
4064 
4065 	for (i = 0; i < nritems; i++) {
4066 		struct btrfs_item *item = btrfs_item_nr(i);
4067 		u32 ioff;
4068 
4069 		ioff = btrfs_token_item_offset(right, item, &token);
4070 		btrfs_set_token_item_offset(right, item,
4071 					    ioff + rt_data_off, &token);
4072 	}
4073 
4074 	btrfs_set_header_nritems(l, mid);
4075 	btrfs_item_key(right, &disk_key, 0);
4076 	insert_ptr(trans, root, path, &disk_key, right->start,
4077 		   path->slots[1] + 1, 1);
4078 
4079 	btrfs_mark_buffer_dirty(right);
4080 	btrfs_mark_buffer_dirty(l);
4081 	BUG_ON(path->slots[0] != slot);
4082 
4083 	if (mid <= slot) {
4084 		btrfs_tree_unlock(path->nodes[0]);
4085 		free_extent_buffer(path->nodes[0]);
4086 		path->nodes[0] = right;
4087 		path->slots[0] -= mid;
4088 		path->slots[1] += 1;
4089 	} else {
4090 		btrfs_tree_unlock(right);
4091 		free_extent_buffer(right);
4092 	}
4093 
4094 	BUG_ON(path->slots[0] < 0);
4095 }
4096 
4097 /*
4098  * double splits happen when we need to insert a big item in the middle
4099  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4100  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4101  *          A                 B                 C
4102  *
4103  * We avoid this by trying to push the items on either side of our target
4104  * into the adjacent leaves.  If all goes well we can avoid the double split
4105  * completely.
4106  */
4107 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4108 					  struct btrfs_root *root,
4109 					  struct btrfs_path *path,
4110 					  int data_size)
4111 {
4112 	int ret;
4113 	int progress = 0;
4114 	int slot;
4115 	u32 nritems;
4116 	int space_needed = data_size;
4117 
4118 	slot = path->slots[0];
4119 	if (slot < btrfs_header_nritems(path->nodes[0]))
4120 		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4121 
4122 	/*
4123 	 * try to push all the items after our slot into the
4124 	 * right leaf
4125 	 */
4126 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4127 	if (ret < 0)
4128 		return ret;
4129 
4130 	if (ret == 0)
4131 		progress++;
4132 
4133 	nritems = btrfs_header_nritems(path->nodes[0]);
4134 	/*
4135 	 * our goal is to get our slot at the start or end of a leaf.  If
4136 	 * we've done so we're done
4137 	 */
4138 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4139 		return 0;
4140 
4141 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4142 		return 0;
4143 
4144 	/* try to push all the items before our slot into the next leaf */
4145 	slot = path->slots[0];
4146 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4147 	if (ret < 0)
4148 		return ret;
4149 
4150 	if (ret == 0)
4151 		progress++;
4152 
4153 	if (progress)
4154 		return 0;
4155 	return 1;
4156 }
4157 
4158 /*
4159  * split the path's leaf in two, making sure there is at least data_size
4160  * available for the resulting leaf level of the path.
4161  *
4162  * returns 0 if all went well and < 0 on failure.
4163  */
4164 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4165 			       struct btrfs_root *root,
4166 			       struct btrfs_key *ins_key,
4167 			       struct btrfs_path *path, int data_size,
4168 			       int extend)
4169 {
4170 	struct btrfs_disk_key disk_key;
4171 	struct extent_buffer *l;
4172 	u32 nritems;
4173 	int mid;
4174 	int slot;
4175 	struct extent_buffer *right;
4176 	struct btrfs_fs_info *fs_info = root->fs_info;
4177 	int ret = 0;
4178 	int wret;
4179 	int split;
4180 	int num_doubles = 0;
4181 	int tried_avoid_double = 0;
4182 
4183 	l = path->nodes[0];
4184 	slot = path->slots[0];
4185 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4186 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4187 		return -EOVERFLOW;
4188 
4189 	/* first try to make some room by pushing left and right */
4190 	if (data_size && path->nodes[1]) {
4191 		int space_needed = data_size;
4192 
4193 		if (slot < btrfs_header_nritems(l))
4194 			space_needed -= btrfs_leaf_free_space(root, l);
4195 
4196 		wret = push_leaf_right(trans, root, path, space_needed,
4197 				       space_needed, 0, 0);
4198 		if (wret < 0)
4199 			return wret;
4200 		if (wret) {
4201 			wret = push_leaf_left(trans, root, path, space_needed,
4202 					      space_needed, 0, (u32)-1);
4203 			if (wret < 0)
4204 				return wret;
4205 		}
4206 		l = path->nodes[0];
4207 
4208 		/* did the pushes work? */
4209 		if (btrfs_leaf_free_space(root, l) >= data_size)
4210 			return 0;
4211 	}
4212 
4213 	if (!path->nodes[1]) {
4214 		ret = insert_new_root(trans, root, path, 1);
4215 		if (ret)
4216 			return ret;
4217 	}
4218 again:
4219 	split = 1;
4220 	l = path->nodes[0];
4221 	slot = path->slots[0];
4222 	nritems = btrfs_header_nritems(l);
4223 	mid = (nritems + 1) / 2;
4224 
4225 	if (mid <= slot) {
4226 		if (nritems == 1 ||
4227 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4228 			BTRFS_LEAF_DATA_SIZE(root)) {
4229 			if (slot >= nritems) {
4230 				split = 0;
4231 			} else {
4232 				mid = slot;
4233 				if (mid != nritems &&
4234 				    leaf_space_used(l, mid, nritems - mid) +
4235 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4236 					if (data_size && !tried_avoid_double)
4237 						goto push_for_double;
4238 					split = 2;
4239 				}
4240 			}
4241 		}
4242 	} else {
4243 		if (leaf_space_used(l, 0, mid) + data_size >
4244 			BTRFS_LEAF_DATA_SIZE(root)) {
4245 			if (!extend && data_size && slot == 0) {
4246 				split = 0;
4247 			} else if ((extend || !data_size) && slot == 0) {
4248 				mid = 1;
4249 			} else {
4250 				mid = slot;
4251 				if (mid != nritems &&
4252 				    leaf_space_used(l, mid, nritems - mid) +
4253 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4254 					if (data_size && !tried_avoid_double)
4255 						goto push_for_double;
4256 					split = 2;
4257 				}
4258 			}
4259 		}
4260 	}
4261 
4262 	if (split == 0)
4263 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4264 	else
4265 		btrfs_item_key(l, &disk_key, mid);
4266 
4267 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4268 			&disk_key, 0, l->start, 0);
4269 	if (IS_ERR(right))
4270 		return PTR_ERR(right);
4271 
4272 	root_add_used(root, root->nodesize);
4273 
4274 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4275 	btrfs_set_header_bytenr(right, right->start);
4276 	btrfs_set_header_generation(right, trans->transid);
4277 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4278 	btrfs_set_header_owner(right, root->root_key.objectid);
4279 	btrfs_set_header_level(right, 0);
4280 	write_extent_buffer(right, fs_info->fsid,
4281 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
4282 
4283 	write_extent_buffer(right, fs_info->chunk_tree_uuid,
4284 			    btrfs_header_chunk_tree_uuid(right),
4285 			    BTRFS_UUID_SIZE);
4286 
4287 	if (split == 0) {
4288 		if (mid <= slot) {
4289 			btrfs_set_header_nritems(right, 0);
4290 			insert_ptr(trans, root, path, &disk_key, right->start,
4291 				   path->slots[1] + 1, 1);
4292 			btrfs_tree_unlock(path->nodes[0]);
4293 			free_extent_buffer(path->nodes[0]);
4294 			path->nodes[0] = right;
4295 			path->slots[0] = 0;
4296 			path->slots[1] += 1;
4297 		} else {
4298 			btrfs_set_header_nritems(right, 0);
4299 			insert_ptr(trans, root, path, &disk_key, right->start,
4300 					  path->slots[1], 1);
4301 			btrfs_tree_unlock(path->nodes[0]);
4302 			free_extent_buffer(path->nodes[0]);
4303 			path->nodes[0] = right;
4304 			path->slots[0] = 0;
4305 			if (path->slots[1] == 0)
4306 				fixup_low_keys(fs_info, path, &disk_key, 1);
4307 		}
4308 		btrfs_mark_buffer_dirty(right);
4309 		return ret;
4310 	}
4311 
4312 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4313 
4314 	if (split == 2) {
4315 		BUG_ON(num_doubles != 0);
4316 		num_doubles++;
4317 		goto again;
4318 	}
4319 
4320 	return 0;
4321 
4322 push_for_double:
4323 	push_for_double_split(trans, root, path, data_size);
4324 	tried_avoid_double = 1;
4325 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4326 		return 0;
4327 	goto again;
4328 }
4329 
4330 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4331 					 struct btrfs_root *root,
4332 					 struct btrfs_path *path, int ins_len)
4333 {
4334 	struct btrfs_key key;
4335 	struct extent_buffer *leaf;
4336 	struct btrfs_file_extent_item *fi;
4337 	u64 extent_len = 0;
4338 	u32 item_size;
4339 	int ret;
4340 
4341 	leaf = path->nodes[0];
4342 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4343 
4344 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4345 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4346 
4347 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4348 		return 0;
4349 
4350 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4351 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4352 		fi = btrfs_item_ptr(leaf, path->slots[0],
4353 				    struct btrfs_file_extent_item);
4354 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4355 	}
4356 	btrfs_release_path(path);
4357 
4358 	path->keep_locks = 1;
4359 	path->search_for_split = 1;
4360 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4361 	path->search_for_split = 0;
4362 	if (ret > 0)
4363 		ret = -EAGAIN;
4364 	if (ret < 0)
4365 		goto err;
4366 
4367 	ret = -EAGAIN;
4368 	leaf = path->nodes[0];
4369 	/* if our item isn't there, return now */
4370 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4371 		goto err;
4372 
4373 	/* the leaf has  changed, it now has room.  return now */
4374 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4375 		goto err;
4376 
4377 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4378 		fi = btrfs_item_ptr(leaf, path->slots[0],
4379 				    struct btrfs_file_extent_item);
4380 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4381 			goto err;
4382 	}
4383 
4384 	btrfs_set_path_blocking(path);
4385 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4386 	if (ret)
4387 		goto err;
4388 
4389 	path->keep_locks = 0;
4390 	btrfs_unlock_up_safe(path, 1);
4391 	return 0;
4392 err:
4393 	path->keep_locks = 0;
4394 	return ret;
4395 }
4396 
4397 static noinline int split_item(struct btrfs_trans_handle *trans,
4398 			       struct btrfs_root *root,
4399 			       struct btrfs_path *path,
4400 			       struct btrfs_key *new_key,
4401 			       unsigned long split_offset)
4402 {
4403 	struct extent_buffer *leaf;
4404 	struct btrfs_item *item;
4405 	struct btrfs_item *new_item;
4406 	int slot;
4407 	char *buf;
4408 	u32 nritems;
4409 	u32 item_size;
4410 	u32 orig_offset;
4411 	struct btrfs_disk_key disk_key;
4412 
4413 	leaf = path->nodes[0];
4414 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4415 
4416 	btrfs_set_path_blocking(path);
4417 
4418 	item = btrfs_item_nr(path->slots[0]);
4419 	orig_offset = btrfs_item_offset(leaf, item);
4420 	item_size = btrfs_item_size(leaf, item);
4421 
4422 	buf = kmalloc(item_size, GFP_NOFS);
4423 	if (!buf)
4424 		return -ENOMEM;
4425 
4426 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4427 			    path->slots[0]), item_size);
4428 
4429 	slot = path->slots[0] + 1;
4430 	nritems = btrfs_header_nritems(leaf);
4431 	if (slot != nritems) {
4432 		/* shift the items */
4433 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4434 				btrfs_item_nr_offset(slot),
4435 				(nritems - slot) * sizeof(struct btrfs_item));
4436 	}
4437 
4438 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4439 	btrfs_set_item_key(leaf, &disk_key, slot);
4440 
4441 	new_item = btrfs_item_nr(slot);
4442 
4443 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4444 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4445 
4446 	btrfs_set_item_offset(leaf, item,
4447 			      orig_offset + item_size - split_offset);
4448 	btrfs_set_item_size(leaf, item, split_offset);
4449 
4450 	btrfs_set_header_nritems(leaf, nritems + 1);
4451 
4452 	/* write the data for the start of the original item */
4453 	write_extent_buffer(leaf, buf,
4454 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4455 			    split_offset);
4456 
4457 	/* write the data for the new item */
4458 	write_extent_buffer(leaf, buf + split_offset,
4459 			    btrfs_item_ptr_offset(leaf, slot),
4460 			    item_size - split_offset);
4461 	btrfs_mark_buffer_dirty(leaf);
4462 
4463 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4464 	kfree(buf);
4465 	return 0;
4466 }
4467 
4468 /*
4469  * This function splits a single item into two items,
4470  * giving 'new_key' to the new item and splitting the
4471  * old one at split_offset (from the start of the item).
4472  *
4473  * The path may be released by this operation.  After
4474  * the split, the path is pointing to the old item.  The
4475  * new item is going to be in the same node as the old one.
4476  *
4477  * Note, the item being split must be smaller enough to live alone on
4478  * a tree block with room for one extra struct btrfs_item
4479  *
4480  * This allows us to split the item in place, keeping a lock on the
4481  * leaf the entire time.
4482  */
4483 int btrfs_split_item(struct btrfs_trans_handle *trans,
4484 		     struct btrfs_root *root,
4485 		     struct btrfs_path *path,
4486 		     struct btrfs_key *new_key,
4487 		     unsigned long split_offset)
4488 {
4489 	int ret;
4490 	ret = setup_leaf_for_split(trans, root, path,
4491 				   sizeof(struct btrfs_item));
4492 	if (ret)
4493 		return ret;
4494 
4495 	ret = split_item(trans, root, path, new_key, split_offset);
4496 	return ret;
4497 }
4498 
4499 /*
4500  * This function duplicate a item, giving 'new_key' to the new item.
4501  * It guarantees both items live in the same tree leaf and the new item
4502  * is contiguous with the original item.
4503  *
4504  * This allows us to split file extent in place, keeping a lock on the
4505  * leaf the entire time.
4506  */
4507 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4508 			 struct btrfs_root *root,
4509 			 struct btrfs_path *path,
4510 			 struct btrfs_key *new_key)
4511 {
4512 	struct extent_buffer *leaf;
4513 	int ret;
4514 	u32 item_size;
4515 
4516 	leaf = path->nodes[0];
4517 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4518 	ret = setup_leaf_for_split(trans, root, path,
4519 				   item_size + sizeof(struct btrfs_item));
4520 	if (ret)
4521 		return ret;
4522 
4523 	path->slots[0]++;
4524 	setup_items_for_insert(root, path, new_key, &item_size,
4525 			       item_size, item_size +
4526 			       sizeof(struct btrfs_item), 1);
4527 	leaf = path->nodes[0];
4528 	memcpy_extent_buffer(leaf,
4529 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4530 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4531 			     item_size);
4532 	return 0;
4533 }
4534 
4535 /*
4536  * make the item pointed to by the path smaller.  new_size indicates
4537  * how small to make it, and from_end tells us if we just chop bytes
4538  * off the end of the item or if we shift the item to chop bytes off
4539  * the front.
4540  */
4541 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4542 			 u32 new_size, int from_end)
4543 {
4544 	int slot;
4545 	struct extent_buffer *leaf;
4546 	struct btrfs_item *item;
4547 	u32 nritems;
4548 	unsigned int data_end;
4549 	unsigned int old_data_start;
4550 	unsigned int old_size;
4551 	unsigned int size_diff;
4552 	int i;
4553 	struct btrfs_map_token token;
4554 
4555 	btrfs_init_map_token(&token);
4556 
4557 	leaf = path->nodes[0];
4558 	slot = path->slots[0];
4559 
4560 	old_size = btrfs_item_size_nr(leaf, slot);
4561 	if (old_size == new_size)
4562 		return;
4563 
4564 	nritems = btrfs_header_nritems(leaf);
4565 	data_end = leaf_data_end(root, leaf);
4566 
4567 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4568 
4569 	size_diff = old_size - new_size;
4570 
4571 	BUG_ON(slot < 0);
4572 	BUG_ON(slot >= nritems);
4573 
4574 	/*
4575 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4576 	 */
4577 	/* first correct the data pointers */
4578 	for (i = slot; i < nritems; i++) {
4579 		u32 ioff;
4580 		item = btrfs_item_nr(i);
4581 
4582 		ioff = btrfs_token_item_offset(leaf, item, &token);
4583 		btrfs_set_token_item_offset(leaf, item,
4584 					    ioff + size_diff, &token);
4585 	}
4586 
4587 	/* shift the data */
4588 	if (from_end) {
4589 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4590 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4591 			      data_end, old_data_start + new_size - data_end);
4592 	} else {
4593 		struct btrfs_disk_key disk_key;
4594 		u64 offset;
4595 
4596 		btrfs_item_key(leaf, &disk_key, slot);
4597 
4598 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4599 			unsigned long ptr;
4600 			struct btrfs_file_extent_item *fi;
4601 
4602 			fi = btrfs_item_ptr(leaf, slot,
4603 					    struct btrfs_file_extent_item);
4604 			fi = (struct btrfs_file_extent_item *)(
4605 			     (unsigned long)fi - size_diff);
4606 
4607 			if (btrfs_file_extent_type(leaf, fi) ==
4608 			    BTRFS_FILE_EXTENT_INLINE) {
4609 				ptr = btrfs_item_ptr_offset(leaf, slot);
4610 				memmove_extent_buffer(leaf, ptr,
4611 				      (unsigned long)fi,
4612 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4613 			}
4614 		}
4615 
4616 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4617 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4618 			      data_end, old_data_start - data_end);
4619 
4620 		offset = btrfs_disk_key_offset(&disk_key);
4621 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4622 		btrfs_set_item_key(leaf, &disk_key, slot);
4623 		if (slot == 0)
4624 			fixup_low_keys(root->fs_info, path, &disk_key, 1);
4625 	}
4626 
4627 	item = btrfs_item_nr(slot);
4628 	btrfs_set_item_size(leaf, item, new_size);
4629 	btrfs_mark_buffer_dirty(leaf);
4630 
4631 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4632 		btrfs_print_leaf(root, leaf);
4633 		BUG();
4634 	}
4635 }
4636 
4637 /*
4638  * make the item pointed to by the path bigger, data_size is the added size.
4639  */
4640 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4641 		       u32 data_size)
4642 {
4643 	int slot;
4644 	struct extent_buffer *leaf;
4645 	struct btrfs_item *item;
4646 	u32 nritems;
4647 	unsigned int data_end;
4648 	unsigned int old_data;
4649 	unsigned int old_size;
4650 	int i;
4651 	struct btrfs_map_token token;
4652 
4653 	btrfs_init_map_token(&token);
4654 
4655 	leaf = path->nodes[0];
4656 
4657 	nritems = btrfs_header_nritems(leaf);
4658 	data_end = leaf_data_end(root, leaf);
4659 
4660 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4661 		btrfs_print_leaf(root, leaf);
4662 		BUG();
4663 	}
4664 	slot = path->slots[0];
4665 	old_data = btrfs_item_end_nr(leaf, slot);
4666 
4667 	BUG_ON(slot < 0);
4668 	if (slot >= nritems) {
4669 		btrfs_print_leaf(root, leaf);
4670 		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4671 		       slot, nritems);
4672 		BUG_ON(1);
4673 	}
4674 
4675 	/*
4676 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4677 	 */
4678 	/* first correct the data pointers */
4679 	for (i = slot; i < nritems; i++) {
4680 		u32 ioff;
4681 		item = btrfs_item_nr(i);
4682 
4683 		ioff = btrfs_token_item_offset(leaf, item, &token);
4684 		btrfs_set_token_item_offset(leaf, item,
4685 					    ioff - data_size, &token);
4686 	}
4687 
4688 	/* shift the data */
4689 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4690 		      data_end - data_size, btrfs_leaf_data(leaf) +
4691 		      data_end, old_data - data_end);
4692 
4693 	data_end = old_data;
4694 	old_size = btrfs_item_size_nr(leaf, slot);
4695 	item = btrfs_item_nr(slot);
4696 	btrfs_set_item_size(leaf, item, old_size + data_size);
4697 	btrfs_mark_buffer_dirty(leaf);
4698 
4699 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4700 		btrfs_print_leaf(root, leaf);
4701 		BUG();
4702 	}
4703 }
4704 
4705 /*
4706  * this is a helper for btrfs_insert_empty_items, the main goal here is
4707  * to save stack depth by doing the bulk of the work in a function
4708  * that doesn't call btrfs_search_slot
4709  */
4710 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4711 			    struct btrfs_key *cpu_key, u32 *data_size,
4712 			    u32 total_data, u32 total_size, int nr)
4713 {
4714 	struct btrfs_item *item;
4715 	int i;
4716 	u32 nritems;
4717 	unsigned int data_end;
4718 	struct btrfs_disk_key disk_key;
4719 	struct extent_buffer *leaf;
4720 	int slot;
4721 	struct btrfs_map_token token;
4722 
4723 	if (path->slots[0] == 0) {
4724 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4725 		fixup_low_keys(root->fs_info, path, &disk_key, 1);
4726 	}
4727 	btrfs_unlock_up_safe(path, 1);
4728 
4729 	btrfs_init_map_token(&token);
4730 
4731 	leaf = path->nodes[0];
4732 	slot = path->slots[0];
4733 
4734 	nritems = btrfs_header_nritems(leaf);
4735 	data_end = leaf_data_end(root, leaf);
4736 
4737 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4738 		btrfs_print_leaf(root, leaf);
4739 		btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4740 		       total_size, btrfs_leaf_free_space(root, leaf));
4741 		BUG();
4742 	}
4743 
4744 	if (slot != nritems) {
4745 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4746 
4747 		if (old_data < data_end) {
4748 			btrfs_print_leaf(root, leaf);
4749 			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4750 			       slot, old_data, data_end);
4751 			BUG_ON(1);
4752 		}
4753 		/*
4754 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4755 		 */
4756 		/* first correct the data pointers */
4757 		for (i = slot; i < nritems; i++) {
4758 			u32 ioff;
4759 
4760 			item = btrfs_item_nr( i);
4761 			ioff = btrfs_token_item_offset(leaf, item, &token);
4762 			btrfs_set_token_item_offset(leaf, item,
4763 						    ioff - total_data, &token);
4764 		}
4765 		/* shift the items */
4766 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4767 			      btrfs_item_nr_offset(slot),
4768 			      (nritems - slot) * sizeof(struct btrfs_item));
4769 
4770 		/* shift the data */
4771 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4772 			      data_end - total_data, btrfs_leaf_data(leaf) +
4773 			      data_end, old_data - data_end);
4774 		data_end = old_data;
4775 	}
4776 
4777 	/* setup the item for the new data */
4778 	for (i = 0; i < nr; i++) {
4779 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4780 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4781 		item = btrfs_item_nr(slot + i);
4782 		btrfs_set_token_item_offset(leaf, item,
4783 					    data_end - data_size[i], &token);
4784 		data_end -= data_size[i];
4785 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4786 	}
4787 
4788 	btrfs_set_header_nritems(leaf, nritems + nr);
4789 	btrfs_mark_buffer_dirty(leaf);
4790 
4791 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4792 		btrfs_print_leaf(root, leaf);
4793 		BUG();
4794 	}
4795 }
4796 
4797 /*
4798  * Given a key and some data, insert items into the tree.
4799  * This does all the path init required, making room in the tree if needed.
4800  */
4801 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4802 			    struct btrfs_root *root,
4803 			    struct btrfs_path *path,
4804 			    struct btrfs_key *cpu_key, u32 *data_size,
4805 			    int nr)
4806 {
4807 	int ret = 0;
4808 	int slot;
4809 	int i;
4810 	u32 total_size = 0;
4811 	u32 total_data = 0;
4812 
4813 	for (i = 0; i < nr; i++)
4814 		total_data += data_size[i];
4815 
4816 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4817 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4818 	if (ret == 0)
4819 		return -EEXIST;
4820 	if (ret < 0)
4821 		return ret;
4822 
4823 	slot = path->slots[0];
4824 	BUG_ON(slot < 0);
4825 
4826 	setup_items_for_insert(root, path, cpu_key, data_size,
4827 			       total_data, total_size, nr);
4828 	return 0;
4829 }
4830 
4831 /*
4832  * Given a key and some data, insert an item into the tree.
4833  * This does all the path init required, making room in the tree if needed.
4834  */
4835 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4836 		      *root, struct btrfs_key *cpu_key, void *data, u32
4837 		      data_size)
4838 {
4839 	int ret = 0;
4840 	struct btrfs_path *path;
4841 	struct extent_buffer *leaf;
4842 	unsigned long ptr;
4843 
4844 	path = btrfs_alloc_path();
4845 	if (!path)
4846 		return -ENOMEM;
4847 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4848 	if (!ret) {
4849 		leaf = path->nodes[0];
4850 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4851 		write_extent_buffer(leaf, data, ptr, data_size);
4852 		btrfs_mark_buffer_dirty(leaf);
4853 	}
4854 	btrfs_free_path(path);
4855 	return ret;
4856 }
4857 
4858 /*
4859  * delete the pointer from a given node.
4860  *
4861  * the tree should have been previously balanced so the deletion does not
4862  * empty a node.
4863  */
4864 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4865 		    int level, int slot)
4866 {
4867 	struct extent_buffer *parent = path->nodes[level];
4868 	u32 nritems;
4869 	int ret;
4870 
4871 	nritems = btrfs_header_nritems(parent);
4872 	if (slot != nritems - 1) {
4873 		if (level)
4874 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4875 					     slot + 1, nritems - slot - 1);
4876 		memmove_extent_buffer(parent,
4877 			      btrfs_node_key_ptr_offset(slot),
4878 			      btrfs_node_key_ptr_offset(slot + 1),
4879 			      sizeof(struct btrfs_key_ptr) *
4880 			      (nritems - slot - 1));
4881 	} else if (level) {
4882 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4883 					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
4884 		BUG_ON(ret < 0);
4885 	}
4886 
4887 	nritems--;
4888 	btrfs_set_header_nritems(parent, nritems);
4889 	if (nritems == 0 && parent == root->node) {
4890 		BUG_ON(btrfs_header_level(root->node) != 1);
4891 		/* just turn the root into a leaf and break */
4892 		btrfs_set_header_level(root->node, 0);
4893 	} else if (slot == 0) {
4894 		struct btrfs_disk_key disk_key;
4895 
4896 		btrfs_node_key(parent, &disk_key, 0);
4897 		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
4898 	}
4899 	btrfs_mark_buffer_dirty(parent);
4900 }
4901 
4902 /*
4903  * a helper function to delete the leaf pointed to by path->slots[1] and
4904  * path->nodes[1].
4905  *
4906  * This deletes the pointer in path->nodes[1] and frees the leaf
4907  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4908  *
4909  * The path must have already been setup for deleting the leaf, including
4910  * all the proper balancing.  path->nodes[1] must be locked.
4911  */
4912 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4913 				    struct btrfs_root *root,
4914 				    struct btrfs_path *path,
4915 				    struct extent_buffer *leaf)
4916 {
4917 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4918 	del_ptr(root, path, 1, path->slots[1]);
4919 
4920 	/*
4921 	 * btrfs_free_extent is expensive, we want to make sure we
4922 	 * aren't holding any locks when we call it
4923 	 */
4924 	btrfs_unlock_up_safe(path, 0);
4925 
4926 	root_sub_used(root, leaf->len);
4927 
4928 	extent_buffer_get(leaf);
4929 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4930 	free_extent_buffer_stale(leaf);
4931 }
4932 /*
4933  * delete the item at the leaf level in path.  If that empties
4934  * the leaf, remove it from the tree
4935  */
4936 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4937 		    struct btrfs_path *path, int slot, int nr)
4938 {
4939 	struct extent_buffer *leaf;
4940 	struct btrfs_item *item;
4941 	int last_off;
4942 	int dsize = 0;
4943 	int ret = 0;
4944 	int wret;
4945 	int i;
4946 	u32 nritems;
4947 	struct btrfs_map_token token;
4948 
4949 	btrfs_init_map_token(&token);
4950 
4951 	leaf = path->nodes[0];
4952 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4953 
4954 	for (i = 0; i < nr; i++)
4955 		dsize += btrfs_item_size_nr(leaf, slot + i);
4956 
4957 	nritems = btrfs_header_nritems(leaf);
4958 
4959 	if (slot + nr != nritems) {
4960 		int data_end = leaf_data_end(root, leaf);
4961 
4962 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4963 			      data_end + dsize,
4964 			      btrfs_leaf_data(leaf) + data_end,
4965 			      last_off - data_end);
4966 
4967 		for (i = slot + nr; i < nritems; i++) {
4968 			u32 ioff;
4969 
4970 			item = btrfs_item_nr(i);
4971 			ioff = btrfs_token_item_offset(leaf, item, &token);
4972 			btrfs_set_token_item_offset(leaf, item,
4973 						    ioff + dsize, &token);
4974 		}
4975 
4976 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4977 			      btrfs_item_nr_offset(slot + nr),
4978 			      sizeof(struct btrfs_item) *
4979 			      (nritems - slot - nr));
4980 	}
4981 	btrfs_set_header_nritems(leaf, nritems - nr);
4982 	nritems -= nr;
4983 
4984 	/* delete the leaf if we've emptied it */
4985 	if (nritems == 0) {
4986 		if (leaf == root->node) {
4987 			btrfs_set_header_level(leaf, 0);
4988 		} else {
4989 			btrfs_set_path_blocking(path);
4990 			clean_tree_block(trans, root->fs_info, leaf);
4991 			btrfs_del_leaf(trans, root, path, leaf);
4992 		}
4993 	} else {
4994 		int used = leaf_space_used(leaf, 0, nritems);
4995 		if (slot == 0) {
4996 			struct btrfs_disk_key disk_key;
4997 
4998 			btrfs_item_key(leaf, &disk_key, 0);
4999 			fixup_low_keys(root->fs_info, path, &disk_key, 1);
5000 		}
5001 
5002 		/* delete the leaf if it is mostly empty */
5003 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5004 			/* push_leaf_left fixes the path.
5005 			 * make sure the path still points to our leaf
5006 			 * for possible call to del_ptr below
5007 			 */
5008 			slot = path->slots[1];
5009 			extent_buffer_get(leaf);
5010 
5011 			btrfs_set_path_blocking(path);
5012 			wret = push_leaf_left(trans, root, path, 1, 1,
5013 					      1, (u32)-1);
5014 			if (wret < 0 && wret != -ENOSPC)
5015 				ret = wret;
5016 
5017 			if (path->nodes[0] == leaf &&
5018 			    btrfs_header_nritems(leaf)) {
5019 				wret = push_leaf_right(trans, root, path, 1,
5020 						       1, 1, 0);
5021 				if (wret < 0 && wret != -ENOSPC)
5022 					ret = wret;
5023 			}
5024 
5025 			if (btrfs_header_nritems(leaf) == 0) {
5026 				path->slots[1] = slot;
5027 				btrfs_del_leaf(trans, root, path, leaf);
5028 				free_extent_buffer(leaf);
5029 				ret = 0;
5030 			} else {
5031 				/* if we're still in the path, make sure
5032 				 * we're dirty.  Otherwise, one of the
5033 				 * push_leaf functions must have already
5034 				 * dirtied this buffer
5035 				 */
5036 				if (path->nodes[0] == leaf)
5037 					btrfs_mark_buffer_dirty(leaf);
5038 				free_extent_buffer(leaf);
5039 			}
5040 		} else {
5041 			btrfs_mark_buffer_dirty(leaf);
5042 		}
5043 	}
5044 	return ret;
5045 }
5046 
5047 /*
5048  * search the tree again to find a leaf with lesser keys
5049  * returns 0 if it found something or 1 if there are no lesser leaves.
5050  * returns < 0 on io errors.
5051  *
5052  * This may release the path, and so you may lose any locks held at the
5053  * time you call it.
5054  */
5055 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5056 {
5057 	struct btrfs_key key;
5058 	struct btrfs_disk_key found_key;
5059 	int ret;
5060 
5061 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5062 
5063 	if (key.offset > 0) {
5064 		key.offset--;
5065 	} else if (key.type > 0) {
5066 		key.type--;
5067 		key.offset = (u64)-1;
5068 	} else if (key.objectid > 0) {
5069 		key.objectid--;
5070 		key.type = (u8)-1;
5071 		key.offset = (u64)-1;
5072 	} else {
5073 		return 1;
5074 	}
5075 
5076 	btrfs_release_path(path);
5077 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5078 	if (ret < 0)
5079 		return ret;
5080 	btrfs_item_key(path->nodes[0], &found_key, 0);
5081 	ret = comp_keys(&found_key, &key);
5082 	/*
5083 	 * We might have had an item with the previous key in the tree right
5084 	 * before we released our path. And after we released our path, that
5085 	 * item might have been pushed to the first slot (0) of the leaf we
5086 	 * were holding due to a tree balance. Alternatively, an item with the
5087 	 * previous key can exist as the only element of a leaf (big fat item).
5088 	 * Therefore account for these 2 cases, so that our callers (like
5089 	 * btrfs_previous_item) don't miss an existing item with a key matching
5090 	 * the previous key we computed above.
5091 	 */
5092 	if (ret <= 0)
5093 		return 0;
5094 	return 1;
5095 }
5096 
5097 /*
5098  * A helper function to walk down the tree starting at min_key, and looking
5099  * for nodes or leaves that are have a minimum transaction id.
5100  * This is used by the btree defrag code, and tree logging
5101  *
5102  * This does not cow, but it does stuff the starting key it finds back
5103  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5104  * key and get a writable path.
5105  *
5106  * This does lock as it descends, and path->keep_locks should be set
5107  * to 1 by the caller.
5108  *
5109  * This honors path->lowest_level to prevent descent past a given level
5110  * of the tree.
5111  *
5112  * min_trans indicates the oldest transaction that you are interested
5113  * in walking through.  Any nodes or leaves older than min_trans are
5114  * skipped over (without reading them).
5115  *
5116  * returns zero if something useful was found, < 0 on error and 1 if there
5117  * was nothing in the tree that matched the search criteria.
5118  */
5119 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5120 			 struct btrfs_path *path,
5121 			 u64 min_trans)
5122 {
5123 	struct extent_buffer *cur;
5124 	struct btrfs_key found_key;
5125 	int slot;
5126 	int sret;
5127 	u32 nritems;
5128 	int level;
5129 	int ret = 1;
5130 	int keep_locks = path->keep_locks;
5131 
5132 	path->keep_locks = 1;
5133 again:
5134 	cur = btrfs_read_lock_root_node(root);
5135 	level = btrfs_header_level(cur);
5136 	WARN_ON(path->nodes[level]);
5137 	path->nodes[level] = cur;
5138 	path->locks[level] = BTRFS_READ_LOCK;
5139 
5140 	if (btrfs_header_generation(cur) < min_trans) {
5141 		ret = 1;
5142 		goto out;
5143 	}
5144 	while (1) {
5145 		nritems = btrfs_header_nritems(cur);
5146 		level = btrfs_header_level(cur);
5147 		sret = bin_search(cur, min_key, level, &slot);
5148 
5149 		/* at the lowest level, we're done, setup the path and exit */
5150 		if (level == path->lowest_level) {
5151 			if (slot >= nritems)
5152 				goto find_next_key;
5153 			ret = 0;
5154 			path->slots[level] = slot;
5155 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5156 			goto out;
5157 		}
5158 		if (sret && slot > 0)
5159 			slot--;
5160 		/*
5161 		 * check this node pointer against the min_trans parameters.
5162 		 * If it is too old, old, skip to the next one.
5163 		 */
5164 		while (slot < nritems) {
5165 			u64 gen;
5166 
5167 			gen = btrfs_node_ptr_generation(cur, slot);
5168 			if (gen < min_trans) {
5169 				slot++;
5170 				continue;
5171 			}
5172 			break;
5173 		}
5174 find_next_key:
5175 		/*
5176 		 * we didn't find a candidate key in this node, walk forward
5177 		 * and find another one
5178 		 */
5179 		if (slot >= nritems) {
5180 			path->slots[level] = slot;
5181 			btrfs_set_path_blocking(path);
5182 			sret = btrfs_find_next_key(root, path, min_key, level,
5183 						  min_trans);
5184 			if (sret == 0) {
5185 				btrfs_release_path(path);
5186 				goto again;
5187 			} else {
5188 				goto out;
5189 			}
5190 		}
5191 		/* save our key for returning back */
5192 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5193 		path->slots[level] = slot;
5194 		if (level == path->lowest_level) {
5195 			ret = 0;
5196 			goto out;
5197 		}
5198 		btrfs_set_path_blocking(path);
5199 		cur = read_node_slot(root, cur, slot);
5200 		BUG_ON(!cur); /* -ENOMEM */
5201 
5202 		btrfs_tree_read_lock(cur);
5203 
5204 		path->locks[level - 1] = BTRFS_READ_LOCK;
5205 		path->nodes[level - 1] = cur;
5206 		unlock_up(path, level, 1, 0, NULL);
5207 		btrfs_clear_path_blocking(path, NULL, 0);
5208 	}
5209 out:
5210 	path->keep_locks = keep_locks;
5211 	if (ret == 0) {
5212 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5213 		btrfs_set_path_blocking(path);
5214 		memcpy(min_key, &found_key, sizeof(found_key));
5215 	}
5216 	return ret;
5217 }
5218 
5219 static void tree_move_down(struct btrfs_root *root,
5220 			   struct btrfs_path *path,
5221 			   int *level, int root_level)
5222 {
5223 	BUG_ON(*level == 0);
5224 	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5225 					path->slots[*level]);
5226 	path->slots[*level - 1] = 0;
5227 	(*level)--;
5228 }
5229 
5230 static int tree_move_next_or_upnext(struct btrfs_root *root,
5231 				    struct btrfs_path *path,
5232 				    int *level, int root_level)
5233 {
5234 	int ret = 0;
5235 	int nritems;
5236 	nritems = btrfs_header_nritems(path->nodes[*level]);
5237 
5238 	path->slots[*level]++;
5239 
5240 	while (path->slots[*level] >= nritems) {
5241 		if (*level == root_level)
5242 			return -1;
5243 
5244 		/* move upnext */
5245 		path->slots[*level] = 0;
5246 		free_extent_buffer(path->nodes[*level]);
5247 		path->nodes[*level] = NULL;
5248 		(*level)++;
5249 		path->slots[*level]++;
5250 
5251 		nritems = btrfs_header_nritems(path->nodes[*level]);
5252 		ret = 1;
5253 	}
5254 	return ret;
5255 }
5256 
5257 /*
5258  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5259  * or down.
5260  */
5261 static int tree_advance(struct btrfs_root *root,
5262 			struct btrfs_path *path,
5263 			int *level, int root_level,
5264 			int allow_down,
5265 			struct btrfs_key *key)
5266 {
5267 	int ret;
5268 
5269 	if (*level == 0 || !allow_down) {
5270 		ret = tree_move_next_or_upnext(root, path, level, root_level);
5271 	} else {
5272 		tree_move_down(root, path, level, root_level);
5273 		ret = 0;
5274 	}
5275 	if (ret >= 0) {
5276 		if (*level == 0)
5277 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5278 					path->slots[*level]);
5279 		else
5280 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5281 					path->slots[*level]);
5282 	}
5283 	return ret;
5284 }
5285 
5286 static int tree_compare_item(struct btrfs_root *left_root,
5287 			     struct btrfs_path *left_path,
5288 			     struct btrfs_path *right_path,
5289 			     char *tmp_buf)
5290 {
5291 	int cmp;
5292 	int len1, len2;
5293 	unsigned long off1, off2;
5294 
5295 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5296 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5297 	if (len1 != len2)
5298 		return 1;
5299 
5300 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5301 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5302 				right_path->slots[0]);
5303 
5304 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5305 
5306 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5307 	if (cmp)
5308 		return 1;
5309 	return 0;
5310 }
5311 
5312 #define ADVANCE 1
5313 #define ADVANCE_ONLY_NEXT -1
5314 
5315 /*
5316  * This function compares two trees and calls the provided callback for
5317  * every changed/new/deleted item it finds.
5318  * If shared tree blocks are encountered, whole subtrees are skipped, making
5319  * the compare pretty fast on snapshotted subvolumes.
5320  *
5321  * This currently works on commit roots only. As commit roots are read only,
5322  * we don't do any locking. The commit roots are protected with transactions.
5323  * Transactions are ended and rejoined when a commit is tried in between.
5324  *
5325  * This function checks for modifications done to the trees while comparing.
5326  * If it detects a change, it aborts immediately.
5327  */
5328 int btrfs_compare_trees(struct btrfs_root *left_root,
5329 			struct btrfs_root *right_root,
5330 			btrfs_changed_cb_t changed_cb, void *ctx)
5331 {
5332 	int ret;
5333 	int cmp;
5334 	struct btrfs_path *left_path = NULL;
5335 	struct btrfs_path *right_path = NULL;
5336 	struct btrfs_key left_key;
5337 	struct btrfs_key right_key;
5338 	char *tmp_buf = NULL;
5339 	int left_root_level;
5340 	int right_root_level;
5341 	int left_level;
5342 	int right_level;
5343 	int left_end_reached;
5344 	int right_end_reached;
5345 	int advance_left;
5346 	int advance_right;
5347 	u64 left_blockptr;
5348 	u64 right_blockptr;
5349 	u64 left_gen;
5350 	u64 right_gen;
5351 
5352 	left_path = btrfs_alloc_path();
5353 	if (!left_path) {
5354 		ret = -ENOMEM;
5355 		goto out;
5356 	}
5357 	right_path = btrfs_alloc_path();
5358 	if (!right_path) {
5359 		ret = -ENOMEM;
5360 		goto out;
5361 	}
5362 
5363 	tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5364 	if (!tmp_buf) {
5365 		ret = -ENOMEM;
5366 		goto out;
5367 	}
5368 
5369 	left_path->search_commit_root = 1;
5370 	left_path->skip_locking = 1;
5371 	right_path->search_commit_root = 1;
5372 	right_path->skip_locking = 1;
5373 
5374 	/*
5375 	 * Strategy: Go to the first items of both trees. Then do
5376 	 *
5377 	 * If both trees are at level 0
5378 	 *   Compare keys of current items
5379 	 *     If left < right treat left item as new, advance left tree
5380 	 *       and repeat
5381 	 *     If left > right treat right item as deleted, advance right tree
5382 	 *       and repeat
5383 	 *     If left == right do deep compare of items, treat as changed if
5384 	 *       needed, advance both trees and repeat
5385 	 * If both trees are at the same level but not at level 0
5386 	 *   Compare keys of current nodes/leafs
5387 	 *     If left < right advance left tree and repeat
5388 	 *     If left > right advance right tree and repeat
5389 	 *     If left == right compare blockptrs of the next nodes/leafs
5390 	 *       If they match advance both trees but stay at the same level
5391 	 *         and repeat
5392 	 *       If they don't match advance both trees while allowing to go
5393 	 *         deeper and repeat
5394 	 * If tree levels are different
5395 	 *   Advance the tree that needs it and repeat
5396 	 *
5397 	 * Advancing a tree means:
5398 	 *   If we are at level 0, try to go to the next slot. If that's not
5399 	 *   possible, go one level up and repeat. Stop when we found a level
5400 	 *   where we could go to the next slot. We may at this point be on a
5401 	 *   node or a leaf.
5402 	 *
5403 	 *   If we are not at level 0 and not on shared tree blocks, go one
5404 	 *   level deeper.
5405 	 *
5406 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5407 	 *   the right if possible or go up and right.
5408 	 */
5409 
5410 	down_read(&left_root->fs_info->commit_root_sem);
5411 	left_level = btrfs_header_level(left_root->commit_root);
5412 	left_root_level = left_level;
5413 	left_path->nodes[left_level] = left_root->commit_root;
5414 	extent_buffer_get(left_path->nodes[left_level]);
5415 
5416 	right_level = btrfs_header_level(right_root->commit_root);
5417 	right_root_level = right_level;
5418 	right_path->nodes[right_level] = right_root->commit_root;
5419 	extent_buffer_get(right_path->nodes[right_level]);
5420 	up_read(&left_root->fs_info->commit_root_sem);
5421 
5422 	if (left_level == 0)
5423 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5424 				&left_key, left_path->slots[left_level]);
5425 	else
5426 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5427 				&left_key, left_path->slots[left_level]);
5428 	if (right_level == 0)
5429 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5430 				&right_key, right_path->slots[right_level]);
5431 	else
5432 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5433 				&right_key, right_path->slots[right_level]);
5434 
5435 	left_end_reached = right_end_reached = 0;
5436 	advance_left = advance_right = 0;
5437 
5438 	while (1) {
5439 		if (advance_left && !left_end_reached) {
5440 			ret = tree_advance(left_root, left_path, &left_level,
5441 					left_root_level,
5442 					advance_left != ADVANCE_ONLY_NEXT,
5443 					&left_key);
5444 			if (ret < 0)
5445 				left_end_reached = ADVANCE;
5446 			advance_left = 0;
5447 		}
5448 		if (advance_right && !right_end_reached) {
5449 			ret = tree_advance(right_root, right_path, &right_level,
5450 					right_root_level,
5451 					advance_right != ADVANCE_ONLY_NEXT,
5452 					&right_key);
5453 			if (ret < 0)
5454 				right_end_reached = ADVANCE;
5455 			advance_right = 0;
5456 		}
5457 
5458 		if (left_end_reached && right_end_reached) {
5459 			ret = 0;
5460 			goto out;
5461 		} else if (left_end_reached) {
5462 			if (right_level == 0) {
5463 				ret = changed_cb(left_root, right_root,
5464 						left_path, right_path,
5465 						&right_key,
5466 						BTRFS_COMPARE_TREE_DELETED,
5467 						ctx);
5468 				if (ret < 0)
5469 					goto out;
5470 			}
5471 			advance_right = ADVANCE;
5472 			continue;
5473 		} else if (right_end_reached) {
5474 			if (left_level == 0) {
5475 				ret = changed_cb(left_root, right_root,
5476 						left_path, right_path,
5477 						&left_key,
5478 						BTRFS_COMPARE_TREE_NEW,
5479 						ctx);
5480 				if (ret < 0)
5481 					goto out;
5482 			}
5483 			advance_left = ADVANCE;
5484 			continue;
5485 		}
5486 
5487 		if (left_level == 0 && right_level == 0) {
5488 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5489 			if (cmp < 0) {
5490 				ret = changed_cb(left_root, right_root,
5491 						left_path, right_path,
5492 						&left_key,
5493 						BTRFS_COMPARE_TREE_NEW,
5494 						ctx);
5495 				if (ret < 0)
5496 					goto out;
5497 				advance_left = ADVANCE;
5498 			} else if (cmp > 0) {
5499 				ret = changed_cb(left_root, right_root,
5500 						left_path, right_path,
5501 						&right_key,
5502 						BTRFS_COMPARE_TREE_DELETED,
5503 						ctx);
5504 				if (ret < 0)
5505 					goto out;
5506 				advance_right = ADVANCE;
5507 			} else {
5508 				enum btrfs_compare_tree_result result;
5509 
5510 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5511 				ret = tree_compare_item(left_root, left_path,
5512 						right_path, tmp_buf);
5513 				if (ret)
5514 					result = BTRFS_COMPARE_TREE_CHANGED;
5515 				else
5516 					result = BTRFS_COMPARE_TREE_SAME;
5517 				ret = changed_cb(left_root, right_root,
5518 						 left_path, right_path,
5519 						 &left_key, result, ctx);
5520 				if (ret < 0)
5521 					goto out;
5522 				advance_left = ADVANCE;
5523 				advance_right = ADVANCE;
5524 			}
5525 		} else if (left_level == right_level) {
5526 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5527 			if (cmp < 0) {
5528 				advance_left = ADVANCE;
5529 			} else if (cmp > 0) {
5530 				advance_right = ADVANCE;
5531 			} else {
5532 				left_blockptr = btrfs_node_blockptr(
5533 						left_path->nodes[left_level],
5534 						left_path->slots[left_level]);
5535 				right_blockptr = btrfs_node_blockptr(
5536 						right_path->nodes[right_level],
5537 						right_path->slots[right_level]);
5538 				left_gen = btrfs_node_ptr_generation(
5539 						left_path->nodes[left_level],
5540 						left_path->slots[left_level]);
5541 				right_gen = btrfs_node_ptr_generation(
5542 						right_path->nodes[right_level],
5543 						right_path->slots[right_level]);
5544 				if (left_blockptr == right_blockptr &&
5545 				    left_gen == right_gen) {
5546 					/*
5547 					 * As we're on a shared block, don't
5548 					 * allow to go deeper.
5549 					 */
5550 					advance_left = ADVANCE_ONLY_NEXT;
5551 					advance_right = ADVANCE_ONLY_NEXT;
5552 				} else {
5553 					advance_left = ADVANCE;
5554 					advance_right = ADVANCE;
5555 				}
5556 			}
5557 		} else if (left_level < right_level) {
5558 			advance_right = ADVANCE;
5559 		} else {
5560 			advance_left = ADVANCE;
5561 		}
5562 	}
5563 
5564 out:
5565 	btrfs_free_path(left_path);
5566 	btrfs_free_path(right_path);
5567 	kfree(tmp_buf);
5568 	return ret;
5569 }
5570 
5571 /*
5572  * this is similar to btrfs_next_leaf, but does not try to preserve
5573  * and fixup the path.  It looks for and returns the next key in the
5574  * tree based on the current path and the min_trans parameters.
5575  *
5576  * 0 is returned if another key is found, < 0 if there are any errors
5577  * and 1 is returned if there are no higher keys in the tree
5578  *
5579  * path->keep_locks should be set to 1 on the search made before
5580  * calling this function.
5581  */
5582 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5583 			struct btrfs_key *key, int level, u64 min_trans)
5584 {
5585 	int slot;
5586 	struct extent_buffer *c;
5587 
5588 	WARN_ON(!path->keep_locks);
5589 	while (level < BTRFS_MAX_LEVEL) {
5590 		if (!path->nodes[level])
5591 			return 1;
5592 
5593 		slot = path->slots[level] + 1;
5594 		c = path->nodes[level];
5595 next:
5596 		if (slot >= btrfs_header_nritems(c)) {
5597 			int ret;
5598 			int orig_lowest;
5599 			struct btrfs_key cur_key;
5600 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5601 			    !path->nodes[level + 1])
5602 				return 1;
5603 
5604 			if (path->locks[level + 1]) {
5605 				level++;
5606 				continue;
5607 			}
5608 
5609 			slot = btrfs_header_nritems(c) - 1;
5610 			if (level == 0)
5611 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5612 			else
5613 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5614 
5615 			orig_lowest = path->lowest_level;
5616 			btrfs_release_path(path);
5617 			path->lowest_level = level;
5618 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5619 						0, 0);
5620 			path->lowest_level = orig_lowest;
5621 			if (ret < 0)
5622 				return ret;
5623 
5624 			c = path->nodes[level];
5625 			slot = path->slots[level];
5626 			if (ret == 0)
5627 				slot++;
5628 			goto next;
5629 		}
5630 
5631 		if (level == 0)
5632 			btrfs_item_key_to_cpu(c, key, slot);
5633 		else {
5634 			u64 gen = btrfs_node_ptr_generation(c, slot);
5635 
5636 			if (gen < min_trans) {
5637 				slot++;
5638 				goto next;
5639 			}
5640 			btrfs_node_key_to_cpu(c, key, slot);
5641 		}
5642 		return 0;
5643 	}
5644 	return 1;
5645 }
5646 
5647 /*
5648  * search the tree again to find a leaf with greater keys
5649  * returns 0 if it found something or 1 if there are no greater leaves.
5650  * returns < 0 on io errors.
5651  */
5652 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5653 {
5654 	return btrfs_next_old_leaf(root, path, 0);
5655 }
5656 
5657 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5658 			u64 time_seq)
5659 {
5660 	int slot;
5661 	int level;
5662 	struct extent_buffer *c;
5663 	struct extent_buffer *next;
5664 	struct btrfs_key key;
5665 	u32 nritems;
5666 	int ret;
5667 	int old_spinning = path->leave_spinning;
5668 	int next_rw_lock = 0;
5669 
5670 	nritems = btrfs_header_nritems(path->nodes[0]);
5671 	if (nritems == 0)
5672 		return 1;
5673 
5674 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5675 again:
5676 	level = 1;
5677 	next = NULL;
5678 	next_rw_lock = 0;
5679 	btrfs_release_path(path);
5680 
5681 	path->keep_locks = 1;
5682 	path->leave_spinning = 1;
5683 
5684 	if (time_seq)
5685 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5686 	else
5687 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5688 	path->keep_locks = 0;
5689 
5690 	if (ret < 0)
5691 		return ret;
5692 
5693 	nritems = btrfs_header_nritems(path->nodes[0]);
5694 	/*
5695 	 * by releasing the path above we dropped all our locks.  A balance
5696 	 * could have added more items next to the key that used to be
5697 	 * at the very end of the block.  So, check again here and
5698 	 * advance the path if there are now more items available.
5699 	 */
5700 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5701 		if (ret == 0)
5702 			path->slots[0]++;
5703 		ret = 0;
5704 		goto done;
5705 	}
5706 	/*
5707 	 * So the above check misses one case:
5708 	 * - after releasing the path above, someone has removed the item that
5709 	 *   used to be at the very end of the block, and balance between leafs
5710 	 *   gets another one with bigger key.offset to replace it.
5711 	 *
5712 	 * This one should be returned as well, or we can get leaf corruption
5713 	 * later(esp. in __btrfs_drop_extents()).
5714 	 *
5715 	 * And a bit more explanation about this check,
5716 	 * with ret > 0, the key isn't found, the path points to the slot
5717 	 * where it should be inserted, so the path->slots[0] item must be the
5718 	 * bigger one.
5719 	 */
5720 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5721 		ret = 0;
5722 		goto done;
5723 	}
5724 
5725 	while (level < BTRFS_MAX_LEVEL) {
5726 		if (!path->nodes[level]) {
5727 			ret = 1;
5728 			goto done;
5729 		}
5730 
5731 		slot = path->slots[level] + 1;
5732 		c = path->nodes[level];
5733 		if (slot >= btrfs_header_nritems(c)) {
5734 			level++;
5735 			if (level == BTRFS_MAX_LEVEL) {
5736 				ret = 1;
5737 				goto done;
5738 			}
5739 			continue;
5740 		}
5741 
5742 		if (next) {
5743 			btrfs_tree_unlock_rw(next, next_rw_lock);
5744 			free_extent_buffer(next);
5745 		}
5746 
5747 		next = c;
5748 		next_rw_lock = path->locks[level];
5749 		ret = read_block_for_search(NULL, root, path, &next, level,
5750 					    slot, &key, 0);
5751 		if (ret == -EAGAIN)
5752 			goto again;
5753 
5754 		if (ret < 0) {
5755 			btrfs_release_path(path);
5756 			goto done;
5757 		}
5758 
5759 		if (!path->skip_locking) {
5760 			ret = btrfs_try_tree_read_lock(next);
5761 			if (!ret && time_seq) {
5762 				/*
5763 				 * If we don't get the lock, we may be racing
5764 				 * with push_leaf_left, holding that lock while
5765 				 * itself waiting for the leaf we've currently
5766 				 * locked. To solve this situation, we give up
5767 				 * on our lock and cycle.
5768 				 */
5769 				free_extent_buffer(next);
5770 				btrfs_release_path(path);
5771 				cond_resched();
5772 				goto again;
5773 			}
5774 			if (!ret) {
5775 				btrfs_set_path_blocking(path);
5776 				btrfs_tree_read_lock(next);
5777 				btrfs_clear_path_blocking(path, next,
5778 							  BTRFS_READ_LOCK);
5779 			}
5780 			next_rw_lock = BTRFS_READ_LOCK;
5781 		}
5782 		break;
5783 	}
5784 	path->slots[level] = slot;
5785 	while (1) {
5786 		level--;
5787 		c = path->nodes[level];
5788 		if (path->locks[level])
5789 			btrfs_tree_unlock_rw(c, path->locks[level]);
5790 
5791 		free_extent_buffer(c);
5792 		path->nodes[level] = next;
5793 		path->slots[level] = 0;
5794 		if (!path->skip_locking)
5795 			path->locks[level] = next_rw_lock;
5796 		if (!level)
5797 			break;
5798 
5799 		ret = read_block_for_search(NULL, root, path, &next, level,
5800 					    0, &key, 0);
5801 		if (ret == -EAGAIN)
5802 			goto again;
5803 
5804 		if (ret < 0) {
5805 			btrfs_release_path(path);
5806 			goto done;
5807 		}
5808 
5809 		if (!path->skip_locking) {
5810 			ret = btrfs_try_tree_read_lock(next);
5811 			if (!ret) {
5812 				btrfs_set_path_blocking(path);
5813 				btrfs_tree_read_lock(next);
5814 				btrfs_clear_path_blocking(path, next,
5815 							  BTRFS_READ_LOCK);
5816 			}
5817 			next_rw_lock = BTRFS_READ_LOCK;
5818 		}
5819 	}
5820 	ret = 0;
5821 done:
5822 	unlock_up(path, 0, 1, 0, NULL);
5823 	path->leave_spinning = old_spinning;
5824 	if (!old_spinning)
5825 		btrfs_set_path_blocking(path);
5826 
5827 	return ret;
5828 }
5829 
5830 /*
5831  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5832  * searching until it gets past min_objectid or finds an item of 'type'
5833  *
5834  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5835  */
5836 int btrfs_previous_item(struct btrfs_root *root,
5837 			struct btrfs_path *path, u64 min_objectid,
5838 			int type)
5839 {
5840 	struct btrfs_key found_key;
5841 	struct extent_buffer *leaf;
5842 	u32 nritems;
5843 	int ret;
5844 
5845 	while (1) {
5846 		if (path->slots[0] == 0) {
5847 			btrfs_set_path_blocking(path);
5848 			ret = btrfs_prev_leaf(root, path);
5849 			if (ret != 0)
5850 				return ret;
5851 		} else {
5852 			path->slots[0]--;
5853 		}
5854 		leaf = path->nodes[0];
5855 		nritems = btrfs_header_nritems(leaf);
5856 		if (nritems == 0)
5857 			return 1;
5858 		if (path->slots[0] == nritems)
5859 			path->slots[0]--;
5860 
5861 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5862 		if (found_key.objectid < min_objectid)
5863 			break;
5864 		if (found_key.type == type)
5865 			return 0;
5866 		if (found_key.objectid == min_objectid &&
5867 		    found_key.type < type)
5868 			break;
5869 	}
5870 	return 1;
5871 }
5872 
5873 /*
5874  * search in extent tree to find a previous Metadata/Data extent item with
5875  * min objecitd.
5876  *
5877  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5878  */
5879 int btrfs_previous_extent_item(struct btrfs_root *root,
5880 			struct btrfs_path *path, u64 min_objectid)
5881 {
5882 	struct btrfs_key found_key;
5883 	struct extent_buffer *leaf;
5884 	u32 nritems;
5885 	int ret;
5886 
5887 	while (1) {
5888 		if (path->slots[0] == 0) {
5889 			btrfs_set_path_blocking(path);
5890 			ret = btrfs_prev_leaf(root, path);
5891 			if (ret != 0)
5892 				return ret;
5893 		} else {
5894 			path->slots[0]--;
5895 		}
5896 		leaf = path->nodes[0];
5897 		nritems = btrfs_header_nritems(leaf);
5898 		if (nritems == 0)
5899 			return 1;
5900 		if (path->slots[0] == nritems)
5901 			path->slots[0]--;
5902 
5903 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5904 		if (found_key.objectid < min_objectid)
5905 			break;
5906 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5907 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5908 			return 0;
5909 		if (found_key.objectid == min_objectid &&
5910 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5911 			break;
5912 	}
5913 	return 1;
5914 }
5915