xref: /openbmc/linux/fs/btrfs/ctree.c (revision 93d90ad7)
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	return path;
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled,
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
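
/*
 * Illustrative sketch (not part of the original file): the usual path
 * lifecycle pairs btrfs_alloc_path() with btrfs_free_path(), with
 * btrfs_release_path() dropping locks and references in between so the
 * same path can be reused for another search:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 *	... use path->nodes[0] and path->slots[0] ...
 *	btrfs_release_path(path);	(drops locks/refs, path is reusable)
 *	btrfs_free_path(path);		(also releases the path)
 */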

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here: we could free up the root node
		 * because it was COWed, but we may not get the new root
		 * node yet, so do the inc_not_zero dance; if it doesn't
		 * work, synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}
190 
191 /* loop around taking references on and locking the root node of the
192  * tree until you end up with a lock on the root.  A locked buffer
193  * is returned, with a reference held.
194  */
195 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
196 {
197 	struct extent_buffer *eb;
198 
199 	while (1) {
200 		eb = btrfs_root_node(root);
201 		btrfs_tree_read_lock(eb);
202 		if (eb == root->node)
203 			break;
204 		btrfs_tree_read_unlock(eb);
205 		free_extent_buffer(eb);
206 	}
207 	return eb;
208 }
209 
/* cowonly roots (everything not a reference counted cow subvolume) just
 * get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
	    list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
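
/*
 * Illustrative sketch (not part of the original file): a reader that
 * wants a stable view of tree state registers a blocker, does its work,
 * and then drops the blocker so that old log entries can be reclaimed:
 *
 *	struct seq_list elem = {};
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk rewound tree state at sequence number seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While any blocker is registered, btrfs_put_tree_mod_seq() keeps every
 * log element newer than the oldest registered blocker.
 */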

/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}
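
/*
 * Illustrative calling pattern (not part of the original file), as used
 * by the insert helpers below: tree_mod_need_log() is a cheap, lockless
 * early out; allocations then happen outside the lock;
 * tree_mod_dont_log() re-checks under tree_mod_log_write_lock() and, when
 * it returns 0, the caller inserts its elements and finally calls
 * tree_mod_log_write_unlock().
 */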

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key(fs_info, eb, slot,
					MOD_LOG_KEY_REPLACE,
					atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
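
/*
 * Illustrative example (not part of the original file): in a subvolume
 * whose root_item records last_snapshot == 100, a block with header
 * generation 90 predates the newest snapshot, so some snapshot may still
 * reference it and btrfs_block_can_be_shared() returns 1; a block with
 * generation 150 (and no RELOC flag) was created after that snapshot and
 * cannot be shared, so it returns 0.
 */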

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as dropping
	 * a tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret)
			return ret;
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(root->fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). this has the index of the *new*
	 * root, making it the very first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		if (tm->index != first_tm->index)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}
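
/*
 * Illustrative example (not part of the original file): the log for a
 * block is replayed here newest-first, applying the inverse of each
 * recorded op: a logged KEY_ADD is undone by dropping the item again
 * (n--), a logged KEY_REMOVE is undone by restoring the saved
 * key/blockptr/generation (n++), and a logged MOVE_KEYS is undone by
 * moving the items back to their source slots.
 */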

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(root, logical, 0);
		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
			free_extent_buffer(old);
			btrfs_warn(root->fs_info,
				"failed to read tree block %llu from get_old_root", logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_test_is_dummy_root(root))
		return 0;

	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
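
/*
 * Illustrative example (not part of the original file): with a 16KiB
 * blocksize, blocknr = 1048576 and other = 1081344 gives
 * other - (blocknr + blocksize) = 16384 < 32768, so the blocks count as
 * close and defrag skips them; a gap of 32KiB or more past the end of
 * the block makes close_blocks() return 0.
 */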

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
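
/*
 * Illustrative example (not part of the original file): keys order by
 * (objectid, type, offset), so
 *
 *	{256, BTRFS_INODE_ITEM_KEY, 0} < {256, BTRFS_INODE_REF_KEY, 256}
 *	{256, BTRFS_INODE_REF_KEY, 256} < {257, BTRFS_INODE_ITEM_KEY, 0}
 *
 * because BTRFS_INODE_ITEM_KEY (1) < BTRFS_INODE_REF_KEY (12), and the
 * objectid is compared before either of the other fields.
 */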

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = root->nodesize;
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr, gen);
				if (!cur || !extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}
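
/*
 * Illustrative layout sketch (not part of the original file): item
 * headers grow forward from the start of the leaf while item data grows
 * backward from the end, so leaf_data_end() marks where the data stack
 * currently stops:
 *
 *	[header][item 0][item 1]...[free space][data n-1]...[data 1][data 0]
 *	                                       ^
 *	                                       leaf_data_end()
 */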


/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
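
/*
 * Illustrative example (not part of the original file): searching for a
 * key with objectid 300 in a node whose keys have objectids
 * {100, 200, 400, 500} returns 1 with *slot == 2, the position where the
 * missing key would be inserted; an exact match returns 0 with *slot set
 * to the matching index.
 */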

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
				   struct extent_buffer *parent, int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;

	if (slot < 0)
		return NULL;
	if (slot >= btrfs_header_nritems(parent))
		return NULL;

	BUG_ON(level == 0);

	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot));
	if (eb && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = NULL;
	}

	return eb;
}
1868 
1869 /*
1870  * node level balancing, used to make sure nodes are in proper order for
1871  * item deletion.  We balance from the top down, so we have to make sure
1872  * that a deletion won't leave a node completely empty later on.
1873  */
1874 static noinline int balance_level(struct btrfs_trans_handle *trans,
1875 			 struct btrfs_root *root,
1876 			 struct btrfs_path *path, int level)
1877 {
1878 	struct extent_buffer *right = NULL;
1879 	struct extent_buffer *mid;
1880 	struct extent_buffer *left = NULL;
1881 	struct extent_buffer *parent = NULL;
1882 	int ret = 0;
1883 	int wret;
1884 	int pslot;
1885 	int orig_slot = path->slots[level];
1886 	u64 orig_ptr;
1887 
1888 	if (level == 0)
1889 		return 0;
1890 
1891 	mid = path->nodes[level];
1892 
1893 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1894 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1895 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1896 
1897 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1898 
1899 	if (level < BTRFS_MAX_LEVEL - 1) {
1900 		parent = path->nodes[level + 1];
1901 		pslot = path->slots[level + 1];
1902 	}
1903 
1904 	/*
1905 	 * deal with the case where there is only one pointer in the root
1906 	 * by promoting the node below to a root
1907 	 */
1908 	if (!parent) {
1909 		struct extent_buffer *child;
1910 
1911 		if (btrfs_header_nritems(mid) != 1)
1912 			return 0;
1913 
1914 		/* promote the child to a root */
1915 		child = read_node_slot(root, mid, 0);
1916 		if (!child) {
1917 			ret = -EROFS;
1918 			btrfs_std_error(root->fs_info, ret);
1919 			goto enospc;
1920 		}
1921 
1922 		btrfs_tree_lock(child);
1923 		btrfs_set_lock_blocking(child);
1924 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1925 		if (ret) {
1926 			btrfs_tree_unlock(child);
1927 			free_extent_buffer(child);
1928 			goto enospc;
1929 		}
1930 
1931 		tree_mod_log_set_root_pointer(root, child, 1);
1932 		rcu_assign_pointer(root->node, child);
1933 
1934 		add_root_to_dirty_list(root);
1935 		btrfs_tree_unlock(child);
1936 
1937 		path->locks[level] = 0;
1938 		path->nodes[level] = NULL;
1939 		clean_tree_block(trans, root, mid);
1940 		btrfs_tree_unlock(mid);
1941 		/* once for the path */
1942 		free_extent_buffer(mid);
1943 
1944 		root_sub_used(root, mid->len);
1945 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1946 		/* once for the root ptr */
1947 		free_extent_buffer_stale(mid);
1948 		return 0;
1949 	}
1950 	if (btrfs_header_nritems(mid) >
1951 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1952 		return 0;
1953 
1954 	left = read_node_slot(root, parent, pslot - 1);
1955 	if (left) {
1956 		btrfs_tree_lock(left);
1957 		btrfs_set_lock_blocking(left);
1958 		wret = btrfs_cow_block(trans, root, left,
1959 				       parent, pslot - 1, &left);
1960 		if (wret) {
1961 			ret = wret;
1962 			goto enospc;
1963 		}
1964 	}
1965 	right = read_node_slot(root, parent, pslot + 1);
1966 	if (right) {
1967 		btrfs_tree_lock(right);
1968 		btrfs_set_lock_blocking(right);
1969 		wret = btrfs_cow_block(trans, root, right,
1970 				       parent, pslot + 1, &right);
1971 		if (wret) {
1972 			ret = wret;
1973 			goto enospc;
1974 		}
1975 	}
1976 
1977 	/* first, try to make some room in the middle buffer */
1978 	if (left) {
1979 		orig_slot += btrfs_header_nritems(left);
1980 		wret = push_node_left(trans, root, left, mid, 1);
1981 		if (wret < 0)
1982 			ret = wret;
1983 	}
1984 
1985 	/*
1986 	 * then try to empty the right most buffer into the middle
1987 	 */
1988 	if (right) {
1989 		wret = push_node_left(trans, root, mid, right, 1);
1990 		if (wret < 0 && wret != -ENOSPC)
1991 			ret = wret;
1992 		if (btrfs_header_nritems(right) == 0) {
1993 			clean_tree_block(trans, root, right);
1994 			btrfs_tree_unlock(right);
1995 			del_ptr(root, path, level + 1, pslot + 1);
1996 			root_sub_used(root, right->len);
1997 			btrfs_free_tree_block(trans, root, right, 0, 1);
1998 			free_extent_buffer_stale(right);
1999 			right = NULL;
2000 		} else {
2001 			struct btrfs_disk_key right_key;
2002 			btrfs_node_key(right, &right_key, 0);
2003 			tree_mod_log_set_node_key(root->fs_info, parent,
2004 						  pslot + 1, 0);
2005 			btrfs_set_node_key(parent, &right_key, pslot + 1);
2006 			btrfs_mark_buffer_dirty(parent);
2007 		}
2008 	}
2009 	if (btrfs_header_nritems(mid) == 1) {
2010 		/*
2011 		 * we're not allowed to leave a node with one item in the
2012 		 * tree during a delete.  A deletion from lower in the tree
2013 		 * could try to delete the only pointer in this node.
2014 		 * So, pull some keys from the left.
2015 		 * There has to be a left pointer at this point because
2016 		 * otherwise we would have pulled some pointers from the
2017 		 * right
2018 		 */
2019 		if (!left) {
2020 			ret = -EROFS;
2021 			btrfs_std_error(root->fs_info, ret);
2022 			goto enospc;
2023 		}
2024 		wret = balance_node_right(trans, root, mid, left);
2025 		if (wret < 0) {
2026 			ret = wret;
2027 			goto enospc;
2028 		}
2029 		if (wret == 1) {
2030 			wret = push_node_left(trans, root, left, mid, 1);
2031 			if (wret < 0)
2032 				ret = wret;
2033 		}
2034 		BUG_ON(wret == 1);
2035 	}
2036 	if (btrfs_header_nritems(mid) == 0) {
2037 		clean_tree_block(trans, root, mid);
2038 		btrfs_tree_unlock(mid);
2039 		del_ptr(root, path, level + 1, pslot);
2040 		root_sub_used(root, mid->len);
2041 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2042 		free_extent_buffer_stale(mid);
2043 		mid = NULL;
2044 	} else {
2045 		/* update the parent key to reflect our changes */
2046 		struct btrfs_disk_key mid_key;
2047 		btrfs_node_key(mid, &mid_key, 0);
2048 		tree_mod_log_set_node_key(root->fs_info, parent,
2049 					  pslot, 0);
2050 		btrfs_set_node_key(parent, &mid_key, pslot);
2051 		btrfs_mark_buffer_dirty(parent);
2052 	}
2053 
2054 	/* update the path */
2055 	if (left) {
2056 		if (btrfs_header_nritems(left) > orig_slot) {
2057 			extent_buffer_get(left);
2058 			/* left was locked after cow */
2059 			path->nodes[level] = left;
2060 			path->slots[level + 1] -= 1;
2061 			path->slots[level] = orig_slot;
2062 			if (mid) {
2063 				btrfs_tree_unlock(mid);
2064 				free_extent_buffer(mid);
2065 			}
2066 		} else {
2067 			orig_slot -= btrfs_header_nritems(left);
2068 			path->slots[level] = orig_slot;
2069 		}
2070 	}
2071 	/* double check we haven't messed things up */
2072 	if (orig_ptr !=
2073 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2074 		BUG();
2075 enospc:
2076 	if (right) {
2077 		btrfs_tree_unlock(right);
2078 		free_extent_buffer(right);
2079 	}
2080 	if (left) {
2081 		if (path->nodes[level] != left)
2082 			btrfs_tree_unlock(left);
2083 		free_extent_buffer(left);
2084 	}
2085 	return ret;
2086 }
2087 
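/*
 * Editorial sketch (not part of this file): the decision balance_level()
 * makes before touching any buffers, with 'capacity' standing in for
 * BTRFS_NODEPTRS_PER_BLOCK(root).  The enum and names are illustrative
 * only.
 */
enum demo_balance_action { DEMO_NONE, DEMO_COLLAPSE_ROOT, DEMO_REBALANCE };

static enum demo_balance_action
demo_balance_decision(int is_root, unsigned int nritems, unsigned int capacity)
{
	if (is_root)
		/* a root left with a single pointer is replaced by its child */
		return nritems == 1 ? DEMO_COLLAPSE_ROOT : DEMO_NONE;
	if (nritems > capacity / 4)
		/* more than a quarter full: leave the node alone */
		return DEMO_NONE;
	/* under-filled: push pointers to/from the left and right siblings */
	return DEMO_REBALANCE;
}
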
2088 /* Node balancing for insertion.  Here we only split or push nodes around
2089  * when they are completely full.  This is also done top down, so we
2090  * have to be pessimistic.
2091  */
2092 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2093 					  struct btrfs_root *root,
2094 					  struct btrfs_path *path, int level)
2095 {
2096 	struct extent_buffer *right = NULL;
2097 	struct extent_buffer *mid;
2098 	struct extent_buffer *left = NULL;
2099 	struct extent_buffer *parent = NULL;
2100 	int ret = 0;
2101 	int wret;
2102 	int pslot;
2103 	int orig_slot = path->slots[level];
2104 
2105 	if (level == 0)
2106 		return 1;
2107 
2108 	mid = path->nodes[level];
2109 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2110 
2111 	if (level < BTRFS_MAX_LEVEL - 1) {
2112 		parent = path->nodes[level + 1];
2113 		pslot = path->slots[level + 1];
2114 	}
2115 
2116 	if (!parent)
2117 		return 1;
2118 
2119 	left = read_node_slot(root, parent, pslot - 1);
2120 
2121 	/* first, try to make some room in the middle buffer */
2122 	if (left) {
2123 		u32 left_nr;
2124 
2125 		btrfs_tree_lock(left);
2126 		btrfs_set_lock_blocking(left);
2127 
2128 		left_nr = btrfs_header_nritems(left);
2129 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2130 			wret = 1;
2131 		} else {
2132 			ret = btrfs_cow_block(trans, root, left, parent,
2133 					      pslot - 1, &left);
2134 			if (ret)
2135 				wret = 1;
2136 			else {
2137 				wret = push_node_left(trans, root,
2138 						      left, mid, 0);
2139 			}
2140 		}
2141 		if (wret < 0)
2142 			ret = wret;
2143 		if (wret == 0) {
2144 			struct btrfs_disk_key disk_key;
2145 			orig_slot += left_nr;
2146 			btrfs_node_key(mid, &disk_key, 0);
2147 			tree_mod_log_set_node_key(root->fs_info, parent,
2148 						  pslot, 0);
2149 			btrfs_set_node_key(parent, &disk_key, pslot);
2150 			btrfs_mark_buffer_dirty(parent);
2151 			if (btrfs_header_nritems(left) > orig_slot) {
2152 				path->nodes[level] = left;
2153 				path->slots[level + 1] -= 1;
2154 				path->slots[level] = orig_slot;
2155 				btrfs_tree_unlock(mid);
2156 				free_extent_buffer(mid);
2157 			} else {
2158 				orig_slot -=
2159 					btrfs_header_nritems(left);
2160 				path->slots[level] = orig_slot;
2161 				btrfs_tree_unlock(left);
2162 				free_extent_buffer(left);
2163 			}
2164 			return 0;
2165 		}
2166 		btrfs_tree_unlock(left);
2167 		free_extent_buffer(left);
2168 	}
2169 	right = read_node_slot(root, parent, pslot + 1);
2170 
2171 	/*
2172 	 * then try to empty the right most buffer into the middle
2173 	 */
2174 	if (right) {
2175 		u32 right_nr;
2176 
2177 		btrfs_tree_lock(right);
2178 		btrfs_set_lock_blocking(right);
2179 
2180 		right_nr = btrfs_header_nritems(right);
2181 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2182 			wret = 1;
2183 		} else {
2184 			ret = btrfs_cow_block(trans, root, right,
2185 					      parent, pslot + 1,
2186 					      &right);
2187 			if (ret)
2188 				wret = 1;
2189 			else {
2190 				wret = balance_node_right(trans, root,
2191 							  right, mid);
2192 			}
2193 		}
2194 		if (wret < 0)
2195 			ret = wret;
2196 		if (wret == 0) {
2197 			struct btrfs_disk_key disk_key;
2198 
2199 			btrfs_node_key(right, &disk_key, 0);
2200 			tree_mod_log_set_node_key(root->fs_info, parent,
2201 						  pslot + 1, 0);
2202 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2203 			btrfs_mark_buffer_dirty(parent);
2204 
2205 			if (btrfs_header_nritems(mid) <= orig_slot) {
2206 				path->nodes[level] = right;
2207 				path->slots[level + 1] += 1;
2208 				path->slots[level] = orig_slot -
2209 					btrfs_header_nritems(mid);
2210 				btrfs_tree_unlock(mid);
2211 				free_extent_buffer(mid);
2212 			} else {
2213 				btrfs_tree_unlock(right);
2214 				free_extent_buffer(right);
2215 			}
2216 			return 0;
2217 		}
2218 		btrfs_tree_unlock(right);
2219 		free_extent_buffer(right);
2220 	}
2221 	return 1;
2222 }
2223 
2224 /*
2225  * readahead one full node of leaves, finding things that are close
2226  * to the block in 'slot', and triggering ra on them.
2227  */
2228 static void reada_for_search(struct btrfs_root *root,
2229 			     struct btrfs_path *path,
2230 			     int level, int slot, u64 objectid)
2231 {
2232 	struct extent_buffer *node;
2233 	struct btrfs_disk_key disk_key;
2234 	u32 nritems;
2235 	u64 search;
2236 	u64 target;
2237 	u64 nread = 0;
2238 	u64 gen;
2239 	int direction = path->reada;
2240 	struct extent_buffer *eb;
2241 	u32 nr;
2242 	u32 blocksize;
2243 	u32 nscan = 0;
2244 
2245 	if (level != 1)
2246 		return;
2247 
2248 	if (!path->nodes[level])
2249 		return;
2250 
2251 	node = path->nodes[level];
2252 
2253 	search = btrfs_node_blockptr(node, slot);
2254 	blocksize = root->nodesize;
2255 	eb = btrfs_find_tree_block(root, search);
2256 	if (eb) {
2257 		free_extent_buffer(eb);
2258 		return;
2259 	}
2260 
2261 	target = search;
2262 
2263 	nritems = btrfs_header_nritems(node);
2264 	nr = slot;
2265 
2266 	while (1) {
2267 		if (direction < 0) {
2268 			if (nr == 0)
2269 				break;
2270 			nr--;
2271 		} else if (direction > 0) {
2272 			nr++;
2273 			if (nr >= nritems)
2274 				break;
2275 		}
2276 		if (path->reada < 0 && objectid) {
2277 			btrfs_node_key(node, &disk_key, nr);
2278 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2279 				break;
2280 		}
2281 		search = btrfs_node_blockptr(node, nr);
2282 		if ((search <= target && target - search <= 65536) ||
2283 		    (search > target && search - target <= 65536)) {
2284 			gen = btrfs_node_ptr_generation(node, nr);
2285 			readahead_tree_block(root, search, blocksize);
2286 			nread += blocksize;
2287 		}
2288 		nscan++;
2289 		if ((nread > 65536 || nscan > 32))
2290 			break;
2291 	}
2292 }
2293 
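/*
 * Editorial sketch (not part of this file): the filter reada_for_search()
 * applies to each candidate slot.  A sibling block is only prefetched
 * while it sits within 64KiB of the target block, and the scan stops
 * once 64KiB of readahead has been issued or 32 slots have been probed.
 * Hypothetical helper, userspace-style.
 */
static int demo_should_readahead(unsigned long long target,
				 unsigned long long candidate,
				 unsigned long long nread,
				 unsigned int nscan)
{
	unsigned long long dist = candidate > target ? candidate - target
						     : target - candidate;

	if (nread > 65536 || nscan > 32)
		return 0;		/* readahead budget exhausted */
	return dist <= 65536;		/* only prefetch nearby blocks */
}
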
2294 static noinline void reada_for_balance(struct btrfs_root *root,
2295 				       struct btrfs_path *path, int level)
2296 {
2297 	int slot;
2298 	int nritems;
2299 	struct extent_buffer *parent;
2300 	struct extent_buffer *eb;
2301 	u64 gen;
2302 	u64 block1 = 0;
2303 	u64 block2 = 0;
2304 	int blocksize;
2305 
2306 	parent = path->nodes[level + 1];
2307 	if (!parent)
2308 		return;
2309 
2310 	nritems = btrfs_header_nritems(parent);
2311 	slot = path->slots[level + 1];
2312 	blocksize = root->nodesize;
2313 
2314 	if (slot > 0) {
2315 		block1 = btrfs_node_blockptr(parent, slot - 1);
2316 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2317 		eb = btrfs_find_tree_block(root, block1);
2318 		/*
2319 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2320 		 * don't want to return -EAGAIN here.  That will loop
2321 		 * forever
2322 		 */
2323 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2324 			block1 = 0;
2325 		free_extent_buffer(eb);
2326 	}
2327 	if (slot + 1 < nritems) {
2328 		block2 = btrfs_node_blockptr(parent, slot + 1);
2329 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2330 		eb = btrfs_find_tree_block(root, block2);
2331 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2332 			block2 = 0;
2333 		free_extent_buffer(eb);
2334 	}
2335 
2336 	if (block1)
2337 		readahead_tree_block(root, block1, blocksize);
2338 	if (block2)
2339 		readahead_tree_block(root, block2, blocksize);
2340 }
2341 
2342 
2343 /*
2344  * when we walk down the tree, it is usually safe to unlock the higher layers
2345  * in the tree.  The exceptions are when our path goes through slot 0, because
2346  * operations on the tree might require changing key pointers higher up in the
2347  * tree.
2348  *
2349  * callers might also have set path->keep_locks, which tells this code to keep
2350  * the lock if the path points to the last slot in the block.  This is part of
2351  * walking through the tree, and selecting the next slot in the higher block.
2352  *
2353  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2354  * if lowest_unlock is 1, level 0 won't be unlocked
2355  */
2356 static noinline void unlock_up(struct btrfs_path *path, int level,
2357 			       int lowest_unlock, int min_write_lock_level,
2358 			       int *write_lock_level)
2359 {
2360 	int i;
2361 	int skip_level = level;
2362 	int no_skips = 0;
2363 	struct extent_buffer *t;
2364 
2365 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2366 		if (!path->nodes[i])
2367 			break;
2368 		if (!path->locks[i])
2369 			break;
2370 		if (!no_skips && path->slots[i] == 0) {
2371 			skip_level = i + 1;
2372 			continue;
2373 		}
2374 		if (!no_skips && path->keep_locks) {
2375 			u32 nritems;
2376 			t = path->nodes[i];
2377 			nritems = btrfs_header_nritems(t);
2378 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2379 				skip_level = i + 1;
2380 				continue;
2381 			}
2382 		}
2383 		if (skip_level < i && i >= lowest_unlock)
2384 			no_skips = 1;
2385 
2386 		t = path->nodes[i];
2387 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2388 			btrfs_tree_unlock_rw(t, path->locks[i]);
2389 			path->locks[i] = 0;
2390 			if (write_lock_level &&
2391 			    i > min_write_lock_level &&
2392 			    i <= *write_lock_level) {
2393 				*write_lock_level = i - 1;
2394 			}
2395 		}
2396 	}
2397 }
2398 
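/*
 * Editorial sketch (not part of this file): the skip rule inside
 * unlock_up(), ignoring the keep_locks and lowest_unlock refinements.
 * A node sitting in slot 0 forces its parent to stay locked (a key
 * change there may propagate upward), so unlocking can only begin
 * above the run of slot-0 levels.
 */
static int demo_first_unlockable_level(const int *slots, int level,
				       int max_level)
{
	int i;

	for (i = level; i < max_level; i++) {
		if (slots[i] != 0)
			return i + 1;	/* all levels above this may unlock */
	}
	return max_level;		/* slot 0 all the way up: keep all */
}
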
2399 /*
2400  * This releases any locks held in the path starting at level and
2401  * going all the way up to the root.
2402  *
2403  * btrfs_search_slot will keep the lock held on higher nodes in a few
2404  * corner cases, such as COW of the block at slot zero in the node.  This
2405  * ignores those rules, and it should only be called when there are no
2406  * more updates to be done higher up in the tree.
2407  */
2408 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2409 {
2410 	int i;
2411 
2412 	if (path->keep_locks)
2413 		return;
2414 
2415 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2416 		if (!path->nodes[i])
2417 			continue;
2418 		if (!path->locks[i])
2419 			continue;
2420 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2421 		path->locks[i] = 0;
2422 	}
2423 }
2424 
2425 /*
2426  * helper function for btrfs_search_slot.  The goal is to find a block
2427  * in cache without setting the path to blocking.  If we find the block
2428  * we return zero and the path is unchanged.
2429  *
2430  * If we can't find the block, we set the path blocking and do some
2431  * reada.  -EAGAIN is returned and the search must be repeated.
2432  */
2433 static int
2434 read_block_for_search(struct btrfs_trans_handle *trans,
2435 		       struct btrfs_root *root, struct btrfs_path *p,
2436 		       struct extent_buffer **eb_ret, int level, int slot,
2437 		       struct btrfs_key *key, u64 time_seq)
2438 {
2439 	u64 blocknr;
2440 	u64 gen;
2441 	struct extent_buffer *b = *eb_ret;
2442 	struct extent_buffer *tmp;
2443 	int ret;
2444 
2445 	blocknr = btrfs_node_blockptr(b, slot);
2446 	gen = btrfs_node_ptr_generation(b, slot);
2447 
2448 	tmp = btrfs_find_tree_block(root, blocknr);
2449 	if (tmp) {
2450 		/* first we do an atomic uptodate check */
2451 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2452 			*eb_ret = tmp;
2453 			return 0;
2454 		}
2455 
2456 		/* the pages were up to date, but we failed
2457 		 * the generation number check.  Do a full
2458 		 * read for the generation number that is correct.
2459 		 * We must do this without dropping locks so
2460 		 * we can trust our generation number
2461 		 */
2462 		btrfs_set_path_blocking(p);
2463 
2464 		/* now we're allowed to do a blocking uptodate check */
2465 		ret = btrfs_read_buffer(tmp, gen);
2466 		if (!ret) {
2467 			*eb_ret = tmp;
2468 			return 0;
2469 		}
2470 		free_extent_buffer(tmp);
2471 		btrfs_release_path(p);
2472 		return -EIO;
2473 	}
2474 
2475 	/*
2476 	 * reduce lock contention at high levels
2477 	 * of the btree by dropping locks before
2478 	 * we read.  Don't release the lock on the current
2479 	 * level because we need to walk this node to figure
2480 	 * out which blocks to read.
2481 	 */
2482 	btrfs_unlock_up_safe(p, level + 1);
2483 	btrfs_set_path_blocking(p);
2484 
2485 	free_extent_buffer(tmp);
2486 	if (p->reada)
2487 		reada_for_search(root, p, level, slot, key->objectid);
2488 
2489 	btrfs_release_path(p);
2490 
2491 	ret = -EAGAIN;
2492 	tmp = read_tree_block(root, blocknr, 0);
2493 	if (tmp) {
2494 		/*
2495 		 * If the read above didn't mark this buffer up to date,
2496 		 * it will never end up being up to date.  Set ret to EIO now
2497 		 * and give up so that our caller doesn't loop forever
2498 		 * on our EAGAINs.
2499 		 */
2500 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2501 			ret = -EIO;
2502 		free_extent_buffer(tmp);
2503 	}
2504 	return ret;
2505 }
2506 
2507 /*
2508  * helper function for btrfs_search_slot.  This does all of the checks
2509  * for node-level blocks and does any balancing required based on
2510  * the ins_len.
2511  *
2512  * If no extra work was required, zero is returned.  If we had to
2513  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2514  * start over
2515  */
2516 static int
2517 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2518 		       struct btrfs_root *root, struct btrfs_path *p,
2519 		       struct extent_buffer *b, int level, int ins_len,
2520 		       int *write_lock_level)
2521 {
2522 	int ret;
2523 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2524 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2525 		int sret;
2526 
2527 		if (*write_lock_level < level + 1) {
2528 			*write_lock_level = level + 1;
2529 			btrfs_release_path(p);
2530 			goto again;
2531 		}
2532 
2533 		btrfs_set_path_blocking(p);
2534 		reada_for_balance(root, p, level);
2535 		sret = split_node(trans, root, p, level);
2536 		btrfs_clear_path_blocking(p, NULL, 0);
2537 
2538 		BUG_ON(sret > 0);
2539 		if (sret) {
2540 			ret = sret;
2541 			goto done;
2542 		}
2543 		b = p->nodes[level];
2544 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2545 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2546 		int sret;
2547 
2548 		if (*write_lock_level < level + 1) {
2549 			*write_lock_level = level + 1;
2550 			btrfs_release_path(p);
2551 			goto again;
2552 		}
2553 
2554 		btrfs_set_path_blocking(p);
2555 		reada_for_balance(root, p, level);
2556 		sret = balance_level(trans, root, p, level);
2557 		btrfs_clear_path_blocking(p, NULL, 0);
2558 
2559 		if (sret) {
2560 			ret = sret;
2561 			goto done;
2562 		}
2563 		b = p->nodes[level];
2564 		if (!b) {
2565 			btrfs_release_path(p);
2566 			goto again;
2567 		}
2568 		BUG_ON(btrfs_header_nritems(b) == 1);
2569 	}
2570 	return 0;
2571 
2572 again:
2573 	ret = -EAGAIN;
2574 done:
2575 	return ret;
2576 }
2577 
2578 static void key_search_validate(struct extent_buffer *b,
2579 				struct btrfs_key *key,
2580 				int level)
2581 {
2582 #ifdef CONFIG_BTRFS_ASSERT
2583 	struct btrfs_disk_key disk_key;
2584 
2585 	btrfs_cpu_key_to_disk(&disk_key, key);
2586 
2587 	if (level == 0)
2588 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2589 		    offsetof(struct btrfs_leaf, items[0].key),
2590 		    sizeof(disk_key)));
2591 	else
2592 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2593 		    offsetof(struct btrfs_node, ptrs[0].key),
2594 		    sizeof(disk_key)));
2595 #endif
2596 }
2597 
2598 static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2599 		      int level, int *prev_cmp, int *slot)
2600 {
2601 	if (*prev_cmp != 0) {
2602 		*prev_cmp = bin_search(b, key, level, slot);
2603 		return *prev_cmp;
2604 	}
2605 
2606 	key_search_validate(b, key, level);
2607 	*slot = 0;
2608 
2609 	return 0;
2610 }
2611 
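/*
 * Editorial sketch (not part of this file): the prev_cmp optimization in
 * key_search() above, reusing demo_bin_search() from the earlier sketch.
 * Once one level compares equal, the search key is by construction the
 * lowest key of every block we descend into from there, so lower levels
 * can take slot 0 without running another binary search.
 */
static int demo_key_search(const unsigned long long *keys, int max,
			   unsigned long long key, int *prev_cmp, int *slot)
{
	if (*prev_cmp != 0) {
		*prev_cmp = demo_bin_search(keys, max, key, slot);
		return *prev_cmp;
	}

	/* an ancestor matched exactly: key must be first in this block */
	*slot = 0;
	return 0;
}
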
2612 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
2613 		u64 iobjectid, u64 ioff, u8 key_type,
2614 		struct btrfs_key *found_key)
2615 {
2616 	int ret;
2617 	struct btrfs_key key;
2618 	struct extent_buffer *eb;
2619 	struct btrfs_path *path;
2620 
2621 	key.type = key_type;
2622 	key.objectid = iobjectid;
2623 	key.offset = ioff;
2624 
2625 	if (found_path == NULL) {
2626 		path = btrfs_alloc_path();
2627 		if (!path)
2628 			return -ENOMEM;
2629 	} else
2630 		path = found_path;
2631 
2632 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2633 	if ((ret < 0) || (found_key == NULL)) {
2634 		if (path != found_path)
2635 			btrfs_free_path(path);
2636 		return ret;
2637 	}
2638 
2639 	eb = path->nodes[0];
2640 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2641 		ret = btrfs_next_leaf(fs_root, path);
2642 		if (ret)
2643 			return ret;
2644 		eb = path->nodes[0];
2645 	}
2646 
2647 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2648 	if (found_key->type != key.type ||
2649 			found_key->objectid != key.objectid)
2650 		return 1;
2651 
2652 	return 0;
2653 }
2654 
2655 /*
2656  * look for key in the tree.  path is filled in with nodes along the way.
2657  * if key is found, we return zero and you can find the item in the leaf
2658  * level of the path (level 0)
2659  *
2660  * If the key isn't found, the path points to the slot where it should
2661  * be inserted, and 1 is returned.  If there are other errors during the
2662  * search a negative error number is returned.
2663  *
2664  * if ins_len > 0, nodes and leaves will be split as we walk down the
2665  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2666  * possible)
2667  */
2668 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2669 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
2670 		      ins_len, int cow)
2671 {
2672 	struct extent_buffer *b;
2673 	int slot;
2674 	int ret;
2675 	int err;
2676 	int level;
2677 	int lowest_unlock = 1;
2678 	int root_lock;
2679 	/* everything at write_lock_level or lower must be write locked */
2680 	int write_lock_level = 0;
2681 	u8 lowest_level = 0;
2682 	int min_write_lock_level;
2683 	int prev_cmp;
2684 
2685 	lowest_level = p->lowest_level;
2686 	WARN_ON(lowest_level && ins_len > 0);
2687 	WARN_ON(p->nodes[0] != NULL);
2688 	BUG_ON(!cow && ins_len);
2689 
2690 	if (ins_len < 0) {
2691 		lowest_unlock = 2;
2692 
2693 		/* when we are removing items, we might have to go up to level
2694 		 * two as we update tree pointers.  Make sure we keep write
2695 		 * locks for those levels as well
2696 		 */
2697 		write_lock_level = 2;
2698 	} else if (ins_len > 0) {
2699 		/*
2700 		 * for inserting items, make sure we have a write lock on
2701 		 * level 1 so we can update keys
2702 		 */
2703 		write_lock_level = 1;
2704 	}
2705 
2706 	if (!cow)
2707 		write_lock_level = -1;
2708 
2709 	if (cow && (p->keep_locks || p->lowest_level))
2710 		write_lock_level = BTRFS_MAX_LEVEL;
2711 
2712 	min_write_lock_level = write_lock_level;
2713 
2714 again:
2715 	prev_cmp = -1;
2716 	/*
2717 	 * we try very hard to do read locks on the root
2718 	 */
2719 	root_lock = BTRFS_READ_LOCK;
2720 	level = 0;
2721 	if (p->search_commit_root) {
2722 		/*
2723 		 * the commit roots are read only
2724 		 * so we always do read locks
2725 		 */
2726 		if (p->need_commit_sem)
2727 			down_read(&root->fs_info->commit_root_sem);
2728 		b = root->commit_root;
2729 		extent_buffer_get(b);
2730 		level = btrfs_header_level(b);
2731 		if (p->need_commit_sem)
2732 			up_read(&root->fs_info->commit_root_sem);
2733 		if (!p->skip_locking)
2734 			btrfs_tree_read_lock(b);
2735 	} else {
2736 		if (p->skip_locking) {
2737 			b = btrfs_root_node(root);
2738 			level = btrfs_header_level(b);
2739 		} else {
2740 			/* we don't know the level of the root node
2741 			 * until we actually have it read locked
2742 			 */
2743 			b = btrfs_read_lock_root_node(root);
2744 			level = btrfs_header_level(b);
2745 			if (level <= write_lock_level) {
2746 				/* whoops, must trade for write lock */
2747 				btrfs_tree_read_unlock(b);
2748 				free_extent_buffer(b);
2749 				b = btrfs_lock_root_node(root);
2750 				root_lock = BTRFS_WRITE_LOCK;
2751 
2752 				/* the level might have changed, check again */
2753 				level = btrfs_header_level(b);
2754 			}
2755 		}
2756 	}
2757 	p->nodes[level] = b;
2758 	if (!p->skip_locking)
2759 		p->locks[level] = root_lock;
2760 
2761 	while (b) {
2762 		level = btrfs_header_level(b);
2763 
2764 		/*
2765 		 * setup the path here so we can release it under lock
2766 		 * contention with the cow code
2767 		 */
2768 		if (cow) {
2769 			/*
2770 			 * if we don't really need to cow this block
2771 			 * then we don't want to set the path blocking,
2772 			 * so we test it here
2773 			 */
2774 			if (!should_cow_block(trans, root, b))
2775 				goto cow_done;
2776 
2777 			/*
2778 			 * must have write locks on this node and the
2779 			 * parent
2780 			 */
2781 			if (level > write_lock_level ||
2782 			    (level + 1 > write_lock_level &&
2783 			    level + 1 < BTRFS_MAX_LEVEL &&
2784 			    p->nodes[level + 1])) {
2785 				write_lock_level = level + 1;
2786 				btrfs_release_path(p);
2787 				goto again;
2788 			}
2789 
2790 			btrfs_set_path_blocking(p);
2791 			err = btrfs_cow_block(trans, root, b,
2792 					      p->nodes[level + 1],
2793 					      p->slots[level + 1], &b);
2794 			if (err) {
2795 				ret = err;
2796 				goto done;
2797 			}
2798 		}
2799 cow_done:
2800 		p->nodes[level] = b;
2801 		btrfs_clear_path_blocking(p, NULL, 0);
2802 
2803 		/*
2804 		 * we have a lock on b and as long as we aren't changing
2805 		 * the tree, there is no way for the items in b to change.
2806 		 * It is safe to drop the lock on our parent before we
2807 		 * go through the expensive btree search on b.
2808 		 *
2809 		 * If we're inserting or deleting (ins_len != 0), then we might
2810 		 * be changing slot zero, which may require changing the parent.
2811 		 * So, we can't drop the lock until after we know which slot
2812 		 * we're operating on.
2813 		 */
2814 		if (!ins_len && !p->keep_locks) {
2815 			int u = level + 1;
2816 
2817 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2818 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2819 				p->locks[u] = 0;
2820 			}
2821 		}
2822 
2823 		ret = key_search(b, key, level, &prev_cmp, &slot);
2824 
2825 		if (level != 0) {
2826 			int dec = 0;
2827 			if (ret && slot > 0) {
2828 				dec = 1;
2829 				slot -= 1;
2830 			}
2831 			p->slots[level] = slot;
2832 			err = setup_nodes_for_search(trans, root, p, b, level,
2833 					     ins_len, &write_lock_level);
2834 			if (err == -EAGAIN)
2835 				goto again;
2836 			if (err) {
2837 				ret = err;
2838 				goto done;
2839 			}
2840 			b = p->nodes[level];
2841 			slot = p->slots[level];
2842 
2843 			/*
2844 			 * slot 0 is special, if we change the key
2845 			 * we have to update the parent pointer
2846 			 * which means we must have a write lock
2847 			 * on the parent
2848 			 */
2849 			if (slot == 0 && ins_len &&
2850 			    write_lock_level < level + 1) {
2851 				write_lock_level = level + 1;
2852 				btrfs_release_path(p);
2853 				goto again;
2854 			}
2855 
2856 			unlock_up(p, level, lowest_unlock,
2857 				  min_write_lock_level, &write_lock_level);
2858 
2859 			if (level == lowest_level) {
2860 				if (dec)
2861 					p->slots[level]++;
2862 				goto done;
2863 			}
2864 
2865 			err = read_block_for_search(trans, root, p,
2866 						    &b, level, slot, key, 0);
2867 			if (err == -EAGAIN)
2868 				goto again;
2869 			if (err) {
2870 				ret = err;
2871 				goto done;
2872 			}
2873 
2874 			if (!p->skip_locking) {
2875 				level = btrfs_header_level(b);
2876 				if (level <= write_lock_level) {
2877 					err = btrfs_try_tree_write_lock(b);
2878 					if (!err) {
2879 						btrfs_set_path_blocking(p);
2880 						btrfs_tree_lock(b);
2881 						btrfs_clear_path_blocking(p, b,
2882 								  BTRFS_WRITE_LOCK);
2883 					}
2884 					p->locks[level] = BTRFS_WRITE_LOCK;
2885 				} else {
2886 					err = btrfs_tree_read_lock_atomic(b);
2887 					if (!err) {
2888 						btrfs_set_path_blocking(p);
2889 						btrfs_tree_read_lock(b);
2890 						btrfs_clear_path_blocking(p, b,
2891 								  BTRFS_READ_LOCK);
2892 					}
2893 					p->locks[level] = BTRFS_READ_LOCK;
2894 				}
2895 				p->nodes[level] = b;
2896 			}
2897 		} else {
2898 			p->slots[level] = slot;
2899 			if (ins_len > 0 &&
2900 			    btrfs_leaf_free_space(root, b) < ins_len) {
2901 				if (write_lock_level < 1) {
2902 					write_lock_level = 1;
2903 					btrfs_release_path(p);
2904 					goto again;
2905 				}
2906 
2907 				btrfs_set_path_blocking(p);
2908 				err = split_leaf(trans, root, key,
2909 						 p, ins_len, ret == 0);
2910 				btrfs_clear_path_blocking(p, NULL, 0);
2911 
2912 				BUG_ON(err > 0);
2913 				if (err) {
2914 					ret = err;
2915 					goto done;
2916 				}
2917 			}
2918 			if (!p->search_for_split)
2919 				unlock_up(p, level, lowest_unlock,
2920 					  min_write_lock_level, &write_lock_level);
2921 			goto done;
2922 		}
2923 	}
2924 	ret = 1;
2925 done:
2926 	/*
2927 	 * we don't really know what they plan on doing with the path
2928 	 * from here on, so for now just mark it as blocking
2929 	 */
2930 	if (!p->leave_spinning)
2931 		btrfs_set_path_blocking(p);
2932 	if (ret < 0 && !p->skip_release_on_error)
2933 		btrfs_release_path(p);
2934 	return ret;
2935 }
2936 
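/*
 * Editorial sketch (not part of this file): the usual read-only calling
 * pattern for btrfs_search_slot().  The key values are invented for the
 * illustration and error handling is reduced to the minimum.
 */
static int demo_lookup(struct btrfs_root *root, u64 objectid, u8 type,
		       u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	key.objectid = objectid;
	key.type = type;
	key.offset = offset;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* no transaction, ins_len == 0, cow == 0: a pure lookup */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match at path->nodes[0], path->slots[0] */
	} else if (ret > 0) {
		/* not found; the slot is where the key would be inserted */
	}

	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
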
2937 /*
2938  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2939  * current state of the tree together with the operations recorded in the tree
2940  * modification log to search for the key in a previous version of this tree, as
2941  * denoted by the time_seq parameter.
2942  *
2943  * Naturally, there is no support for insert, delete or cow operations.
2944  *
2945  * The resulting path and return value will be set up as if we called
2946  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2947  */
2948 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2949 			  struct btrfs_path *p, u64 time_seq)
2950 {
2951 	struct extent_buffer *b;
2952 	int slot;
2953 	int ret;
2954 	int err;
2955 	int level;
2956 	int lowest_unlock = 1;
2957 	u8 lowest_level = 0;
2958 	int prev_cmp = -1;
2959 
2960 	lowest_level = p->lowest_level;
2961 	WARN_ON(p->nodes[0] != NULL);
2962 
2963 	if (p->search_commit_root) {
2964 		BUG_ON(time_seq);
2965 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2966 	}
2967 
2968 again:
2969 	b = get_old_root(root, time_seq);
2970 	level = btrfs_header_level(b);
2971 	p->locks[level] = BTRFS_READ_LOCK;
2972 
2973 	while (b) {
2974 		level = btrfs_header_level(b);
2975 		p->nodes[level] = b;
2976 		btrfs_clear_path_blocking(p, NULL, 0);
2977 
2978 		/*
2979 		 * we have a lock on b and as long as we aren't changing
2980 		 * the tree, there is no way for the items in b to change.
2981 		 * It is safe to drop the lock on our parent before we
2982 		 * go through the expensive btree search on b.
2983 		 */
2984 		btrfs_unlock_up_safe(p, level + 1);
2985 
2986 		/*
2987 		 * Since we can unwind eb's we want to do a real search every
2988 		 * time.
2989 		 */
2990 		prev_cmp = -1;
2991 		ret = key_search(b, key, level, &prev_cmp, &slot);
2992 
2993 		if (level != 0) {
2994 			int dec = 0;
2995 			if (ret && slot > 0) {
2996 				dec = 1;
2997 				slot -= 1;
2998 			}
2999 			p->slots[level] = slot;
3000 			unlock_up(p, level, lowest_unlock, 0, NULL);
3001 
3002 			if (level == lowest_level) {
3003 				if (dec)
3004 					p->slots[level]++;
3005 				goto done;
3006 			}
3007 
3008 			err = read_block_for_search(NULL, root, p, &b, level,
3009 						    slot, key, time_seq);
3010 			if (err == -EAGAIN)
3011 				goto again;
3012 			if (err) {
3013 				ret = err;
3014 				goto done;
3015 			}
3016 
3017 			level = btrfs_header_level(b);
3018 			err = btrfs_tree_read_lock_atomic(b);
3019 			if (!err) {
3020 				btrfs_set_path_blocking(p);
3021 				btrfs_tree_read_lock(b);
3022 				btrfs_clear_path_blocking(p, b,
3023 							  BTRFS_READ_LOCK);
3024 			}
3025 			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3026 			if (!b) {
3027 				ret = -ENOMEM;
3028 				goto done;
3029 			}
3030 			p->locks[level] = BTRFS_READ_LOCK;
3031 			p->nodes[level] = b;
3032 		} else {
3033 			p->slots[level] = slot;
3034 			unlock_up(p, level, lowest_unlock, 0, NULL);
3035 			goto done;
3036 		}
3037 	}
3038 	ret = 1;
3039 done:
3040 	if (!p->leave_spinning)
3041 		btrfs_set_path_blocking(p);
3042 	if (ret < 0)
3043 		btrfs_release_path(p);
3044 
3045 	return ret;
3046 }
3047 
3048 /*
3049  * helper to use instead of search slot if no exact match is needed but
3050  * instead the next or previous item should be returned.
3051  * When find_higher is true, the next higher item is returned, the next lower
3052  * otherwise.
3053  * When return_any and find_higher are both true, and no higher item is found,
3054  * return the next lower instead.
3055  * When return_any is true and find_higher is false, and no lower item is found,
3056  * return the next higher instead.
3057  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3058  * < 0 on error
3059  */
3060 int btrfs_search_slot_for_read(struct btrfs_root *root,
3061 			       struct btrfs_key *key, struct btrfs_path *p,
3062 			       int find_higher, int return_any)
3063 {
3064 	int ret;
3065 	struct extent_buffer *leaf;
3066 
3067 again:
3068 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3069 	if (ret <= 0)
3070 		return ret;
3071 	/*
3072 	 * a return value of 1 means the path is at the position where the
3073 	 * item should be inserted. Normally this is the next bigger item,
3074 	 * but in case the previous item is the last in a leaf, path points
3075 	 * to the first free slot in the previous leaf, i.e. at an invalid
3076 	 * item.
3077 	 */
3078 	leaf = p->nodes[0];
3079 
3080 	if (find_higher) {
3081 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3082 			ret = btrfs_next_leaf(root, p);
3083 			if (ret <= 0)
3084 				return ret;
3085 			if (!return_any)
3086 				return 1;
3087 			/*
3088 			 * no higher item found, return the next
3089 			 * lower instead
3090 			 */
3091 			return_any = 0;
3092 			find_higher = 0;
3093 			btrfs_release_path(p);
3094 			goto again;
3095 		}
3096 	} else {
3097 		if (p->slots[0] == 0) {
3098 			ret = btrfs_prev_leaf(root, p);
3099 			if (ret < 0)
3100 				return ret;
3101 			if (!ret) {
3102 				leaf = p->nodes[0];
3103 				if (p->slots[0] == btrfs_header_nritems(leaf))
3104 					p->slots[0]--;
3105 				return 0;
3106 			}
3107 			if (!return_any)
3108 				return 1;
3109 			/*
3110 			 * no lower item found, return the next
3111 			 * higher instead
3112 			 */
3113 			return_any = 0;
3114 			find_higher = 1;
3115 			btrfs_release_path(p);
3116 			goto again;
3117 		} else {
3118 			--p->slots[0];
3119 		}
3120 	}
3121 	return 0;
3122 }
3123 
3124 /*
3125  * adjust the pointers going up the tree, starting at level
3126  * making sure the right key of each node points to 'key'.
3127  * This is used after shifting pointers to the left, so it stops
3128  * fixing up pointers when a given leaf/node is not in slot 0 of the
3129  * higher levels
3130  *
3131  */
3132 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
3133 			   struct btrfs_disk_key *key, int level)
3134 {
3135 	int i;
3136 	struct extent_buffer *t;
3137 
3138 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3139 		int tslot = path->slots[i];
3140 		if (!path->nodes[i])
3141 			break;
3142 		t = path->nodes[i];
3143 		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
3144 		btrfs_set_node_key(t, key, tslot);
3145 		btrfs_mark_buffer_dirty(path->nodes[i]);
3146 		if (tslot != 0)
3147 			break;
3148 	}
3149 }
3150 
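/*
 * Editorial sketch (not part of this file): the propagation rule of
 * fixup_low_keys() on plain arrays.  A changed key is only the parent's
 * problem while the child sits in slot 0, so the walk up the tree stops
 * at the first level whose slot is non-zero.
 */
static void demo_fixup_low_keys(unsigned long long **node_keys,
				const int *slots, int level, int max_level,
				unsigned long long new_key)
{
	int i;

	for (i = level; i < max_level; i++) {
		if (!node_keys[i])
			break;
		node_keys[i][slots[i]] = new_key;	/* rewrite the key */
		if (slots[i] != 0)
			break;	/* not the lowest entry: parents unaffected */
	}
}
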
3151 /*
3152  * update item key.
3153  *
3154  * This function isn't completely safe. It's the caller's responsibility
3155  * that the new key won't break the order
3156  */
3157 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3158 			     struct btrfs_key *new_key)
3159 {
3160 	struct btrfs_disk_key disk_key;
3161 	struct extent_buffer *eb;
3162 	int slot;
3163 
3164 	eb = path->nodes[0];
3165 	slot = path->slots[0];
3166 	if (slot > 0) {
3167 		btrfs_item_key(eb, &disk_key, slot - 1);
3168 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3169 	}
3170 	if (slot < btrfs_header_nritems(eb) - 1) {
3171 		btrfs_item_key(eb, &disk_key, slot + 1);
3172 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3173 	}
3174 
3175 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3176 	btrfs_set_item_key(eb, &disk_key, slot);
3177 	btrfs_mark_buffer_dirty(eb);
3178 	if (slot == 0)
3179 		fixup_low_keys(root, path, &disk_key, 1);
3180 }
3181 
3182 /*
3183  * try to push data from one node into the next node left in the
3184  * tree.
3185  *
3186  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3187  * error, and > 0 if there was no room in the left hand block.
3188  */
3189 static int push_node_left(struct btrfs_trans_handle *trans,
3190 			  struct btrfs_root *root, struct extent_buffer *dst,
3191 			  struct extent_buffer *src, int empty)
3192 {
3193 	int push_items = 0;
3194 	int src_nritems;
3195 	int dst_nritems;
3196 	int ret = 0;
3197 
3198 	src_nritems = btrfs_header_nritems(src);
3199 	dst_nritems = btrfs_header_nritems(dst);
3200 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3201 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3202 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3203 
3204 	if (!empty && src_nritems <= 8)
3205 		return 1;
3206 
3207 	if (push_items <= 0)
3208 		return 1;
3209 
3210 	if (empty) {
3211 		push_items = min(src_nritems, push_items);
3212 		if (push_items < src_nritems) {
3213 			/* leave at least 8 pointers in the node if
3214 			 * we aren't going to empty it
3215 			 */
3216 			if (src_nritems - push_items < 8) {
3217 				if (push_items <= 8)
3218 					return 1;
3219 				push_items -= 8;
3220 			}
3221 		}
3222 	} else
3223 		push_items = min(src_nritems - 8, push_items);
3224 
3225 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3226 				   push_items);
3227 	if (ret) {
3228 		btrfs_abort_transaction(trans, root, ret);
3229 		return ret;
3230 	}
3231 	copy_extent_buffer(dst, src,
3232 			   btrfs_node_key_ptr_offset(dst_nritems),
3233 			   btrfs_node_key_ptr_offset(0),
3234 			   push_items * sizeof(struct btrfs_key_ptr));
3235 
3236 	if (push_items < src_nritems) {
3237 		/*
3238 		 * don't call tree_mod_log_eb_move here, key removal was already
3239 		 * fully logged by tree_mod_log_eb_copy above.
3240 		 */
3241 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3242 				      btrfs_node_key_ptr_offset(push_items),
3243 				      (src_nritems - push_items) *
3244 				      sizeof(struct btrfs_key_ptr));
3245 	}
3246 	btrfs_set_header_nritems(src, src_nritems - push_items);
3247 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3248 	btrfs_mark_buffer_dirty(src);
3249 	btrfs_mark_buffer_dirty(dst);
3250 
3251 	return ret;
3252 }
3253 
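/*
 * Editorial sketch (not part of this file): the pointer movement done by
 * push_node_left() on plain arrays.  The first 'push' entries of src are
 * appended to dst and the rest of src slides down to close the gap; the
 * caller guarantees push <= *src_nr and that dst has room.
 */
static void demo_push_left(unsigned long long *dst, unsigned int *dst_nr,
			   unsigned long long *src, unsigned int *src_nr,
			   unsigned int push)
{
	unsigned int i;

	for (i = 0; i < push; i++)
		dst[*dst_nr + i] = src[i];	/* append to the left node */
	for (i = 0; i < *src_nr - push; i++)
		src[i] = src[i + push];		/* close the gap in src */

	*dst_nr += push;
	*src_nr -= push;
}
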
3254 /*
3255  * try to push data from one node into the next node right in the
3256  * tree.
3257  *
3258  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3259  * error, and > 0 if there was no room in the right hand block.
3260  *
3261  * this will only push up to 1/2 the contents of the left node over
3262  */
3263 static int balance_node_right(struct btrfs_trans_handle *trans,
3264 			      struct btrfs_root *root,
3265 			      struct extent_buffer *dst,
3266 			      struct extent_buffer *src)
3267 {
3268 	int push_items = 0;
3269 	int max_push;
3270 	int src_nritems;
3271 	int dst_nritems;
3272 	int ret = 0;
3273 
3274 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3275 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3276 
3277 	src_nritems = btrfs_header_nritems(src);
3278 	dst_nritems = btrfs_header_nritems(dst);
3279 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3280 	if (push_items <= 0)
3281 		return 1;
3282 
3283 	if (src_nritems < 4)
3284 		return 1;
3285 
3286 	max_push = src_nritems / 2 + 1;
3287 	/* don't try to empty the node */
3288 	if (max_push >= src_nritems)
3289 		return 1;
3290 
3291 	if (max_push < push_items)
3292 		push_items = max_push;
3293 
3294 	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3295 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3296 				      btrfs_node_key_ptr_offset(0),
3297 				      (dst_nritems) *
3298 				      sizeof(struct btrfs_key_ptr));
3299 
3300 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3301 				   src_nritems - push_items, push_items);
3302 	if (ret) {
3303 		btrfs_abort_transaction(trans, root, ret);
3304 		return ret;
3305 	}
3306 	copy_extent_buffer(dst, src,
3307 			   btrfs_node_key_ptr_offset(0),
3308 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3309 			   push_items * sizeof(struct btrfs_key_ptr));
3310 
3311 	btrfs_set_header_nritems(src, src_nritems - push_items);
3312 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3313 
3314 	btrfs_mark_buffer_dirty(src);
3315 	btrfs_mark_buffer_dirty(dst);
3316 
3317 	return ret;
3318 }
3319 
3320 /*
3321  * helper function to insert a new root level in the tree.
3322  * A new node is allocated, and a single item is inserted to
3323  * point to the existing root
3324  *
3325  * returns zero on success or < 0 on failure.
3326  */
3327 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3328 			   struct btrfs_root *root,
3329 			   struct btrfs_path *path, int level)
3330 {
3331 	u64 lower_gen;
3332 	struct extent_buffer *lower;
3333 	struct extent_buffer *c;
3334 	struct extent_buffer *old;
3335 	struct btrfs_disk_key lower_key;
3336 
3337 	BUG_ON(path->nodes[level]);
3338 	BUG_ON(path->nodes[level-1] != root->node);
3339 
3340 	lower = path->nodes[level-1];
3341 	if (level == 1)
3342 		btrfs_item_key(lower, &lower_key, 0);
3343 	else
3344 		btrfs_node_key(lower, &lower_key, 0);
3345 
3346 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3347 				   &lower_key, level, root->node->start, 0);
3348 	if (IS_ERR(c))
3349 		return PTR_ERR(c);
3350 
3351 	root_add_used(root, root->nodesize);
3352 
3353 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3354 	btrfs_set_header_nritems(c, 1);
3355 	btrfs_set_header_level(c, level);
3356 	btrfs_set_header_bytenr(c, c->start);
3357 	btrfs_set_header_generation(c, trans->transid);
3358 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3359 	btrfs_set_header_owner(c, root->root_key.objectid);
3360 
3361 	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3362 			    BTRFS_FSID_SIZE);
3363 
3364 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3365 			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3366 
3367 	btrfs_set_node_key(c, &lower_key, 0);
3368 	btrfs_set_node_blockptr(c, 0, lower->start);
3369 	lower_gen = btrfs_header_generation(lower);
3370 	WARN_ON(lower_gen != trans->transid);
3371 
3372 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3373 
3374 	btrfs_mark_buffer_dirty(c);
3375 
3376 	old = root->node;
3377 	tree_mod_log_set_root_pointer(root, c, 0);
3378 	rcu_assign_pointer(root->node, c);
3379 
3380 	/* the super has an extra ref to root->node */
3381 	free_extent_buffer(old);
3382 
3383 	add_root_to_dirty_list(root);
3384 	extent_buffer_get(c);
3385 	path->nodes[level] = c;
3386 	path->locks[level] = BTRFS_WRITE_LOCK;
3387 	path->slots[level] = 0;
3388 	return 0;
3389 }
3390 
3391 /*
3392  * worker function to insert a single pointer in a node.
3393  * the node should have enough room for the pointer already
3394  *
3395  * slot and level indicate where you want the key to go, and
3396  * blocknr is the block the key points to.
3397  */
3398 static void insert_ptr(struct btrfs_trans_handle *trans,
3399 		       struct btrfs_root *root, struct btrfs_path *path,
3400 		       struct btrfs_disk_key *key, u64 bytenr,
3401 		       int slot, int level)
3402 {
3403 	struct extent_buffer *lower;
3404 	int nritems;
3405 	int ret;
3406 
3407 	BUG_ON(!path->nodes[level]);
3408 	btrfs_assert_tree_locked(path->nodes[level]);
3409 	lower = path->nodes[level];
3410 	nritems = btrfs_header_nritems(lower);
3411 	BUG_ON(slot > nritems);
3412 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3413 	if (slot != nritems) {
3414 		if (level)
3415 			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3416 					     slot, nritems - slot);
3417 		memmove_extent_buffer(lower,
3418 			      btrfs_node_key_ptr_offset(slot + 1),
3419 			      btrfs_node_key_ptr_offset(slot),
3420 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3421 	}
3422 	if (level) {
3423 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3424 					      MOD_LOG_KEY_ADD, GFP_NOFS);
3425 		BUG_ON(ret < 0);
3426 	}
3427 	btrfs_set_node_key(lower, key, slot);
3428 	btrfs_set_node_blockptr(lower, slot, bytenr);
3429 	WARN_ON(trans->transid == 0);
3430 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3431 	btrfs_set_header_nritems(lower, nritems + 1);
3432 	btrfs_mark_buffer_dirty(lower);
3433 }
3434 
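/*
 * Editorial sketch (not part of this file): the slot insertion performed
 * by insert_ptr() on a plain array.  Entries at and after 'slot' shift up
 * one place to make room; as in insert_ptr(), the caller guarantees the
 * array is not already full.
 */
static void demo_insert_at(unsigned long long *keys, unsigned int *nritems,
			   unsigned int slot, unsigned long long key)
{
	unsigned int i;

	for (i = *nritems; i > slot; i--)
		keys[i] = keys[i - 1];	/* shift the tail up one slot */
	keys[slot] = key;
	(*nritems)++;
}
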
3435 /*
3436  * split the node at the specified level in path in two.
3437  * The path is corrected to point to the appropriate node after the split
3438  *
3439  * Before splitting this tries to make some room in the node by pushing
3440  * left and right, if either one works, it returns right away.
3441  *
3442  * returns 0 on success and < 0 on failure
3443  */
3444 static noinline int split_node(struct btrfs_trans_handle *trans,
3445 			       struct btrfs_root *root,
3446 			       struct btrfs_path *path, int level)
3447 {
3448 	struct extent_buffer *c;
3449 	struct extent_buffer *split;
3450 	struct btrfs_disk_key disk_key;
3451 	int mid;
3452 	int ret;
3453 	u32 c_nritems;
3454 
3455 	c = path->nodes[level];
3456 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3457 	if (c == root->node) {
3458 		/*
3459 		 * trying to split the root, let's make a new one
3460 		 *
3461 		 * tree mod log: We don't log the removal of the old root in
3462 		 * insert_new_root, because that root buffer will be kept as a
3463 		 * normal node. We are going to log removal of half of the
3464 		 * elements below with tree_mod_log_eb_copy. We're holding a
3465 		 * tree lock on the buffer, which is why we cannot race with
3466 		 * other tree_mod_log users.
3467 		 */
3468 		ret = insert_new_root(trans, root, path, level + 1);
3469 		if (ret)
3470 			return ret;
3471 	} else {
3472 		ret = push_nodes_for_insert(trans, root, path, level);
3473 		c = path->nodes[level];
3474 		if (!ret && btrfs_header_nritems(c) <
3475 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3476 			return 0;
3477 		if (ret < 0)
3478 			return ret;
3479 	}
3480 
3481 	c_nritems = btrfs_header_nritems(c);
3482 	mid = (c_nritems + 1) / 2;
3483 	btrfs_node_key(c, &disk_key, mid);
3484 
3485 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3486 			&disk_key, level, c->start, 0);
3487 	if (IS_ERR(split))
3488 		return PTR_ERR(split);
3489 
3490 	root_add_used(root, root->nodesize);
3491 
3492 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3493 	btrfs_set_header_level(split, btrfs_header_level(c));
3494 	btrfs_set_header_bytenr(split, split->start);
3495 	btrfs_set_header_generation(split, trans->transid);
3496 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3497 	btrfs_set_header_owner(split, root->root_key.objectid);
3498 	write_extent_buffer(split, root->fs_info->fsid,
3499 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
3500 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3501 			    btrfs_header_chunk_tree_uuid(split),
3502 			    BTRFS_UUID_SIZE);
3503 
3504 	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3505 				   mid, c_nritems - mid);
3506 	if (ret) {
3507 		btrfs_abort_transaction(trans, root, ret);
3508 		return ret;
3509 	}
3510 	copy_extent_buffer(split, c,
3511 			   btrfs_node_key_ptr_offset(0),
3512 			   btrfs_node_key_ptr_offset(mid),
3513 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3514 	btrfs_set_header_nritems(split, c_nritems - mid);
3515 	btrfs_set_header_nritems(c, mid);
3516 	ret = 0;
3517 
3518 	btrfs_mark_buffer_dirty(c);
3519 	btrfs_mark_buffer_dirty(split);
3520 
3521 	insert_ptr(trans, root, path, &disk_key, split->start,
3522 		   path->slots[level + 1] + 1, level + 1);
3523 
3524 	if (path->slots[level] >= mid) {
3525 		path->slots[level] -= mid;
3526 		btrfs_tree_unlock(c);
3527 		free_extent_buffer(c);
3528 		path->nodes[level] = split;
3529 		path->slots[level + 1] += 1;
3530 	} else {
3531 		btrfs_tree_unlock(split);
3532 		free_extent_buffer(split);
3533 	}
3534 	return ret;
3535 }
3536 
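/*
 * Editorial sketch (not part of this file): the arithmetic of split_node()
 * on plain arrays.  The upper half of a full node moves to an empty
 * sibling, and the returned value (the sibling's first key) is what the
 * parent records as the separator via insert_ptr().
 */
static unsigned long long demo_split(unsigned long long *node,
				     unsigned int *nritems,
				     unsigned long long *sibling,
				     unsigned int *sib_nritems)
{
	unsigned int mid = (*nritems + 1) / 2;
	unsigned int i;

	for (i = mid; i < *nritems; i++)
		sibling[i - mid] = node[i];	/* copy the upper half out */

	*sib_nritems = *nritems - mid;
	*nritems = mid;

	return sibling[0];			/* separator for the parent */
}
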
3537 /*
3538  * how many bytes are required to store the items in a leaf.  start
3539  * and nr indicate which items in the leaf to check.  This totals up the
3540  * space used both by the item structs and the item data
3541  */
3542 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3543 {
3544 	struct btrfs_item *start_item;
3545 	struct btrfs_item *end_item;
3546 	struct btrfs_map_token token;
3547 	int data_len;
3548 	int nritems = btrfs_header_nritems(l);
3549 	int end = min(nritems, start + nr) - 1;
3550 
3551 	if (!nr)
3552 		return 0;
3553 	btrfs_init_map_token(&token);
3554 	start_item = btrfs_item_nr(start);
3555 	end_item = btrfs_item_nr(end);
3556 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3557 		btrfs_token_item_size(l, start_item, &token);
3558 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3559 	data_len += sizeof(struct btrfs_item) * nr;
3560 	WARN_ON(data_len < 0);
3561 	return data_len;
3562 }
3563 
3564 /*
3565  * The space between the end of the leaf items and
3566  * the start of the leaf data.  IOW, how much room
3567  * the leaf has left for both items and data
3568  */
3569 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3570 				   struct extent_buffer *leaf)
3571 {
3572 	int nritems = btrfs_header_nritems(leaf);
3573 	int ret;
3574 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3575 	if (ret < 0) {
3576 		btrfs_crit(root->fs_info,
3577 			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3578 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3579 		       leaf_space_used(leaf, 0, nritems), nritems);
3580 	}
3581 	return ret;
3582 }
3583 
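/*
 * Editorial sketch (not part of this file): the layout btrfs_leaf_free_space()
 * measures.  Item headers grow forward from the start of a leaf's data area
 * while item payloads grow backward from its end, so the free space is
 * whatever remains between the two fronts.  Names are hypothetical.
 */
static int demo_leaf_free_space(unsigned int data_size, unsigned int nritems,
				unsigned int item_hdr_size,
				unsigned int payload_bytes)
{
	unsigned int used = nritems * item_hdr_size + payload_bytes;

	/* a negative result would indicate a corrupted leaf */
	return (int)data_size - (int)used;
}
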
3584 /*
3585  * min slot controls the lowest index we're willing to push to the
3586  * right.  We'll push up to and including min_slot, but no lower
3587  */
3588 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3589 				      struct btrfs_root *root,
3590 				      struct btrfs_path *path,
3591 				      int data_size, int empty,
3592 				      struct extent_buffer *right,
3593 				      int free_space, u32 left_nritems,
3594 				      u32 min_slot)
3595 {
3596 	struct extent_buffer *left = path->nodes[0];
3597 	struct extent_buffer *upper = path->nodes[1];
3598 	struct btrfs_map_token token;
3599 	struct btrfs_disk_key disk_key;
3600 	int slot;
3601 	u32 i;
3602 	int push_space = 0;
3603 	int push_items = 0;
3604 	struct btrfs_item *item;
3605 	u32 nr;
3606 	u32 right_nritems;
3607 	u32 data_end;
3608 	u32 this_item_size;
3609 
3610 	btrfs_init_map_token(&token);
3611 
3612 	if (empty)
3613 		nr = 0;
3614 	else
3615 		nr = max_t(u32, 1, min_slot);
3616 
3617 	if (path->slots[0] >= left_nritems)
3618 		push_space += data_size;
3619 
3620 	slot = path->slots[1];
3621 	i = left_nritems - 1;
3622 	while (i >= nr) {
3623 		item = btrfs_item_nr(i);
3624 
3625 		if (!empty && push_items > 0) {
3626 			if (path->slots[0] > i)
3627 				break;
3628 			if (path->slots[0] == i) {
3629 				int space = btrfs_leaf_free_space(root, left);
3630 				if (space + push_space * 2 > free_space)
3631 					break;
3632 			}
3633 		}
3634 
3635 		if (path->slots[0] == i)
3636 			push_space += data_size;
3637 
3638 		this_item_size = btrfs_item_size(left, item);
3639 		if (this_item_size + sizeof(*item) + push_space > free_space)
3640 			break;
3641 
3642 		push_items++;
3643 		push_space += this_item_size + sizeof(*item);
3644 		if (i == 0)
3645 			break;
3646 		i--;
3647 	}
3648 
3649 	if (push_items == 0)
3650 		goto out_unlock;
3651 
3652 	WARN_ON(!empty && push_items == left_nritems);
3653 
3654 	/* push left to right */
3655 	right_nritems = btrfs_header_nritems(right);
3656 
3657 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3658 	push_space -= leaf_data_end(root, left);
3659 
3660 	/* make room in the right data area */
3661 	data_end = leaf_data_end(root, right);
3662 	memmove_extent_buffer(right,
3663 			      btrfs_leaf_data(right) + data_end - push_space,
3664 			      btrfs_leaf_data(right) + data_end,
3665 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
3666 
3667 	/* copy from the left data area */
3668 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3669 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3670 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
3671 		     push_space);
3672 
3673 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3674 			      btrfs_item_nr_offset(0),
3675 			      right_nritems * sizeof(struct btrfs_item));
3676 
3677 	/* copy the items from left to right */
3678 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3679 		   btrfs_item_nr_offset(left_nritems - push_items),
3680 		   push_items * sizeof(struct btrfs_item));
3681 
3682 	/* update the item pointers */
3683 	right_nritems += push_items;
3684 	btrfs_set_header_nritems(right, right_nritems);
3685 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3686 	for (i = 0; i < right_nritems; i++) {
3687 		item = btrfs_item_nr(i);
3688 		push_space -= btrfs_token_item_size(right, item, &token);
3689 		btrfs_set_token_item_offset(right, item, push_space, &token);
3690 	}
3691 
3692 	left_nritems -= push_items;
3693 	btrfs_set_header_nritems(left, left_nritems);
3694 
3695 	if (left_nritems)
3696 		btrfs_mark_buffer_dirty(left);
3697 	else
3698 		clean_tree_block(trans, root, left);
3699 
3700 	btrfs_mark_buffer_dirty(right);
3701 
3702 	btrfs_item_key(right, &disk_key, 0);
3703 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3704 	btrfs_mark_buffer_dirty(upper);
3705 
3706 	/* then fixup the leaf pointer in the path */
3707 	if (path->slots[0] >= left_nritems) {
3708 		path->slots[0] -= left_nritems;
3709 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3710 			clean_tree_block(trans, root, path->nodes[0]);
3711 		btrfs_tree_unlock(path->nodes[0]);
3712 		free_extent_buffer(path->nodes[0]);
3713 		path->nodes[0] = right;
3714 		path->slots[1] += 1;
3715 	} else {
3716 		btrfs_tree_unlock(right);
3717 		free_extent_buffer(right);
3718 	}
3719 	return 0;
3720 
3721 out_unlock:
3722 	btrfs_tree_unlock(right);
3723 	free_extent_buffer(right);
3724 	return 1;
3725 }
3726 
3727 /*
3728  * push some data in the path leaf to the right, trying to free up at
3729  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3730  *
3731  * returns 1 if the push failed because the other node didn't have enough
3732  * room, 0 if everything worked out and < 0 if there were major errors.
3733  *
3734  * this will push starting from min_slot to the end of the leaf.  It won't
3735  * push any slot lower than min_slot
3736  */
3737 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3738 			   *root, struct btrfs_path *path,
3739 			   int min_data_size, int data_size,
3740 			   int empty, u32 min_slot)
3741 {
3742 	struct extent_buffer *left = path->nodes[0];
3743 	struct extent_buffer *right;
3744 	struct extent_buffer *upper;
3745 	int slot;
3746 	int free_space;
3747 	u32 left_nritems;
3748 	int ret;
3749 
3750 	if (!path->nodes[1])
3751 		return 1;
3752 
3753 	slot = path->slots[1];
3754 	upper = path->nodes[1];
3755 	if (slot >= btrfs_header_nritems(upper) - 1)
3756 		return 1;
3757 
3758 	btrfs_assert_tree_locked(path->nodes[1]);
3759 
3760 	right = read_node_slot(root, upper, slot + 1);
3761 	if (right == NULL)
3762 		return 1;
3763 
3764 	btrfs_tree_lock(right);
3765 	btrfs_set_lock_blocking(right);
3766 
3767 	free_space = btrfs_leaf_free_space(root, right);
3768 	if (free_space < data_size)
3769 		goto out_unlock;
3770 
3771 	/* cow and double check */
3772 	ret = btrfs_cow_block(trans, root, right, upper,
3773 			      slot + 1, &right);
3774 	if (ret)
3775 		goto out_unlock;
3776 
3777 	free_space = btrfs_leaf_free_space(root, right);
3778 	if (free_space < data_size)
3779 		goto out_unlock;
3780 
3781 	left_nritems = btrfs_header_nritems(left);
3782 	if (left_nritems == 0)
3783 		goto out_unlock;
3784 
3785 	if (path->slots[0] == left_nritems && !empty) {
3786 		/* Key greater than all keys in the leaf, right neighbor has
3787 		 * enough room for it and we're not emptying our leaf to delete
3788 		 * it, therefore use right neighbor to insert the new item and
3789 		 * no need to touch/dirty our left leaf. */
3790 		btrfs_tree_unlock(left);
3791 		free_extent_buffer(left);
3792 		path->nodes[0] = right;
3793 		path->slots[0] = 0;
3794 		path->slots[1]++;
3795 		return 0;
3796 	}
3797 
3798 	return __push_leaf_right(trans, root, path, min_data_size, empty,
3799 				right, free_space, left_nritems, min_slot);
3800 out_unlock:
3801 	btrfs_tree_unlock(right);
3802 	free_extent_buffer(right);
3803 	return 1;
3804 }
3805 
3806 /*
3807  * push some data in the path leaf to the left, trying to free up at
3808  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3809  *
3810  * max_slot can put a limit on how far into the leaf we'll push items.  The
3811  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3812  * items
3813  */
3814 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3815 				     struct btrfs_root *root,
3816 				     struct btrfs_path *path, int data_size,
3817 				     int empty, struct extent_buffer *left,
3818 				     int free_space, u32 right_nritems,
3819 				     u32 max_slot)
3820 {
3821 	struct btrfs_disk_key disk_key;
3822 	struct extent_buffer *right = path->nodes[0];
3823 	int i;
3824 	int push_space = 0;
3825 	int push_items = 0;
3826 	struct btrfs_item *item;
3827 	u32 old_left_nritems;
3828 	u32 nr;
3829 	int ret = 0;
3830 	u32 this_item_size;
3831 	u32 old_left_item_size;
3832 	struct btrfs_map_token token;
3833 
3834 	btrfs_init_map_token(&token);
3835 
3836 	if (empty)
3837 		nr = min(right_nritems, max_slot);
3838 	else
3839 		nr = min(right_nritems - 1, max_slot);
3840 
3841 	for (i = 0; i < nr; i++) {
3842 		item = btrfs_item_nr(i);
3843 
3844 		if (!empty && push_items > 0) {
3845 			if (path->slots[0] < i)
3846 				break;
3847 			if (path->slots[0] == i) {
3848 				int space = btrfs_leaf_free_space(root, right);
3849 				if (space + push_space * 2 > free_space)
3850 					break;
3851 			}
3852 		}
3853 
3854 		if (path->slots[0] == i)
3855 			push_space += data_size;
3856 
3857 		this_item_size = btrfs_item_size(right, item);
3858 		if (this_item_size + sizeof(*item) + push_space > free_space)
3859 			break;
3860 
3861 		push_items++;
3862 		push_space += this_item_size + sizeof(*item);
3863 	}
3864 
3865 	if (push_items == 0) {
3866 		ret = 1;
3867 		goto out;
3868 	}
3869 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3870 
3871 	/* push data from right to left */
3872 	copy_extent_buffer(left, right,
3873 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3874 			   btrfs_item_nr_offset(0),
3875 			   push_items * sizeof(struct btrfs_item));
3876 
3877 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
3878 		     btrfs_item_offset_nr(right, push_items - 1);
3879 
3880 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3881 		     leaf_data_end(root, left) - push_space,
3882 		     btrfs_leaf_data(right) +
3883 		     btrfs_item_offset_nr(right, push_items - 1),
3884 		     push_space);
3885 	old_left_nritems = btrfs_header_nritems(left);
3886 	BUG_ON(old_left_nritems <= 0);
3887 
3888 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3889 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3890 		u32 ioff;
3891 
3892 		item = btrfs_item_nr(i);
3893 
3894 		ioff = btrfs_token_item_offset(left, item, &token);
3895 		btrfs_set_token_item_offset(left, item,
3896 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3897 		      &token);
3898 	}
3899 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3900 
3901 	/* fixup right node */
3902 	if (push_items > right_nritems)
3903 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3904 		       right_nritems);
3905 
3906 	if (push_items < right_nritems) {
3907 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3908 						  leaf_data_end(root, right);
3909 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
3910 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3911 				      btrfs_leaf_data(right) +
3912 				      leaf_data_end(root, right), push_space);
3913 
3914 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3915 			      btrfs_item_nr_offset(push_items),
3916 			     (btrfs_header_nritems(right) - push_items) *
3917 			     sizeof(struct btrfs_item));
3918 	}
3919 	right_nritems -= push_items;
3920 	btrfs_set_header_nritems(right, right_nritems);
3921 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3922 	for (i = 0; i < right_nritems; i++) {
3923 		item = btrfs_item_nr(i);
3924 
3925 		push_space = push_space - btrfs_token_item_size(right,
3926 								item, &token);
3927 		btrfs_set_token_item_offset(right, item, push_space, &token);
3928 	}
3929 
3930 	btrfs_mark_buffer_dirty(left);
3931 	if (right_nritems)
3932 		btrfs_mark_buffer_dirty(right);
3933 	else
3934 		clean_tree_block(trans, root, right);
3935 
3936 	btrfs_item_key(right, &disk_key, 0);
3937 	fixup_low_keys(root, path, &disk_key, 1);
3938 
3939 	/* then fixup the leaf pointer in the path */
3940 	if (path->slots[0] < push_items) {
3941 		path->slots[0] += old_left_nritems;
3942 		btrfs_tree_unlock(path->nodes[0]);
3943 		free_extent_buffer(path->nodes[0]);
3944 		path->nodes[0] = left;
3945 		path->slots[1] -= 1;
3946 	} else {
3947 		btrfs_tree_unlock(left);
3948 		free_extent_buffer(left);
3949 		path->slots[0] -= push_items;
3950 	}
3951 	BUG_ON(path->slots[0] < 0);
3952 	return ret;
3953 out:
3954 	btrfs_tree_unlock(left);
3955 	free_extent_buffer(left);
3956 	return ret;
3957 }
3958 
3959 /*
3960  * push some data in the path leaf to the left, trying to free up at
3961  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3962  *
3963  * max_slot can put a limit on how far into the leaf we'll push items.  The
3964  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3965  * items
3966  */
3967 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3968 			  *root, struct btrfs_path *path, int min_data_size,
3969 			  int data_size, int empty, u32 max_slot)
3970 {
3971 	struct extent_buffer *right = path->nodes[0];
3972 	struct extent_buffer *left;
3973 	int slot;
3974 	int free_space;
3975 	u32 right_nritems;
3976 	int ret = 0;
3977 
3978 	slot = path->slots[1];
3979 	if (slot == 0)
3980 		return 1;
3981 	if (!path->nodes[1])
3982 		return 1;
3983 
3984 	right_nritems = btrfs_header_nritems(right);
3985 	if (right_nritems == 0)
3986 		return 1;
3987 
3988 	btrfs_assert_tree_locked(path->nodes[1]);
3989 
3990 	left = read_node_slot(root, path->nodes[1], slot - 1);
3991 	if (left == NULL)
3992 		return 1;
3993 
3994 	btrfs_tree_lock(left);
3995 	btrfs_set_lock_blocking(left);
3996 
3997 	free_space = btrfs_leaf_free_space(root, left);
3998 	if (free_space < data_size) {
3999 		ret = 1;
4000 		goto out;
4001 	}
4002 
4003 	/* cow and double check */
4004 	ret = btrfs_cow_block(trans, root, left,
4005 			      path->nodes[1], slot - 1, &left);
4006 	if (ret) {
4007 		/* we hit -ENOSPC, but it isn't fatal here */
4008 		if (ret == -ENOSPC)
4009 			ret = 1;
4010 		goto out;
4011 	}
4012 
4013 	free_space = btrfs_leaf_free_space(root, left);
4014 	if (free_space < data_size) {
4015 		ret = 1;
4016 		goto out;
4017 	}
4018 
4019 	return __push_leaf_left(trans, root, path, min_data_size,
4020 			       empty, left, free_space, right_nritems,
4021 			       max_slot);
4022 out:
4023 	btrfs_tree_unlock(left);
4024 	free_extent_buffer(left);
4025 	return ret;
4026 }
4027 
4028 /*
4029  * copy the items at and after 'mid' from leaf 'l' into the new leaf
4030  * 'right', link 'right' into the parent and fix up the path.
4031  */
4032 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4033 				    struct btrfs_root *root,
4034 				    struct btrfs_path *path,
4035 				    struct extent_buffer *l,
4036 				    struct extent_buffer *right,
4037 				    int slot, int mid, int nritems)
4038 {
4039 	int data_copy_size;
4040 	int rt_data_off;
4041 	int i;
4042 	struct btrfs_disk_key disk_key;
4043 	struct btrfs_map_token token;
4044 
4045 	btrfs_init_map_token(&token);
4046 
4047 	nritems = nritems - mid;
4048 	btrfs_set_header_nritems(right, nritems);
4049 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4050 
4051 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4052 			   btrfs_item_nr_offset(mid),
4053 			   nritems * sizeof(struct btrfs_item));
4054 
4055 	copy_extent_buffer(right, l,
4056 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4057 		     data_copy_size, btrfs_leaf_data(l) +
4058 		     leaf_data_end(root, l), data_copy_size);
4059 
4060 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4061 		      btrfs_item_end_nr(l, mid);
4062 
4063 	for (i = 0; i < nritems; i++) {
4064 		struct btrfs_item *item = btrfs_item_nr(i);
4065 		u32 ioff;
4066 
4067 		ioff = btrfs_token_item_offset(right, item, &token);
4068 		btrfs_set_token_item_offset(right, item,
4069 					    ioff + rt_data_off, &token);
4070 	}
4071 
4072 	btrfs_set_header_nritems(l, mid);
4073 	btrfs_item_key(right, &disk_key, 0);
4074 	insert_ptr(trans, root, path, &disk_key, right->start,
4075 		   path->slots[1] + 1, 1);
4076 
4077 	btrfs_mark_buffer_dirty(right);
4078 	btrfs_mark_buffer_dirty(l);
4079 	BUG_ON(path->slots[0] != slot);
4080 
4081 	if (mid <= slot) {
4082 		btrfs_tree_unlock(path->nodes[0]);
4083 		free_extent_buffer(path->nodes[0]);
4084 		path->nodes[0] = right;
4085 		path->slots[0] -= mid;
4086 		path->slots[1] += 1;
4087 	} else {
4088 		btrfs_tree_unlock(right);
4089 		free_extent_buffer(right);
4090 	}
4091 
4092 	BUG_ON(path->slots[0] < 0);
4093 }
4094 
4095 /*
4096  * double splits happen when we need to insert a big item in the middle
4097  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4098  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4099  *          A                 B                 C
4100  *
4101  * We avoid this by trying to push the items on either side of our target
4102  * into the adjacent leaves.  If all goes well we can avoid the double split
4103  * completely.
4104  */
4105 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4106 					  struct btrfs_root *root,
4107 					  struct btrfs_path *path,
4108 					  int data_size)
4109 {
4110 	int ret;
4111 	int progress = 0;
4112 	int slot;
4113 	u32 nritems;
4114 	int space_needed = data_size;
4115 
4116 	slot = path->slots[0];
4117 	if (slot < btrfs_header_nritems(path->nodes[0]))
4118 		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4119 
4120 	/*
4121 	 * try to push all the items after our slot into the
4122 	 * right leaf
4123 	 */
4124 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4125 	if (ret < 0)
4126 		return ret;
4127 
4128 	if (ret == 0)
4129 		progress++;
4130 
4131 	nritems = btrfs_header_nritems(path->nodes[0]);
4132 	/*
4133 	 * our goal is to get our slot at the start or end of a leaf.  If
4134 	 * we've done so we're done
4135 	 * we've done so, we're done
4136 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4137 		return 0;
4138 
4139 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4140 		return 0;
4141 
4142 	/* try to push all the items before our slot into the left leaf */
4143 	slot = path->slots[0];
4144 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4145 	if (ret < 0)
4146 		return ret;
4147 
4148 	if (ret == 0)
4149 		progress++;
4150 
4151 	if (progress)
4152 		return 0;
4153 	return 1;
4154 }
4155 
4156 /*
4157  * split the path's leaf in two, making sure there is at least data_size
4158  * available for the resulting leaf level of the path.
4159  *
4160  * returns 0 if all went well and < 0 on failure.
4161  */
4162 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4163 			       struct btrfs_root *root,
4164 			       struct btrfs_key *ins_key,
4165 			       struct btrfs_path *path, int data_size,
4166 			       int extend)
4167 {
4168 	struct btrfs_disk_key disk_key;
4169 	struct extent_buffer *l;
4170 	u32 nritems;
4171 	int mid;
4172 	int slot;
4173 	struct extent_buffer *right;
4174 	int ret = 0;
4175 	int wret;
4176 	int split;
4177 	int num_doubles = 0;
4178 	int tried_avoid_double = 0;
4179 
4180 	l = path->nodes[0];
4181 	slot = path->slots[0];
4182 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4183 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4184 		return -EOVERFLOW;
4185 
4186 	/* first try to make some room by pushing left and right */
4187 	if (data_size && path->nodes[1]) {
4188 		int space_needed = data_size;
4189 
4190 		if (slot < btrfs_header_nritems(l))
4191 			space_needed -= btrfs_leaf_free_space(root, l);
4192 
4193 		wret = push_leaf_right(trans, root, path, space_needed,
4194 				       space_needed, 0, 0);
4195 		if (wret < 0)
4196 			return wret;
4197 		if (wret) {
4198 			wret = push_leaf_left(trans, root, path, space_needed,
4199 					      space_needed, 0, (u32)-1);
4200 			if (wret < 0)
4201 				return wret;
4202 		}
4203 		l = path->nodes[0];
4204 
4205 		/* did the pushes work? */
4206 		if (btrfs_leaf_free_space(root, l) >= data_size)
4207 			return 0;
4208 	}
4209 
4210 	if (!path->nodes[1]) {
4211 		ret = insert_new_root(trans, root, path, 1);
4212 		if (ret)
4213 			return ret;
4214 	}
4215 again:
4216 	split = 1;
4217 	l = path->nodes[0];
4218 	slot = path->slots[0];
4219 	nritems = btrfs_header_nritems(l);
4220 	mid = (nritems + 1) / 2;
4221 
4222 	if (mid <= slot) {
4223 		if (nritems == 1 ||
4224 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4225 			BTRFS_LEAF_DATA_SIZE(root)) {
4226 			if (slot >= nritems) {
4227 				split = 0;
4228 			} else {
4229 				mid = slot;
4230 				if (mid != nritems &&
4231 				    leaf_space_used(l, mid, nritems - mid) +
4232 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4233 					if (data_size && !tried_avoid_double)
4234 						goto push_for_double;
4235 					split = 2;
4236 				}
4237 			}
4238 		}
4239 	} else {
4240 		if (leaf_space_used(l, 0, mid) + data_size >
4241 			BTRFS_LEAF_DATA_SIZE(root)) {
4242 			if (!extend && data_size && slot == 0) {
4243 				split = 0;
4244 			} else if ((extend || !data_size) && slot == 0) {
4245 				mid = 1;
4246 			} else {
4247 				mid = slot;
4248 				if (mid != nritems &&
4249 				    leaf_space_used(l, mid, nritems - mid) +
4250 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4251 					if (data_size && !tried_avoid_double)
4252 						goto push_for_double;
4253 					split = 2;
4254 				}
4255 			}
4256 		}
4257 	}
4258 
4259 	if (split == 0)
4260 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4261 	else
4262 		btrfs_item_key(l, &disk_key, mid);
4263 
4264 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4265 			&disk_key, 0, l->start, 0);
4266 	if (IS_ERR(right))
4267 		return PTR_ERR(right);
4268 
4269 	root_add_used(root, root->nodesize);
4270 
4271 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4272 	btrfs_set_header_bytenr(right, right->start);
4273 	btrfs_set_header_generation(right, trans->transid);
4274 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4275 	btrfs_set_header_owner(right, root->root_key.objectid);
4276 	btrfs_set_header_level(right, 0);
4277 	write_extent_buffer(right, root->fs_info->fsid,
4278 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
4279 
4280 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4281 			    btrfs_header_chunk_tree_uuid(right),
4282 			    BTRFS_UUID_SIZE);
4283 
4284 	if (split == 0) {
4285 		if (mid <= slot) {
4286 			btrfs_set_header_nritems(right, 0);
4287 			insert_ptr(trans, root, path, &disk_key, right->start,
4288 				   path->slots[1] + 1, 1);
4289 			btrfs_tree_unlock(path->nodes[0]);
4290 			free_extent_buffer(path->nodes[0]);
4291 			path->nodes[0] = right;
4292 			path->slots[0] = 0;
4293 			path->slots[1] += 1;
4294 		} else {
4295 			btrfs_set_header_nritems(right, 0);
4296 			insert_ptr(trans, root, path, &disk_key, right->start,
4297 					  path->slots[1], 1);
4298 			btrfs_tree_unlock(path->nodes[0]);
4299 			free_extent_buffer(path->nodes[0]);
4300 			path->nodes[0] = right;
4301 			path->slots[0] = 0;
4302 			if (path->slots[1] == 0)
4303 				fixup_low_keys(root, path, &disk_key, 1);
4304 		}
4305 		btrfs_mark_buffer_dirty(right);
4306 		return ret;
4307 	}
4308 
4309 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4310 
4311 	if (split == 2) {
4312 		BUG_ON(num_doubles != 0);
4313 		num_doubles++;
4314 		goto again;
4315 	}
4316 
4317 	return 0;
4318 
4319 push_for_double:
4320 	push_for_double_split(trans, root, path, data_size);
4321 	tried_avoid_double = 1;
4322 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4323 		return 0;
4324 	goto again;
4325 }
4326 
4327 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4328 					 struct btrfs_root *root,
4329 					 struct btrfs_path *path, int ins_len)
4330 {
4331 	struct btrfs_key key;
4332 	struct extent_buffer *leaf;
4333 	struct btrfs_file_extent_item *fi;
4334 	u64 extent_len = 0;
4335 	u32 item_size;
4336 	int ret;
4337 
4338 	leaf = path->nodes[0];
4339 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4340 
4341 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4342 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4343 
4344 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4345 		return 0;
4346 
4347 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4348 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4349 		fi = btrfs_item_ptr(leaf, path->slots[0],
4350 				    struct btrfs_file_extent_item);
4351 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4352 	}
4353 	btrfs_release_path(path);
4354 
4355 	path->keep_locks = 1;
4356 	path->search_for_split = 1;
4357 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4358 	path->search_for_split = 0;
4359 	if (ret < 0)
4360 		goto err;
4361 
4362 	leaf = path->nodes[0];
4363 	/* if our item isn't there or got smaller, return now */
4364 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) {
4365 		ret = -EAGAIN;
4366 		goto err;
4367 	}
4368 	ret = -EAGAIN;
4369 	/* the leaf has changed, it now has room.  return now */
4370 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4371 		goto err;
4372 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4373 		fi = btrfs_item_ptr(leaf, path->slots[0],
4374 				    struct btrfs_file_extent_item);
4375 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4376 			goto err;
4377 	}
4378 
4379 	btrfs_set_path_blocking(path);
4380 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4381 	if (ret)
4382 		goto err;
4383 
4384 	path->keep_locks = 0;
4385 	btrfs_unlock_up_safe(path, 1);
4386 	return 0;
4387 err:
4388 	path->keep_locks = 0;
4389 	return ret;
4390 }
4391 
4392 static noinline int split_item(struct btrfs_trans_handle *trans,
4393 			       struct btrfs_root *root,
4394 			       struct btrfs_path *path,
4395 			       struct btrfs_key *new_key,
4396 			       unsigned long split_offset)
4397 {
4398 	struct extent_buffer *leaf;
4399 	struct btrfs_item *item;
4400 	struct btrfs_item *new_item;
4401 	int slot;
4402 	char *buf;
4403 	u32 nritems;
4404 	u32 item_size;
4405 	u32 orig_offset;
4406 	struct btrfs_disk_key disk_key;
4407 
4408 	leaf = path->nodes[0];
4409 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4410 
4411 	btrfs_set_path_blocking(path);
4412 
4413 	item = btrfs_item_nr(path->slots[0]);
4414 	orig_offset = btrfs_item_offset(leaf, item);
4415 	item_size = btrfs_item_size(leaf, item);
4416 
4417 	buf = kmalloc(item_size, GFP_NOFS);
4418 	if (!buf)
4419 		return -ENOMEM;
4420 
4421 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4422 			    path->slots[0]), item_size);
4423 
4424 	slot = path->slots[0] + 1;
4425 	nritems = btrfs_header_nritems(leaf);
4426 	if (slot != nritems) {
4427 		/* shift the items */
4428 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4429 				btrfs_item_nr_offset(slot),
4430 				(nritems - slot) * sizeof(struct btrfs_item));
4431 	}
4432 
4433 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4434 	btrfs_set_item_key(leaf, &disk_key, slot);
4435 
4436 	new_item = btrfs_item_nr(slot);
4437 
4438 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4439 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4440 
4441 	btrfs_set_item_offset(leaf, item,
4442 			      orig_offset + item_size - split_offset);
4443 	btrfs_set_item_size(leaf, item, split_offset);
4444 
4445 	btrfs_set_header_nritems(leaf, nritems + 1);
4446 
4447 	/* write the data for the start of the original item */
4448 	write_extent_buffer(leaf, buf,
4449 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4450 			    split_offset);
4451 
4452 	/* write the data for the new item */
4453 	write_extent_buffer(leaf, buf + split_offset,
4454 			    btrfs_item_ptr_offset(leaf, slot),
4455 			    item_size - split_offset);
4456 	btrfs_mark_buffer_dirty(leaf);
4457 
4458 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4459 	kfree(buf);
4460 	return 0;
4461 }
4462 
4463 /*
4464  * This function splits a single item into two items,
4465  * giving 'new_key' to the new item and splitting the
4466  * old one at split_offset (from the start of the item).
4467  *
4468  * The path may be released by this operation.  After
4469  * the split, the path is pointing to the old item.  The
4470  * new item is going to be in the same node as the old one.
4471  *
4472  * Note, the item being split must be small enough to live alone on
4473  * a tree block with room for one extra struct btrfs_item
4474  *
4475  * This allows us to split the item in place, keeping a lock on the
4476  * leaf the entire time.
4477  */
4478 int btrfs_split_item(struct btrfs_trans_handle *trans,
4479 		     struct btrfs_root *root,
4480 		     struct btrfs_path *path,
4481 		     struct btrfs_key *new_key,
4482 		     unsigned long split_offset)
4483 {
4484 	int ret;
4485 	ret = setup_leaf_for_split(trans, root, path,
4486 				   sizeof(struct btrfs_item));
4487 	if (ret)
4488 		return ret;
4489 
4490 	ret = split_item(trans, root, path, new_key, split_offset);
4491 	return ret;
4492 }
4493 
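/*
 * Usage sketch: how a caller might drive btrfs_split_item().  The keys
 * and the split offset are parameters here because a real caller
 * computes them; the BUG_ON in setup_leaf_for_split() restricts this
 * to EXTENT_DATA and EXTENT_CSUM items.
 */
static int example_split_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_key *key,
			      struct btrfs_key *new_key,
			      unsigned long split_offset)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
	if (ret > 0)		/* the item to split must already exist */
		ret = -ENOENT;
	if (ret)
		goto out;

	/*
	 * bytes [0, split_offset) keep the old key, the rest move to a
	 * new item with new_key.  -EAGAIN asks the caller to retry.
	 */
	ret = btrfs_split_item(trans, root, path, new_key, split_offset);
out:
	btrfs_free_path(path);
	return ret;
}
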
4494 /*
4495  * This function duplicates an item, giving 'new_key' to the new item.
4496  * It guarantees both items live in the same tree leaf and the new item
4497  * is contiguous with the original item.
4498  *
4499  * This allows us to split a file extent in place, keeping a lock on the
4500  * leaf the entire time.
4501  */
4502 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4503 			 struct btrfs_root *root,
4504 			 struct btrfs_path *path,
4505 			 struct btrfs_key *new_key)
4506 {
4507 	struct extent_buffer *leaf;
4508 	int ret;
4509 	u32 item_size;
4510 
4511 	leaf = path->nodes[0];
4512 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4513 	ret = setup_leaf_for_split(trans, root, path,
4514 				   item_size + sizeof(struct btrfs_item));
4515 	if (ret)
4516 		return ret;
4517 
4518 	path->slots[0]++;
4519 	setup_items_for_insert(root, path, new_key, &item_size,
4520 			       item_size, item_size +
4521 			       sizeof(struct btrfs_item), 1);
4522 	leaf = path->nodes[0];
4523 	memcpy_extent_buffer(leaf,
4524 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4525 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4526 			     item_size);
4527 	return 0;
4528 }
4529 
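/*
 * Usage sketch: duplicating a file extent item so the two copies can
 * then be trimmed to cover different halves of a range.  The split
 * position is hypothetical; the path must already point at the item.
 */
static int example_dup_item(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    struct btrfs_key *key, u64 split_pos)
{
	struct btrfs_key new_key = *key;
	int ret;

	new_key.offset = split_pos;	/* file offset of the second copy */
	ret = btrfs_duplicate_item(trans, root, path, &new_key);
	if (ret)		/* -EAGAIN: leaf changed, redo the search */
		return ret;

	/* path->slots[0] now points at the new (second) copy */
	return 0;
}
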
4530 /*
4531  * make the item pointed to by the path smaller.  new_size indicates
4532  * how small to make it, and from_end tells us if we just chop bytes
4533  * off the end of the item or if we shift the item to chop bytes off
4534  * the front.
4535  */
4536 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4537 			 u32 new_size, int from_end)
4538 {
4539 	int slot;
4540 	struct extent_buffer *leaf;
4541 	struct btrfs_item *item;
4542 	u32 nritems;
4543 	unsigned int data_end;
4544 	unsigned int old_data_start;
4545 	unsigned int old_size;
4546 	unsigned int size_diff;
4547 	int i;
4548 	struct btrfs_map_token token;
4549 
4550 	btrfs_init_map_token(&token);
4551 
4552 	leaf = path->nodes[0];
4553 	slot = path->slots[0];
4554 
4555 	old_size = btrfs_item_size_nr(leaf, slot);
4556 	if (old_size == new_size)
4557 		return;
4558 
4559 	nritems = btrfs_header_nritems(leaf);
4560 	data_end = leaf_data_end(root, leaf);
4561 
4562 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4563 
4564 	size_diff = old_size - new_size;
4565 
4566 	BUG_ON(slot < 0);
4567 	BUG_ON(slot >= nritems);
4568 
4569 	/*
4570 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4571 	 */
4572 	/* first correct the data pointers */
4573 	for (i = slot; i < nritems; i++) {
4574 		u32 ioff;
4575 		item = btrfs_item_nr(i);
4576 
4577 		ioff = btrfs_token_item_offset(leaf, item, &token);
4578 		btrfs_set_token_item_offset(leaf, item,
4579 					    ioff + size_diff, &token);
4580 	}
4581 
4582 	/* shift the data */
4583 	if (from_end) {
4584 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4585 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4586 			      data_end, old_data_start + new_size - data_end);
4587 	} else {
4588 		struct btrfs_disk_key disk_key;
4589 		u64 offset;
4590 
4591 		btrfs_item_key(leaf, &disk_key, slot);
4592 
4593 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4594 			unsigned long ptr;
4595 			struct btrfs_file_extent_item *fi;
4596 
4597 			fi = btrfs_item_ptr(leaf, slot,
4598 					    struct btrfs_file_extent_item);
4599 			fi = (struct btrfs_file_extent_item *)(
4600 			     (unsigned long)fi - size_diff);
4601 
4602 			if (btrfs_file_extent_type(leaf, fi) ==
4603 			    BTRFS_FILE_EXTENT_INLINE) {
4604 				ptr = btrfs_item_ptr_offset(leaf, slot);
4605 				memmove_extent_buffer(leaf, ptr,
4606 				      (unsigned long)fi,
4607 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4608 			}
4609 		}
4610 
4611 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4612 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4613 			      data_end, old_data_start - data_end);
4614 
4615 		offset = btrfs_disk_key_offset(&disk_key);
4616 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4617 		btrfs_set_item_key(leaf, &disk_key, slot);
4618 		if (slot == 0)
4619 			fixup_low_keys(root, path, &disk_key, 1);
4620 	}
4621 
4622 	item = btrfs_item_nr(slot);
4623 	btrfs_set_item_size(leaf, item, new_size);
4624 	btrfs_mark_buffer_dirty(leaf);
4625 
4626 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4627 		btrfs_print_leaf(root, leaf);
4628 		BUG();
4629 	}
4630 }
4631 
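/*
 * Usage sketch: chopping 8 bytes (a hypothetical amount) off the tail
 * of the current item.  With from_end == 0 the same call would remove
 * bytes from the front and shift the key offset up instead.
 */
static void example_shrink_item(struct btrfs_root *root,
				struct btrfs_path *path)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 size = btrfs_item_size_nr(leaf, path->slots[0]);

	if (size > 8)
		btrfs_truncate_item(root, path, size - 8, 1);
}
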
4632 /*
4633  * make the item pointed to by the path bigger, data_size is the added size.
4634  */
4635 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4636 		       u32 data_size)
4637 {
4638 	int slot;
4639 	struct extent_buffer *leaf;
4640 	struct btrfs_item *item;
4641 	u32 nritems;
4642 	unsigned int data_end;
4643 	unsigned int old_data;
4644 	unsigned int old_size;
4645 	int i;
4646 	struct btrfs_map_token token;
4647 
4648 	btrfs_init_map_token(&token);
4649 
4650 	leaf = path->nodes[0];
4651 
4652 	nritems = btrfs_header_nritems(leaf);
4653 	data_end = leaf_data_end(root, leaf);
4654 
4655 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4656 		btrfs_print_leaf(root, leaf);
4657 		BUG();
4658 	}
4659 	slot = path->slots[0];
4660 	old_data = btrfs_item_end_nr(leaf, slot);
4661 
4662 	BUG_ON(slot < 0);
4663 	if (slot >= nritems) {
4664 		btrfs_print_leaf(root, leaf);
4665 		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4666 		       slot, nritems);
4667 		BUG_ON(1);
4668 	}
4669 
4670 	/*
4671 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4672 	 */
4673 	/* first correct the data pointers */
4674 	for (i = slot; i < nritems; i++) {
4675 		u32 ioff;
4676 		item = btrfs_item_nr(i);
4677 
4678 		ioff = btrfs_token_item_offset(leaf, item, &token);
4679 		btrfs_set_token_item_offset(leaf, item,
4680 					    ioff - data_size, &token);
4681 	}
4682 
4683 	/* shift the data */
4684 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4685 		      data_end - data_size, btrfs_leaf_data(leaf) +
4686 		      data_end, old_data - data_end);
4687 
4688 	data_end = old_data;
4689 	old_size = btrfs_item_size_nr(leaf, slot);
4690 	item = btrfs_item_nr(slot);
4691 	btrfs_set_item_size(leaf, item, old_size + data_size);
4692 	btrfs_mark_buffer_dirty(leaf);
4693 
4694 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4695 		btrfs_print_leaf(root, leaf);
4696 		BUG();
4697 	}
4698 }
4699 
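/*
 * Usage sketch: growing the current item by 'len' bytes and filling
 * the new tail.  The caller must have reserved the space up front
 * (for example via the ins_len argument of btrfs_search_slot), since
 * btrfs_extend_item() BUGs when the leaf is out of room.
 */
static void example_grow_item(struct btrfs_root *root,
			      struct btrfs_path *path,
			      const void *data, u32 len)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);
	unsigned long ptr;

	btrfs_extend_item(root, path, len);

	/* the new bytes appear at the end of the item's payload */
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, data, ptr + old_size, len);
	btrfs_mark_buffer_dirty(leaf);
}
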
4700 /*
4701  * this is a helper for btrfs_insert_empty_items, the main goal here is
4702  * to save stack depth by doing the bulk of the work in a function
4703  * that doesn't call btrfs_search_slot
4704  */
4705 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4706 			    struct btrfs_key *cpu_key, u32 *data_size,
4707 			    u32 total_data, u32 total_size, int nr)
4708 {
4709 	struct btrfs_item *item;
4710 	int i;
4711 	u32 nritems;
4712 	unsigned int data_end;
4713 	struct btrfs_disk_key disk_key;
4714 	struct extent_buffer *leaf;
4715 	int slot;
4716 	struct btrfs_map_token token;
4717 
4718 	if (path->slots[0] == 0) {
4719 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4720 		fixup_low_keys(root, path, &disk_key, 1);
4721 	}
4722 	btrfs_unlock_up_safe(path, 1);
4723 
4724 	btrfs_init_map_token(&token);
4725 
4726 	leaf = path->nodes[0];
4727 	slot = path->slots[0];
4728 
4729 	nritems = btrfs_header_nritems(leaf);
4730 	data_end = leaf_data_end(root, leaf);
4731 
4732 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4733 		btrfs_print_leaf(root, leaf);
4734 		btrfs_crit(root->fs_info, "not enough free space, need %u have %d",
4735 		       total_size, btrfs_leaf_free_space(root, leaf));
4736 		BUG();
4737 	}
4738 
4739 	if (slot != nritems) {
4740 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4741 
4742 		if (old_data < data_end) {
4743 			btrfs_print_leaf(root, leaf);
4744 			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4745 			       slot, old_data, data_end);
4746 			BUG_ON(1);
4747 		}
4748 		/*
4749 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4750 		 */
4751 		/* first correct the data pointers */
4752 		for (i = slot; i < nritems; i++) {
4753 			u32 ioff;
4754 
4755 			item = btrfs_item_nr(i);
4756 			ioff = btrfs_token_item_offset(leaf, item, &token);
4757 			btrfs_set_token_item_offset(leaf, item,
4758 						    ioff - total_data, &token);
4759 		}
4760 		/* shift the items */
4761 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4762 			      btrfs_item_nr_offset(slot),
4763 			      (nritems - slot) * sizeof(struct btrfs_item));
4764 
4765 		/* shift the data */
4766 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4767 			      data_end - total_data, btrfs_leaf_data(leaf) +
4768 			      data_end, old_data - data_end);
4769 		data_end = old_data;
4770 	}
4771 
4772 	/* setup the item for the new data */
4773 	for (i = 0; i < nr; i++) {
4774 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4775 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4776 		item = btrfs_item_nr(slot + i);
4777 		btrfs_set_token_item_offset(leaf, item,
4778 					    data_end - data_size[i], &token);
4779 		data_end -= data_size[i];
4780 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4781 	}
4782 
4783 	btrfs_set_header_nritems(leaf, nritems + nr);
4784 	btrfs_mark_buffer_dirty(leaf);
4785 
4786 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4787 		btrfs_print_leaf(root, leaf);
4788 		BUG();
4789 	}
4790 }
4791 
4792 /*
4793  * Given a key and some data, insert items into the tree.
4794  * This does all the path init required, making room in the tree if needed.
4795  */
4796 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4797 			    struct btrfs_root *root,
4798 			    struct btrfs_path *path,
4799 			    struct btrfs_key *cpu_key, u32 *data_size,
4800 			    int nr)
4801 {
4802 	int ret = 0;
4803 	int slot;
4804 	int i;
4805 	u32 total_size = 0;
4806 	u32 total_data = 0;
4807 
4808 	for (i = 0; i < nr; i++)
4809 		total_data += data_size[i];
4810 
4811 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4812 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4813 	if (ret == 0)
4814 		return -EEXIST;
4815 	if (ret < 0)
4816 		return ret;
4817 
4818 	slot = path->slots[0];
4819 	BUG_ON(slot < 0);
4820 
4821 	setup_items_for_insert(root, path, cpu_key, data_size,
4822 			       total_data, total_size, nr);
4823 	return 0;
4824 }
4825 
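/*
 * Usage sketch: reserving two adjacent empty items in one pass.  The
 * keys, types and sizes below are hypothetical; the keys must be in
 * sorted order and small enough to share a leaf.
 */
static int example_insert_two(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key keys[2];
	u32 sizes[2] = { 16, 32 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	keys[0].objectid = objectid;
	keys[0].type = BTRFS_EXTENT_DATA_KEY;
	keys[0].offset = 0;
	keys[1] = keys[0];
	keys[1].offset = 4096;

	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	/* on success, path->slots[0] points at the first new item */
	btrfs_free_path(path);
	return ret;
}
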
4826 /*
4827  * Given a key and some data, insert an item into the tree.
4828  * This does all the path init required, making room in the tree if needed.
4829  */
4830 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4831 		      *root, struct btrfs_key *cpu_key, void *data, u32
4832 		      data_size)
4833 {
4834 	int ret = 0;
4835 	struct btrfs_path *path;
4836 	struct extent_buffer *leaf;
4837 	unsigned long ptr;
4838 
4839 	path = btrfs_alloc_path();
4840 	if (!path)
4841 		return -ENOMEM;
4842 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4843 	if (!ret) {
4844 		leaf = path->nodes[0];
4845 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4846 		write_extent_buffer(leaf, data, ptr, data_size);
4847 		btrfs_mark_buffer_dirty(leaf);
4848 	}
4849 	btrfs_free_path(path);
4850 	return ret;
4851 }
4852 
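/*
 * Usage sketch: btrfs_insert_item() above is the convenience wrapper;
 * a caller only needs a key and a flat payload.  The key fields used
 * here are hypothetical.
 */
static int example_insert_blob(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root, u64 objectid,
			       void *blob, u32 blob_len)
{
	struct btrfs_key key;

	key.objectid = objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;

	/* fails with -EEXIST if an item with this key already exists */
	return btrfs_insert_item(trans, root, &key, blob, blob_len);
}
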
4853 /*
4854  * delete the pointer from a given node.
4855  *
4856  * the tree should have been previously balanced so the deletion does not
4857  * empty a node.
4858  */
4859 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4860 		    int level, int slot)
4861 {
4862 	struct extent_buffer *parent = path->nodes[level];
4863 	u32 nritems;
4864 	int ret;
4865 
4866 	nritems = btrfs_header_nritems(parent);
4867 	if (slot != nritems - 1) {
4868 		if (level)
4869 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4870 					     slot + 1, nritems - slot - 1);
4871 		memmove_extent_buffer(parent,
4872 			      btrfs_node_key_ptr_offset(slot),
4873 			      btrfs_node_key_ptr_offset(slot + 1),
4874 			      sizeof(struct btrfs_key_ptr) *
4875 			      (nritems - slot - 1));
4876 	} else if (level) {
4877 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4878 					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
4879 		BUG_ON(ret < 0);
4880 	}
4881 
4882 	nritems--;
4883 	btrfs_set_header_nritems(parent, nritems);
4884 	if (nritems == 0 && parent == root->node) {
4885 		BUG_ON(btrfs_header_level(root->node) != 1);
4886 		/* just turn the root into a leaf and break */
4887 		btrfs_set_header_level(root->node, 0);
4888 	} else if (slot == 0) {
4889 		struct btrfs_disk_key disk_key;
4890 
4891 		btrfs_node_key(parent, &disk_key, 0);
4892 		fixup_low_keys(root, path, &disk_key, level + 1);
4893 	}
4894 	btrfs_mark_buffer_dirty(parent);
4895 }
4896 
4897 /*
4898  * a helper function to delete the leaf pointed to by path->slots[1] and
4899  * path->nodes[1].
4900  *
4901  * This deletes the pointer in path->nodes[1] and frees the leaf
4902  * block extent.
4903  *
4904  * The path must have already been setup for deleting the leaf, including
4905  * all the proper balancing.  path->nodes[1] must be locked.
4906  */
4907 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4908 				    struct btrfs_root *root,
4909 				    struct btrfs_path *path,
4910 				    struct extent_buffer *leaf)
4911 {
4912 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4913 	del_ptr(root, path, 1, path->slots[1]);
4914 
4915 	/*
4916 	 * btrfs_free_extent is expensive, we want to make sure we
4917 	 * aren't holding any locks when we call it
4918 	 */
4919 	btrfs_unlock_up_safe(path, 0);
4920 
4921 	root_sub_used(root, leaf->len);
4922 
4923 	extent_buffer_get(leaf);
4924 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4925 	free_extent_buffer_stale(leaf);
4926 }
4927 /*
4928  * delete the item at the leaf level in path.  If that empties
4929  * the leaf, remove it from the tree
4930  */
4931 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4932 		    struct btrfs_path *path, int slot, int nr)
4933 {
4934 	struct extent_buffer *leaf;
4935 	struct btrfs_item *item;
4936 	int last_off;
4937 	int dsize = 0;
4938 	int ret = 0;
4939 	int wret;
4940 	int i;
4941 	u32 nritems;
4942 	struct btrfs_map_token token;
4943 
4944 	btrfs_init_map_token(&token);
4945 
4946 	leaf = path->nodes[0];
4947 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4948 
4949 	for (i = 0; i < nr; i++)
4950 		dsize += btrfs_item_size_nr(leaf, slot + i);
4951 
4952 	nritems = btrfs_header_nritems(leaf);
4953 
4954 	if (slot + nr != nritems) {
4955 		int data_end = leaf_data_end(root, leaf);
4956 
4957 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4958 			      data_end + dsize,
4959 			      btrfs_leaf_data(leaf) + data_end,
4960 			      last_off - data_end);
4961 
4962 		for (i = slot + nr; i < nritems; i++) {
4963 			u32 ioff;
4964 
4965 			item = btrfs_item_nr(i);
4966 			ioff = btrfs_token_item_offset(leaf, item, &token);
4967 			btrfs_set_token_item_offset(leaf, item,
4968 						    ioff + dsize, &token);
4969 		}
4970 
4971 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4972 			      btrfs_item_nr_offset(slot + nr),
4973 			      sizeof(struct btrfs_item) *
4974 			      (nritems - slot - nr));
4975 	}
4976 	btrfs_set_header_nritems(leaf, nritems - nr);
4977 	nritems -= nr;
4978 
4979 	/* delete the leaf if we've emptied it */
4980 	if (nritems == 0) {
4981 		if (leaf == root->node) {
4982 			btrfs_set_header_level(leaf, 0);
4983 		} else {
4984 			btrfs_set_path_blocking(path);
4985 			clean_tree_block(trans, root, leaf);
4986 			btrfs_del_leaf(trans, root, path, leaf);
4987 		}
4988 	} else {
4989 		int used = leaf_space_used(leaf, 0, nritems);
4990 		if (slot == 0) {
4991 			struct btrfs_disk_key disk_key;
4992 
4993 			btrfs_item_key(leaf, &disk_key, 0);
4994 			fixup_low_keys(root, path, &disk_key, 1);
4995 		}
4996 
4997 		/* delete the leaf if it is mostly empty */
4998 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4999 			/* push_leaf_left fixes the path.
5000 			 * make sure the path still points to our leaf
5001 			 * for possible call to del_ptr below
5002 			 */
5003 			slot = path->slots[1];
5004 			extent_buffer_get(leaf);
5005 
5006 			btrfs_set_path_blocking(path);
5007 			wret = push_leaf_left(trans, root, path, 1, 1,
5008 					      1, (u32)-1);
5009 			if (wret < 0 && wret != -ENOSPC)
5010 				ret = wret;
5011 
5012 			if (path->nodes[0] == leaf &&
5013 			    btrfs_header_nritems(leaf)) {
5014 				wret = push_leaf_right(trans, root, path, 1,
5015 						       1, 1, 0);
5016 				if (wret < 0 && wret != -ENOSPC)
5017 					ret = wret;
5018 			}
5019 
5020 			if (btrfs_header_nritems(leaf) == 0) {
5021 				path->slots[1] = slot;
5022 				btrfs_del_leaf(trans, root, path, leaf);
5023 				free_extent_buffer(leaf);
5024 				ret = 0;
5025 			} else {
5026 				/* if we're still in the path, make sure
5027 				 * we're dirty.  Otherwise, one of the
5028 				 * push_leaf functions must have already
5029 				 * dirtied this buffer
5030 				 */
5031 				if (path->nodes[0] == leaf)
5032 					btrfs_mark_buffer_dirty(leaf);
5033 				free_extent_buffer(leaf);
5034 			}
5035 		} else {
5036 			btrfs_mark_buffer_dirty(leaf);
5037 		}
5038 	}
5039 	return ret;
5040 }
5041 
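/*
 * Usage sketch: locating and deleting a single item.  The negative
 * ins_len passed to btrfs_search_slot() tells it to balance the path
 * for a deletion; 'nr' > 1 would drop adjacent items too.
 */
static int example_delete_one(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;	/* nothing with this exact key */
	if (ret == 0)
		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
	btrfs_free_path(path);
	return ret;
}
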
5042 /*
5043  * search the tree again to find a leaf with lesser keys
5044  * returns 0 if it found something or 1 if there are no lesser leaves.
5045  * returns < 0 on io errors.
5046  *
5047  * This may release the path, and so you may lose any locks held at the
5048  * time you call it.
5049  */
5050 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5051 {
5052 	struct btrfs_key key;
5053 	struct btrfs_disk_key found_key;
5054 	int ret;
5055 
5056 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5057 
5058 	if (key.offset > 0) {
5059 		key.offset--;
5060 	} else if (key.type > 0) {
5061 		key.type--;
5062 		key.offset = (u64)-1;
5063 	} else if (key.objectid > 0) {
5064 		key.objectid--;
5065 		key.type = (u8)-1;
5066 		key.offset = (u64)-1;
5067 	} else {
5068 		return 1;
5069 	}
5070 
5071 	btrfs_release_path(path);
5072 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5073 	if (ret < 0)
5074 		return ret;
5075 	btrfs_item_key(path->nodes[0], &found_key, 0);
5076 	ret = comp_keys(&found_key, &key);
5077 	/*
5078 	 * We might have had an item with the previous key in the tree right
5079 	 * before we released our path. And after we released our path, that
5080 	 * item might have been pushed to the first slot (0) of the leaf we
5081 	 * were holding due to a tree balance. Alternatively, an item with the
5082 	 * previous key can exist as the only element of a leaf (big fat item).
5083 	 * Therefore account for these 2 cases, so that our callers (like
5084 	 * btrfs_previous_item) don't miss an existing item with a key matching
5085 	 * the previous key we computed above.
5086 	 */
5087 	if (ret <= 0)
5088 		return 0;
5089 	return 1;
5090 }
5091 
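/*
 * Usage sketch: the return value convention of btrfs_prev_leaf().
 * Because the path is re-searched, any slot positions or locks held
 * before the call must be treated as gone.
 */
static int example_step_back(struct btrfs_root *root,
			     struct btrfs_path *path)
{
	int ret = btrfs_prev_leaf(root, path);

	if (ret < 0)
		return ret;	/* io error */
	if (ret > 0)
		return 1;	/* no lesser leaves, we were at the start */
	/*
	 * ret == 0: the path now points into a leaf with lesser keys;
	 * callers typically walk path->slots[0] backwards from here.
	 */
	return 0;
}
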
5092 /*
5093  * A helper function to walk down the tree starting at min_key, and looking
5094  * for nodes or leaves that have a minimum transaction id.
5095  * This is used by the btree defrag code, and tree logging
5096  *
5097  * This does not cow, but it does stuff the starting key it finds back
5098  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5099  * key and get a writable path.
5100  *
5101  * This does lock as it descends; keep_locks is set on the path while
5102  * the search runs and is restored before returning.
5103  *
5104  * This honors path->lowest_level to prevent descent past a given level
5105  * of the tree.
5106  *
5107  * min_trans indicates the oldest transaction that you are interested
5108  * in walking through.  Any nodes or leaves older than min_trans are
5109  * skipped over (without reading them).
5110  *
5111  * returns zero if something useful was found, < 0 on error and 1 if there
5112  * was nothing in the tree that matched the search criteria.
5113  */
5114 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5115 			 struct btrfs_path *path,
5116 			 u64 min_trans)
5117 {
5118 	struct extent_buffer *cur;
5119 	struct btrfs_key found_key;
5120 	int slot;
5121 	int sret;
5122 	u32 nritems;
5123 	int level;
5124 	int ret = 1;
5125 	int keep_locks = path->keep_locks;
5126 
5127 	path->keep_locks = 1;
5128 again:
5129 	cur = btrfs_read_lock_root_node(root);
5130 	level = btrfs_header_level(cur);
5131 	WARN_ON(path->nodes[level]);
5132 	path->nodes[level] = cur;
5133 	path->locks[level] = BTRFS_READ_LOCK;
5134 
5135 	if (btrfs_header_generation(cur) < min_trans) {
5136 		ret = 1;
5137 		goto out;
5138 	}
5139 	while (1) {
5140 		nritems = btrfs_header_nritems(cur);
5141 		level = btrfs_header_level(cur);
5142 		sret = bin_search(cur, min_key, level, &slot);
5143 
5144 		/* at the lowest level, we're done, setup the path and exit */
5145 		if (level == path->lowest_level) {
5146 			if (slot >= nritems)
5147 				goto find_next_key;
5148 			ret = 0;
5149 			path->slots[level] = slot;
5150 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5151 			goto out;
5152 		}
5153 		if (sret && slot > 0)
5154 			slot--;
5155 		/*
5156 		 * check this node pointer against the min_trans parameter.
5157 		 * If it is too old, skip to the next one.
5158 		 */
5159 		while (slot < nritems) {
5160 			u64 gen;
5161 
5162 			gen = btrfs_node_ptr_generation(cur, slot);
5163 			if (gen < min_trans) {
5164 				slot++;
5165 				continue;
5166 			}
5167 			break;
5168 		}
5169 find_next_key:
5170 		/*
5171 		 * we didn't find a candidate key in this node, walk forward
5172 		 * and find another one
5173 		 */
5174 		if (slot >= nritems) {
5175 			path->slots[level] = slot;
5176 			btrfs_set_path_blocking(path);
5177 			sret = btrfs_find_next_key(root, path, min_key, level,
5178 						  min_trans);
5179 			if (sret == 0) {
5180 				btrfs_release_path(path);
5181 				goto again;
5182 			} else {
5183 				goto out;
5184 			}
5185 		}
5186 		/* save our key for returning back */
5187 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5188 		path->slots[level] = slot;
5189 		if (level == path->lowest_level) {
5190 			ret = 0;
5191 			goto out;
5192 		}
5193 		btrfs_set_path_blocking(path);
5194 		cur = read_node_slot(root, cur, slot);
5195 		BUG_ON(!cur); /* -ENOMEM */
5196 
5197 		btrfs_tree_read_lock(cur);
5198 
5199 		path->locks[level - 1] = BTRFS_READ_LOCK;
5200 		path->nodes[level - 1] = cur;
5201 		unlock_up(path, level, 1, 0, NULL);
5202 		btrfs_clear_path_blocking(path, NULL, 0);
5203 	}
5204 out:
5205 	path->keep_locks = keep_locks;
5206 	if (ret == 0) {
5207 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5208 		btrfs_set_path_blocking(path);
5209 		memcpy(min_key, &found_key, sizeof(found_key));
5210 	}
5211 	return ret;
5212 }
5213 
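/*
 * Usage sketch: visiting every item changed in or after min_trans, in
 * the spirit of the defrag and tree-log users.  Advancing the key by
 * bumping the offset is simplified here; a robust caller also handles
 * offset/type/objectid overflow.
 */
static int example_scan_newer(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	min_key.objectid = 0;
	min_key.type = 0;
	min_key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, path, min_trans);
		if (ret) {
			if (ret == 1)	/* ran off the end of the tree */
				ret = 0;
			break;
		}
		/*
		 * min_key now holds the key that was found, and the item
		 * sits at path->nodes[0], path->slots[0].
		 */
		btrfs_release_path(path);
		min_key.offset++;
	}
	btrfs_free_path(path);
	return ret;
}
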
5214 static void tree_move_down(struct btrfs_root *root,
5215 			   struct btrfs_path *path,
5216 			   int *level, int root_level)
5217 {
5218 	BUG_ON(*level == 0);
5219 	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5220 					path->slots[*level]);
5221 	path->slots[*level - 1] = 0;
5222 	(*level)--;
5223 }
5224 
5225 static int tree_move_next_or_upnext(struct btrfs_root *root,
5226 				    struct btrfs_path *path,
5227 				    int *level, int root_level)
5228 {
5229 	int ret = 0;
5230 	int nritems;
5231 	nritems = btrfs_header_nritems(path->nodes[*level]);
5232 
5233 	path->slots[*level]++;
5234 
5235 	while (path->slots[*level] >= nritems) {
5236 		if (*level == root_level)
5237 			return -1;
5238 
5239 		/* move upnext */
5240 		path->slots[*level] = 0;
5241 		free_extent_buffer(path->nodes[*level]);
5242 		path->nodes[*level] = NULL;
5243 		(*level)++;
5244 		path->slots[*level]++;
5245 
5246 		nritems = btrfs_header_nritems(path->nodes[*level]);
5247 		ret = 1;
5248 	}
5249 	return ret;
5250 }
5251 
5252 /*
5253  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5254  * or down.
5255  */
5256 static int tree_advance(struct btrfs_root *root,
5257 			struct btrfs_path *path,
5258 			int *level, int root_level,
5259 			int allow_down,
5260 			struct btrfs_key *key)
5261 {
5262 	int ret;
5263 
5264 	if (*level == 0 || !allow_down) {
5265 		ret = tree_move_next_or_upnext(root, path, level, root_level);
5266 	} else {
5267 		tree_move_down(root, path, level, root_level);
5268 		ret = 0;
5269 	}
5270 	if (ret >= 0) {
5271 		if (*level == 0)
5272 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5273 					path->slots[*level]);
5274 		else
5275 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5276 					path->slots[*level]);
5277 	}
5278 	return ret;
5279 }
5280 
5281 static int tree_compare_item(struct btrfs_root *left_root,
5282 			     struct btrfs_path *left_path,
5283 			     struct btrfs_path *right_path,
5284 			     char *tmp_buf)
5285 {
5286 	int cmp;
5287 	int len1, len2;
5288 	unsigned long off1, off2;
5289 
5290 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5291 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5292 	if (len1 != len2)
5293 		return 1;
5294 
5295 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5296 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5297 				right_path->slots[0]);
5298 
5299 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5300 
5301 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5302 	if (cmp)
5303 		return 1;
5304 	return 0;
5305 }
5306 
5307 #define ADVANCE 1
5308 #define ADVANCE_ONLY_NEXT -1
5309 
5310 /*
5311  * This function compares two trees and calls the provided callback for
5312  * every changed/new/deleted item it finds.
5313  * If shared tree blocks are encountered, whole subtrees are skipped, making
5314  * the compare pretty fast on snapshotted subvolumes.
5315  *
5316  * This currently works on commit roots only. As commit roots are read only,
5317  * we don't do any locking. The commit roots are protected with transactions.
5318  * Transactions are ended and rejoined when a commit is tried in between.
5319  *
5320  * This function checks for modifications done to the trees while comparing.
5321  * If it detects a change, it aborts immediately.
5322  */
5323 int btrfs_compare_trees(struct btrfs_root *left_root,
5324 			struct btrfs_root *right_root,
5325 			btrfs_changed_cb_t changed_cb, void *ctx)
5326 {
5327 	int ret;
5328 	int cmp;
5329 	struct btrfs_path *left_path = NULL;
5330 	struct btrfs_path *right_path = NULL;
5331 	struct btrfs_key left_key;
5332 	struct btrfs_key right_key;
5333 	char *tmp_buf = NULL;
5334 	int left_root_level;
5335 	int right_root_level;
5336 	int left_level;
5337 	int right_level;
5338 	int left_end_reached;
5339 	int right_end_reached;
5340 	int advance_left;
5341 	int advance_right;
5342 	u64 left_blockptr;
5343 	u64 right_blockptr;
5344 	u64 left_gen;
5345 	u64 right_gen;
5346 
5347 	left_path = btrfs_alloc_path();
5348 	if (!left_path) {
5349 		ret = -ENOMEM;
5350 		goto out;
5351 	}
5352 	right_path = btrfs_alloc_path();
5353 	if (!right_path) {
5354 		ret = -ENOMEM;
5355 		goto out;
5356 	}
5357 
5358 	tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5359 	if (!tmp_buf) {
5360 		ret = -ENOMEM;
5361 		goto out;
5362 	}
5363 
5364 	left_path->search_commit_root = 1;
5365 	left_path->skip_locking = 1;
5366 	right_path->search_commit_root = 1;
5367 	right_path->skip_locking = 1;
5368 
5369 	/*
5370 	 * Strategy: Go to the first items of both trees. Then do
5371 	 *
5372 	 * If both trees are at level 0
5373 	 *   Compare keys of current items
5374 	 *     If left < right treat left item as new, advance left tree
5375 	 *       and repeat
5376 	 *     If left > right treat right item as deleted, advance right tree
5377 	 *       and repeat
5378 	 *     If left == right do deep compare of items, treat as changed if
5379 	 *       needed, advance both trees and repeat
5380 	 * If both trees are at the same level but not at level 0
5381 	 *   Compare keys of current nodes/leafs
5382 	 *   Compare keys of current nodes/leaves
5383 	 *     If left > right advance right tree and repeat
5384 	 *     If left == right compare blockptrs of the next nodes/leafs
5385 	 *     If left == right compare blockptrs of the next nodes/leaves
5386 	 *         and repeat
5387 	 *       If they don't match advance both trees while allowing to go
5388 	 *         deeper and repeat
5389 	 * If tree levels are different
5390 	 *   Advance the tree that needs it and repeat
5391 	 *
5392 	 * Advancing a tree means:
5393 	 *   If we are at level 0, try to go to the next slot. If that's not
5394 	 *   possible, go one level up and repeat. Stop when we find a level
5395 	 *   where we can go to the next slot. We may at this point be on a
5396 	 *   node or a leaf.
5397 	 *
5398 	 *   If we are not at level 0 and not on shared tree blocks, go one
5399 	 *   level deeper.
5400 	 *
5401 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5402 	 *   the right if possible or go up and right.
5403 	 */
5404 
5405 	down_read(&left_root->fs_info->commit_root_sem);
5406 	left_level = btrfs_header_level(left_root->commit_root);
5407 	left_root_level = left_level;
5408 	left_path->nodes[left_level] = left_root->commit_root;
5409 	extent_buffer_get(left_path->nodes[left_level]);
5410 
5411 	right_level = btrfs_header_level(right_root->commit_root);
5412 	right_root_level = right_level;
5413 	right_path->nodes[right_level] = right_root->commit_root;
5414 	extent_buffer_get(right_path->nodes[right_level]);
5415 	up_read(&left_root->fs_info->commit_root_sem);
5416 
5417 	if (left_level == 0)
5418 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5419 				&left_key, left_path->slots[left_level]);
5420 	else
5421 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5422 				&left_key, left_path->slots[left_level]);
5423 	if (right_level == 0)
5424 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5425 				&right_key, right_path->slots[right_level]);
5426 	else
5427 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5428 				&right_key, right_path->slots[right_level]);
5429 
5430 	left_end_reached = right_end_reached = 0;
5431 	advance_left = advance_right = 0;
5432 
5433 	while (1) {
5434 		if (advance_left && !left_end_reached) {
5435 			ret = tree_advance(left_root, left_path, &left_level,
5436 					left_root_level,
5437 					advance_left != ADVANCE_ONLY_NEXT,
5438 					&left_key);
5439 			if (ret < 0)
5440 				left_end_reached = ADVANCE;
5441 			advance_left = 0;
5442 		}
5443 		if (advance_right && !right_end_reached) {
5444 			ret = tree_advance(right_root, right_path, &right_level,
5445 					right_root_level,
5446 					advance_right != ADVANCE_ONLY_NEXT,
5447 					&right_key);
5448 			if (ret < 0)
5449 				right_end_reached = ADVANCE;
5450 			advance_right = 0;
5451 		}
5452 
5453 		if (left_end_reached && right_end_reached) {
5454 			ret = 0;
5455 			goto out;
5456 		} else if (left_end_reached) {
5457 			if (right_level == 0) {
5458 				ret = changed_cb(left_root, right_root,
5459 						left_path, right_path,
5460 						&right_key,
5461 						BTRFS_COMPARE_TREE_DELETED,
5462 						ctx);
5463 				if (ret < 0)
5464 					goto out;
5465 			}
5466 			advance_right = ADVANCE;
5467 			continue;
5468 		} else if (right_end_reached) {
5469 			if (left_level == 0) {
5470 				ret = changed_cb(left_root, right_root,
5471 						left_path, right_path,
5472 						&left_key,
5473 						BTRFS_COMPARE_TREE_NEW,
5474 						ctx);
5475 				if (ret < 0)
5476 					goto out;
5477 			}
5478 			advance_left = ADVANCE;
5479 			continue;
5480 		}
5481 
5482 		if (left_level == 0 && right_level == 0) {
5483 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5484 			if (cmp < 0) {
5485 				ret = changed_cb(left_root, right_root,
5486 						left_path, right_path,
5487 						&left_key,
5488 						BTRFS_COMPARE_TREE_NEW,
5489 						ctx);
5490 				if (ret < 0)
5491 					goto out;
5492 				advance_left = ADVANCE;
5493 			} else if (cmp > 0) {
5494 				ret = changed_cb(left_root, right_root,
5495 						left_path, right_path,
5496 						&right_key,
5497 						BTRFS_COMPARE_TREE_DELETED,
5498 						ctx);
5499 				if (ret < 0)
5500 					goto out;
5501 				advance_right = ADVANCE;
5502 			} else {
5503 				enum btrfs_compare_tree_result result;
5504 
5505 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5506 				ret = tree_compare_item(left_root, left_path,
5507 						right_path, tmp_buf);
5508 				if (ret)
5509 					result = BTRFS_COMPARE_TREE_CHANGED;
5510 				else
5511 					result = BTRFS_COMPARE_TREE_SAME;
5512 				ret = changed_cb(left_root, right_root,
5513 						 left_path, right_path,
5514 						 &left_key, result, ctx);
5515 				if (ret < 0)
5516 					goto out;
5517 				advance_left = ADVANCE;
5518 				advance_right = ADVANCE;
5519 			}
5520 		} else if (left_level == right_level) {
5521 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5522 			if (cmp < 0) {
5523 				advance_left = ADVANCE;
5524 			} else if (cmp > 0) {
5525 				advance_right = ADVANCE;
5526 			} else {
5527 				left_blockptr = btrfs_node_blockptr(
5528 						left_path->nodes[left_level],
5529 						left_path->slots[left_level]);
5530 				right_blockptr = btrfs_node_blockptr(
5531 						right_path->nodes[right_level],
5532 						right_path->slots[right_level]);
5533 				left_gen = btrfs_node_ptr_generation(
5534 						left_path->nodes[left_level],
5535 						left_path->slots[left_level]);
5536 				right_gen = btrfs_node_ptr_generation(
5537 						right_path->nodes[right_level],
5538 						right_path->slots[right_level]);
5539 				if (left_blockptr == right_blockptr &&
5540 				    left_gen == right_gen) {
5541 					/*
5542 					 * As we're on a shared block, don't
5543 					 * allow going deeper.
5544 					 */
5545 					advance_left = ADVANCE_ONLY_NEXT;
5546 					advance_right = ADVANCE_ONLY_NEXT;
5547 				} else {
5548 					advance_left = ADVANCE;
5549 					advance_right = ADVANCE;
5550 				}
5551 			}
5552 		} else if (left_level < right_level) {
5553 			advance_right = ADVANCE;
5554 		} else {
5555 			advance_left = ADVANCE;
5556 		}
5557 	}
5558 
5559 out:
5560 	btrfs_free_path(left_path);
5561 	btrfs_free_path(right_path);
5562 	kfree(tmp_buf);
5563 	return ret;
5564 }
5565 
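/*
 * Usage sketch for the compare loop above (hedged: this mirrors how
 * fs/btrfs/send.c drives it; the callback and context names below are
 * invented):
 *
 *	static int my_changed_cb(struct btrfs_root *left_root,
 *				 struct btrfs_root *right_root,
 *				 struct btrfs_path *left_path,
 *				 struct btrfs_path *right_path,
 *				 struct btrfs_key *key,
 *				 enum btrfs_compare_tree_result result,
 *				 void *ctx)
 *	{
 *		handle BTRFS_COMPARE_TREE_{NEW,DELETED,CHANGED,SAME} for key;
 *		return 0 to continue, < 0 to abort the walk;
 *	}
 *
 *	ret = btrfs_compare_trees(new_root, old_root, my_changed_cb, &my_ctx);
 */
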
5566 /*
5567  * this is similar to btrfs_next_leaf, but does not try to preserve
5568  * and fixup the path.  It looks for and returns the next key in the
5569  * tree based on the current path and the min_trans parameters.
5570  * tree based on the current path and the min_trans parameter.
5571  * 0 is returned if another key is found, < 0 is returned on error,
5572  * and 1 is returned if there are no higher keys in the tree.
5573  *
5574  * path->keep_locks should be set to 1 on the search made before
5575  * calling this function.
5576  */
5577 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5578 			struct btrfs_key *key, int level, u64 min_trans)
5579 {
5580 	int slot;
5581 	struct extent_buffer *c;
5582 
5583 	WARN_ON(!path->keep_locks);
5584 	while (level < BTRFS_MAX_LEVEL) {
5585 		if (!path->nodes[level])
5586 			return 1;
5587 
5588 		slot = path->slots[level] + 1;
5589 		c = path->nodes[level];
5590 next:
5591 		if (slot >= btrfs_header_nritems(c)) {
5592 			int ret;
5593 			int orig_lowest;
5594 			struct btrfs_key cur_key;
5595 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5596 			    !path->nodes[level + 1])
5597 				return 1;
5598 
5599 			if (path->locks[level + 1]) {
5600 				level++;
5601 				continue;
5602 			}
5603 
5604 			slot = btrfs_header_nritems(c) - 1;
5605 			if (level == 0)
5606 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5607 			else
5608 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5609 
5610 			orig_lowest = path->lowest_level;
5611 			btrfs_release_path(path);
5612 			path->lowest_level = level;
5613 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5614 						0, 0);
5615 			path->lowest_level = orig_lowest;
5616 			if (ret < 0)
5617 				return ret;
5618 
5619 			c = path->nodes[level];
5620 			slot = path->slots[level];
5621 			if (ret == 0)
5622 				slot++;
5623 			goto next;
5624 		}
5625 
5626 		if (level == 0)
5627 			btrfs_item_key_to_cpu(c, key, slot);
5628 		else {
5629 			u64 gen = btrfs_node_ptr_generation(c, slot);
5630 
5631 			if (gen < min_trans) {
5632 				slot++;
5633 				goto next;
5634 			}
5635 			btrfs_node_key_to_cpu(c, key, slot);
5636 		}
5637 		return 0;
5638 	}
5639 	return 1;
5640 }
5641 
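/*
 * Usage sketch (hedged, variable names invented): the search that
 * positions the path must be made with keep_locks set, as required by
 * the comment above:
 *
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	if (btrfs_find_next_key(root, path, &next_key, level, min_trans) == 0)
 *		next_key now holds the next key at or above 'level';
 *	btrfs_release_path(path);
 */
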
5642 /*
5643  * search the tree again to find a leaf with greater keys
5644  * returns 0 if it found something or 1 if there are no greater leaves.
5645  * returns < 0 on io errors.
5646  */
5647 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5648 {
5649 	return btrfs_next_old_leaf(root, path, 0);
5650 }
5651 
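/*
 * Typical iteration pattern built on btrfs_next_leaf() (hedged sketch,
 * names invented): walk every item from 'key' onwards, pulling in the
 * next leaf whenever the current one runs out of items:
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (1) {
 *		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)
 *				break;	(1: no more leaves, < 0: IO error)
 *			continue;
 *		}
 *		process the item at path->slots[0];
 *		path->slots[0]++;
 *	}
 */
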
5652 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5653 			u64 time_seq)
5654 {
5655 	int slot;
5656 	int level;
5657 	struct extent_buffer *c;
5658 	struct extent_buffer *next;
5659 	struct btrfs_key key;
5660 	u32 nritems;
5661 	int ret;
5662 	int old_spinning = path->leave_spinning;
5663 	int next_rw_lock = 0;
5664 
5665 	nritems = btrfs_header_nritems(path->nodes[0]);
5666 	if (nritems == 0)
5667 		return 1;
5668 
5669 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5670 again:
5671 	level = 1;
5672 	next = NULL;
5673 	next_rw_lock = 0;
5674 	btrfs_release_path(path);
5675 
5676 	path->keep_locks = 1;
5677 	path->leave_spinning = 1;
5678 
5679 	if (time_seq)
5680 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5681 	else
5682 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5683 	path->keep_locks = 0;
5684 
5685 	if (ret < 0)
5686 		return ret;
5687 
5688 	nritems = btrfs_header_nritems(path->nodes[0]);
5689 	/*
5690 	 * by releasing the path above we dropped all our locks.  A balance
5691 	 * could have added more items next to the key that used to be
5692 	 * at the very end of the block.  So, check again here and
5693 	 * advance the path if there are now more items available.
5694 	 */
5695 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5696 		if (ret == 0)
5697 			path->slots[0]++;
5698 		ret = 0;
5699 		goto done;
5700 	}
5701 	/*
5702 	 * So the above check misses one case:
5703 	 * - after releasing the path above, someone has removed the item that
5704 	 *   used to be at the very end of the block, and balance between leaves
5705 	 *   gets another one with a bigger key.offset to replace it.
5706 	 *
5707 	 * This one should be returned as well, or we can get leaf corruption
5708 	 * later (especially in __btrfs_drop_extents()).
5709 	 *
5710 	 * A bit more explanation of this check:
5711 	 * with ret > 0, the key isn't found, the path points to the slot
5712 	 * where it should be inserted, so the path->slots[0] item must be the
5713 	 * bigger one.
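	 *
	 * Illustrative scenario (numbers invented): the last item used to
	 * be (257 EXTENT_DATA 4096); after the locks were dropped it was
	 * removed, and balancing moved (257 EXTENT_DATA 8192) into the same
	 * slot.  The re-search returns ret > 0 with path->slots[0] pointing
	 * at that bigger item, which must still be returned to the caller.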
5714 	 */
5715 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5716 		ret = 0;
5717 		goto done;
5718 	}
5719 
5720 	while (level < BTRFS_MAX_LEVEL) {
5721 		if (!path->nodes[level]) {
5722 			ret = 1;
5723 			goto done;
5724 		}
5725 
5726 		slot = path->slots[level] + 1;
5727 		c = path->nodes[level];
5728 		if (slot >= btrfs_header_nritems(c)) {
5729 			level++;
5730 			if (level == BTRFS_MAX_LEVEL) {
5731 				ret = 1;
5732 				goto done;
5733 			}
5734 			continue;
5735 		}
5736 
5737 		if (next) {
5738 			btrfs_tree_unlock_rw(next, next_rw_lock);
5739 			free_extent_buffer(next);
5740 		}
5741 
5742 		next = c;
5743 		next_rw_lock = path->locks[level];
5744 		ret = read_block_for_search(NULL, root, path, &next, level,
5745 					    slot, &key, 0);
5746 		if (ret == -EAGAIN)
5747 			goto again;
5748 
5749 		if (ret < 0) {
5750 			btrfs_release_path(path);
5751 			goto done;
5752 		}
5753 
5754 		if (!path->skip_locking) {
5755 			ret = btrfs_try_tree_read_lock(next);
5756 			if (!ret && time_seq) {
5757 				/*
5758 				 * If we don't get the lock, we may be racing
5759 				 * with push_leaf_left, which holds that lock
5760 				 * while itself waiting for the leaf we've
5761 				 * currently locked.  To solve this, we give
5762 				 * up our lock and cycle.
5763 				 */
5764 				free_extent_buffer(next);
5765 				btrfs_release_path(path);
5766 				cond_resched();
5767 				goto again;
5768 			}
5769 			if (!ret) {
5770 				btrfs_set_path_blocking(path);
5771 				btrfs_tree_read_lock(next);
5772 				btrfs_clear_path_blocking(path, next,
5773 							  BTRFS_READ_LOCK);
5774 			}
5775 			next_rw_lock = BTRFS_READ_LOCK;
5776 		}
5777 		break;
5778 	}
5779 	path->slots[level] = slot;
5780 	while (1) {
5781 		level--;
5782 		c = path->nodes[level];
5783 		if (path->locks[level])
5784 			btrfs_tree_unlock_rw(c, path->locks[level]);
5785 
5786 		free_extent_buffer(c);
5787 		path->nodes[level] = next;
5788 		path->slots[level] = 0;
5789 		if (!path->skip_locking)
5790 			path->locks[level] = next_rw_lock;
5791 		if (!level)
5792 			break;
5793 
5794 		ret = read_block_for_search(NULL, root, path, &next, level,
5795 					    0, &key, 0);
5796 		if (ret == -EAGAIN)
5797 			goto again;
5798 
5799 		if (ret < 0) {
5800 			btrfs_release_path(path);
5801 			goto done;
5802 		}
5803 
5804 		if (!path->skip_locking) {
5805 			ret = btrfs_try_tree_read_lock(next);
5806 			if (!ret) {
5807 				btrfs_set_path_blocking(path);
5808 				btrfs_tree_read_lock(next);
5809 				btrfs_clear_path_blocking(path, next,
5810 							  BTRFS_READ_LOCK);
5811 			}
5812 			next_rw_lock = BTRFS_READ_LOCK;
5813 		}
5814 	}
5815 	ret = 0;
5816 done:
5817 	unlock_up(path, 0, 1, 0, NULL);
5818 	path->leave_spinning = old_spinning;
5819 	if (!old_spinning)
5820 		btrfs_set_path_blocking(path);
5821 
5822 	return ret;
5823 }
5824 
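/*
 * With time_seq == 0 the function above behaves exactly like
 * btrfs_next_leaf().  A non-zero time_seq makes the search replay the
 * tree mod log, so the walk sees the tree as it looked at that sequence
 * number; hedged sketch of the call as the backref walking code makes
 * it:
 *
 *	ret = btrfs_next_old_leaf(root, path, time_seq);
 */
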
5825 /*
5826  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5827  * searching until it gets past min_objectid or finds an item of 'type'
5828  *
5829  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5830  */
5831 int btrfs_previous_item(struct btrfs_root *root,
5832 			struct btrfs_path *path, u64 min_objectid,
5833 			int type)
5834 {
5835 	struct btrfs_key found_key;
5836 	struct extent_buffer *leaf;
5837 	u32 nritems;
5838 	int ret;
5839 
5840 	while (1) {
5841 		if (path->slots[0] == 0) {
5842 			btrfs_set_path_blocking(path);
5843 			ret = btrfs_prev_leaf(root, path);
5844 			if (ret != 0)
5845 				return ret;
5846 		} else {
5847 			path->slots[0]--;
5848 		}
5849 		leaf = path->nodes[0];
5850 		nritems = btrfs_header_nritems(leaf);
5851 		if (nritems == 0)
5852 			return 1;
5853 		if (path->slots[0] == nritems)
5854 			path->slots[0]--;
5855 
5856 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5857 		if (found_key.objectid < min_objectid)
5858 			break;
5859 		if (found_key.type == type)
5860 			return 0;
5861 		if (found_key.objectid == min_objectid &&
5862 		    found_key.type < type)
5863 			break;
5864 	}
5865 	return 1;
5866 }
5867 
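/*
 * Usage sketch (hedged, names invented): search past the area of
 * interest, then step backwards to the wanted item type:
 *
 *	key.objectid = bytenr;
 *	key.type = (u8)-1;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	ret = btrfs_previous_item(root, path, bytenr,
 *				  BTRFS_EXTENT_ITEM_KEY);
 *	(0: found, 1: nothing before bytenr, < 0: error)
 */
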
5868 /*
5869  * search in the extent tree to find a previous Metadata/Data extent item
5870  * with a minimum objectid.
5871  *
5872  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5873  */
5874 int btrfs_previous_extent_item(struct btrfs_root *root,
5875 			struct btrfs_path *path, u64 min_objectid)
5876 {
5877 	struct btrfs_key found_key;
5878 	struct extent_buffer *leaf;
5879 	u32 nritems;
5880 	int ret;
5881 
5882 	while (1) {
5883 		if (path->slots[0] == 0) {
5884 			btrfs_set_path_blocking(path);
5885 			ret = btrfs_prev_leaf(root, path);
5886 			if (ret != 0)
5887 				return ret;
5888 		} else {
5889 			path->slots[0]--;
5890 		}
5891 		leaf = path->nodes[0];
5892 		nritems = btrfs_header_nritems(leaf);
5893 		if (nritems == 0)
5894 			return 1;
5895 		if (path->slots[0] == nritems)
5896 			path->slots[0]--;
5897 
5898 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5899 		if (found_key.objectid < min_objectid)
5900 			break;
5901 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5902 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5903 			return 0;
5904 		if (found_key.objectid == min_objectid &&
5905 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5906 			break;
5907 	}
5908 	return 1;
5909 }
5910
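/*
 * Unlike btrfs_previous_item() this accepts either of the two extent
 * item types, which matters once skinny metadata is in use and tree
 * blocks are keyed as BTRFS_METADATA_ITEM_KEY.  Hedged sketch (names
 * invented):
 *
 *	ret = btrfs_previous_extent_item(extent_root, path, 0);
 *	if (ret == 0)
 *		the path points at an EXTENT_ITEM or METADATA_ITEM;
 */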