xref: /openbmc/linux/fs/btrfs/ctree.c (revision e1f7c9ee)
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	return path;
}
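
/*
 * An illustrative sketch of the usual caller pattern for paths (the caller
 * shape and error value below are assumptions, not taken from this file):
 *
 *	struct btrfs_path *path;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	... search or modify the tree using path ...
 *	btrfs_free_path(path);	releases any held locks/refs, then frees
 */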

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
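
/*
 * A minimal sketch of how the blocking/spinning conversion brackets code
 * that may sleep (pattern assumed from callers later in this file):
 *
 *	btrfs_set_path_blocking(p);
 *	... do something that might schedule, e.g. read a block ...
 *	btrfs_clear_path_blocking(p, NULL, 0);
 */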

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here; the root node could be freed
		 * because it was cow'ed while we may not yet see the new
		 * root node, so do the inc_not_zero dance and, if it
		 * doesn't work, synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
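
/*
 * Illustrative sketch (assumed caller pattern, not from this file): the
 * reference returned above must be dropped with free_extent_buffer():
 *
 *	struct extent_buffer *eb;
 *
 *	eb = btrfs_root_node(root);
 *	... use eb; it may no longer be the root of the tree by now ...
 *	free_extent_buffer(eb);
 */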

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of the
 * tree until you end up with a read lock on the root.  A read locked
 * buffer is returned, with a reference held.
 */
static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cowonly roots (everything not a reference counted cow subvolume) just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
	    list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}
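
/*
 * A minimal sketch of the blocker pairing (assumed caller pattern, e.g.
 * backref walking; the zero-initialization detail is the point here):
 *
 *	struct seq_list elem = {};	elem.seq == 0 before the get
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk old tree versions at sequence elem.seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */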

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}

/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}

/*
 * Inverse of tree_mod_dont_log: returns 1 if logging is needed, 0 if it can
 * be omitted. Unlike tree_mod_dont_log, it doesn't acquire any locks.
 */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key(fs_info, eb, slot,
					MOD_LOG_KEY_REPLACE,
					atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop
	 * tree) are only allowed for blocks using full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret)
			return ret;
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(root->fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). this has the index of the *new*
	 * root, making it the very first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		if (tm->index != first_tm->index)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(root, logical, 0);
		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
			free_extent_buffer(old);
			btrfs_warn(root->fs_info,
				"failed to read tree block %llu from get_old_root", logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_test_is_dummy_root(root))
		return 0;

	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot while committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
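
/*
 * Illustrative sketch of the resulting key order (the values are invented
 * for the example): keys sort lexicographically by (objectid, type, offset),
 * so
 *
 *	{ .objectid = 256, .type = 1, .offset = 0 }
 *
 * compares as smaller than
 *
 *	{ .objectid = 256, .type = 1, .offset = 4096 }
 *
 * and btrfs_comp_cpu_keys() returns -1 for that pair.
 */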

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = root->nodesize;
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr, gen);
				if (!cur || !extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}


/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
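
/*
 * Sketch of the return convention, as implemented by generic_bin_search()
 * above (the caller shape is an assumption):
 *
 *	int slot;
 *	int ret = btrfs_bin_search(eb, &key, level, &slot);
 *
 *	ret == 0: key found, slot is its index;
 *	ret == 1: key not found, slot is where it would be inserted.
 */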
1835 
1836 static void root_add_used(struct btrfs_root *root, u32 size)
1837 {
1838 	spin_lock(&root->accounting_lock);
1839 	btrfs_set_root_used(&root->root_item,
1840 			    btrfs_root_used(&root->root_item) + size);
1841 	spin_unlock(&root->accounting_lock);
1842 }
1843 
1844 static void root_sub_used(struct btrfs_root *root, u32 size)
1845 {
1846 	spin_lock(&root->accounting_lock);
1847 	btrfs_set_root_used(&root->root_item,
1848 			    btrfs_root_used(&root->root_item) - size);
1849 	spin_unlock(&root->accounting_lock);
1850 }
1851 
1852 /* given a node and slot number, this reads the block it points to.  The
1853  * extent buffer is returned with a reference taken (but unlocked).
1854  * NULL is returned on error.
1855  */
1856 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1857 				   struct extent_buffer *parent, int slot)
1858 {
1859 	int level = btrfs_header_level(parent);
1860 	struct extent_buffer *eb;
1861 
1862 	if (slot < 0)
1863 		return NULL;
1864 	if (slot >= btrfs_header_nritems(parent))
1865 		return NULL;
1866 
1867 	BUG_ON(level == 0);
1868 
1869 	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1870 			     btrfs_node_ptr_generation(parent, slot));
1871 	if (eb && !extent_buffer_uptodate(eb)) {
1872 		free_extent_buffer(eb);
1873 		eb = NULL;
1874 	}
1875 
1876 	return eb;
1877 }
1878 
1879 /*
1880  * node level balancing, used to make sure nodes are in proper order for
1881  * item deletion.  We balance from the top down, so we have to make sure
1882  * that a deletion won't leave a node completely empty later on.
1883  */
1884 static noinline int balance_level(struct btrfs_trans_handle *trans,
1885 			 struct btrfs_root *root,
1886 			 struct btrfs_path *path, int level)
1887 {
1888 	struct extent_buffer *right = NULL;
1889 	struct extent_buffer *mid;
1890 	struct extent_buffer *left = NULL;
1891 	struct extent_buffer *parent = NULL;
1892 	int ret = 0;
1893 	int wret;
1894 	int pslot;
1895 	int orig_slot = path->slots[level];
1896 	u64 orig_ptr;
1897 
1898 	if (level == 0)
1899 		return 0;
1900 
1901 	mid = path->nodes[level];
1902 
1903 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1904 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1905 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1906 
1907 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1908 
1909 	if (level < BTRFS_MAX_LEVEL - 1) {
1910 		parent = path->nodes[level + 1];
1911 		pslot = path->slots[level + 1];
1912 	}
1913 
1914 	/*
1915 	 * deal with the case where there is only one pointer in the root
1916 	 * by promoting the node below to a root
1917 	 */
1918 	if (!parent) {
1919 		struct extent_buffer *child;
1920 
1921 		if (btrfs_header_nritems(mid) != 1)
1922 			return 0;
1923 
1924 		/* promote the child to a root */
1925 		child = read_node_slot(root, mid, 0);
1926 		if (!child) {
1927 			ret = -EROFS;
1928 			btrfs_std_error(root->fs_info, ret);
1929 			goto enospc;
1930 		}
1931 
1932 		btrfs_tree_lock(child);
1933 		btrfs_set_lock_blocking(child);
1934 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1935 		if (ret) {
1936 			btrfs_tree_unlock(child);
1937 			free_extent_buffer(child);
1938 			goto enospc;
1939 		}
1940 
1941 		tree_mod_log_set_root_pointer(root, child, 1);
1942 		rcu_assign_pointer(root->node, child);
1943 
1944 		add_root_to_dirty_list(root);
1945 		btrfs_tree_unlock(child);
1946 
1947 		path->locks[level] = 0;
1948 		path->nodes[level] = NULL;
1949 		clean_tree_block(trans, root, mid);
1950 		btrfs_tree_unlock(mid);
1951 		/* once for the path */
1952 		free_extent_buffer(mid);
1953 
1954 		root_sub_used(root, mid->len);
1955 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1956 		/* once for the root ptr */
1957 		free_extent_buffer_stale(mid);
1958 		return 0;
1959 	}
1960 	if (btrfs_header_nritems(mid) >
1961 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1962 		return 0;
1963 
1964 	left = read_node_slot(root, parent, pslot - 1);
1965 	if (left) {
1966 		btrfs_tree_lock(left);
1967 		btrfs_set_lock_blocking(left);
1968 		wret = btrfs_cow_block(trans, root, left,
1969 				       parent, pslot - 1, &left);
1970 		if (wret) {
1971 			ret = wret;
1972 			goto enospc;
1973 		}
1974 	}
1975 	right = read_node_slot(root, parent, pslot + 1);
1976 	if (right) {
1977 		btrfs_tree_lock(right);
1978 		btrfs_set_lock_blocking(right);
1979 		wret = btrfs_cow_block(trans, root, right,
1980 				       parent, pslot + 1, &right);
1981 		if (wret) {
1982 			ret = wret;
1983 			goto enospc;
1984 		}
1985 	}
1986 
1987 	/* first, try to make some room in the middle buffer */
1988 	if (left) {
1989 		orig_slot += btrfs_header_nritems(left);
1990 		wret = push_node_left(trans, root, left, mid, 1);
1991 		if (wret < 0)
1992 			ret = wret;
1993 	}
1994 
1995 	/*
1996 	 * then try to empty the right most buffer into the middle
1997 	 */
1998 	if (right) {
1999 		wret = push_node_left(trans, root, mid, right, 1);
2000 		if (wret < 0 && wret != -ENOSPC)
2001 			ret = wret;
2002 		if (btrfs_header_nritems(right) == 0) {
2003 			clean_tree_block(trans, root, right);
2004 			btrfs_tree_unlock(right);
2005 			del_ptr(root, path, level + 1, pslot + 1);
2006 			root_sub_used(root, right->len);
2007 			btrfs_free_tree_block(trans, root, right, 0, 1);
2008 			free_extent_buffer_stale(right);
2009 			right = NULL;
2010 		} else {
2011 			struct btrfs_disk_key right_key;
2012 			btrfs_node_key(right, &right_key, 0);
2013 			tree_mod_log_set_node_key(root->fs_info, parent,
2014 						  pslot + 1, 0);
2015 			btrfs_set_node_key(parent, &right_key, pslot + 1);
2016 			btrfs_mark_buffer_dirty(parent);
2017 		}
2018 	}
2019 	if (btrfs_header_nritems(mid) == 1) {
2020 		/*
2021 		 * we're not allowed to leave a node with one item in the
2022 		 * tree during a delete.  A deletion from lower in the tree
2023 		 * could try to delete the only pointer in this node.
2024 		 * So, pull some keys from the left.
2025 		 * There has to be a left pointer at this point because
2026 		 * otherwise we would have pulled some pointers from the
2027 		 * right
2028 		 */
2029 		if (!left) {
2030 			ret = -EROFS;
2031 			btrfs_std_error(root->fs_info, ret);
2032 			goto enospc;
2033 		}
2034 		wret = balance_node_right(trans, root, mid, left);
2035 		if (wret < 0) {
2036 			ret = wret;
2037 			goto enospc;
2038 		}
2039 		if (wret == 1) {
2040 			wret = push_node_left(trans, root, left, mid, 1);
2041 			if (wret < 0)
2042 				ret = wret;
2043 		}
2044 		BUG_ON(wret == 1);
2045 	}
2046 	if (btrfs_header_nritems(mid) == 0) {
2047 		clean_tree_block(trans, root, mid);
2048 		btrfs_tree_unlock(mid);
2049 		del_ptr(root, path, level + 1, pslot);
2050 		root_sub_used(root, mid->len);
2051 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2052 		free_extent_buffer_stale(mid);
2053 		mid = NULL;
2054 	} else {
2055 		/* update the parent key to reflect our changes */
2056 		struct btrfs_disk_key mid_key;
2057 		btrfs_node_key(mid, &mid_key, 0);
2058 		tree_mod_log_set_node_key(root->fs_info, parent,
2059 					  pslot, 0);
2060 		btrfs_set_node_key(parent, &mid_key, pslot);
2061 		btrfs_mark_buffer_dirty(parent);
2062 	}
2063 
2064 	/* update the path */
2065 	if (left) {
2066 		if (btrfs_header_nritems(left) > orig_slot) {
2067 			extent_buffer_get(left);
2068 			/* left was locked after cow */
2069 			path->nodes[level] = left;
2070 			path->slots[level + 1] -= 1;
2071 			path->slots[level] = orig_slot;
2072 			if (mid) {
2073 				btrfs_tree_unlock(mid);
2074 				free_extent_buffer(mid);
2075 			}
2076 		} else {
2077 			orig_slot -= btrfs_header_nritems(left);
2078 			path->slots[level] = orig_slot;
2079 		}
2080 	}
2081 	/* double check we haven't messed things up */
2082 	if (orig_ptr !=
2083 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2084 		BUG();
2085 enospc:
2086 	if (right) {
2087 		btrfs_tree_unlock(right);
2088 		free_extent_buffer(right);
2089 	}
2090 	if (left) {
2091 		if (path->nodes[level] != left)
2092 			btrfs_tree_unlock(left);
2093 		free_extent_buffer(left);
2094 	}
2095 	return ret;
2096 }
2097 
2098 /* Node balancing for insertion.  Here we only split or push nodes around
2099  * when they are completely full.  This is also done top down, so we
2100  * have to be pessimistic.
2101  */
2102 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2103 					  struct btrfs_root *root,
2104 					  struct btrfs_path *path, int level)
2105 {
2106 	struct extent_buffer *right = NULL;
2107 	struct extent_buffer *mid;
2108 	struct extent_buffer *left = NULL;
2109 	struct extent_buffer *parent = NULL;
2110 	int ret = 0;
2111 	int wret;
2112 	int pslot;
2113 	int orig_slot = path->slots[level];
2114 
2115 	if (level == 0)
2116 		return 1;
2117 
2118 	mid = path->nodes[level];
2119 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2120 
2121 	if (level < BTRFS_MAX_LEVEL - 1) {
2122 		parent = path->nodes[level + 1];
2123 		pslot = path->slots[level + 1];
2124 	}
2125 
2126 	if (!parent)
2127 		return 1;
2128 
2129 	left = read_node_slot(root, parent, pslot - 1);
2130 
2131 	/* first, try to make some room in the middle buffer */
2132 	if (left) {
2133 		u32 left_nr;
2134 
2135 		btrfs_tree_lock(left);
2136 		btrfs_set_lock_blocking(left);
2137 
2138 		left_nr = btrfs_header_nritems(left);
2139 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2140 			wret = 1;
2141 		} else {
2142 			ret = btrfs_cow_block(trans, root, left, parent,
2143 					      pslot - 1, &left);
2144 			if (ret)
2145 				wret = 1;
2146 			else {
2147 				wret = push_node_left(trans, root,
2148 						      left, mid, 0);
2149 			}
2150 		}
2151 		if (wret < 0)
2152 			ret = wret;
2153 		if (wret == 0) {
2154 			struct btrfs_disk_key disk_key;
2155 			orig_slot += left_nr;
2156 			btrfs_node_key(mid, &disk_key, 0);
2157 			tree_mod_log_set_node_key(root->fs_info, parent,
2158 						  pslot, 0);
2159 			btrfs_set_node_key(parent, &disk_key, pslot);
2160 			btrfs_mark_buffer_dirty(parent);
2161 			if (btrfs_header_nritems(left) > orig_slot) {
2162 				path->nodes[level] = left;
2163 				path->slots[level + 1] -= 1;
2164 				path->slots[level] = orig_slot;
2165 				btrfs_tree_unlock(mid);
2166 				free_extent_buffer(mid);
2167 			} else {
2168 				orig_slot -=
2169 					btrfs_header_nritems(left);
2170 				path->slots[level] = orig_slot;
2171 				btrfs_tree_unlock(left);
2172 				free_extent_buffer(left);
2173 			}
2174 			return 0;
2175 		}
2176 		btrfs_tree_unlock(left);
2177 		free_extent_buffer(left);
2178 	}
2179 	right = read_node_slot(root, parent, pslot + 1);
2180 
2181 	/*
2182 	 * then try to empty the right most buffer into the middle
2183 	 */
2184 	if (right) {
2185 		u32 right_nr;
2186 
2187 		btrfs_tree_lock(right);
2188 		btrfs_set_lock_blocking(right);
2189 
2190 		right_nr = btrfs_header_nritems(right);
2191 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2192 			wret = 1;
2193 		} else {
2194 			ret = btrfs_cow_block(trans, root, right,
2195 					      parent, pslot + 1,
2196 					      &right);
2197 			if (ret)
2198 				wret = 1;
2199 			else {
2200 				wret = balance_node_right(trans, root,
2201 							  right, mid);
2202 			}
2203 		}
2204 		if (wret < 0)
2205 			ret = wret;
2206 		if (wret == 0) {
2207 			struct btrfs_disk_key disk_key;
2208 
2209 			btrfs_node_key(right, &disk_key, 0);
2210 			tree_mod_log_set_node_key(root->fs_info, parent,
2211 						  pslot + 1, 0);
2212 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2213 			btrfs_mark_buffer_dirty(parent);
2214 
2215 			if (btrfs_header_nritems(mid) <= orig_slot) {
2216 				path->nodes[level] = right;
2217 				path->slots[level + 1] += 1;
2218 				path->slots[level] = orig_slot -
2219 					btrfs_header_nritems(mid);
2220 				btrfs_tree_unlock(mid);
2221 				free_extent_buffer(mid);
2222 			} else {
2223 				btrfs_tree_unlock(right);
2224 				free_extent_buffer(right);
2225 			}
2226 			return 0;
2227 		}
2228 		btrfs_tree_unlock(right);
2229 		free_extent_buffer(right);
2230 	}
2231 	return 1;
2232 }
2233 
2234 /*
2235  * readahead one full node of leaves, finding things that are close
2236  * to the block in 'slot', and triggering readahead on them.
2237  */
2238 static void reada_for_search(struct btrfs_root *root,
2239 			     struct btrfs_path *path,
2240 			     int level, int slot, u64 objectid)
2241 {
2242 	struct extent_buffer *node;
2243 	struct btrfs_disk_key disk_key;
2244 	u32 nritems;
2245 	u64 search;
2246 	u64 target;
2247 	u64 nread = 0;
2248 	u64 gen;
2249 	int direction = path->reada;
2250 	struct extent_buffer *eb;
2251 	u32 nr;
2252 	u32 blocksize;
2253 	u32 nscan = 0;
2254 
2255 	if (level != 1)
2256 		return;
2257 
2258 	if (!path->nodes[level])
2259 		return;
2260 
2261 	node = path->nodes[level];
2262 
2263 	search = btrfs_node_blockptr(node, slot);
2264 	blocksize = root->nodesize;
2265 	eb = btrfs_find_tree_block(root, search);
2266 	if (eb) {
2267 		free_extent_buffer(eb);
2268 		return;
2269 	}
2270 
2271 	target = search;
2272 
2273 	nritems = btrfs_header_nritems(node);
2274 	nr = slot;
2275 
2276 	while (1) {
2277 		if (direction < 0) {
2278 			if (nr == 0)
2279 				break;
2280 			nr--;
2281 		} else if (direction > 0) {
2282 			nr++;
2283 			if (nr >= nritems)
2284 				break;
2285 		}
2286 		if (path->reada < 0 && objectid) {
2287 			btrfs_node_key(node, &disk_key, nr);
2288 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2289 				break;
2290 		}
2291 		search = btrfs_node_blockptr(node, nr);
2292 		if ((search <= target && target - search <= 65536) ||
2293 		    (search > target && search - target <= 65536)) {
2294 			gen = btrfs_node_ptr_generation(node, nr);
2295 			readahead_tree_block(root, search, blocksize);
2296 			nread += blocksize;
2297 		}
2298 		nscan++;
2299 		if ((nread > 65536 || nscan > 32))
2300 			break;
2301 	}
2302 }
2303 
2304 static noinline void reada_for_balance(struct btrfs_root *root,
2305 				       struct btrfs_path *path, int level)
2306 {
2307 	int slot;
2308 	int nritems;
2309 	struct extent_buffer *parent;
2310 	struct extent_buffer *eb;
2311 	u64 gen;
2312 	u64 block1 = 0;
2313 	u64 block2 = 0;
2314 	int blocksize;
2315 
2316 	parent = path->nodes[level + 1];
2317 	if (!parent)
2318 		return;
2319 
2320 	nritems = btrfs_header_nritems(parent);
2321 	slot = path->slots[level + 1];
2322 	blocksize = root->nodesize;
2323 
2324 	if (slot > 0) {
2325 		block1 = btrfs_node_blockptr(parent, slot - 1);
2326 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2327 		eb = btrfs_find_tree_block(root, block1);
2328 		/*
2329 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2330 		 * don't want to return -EAGAIN here.  That will loop
2331 		 * forever
2332 		 */
2333 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2334 			block1 = 0;
2335 		free_extent_buffer(eb);
2336 	}
2337 	if (slot + 1 < nritems) {
2338 		block2 = btrfs_node_blockptr(parent, slot + 1);
2339 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2340 		eb = btrfs_find_tree_block(root, block2);
2341 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2342 			block2 = 0;
2343 		free_extent_buffer(eb);
2344 	}
2345 
2346 	if (block1)
2347 		readahead_tree_block(root, block1, blocksize);
2348 	if (block2)
2349 		readahead_tree_block(root, block2, blocksize);
2350 }
2351 
2352 
2353 /*
2354  * when we walk down the tree, it is usually safe to unlock the higher layers
2355  * in the tree.  The exceptions are when our path goes through slot 0, because
2356  * operations on the tree might require changing key pointers higher up in the
2357  * tree.
2358  *
2359  * callers might also have set path->keep_locks, which tells this code to keep
2360  * the lock if the path points to the last slot in the block.  This is part of
2361  * walking through the tree, and selecting the next slot in the higher block.
2362  *
2363  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2364  * if lowest_unlock is 1, level 0 won't be unlocked
2365  */
2366 static noinline void unlock_up(struct btrfs_path *path, int level,
2367 			       int lowest_unlock, int min_write_lock_level,
2368 			       int *write_lock_level)
2369 {
2370 	int i;
2371 	int skip_level = level;
2372 	int no_skips = 0;
2373 	struct extent_buffer *t;
2374 
2375 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2376 		if (!path->nodes[i])
2377 			break;
2378 		if (!path->locks[i])
2379 			break;
2380 		if (!no_skips && path->slots[i] == 0) {
2381 			skip_level = i + 1;
2382 			continue;
2383 		}
2384 		if (!no_skips && path->keep_locks) {
2385 			u32 nritems;
2386 			t = path->nodes[i];
2387 			nritems = btrfs_header_nritems(t);
2388 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2389 				skip_level = i + 1;
2390 				continue;
2391 			}
2392 		}
2393 		if (skip_level < i && i >= lowest_unlock)
2394 			no_skips = 1;
2395 
2396 		t = path->nodes[i];
2397 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2398 			btrfs_tree_unlock_rw(t, path->locks[i]);
2399 			path->locks[i] = 0;
2400 			if (write_lock_level &&
2401 			    i > min_write_lock_level &&
2402 			    i <= *write_lock_level) {
2403 				*write_lock_level = i - 1;
2404 			}
2405 		}
2406 	}
2407 }
2408 
2409 /*
2410  * This releases any locks held in the path starting at level and
2411  * going all the way up to the root.
2412  *
2413  * btrfs_search_slot will keep the lock held on higher nodes in a few
2414  * corner cases, such as COW of the block at slot zero in the node.  This
2415  * ignores those rules, and it should only be called when there are no
2416  * more updates to be done higher up in the tree.
2417  */
2418 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2419 {
2420 	int i;
2421 
2422 	if (path->keep_locks)
2423 		return;
2424 
2425 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2426 		if (!path->nodes[i])
2427 			continue;
2428 		if (!path->locks[i])
2429 			continue;
2430 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2431 		path->locks[i] = 0;
2432 	}
2433 }
2434 
2435 /*
2436  * helper function for btrfs_search_slot.  The goal is to find a block
2437  * in cache without setting the path to blocking.  If we find the block
2438  * we return zero and the path is unchanged.
2439  *
2440  * If we can't find the block, we set the path blocking and do some
2441  * readahead.  -EAGAIN is returned and the search must be repeated.
2442  */
2443 static int
2444 read_block_for_search(struct btrfs_trans_handle *trans,
2445 		       struct btrfs_root *root, struct btrfs_path *p,
2446 		       struct extent_buffer **eb_ret, int level, int slot,
2447 		       struct btrfs_key *key, u64 time_seq)
2448 {
2449 	u64 blocknr;
2450 	u64 gen;
2451 	struct extent_buffer *b = *eb_ret;
2452 	struct extent_buffer *tmp;
2453 	int ret;
2454 
2455 	blocknr = btrfs_node_blockptr(b, slot);
2456 	gen = btrfs_node_ptr_generation(b, slot);
2457 
2458 	tmp = btrfs_find_tree_block(root, blocknr);
2459 	if (tmp) {
2460 		/* first we do an atomic uptodate check */
2461 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2462 			*eb_ret = tmp;
2463 			return 0;
2464 		}
2465 
2466 		/* the pages were up to date, but we failed
2467 		 * the generation number check.  Do a full
2468 		 * read for the generation number that is correct.
2469 		 * We must do this without dropping locks so
2470 		 * we can trust our generation number
2471 		 */
2472 		btrfs_set_path_blocking(p);
2473 
2474 		/* now we're allowed to do a blocking uptodate check */
2475 		ret = btrfs_read_buffer(tmp, gen);
2476 		if (!ret) {
2477 			*eb_ret = tmp;
2478 			return 0;
2479 		}
2480 		free_extent_buffer(tmp);
2481 		btrfs_release_path(p);
2482 		return -EIO;
2483 	}
2484 
2485 	/*
2486 	 * reduce lock contention at high levels
2487 	 * of the btree by dropping locks before
2488 	 * we read.  Don't release the lock on the current
2489 	 * level because we need to walk this node to figure
2490 	 * out which blocks to read.
2491 	 */
2492 	btrfs_unlock_up_safe(p, level + 1);
2493 	btrfs_set_path_blocking(p);
2494 
2495 	free_extent_buffer(tmp);
2496 	if (p->reada)
2497 		reada_for_search(root, p, level, slot, key->objectid);
2498 
2499 	btrfs_release_path(p);
2500 
2501 	ret = -EAGAIN;
2502 	tmp = read_tree_block(root, blocknr, 0);
2503 	if (tmp) {
2504 		/*
2505 		 * If the read above didn't mark this buffer up to date,
2506 		 * it will never end up being up to date.  Set ret to EIO now
2507 		 * and give up so that our caller doesn't loop forever
2508 		 * on our EAGAINs.
2509 		 */
2510 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2511 			ret = -EIO;
2512 		free_extent_buffer(tmp);
2513 	}
2514 	return ret;
2515 }
2516 
2517 /*
2518  * helper function for btrfs_search_slot.  This does all of the checks
2519  * for node-level blocks and does any balancing required based on
2520  * the ins_len.
2521  *
2522  * If no extra work was required, zero is returned.  If we had to
2523  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2524  * start over
2525  */
2526 static int
2527 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2528 		       struct btrfs_root *root, struct btrfs_path *p,
2529 		       struct extent_buffer *b, int level, int ins_len,
2530 		       int *write_lock_level)
2531 {
2532 	int ret;
2533 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2534 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2535 		int sret;
2536 
2537 		if (*write_lock_level < level + 1) {
2538 			*write_lock_level = level + 1;
2539 			btrfs_release_path(p);
2540 			goto again;
2541 		}
2542 
2543 		btrfs_set_path_blocking(p);
2544 		reada_for_balance(root, p, level);
2545 		sret = split_node(trans, root, p, level);
2546 		btrfs_clear_path_blocking(p, NULL, 0);
2547 
2548 		BUG_ON(sret > 0);
2549 		if (sret) {
2550 			ret = sret;
2551 			goto done;
2552 		}
2553 		b = p->nodes[level];
2554 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2555 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2556 		int sret;
2557 
2558 		if (*write_lock_level < level + 1) {
2559 			*write_lock_level = level + 1;
2560 			btrfs_release_path(p);
2561 			goto again;
2562 		}
2563 
2564 		btrfs_set_path_blocking(p);
2565 		reada_for_balance(root, p, level);
2566 		sret = balance_level(trans, root, p, level);
2567 		btrfs_clear_path_blocking(p, NULL, 0);
2568 
2569 		if (sret) {
2570 			ret = sret;
2571 			goto done;
2572 		}
2573 		b = p->nodes[level];
2574 		if (!b) {
2575 			btrfs_release_path(p);
2576 			goto again;
2577 		}
2578 		BUG_ON(btrfs_header_nritems(b) == 1);
2579 	}
2580 	return 0;
2581 
2582 again:
2583 	ret = -EAGAIN;
2584 done:
2585 	return ret;
2586 }
2587 
2588 static void key_search_validate(struct extent_buffer *b,
2589 				struct btrfs_key *key,
2590 				int level)
2591 {
2592 #ifdef CONFIG_BTRFS_ASSERT
2593 	struct btrfs_disk_key disk_key;
2594 
2595 	btrfs_cpu_key_to_disk(&disk_key, key);
2596 
2597 	if (level == 0)
2598 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2599 		    offsetof(struct btrfs_leaf, items[0].key),
2600 		    sizeof(disk_key)));
2601 	else
2602 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2603 		    offsetof(struct btrfs_node, ptrs[0].key),
2604 		    sizeof(disk_key)));
2605 #endif
2606 }
2607 
2608 static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2609 		      int level, int *prev_cmp, int *slot)
2610 {
2611 	if (*prev_cmp != 0) {
2612 		*prev_cmp = bin_search(b, key, level, slot);
2613 		return *prev_cmp;
2614 	}
2615 
2616 	key_search_validate(b, key, level);
2617 	*slot = 0;
2618 
2619 	return 0;
2620 }
2621 
2622 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
2623 		u64 iobjectid, u64 ioff, u8 key_type,
2624 		struct btrfs_key *found_key)
2625 {
2626 	int ret;
2627 	struct btrfs_key key;
2628 	struct extent_buffer *eb;
2629 	struct btrfs_path *path;
2630 
2631 	key.type = key_type;
2632 	key.objectid = iobjectid;
2633 	key.offset = ioff;
2634 
2635 	if (found_path == NULL) {
2636 		path = btrfs_alloc_path();
2637 		if (!path)
2638 			return -ENOMEM;
2639 	} else
2640 		path = found_path;
2641 
2642 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2643 	if ((ret < 0) || (found_key == NULL)) {
2644 		if (path != found_path)
2645 			btrfs_free_path(path);
2646 		return ret;
2647 	}
2648 
2649 	eb = path->nodes[0];
2650 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2651 		ret = btrfs_next_leaf(fs_root, path);
2652 		if (ret)
2653 			return ret;
2654 		eb = path->nodes[0];
2655 	}
2656 
2657 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2658 	if (found_key->type != key.type ||
2659 			found_key->objectid != key.objectid)
2660 		return 1;
2661 
2662 	return 0;
2663 }
2664 
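/*
 * A hedged usage sketch for btrfs_find_item (hypothetical, not part of
 * the original file): look up a ROOT_ITEM by objectid with a
 * caller-owned path.
 */
#if 0	/* illustration only */
static int example_find_root_item(struct btrfs_root *tree_root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_find_item(tree_root, path, objectid, 0,
			      BTRFS_ROOT_ITEM_KEY, &found_key);
	/* 0: found_key holds the match, 1: no match, < 0: error */
	btrfs_free_path(path);
	return ret;
}
#endif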
2665 /*
2666  * look for key in the tree.  path is filled in with nodes along the way.
2667  * if the key is found, we return zero and you can find the item in the
2668  * leaf level of the path (level 0)
2669  *
2670  * If the key isn't found, the path points to the slot where it should
2671  * be inserted, and 1 is returned.  If there are other errors during the
2672  * search a negative error number is returned.
2673  *
2674  * if ins_len > 0, nodes and leaves will be split as we walk down the
2675  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2676  * possible)
2677  */
2678 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2679 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
2680 		      ins_len, int cow)
2681 {
2682 	struct extent_buffer *b;
2683 	int slot;
2684 	int ret;
2685 	int err;
2686 	int level;
2687 	int lowest_unlock = 1;
2688 	int root_lock;
2689 	/* everything at write_lock_level or lower must be write locked */
2690 	int write_lock_level = 0;
2691 	u8 lowest_level = 0;
2692 	int min_write_lock_level;
2693 	int prev_cmp;
2694 
2695 	lowest_level = p->lowest_level;
2696 	WARN_ON(lowest_level && ins_len > 0);
2697 	WARN_ON(p->nodes[0] != NULL);
2698 	BUG_ON(!cow && ins_len);
2699 
2700 	if (ins_len < 0) {
2701 		lowest_unlock = 2;
2702 
2703 		/* when we are removing items, we might have to go up to level
2704 		 * two as we update tree pointers.  Make sure we keep write
2705 		 * locks on those levels as well
2706 		 */
2707 		write_lock_level = 2;
2708 	} else if (ins_len > 0) {
2709 		/*
2710 		 * for inserting items, make sure we have a write lock on
2711 		 * level 1 so we can update keys
2712 		 */
2713 		write_lock_level = 1;
2714 	}
2715 
2716 	if (!cow)
2717 		write_lock_level = -1;
2718 
2719 	if (cow && (p->keep_locks || p->lowest_level))
2720 		write_lock_level = BTRFS_MAX_LEVEL;
2721 
2722 	min_write_lock_level = write_lock_level;
2723 
2724 again:
2725 	prev_cmp = -1;
2726 	/*
2727 	 * we try very hard to do read locks on the root
2728 	 */
2729 	root_lock = BTRFS_READ_LOCK;
2730 	level = 0;
2731 	if (p->search_commit_root) {
2732 		/*
2733 		 * the commit roots are read only
2734 		 * so we always do read locks
2735 		 */
2736 		if (p->need_commit_sem)
2737 			down_read(&root->fs_info->commit_root_sem);
2738 		b = root->commit_root;
2739 		extent_buffer_get(b);
2740 		level = btrfs_header_level(b);
2741 		if (p->need_commit_sem)
2742 			up_read(&root->fs_info->commit_root_sem);
2743 		if (!p->skip_locking)
2744 			btrfs_tree_read_lock(b);
2745 	} else {
2746 		if (p->skip_locking) {
2747 			b = btrfs_root_node(root);
2748 			level = btrfs_header_level(b);
2749 		} else {
2750 			/* we don't know the level of the root node
2751 			 * until we actually have it read locked
2752 			 */
2753 			b = btrfs_read_lock_root_node(root);
2754 			level = btrfs_header_level(b);
2755 			if (level <= write_lock_level) {
2756 				/* whoops, must trade for write lock */
2757 				btrfs_tree_read_unlock(b);
2758 				free_extent_buffer(b);
2759 				b = btrfs_lock_root_node(root);
2760 				root_lock = BTRFS_WRITE_LOCK;
2761 
2762 				/* the level might have changed, check again */
2763 				level = btrfs_header_level(b);
2764 			}
2765 		}
2766 	}
2767 	p->nodes[level] = b;
2768 	if (!p->skip_locking)
2769 		p->locks[level] = root_lock;
2770 
2771 	while (b) {
2772 		level = btrfs_header_level(b);
2773 
2774 		/*
2775 		 * setup the path here so we can release it under lock
2776 		 * contention with the cow code
2777 		 */
2778 		if (cow) {
2779 			/*
2780 			 * if we don't really need to cow this block
2781 			 * then we don't want to set the path blocking,
2782 			 * so we test it here
2783 			 */
2784 			if (!should_cow_block(trans, root, b))
2785 				goto cow_done;
2786 
2787 			/*
2788 			 * must have write locks on this node and the
2789 			 * parent
2790 			 */
2791 			if (level > write_lock_level ||
2792 			    (level + 1 > write_lock_level &&
2793 			    level + 1 < BTRFS_MAX_LEVEL &&
2794 			    p->nodes[level + 1])) {
2795 				write_lock_level = level + 1;
2796 				btrfs_release_path(p);
2797 				goto again;
2798 			}
2799 
2800 			btrfs_set_path_blocking(p);
2801 			err = btrfs_cow_block(trans, root, b,
2802 					      p->nodes[level + 1],
2803 					      p->slots[level + 1], &b);
2804 			if (err) {
2805 				ret = err;
2806 				goto done;
2807 			}
2808 		}
2809 cow_done:
2810 		p->nodes[level] = b;
2811 		btrfs_clear_path_blocking(p, NULL, 0);
2812 
2813 		/*
2814 		 * we have a lock on b and as long as we aren't changing
2815 		 * the tree, there is no way for the items in b to change.
2816 		 * It is safe to drop the lock on our parent before we
2817 		 * go through the expensive btree search on b.
2818 		 *
2819 		 * If we're inserting or deleting (ins_len != 0), then we might
2820 		 * be changing slot zero, which may require changing the parent.
2821 		 * So, we can't drop the lock until after we know which slot
2822 		 * we're operating on.
2823 		 */
2824 		if (!ins_len && !p->keep_locks) {
2825 			int u = level + 1;
2826 
2827 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2828 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2829 				p->locks[u] = 0;
2830 			}
2831 		}
2832 
2833 		ret = key_search(b, key, level, &prev_cmp, &slot);
2834 
2835 		if (level != 0) {
2836 			int dec = 0;
2837 			if (ret && slot > 0) {
2838 				dec = 1;
2839 				slot -= 1;
2840 			}
2841 			p->slots[level] = slot;
2842 			err = setup_nodes_for_search(trans, root, p, b, level,
2843 					     ins_len, &write_lock_level);
2844 			if (err == -EAGAIN)
2845 				goto again;
2846 			if (err) {
2847 				ret = err;
2848 				goto done;
2849 			}
2850 			b = p->nodes[level];
2851 			slot = p->slots[level];
2852 
2853 			/*
2854 			 * slot 0 is special, if we change the key
2855 			 * we have to update the parent pointer
2856 			 * which means we must have a write lock
2857 			 * on the parent
2858 			 */
2859 			if (slot == 0 && ins_len &&
2860 			    write_lock_level < level + 1) {
2861 				write_lock_level = level + 1;
2862 				btrfs_release_path(p);
2863 				goto again;
2864 			}
2865 
2866 			unlock_up(p, level, lowest_unlock,
2867 				  min_write_lock_level, &write_lock_level);
2868 
2869 			if (level == lowest_level) {
2870 				if (dec)
2871 					p->slots[level]++;
2872 				goto done;
2873 			}
2874 
2875 			err = read_block_for_search(trans, root, p,
2876 						    &b, level, slot, key, 0);
2877 			if (err == -EAGAIN)
2878 				goto again;
2879 			if (err) {
2880 				ret = err;
2881 				goto done;
2882 			}
2883 
2884 			if (!p->skip_locking) {
2885 				level = btrfs_header_level(b);
2886 				if (level <= write_lock_level) {
2887 					err = btrfs_try_tree_write_lock(b);
2888 					if (!err) {
2889 						btrfs_set_path_blocking(p);
2890 						btrfs_tree_lock(b);
2891 						btrfs_clear_path_blocking(p, b,
2892 								  BTRFS_WRITE_LOCK);
2893 					}
2894 					p->locks[level] = BTRFS_WRITE_LOCK;
2895 				} else {
2896 					err = btrfs_try_tree_read_lock(b);
2897 					if (!err) {
2898 						btrfs_set_path_blocking(p);
2899 						btrfs_tree_read_lock(b);
2900 						btrfs_clear_path_blocking(p, b,
2901 								  BTRFS_READ_LOCK);
2902 					}
2903 					p->locks[level] = BTRFS_READ_LOCK;
2904 				}
2905 				p->nodes[level] = b;
2906 			}
2907 		} else {
2908 			p->slots[level] = slot;
2909 			if (ins_len > 0 &&
2910 			    btrfs_leaf_free_space(root, b) < ins_len) {
2911 				if (write_lock_level < 1) {
2912 					write_lock_level = 1;
2913 					btrfs_release_path(p);
2914 					goto again;
2915 				}
2916 
2917 				btrfs_set_path_blocking(p);
2918 				err = split_leaf(trans, root, key,
2919 						 p, ins_len, ret == 0);
2920 				btrfs_clear_path_blocking(p, NULL, 0);
2921 
2922 				BUG_ON(err > 0);
2923 				if (err) {
2924 					ret = err;
2925 					goto done;
2926 				}
2927 			}
2928 			if (!p->search_for_split)
2929 				unlock_up(p, level, lowest_unlock,
2930 					  min_write_lock_level, &write_lock_level);
2931 			goto done;
2932 		}
2933 	}
2934 	ret = 1;
2935 done:
2936 	/*
2937 	 * we don't really know what they plan on doing with the path
2938 	 * from here on, so for now just mark it as blocking
2939 	 */
2940 	if (!p->leave_spinning)
2941 		btrfs_set_path_blocking(p);
2942 	if (ret < 0)
2943 		btrfs_release_path(p);
2944 	return ret;
2945 }
2946 
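/*
 * A hedged usage sketch for btrfs_search_slot (hypothetical, not part
 * of the original file): a read-only lookup, so trans is NULL and both
 * ins_len and cow are 0.  The key values are made up for the example.
 */
#if 0	/* illustration only */
static int example_lookup(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match at path->nodes[0], path->slots[0] */
	} else if (ret == 1) {
		/* not found, path points at the insertion position */
	}
	btrfs_free_path(path);	/* releases locks and frees the path */
	return ret < 0 ? ret : 0;
}
#endif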
2947 /*
2948  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2949  * current state of the tree together with the operations recorded in the tree
2950  * modification log to search for the key in a previous version of this tree, as
2951  * denoted by the time_seq parameter.
2952  *
2953  * Naturally, there is no support for insert, delete or cow operations.
2954  *
2955  * The resulting path and return value will be set up as if we called
2956  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2957  */
2958 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2959 			  struct btrfs_path *p, u64 time_seq)
2960 {
2961 	struct extent_buffer *b;
2962 	int slot;
2963 	int ret;
2964 	int err;
2965 	int level;
2966 	int lowest_unlock = 1;
2967 	u8 lowest_level = 0;
2968 	int prev_cmp = -1;
2969 
2970 	lowest_level = p->lowest_level;
2971 	WARN_ON(p->nodes[0] != NULL);
2972 
2973 	if (p->search_commit_root) {
2974 		BUG_ON(time_seq);
2975 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2976 	}
2977 
2978 again:
2979 	b = get_old_root(root, time_seq);
2980 	level = btrfs_header_level(b);
2981 	p->locks[level] = BTRFS_READ_LOCK;
2982 
2983 	while (b) {
2984 		level = btrfs_header_level(b);
2985 		p->nodes[level] = b;
2986 		btrfs_clear_path_blocking(p, NULL, 0);
2987 
2988 		/*
2989 		 * we have a lock on b and as long as we aren't changing
2990 		 * the tree, there is no way for the items in b to change.
2991 		 * It is safe to drop the lock on our parent before we
2992 		 * go through the expensive btree search on b.
2993 		 */
2994 		btrfs_unlock_up_safe(p, level + 1);
2995 
2996 		/*
2997 		 * Since we can unwind eb's we want to do a real search every
2998 		 * time.
2999 		 */
3000 		prev_cmp = -1;
3001 		ret = key_search(b, key, level, &prev_cmp, &slot);
3002 
3003 		if (level != 0) {
3004 			int dec = 0;
3005 			if (ret && slot > 0) {
3006 				dec = 1;
3007 				slot -= 1;
3008 			}
3009 			p->slots[level] = slot;
3010 			unlock_up(p, level, lowest_unlock, 0, NULL);
3011 
3012 			if (level == lowest_level) {
3013 				if (dec)
3014 					p->slots[level]++;
3015 				goto done;
3016 			}
3017 
3018 			err = read_block_for_search(NULL, root, p, &b, level,
3019 						    slot, key, time_seq);
3020 			if (err == -EAGAIN)
3021 				goto again;
3022 			if (err) {
3023 				ret = err;
3024 				goto done;
3025 			}
3026 
3027 			level = btrfs_header_level(b);
3028 			err = btrfs_try_tree_read_lock(b);
3029 			if (!err) {
3030 				btrfs_set_path_blocking(p);
3031 				btrfs_tree_read_lock(b);
3032 				btrfs_clear_path_blocking(p, b,
3033 							  BTRFS_READ_LOCK);
3034 			}
3035 			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3036 			if (!b) {
3037 				ret = -ENOMEM;
3038 				goto done;
3039 			}
3040 			p->locks[level] = BTRFS_READ_LOCK;
3041 			p->nodes[level] = b;
3042 		} else {
3043 			p->slots[level] = slot;
3044 			unlock_up(p, level, lowest_unlock, 0, NULL);
3045 			goto done;
3046 		}
3047 	}
3048 	ret = 1;
3049 done:
3050 	if (!p->leave_spinning)
3051 		btrfs_set_path_blocking(p);
3052 	if (ret < 0)
3053 		btrfs_release_path(p);
3054 
3055 	return ret;
3056 }
3057 
3058 /*
3059  * helper to use instead of search slot if no exact match is needed but
3060  * instead the next or previous item should be returned.
3061  * When find_higher is true, the next higher item is returned, the next lower
3062  * otherwise.
3063  * When return_any and find_higher are both true, and no higher item is found,
3064  * return the next lower instead.
3065  * When return_any is true and find_higher is false, and no lower item is found,
3066  * return the next higher instead.
3067  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3068  * < 0 on error
3069  */
3070 int btrfs_search_slot_for_read(struct btrfs_root *root,
3071 			       struct btrfs_key *key, struct btrfs_path *p,
3072 			       int find_higher, int return_any)
3073 {
3074 	int ret;
3075 	struct extent_buffer *leaf;
3076 
3077 again:
3078 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3079 	if (ret <= 0)
3080 		return ret;
3081 	/*
3082 	 * a return value of 1 means the path is at the position where the
3083 	 * item should be inserted. Normally this is the next bigger item,
3084 	 * but in case the previous item is the last in a leaf, path points
3085 	 * to the first free slot in the previous leaf, i.e. at an invalid
3086 	 * item.
3087 	 */
3088 	leaf = p->nodes[0];
3089 
3090 	if (find_higher) {
3091 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3092 			ret = btrfs_next_leaf(root, p);
3093 			if (ret <= 0)
3094 				return ret;
3095 			if (!return_any)
3096 				return 1;
3097 			/*
3098 			 * no higher item found, return the next
3099 			 * lower instead
3100 			 */
3101 			return_any = 0;
3102 			find_higher = 0;
3103 			btrfs_release_path(p);
3104 			goto again;
3105 		}
3106 	} else {
3107 		if (p->slots[0] == 0) {
3108 			ret = btrfs_prev_leaf(root, p);
3109 			if (ret < 0)
3110 				return ret;
3111 			if (!ret) {
3112 				leaf = p->nodes[0];
3113 				if (p->slots[0] == btrfs_header_nritems(leaf))
3114 					p->slots[0]--;
3115 				return 0;
3116 			}
3117 			if (!return_any)
3118 				return 1;
3119 			/*
3120 			 * no lower item found, return the next
3121 			 * higher instead
3122 			 */
3123 			return_any = 0;
3124 			find_higher = 1;
3125 			btrfs_release_path(p);
3126 			goto again;
3127 		} else {
3128 			--p->slots[0];
3129 		}
3130 	}
3131 	return 0;
3132 }
3133 
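/*
 * A hedged usage sketch (hypothetical, not part of the original file):
 * position the path at the first item >= key, falling back to the next
 * lower item when nothing higher exists.
 */
#if 0	/* illustration only */
static int example_position_at_or_after(struct btrfs_root *root,
					struct btrfs_key *key,
					struct btrfs_path *path)
{
	int ret;

	/* find_higher = 1, return_any = 1 */
	ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
	return ret;	/* 1 means the tree was empty */
}
#endif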
3134 /*
3135  * adjust the pointers going up the tree, starting at level
3136  * making sure the right key of each node points to 'key'.
3137  * This is used after shifting pointers to the left, so it stops
3138  * fixing up pointers when a given leaf/node is not in slot 0 of the
3139  * higher levels
3140  *
3141  */
3142 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
3143 			   struct btrfs_disk_key *key, int level)
3144 {
3145 	int i;
3146 	struct extent_buffer *t;
3147 
3148 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3149 		int tslot = path->slots[i];
3150 		if (!path->nodes[i])
3151 			break;
3152 		t = path->nodes[i];
3153 		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
3154 		btrfs_set_node_key(t, key, tslot);
3155 		btrfs_mark_buffer_dirty(path->nodes[i]);
3156 		if (tslot != 0)
3157 			break;
3158 	}
3159 }
3160 
3161 /*
3162  * update item key.
3163  *
3164  * This function isn't completely safe. It's the caller's responsibility
3165  * to ensure that the new key won't break the ordering
3166  */
3167 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3168 			     struct btrfs_key *new_key)
3169 {
3170 	struct btrfs_disk_key disk_key;
3171 	struct extent_buffer *eb;
3172 	int slot;
3173 
3174 	eb = path->nodes[0];
3175 	slot = path->slots[0];
3176 	if (slot > 0) {
3177 		btrfs_item_key(eb, &disk_key, slot - 1);
3178 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3179 	}
3180 	if (slot < btrfs_header_nritems(eb) - 1) {
3181 		btrfs_item_key(eb, &disk_key, slot + 1);
3182 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3183 	}
3184 
3185 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3186 	btrfs_set_item_key(eb, &disk_key, slot);
3187 	btrfs_mark_buffer_dirty(eb);
3188 	if (slot == 0)
3189 		fixup_low_keys(root, path, &disk_key, 1);
3190 }
3191 
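/*
 * A hedged usage sketch (hypothetical, not part of the original file):
 * a typical caller shrinks the offset of the item the path points to,
 * which keeps the leaf ordering intact.  The path is assumed to hold
 * the appropriate locks already.
 */
#if 0	/* illustration only */
static void example_shrink_key(struct btrfs_root *root,
			       struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;	/* must not break the ordering */
	btrfs_set_item_key_safe(root, path, &new_key);
}
#endif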
3192 /*
3193  * try to push data from one node into the next node left in the
3194  * tree.
3195  *
3196  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3197  * error, and > 0 if there was no room in the left hand block.
3198  */
3199 static int push_node_left(struct btrfs_trans_handle *trans,
3200 			  struct btrfs_root *root, struct extent_buffer *dst,
3201 			  struct extent_buffer *src, int empty)
3202 {
3203 	int push_items = 0;
3204 	int src_nritems;
3205 	int dst_nritems;
3206 	int ret = 0;
3207 
3208 	src_nritems = btrfs_header_nritems(src);
3209 	dst_nritems = btrfs_header_nritems(dst);
3210 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3211 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3212 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3213 
3214 	if (!empty && src_nritems <= 8)
3215 		return 1;
3216 
3217 	if (push_items <= 0)
3218 		return 1;
3219 
3220 	if (empty) {
3221 		push_items = min(src_nritems, push_items);
3222 		if (push_items < src_nritems) {
3223 			/* leave at least 8 pointers in the node if
3224 			 * we aren't going to empty it
3225 			 */
3226 			if (src_nritems - push_items < 8) {
3227 				if (push_items <= 8)
3228 					return 1;
3229 				push_items -= 8;
3230 			}
3231 		}
3232 	} else
3233 		push_items = min(src_nritems - 8, push_items);
3234 
3235 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3236 				   push_items);
3237 	if (ret) {
3238 		btrfs_abort_transaction(trans, root, ret);
3239 		return ret;
3240 	}
3241 	copy_extent_buffer(dst, src,
3242 			   btrfs_node_key_ptr_offset(dst_nritems),
3243 			   btrfs_node_key_ptr_offset(0),
3244 			   push_items * sizeof(struct btrfs_key_ptr));
3245 
3246 	if (push_items < src_nritems) {
3247 		/*
3248 		 * don't call tree_mod_log_eb_move here, key removal was already
3249 		 * fully logged by tree_mod_log_eb_copy above.
3250 		 */
3251 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3252 				      btrfs_node_key_ptr_offset(push_items),
3253 				      (src_nritems - push_items) *
3254 				      sizeof(struct btrfs_key_ptr));
3255 	}
3256 	btrfs_set_header_nritems(src, src_nritems - push_items);
3257 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3258 	btrfs_mark_buffer_dirty(src);
3259 	btrfs_mark_buffer_dirty(dst);
3260 
3261 	return ret;
3262 }
3263 
3264 /*
3265  * try to push data from one node into the next node right in the
3266  * tree.
3267  *
3268  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3269  * error, and > 0 if there was no room in the right hand block.
3270  *
3271  * this will only push up to 1/2 the contents of the left node over
3272  */
3273 static int balance_node_right(struct btrfs_trans_handle *trans,
3274 			      struct btrfs_root *root,
3275 			      struct extent_buffer *dst,
3276 			      struct extent_buffer *src)
3277 {
3278 	int push_items = 0;
3279 	int max_push;
3280 	int src_nritems;
3281 	int dst_nritems;
3282 	int ret = 0;
3283 
3284 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3285 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3286 
3287 	src_nritems = btrfs_header_nritems(src);
3288 	dst_nritems = btrfs_header_nritems(dst);
3289 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3290 	if (push_items <= 0)
3291 		return 1;
3292 
3293 	if (src_nritems < 4)
3294 		return 1;
3295 
3296 	max_push = src_nritems / 2 + 1;
3297 	/* don't try to empty the node */
3298 	if (max_push >= src_nritems)
3299 		return 1;
3300 
3301 	if (max_push < push_items)
3302 		push_items = max_push;
3303 
3304 	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3305 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3306 				      btrfs_node_key_ptr_offset(0),
3307 				      (dst_nritems) *
3308 				      sizeof(struct btrfs_key_ptr));
3309 
3310 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3311 				   src_nritems - push_items, push_items);
3312 	if (ret) {
3313 		btrfs_abort_transaction(trans, root, ret);
3314 		return ret;
3315 	}
3316 	copy_extent_buffer(dst, src,
3317 			   btrfs_node_key_ptr_offset(0),
3318 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3319 			   push_items * sizeof(struct btrfs_key_ptr));
3320 
3321 	btrfs_set_header_nritems(src, src_nritems - push_items);
3322 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3323 
3324 	btrfs_mark_buffer_dirty(src);
3325 	btrfs_mark_buffer_dirty(dst);
3326 
3327 	return ret;
3328 }
3329 
3330 /*
3331  * helper function to insert a new root level in the tree.
3332  * A new node is allocated, and a single item is inserted to
3333  * point to the existing root
3334  *
3335  * returns zero on success or < 0 on failure.
3336  */
3337 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3338 			   struct btrfs_root *root,
3339 			   struct btrfs_path *path, int level)
3340 {
3341 	u64 lower_gen;
3342 	struct extent_buffer *lower;
3343 	struct extent_buffer *c;
3344 	struct extent_buffer *old;
3345 	struct btrfs_disk_key lower_key;
3346 
3347 	BUG_ON(path->nodes[level]);
3348 	BUG_ON(path->nodes[level-1] != root->node);
3349 
3350 	lower = path->nodes[level-1];
3351 	if (level == 1)
3352 		btrfs_item_key(lower, &lower_key, 0);
3353 	else
3354 		btrfs_node_key(lower, &lower_key, 0);
3355 
3356 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3357 				   &lower_key, level, root->node->start, 0);
3358 	if (IS_ERR(c))
3359 		return PTR_ERR(c);
3360 
3361 	root_add_used(root, root->nodesize);
3362 
3363 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3364 	btrfs_set_header_nritems(c, 1);
3365 	btrfs_set_header_level(c, level);
3366 	btrfs_set_header_bytenr(c, c->start);
3367 	btrfs_set_header_generation(c, trans->transid);
3368 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3369 	btrfs_set_header_owner(c, root->root_key.objectid);
3370 
3371 	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3372 			    BTRFS_FSID_SIZE);
3373 
3374 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3375 			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3376 
3377 	btrfs_set_node_key(c, &lower_key, 0);
3378 	btrfs_set_node_blockptr(c, 0, lower->start);
3379 	lower_gen = btrfs_header_generation(lower);
3380 	WARN_ON(lower_gen != trans->transid);
3381 
3382 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3383 
3384 	btrfs_mark_buffer_dirty(c);
3385 
3386 	old = root->node;
3387 	tree_mod_log_set_root_pointer(root, c, 0);
3388 	rcu_assign_pointer(root->node, c);
3389 
3390 	/* the super has an extra ref to root->node */
3391 	free_extent_buffer(old);
3392 
3393 	add_root_to_dirty_list(root);
3394 	extent_buffer_get(c);
3395 	path->nodes[level] = c;
3396 	path->locks[level] = BTRFS_WRITE_LOCK;
3397 	path->slots[level] = 0;
3398 	return 0;
3399 }
3400 
3401 /*
3402  * worker function to insert a single pointer in a node.
3403  * the node should have enough room for the pointer already
3404  *
3405  * slot and level indicate where you want the key to go, and
3406  * blocknr is the block the key points to.
3407  */
3408 static void insert_ptr(struct btrfs_trans_handle *trans,
3409 		       struct btrfs_root *root, struct btrfs_path *path,
3410 		       struct btrfs_disk_key *key, u64 bytenr,
3411 		       int slot, int level)
3412 {
3413 	struct extent_buffer *lower;
3414 	int nritems;
3415 	int ret;
3416 
3417 	BUG_ON(!path->nodes[level]);
3418 	btrfs_assert_tree_locked(path->nodes[level]);
3419 	lower = path->nodes[level];
3420 	nritems = btrfs_header_nritems(lower);
3421 	BUG_ON(slot > nritems);
3422 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3423 	if (slot != nritems) {
3424 		if (level)
3425 			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3426 					     slot, nritems - slot);
3427 		memmove_extent_buffer(lower,
3428 			      btrfs_node_key_ptr_offset(slot + 1),
3429 			      btrfs_node_key_ptr_offset(slot),
3430 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3431 	}
3432 	if (level) {
3433 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3434 					      MOD_LOG_KEY_ADD, GFP_NOFS);
3435 		BUG_ON(ret < 0);
3436 	}
3437 	btrfs_set_node_key(lower, key, slot);
3438 	btrfs_set_node_blockptr(lower, slot, bytenr);
3439 	WARN_ON(trans->transid == 0);
3440 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3441 	btrfs_set_header_nritems(lower, nritems + 1);
3442 	btrfs_mark_buffer_dirty(lower);
3443 }
3444 
3445 /*
3446  * split the node at the specified level in path in two.
3447  * The path is corrected to point to the appropriate node after the split
3448  *
3449  * Before splitting this tries to make some room in the node by pushing
3450  * left and right, if either one works, it returns right away.
3451  *
3452  * returns 0 on success and < 0 on failure
3453  */
3454 static noinline int split_node(struct btrfs_trans_handle *trans,
3455 			       struct btrfs_root *root,
3456 			       struct btrfs_path *path, int level)
3457 {
3458 	struct extent_buffer *c;
3459 	struct extent_buffer *split;
3460 	struct btrfs_disk_key disk_key;
3461 	int mid;
3462 	int ret;
3463 	u32 c_nritems;
3464 
3465 	c = path->nodes[level];
3466 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3467 	if (c == root->node) {
3468 		/*
3469 		 * trying to split the root, let's make a new one
3470 		 *
3471 		 * tree mod log: We don't log removal of the old root in
3472 		 * insert_new_root, because that root buffer will be kept as a
3473 		 * normal node. We are going to log removal of half of the
3474 		 * elements below with tree_mod_log_eb_copy. We're holding a
3475 		 * tree lock on the buffer, which is why we cannot race with
3476 		 * other tree_mod_log users.
3477 		 */
3478 		ret = insert_new_root(trans, root, path, level + 1);
3479 		if (ret)
3480 			return ret;
3481 	} else {
3482 		ret = push_nodes_for_insert(trans, root, path, level);
3483 		c = path->nodes[level];
3484 		if (!ret && btrfs_header_nritems(c) <
3485 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3486 			return 0;
3487 		if (ret < 0)
3488 			return ret;
3489 	}
3490 
3491 	c_nritems = btrfs_header_nritems(c);
3492 	mid = (c_nritems + 1) / 2;
3493 	btrfs_node_key(c, &disk_key, mid);
3494 
3495 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3496 			&disk_key, level, c->start, 0);
3497 	if (IS_ERR(split))
3498 		return PTR_ERR(split);
3499 
3500 	root_add_used(root, root->nodesize);
3501 
3502 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3503 	btrfs_set_header_level(split, btrfs_header_level(c));
3504 	btrfs_set_header_bytenr(split, split->start);
3505 	btrfs_set_header_generation(split, trans->transid);
3506 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3507 	btrfs_set_header_owner(split, root->root_key.objectid);
3508 	write_extent_buffer(split, root->fs_info->fsid,
3509 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
3510 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3511 			    btrfs_header_chunk_tree_uuid(split),
3512 			    BTRFS_UUID_SIZE);
3513 
3514 	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3515 				   mid, c_nritems - mid);
3516 	if (ret) {
3517 		btrfs_abort_transaction(trans, root, ret);
3518 		return ret;
3519 	}
3520 	copy_extent_buffer(split, c,
3521 			   btrfs_node_key_ptr_offset(0),
3522 			   btrfs_node_key_ptr_offset(mid),
3523 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3524 	btrfs_set_header_nritems(split, c_nritems - mid);
3525 	btrfs_set_header_nritems(c, mid);
3526 	ret = 0;
3527 
3528 	btrfs_mark_buffer_dirty(c);
3529 	btrfs_mark_buffer_dirty(split);
3530 
3531 	insert_ptr(trans, root, path, &disk_key, split->start,
3532 		   path->slots[level + 1] + 1, level + 1);
3533 
3534 	if (path->slots[level] >= mid) {
3535 		path->slots[level] -= mid;
3536 		btrfs_tree_unlock(c);
3537 		free_extent_buffer(c);
3538 		path->nodes[level] = split;
3539 		path->slots[level + 1] += 1;
3540 	} else {
3541 		btrfs_tree_unlock(split);
3542 		free_extent_buffer(split);
3543 	}
3544 	return ret;
3545 }
3546 
3547 /*
3548  * how many bytes are required to store the items in a leaf.  start
3549  * and nr indicate which items in the leaf to check.  This totals up the
3550  * space used both by the item structs and the item data
3551  */
3552 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3553 {
3554 	struct btrfs_item *start_item;
3555 	struct btrfs_item *end_item;
3556 	struct btrfs_map_token token;
3557 	int data_len;
3558 	int nritems = btrfs_header_nritems(l);
3559 	int end = min(nritems, start + nr) - 1;
3560 
3561 	if (!nr)
3562 		return 0;
3563 	btrfs_init_map_token(&token);
3564 	start_item = btrfs_item_nr(start);
3565 	end_item = btrfs_item_nr(end);
3566 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3567 		btrfs_token_item_size(l, start_item, &token);
3568 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3569 	data_len += sizeof(struct btrfs_item) * nr;
3570 	WARN_ON(data_len < 0);
3571 	return data_len;
3572 }
3573 
3574 /*
3575  * The space between the end of the leaf items and
3576  * the start of the leaf data.  IOW, how much room
3577  * the leaf has left for both items and data
3578  */
3579 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3580 				   struct extent_buffer *leaf)
3581 {
3582 	int nritems = btrfs_header_nritems(leaf);
3583 	int ret;
3584 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3585 	if (ret < 0) {
3586 		btrfs_crit(root->fs_info,
3587 			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3588 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3589 		       leaf_space_used(leaf, 0, nritems), nritems);
3590 	}
3591 	return ret;
3592 }
3593 
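/*
 * A hedged sketch (hypothetical, not part of the original file): the
 * usual caller pattern compares the free space against the bytes an
 * insertion needs, i.e. the item header plus the item data.
 */
#if 0	/* illustration only */
static bool example_leaf_fits(struct btrfs_root *root,
			      struct extent_buffer *leaf, u32 data_size)
{
	return btrfs_leaf_free_space(root, leaf) >=
	       (int)(sizeof(struct btrfs_item) + data_size);
}
#endif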
3594 /*
3595  * min slot controls the lowest index we're willing to push to the
3596  * right.  We'll push up to and including min_slot, but no lower
3597  */
3598 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3599 				      struct btrfs_root *root,
3600 				      struct btrfs_path *path,
3601 				      int data_size, int empty,
3602 				      struct extent_buffer *right,
3603 				      int free_space, u32 left_nritems,
3604 				      u32 min_slot)
3605 {
3606 	struct extent_buffer *left = path->nodes[0];
3607 	struct extent_buffer *upper = path->nodes[1];
3608 	struct btrfs_map_token token;
3609 	struct btrfs_disk_key disk_key;
3610 	int slot;
3611 	u32 i;
3612 	int push_space = 0;
3613 	int push_items = 0;
3614 	struct btrfs_item *item;
3615 	u32 nr;
3616 	u32 right_nritems;
3617 	u32 data_end;
3618 	u32 this_item_size;
3619 
3620 	btrfs_init_map_token(&token);
3621 
3622 	if (empty)
3623 		nr = 0;
3624 	else
3625 		nr = max_t(u32, 1, min_slot);
3626 
3627 	if (path->slots[0] >= left_nritems)
3628 		push_space += data_size;
3629 
3630 	slot = path->slots[1];
3631 	i = left_nritems - 1;
3632 	while (i >= nr) {
3633 		item = btrfs_item_nr(i);
3634 
3635 		if (!empty && push_items > 0) {
3636 			if (path->slots[0] > i)
3637 				break;
3638 			if (path->slots[0] == i) {
3639 				int space = btrfs_leaf_free_space(root, left);
3640 				if (space + push_space * 2 > free_space)
3641 					break;
3642 			}
3643 		}
3644 
3645 		if (path->slots[0] == i)
3646 			push_space += data_size;
3647 
3648 		this_item_size = btrfs_item_size(left, item);
3649 		if (this_item_size + sizeof(*item) + push_space > free_space)
3650 			break;
3651 
3652 		push_items++;
3653 		push_space += this_item_size + sizeof(*item);
3654 		if (i == 0)
3655 			break;
3656 		i--;
3657 	}
3658 
3659 	if (push_items == 0)
3660 		goto out_unlock;
3661 
3662 	WARN_ON(!empty && push_items == left_nritems);
3663 
3664 	/* push left to right */
3665 	right_nritems = btrfs_header_nritems(right);
3666 
3667 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3668 	push_space -= leaf_data_end(root, left);
3669 
3670 	/* make room in the right data area */
3671 	data_end = leaf_data_end(root, right);
3672 	memmove_extent_buffer(right,
3673 			      btrfs_leaf_data(right) + data_end - push_space,
3674 			      btrfs_leaf_data(right) + data_end,
3675 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
3676 
3677 	/* copy from the left data area */
3678 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3679 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3680 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
3681 		     push_space);
3682 
3683 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3684 			      btrfs_item_nr_offset(0),
3685 			      right_nritems * sizeof(struct btrfs_item));
3686 
3687 	/* copy the items from left to right */
3688 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3689 		   btrfs_item_nr_offset(left_nritems - push_items),
3690 		   push_items * sizeof(struct btrfs_item));
3691 
3692 	/* update the item pointers */
3693 	right_nritems += push_items;
3694 	btrfs_set_header_nritems(right, right_nritems);
3695 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3696 	for (i = 0; i < right_nritems; i++) {
3697 		item = btrfs_item_nr(i);
3698 		push_space -= btrfs_token_item_size(right, item, &token);
3699 		btrfs_set_token_item_offset(right, item, push_space, &token);
3700 	}
3701 
3702 	left_nritems -= push_items;
3703 	btrfs_set_header_nritems(left, left_nritems);
3704 
3705 	if (left_nritems)
3706 		btrfs_mark_buffer_dirty(left);
3707 	else
3708 		clean_tree_block(trans, root, left);
3709 
3710 	btrfs_mark_buffer_dirty(right);
3711 
3712 	btrfs_item_key(right, &disk_key, 0);
3713 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3714 	btrfs_mark_buffer_dirty(upper);
3715 
3716 	/* then fixup the leaf pointer in the path */
3717 	if (path->slots[0] >= left_nritems) {
3718 		path->slots[0] -= left_nritems;
3719 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3720 			clean_tree_block(trans, root, path->nodes[0]);
3721 		btrfs_tree_unlock(path->nodes[0]);
3722 		free_extent_buffer(path->nodes[0]);
3723 		path->nodes[0] = right;
3724 		path->slots[1] += 1;
3725 	} else {
3726 		btrfs_tree_unlock(right);
3727 		free_extent_buffer(right);
3728 	}
3729 	return 0;
3730 
3731 out_unlock:
3732 	btrfs_tree_unlock(right);
3733 	free_extent_buffer(right);
3734 	return 1;
3735 }
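
/*
 * Worked example (illustrative): with left_nritems == 10, empty == 0
 * and min_slot == 4, the loop above walks i = 9, 8, ..., 4 and then
 * stops, so at most items 4..9 migrate to the right leaf while items
 * 0..3 always stay put.  With 'empty' set, nr drops to 0 and the
 * whole leaf may be drained.
 */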
3736 
3737 /*
3738  * push some data in the path leaf to the right, trying to free up at
3739  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3740  *
3741  * returns 1 if the push failed because the other node didn't have enough
3742  * room, 0 if everything worked out and < 0 if there were major errors.
3743  *
3744  * this will push starting from min_slot to the end of the leaf.  It won't
3745  * push any slot lower than min_slot
3746  */
3747 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3748 			   *root, struct btrfs_path *path,
3749 			   int min_data_size, int data_size,
3750 			   int empty, u32 min_slot)
3751 {
3752 	struct extent_buffer *left = path->nodes[0];
3753 	struct extent_buffer *right;
3754 	struct extent_buffer *upper;
3755 	int slot;
3756 	int free_space;
3757 	u32 left_nritems;
3758 	int ret;
3759 
3760 	if (!path->nodes[1])
3761 		return 1;
3762 
3763 	slot = path->slots[1];
3764 	upper = path->nodes[1];
3765 	if (slot >= btrfs_header_nritems(upper) - 1)
3766 		return 1;
3767 
3768 	btrfs_assert_tree_locked(path->nodes[1]);
3769 
3770 	right = read_node_slot(root, upper, slot + 1);
3771 	if (right == NULL)
3772 		return 1;
3773 
3774 	btrfs_tree_lock(right);
3775 	btrfs_set_lock_blocking(right);
3776 
3777 	free_space = btrfs_leaf_free_space(root, right);
3778 	if (free_space < data_size)
3779 		goto out_unlock;
3780 
3781 	/* cow and double check */
3782 	ret = btrfs_cow_block(trans, root, right, upper,
3783 			      slot + 1, &right);
3784 	if (ret)
3785 		goto out_unlock;
3786 
3787 	free_space = btrfs_leaf_free_space(root, right);
3788 	if (free_space < data_size)
3789 		goto out_unlock;
3790 
3791 	left_nritems = btrfs_header_nritems(left);
3792 	if (left_nritems == 0)
3793 		goto out_unlock;
3794 
3795 	if (path->slots[0] == left_nritems && !empty) {
3796 		/* Key greater than all keys in the leaf, right neighbor has
3797 		 * enough room for it and we're not emptying our leaf to delete
3798 		 * it, therefore use right neighbor to insert the new item and
3799 		 * no need to touch/dirty our left leaf. */
3800 		btrfs_tree_unlock(left);
3801 		free_extent_buffer(left);
3802 		path->nodes[0] = right;
3803 		path->slots[0] = 0;
3804 		path->slots[1]++;
3805 		return 0;
3806 	}
3807 
3808 	return __push_leaf_right(trans, root, path, min_data_size, empty,
3809 				right, free_space, left_nritems, min_slot);
3810 out_unlock:
3811 	btrfs_tree_unlock(right);
3812 	free_extent_buffer(right);
3813 	return 1;
3814 }
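
/*
 * Hypothetical caller sketch mirroring split_leaf() below (the helper
 * name try_push_left is illustrative):
 *
 *	wret = push_leaf_right(trans, root, path, 1, data_size, 0, 0);
 *	if (wret < 0)
 *		return wret;		// hard error, e.g. from the COW
 *	if (wret)
 *		wret = try_push_left();	// 1: right neighbor had no room
 *	// on 0 the push worked; callers still re-check the free space
 */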
3815 
3816 /*
3817  * push some data in the path leaf to the left, trying to free up at
3818  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3819  *
3820  * max_slot can put a limit on how far into the leaf we'll push items.  The
3821  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3822  * items
3823  */
3824 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3825 				     struct btrfs_root *root,
3826 				     struct btrfs_path *path, int data_size,
3827 				     int empty, struct extent_buffer *left,
3828 				     int free_space, u32 right_nritems,
3829 				     u32 max_slot)
3830 {
3831 	struct btrfs_disk_key disk_key;
3832 	struct extent_buffer *right = path->nodes[0];
3833 	int i;
3834 	int push_space = 0;
3835 	int push_items = 0;
3836 	struct btrfs_item *item;
3837 	u32 old_left_nritems;
3838 	u32 nr;
3839 	int ret = 0;
3840 	u32 this_item_size;
3841 	u32 old_left_item_size;
3842 	struct btrfs_map_token token;
3843 
3844 	btrfs_init_map_token(&token);
3845 
3846 	if (empty)
3847 		nr = min(right_nritems, max_slot);
3848 	else
3849 		nr = min(right_nritems - 1, max_slot);
3850 
3851 	for (i = 0; i < nr; i++) {
3852 		item = btrfs_item_nr(i);
3853 
3854 		if (!empty && push_items > 0) {
3855 			if (path->slots[0] < i)
3856 				break;
3857 			if (path->slots[0] == i) {
3858 				int space = btrfs_leaf_free_space(root, right);
3859 				if (space + push_space * 2 > free_space)
3860 					break;
3861 			}
3862 		}
3863 
3864 		if (path->slots[0] == i)
3865 			push_space += data_size;
3866 
3867 		this_item_size = btrfs_item_size(right, item);
3868 		if (this_item_size + sizeof(*item) + push_space > free_space)
3869 			break;
3870 
3871 		push_items++;
3872 		push_space += this_item_size + sizeof(*item);
3873 	}
3874 
3875 	if (push_items == 0) {
3876 		ret = 1;
3877 		goto out;
3878 	}
3879 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3880 
3881 	/* push data from right to left */
3882 	copy_extent_buffer(left, right,
3883 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3884 			   btrfs_item_nr_offset(0),
3885 			   push_items * sizeof(struct btrfs_item));
3886 
3887 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
3888 		     btrfs_item_offset_nr(right, push_items - 1);
3889 
3890 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3891 		     leaf_data_end(root, left) - push_space,
3892 		     btrfs_leaf_data(right) +
3893 		     btrfs_item_offset_nr(right, push_items - 1),
3894 		     push_space);
3895 	old_left_nritems = btrfs_header_nritems(left);
3896 	BUG_ON(old_left_nritems <= 0);
3897 
3898 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3899 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3900 		u32 ioff;
3901 
3902 		item = btrfs_item_nr(i);
3903 
3904 		ioff = btrfs_token_item_offset(left, item, &token);
3905 		btrfs_set_token_item_offset(left, item,
3906 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3907 		      &token);
3908 	}
3909 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3910 
3911 	/* fixup right node */
3912 	if (push_items > right_nritems)
3913 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3914 		       right_nritems);
3915 
3916 	if (push_items < right_nritems) {
3917 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3918 						  leaf_data_end(root, right);
3919 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
3920 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3921 				      btrfs_leaf_data(right) +
3922 				      leaf_data_end(root, right), push_space);
3923 
3924 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3925 			      btrfs_item_nr_offset(push_items),
3926 			     (btrfs_header_nritems(right) - push_items) *
3927 			     sizeof(struct btrfs_item));
3928 	}
3929 	right_nritems -= push_items;
3930 	btrfs_set_header_nritems(right, right_nritems);
3931 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3932 	for (i = 0; i < right_nritems; i++) {
3933 		item = btrfs_item_nr(i);
3934 
3935 		push_space = push_space - btrfs_token_item_size(right,
3936 								item, &token);
3937 		btrfs_set_token_item_offset(right, item, push_space, &token);
3938 	}
3939 
3940 	btrfs_mark_buffer_dirty(left);
3941 	if (right_nritems)
3942 		btrfs_mark_buffer_dirty(right);
3943 	else
3944 		clean_tree_block(trans, root, right);
3945 
3946 	btrfs_item_key(right, &disk_key, 0);
3947 	fixup_low_keys(root, path, &disk_key, 1);
3948 
3949 	/* then fixup the leaf pointer in the path */
3950 	if (path->slots[0] < push_items) {
3951 		path->slots[0] += old_left_nritems;
3952 		btrfs_tree_unlock(path->nodes[0]);
3953 		free_extent_buffer(path->nodes[0]);
3954 		path->nodes[0] = left;
3955 		path->slots[1] -= 1;
3956 	} else {
3957 		btrfs_tree_unlock(left);
3958 		free_extent_buffer(left);
3959 		path->slots[0] -= push_items;
3960 	}
3961 	BUG_ON(path->slots[0] < 0);
3962 	return ret;
3963 out:
3964 	btrfs_tree_unlock(left);
3965 	free_extent_buffer(left);
3966 	return ret;
3967 }
3968 
3969 /*
3970  * push some data in the path leaf to the left, trying to free up at
3971  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3972  *
3973  * max_slot can put a limit on how far into the leaf we'll push items.  The
3974  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3975  * items
3976  */
3977 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3978 			  *root, struct btrfs_path *path, int min_data_size,
3979 			  int data_size, int empty, u32 max_slot)
3980 {
3981 	struct extent_buffer *right = path->nodes[0];
3982 	struct extent_buffer *left;
3983 	int slot;
3984 	int free_space;
3985 	u32 right_nritems;
3986 	int ret = 0;
3987 
3988 	slot = path->slots[1];
3989 	if (slot == 0)
3990 		return 1;
3991 	if (!path->nodes[1])
3992 		return 1;
3993 
3994 	right_nritems = btrfs_header_nritems(right);
3995 	if (right_nritems == 0)
3996 		return 1;
3997 
3998 	btrfs_assert_tree_locked(path->nodes[1]);
3999 
4000 	left = read_node_slot(root, path->nodes[1], slot - 1);
4001 	if (left == NULL)
4002 		return 1;
4003 
4004 	btrfs_tree_lock(left);
4005 	btrfs_set_lock_blocking(left);
4006 
4007 	free_space = btrfs_leaf_free_space(root, left);
4008 	if (free_space < data_size) {
4009 		ret = 1;
4010 		goto out;
4011 	}
4012 
4013 	/* cow and double check */
4014 	ret = btrfs_cow_block(trans, root, left,
4015 			      path->nodes[1], slot - 1, &left);
4016 	if (ret) {
4017 		/* we hit -ENOSPC, but it isn't fatal here */
4018 		if (ret == -ENOSPC)
4019 			ret = 1;
4020 		goto out;
4021 	}
4022 
4023 	free_space = btrfs_leaf_free_space(root, left);
4024 	if (free_space < data_size) {
4025 		ret = 1;
4026 		goto out;
4027 	}
4028 
4029 	return __push_leaf_left(trans, root, path, min_data_size,
4030 			       empty, left, free_space, right_nritems,
4031 			       max_slot);
4032 out:
4033 	btrfs_tree_unlock(left);
4034 	free_extent_buffer(left);
4035 	return ret;
4036 }
4037 
4038 /*
4039  * split the path's leaf in two, making sure there is at least data_size
4040  * available for the resulting leaf level of the path.
4041  */
4042 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4043 				    struct btrfs_root *root,
4044 				    struct btrfs_path *path,
4045 				    struct extent_buffer *l,
4046 				    struct extent_buffer *right,
4047 				    int slot, int mid, int nritems)
4048 {
4049 	int data_copy_size;
4050 	int rt_data_off;
4051 	int i;
4052 	struct btrfs_disk_key disk_key;
4053 	struct btrfs_map_token token;
4054 
4055 	btrfs_init_map_token(&token);
4056 
4057 	nritems = nritems - mid;
4058 	btrfs_set_header_nritems(right, nritems);
4059 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4060 
4061 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4062 			   btrfs_item_nr_offset(mid),
4063 			   nritems * sizeof(struct btrfs_item));
4064 
4065 	copy_extent_buffer(right, l,
4066 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4067 		     data_copy_size, btrfs_leaf_data(l) +
4068 		     leaf_data_end(root, l), data_copy_size);
4069 
4070 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4071 		      btrfs_item_end_nr(l, mid);
4072 
4073 	for (i = 0; i < nritems; i++) {
4074 		struct btrfs_item *item = btrfs_item_nr(i);
4075 		u32 ioff;
4076 
4077 		ioff = btrfs_token_item_offset(right, item, &token);
4078 		btrfs_set_token_item_offset(right, item,
4079 					    ioff + rt_data_off, &token);
4080 	}
4081 
4082 	btrfs_set_header_nritems(l, mid);
4083 	btrfs_item_key(right, &disk_key, 0);
4084 	insert_ptr(trans, root, path, &disk_key, right->start,
4085 		   path->slots[1] + 1, 1);
4086 
4087 	btrfs_mark_buffer_dirty(right);
4088 	btrfs_mark_buffer_dirty(l);
4089 	BUG_ON(path->slots[0] != slot);
4090 
4091 	if (mid <= slot) {
4092 		btrfs_tree_unlock(path->nodes[0]);
4093 		free_extent_buffer(path->nodes[0]);
4094 		path->nodes[0] = right;
4095 		path->slots[0] -= mid;
4096 		path->slots[1] += 1;
4097 	} else {
4098 		btrfs_tree_unlock(right);
4099 		free_extent_buffer(right);
4100 	}
4101 
4102 	BUG_ON(path->slots[0] < 0);
4103 }
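
/*
 * Illustrative note on the offset fixup above: the copied items keep
 * offsets relative to the old leaf, so adding rt_data_off rebases
 * them until the last copied datum once again ends flush with the end
 * of the new leaf's data area.
 */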
4104 
4105 /*
4106  * double splits happen when we need to insert a big item in the middle
4107  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4108  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4109  *          A                 B                 C
4110  *
4111  * We avoid this by trying to push the items on either side of our target
4112  * into the adjacent leaves.  If all goes well we can avoid the double split
4113  * completely.
4114  */
4115 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4116 					  struct btrfs_root *root,
4117 					  struct btrfs_path *path,
4118 					  int data_size)
4119 {
4120 	int ret;
4121 	int progress = 0;
4122 	int slot;
4123 	u32 nritems;
4124 	int space_needed = data_size;
4125 
4126 	slot = path->slots[0];
4127 	if (slot < btrfs_header_nritems(path->nodes[0]))
4128 		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4129 
4130 	/*
4131 	 * try to push all the items after our slot into the
4132 	 * right leaf
4133 	 */
4134 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4135 	if (ret < 0)
4136 		return ret;
4137 
4138 	if (ret == 0)
4139 		progress++;
4140 
4141 	nritems = btrfs_header_nritems(path->nodes[0]);
4142 	/*
4143 	 * our goal is to get our slot at the start or end of a leaf.  If
4144 	 * we've done so we're done
4145 	 */
4146 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4147 		return 0;
4148 
4149 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4150 		return 0;
4151 
4152 	/* try to push all the items before our slot into the next leaf */
4153 	slot = path->slots[0];
4154 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4155 	if (ret < 0)
4156 		return ret;
4157 
4158 	if (ret == 0)
4159 		progress++;
4160 
4161 	if (progress)
4162 		return 0;
4163 	return 1;
4164 }
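
/*
 * Illustrative outcome, assuming both pushes succeed: for a leaf
 * [ A | target | C ] the items in C move into the right neighbor and
 * the items in A move into the left neighbor, leaving 'target' at
 * slot 0 or at the end of the leaf, where one ordinary split is
 * enough.
 */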
4165 
4166 /*
4167  * split the path's leaf in two, making sure there is at least data_size
4168  * available for the resulting leaf level of the path.
4169  *
4170  * returns 0 if all went well and < 0 on failure.
4171  */
4172 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4173 			       struct btrfs_root *root,
4174 			       struct btrfs_key *ins_key,
4175 			       struct btrfs_path *path, int data_size,
4176 			       int extend)
4177 {
4178 	struct btrfs_disk_key disk_key;
4179 	struct extent_buffer *l;
4180 	u32 nritems;
4181 	int mid;
4182 	int slot;
4183 	struct extent_buffer *right;
4184 	int ret = 0;
4185 	int wret;
4186 	int split;
4187 	int num_doubles = 0;
4188 	int tried_avoid_double = 0;
4189 
4190 	l = path->nodes[0];
4191 	slot = path->slots[0];
4192 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4193 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4194 		return -EOVERFLOW;
4195 
4196 	/* first try to make some room by pushing left and right */
4197 	if (data_size && path->nodes[1]) {
4198 		int space_needed = data_size;
4199 
4200 		if (slot < btrfs_header_nritems(l))
4201 			space_needed -= btrfs_leaf_free_space(root, l);
4202 
4203 		wret = push_leaf_right(trans, root, path, space_needed,
4204 				       space_needed, 0, 0);
4205 		if (wret < 0)
4206 			return wret;
4207 		if (wret) {
4208 			wret = push_leaf_left(trans, root, path, space_needed,
4209 					      space_needed, 0, (u32)-1);
4210 			if (wret < 0)
4211 				return wret;
4212 		}
4213 		l = path->nodes[0];
4214 
4215 		/* did the pushes work? */
4216 		if (btrfs_leaf_free_space(root, l) >= data_size)
4217 			return 0;
4218 	}
4219 
4220 	if (!path->nodes[1]) {
4221 		ret = insert_new_root(trans, root, path, 1);
4222 		if (ret)
4223 			return ret;
4224 	}
4225 again:
4226 	split = 1;
4227 	l = path->nodes[0];
4228 	slot = path->slots[0];
4229 	nritems = btrfs_header_nritems(l);
4230 	mid = (nritems + 1) / 2;
4231 
4232 	if (mid <= slot) {
4233 		if (nritems == 1 ||
4234 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4235 			BTRFS_LEAF_DATA_SIZE(root)) {
4236 			if (slot >= nritems) {
4237 				split = 0;
4238 			} else {
4239 				mid = slot;
4240 				if (mid != nritems &&
4241 				    leaf_space_used(l, mid, nritems - mid) +
4242 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4243 					if (data_size && !tried_avoid_double)
4244 						goto push_for_double;
4245 					split = 2;
4246 				}
4247 			}
4248 		}
4249 	} else {
4250 		if (leaf_space_used(l, 0, mid) + data_size >
4251 			BTRFS_LEAF_DATA_SIZE(root)) {
4252 			if (!extend && data_size && slot == 0) {
4253 				split = 0;
4254 			} else if ((extend || !data_size) && slot == 0) {
4255 				mid = 1;
4256 			} else {
4257 				mid = slot;
4258 				if (mid != nritems &&
4259 				    leaf_space_used(l, mid, nritems - mid) +
4260 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4261 					if (data_size && !tried_avoid_double)
4262 						goto push_for_double;
4263 					split = 2;
4264 				}
4265 			}
4266 		}
4267 	}
4268 
4269 	if (split == 0)
4270 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4271 	else
4272 		btrfs_item_key(l, &disk_key, mid);
4273 
4274 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4275 			&disk_key, 0, l->start, 0);
4276 	if (IS_ERR(right))
4277 		return PTR_ERR(right);
4278 
4279 	root_add_used(root, root->nodesize);
4280 
4281 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4282 	btrfs_set_header_bytenr(right, right->start);
4283 	btrfs_set_header_generation(right, trans->transid);
4284 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4285 	btrfs_set_header_owner(right, root->root_key.objectid);
4286 	btrfs_set_header_level(right, 0);
4287 	write_extent_buffer(right, root->fs_info->fsid,
4288 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
4289 
4290 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4291 			    btrfs_header_chunk_tree_uuid(right),
4292 			    BTRFS_UUID_SIZE);
4293 
4294 	if (split == 0) {
4295 		if (mid <= slot) {
4296 			btrfs_set_header_nritems(right, 0);
4297 			insert_ptr(trans, root, path, &disk_key, right->start,
4298 				   path->slots[1] + 1, 1);
4299 			btrfs_tree_unlock(path->nodes[0]);
4300 			free_extent_buffer(path->nodes[0]);
4301 			path->nodes[0] = right;
4302 			path->slots[0] = 0;
4303 			path->slots[1] += 1;
4304 		} else {
4305 			btrfs_set_header_nritems(right, 0);
4306 			insert_ptr(trans, root, path, &disk_key, right->start,
4307 					  path->slots[1], 1);
4308 			btrfs_tree_unlock(path->nodes[0]);
4309 			free_extent_buffer(path->nodes[0]);
4310 			path->nodes[0] = right;
4311 			path->slots[0] = 0;
4312 			if (path->slots[1] == 0)
4313 				fixup_low_keys(root, path, &disk_key, 1);
4314 		}
4315 		btrfs_mark_buffer_dirty(right);
4316 		return ret;
4317 	}
4318 
4319 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4320 
4321 	if (split == 2) {
4322 		BUG_ON(num_doubles != 0);
4323 		num_doubles++;
4324 		goto again;
4325 	}
4326 
4327 	return 0;
4328 
4329 push_for_double:
4330 	push_for_double_split(trans, root, path, data_size);
4331 	tried_avoid_double = 1;
4332 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4333 		return 0;
4334 	goto again;
4335 }
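
/*
 * Explanatory note (not in the original source) on the 'split' values
 * used above: 0 means the new item will live alone in a fresh empty
 * leaf and nothing is copied; 1 is an ordinary split at 'mid'; 2 means
 * one split is not enough, so after copy_for_split() we loop back to
 * 'again' for a second pass, guarded by the num_doubles BUG_ON().
 */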
4336 
4337 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4338 					 struct btrfs_root *root,
4339 					 struct btrfs_path *path, int ins_len)
4340 {
4341 	struct btrfs_key key;
4342 	struct extent_buffer *leaf;
4343 	struct btrfs_file_extent_item *fi;
4344 	u64 extent_len = 0;
4345 	u32 item_size;
4346 	int ret;
4347 
4348 	leaf = path->nodes[0];
4349 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4350 
4351 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4352 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4353 
4354 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4355 		return 0;
4356 
4357 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4358 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4359 		fi = btrfs_item_ptr(leaf, path->slots[0],
4360 				    struct btrfs_file_extent_item);
4361 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4362 	}
4363 	btrfs_release_path(path);
4364 
4365 	path->keep_locks = 1;
4366 	path->search_for_split = 1;
4367 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4368 	path->search_for_split = 0;
4369 	if (ret < 0)
4370 		goto err;
4371 
4372 	ret = -EAGAIN;
4373 	leaf = path->nodes[0];
4374 	/* if our item isn't there or got smaller, return now */
4375 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4376 		goto err;
4377 
4378 	/* the leaf has changed, it now has room.  return now */
4379 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4380 		goto err;
4381 
4382 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4383 		fi = btrfs_item_ptr(leaf, path->slots[0],
4384 				    struct btrfs_file_extent_item);
4385 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4386 			goto err;
4387 	}
4388 
4389 	btrfs_set_path_blocking(path);
4390 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4391 	if (ret)
4392 		goto err;
4393 
4394 	path->keep_locks = 0;
4395 	btrfs_unlock_up_safe(path, 1);
4396 	return 0;
4397 err:
4398 	path->keep_locks = 0;
4399 	return ret;
4400 }
4401 
4402 static noinline int split_item(struct btrfs_trans_handle *trans,
4403 			       struct btrfs_root *root,
4404 			       struct btrfs_path *path,
4405 			       struct btrfs_key *new_key,
4406 			       unsigned long split_offset)
4407 {
4408 	struct extent_buffer *leaf;
4409 	struct btrfs_item *item;
4410 	struct btrfs_item *new_item;
4411 	int slot;
4412 	char *buf;
4413 	u32 nritems;
4414 	u32 item_size;
4415 	u32 orig_offset;
4416 	struct btrfs_disk_key disk_key;
4417 
4418 	leaf = path->nodes[0];
4419 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4420 
4421 	btrfs_set_path_blocking(path);
4422 
4423 	item = btrfs_item_nr(path->slots[0]);
4424 	orig_offset = btrfs_item_offset(leaf, item);
4425 	item_size = btrfs_item_size(leaf, item);
4426 
4427 	buf = kmalloc(item_size, GFP_NOFS);
4428 	if (!buf)
4429 		return -ENOMEM;
4430 
4431 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4432 			    path->slots[0]), item_size);
4433 
4434 	slot = path->slots[0] + 1;
4435 	nritems = btrfs_header_nritems(leaf);
4436 	if (slot != nritems) {
4437 		/* shift the items */
4438 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4439 				btrfs_item_nr_offset(slot),
4440 				(nritems - slot) * sizeof(struct btrfs_item));
4441 	}
4442 
4443 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4444 	btrfs_set_item_key(leaf, &disk_key, slot);
4445 
4446 	new_item = btrfs_item_nr(slot);
4447 
4448 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4449 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4450 
4451 	btrfs_set_item_offset(leaf, item,
4452 			      orig_offset + item_size - split_offset);
4453 	btrfs_set_item_size(leaf, item, split_offset);
4454 
4455 	btrfs_set_header_nritems(leaf, nritems + 1);
4456 
4457 	/* write the data for the start of the original item */
4458 	write_extent_buffer(leaf, buf,
4459 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4460 			    split_offset);
4461 
4462 	/* write the data for the new item */
4463 	write_extent_buffer(leaf, buf + split_offset,
4464 			    btrfs_item_ptr_offset(leaf, slot),
4465 			    item_size - split_offset);
4466 	btrfs_mark_buffer_dirty(leaf);
4467 
4468 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4469 	kfree(buf);
4470 	return 0;
4471 }
4472 
4473 /*
4474  * This function splits a single item into two items,
4475  * giving 'new_key' to the new item and splitting the
4476  * old one at split_offset (from the start of the item).
4477  *
4478  * The path may be released by this operation.  After
4479  * the split, the path is pointing to the old item.  The
4480  * new item is going to be in the same node as the old one.
4481  *
4482  * Note, the item being split must be small enough to live alone on
4483  * a tree block with room for one extra struct btrfs_item
4484  *
4485  * This allows us to split the item in place, keeping a lock on the
4486  * leaf the entire time.
4487  */
4488 int btrfs_split_item(struct btrfs_trans_handle *trans,
4489 		     struct btrfs_root *root,
4490 		     struct btrfs_path *path,
4491 		     struct btrfs_key *new_key,
4492 		     unsigned long split_offset)
4493 {
4494 	int ret;
4495 	ret = setup_leaf_for_split(trans, root, path,
4496 				   sizeof(struct btrfs_item));
4497 	if (ret)
4498 		return ret;
4499 
4500 	ret = split_item(trans, root, path, new_key, split_offset);
4501 	return ret;
4502 }
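
/*
 * Hypothetical usage sketch, splitting a 100 byte item at byte 60
 * (the key values are illustrative):
 *
 *	struct btrfs_key tail_key;
 *
 *	tail_key.objectid = ino;
 *	tail_key.type = BTRFS_EXTENT_DATA_KEY;
 *	tail_key.offset = new_file_offset;
 *	ret = btrfs_split_item(trans, root, path, &tail_key, 60);
 *
 * Afterwards path->slots[0] still points at the (now 60 byte) front
 * half and the 40 byte tail sits in the next slot under tail_key.
 */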
4503 
4504 /*
4505  * This function duplicates an item, giving 'new_key' to the new item.
4506  * It guarantees both items live in the same tree leaf and the new item
4507  * is contiguous with the original item.
4508  *
4509  * This allows us to split file extent in place, keeping a lock on the
4510  * leaf the entire time.
4511  */
4512 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4513 			 struct btrfs_root *root,
4514 			 struct btrfs_path *path,
4515 			 struct btrfs_key *new_key)
4516 {
4517 	struct extent_buffer *leaf;
4518 	int ret;
4519 	u32 item_size;
4520 
4521 	leaf = path->nodes[0];
4522 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4523 	ret = setup_leaf_for_split(trans, root, path,
4524 				   item_size + sizeof(struct btrfs_item));
4525 	if (ret)
4526 		return ret;
4527 
4528 	path->slots[0]++;
4529 	setup_items_for_insert(root, path, new_key, &item_size,
4530 			       item_size, item_size +
4531 			       sizeof(struct btrfs_item), 1);
4532 	leaf = path->nodes[0];
4533 	memcpy_extent_buffer(leaf,
4534 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4535 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4536 			     item_size);
4537 	return 0;
4538 }
4539 
4540 /*
4541  * make the item pointed to by the path smaller.  new_size indicates
4542  * how small to make it, and from_end tells us if we just chop bytes
4543  * off the end of the item or if we shift the item to chop bytes off
4544  * the front.
4545  */
4546 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4547 			 u32 new_size, int from_end)
4548 {
4549 	int slot;
4550 	struct extent_buffer *leaf;
4551 	struct btrfs_item *item;
4552 	u32 nritems;
4553 	unsigned int data_end;
4554 	unsigned int old_data_start;
4555 	unsigned int old_size;
4556 	unsigned int size_diff;
4557 	int i;
4558 	struct btrfs_map_token token;
4559 
4560 	btrfs_init_map_token(&token);
4561 
4562 	leaf = path->nodes[0];
4563 	slot = path->slots[0];
4564 
4565 	old_size = btrfs_item_size_nr(leaf, slot);
4566 	if (old_size == new_size)
4567 		return;
4568 
4569 	nritems = btrfs_header_nritems(leaf);
4570 	data_end = leaf_data_end(root, leaf);
4571 
4572 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4573 
4574 	size_diff = old_size - new_size;
4575 
4576 	BUG_ON(slot < 0);
4577 	BUG_ON(slot >= nritems);
4578 
4579 	/*
4580 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4581 	 */
4582 	/* first correct the data pointers */
4583 	for (i = slot; i < nritems; i++) {
4584 		u32 ioff;
4585 		item = btrfs_item_nr(i);
4586 
4587 		ioff = btrfs_token_item_offset(leaf, item, &token);
4588 		btrfs_set_token_item_offset(leaf, item,
4589 					    ioff + size_diff, &token);
4590 	}
4591 
4592 	/* shift the data */
4593 	if (from_end) {
4594 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4595 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4596 			      data_end, old_data_start + new_size - data_end);
4597 	} else {
4598 		struct btrfs_disk_key disk_key;
4599 		u64 offset;
4600 
4601 		btrfs_item_key(leaf, &disk_key, slot);
4602 
4603 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4604 			unsigned long ptr;
4605 			struct btrfs_file_extent_item *fi;
4606 
4607 			fi = btrfs_item_ptr(leaf, slot,
4608 					    struct btrfs_file_extent_item);
4609 			fi = (struct btrfs_file_extent_item *)(
4610 			     (unsigned long)fi - size_diff);
4611 
4612 			if (btrfs_file_extent_type(leaf, fi) ==
4613 			    BTRFS_FILE_EXTENT_INLINE) {
4614 				ptr = btrfs_item_ptr_offset(leaf, slot);
4615 				memmove_extent_buffer(leaf, ptr,
4616 				      (unsigned long)fi,
4617 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4618 			}
4619 		}
4620 
4621 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4622 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4623 			      data_end, old_data_start - data_end);
4624 
4625 		offset = btrfs_disk_key_offset(&disk_key);
4626 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4627 		btrfs_set_item_key(leaf, &disk_key, slot);
4628 		if (slot == 0)
4629 			fixup_low_keys(root, path, &disk_key, 1);
4630 	}
4631 
4632 	item = btrfs_item_nr(slot);
4633 	btrfs_set_item_size(leaf, item, new_size);
4634 	btrfs_mark_buffer_dirty(leaf);
4635 
4636 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4637 		btrfs_print_leaf(root, leaf);
4638 		BUG();
4639 	}
4640 }
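
/*
 * Illustrative example: truncating a 100 byte item to 60 bytes with
 * from_end == 1 simply discards the last 40 bytes; with from_end == 0
 * the item's start is advanced instead, discarding the first 40
 * bytes, and for an EXTENT_DATA key the key offset is bumped by
 * size_diff so it still names the same file bytes.
 */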
4641 
4642 /*
4643  * make the item pointed to by the path bigger, data_size is the added size.
4644  */
4645 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4646 		       u32 data_size)
4647 {
4648 	int slot;
4649 	struct extent_buffer *leaf;
4650 	struct btrfs_item *item;
4651 	u32 nritems;
4652 	unsigned int data_end;
4653 	unsigned int old_data;
4654 	unsigned int old_size;
4655 	int i;
4656 	struct btrfs_map_token token;
4657 
4658 	btrfs_init_map_token(&token);
4659 
4660 	leaf = path->nodes[0];
4661 
4662 	nritems = btrfs_header_nritems(leaf);
4663 	data_end = leaf_data_end(root, leaf);
4664 
4665 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4666 		btrfs_print_leaf(root, leaf);
4667 		BUG();
4668 	}
4669 	slot = path->slots[0];
4670 	old_data = btrfs_item_end_nr(leaf, slot);
4671 
4672 	BUG_ON(slot < 0);
4673 	if (slot >= nritems) {
4674 		btrfs_print_leaf(root, leaf);
4675 		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4676 		       slot, nritems);
4677 		BUG_ON(1);
4678 	}
4679 
4680 	/*
4681 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4682 	 */
4683 	/* first correct the data pointers */
4684 	for (i = slot; i < nritems; i++) {
4685 		u32 ioff;
4686 		item = btrfs_item_nr(i);
4687 
4688 		ioff = btrfs_token_item_offset(leaf, item, &token);
4689 		btrfs_set_token_item_offset(leaf, item,
4690 					    ioff - data_size, &token);
4691 	}
4692 
4693 	/* shift the data */
4694 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4695 		      data_end - data_size, btrfs_leaf_data(leaf) +
4696 		      data_end, old_data - data_end);
4697 
4698 	data_end = old_data;
4699 	old_size = btrfs_item_size_nr(leaf, slot);
4700 	item = btrfs_item_nr(slot);
4701 	btrfs_set_item_size(leaf, item, old_size + data_size);
4702 	btrfs_mark_buffer_dirty(leaf);
4703 
4704 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4705 		btrfs_print_leaf(root, leaf);
4706 		BUG();
4707 	}
4708 }
4709 
4710 /*
4711  * this is a helper for btrfs_insert_empty_items, the main goal here is
4712  * to save stack depth by doing the bulk of the work in a function
4713  * that doesn't call btrfs_search_slot
4714  */
4715 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4716 			    struct btrfs_key *cpu_key, u32 *data_size,
4717 			    u32 total_data, u32 total_size, int nr)
4718 {
4719 	struct btrfs_item *item;
4720 	int i;
4721 	u32 nritems;
4722 	unsigned int data_end;
4723 	struct btrfs_disk_key disk_key;
4724 	struct extent_buffer *leaf;
4725 	int slot;
4726 	struct btrfs_map_token token;
4727 
4728 	if (path->slots[0] == 0) {
4729 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4730 		fixup_low_keys(root, path, &disk_key, 1);
4731 	}
4732 	btrfs_unlock_up_safe(path, 1);
4733 
4734 	btrfs_init_map_token(&token);
4735 
4736 	leaf = path->nodes[0];
4737 	slot = path->slots[0];
4738 
4739 	nritems = btrfs_header_nritems(leaf);
4740 	data_end = leaf_data_end(root, leaf);
4741 
4742 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4743 		btrfs_print_leaf(root, leaf);
4744 		btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4745 		       total_size, btrfs_leaf_free_space(root, leaf));
4746 		BUG();
4747 	}
4748 
4749 	if (slot != nritems) {
4750 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4751 
4752 		if (old_data < data_end) {
4753 			btrfs_print_leaf(root, leaf);
4754 			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4755 			       slot, old_data, data_end);
4756 			BUG_ON(1);
4757 		}
4758 		/*
4759 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4760 		 */
4761 		/* first correct the data pointers */
4762 		for (i = slot; i < nritems; i++) {
4763 			u32 ioff;
4764 
4765 			item = btrfs_item_nr(i);
4766 			ioff = btrfs_token_item_offset(leaf, item, &token);
4767 			btrfs_set_token_item_offset(leaf, item,
4768 						    ioff - total_data, &token);
4769 		}
4770 		/* shift the items */
4771 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4772 			      btrfs_item_nr_offset(slot),
4773 			      (nritems - slot) * sizeof(struct btrfs_item));
4774 
4775 		/* shift the data */
4776 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4777 			      data_end - total_data, btrfs_leaf_data(leaf) +
4778 			      data_end, old_data - data_end);
4779 		data_end = old_data;
4780 	}
4781 
4782 	/* setup the item for the new data */
4783 	for (i = 0; i < nr; i++) {
4784 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4785 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4786 		item = btrfs_item_nr(slot + i);
4787 		btrfs_set_token_item_offset(leaf, item,
4788 					    data_end - data_size[i], &token);
4789 		data_end -= data_size[i];
4790 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4791 	}
4792 
4793 	btrfs_set_header_nritems(leaf, nritems + nr);
4794 	btrfs_mark_buffer_dirty(leaf);
4795 
4796 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4797 		btrfs_print_leaf(root, leaf);
4798 		BUG();
4799 	}
4800 }
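
/*
 * Worked example (illustrative): inserting nr == 2 items of 10 and 20
 * data bytes at slot 3.  total_data is 30, so the data offsets of the
 * old items from slot 3 onward drop by 30, the item headers shift
 * right by two slots, and the new items get offsets data_end - 10 and
 * data_end - 30 before nritems grows by 2.
 */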
4801 
4802 /*
4803  * Given a key and some data, insert items into the tree.
4804  * This does all the path init required, making room in the tree if needed.
4805  */
4806 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4807 			    struct btrfs_root *root,
4808 			    struct btrfs_path *path,
4809 			    struct btrfs_key *cpu_key, u32 *data_size,
4810 			    int nr)
4811 {
4812 	int ret = 0;
4813 	int slot;
4814 	int i;
4815 	u32 total_size = 0;
4816 	u32 total_data = 0;
4817 
4818 	for (i = 0; i < nr; i++)
4819 		total_data += data_size[i];
4820 
4821 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4822 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4823 	if (ret == 0)
4824 		return -EEXIST;
4825 	if (ret < 0)
4826 		return ret;
4827 
4828 	slot = path->slots[0];
4829 	BUG_ON(slot < 0);
4830 
4831 	setup_items_for_insert(root, path, cpu_key, data_size,
4832 			       total_data, total_size, nr);
4833 	return 0;
4834 }
4835 
4836 /*
4837  * Given a key and some data, insert an item into the tree.
4838  * This does all the path init required, making room in the tree if needed.
4839  */
4840 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4841 		      *root, struct btrfs_key *cpu_key, void *data, u32
4842 		      data_size)
4843 {
4844 	int ret = 0;
4845 	struct btrfs_path *path;
4846 	struct extent_buffer *leaf;
4847 	unsigned long ptr;
4848 
4849 	path = btrfs_alloc_path();
4850 	if (!path)
4851 		return -ENOMEM;
4852 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4853 	if (!ret) {
4854 		leaf = path->nodes[0];
4855 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4856 		write_extent_buffer(leaf, data, ptr, data_size);
4857 		btrfs_mark_buffer_dirty(leaf);
4858 	}
4859 	btrfs_free_path(path);
4860 	return ret;
4861 }
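
/*
 * Hypothetical caller sketch (the key values are illustrative):
 *
 *	struct btrfs_key key;
 *
 *	key.objectid = objectid;
 *	key.type = BTRFS_ROOT_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_insert_item(trans, root, &key, &root_item,
 *				sizeof(root_item));
 *
 * On success the item bytes have been copied into the leaf and the
 * temporary path is already freed; -EEXIST means the key was present.
 */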
4862 
4863 /*
4864  * delete the pointer from a given node.
4865  *
4866  * the tree should have been previously balanced so the deletion does not
4867  * empty a node.
4868  */
4869 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4870 		    int level, int slot)
4871 {
4872 	struct extent_buffer *parent = path->nodes[level];
4873 	u32 nritems;
4874 	int ret;
4875 
4876 	nritems = btrfs_header_nritems(parent);
4877 	if (slot != nritems - 1) {
4878 		if (level)
4879 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4880 					     slot + 1, nritems - slot - 1);
4881 		memmove_extent_buffer(parent,
4882 			      btrfs_node_key_ptr_offset(slot),
4883 			      btrfs_node_key_ptr_offset(slot + 1),
4884 			      sizeof(struct btrfs_key_ptr) *
4885 			      (nritems - slot - 1));
4886 	} else if (level) {
4887 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4888 					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
4889 		BUG_ON(ret < 0);
4890 	}
4891 
4892 	nritems--;
4893 	btrfs_set_header_nritems(parent, nritems);
4894 	if (nritems == 0 && parent == root->node) {
4895 		BUG_ON(btrfs_header_level(root->node) != 1);
4896 		/* just turn the root into a leaf and break */
4897 		btrfs_set_header_level(root->node, 0);
4898 	} else if (slot == 0) {
4899 		struct btrfs_disk_key disk_key;
4900 
4901 		btrfs_node_key(parent, &disk_key, 0);
4902 		fixup_low_keys(root, path, &disk_key, level + 1);
4903 	}
4904 	btrfs_mark_buffer_dirty(parent);
4905 }
4906 
4907 /*
4908  * a helper function to delete the leaf pointed to by path->slots[1] and
4909  * path->nodes[1].
4910  *
4911  * This deletes the pointer in path->nodes[1] and frees the leaf
4912  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4913  *
4914  * The path must have already been setup for deleting the leaf, including
4915  * all the proper balancing.  path->nodes[1] must be locked.
4916  */
4917 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4918 				    struct btrfs_root *root,
4919 				    struct btrfs_path *path,
4920 				    struct extent_buffer *leaf)
4921 {
4922 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4923 	del_ptr(root, path, 1, path->slots[1]);
4924 
4925 	/*
4926 	 * btrfs_free_extent is expensive, we want to make sure we
4927 	 * aren't holding any locks when we call it
4928 	 */
4929 	btrfs_unlock_up_safe(path, 0);
4930 
4931 	root_sub_used(root, leaf->len);
4932 
4933 	extent_buffer_get(leaf);
4934 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4935 	free_extent_buffer_stale(leaf);
4936 }
4937 /*
4938  * delete the item at the leaf level in path.  If that empties
4939  * the leaf, remove it from the tree
4940  */
4941 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4942 		    struct btrfs_path *path, int slot, int nr)
4943 {
4944 	struct extent_buffer *leaf;
4945 	struct btrfs_item *item;
4946 	int last_off;
4947 	int dsize = 0;
4948 	int ret = 0;
4949 	int wret;
4950 	int i;
4951 	u32 nritems;
4952 	struct btrfs_map_token token;
4953 
4954 	btrfs_init_map_token(&token);
4955 
4956 	leaf = path->nodes[0];
4957 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4958 
4959 	for (i = 0; i < nr; i++)
4960 		dsize += btrfs_item_size_nr(leaf, slot + i);
4961 
4962 	nritems = btrfs_header_nritems(leaf);
4963 
4964 	if (slot + nr != nritems) {
4965 		int data_end = leaf_data_end(root, leaf);
4966 
4967 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4968 			      data_end + dsize,
4969 			      btrfs_leaf_data(leaf) + data_end,
4970 			      last_off - data_end);
4971 
4972 		for (i = slot + nr; i < nritems; i++) {
4973 			u32 ioff;
4974 
4975 			item = btrfs_item_nr(i);
4976 			ioff = btrfs_token_item_offset(leaf, item, &token);
4977 			btrfs_set_token_item_offset(leaf, item,
4978 						    ioff + dsize, &token);
4979 		}
4980 
4981 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4982 			      btrfs_item_nr_offset(slot + nr),
4983 			      sizeof(struct btrfs_item) *
4984 			      (nritems - slot - nr));
4985 	}
4986 	btrfs_set_header_nritems(leaf, nritems - nr);
4987 	nritems -= nr;
4988 
4989 	/* delete the leaf if we've emptied it */
4990 	if (nritems == 0) {
4991 		if (leaf == root->node) {
4992 			btrfs_set_header_level(leaf, 0);
4993 		} else {
4994 			btrfs_set_path_blocking(path);
4995 			clean_tree_block(trans, root, leaf);
4996 			btrfs_del_leaf(trans, root, path, leaf);
4997 		}
4998 	} else {
4999 		int used = leaf_space_used(leaf, 0, nritems);
5000 		if (slot == 0) {
5001 			struct btrfs_disk_key disk_key;
5002 
5003 			btrfs_item_key(leaf, &disk_key, 0);
5004 			fixup_low_keys(root, path, &disk_key, 1);
5005 		}
5006 
5007 		/* delete the leaf if it is mostly empty */
5008 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5009 			/* push_leaf_left fixes the path.
5010 			 * make sure the path still points to our leaf
5011 			 * for possible call to del_ptr below
5012 			 */
5013 			slot = path->slots[1];
5014 			extent_buffer_get(leaf);
5015 
5016 			btrfs_set_path_blocking(path);
5017 			wret = push_leaf_left(trans, root, path, 1, 1,
5018 					      1, (u32)-1);
5019 			if (wret < 0 && wret != -ENOSPC)
5020 				ret = wret;
5021 
5022 			if (path->nodes[0] == leaf &&
5023 			    btrfs_header_nritems(leaf)) {
5024 				wret = push_leaf_right(trans, root, path, 1,
5025 						       1, 1, 0);
5026 				if (wret < 0 && wret != -ENOSPC)
5027 					ret = wret;
5028 			}
5029 
5030 			if (btrfs_header_nritems(leaf) == 0) {
5031 				path->slots[1] = slot;
5032 				btrfs_del_leaf(trans, root, path, leaf);
5033 				free_extent_buffer(leaf);
5034 				ret = 0;
5035 			} else {
5036 				/* if we're still in the path, make sure
5037 				 * we're dirty.  Otherwise, one of the
5038 				 * push_leaf functions must have already
5039 				 * dirtied this buffer
5040 				 */
5041 				if (path->nodes[0] == leaf)
5042 					btrfs_mark_buffer_dirty(leaf);
5043 				free_extent_buffer(leaf);
5044 			}
5045 		} else {
5046 			btrfs_mark_buffer_dirty(leaf);
5047 		}
5048 	}
5049 	return ret;
5050 }
5051 
5052 /*
5053  * search the tree again to find a leaf with lesser keys
5054  * returns 0 if it found something or 1 if there are no lesser leaves.
5055  * returns < 0 on io errors.
5056  *
5057  * This may release the path, and so you may lose any locks held at the
5058  * time you call it.
5059  */
5060 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5061 {
5062 	struct btrfs_key key;
5063 	struct btrfs_disk_key found_key;
5064 	int ret;
5065 
5066 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5067 
5068 	if (key.offset > 0) {
5069 		key.offset--;
5070 	} else if (key.type > 0) {
5071 		key.type--;
5072 		key.offset = (u64)-1;
5073 	} else if (key.objectid > 0) {
5074 		key.objectid--;
5075 		key.type = (u8)-1;
5076 		key.offset = (u64)-1;
5077 	} else {
5078 		return 1;
5079 	}
5080 
5081 	btrfs_release_path(path);
5082 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5083 	if (ret < 0)
5084 		return ret;
5085 	btrfs_item_key(path->nodes[0], &found_key, 0);
5086 	ret = comp_keys(&found_key, &key);
5087 	/*
5088 	 * We might have had an item with the previous key in the tree right
5089 	 * before we released our path. And after we released our path, that
5090 	 * item might have been pushed to the first slot (0) of the leaf we
5091 	 * were holding due to a tree balance. Alternatively, an item with the
5092 	 * previous key can exist as the only element of a leaf (big fat item).
5093 	 * Therefore account for these 2 cases, so that our callers (like
5094 	 * btrfs_previous_item) don't miss an existing item with a key matching
5095 	 * the previous key we computed above.
5096 	 */
5097 	if (ret <= 0)
5098 		return 0;
5099 	return 1;
5100 }
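
/*
 * Worked example of the key decrement above (values illustrative):
 * for key (objectid 257, type 12, offset 0) the offset cannot go any
 * lower, so the search key becomes (257, 11, (u64)-1), the largest
 * possible key that still sorts strictly before the original one.
 */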
5101 
5102 /*
5103  * A helper function to walk down the tree starting at min_key, and looking
5104  * for nodes or leaves that have a minimum transaction id.
5105  * This is used by the btree defrag code, and tree logging
5106  *
5107  * This does not cow, but it does stuff the starting key it finds back
5108  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5109  * key and get a writable path.
5110  *
5111  * This does lock as it descends, and path->keep_locks should be set
5112  * to 1 by the caller.
5113  *
5114  * This honors path->lowest_level to prevent descent past a given level
5115  * of the tree.
5116  *
5117  * min_trans indicates the oldest transaction that you are interested
5118  * in walking through.  Any nodes or leaves older than min_trans are
5119  * skipped over (without reading them).
5120  *
5121  * returns zero if something useful was found, < 0 on error and 1 if there
5122  * was nothing in the tree that matched the search criteria.
5123  */
5124 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5125 			 struct btrfs_path *path,
5126 			 u64 min_trans)
5127 {
5128 	struct extent_buffer *cur;
5129 	struct btrfs_key found_key;
5130 	int slot;
5131 	int sret;
5132 	u32 nritems;
5133 	int level;
5134 	int ret = 1;
5135 	int keep_locks = path->keep_locks;
5136 
5137 	path->keep_locks = 1;
5138 again:
5139 	cur = btrfs_read_lock_root_node(root);
5140 	level = btrfs_header_level(cur);
5141 	WARN_ON(path->nodes[level]);
5142 	path->nodes[level] = cur;
5143 	path->locks[level] = BTRFS_READ_LOCK;
5144 
5145 	if (btrfs_header_generation(cur) < min_trans) {
5146 		ret = 1;
5147 		goto out;
5148 	}
5149 	while (1) {
5150 		nritems = btrfs_header_nritems(cur);
5151 		level = btrfs_header_level(cur);
5152 		sret = bin_search(cur, min_key, level, &slot);
5153 
5154 		/* at the lowest level, we're done, setup the path and exit */
5155 		if (level == path->lowest_level) {
5156 			if (slot >= nritems)
5157 				goto find_next_key;
5158 			ret = 0;
5159 			path->slots[level] = slot;
5160 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5161 			goto out;
5162 		}
5163 		if (sret && slot > 0)
5164 			slot--;
5165 		/*
5166 		 * check this node pointer against the min_trans parameters.
5167 		 * If it is too old, skip to the next one.
5168 		 */
5169 		while (slot < nritems) {
5170 			u64 gen;
5171 
5172 			gen = btrfs_node_ptr_generation(cur, slot);
5173 			if (gen < min_trans) {
5174 				slot++;
5175 				continue;
5176 			}
5177 			break;
5178 		}
5179 find_next_key:
5180 		/*
5181 		 * we didn't find a candidate key in this node, walk forward
5182 		 * and find another one
5183 		 */
5184 		if (slot >= nritems) {
5185 			path->slots[level] = slot;
5186 			btrfs_set_path_blocking(path);
5187 			sret = btrfs_find_next_key(root, path, min_key, level,
5188 						  min_trans);
5189 			if (sret == 0) {
5190 				btrfs_release_path(path);
5191 				goto again;
5192 			} else {
5193 				goto out;
5194 			}
5195 		}
5196 		/* save our key for returning back */
5197 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5198 		path->slots[level] = slot;
5199 		if (level == path->lowest_level) {
5200 			ret = 0;
5201 			goto out;
5202 		}
5203 		btrfs_set_path_blocking(path);
5204 		cur = read_node_slot(root, cur, slot);
5205 		BUG_ON(!cur); /* -ENOMEM */
5206 
5207 		btrfs_tree_read_lock(cur);
5208 
5209 		path->locks[level - 1] = BTRFS_READ_LOCK;
5210 		path->nodes[level - 1] = cur;
5211 		unlock_up(path, level, 1, 0, NULL);
5212 		btrfs_clear_path_blocking(path, NULL, 0);
5213 	}
5214 out:
5215 	path->keep_locks = keep_locks;
5216 	if (ret == 0) {
5217 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5218 		btrfs_set_path_blocking(path);
5219 		memcpy(min_key, &found_key, sizeof(found_key));
5220 	}
5221 	return ret;
5222 }
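
/*
 * Hypothetical caller sketch (values illustrative):
 *
 *	struct btrfs_key min_key = { 0 };	// start at the smallest key
 *
 *	ret = btrfs_search_forward(root, &min_key, path, min_trans);
 *	if (ret == 0) {
 *		// min_key now holds the first key found in a block at
 *		// least as new as min_trans; searching it again with
 *		// cow=1 yields a writable path
 *	}
 *	// ret == 1: nothing in the tree was new enough
 */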
5223 
5224 static void tree_move_down(struct btrfs_root *root,
5225 			   struct btrfs_path *path,
5226 			   int *level, int root_level)
5227 {
5228 	BUG_ON(*level == 0);
5229 	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5230 					path->slots[*level]);
5231 	path->slots[*level - 1] = 0;
5232 	(*level)--;
5233 }
5234 
5235 static int tree_move_next_or_upnext(struct btrfs_root *root,
5236 				    struct btrfs_path *path,
5237 				    int *level, int root_level)
5238 {
5239 	int ret = 0;
5240 	int nritems;
5241 	nritems = btrfs_header_nritems(path->nodes[*level]);
5242 
5243 	path->slots[*level]++;
5244 
5245 	while (path->slots[*level] >= nritems) {
5246 		if (*level == root_level)
5247 			return -1;
5248 
5249 		/* move upnext */
5250 		path->slots[*level] = 0;
5251 		free_extent_buffer(path->nodes[*level]);
5252 		path->nodes[*level] = NULL;
5253 		(*level)++;
5254 		path->slots[*level]++;
5255 
5256 		nritems = btrfs_header_nritems(path->nodes[*level]);
5257 		ret = 1;
5258 	}
5259 	return ret;
5260 }
5261 
5262 /*
5263  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5264  * or down.
5265  */
5266 static int tree_advance(struct btrfs_root *root,
5267 			struct btrfs_path *path,
5268 			int *level, int root_level,
5269 			int allow_down,
5270 			struct btrfs_key *key)
5271 {
5272 	int ret;
5273 
5274 	if (*level == 0 || !allow_down) {
5275 		ret = tree_move_next_or_upnext(root, path, level, root_level);
5276 	} else {
5277 		tree_move_down(root, path, level, root_level);
5278 		ret = 0;
5279 	}
5280 	if (ret >= 0) {
5281 		if (*level == 0)
5282 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5283 					path->slots[*level]);
5284 		else
5285 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5286 					path->slots[*level]);
5287 	}
5288 	return ret;
5289 }
5290 
5291 static int tree_compare_item(struct btrfs_root *left_root,
5292 			     struct btrfs_path *left_path,
5293 			     struct btrfs_path *right_path,
5294 			     char *tmp_buf)
5295 {
5296 	int cmp;
5297 	int len1, len2;
5298 	unsigned long off1, off2;
5299 
5300 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5301 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5302 	if (len1 != len2)
5303 		return 1;
5304 
5305 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5306 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5307 				right_path->slots[0]);
5308 
5309 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5310 
5311 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5312 	if (cmp)
5313 		return 1;
5314 	return 0;
5315 }
5316 
5317 #define ADVANCE 1
5318 #define ADVANCE_ONLY_NEXT -1
5319 
5320 /*
5321  * This function compares two trees and calls the provided callback for
5322  * every changed/new/deleted item it finds.
5323  * If shared tree blocks are encountered, whole subtrees are skipped, making
5324  * the compare pretty fast on snapshotted subvolumes.
5325  *
5326  * This currently works on commit roots only. As commit roots are read only,
5327  * we don't do any locking. The commit roots are protected with transactions.
5328  * Transactions are ended and rejoined when a commit is tried in between.
5329  *
5330  * This function checks for modifications done to the trees while comparing.
5331  * If it detects a change, it aborts immediately.
5332  */
5333 int btrfs_compare_trees(struct btrfs_root *left_root,
5334 			struct btrfs_root *right_root,
5335 			btrfs_changed_cb_t changed_cb, void *ctx)
5336 {
5337 	int ret;
5338 	int cmp;
5339 	struct btrfs_path *left_path = NULL;
5340 	struct btrfs_path *right_path = NULL;
5341 	struct btrfs_key left_key;
5342 	struct btrfs_key right_key;
5343 	char *tmp_buf = NULL;
5344 	int left_root_level;
5345 	int right_root_level;
5346 	int left_level;
5347 	int right_level;
5348 	int left_end_reached;
5349 	int right_end_reached;
5350 	int advance_left;
5351 	int advance_right;
5352 	u64 left_blockptr;
5353 	u64 right_blockptr;
5354 	u64 left_gen;
5355 	u64 right_gen;
5356 
5357 	left_path = btrfs_alloc_path();
5358 	if (!left_path) {
5359 		ret = -ENOMEM;
5360 		goto out;
5361 	}
5362 	right_path = btrfs_alloc_path();
5363 	if (!right_path) {
5364 		ret = -ENOMEM;
5365 		goto out;
5366 	}
5367 
5368 	tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5369 	if (!tmp_buf) {
5370 		ret = -ENOMEM;
5371 		goto out;
5372 	}
5373 
5374 	left_path->search_commit_root = 1;
5375 	left_path->skip_locking = 1;
5376 	right_path->search_commit_root = 1;
5377 	right_path->skip_locking = 1;
5378 
5379 	/*
5380 	 * Strategy: Go to the first items of both trees. Then do
5381 	 *
5382 	 * If both trees are at level 0
5383 	 *   Compare keys of current items
5384 	 *     If left < right treat left item as new, advance left tree
5385 	 *       and repeat
5386 	 *     If left > right treat right item as deleted, advance right tree
5387 	 *       and repeat
5388 	 *     If left == right do deep compare of items, treat as changed if
5389 	 *       needed, advance both trees and repeat
5390 	 * If both trees are at the same level but not at level 0
5391 	 * Compare keys of current nodes/leaves
5392 	 *     If left < right advance left tree and repeat
5393 	 *     If left > right advance right tree and repeat
5394 	 * If left == right compare blockptrs of the next nodes/leaves
5395 	 *       If they match advance both trees but stay at the same level
5396 	 *         and repeat
5397 	 *       If they don't match advance both trees while allowing to go
5398 	 *         deeper and repeat
5399 	 * If tree levels are different
5400 	 *   Advance the tree that needs it and repeat
5401 	 *
5402 	 * Advancing a tree means:
5403 	 *   If we are at level 0, try to go to the next slot. If that's not
5404 	 *   possible, go one level up and repeat. Stop when we found a level
5405 	 *   where we could go to the next slot. We may at this point be on a
5406 	 *   node or a leaf.
5407 	 *
5408 	 *   If we are not at level 0 and not on shared tree blocks, go one
5409 	 *   level deeper.
5410 	 *
5411 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5412 	 *   the right if possible or go up and right.
5413 	 */
5414 
5415 	down_read(&left_root->fs_info->commit_root_sem);
5416 	left_level = btrfs_header_level(left_root->commit_root);
5417 	left_root_level = left_level;
5418 	left_path->nodes[left_level] = left_root->commit_root;
5419 	extent_buffer_get(left_path->nodes[left_level]);
5420 
5421 	right_level = btrfs_header_level(right_root->commit_root);
5422 	right_root_level = right_level;
5423 	right_path->nodes[right_level] = right_root->commit_root;
5424 	extent_buffer_get(right_path->nodes[right_level]);
5425 	up_read(&left_root->fs_info->commit_root_sem);
5426 
5427 	if (left_level == 0)
5428 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5429 				&left_key, left_path->slots[left_level]);
5430 	else
5431 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5432 				&left_key, left_path->slots[left_level]);
5433 	if (right_level == 0)
5434 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5435 				&right_key, right_path->slots[right_level]);
5436 	else
5437 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5438 				&right_key, right_path->slots[right_level]);
5439 
5440 	left_end_reached = right_end_reached = 0;
5441 	advance_left = advance_right = 0;
5442 
5443 	while (1) {
5444 		if (advance_left && !left_end_reached) {
5445 			ret = tree_advance(left_root, left_path, &left_level,
5446 					left_root_level,
5447 					advance_left != ADVANCE_ONLY_NEXT,
5448 					&left_key);
5449 			if (ret < 0)
5450 				left_end_reached = ADVANCE;
5451 			advance_left = 0;
5452 		}
5453 		if (advance_right && !right_end_reached) {
5454 			ret = tree_advance(right_root, right_path, &right_level,
5455 					right_root_level,
5456 					advance_right != ADVANCE_ONLY_NEXT,
5457 					&right_key);
5458 			if (ret < 0)
5459 				right_end_reached = ADVANCE;
5460 			advance_right = 0;
5461 		}
5462 
5463 		if (left_end_reached && right_end_reached) {
5464 			ret = 0;
5465 			goto out;
5466 		} else if (left_end_reached) {
5467 			if (right_level == 0) {
5468 				ret = changed_cb(left_root, right_root,
5469 						left_path, right_path,
5470 						&right_key,
5471 						BTRFS_COMPARE_TREE_DELETED,
5472 						ctx);
5473 				if (ret < 0)
5474 					goto out;
5475 			}
5476 			advance_right = ADVANCE;
5477 			continue;
5478 		} else if (right_end_reached) {
5479 			if (left_level == 0) {
5480 				ret = changed_cb(left_root, right_root,
5481 						left_path, right_path,
5482 						&left_key,
5483 						BTRFS_COMPARE_TREE_NEW,
5484 						ctx);
5485 				if (ret < 0)
5486 					goto out;
5487 			}
5488 			advance_left = ADVANCE;
5489 			continue;
5490 		}
5491 
5492 		if (left_level == 0 && right_level == 0) {
5493 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5494 			if (cmp < 0) {
5495 				ret = changed_cb(left_root, right_root,
5496 						left_path, right_path,
5497 						&left_key,
5498 						BTRFS_COMPARE_TREE_NEW,
5499 						ctx);
5500 				if (ret < 0)
5501 					goto out;
5502 				advance_left = ADVANCE;
5503 			} else if (cmp > 0) {
5504 				ret = changed_cb(left_root, right_root,
5505 						left_path, right_path,
5506 						&right_key,
5507 						BTRFS_COMPARE_TREE_DELETED,
5508 						ctx);
5509 				if (ret < 0)
5510 					goto out;
5511 				advance_right = ADVANCE;
5512 			} else {
5513 				enum btrfs_compare_tree_result result;
5514 
5515 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5516 				ret = tree_compare_item(left_root, left_path,
5517 						right_path, tmp_buf);
5518 				if (ret)
5519 					result = BTRFS_COMPARE_TREE_CHANGED;
5520 				else
5521 					result = BTRFS_COMPARE_TREE_SAME;
5522 				ret = changed_cb(left_root, right_root,
5523 						 left_path, right_path,
5524 						 &left_key, result, ctx);
5525 				if (ret < 0)
5526 					goto out;
5527 				advance_left = ADVANCE;
5528 				advance_right = ADVANCE;
5529 			}
5530 		} else if (left_level == right_level) {
5531 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5532 			if (cmp < 0) {
5533 				advance_left = ADVANCE;
5534 			} else if (cmp > 0) {
5535 				advance_right = ADVANCE;
5536 			} else {
5537 				left_blockptr = btrfs_node_blockptr(
5538 						left_path->nodes[left_level],
5539 						left_path->slots[left_level]);
5540 				right_blockptr = btrfs_node_blockptr(
5541 						right_path->nodes[right_level],
5542 						right_path->slots[right_level]);
5543 				left_gen = btrfs_node_ptr_generation(
5544 						left_path->nodes[left_level],
5545 						left_path->slots[left_level]);
5546 				right_gen = btrfs_node_ptr_generation(
5547 						right_path->nodes[right_level],
5548 						right_path->slots[right_level]);
5549 				if (left_blockptr == right_blockptr &&
5550 				    left_gen == right_gen) {
5551 					/*
5552 					 * As we're on a shared block, don't
5553 					 * allow going any deeper.
5554 					 */
5555 					advance_left = ADVANCE_ONLY_NEXT;
5556 					advance_right = ADVANCE_ONLY_NEXT;
5557 				} else {
5558 					advance_left = ADVANCE;
5559 					advance_right = ADVANCE;
5560 				}
5561 			}
5562 		} else if (left_level < right_level) {
5563 			advance_right = ADVANCE;
5564 		} else {
5565 			advance_left = ADVANCE;
5566 		}
5567 	}
5568 
5569 out:
5570 	btrfs_free_path(left_path);
5571 	btrfs_free_path(right_path);
5572 	kfree(tmp_buf);
5573 	return ret;
5574 }
5575 
5576 /*
5577  * this is similar to btrfs_next_leaf, but does not try to preserve
5578  * and fix up the path.  It looks for and returns the next key in the
5579  * tree based on the current path and the min_trans parameter.
5580  *
5581  * 0 is returned if another key is found, 1 if there are no higher keys
5582  * in the tree, and < 0 on error.
5583  *
5584  * path->keep_locks should be set to 1 on the search made before
5585  * calling this function.
5586  */
5587 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5588 			struct btrfs_key *key, int level, u64 min_trans)
5589 {
5590 	int slot;
5591 	struct extent_buffer *c;
5592 
5593 	WARN_ON(!path->keep_locks);
5594 	while (level < BTRFS_MAX_LEVEL) {
5595 		if (!path->nodes[level])
5596 			return 1;
5597 
5598 		slot = path->slots[level] + 1;
5599 		c = path->nodes[level];
5600 next:
5601 		if (slot >= btrfs_header_nritems(c)) {
5602 			int ret;
5603 			int orig_lowest;
5604 			struct btrfs_key cur_key;
5605 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5606 			    !path->nodes[level + 1])
5607 				return 1;
5608 
5609 			if (path->locks[level + 1]) {
5610 				level++;
5611 				continue;
5612 			}
5613 
5614 			slot = btrfs_header_nritems(c) - 1;
5615 			if (level == 0)
5616 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5617 			else
5618 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5619 
5620 			orig_lowest = path->lowest_level;
5621 			btrfs_release_path(path);
5622 			path->lowest_level = level;
5623 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5624 						0, 0);
5625 			path->lowest_level = orig_lowest;
5626 			if (ret < 0)
5627 				return ret;
5628 
5629 			c = path->nodes[level];
5630 			slot = path->slots[level];
5631 			if (ret == 0)
5632 				slot++;
5633 			goto next;
5634 		}
5635 
5636 		if (level == 0)
5637 			btrfs_item_key_to_cpu(c, key, slot);
5638 		else {
5639 			u64 gen = btrfs_node_ptr_generation(c, slot);
5640 
5641 			if (gen < min_trans) {
5642 				slot++;
5643 				goto next;
5644 			}
5645 			btrfs_node_key_to_cpu(c, key, slot);
5646 		}
5647 		return 0;
5648 	}
5649 	return 1;
5650 }
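
/*
 * A usage sketch for btrfs_find_next_key(): search with keep_locks set,
 * harvest the next key, then re-search from it. All names other than the
 * btrfs_* helpers are hypothetical, and error handling is elided:
 *
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (btrfs_find_next_key(root, path, &key, 0, min_trans) == 0) {
 *		process_key(&key);
 *		btrfs_release_path(path);
 *		path->keep_locks = 1;
 *		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *		if (ret < 0)
 *			goto out;
 *	}
 *	btrfs_release_path(path);
 */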
5651 
5652 /*
5653  * search the tree again to find a leaf with greater keys
5654  * returns 0 if it found something or 1 if there are no greater leaves.
5655  * returns < 0 on I/O errors.
5656  */
5657 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5658 {
5659 	return btrfs_next_old_leaf(root, path, 0);
5660 }
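
/*
 * The common iteration pattern built on top of btrfs_next_leaf(), as a
 * sketch only; process_item() is a hypothetical per-item handler:
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (1) {
 *		struct extent_buffer *leaf = path->nodes[0];
 *		int slot = path->slots[0];
 *
 *		if (slot >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret < 0)
 *				goto out;
 *			if (ret > 0)
 *				break;
 *			continue;
 *		}
 *		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 *		process_item(leaf, slot, &found_key);
 *		path->slots[0]++;
 *	}
 */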
5661 
5662 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5663 			u64 time_seq)
5664 {
5665 	int slot;
5666 	int level;
5667 	struct extent_buffer *c;
5668 	struct extent_buffer *next;
5669 	struct btrfs_key key;
5670 	u32 nritems;
5671 	int ret;
5672 	int old_spinning = path->leave_spinning;
5673 	int next_rw_lock = 0;
5674 
5675 	nritems = btrfs_header_nritems(path->nodes[0]);
5676 	if (nritems == 0)
5677 		return 1;
5678 
5679 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5680 again:
5681 	level = 1;
5682 	next = NULL;
5683 	next_rw_lock = 0;
5684 	btrfs_release_path(path);
5685 
5686 	path->keep_locks = 1;
5687 	path->leave_spinning = 1;
5688 
5689 	if (time_seq)
5690 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5691 	else
5692 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5693 	path->keep_locks = 0;
5694 
5695 	if (ret < 0)
5696 		return ret;
5697 
5698 	nritems = btrfs_header_nritems(path->nodes[0]);
5699 	/*
5700 	 * by releasing the path above we dropped all our locks.  A balance
5701 	 * could have added more items next to the key that used to be
5702 	 * at the very end of the block.  So, check again here and
5703 	 * advance the path if there are now more items available.
5704 	 */
5705 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5706 		if (ret == 0)
5707 			path->slots[0]++;
5708 		ret = 0;
5709 		goto done;
5710 	}
5711 	/*
5712 	 * So the above check misses one case:
5713 	 * - after releasing the path above, someone has removed the item that
5714 	 *   used to be at the very end of the block, and balance between
5715 	 *   leaves gets another one with a bigger key.offset to replace it.
5716 	 *
5717 	 * This one should be returned as well, or we can get leaf corruption
5718 	 * later (esp. in __btrfs_drop_extents()).
5719 	 *
5720 	 * A bit more explanation of this check:
5721 	 * with ret > 0 the key wasn't found, and the path points to the slot
5722 	 * where it would be inserted, so the item at path->slots[0] must be
5723 	 * the bigger one.
5724 	 */
5725 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5726 		ret = 0;
5727 		goto done;
5728 	}
5729 
5730 	while (level < BTRFS_MAX_LEVEL) {
5731 		if (!path->nodes[level]) {
5732 			ret = 1;
5733 			goto done;
5734 		}
5735 
5736 		slot = path->slots[level] + 1;
5737 		c = path->nodes[level];
5738 		if (slot >= btrfs_header_nritems(c)) {
5739 			level++;
5740 			if (level == BTRFS_MAX_LEVEL) {
5741 				ret = 1;
5742 				goto done;
5743 			}
5744 			continue;
5745 		}
5746 
5747 		if (next) {
5748 			btrfs_tree_unlock_rw(next, next_rw_lock);
5749 			free_extent_buffer(next);
5750 		}
5751 
5752 		next = c;
5753 		next_rw_lock = path->locks[level];
5754 		ret = read_block_for_search(NULL, root, path, &next, level,
5755 					    slot, &key, 0);
5756 		if (ret == -EAGAIN)
5757 			goto again;
5758 
5759 		if (ret < 0) {
5760 			btrfs_release_path(path);
5761 			goto done;
5762 		}
5763 
5764 		if (!path->skip_locking) {
5765 			ret = btrfs_try_tree_read_lock(next);
5766 			if (!ret && time_seq) {
5767 				/*
5768 				 * If we don't get the lock, we may be racing
5769 				 * with push_leaf_left, which holds that lock
5770 				 * while itself waiting for the leaf we
5771 				 * currently have locked. To resolve this, we
5772 				 * give up
5772 				 * on our lock and cycle.
5773 				 */
5774 				free_extent_buffer(next);
5775 				btrfs_release_path(path);
5776 				cond_resched();
5777 				goto again;
5778 			}
5779 			if (!ret) {
5780 				btrfs_set_path_blocking(path);
5781 				btrfs_tree_read_lock(next);
5782 				btrfs_clear_path_blocking(path, next,
5783 							  BTRFS_READ_LOCK);
5784 			}
5785 			next_rw_lock = BTRFS_READ_LOCK;
5786 		}
5787 		break;
5788 	}
5789 	path->slots[level] = slot;
5790 	while (1) {
5791 		level--;
5792 		c = path->nodes[level];
5793 		if (path->locks[level])
5794 			btrfs_tree_unlock_rw(c, path->locks[level]);
5795 
5796 		free_extent_buffer(c);
5797 		path->nodes[level] = next;
5798 		path->slots[level] = 0;
5799 		if (!path->skip_locking)
5800 			path->locks[level] = next_rw_lock;
5801 		if (!level)
5802 			break;
5803 
5804 		ret = read_block_for_search(NULL, root, path, &next, level,
5805 					    0, &key, 0);
5806 		if (ret == -EAGAIN)
5807 			goto again;
5808 
5809 		if (ret < 0) {
5810 			btrfs_release_path(path);
5811 			goto done;
5812 		}
5813 
5814 		if (!path->skip_locking) {
5815 			ret = btrfs_try_tree_read_lock(next);
5816 			if (!ret) {
5817 				btrfs_set_path_blocking(path);
5818 				btrfs_tree_read_lock(next);
5819 				btrfs_clear_path_blocking(path, next,
5820 							  BTRFS_READ_LOCK);
5821 			}
5822 			next_rw_lock = BTRFS_READ_LOCK;
5823 		}
5824 	}
5825 	ret = 0;
5826 done:
5827 	unlock_up(path, 0, 1, 0, NULL);
5828 	path->leave_spinning = old_spinning;
5829 	if (!old_spinning)
5830 		btrfs_set_path_blocking(path);
5831 
5832 	return ret;
5833 }
5834 
5835 /*
5836  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5837  * searching until it gets past min_objectid or finds an item of 'type'
5838  *
5839  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5840  */
5841 int btrfs_previous_item(struct btrfs_root *root,
5842 			struct btrfs_path *path, u64 min_objectid,
5843 			int type)
5844 {
5845 	struct btrfs_key found_key;
5846 	struct extent_buffer *leaf;
5847 	u32 nritems;
5848 	int ret;
5849 
5850 	while (1) {
5851 		if (path->slots[0] == 0) {
5852 			btrfs_set_path_blocking(path);
5853 			ret = btrfs_prev_leaf(root, path);
5854 			if (ret != 0)
5855 				return ret;
5856 		} else {
5857 			path->slots[0]--;
5858 		}
5859 		leaf = path->nodes[0];
5860 		nritems = btrfs_header_nritems(leaf);
5861 		if (nritems == 0)
5862 			return 1;
5863 		if (path->slots[0] == nritems)
5864 			path->slots[0]--;
5865 
5866 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5867 		if (found_key.objectid < min_objectid)
5868 			break;
5869 		if (found_key.type == type)
5870 			return 0;
5871 		if (found_key.objectid == min_objectid &&
5872 		    found_key.type < type)
5873 			break;
5874 	}
5875 	return 1;
5876 }
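
/*
 * A typical call sequence for btrfs_previous_item(), sketched under the
 * assumption that the caller wants the inode item for a given objectid;
 * the key setup with (u8)-1/(u64)-1 positions the search just past every
 * possible item for that objectid:
 *
 *	key.objectid = objectid;
 *	key.type = (u8)-1;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	ret = btrfs_previous_item(root, path, objectid,
 *				  BTRFS_INODE_ITEM_KEY);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 *
 * btrfs_previous_extent_item() below follows the same pattern, with the
 * type check fixed to BTRFS_EXTENT_ITEM_KEY/BTRFS_METADATA_ITEM_KEY.
 */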
5877 
5878 /*
5879  * search in the extent tree to find a previous Metadata/Data extent item
5880  * with min objectid.
5881  *
5882  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5883  */
5884 int btrfs_previous_extent_item(struct btrfs_root *root,
5885 			struct btrfs_path *path, u64 min_objectid)
5886 {
5887 	struct btrfs_key found_key;
5888 	struct extent_buffer *leaf;
5889 	u32 nritems;
5890 	int ret;
5891 
5892 	while (1) {
5893 		if (path->slots[0] == 0) {
5894 			btrfs_set_path_blocking(path);
5895 			ret = btrfs_prev_leaf(root, path);
5896 			if (ret != 0)
5897 				return ret;
5898 		} else {
5899 			path->slots[0]--;
5900 		}
5901 		leaf = path->nodes[0];
5902 		nritems = btrfs_header_nritems(leaf);
5903 		if (nritems == 0)
5904 			return 1;
5905 		if (path->slots[0] == nritems)
5906 			path->slots[0]--;
5907 
5908 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5909 		if (found_key.objectid < min_objectid)
5910 			break;
5911 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5912 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5913 			return 0;
5914 		if (found_key.objectid == min_objectid &&
5915 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5916 			break;
5917 	}
5918 	return 1;
5919 }
5920