xref: /openbmc/linux/fs/btrfs/ctree.c (revision 483eb062)
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27 
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29 		      *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31 		      *root, struct btrfs_key *ins_key,
32 		      struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34 			  struct btrfs_root *root, struct extent_buffer *dst,
35 			  struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37 			      struct btrfs_root *root,
38 			      struct extent_buffer *dst_buf,
39 			      struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41 		    int level, int slot);
42 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43 				 struct extent_buffer *eb);
44 
45 struct btrfs_path *btrfs_alloc_path(void)
46 {
47 	struct btrfs_path *path;
48 	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
49 	return path;
50 }
51 
52 /*
53  * set all locked nodes in the path to blocking locks.  This should
54  * be done before scheduling
55  */
56 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
57 {
58 	int i;
59 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
60 		if (!p->nodes[i] || !p->locks[i])
61 			continue;
62 		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
63 		if (p->locks[i] == BTRFS_READ_LOCK)
64 			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
65 		else if (p->locks[i] == BTRFS_WRITE_LOCK)
66 			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
67 	}
68 }
69 
70 /*
71  * reset all the locked nodes in the patch to spinning locks.
72  *
73  * held is used to keep lockdep happy, when lockdep is enabled
74  * we set held to a blocking lock before we go around and
75  * retake all the spinlocks in the path.  You can safely use NULL
76  * for held
77  */
78 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
79 					struct extent_buffer *held, int held_rw)
80 {
81 	int i;
82 
83 #ifdef CONFIG_DEBUG_LOCK_ALLOC
84 	/* lockdep really cares that we take all of these spinlocks
85 	 * in the right order.  If any of the locks in the path are not
86 	 * currently blocking, it is going to complain.  So, make really
87 	 * really sure by forcing the path to blocking before we clear
88 	 * the path blocking.
89 	 */
90 	if (held) {
91 		btrfs_set_lock_blocking_rw(held, held_rw);
92 		if (held_rw == BTRFS_WRITE_LOCK)
93 			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
94 		else if (held_rw == BTRFS_READ_LOCK)
95 			held_rw = BTRFS_READ_LOCK_BLOCKING;
96 	}
97 	btrfs_set_path_blocking(p);
98 #endif
99 
100 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
101 		if (p->nodes[i] && p->locks[i]) {
102 			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
103 			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
104 				p->locks[i] = BTRFS_WRITE_LOCK;
105 			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
106 				p->locks[i] = BTRFS_READ_LOCK;
107 		}
108 	}
109 
110 #ifdef CONFIG_DEBUG_LOCK_ALLOC
111 	if (held)
112 		btrfs_clear_lock_blocking_rw(held, held_rw);
113 #endif
114 }
115 
116 /* this also releases the path */
117 void btrfs_free_path(struct btrfs_path *p)
118 {
119 	if (!p)
120 		return;
121 	btrfs_release_path(p);
122 	kmem_cache_free(btrfs_path_cachep, p);
123 }
124 
125 /*
126  * path release drops references on the extent buffers in the path
127  * and it drops any locks held by this path
128  *
129  * It is safe to call this on paths that no locks or extent buffers held.
130  */
131 noinline void btrfs_release_path(struct btrfs_path *p)
132 {
133 	int i;
134 
135 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
136 		p->slots[i] = 0;
137 		if (!p->nodes[i])
138 			continue;
139 		if (p->locks[i]) {
140 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
141 			p->locks[i] = 0;
142 		}
143 		free_extent_buffer(p->nodes[i]);
144 		p->nodes[i] = NULL;
145 	}
146 }
147 
148 /*
149  * safely gets a reference on the root node of a tree.  A lock
150  * is not taken, so a concurrent writer may put a different node
151  * at the root of the tree.  See btrfs_lock_root_node for the
152  * looping required.
153  *
154  * The extent buffer returned by this has a reference taken, so
155  * it won't disappear.  It may stop being the root of the tree
156  * at any time because there are no locks held.
157  */
158 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
159 {
160 	struct extent_buffer *eb;
161 
162 	while (1) {
163 		rcu_read_lock();
164 		eb = rcu_dereference(root->node);
165 
166 		/*
167 		 * RCU really hurts here, we could free up the root node because
168 		 * it was cow'ed but we may not get the new root node yet so do
169 		 * the inc_not_zero dance and if it doesn't work then
170 		 * synchronize_rcu and try again.
171 		 */
172 		if (atomic_inc_not_zero(&eb->refs)) {
173 			rcu_read_unlock();
174 			break;
175 		}
176 		rcu_read_unlock();
177 		synchronize_rcu();
178 	}
179 	return eb;
180 }
181 
182 /* loop around taking references on and locking the root node of the
183  * tree until you end up with a lock on the root.  A locked buffer
184  * is returned, with a reference held.
185  */
186 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
187 {
188 	struct extent_buffer *eb;
189 
190 	while (1) {
191 		eb = btrfs_root_node(root);
192 		btrfs_tree_lock(eb);
193 		if (eb == root->node)
194 			break;
195 		btrfs_tree_unlock(eb);
196 		free_extent_buffer(eb);
197 	}
198 	return eb;
199 }
200 
201 /* loop around taking references on and locking the root node of the
202  * tree until you end up with a lock on the root.  A locked buffer
203  * is returned, with a reference held.
204  */
205 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
206 {
207 	struct extent_buffer *eb;
208 
209 	while (1) {
210 		eb = btrfs_root_node(root);
211 		btrfs_tree_read_lock(eb);
212 		if (eb == root->node)
213 			break;
214 		btrfs_tree_read_unlock(eb);
215 		free_extent_buffer(eb);
216 	}
217 	return eb;
218 }
219 
220 /* cowonly root (everything not a reference counted cow subvolume), just get
221  * put onto a simple dirty list.  transaction.c walks this to make sure they
222  * get properly updated on disk.
223  */
224 static void add_root_to_dirty_list(struct btrfs_root *root)
225 {
226 	spin_lock(&root->fs_info->trans_lock);
227 	if (root->track_dirty && list_empty(&root->dirty_list)) {
228 		list_add(&root->dirty_list,
229 			 &root->fs_info->dirty_cowonly_roots);
230 	}
231 	spin_unlock(&root->fs_info->trans_lock);
232 }
233 
234 /*
235  * used by snapshot creation to make a copy of a root for a tree with
236  * a given objectid.  The buffer with the new root node is returned in
237  * cow_ret, and this func returns zero on success or a negative error code.
238  */
239 int btrfs_copy_root(struct btrfs_trans_handle *trans,
240 		      struct btrfs_root *root,
241 		      struct extent_buffer *buf,
242 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
243 {
244 	struct extent_buffer *cow;
245 	int ret = 0;
246 	int level;
247 	struct btrfs_disk_key disk_key;
248 
249 	WARN_ON(root->ref_cows && trans->transid !=
250 		root->fs_info->running_transaction->transid);
251 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
252 
253 	level = btrfs_header_level(buf);
254 	if (level == 0)
255 		btrfs_item_key(buf, &disk_key, 0);
256 	else
257 		btrfs_node_key(buf, &disk_key, 0);
258 
259 	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
260 				     new_root_objectid, &disk_key, level,
261 				     buf->start, 0);
262 	if (IS_ERR(cow))
263 		return PTR_ERR(cow);
264 
265 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
266 	btrfs_set_header_bytenr(cow, cow->start);
267 	btrfs_set_header_generation(cow, trans->transid);
268 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
269 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
270 				     BTRFS_HEADER_FLAG_RELOC);
271 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
272 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
273 	else
274 		btrfs_set_header_owner(cow, new_root_objectid);
275 
276 	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
277 			    BTRFS_FSID_SIZE);
278 
279 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
280 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
281 		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
282 	else
283 		ret = btrfs_inc_ref(trans, root, cow, 0, 1);
284 
285 	if (ret)
286 		return ret;
287 
288 	btrfs_mark_buffer_dirty(cow);
289 	*cow_ret = cow;
290 	return 0;
291 }
292 
293 enum mod_log_op {
294 	MOD_LOG_KEY_REPLACE,
295 	MOD_LOG_KEY_ADD,
296 	MOD_LOG_KEY_REMOVE,
297 	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
298 	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
299 	MOD_LOG_MOVE_KEYS,
300 	MOD_LOG_ROOT_REPLACE,
301 };
302 
303 struct tree_mod_move {
304 	int dst_slot;
305 	int nr_items;
306 };
307 
308 struct tree_mod_root {
309 	u64 logical;
310 	u8 level;
311 };
312 
313 struct tree_mod_elem {
314 	struct rb_node node;
315 	u64 index;		/* shifted logical */
316 	u64 seq;
317 	enum mod_log_op op;
318 
319 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
320 	int slot;
321 
322 	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
323 	u64 generation;
324 
325 	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
326 	struct btrfs_disk_key key;
327 	u64 blockptr;
328 
329 	/* this is used for op == MOD_LOG_MOVE_KEYS */
330 	struct tree_mod_move move;
331 
332 	/* this is used for op == MOD_LOG_ROOT_REPLACE */
333 	struct tree_mod_root old_root;
334 };
335 
336 static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
337 {
338 	read_lock(&fs_info->tree_mod_log_lock);
339 }
340 
341 static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
342 {
343 	read_unlock(&fs_info->tree_mod_log_lock);
344 }
345 
346 static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
347 {
348 	write_lock(&fs_info->tree_mod_log_lock);
349 }
350 
351 static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
352 {
353 	write_unlock(&fs_info->tree_mod_log_lock);
354 }
355 
356 /*
357  * Increment the upper half of tree_mod_seq, set lower half zero.
358  *
359  * Must be called with fs_info->tree_mod_seq_lock held.
360  */
361 static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
362 {
363 	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
364 	seq &= 0xffffffff00000000ull;
365 	seq += 1ull << 32;
366 	atomic64_set(&fs_info->tree_mod_seq, seq);
367 	return seq;
368 }
369 
370 /*
371  * Increment the lower half of tree_mod_seq.
372  *
373  * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
374  * are generated should not technically require a spin lock here. (Rationale:
375  * incrementing the minor while incrementing the major seq number is between its
376  * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
377  * just returns a unique sequence number as usual.) We have decided to leave
378  * that requirement in here and rethink it once we notice it really imposes a
379  * problem on some workload.
380  */
381 static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
382 {
383 	return atomic64_inc_return(&fs_info->tree_mod_seq);
384 }
385 
386 /*
387  * return the last minor in the previous major tree_mod_seq number
388  */
389 u64 btrfs_tree_mod_seq_prev(u64 seq)
390 {
391 	return (seq & 0xffffffff00000000ull) - 1ull;
392 }
393 
394 /*
395  * This adds a new blocker to the tree mod log's blocker list if the @elem
396  * passed does not already have a sequence number set. So when a caller expects
397  * to record tree modifications, it should ensure to set elem->seq to zero
398  * before calling btrfs_get_tree_mod_seq.
399  * Returns a fresh, unused tree log modification sequence number, even if no new
400  * blocker was added.
401  */
402 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
403 			   struct seq_list *elem)
404 {
405 	u64 seq;
406 
407 	tree_mod_log_write_lock(fs_info);
408 	spin_lock(&fs_info->tree_mod_seq_lock);
409 	if (!elem->seq) {
410 		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
411 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
412 	}
413 	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
414 	spin_unlock(&fs_info->tree_mod_seq_lock);
415 	tree_mod_log_write_unlock(fs_info);
416 
417 	return seq;
418 }
419 
420 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
421 			    struct seq_list *elem)
422 {
423 	struct rb_root *tm_root;
424 	struct rb_node *node;
425 	struct rb_node *next;
426 	struct seq_list *cur_elem;
427 	struct tree_mod_elem *tm;
428 	u64 min_seq = (u64)-1;
429 	u64 seq_putting = elem->seq;
430 
431 	if (!seq_putting)
432 		return;
433 
434 	spin_lock(&fs_info->tree_mod_seq_lock);
435 	list_del(&elem->list);
436 	elem->seq = 0;
437 
438 	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
439 		if (cur_elem->seq < min_seq) {
440 			if (seq_putting > cur_elem->seq) {
441 				/*
442 				 * blocker with lower sequence number exists, we
443 				 * cannot remove anything from the log
444 				 */
445 				spin_unlock(&fs_info->tree_mod_seq_lock);
446 				return;
447 			}
448 			min_seq = cur_elem->seq;
449 		}
450 	}
451 	spin_unlock(&fs_info->tree_mod_seq_lock);
452 
453 	/*
454 	 * anything that's lower than the lowest existing (read: blocked)
455 	 * sequence number can be removed from the tree.
456 	 */
457 	tree_mod_log_write_lock(fs_info);
458 	tm_root = &fs_info->tree_mod_log;
459 	for (node = rb_first(tm_root); node; node = next) {
460 		next = rb_next(node);
461 		tm = container_of(node, struct tree_mod_elem, node);
462 		if (tm->seq > min_seq)
463 			continue;
464 		rb_erase(node, tm_root);
465 		kfree(tm);
466 	}
467 	tree_mod_log_write_unlock(fs_info);
468 }
469 
470 /*
471  * key order of the log:
472  *       index -> sequence
473  *
474  * the index is the shifted logical of the *new* root node for root replace
475  * operations, or the shifted logical of the affected block for all other
476  * operations.
477  *
478  * Note: must be called with write lock (tree_mod_log_write_lock).
479  */
480 static noinline int
481 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
482 {
483 	struct rb_root *tm_root;
484 	struct rb_node **new;
485 	struct rb_node *parent = NULL;
486 	struct tree_mod_elem *cur;
487 
488 	BUG_ON(!tm);
489 
490 	spin_lock(&fs_info->tree_mod_seq_lock);
491 	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
492 	spin_unlock(&fs_info->tree_mod_seq_lock);
493 
494 	tm_root = &fs_info->tree_mod_log;
495 	new = &tm_root->rb_node;
496 	while (*new) {
497 		cur = container_of(*new, struct tree_mod_elem, node);
498 		parent = *new;
499 		if (cur->index < tm->index)
500 			new = &((*new)->rb_left);
501 		else if (cur->index > tm->index)
502 			new = &((*new)->rb_right);
503 		else if (cur->seq < tm->seq)
504 			new = &((*new)->rb_left);
505 		else if (cur->seq > tm->seq)
506 			new = &((*new)->rb_right);
507 		else
508 			return -EEXIST;
509 	}
510 
511 	rb_link_node(&tm->node, parent, new);
512 	rb_insert_color(&tm->node, tm_root);
513 	return 0;
514 }
515 
516 /*
517  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
518  * returns zero with the tree_mod_log_lock acquired. The caller must hold
519  * this until all tree mod log insertions are recorded in the rb tree and then
520  * call tree_mod_log_write_unlock() to release.
521  */
522 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
523 				    struct extent_buffer *eb) {
524 	smp_mb();
525 	if (list_empty(&(fs_info)->tree_mod_seq_list))
526 		return 1;
527 	if (eb && btrfs_header_level(eb) == 0)
528 		return 1;
529 
530 	tree_mod_log_write_lock(fs_info);
531 	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
532 		tree_mod_log_write_unlock(fs_info);
533 		return 1;
534 	}
535 
536 	return 0;
537 }
538 
539 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
540 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
541 				    struct extent_buffer *eb)
542 {
543 	smp_mb();
544 	if (list_empty(&(fs_info)->tree_mod_seq_list))
545 		return 0;
546 	if (eb && btrfs_header_level(eb) == 0)
547 		return 0;
548 
549 	return 1;
550 }
551 
552 static struct tree_mod_elem *
553 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
554 		    enum mod_log_op op, gfp_t flags)
555 {
556 	struct tree_mod_elem *tm;
557 
558 	tm = kzalloc(sizeof(*tm), flags);
559 	if (!tm)
560 		return NULL;
561 
562 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
563 	if (op != MOD_LOG_KEY_ADD) {
564 		btrfs_node_key(eb, &tm->key, slot);
565 		tm->blockptr = btrfs_node_blockptr(eb, slot);
566 	}
567 	tm->op = op;
568 	tm->slot = slot;
569 	tm->generation = btrfs_node_ptr_generation(eb, slot);
570 	RB_CLEAR_NODE(&tm->node);
571 
572 	return tm;
573 }
574 
575 static noinline int
576 tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
577 			struct extent_buffer *eb, int slot,
578 			enum mod_log_op op, gfp_t flags)
579 {
580 	struct tree_mod_elem *tm;
581 	int ret;
582 
583 	if (!tree_mod_need_log(fs_info, eb))
584 		return 0;
585 
586 	tm = alloc_tree_mod_elem(eb, slot, op, flags);
587 	if (!tm)
588 		return -ENOMEM;
589 
590 	if (tree_mod_dont_log(fs_info, eb)) {
591 		kfree(tm);
592 		return 0;
593 	}
594 
595 	ret = __tree_mod_log_insert(fs_info, tm);
596 	tree_mod_log_write_unlock(fs_info);
597 	if (ret)
598 		kfree(tm);
599 
600 	return ret;
601 }
602 
603 static noinline int
604 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
605 			 struct extent_buffer *eb, int dst_slot, int src_slot,
606 			 int nr_items, gfp_t flags)
607 {
608 	struct tree_mod_elem *tm = NULL;
609 	struct tree_mod_elem **tm_list = NULL;
610 	int ret = 0;
611 	int i;
612 	int locked = 0;
613 
614 	if (!tree_mod_need_log(fs_info, eb))
615 		return 0;
616 
617 	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
618 	if (!tm_list)
619 		return -ENOMEM;
620 
621 	tm = kzalloc(sizeof(*tm), flags);
622 	if (!tm) {
623 		ret = -ENOMEM;
624 		goto free_tms;
625 	}
626 
627 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
628 	tm->slot = src_slot;
629 	tm->move.dst_slot = dst_slot;
630 	tm->move.nr_items = nr_items;
631 	tm->op = MOD_LOG_MOVE_KEYS;
632 
633 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
634 		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
635 		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
636 		if (!tm_list[i]) {
637 			ret = -ENOMEM;
638 			goto free_tms;
639 		}
640 	}
641 
642 	if (tree_mod_dont_log(fs_info, eb))
643 		goto free_tms;
644 	locked = 1;
645 
646 	/*
647 	 * When we override something during the move, we log these removals.
648 	 * This can only happen when we move towards the beginning of the
649 	 * buffer, i.e. dst_slot < src_slot.
650 	 */
651 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
652 		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
653 		if (ret)
654 			goto free_tms;
655 	}
656 
657 	ret = __tree_mod_log_insert(fs_info, tm);
658 	if (ret)
659 		goto free_tms;
660 	tree_mod_log_write_unlock(fs_info);
661 	kfree(tm_list);
662 
663 	return 0;
664 free_tms:
665 	for (i = 0; i < nr_items; i++) {
666 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
667 			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
668 		kfree(tm_list[i]);
669 	}
670 	if (locked)
671 		tree_mod_log_write_unlock(fs_info);
672 	kfree(tm_list);
673 	kfree(tm);
674 
675 	return ret;
676 }
677 
678 static inline int
679 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
680 		       struct tree_mod_elem **tm_list,
681 		       int nritems)
682 {
683 	int i, j;
684 	int ret;
685 
686 	for (i = nritems - 1; i >= 0; i--) {
687 		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
688 		if (ret) {
689 			for (j = nritems - 1; j > i; j--)
690 				rb_erase(&tm_list[j]->node,
691 					 &fs_info->tree_mod_log);
692 			return ret;
693 		}
694 	}
695 
696 	return 0;
697 }
698 
699 static noinline int
700 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
701 			 struct extent_buffer *old_root,
702 			 struct extent_buffer *new_root, gfp_t flags,
703 			 int log_removal)
704 {
705 	struct tree_mod_elem *tm = NULL;
706 	struct tree_mod_elem **tm_list = NULL;
707 	int nritems = 0;
708 	int ret = 0;
709 	int i;
710 
711 	if (!tree_mod_need_log(fs_info, NULL))
712 		return 0;
713 
714 	if (log_removal && btrfs_header_level(old_root) > 0) {
715 		nritems = btrfs_header_nritems(old_root);
716 		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
717 				  flags);
718 		if (!tm_list) {
719 			ret = -ENOMEM;
720 			goto free_tms;
721 		}
722 		for (i = 0; i < nritems; i++) {
723 			tm_list[i] = alloc_tree_mod_elem(old_root, i,
724 			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
725 			if (!tm_list[i]) {
726 				ret = -ENOMEM;
727 				goto free_tms;
728 			}
729 		}
730 	}
731 
732 	tm = kzalloc(sizeof(*tm), flags);
733 	if (!tm) {
734 		ret = -ENOMEM;
735 		goto free_tms;
736 	}
737 
738 	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
739 	tm->old_root.logical = old_root->start;
740 	tm->old_root.level = btrfs_header_level(old_root);
741 	tm->generation = btrfs_header_generation(old_root);
742 	tm->op = MOD_LOG_ROOT_REPLACE;
743 
744 	if (tree_mod_dont_log(fs_info, NULL))
745 		goto free_tms;
746 
747 	if (tm_list)
748 		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
749 	if (!ret)
750 		ret = __tree_mod_log_insert(fs_info, tm);
751 
752 	tree_mod_log_write_unlock(fs_info);
753 	if (ret)
754 		goto free_tms;
755 	kfree(tm_list);
756 
757 	return ret;
758 
759 free_tms:
760 	if (tm_list) {
761 		for (i = 0; i < nritems; i++)
762 			kfree(tm_list[i]);
763 		kfree(tm_list);
764 	}
765 	kfree(tm);
766 
767 	return ret;
768 }
769 
770 static struct tree_mod_elem *
771 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
772 		      int smallest)
773 {
774 	struct rb_root *tm_root;
775 	struct rb_node *node;
776 	struct tree_mod_elem *cur = NULL;
777 	struct tree_mod_elem *found = NULL;
778 	u64 index = start >> PAGE_CACHE_SHIFT;
779 
780 	tree_mod_log_read_lock(fs_info);
781 	tm_root = &fs_info->tree_mod_log;
782 	node = tm_root->rb_node;
783 	while (node) {
784 		cur = container_of(node, struct tree_mod_elem, node);
785 		if (cur->index < index) {
786 			node = node->rb_left;
787 		} else if (cur->index > index) {
788 			node = node->rb_right;
789 		} else if (cur->seq < min_seq) {
790 			node = node->rb_left;
791 		} else if (!smallest) {
792 			/* we want the node with the highest seq */
793 			if (found)
794 				BUG_ON(found->seq > cur->seq);
795 			found = cur;
796 			node = node->rb_left;
797 		} else if (cur->seq > min_seq) {
798 			/* we want the node with the smallest seq */
799 			if (found)
800 				BUG_ON(found->seq < cur->seq);
801 			found = cur;
802 			node = node->rb_right;
803 		} else {
804 			found = cur;
805 			break;
806 		}
807 	}
808 	tree_mod_log_read_unlock(fs_info);
809 
810 	return found;
811 }
812 
813 /*
814  * this returns the element from the log with the smallest time sequence
815  * value that's in the log (the oldest log item). any element with a time
816  * sequence lower than min_seq will be ignored.
817  */
818 static struct tree_mod_elem *
819 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
820 			   u64 min_seq)
821 {
822 	return __tree_mod_log_search(fs_info, start, min_seq, 1);
823 }
824 
825 /*
826  * this returns the element from the log with the largest time sequence
827  * value that's in the log (the most recent log item). any element with
828  * a time sequence lower than min_seq will be ignored.
829  */
830 static struct tree_mod_elem *
831 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
832 {
833 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
834 }
835 
836 static noinline int
837 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
838 		     struct extent_buffer *src, unsigned long dst_offset,
839 		     unsigned long src_offset, int nr_items)
840 {
841 	int ret = 0;
842 	struct tree_mod_elem **tm_list = NULL;
843 	struct tree_mod_elem **tm_list_add, **tm_list_rem;
844 	int i;
845 	int locked = 0;
846 
847 	if (!tree_mod_need_log(fs_info, NULL))
848 		return 0;
849 
850 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
851 		return 0;
852 
853 	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
854 			  GFP_NOFS);
855 	if (!tm_list)
856 		return -ENOMEM;
857 
858 	tm_list_add = tm_list;
859 	tm_list_rem = tm_list + nr_items;
860 	for (i = 0; i < nr_items; i++) {
861 		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
862 		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
863 		if (!tm_list_rem[i]) {
864 			ret = -ENOMEM;
865 			goto free_tms;
866 		}
867 
868 		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
869 		    MOD_LOG_KEY_ADD, GFP_NOFS);
870 		if (!tm_list_add[i]) {
871 			ret = -ENOMEM;
872 			goto free_tms;
873 		}
874 	}
875 
876 	if (tree_mod_dont_log(fs_info, NULL))
877 		goto free_tms;
878 	locked = 1;
879 
880 	for (i = 0; i < nr_items; i++) {
881 		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
882 		if (ret)
883 			goto free_tms;
884 		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
885 		if (ret)
886 			goto free_tms;
887 	}
888 
889 	tree_mod_log_write_unlock(fs_info);
890 	kfree(tm_list);
891 
892 	return 0;
893 
894 free_tms:
895 	for (i = 0; i < nr_items * 2; i++) {
896 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
897 			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
898 		kfree(tm_list[i]);
899 	}
900 	if (locked)
901 		tree_mod_log_write_unlock(fs_info);
902 	kfree(tm_list);
903 
904 	return ret;
905 }
906 
907 static inline void
908 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
909 		     int dst_offset, int src_offset, int nr_items)
910 {
911 	int ret;
912 	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
913 				       nr_items, GFP_NOFS);
914 	BUG_ON(ret < 0);
915 }
916 
917 static noinline void
918 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
919 			  struct extent_buffer *eb, int slot, int atomic)
920 {
921 	int ret;
922 
923 	ret = tree_mod_log_insert_key(fs_info, eb, slot,
924 					MOD_LOG_KEY_REPLACE,
925 					atomic ? GFP_ATOMIC : GFP_NOFS);
926 	BUG_ON(ret < 0);
927 }
928 
929 static noinline int
930 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
931 {
932 	struct tree_mod_elem **tm_list = NULL;
933 	int nritems = 0;
934 	int i;
935 	int ret = 0;
936 
937 	if (btrfs_header_level(eb) == 0)
938 		return 0;
939 
940 	if (!tree_mod_need_log(fs_info, NULL))
941 		return 0;
942 
943 	nritems = btrfs_header_nritems(eb);
944 	tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
945 			  GFP_NOFS);
946 	if (!tm_list)
947 		return -ENOMEM;
948 
949 	for (i = 0; i < nritems; i++) {
950 		tm_list[i] = alloc_tree_mod_elem(eb, i,
951 		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
952 		if (!tm_list[i]) {
953 			ret = -ENOMEM;
954 			goto free_tms;
955 		}
956 	}
957 
958 	if (tree_mod_dont_log(fs_info, eb))
959 		goto free_tms;
960 
961 	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
962 	tree_mod_log_write_unlock(fs_info);
963 	if (ret)
964 		goto free_tms;
965 	kfree(tm_list);
966 
967 	return 0;
968 
969 free_tms:
970 	for (i = 0; i < nritems; i++)
971 		kfree(tm_list[i]);
972 	kfree(tm_list);
973 
974 	return ret;
975 }
976 
977 static noinline void
978 tree_mod_log_set_root_pointer(struct btrfs_root *root,
979 			      struct extent_buffer *new_root_node,
980 			      int log_removal)
981 {
982 	int ret;
983 	ret = tree_mod_log_insert_root(root->fs_info, root->node,
984 				       new_root_node, GFP_NOFS, log_removal);
985 	BUG_ON(ret < 0);
986 }
987 
988 /*
989  * check if the tree block can be shared by multiple trees
990  */
991 int btrfs_block_can_be_shared(struct btrfs_root *root,
992 			      struct extent_buffer *buf)
993 {
994 	/*
995 	 * Tree blocks not in refernece counted trees and tree roots
996 	 * are never shared. If a block was allocated after the last
997 	 * snapshot and the block was not allocated by tree relocation,
998 	 * we know the block is not shared.
999 	 */
1000 	if (root->ref_cows &&
1001 	    buf != root->node && buf != root->commit_root &&
1002 	    (btrfs_header_generation(buf) <=
1003 	     btrfs_root_last_snapshot(&root->root_item) ||
1004 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
1005 		return 1;
1006 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1007 	if (root->ref_cows &&
1008 	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1009 		return 1;
1010 #endif
1011 	return 0;
1012 }
1013 
1014 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1015 				       struct btrfs_root *root,
1016 				       struct extent_buffer *buf,
1017 				       struct extent_buffer *cow,
1018 				       int *last_ref)
1019 {
1020 	u64 refs;
1021 	u64 owner;
1022 	u64 flags;
1023 	u64 new_flags = 0;
1024 	int ret;
1025 
1026 	/*
1027 	 * Backrefs update rules:
1028 	 *
1029 	 * Always use full backrefs for extent pointers in tree block
1030 	 * allocated by tree relocation.
1031 	 *
1032 	 * If a shared tree block is no longer referenced by its owner
1033 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
1034 	 * use full backrefs for extent pointers in tree block.
1035 	 *
1036 	 * If a tree block is been relocating
1037 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
1038 	 * use full backrefs for extent pointers in tree block.
1039 	 * The reason for this is some operations (such as drop tree)
1040 	 * are only allowed for blocks use full backrefs.
1041 	 */
1042 
1043 	if (btrfs_block_can_be_shared(root, buf)) {
1044 		ret = btrfs_lookup_extent_info(trans, root, buf->start,
1045 					       btrfs_header_level(buf), 1,
1046 					       &refs, &flags);
1047 		if (ret)
1048 			return ret;
1049 		if (refs == 0) {
1050 			ret = -EROFS;
1051 			btrfs_std_error(root->fs_info, ret);
1052 			return ret;
1053 		}
1054 	} else {
1055 		refs = 1;
1056 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1057 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1058 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
1059 		else
1060 			flags = 0;
1061 	}
1062 
1063 	owner = btrfs_header_owner(buf);
1064 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
1065 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
1066 
1067 	if (refs > 1) {
1068 		if ((owner == root->root_key.objectid ||
1069 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1070 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1071 			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
1072 			BUG_ON(ret); /* -ENOMEM */
1073 
1074 			if (root->root_key.objectid ==
1075 			    BTRFS_TREE_RELOC_OBJECTID) {
1076 				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
1077 				BUG_ON(ret); /* -ENOMEM */
1078 				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
1079 				BUG_ON(ret); /* -ENOMEM */
1080 			}
1081 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
1082 		} else {
1083 
1084 			if (root->root_key.objectid ==
1085 			    BTRFS_TREE_RELOC_OBJECTID)
1086 				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
1087 			else
1088 				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
1089 			BUG_ON(ret); /* -ENOMEM */
1090 		}
1091 		if (new_flags != 0) {
1092 			int level = btrfs_header_level(buf);
1093 
1094 			ret = btrfs_set_disk_extent_flags(trans, root,
1095 							  buf->start,
1096 							  buf->len,
1097 							  new_flags, level, 0);
1098 			if (ret)
1099 				return ret;
1100 		}
1101 	} else {
1102 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1103 			if (root->root_key.objectid ==
1104 			    BTRFS_TREE_RELOC_OBJECTID)
1105 				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
1106 			else
1107 				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
1108 			BUG_ON(ret); /* -ENOMEM */
1109 			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
1110 			BUG_ON(ret); /* -ENOMEM */
1111 		}
1112 		clean_tree_block(trans, root, buf);
1113 		*last_ref = 1;
1114 	}
1115 	return 0;
1116 }
1117 
1118 /*
1119  * does the dirty work in cow of a single block.  The parent block (if
1120  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1121  * dirty and returned locked.  If you modify the block it needs to be marked
1122  * dirty again.
1123  *
1124  * search_start -- an allocation hint for the new block
1125  *
1126  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1127  * bytes the allocator should try to find free next to the block it returns.
1128  * This is just a hint and may be ignored by the allocator.
1129  */
1130 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1131 			     struct btrfs_root *root,
1132 			     struct extent_buffer *buf,
1133 			     struct extent_buffer *parent, int parent_slot,
1134 			     struct extent_buffer **cow_ret,
1135 			     u64 search_start, u64 empty_size)
1136 {
1137 	struct btrfs_disk_key disk_key;
1138 	struct extent_buffer *cow;
1139 	int level, ret;
1140 	int last_ref = 0;
1141 	int unlock_orig = 0;
1142 	u64 parent_start;
1143 
1144 	if (*cow_ret == buf)
1145 		unlock_orig = 1;
1146 
1147 	btrfs_assert_tree_locked(buf);
1148 
1149 	WARN_ON(root->ref_cows && trans->transid !=
1150 		root->fs_info->running_transaction->transid);
1151 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
1152 
1153 	level = btrfs_header_level(buf);
1154 
1155 	if (level == 0)
1156 		btrfs_item_key(buf, &disk_key, 0);
1157 	else
1158 		btrfs_node_key(buf, &disk_key, 0);
1159 
1160 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1161 		if (parent)
1162 			parent_start = parent->start;
1163 		else
1164 			parent_start = 0;
1165 	} else
1166 		parent_start = 0;
1167 
1168 	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
1169 				     root->root_key.objectid, &disk_key,
1170 				     level, search_start, empty_size);
1171 	if (IS_ERR(cow))
1172 		return PTR_ERR(cow);
1173 
1174 	/* cow is set to blocking by btrfs_init_new_buffer */
1175 
1176 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
1177 	btrfs_set_header_bytenr(cow, cow->start);
1178 	btrfs_set_header_generation(cow, trans->transid);
1179 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1180 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1181 				     BTRFS_HEADER_FLAG_RELOC);
1182 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1183 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1184 	else
1185 		btrfs_set_header_owner(cow, root->root_key.objectid);
1186 
1187 	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
1188 			    BTRFS_FSID_SIZE);
1189 
1190 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1191 	if (ret) {
1192 		btrfs_abort_transaction(trans, root, ret);
1193 		return ret;
1194 	}
1195 
1196 	if (root->ref_cows) {
1197 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1198 		if (ret)
1199 			return ret;
1200 	}
1201 
1202 	if (buf == root->node) {
1203 		WARN_ON(parent && parent != buf);
1204 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1205 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1206 			parent_start = buf->start;
1207 		else
1208 			parent_start = 0;
1209 
1210 		extent_buffer_get(cow);
1211 		tree_mod_log_set_root_pointer(root, cow, 1);
1212 		rcu_assign_pointer(root->node, cow);
1213 
1214 		btrfs_free_tree_block(trans, root, buf, parent_start,
1215 				      last_ref);
1216 		free_extent_buffer(buf);
1217 		add_root_to_dirty_list(root);
1218 	} else {
1219 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1220 			parent_start = parent->start;
1221 		else
1222 			parent_start = 0;
1223 
1224 		WARN_ON(trans->transid != btrfs_header_generation(parent));
1225 		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1226 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1227 		btrfs_set_node_blockptr(parent, parent_slot,
1228 					cow->start);
1229 		btrfs_set_node_ptr_generation(parent, parent_slot,
1230 					      trans->transid);
1231 		btrfs_mark_buffer_dirty(parent);
1232 		if (last_ref) {
1233 			ret = tree_mod_log_free_eb(root->fs_info, buf);
1234 			if (ret) {
1235 				btrfs_abort_transaction(trans, root, ret);
1236 				return ret;
1237 			}
1238 		}
1239 		btrfs_free_tree_block(trans, root, buf, parent_start,
1240 				      last_ref);
1241 	}
1242 	if (unlock_orig)
1243 		btrfs_tree_unlock(buf);
1244 	free_extent_buffer_stale(buf);
1245 	btrfs_mark_buffer_dirty(cow);
1246 	*cow_ret = cow;
1247 	return 0;
1248 }
1249 
1250 /*
1251  * returns the logical address of the oldest predecessor of the given root.
1252  * entries older than time_seq are ignored.
1253  */
1254 static struct tree_mod_elem *
1255 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1256 			   struct extent_buffer *eb_root, u64 time_seq)
1257 {
1258 	struct tree_mod_elem *tm;
1259 	struct tree_mod_elem *found = NULL;
1260 	u64 root_logical = eb_root->start;
1261 	int looped = 0;
1262 
1263 	if (!time_seq)
1264 		return NULL;
1265 
1266 	/*
1267 	 * the very last operation that's logged for a root is the replacement
1268 	 * operation (if it is replaced at all). this has the index of the *new*
1269 	 * root, making it the very first operation that's logged for this root.
1270 	 */
1271 	while (1) {
1272 		tm = tree_mod_log_search_oldest(fs_info, root_logical,
1273 						time_seq);
1274 		if (!looped && !tm)
1275 			return NULL;
1276 		/*
1277 		 * if there are no tree operation for the oldest root, we simply
1278 		 * return it. this should only happen if that (old) root is at
1279 		 * level 0.
1280 		 */
1281 		if (!tm)
1282 			break;
1283 
1284 		/*
1285 		 * if there's an operation that's not a root replacement, we
1286 		 * found the oldest version of our root. normally, we'll find a
1287 		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1288 		 */
1289 		if (tm->op != MOD_LOG_ROOT_REPLACE)
1290 			break;
1291 
1292 		found = tm;
1293 		root_logical = tm->old_root.logical;
1294 		looped = 1;
1295 	}
1296 
1297 	/* if there's no old root to return, return what we found instead */
1298 	if (!found)
1299 		found = tm;
1300 
1301 	return found;
1302 }
1303 
1304 /*
1305  * tm is a pointer to the first operation to rewind within eb. then, all
1306  * previous operations will be rewinded (until we reach something older than
1307  * time_seq).
1308  */
1309 static void
1310 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1311 		      u64 time_seq, struct tree_mod_elem *first_tm)
1312 {
1313 	u32 n;
1314 	struct rb_node *next;
1315 	struct tree_mod_elem *tm = first_tm;
1316 	unsigned long o_dst;
1317 	unsigned long o_src;
1318 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
1319 
1320 	n = btrfs_header_nritems(eb);
1321 	tree_mod_log_read_lock(fs_info);
1322 	while (tm && tm->seq >= time_seq) {
1323 		/*
1324 		 * all the operations are recorded with the operator used for
1325 		 * the modification. as we're going backwards, we do the
1326 		 * opposite of each operation here.
1327 		 */
1328 		switch (tm->op) {
1329 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1330 			BUG_ON(tm->slot < n);
1331 			/* Fallthrough */
1332 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1333 		case MOD_LOG_KEY_REMOVE:
1334 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1335 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1336 			btrfs_set_node_ptr_generation(eb, tm->slot,
1337 						      tm->generation);
1338 			n++;
1339 			break;
1340 		case MOD_LOG_KEY_REPLACE:
1341 			BUG_ON(tm->slot >= n);
1342 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1343 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1344 			btrfs_set_node_ptr_generation(eb, tm->slot,
1345 						      tm->generation);
1346 			break;
1347 		case MOD_LOG_KEY_ADD:
1348 			/* if a move operation is needed it's in the log */
1349 			n--;
1350 			break;
1351 		case MOD_LOG_MOVE_KEYS:
1352 			o_dst = btrfs_node_key_ptr_offset(tm->slot);
1353 			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1354 			memmove_extent_buffer(eb, o_dst, o_src,
1355 					      tm->move.nr_items * p_size);
1356 			break;
1357 		case MOD_LOG_ROOT_REPLACE:
1358 			/*
1359 			 * this operation is special. for roots, this must be
1360 			 * handled explicitly before rewinding.
1361 			 * for non-roots, this operation may exist if the node
1362 			 * was a root: root A -> child B; then A gets empty and
1363 			 * B is promoted to the new root. in the mod log, we'll
1364 			 * have a root-replace operation for B, a tree block
1365 			 * that is no root. we simply ignore that operation.
1366 			 */
1367 			break;
1368 		}
1369 		next = rb_next(&tm->node);
1370 		if (!next)
1371 			break;
1372 		tm = container_of(next, struct tree_mod_elem, node);
1373 		if (tm->index != first_tm->index)
1374 			break;
1375 	}
1376 	tree_mod_log_read_unlock(fs_info);
1377 	btrfs_set_header_nritems(eb, n);
1378 }
1379 
1380 /*
1381  * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1382  * is returned. If rewind operations happen, a fresh buffer is returned. The
1383  * returned buffer is always read-locked. If the returned buffer is not the
1384  * input buffer, the lock on the input buffer is released and the input buffer
1385  * is freed (its refcount is decremented).
1386  */
1387 static struct extent_buffer *
1388 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1389 		    struct extent_buffer *eb, u64 time_seq)
1390 {
1391 	struct extent_buffer *eb_rewin;
1392 	struct tree_mod_elem *tm;
1393 
1394 	if (!time_seq)
1395 		return eb;
1396 
1397 	if (btrfs_header_level(eb) == 0)
1398 		return eb;
1399 
1400 	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1401 	if (!tm)
1402 		return eb;
1403 
1404 	btrfs_set_path_blocking(path);
1405 	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1406 
1407 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1408 		BUG_ON(tm->slot != 0);
1409 		eb_rewin = alloc_dummy_extent_buffer(eb->start,
1410 						fs_info->tree_root->nodesize);
1411 		if (!eb_rewin) {
1412 			btrfs_tree_read_unlock_blocking(eb);
1413 			free_extent_buffer(eb);
1414 			return NULL;
1415 		}
1416 		btrfs_set_header_bytenr(eb_rewin, eb->start);
1417 		btrfs_set_header_backref_rev(eb_rewin,
1418 					     btrfs_header_backref_rev(eb));
1419 		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1420 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1421 	} else {
1422 		eb_rewin = btrfs_clone_extent_buffer(eb);
1423 		if (!eb_rewin) {
1424 			btrfs_tree_read_unlock_blocking(eb);
1425 			free_extent_buffer(eb);
1426 			return NULL;
1427 		}
1428 	}
1429 
1430 	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1431 	btrfs_tree_read_unlock_blocking(eb);
1432 	free_extent_buffer(eb);
1433 
1434 	extent_buffer_get(eb_rewin);
1435 	btrfs_tree_read_lock(eb_rewin);
1436 	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1437 	WARN_ON(btrfs_header_nritems(eb_rewin) >
1438 		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1439 
1440 	return eb_rewin;
1441 }
1442 
1443 /*
1444  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1445  * value. If there are no changes, the current root->root_node is returned. If
1446  * anything changed in between, there's a fresh buffer allocated on which the
1447  * rewind operations are done. In any case, the returned buffer is read locked.
1448  * Returns NULL on error (with no locks held).
1449  */
1450 static inline struct extent_buffer *
1451 get_old_root(struct btrfs_root *root, u64 time_seq)
1452 {
1453 	struct tree_mod_elem *tm;
1454 	struct extent_buffer *eb = NULL;
1455 	struct extent_buffer *eb_root;
1456 	struct extent_buffer *old;
1457 	struct tree_mod_root *old_root = NULL;
1458 	u64 old_generation = 0;
1459 	u64 logical;
1460 	u32 blocksize;
1461 
1462 	eb_root = btrfs_read_lock_root_node(root);
1463 	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1464 	if (!tm)
1465 		return eb_root;
1466 
1467 	if (tm->op == MOD_LOG_ROOT_REPLACE) {
1468 		old_root = &tm->old_root;
1469 		old_generation = tm->generation;
1470 		logical = old_root->logical;
1471 	} else {
1472 		logical = eb_root->start;
1473 	}
1474 
1475 	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1476 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1477 		btrfs_tree_read_unlock(eb_root);
1478 		free_extent_buffer(eb_root);
1479 		blocksize = btrfs_level_size(root, old_root->level);
1480 		old = read_tree_block(root, logical, blocksize, 0);
1481 		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1482 			free_extent_buffer(old);
1483 			btrfs_warn(root->fs_info,
1484 				"failed to read tree block %llu from get_old_root", logical);
1485 		} else {
1486 			eb = btrfs_clone_extent_buffer(old);
1487 			free_extent_buffer(old);
1488 		}
1489 	} else if (old_root) {
1490 		btrfs_tree_read_unlock(eb_root);
1491 		free_extent_buffer(eb_root);
1492 		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1493 	} else {
1494 		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1495 		eb = btrfs_clone_extent_buffer(eb_root);
1496 		btrfs_tree_read_unlock_blocking(eb_root);
1497 		free_extent_buffer(eb_root);
1498 	}
1499 
1500 	if (!eb)
1501 		return NULL;
1502 	extent_buffer_get(eb);
1503 	btrfs_tree_read_lock(eb);
1504 	if (old_root) {
1505 		btrfs_set_header_bytenr(eb, eb->start);
1506 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1507 		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1508 		btrfs_set_header_level(eb, old_root->level);
1509 		btrfs_set_header_generation(eb, old_generation);
1510 	}
1511 	if (tm)
1512 		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1513 	else
1514 		WARN_ON(btrfs_header_level(eb) != 0);
1515 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1516 
1517 	return eb;
1518 }
1519 
1520 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1521 {
1522 	struct tree_mod_elem *tm;
1523 	int level;
1524 	struct extent_buffer *eb_root = btrfs_root_node(root);
1525 
1526 	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1527 	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1528 		level = tm->old_root.level;
1529 	} else {
1530 		level = btrfs_header_level(eb_root);
1531 	}
1532 	free_extent_buffer(eb_root);
1533 
1534 	return level;
1535 }
1536 
1537 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1538 				   struct btrfs_root *root,
1539 				   struct extent_buffer *buf)
1540 {
1541 	/* ensure we can see the force_cow */
1542 	smp_rmb();
1543 
1544 	/*
1545 	 * We do not need to cow a block if
1546 	 * 1) this block is not created or changed in this transaction;
1547 	 * 2) this block does not belong to TREE_RELOC tree;
1548 	 * 3) the root is not forced COW.
1549 	 *
1550 	 * What is forced COW:
1551 	 *    when we create snapshot during commiting the transaction,
1552 	 *    after we've finished coping src root, we must COW the shared
1553 	 *    block to ensure the metadata consistency.
1554 	 */
1555 	if (btrfs_header_generation(buf) == trans->transid &&
1556 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1557 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1558 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1559 	    !root->force_cow)
1560 		return 0;
1561 	return 1;
1562 }
1563 
1564 /*
1565  * cows a single block, see __btrfs_cow_block for the real work.
1566  * This version of it has extra checks so that a block isn't cow'd more than
1567  * once per transaction, as long as it hasn't been written yet
1568  */
1569 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1570 		    struct btrfs_root *root, struct extent_buffer *buf,
1571 		    struct extent_buffer *parent, int parent_slot,
1572 		    struct extent_buffer **cow_ret)
1573 {
1574 	u64 search_start;
1575 	int ret;
1576 
1577 	if (trans->transaction != root->fs_info->running_transaction)
1578 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1579 		       trans->transid,
1580 		       root->fs_info->running_transaction->transid);
1581 
1582 	if (trans->transid != root->fs_info->generation)
1583 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1584 		       trans->transid, root->fs_info->generation);
1585 
1586 	if (!should_cow_block(trans, root, buf)) {
1587 		*cow_ret = buf;
1588 		return 0;
1589 	}
1590 
1591 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1592 
1593 	if (parent)
1594 		btrfs_set_lock_blocking(parent);
1595 	btrfs_set_lock_blocking(buf);
1596 
1597 	ret = __btrfs_cow_block(trans, root, buf, parent,
1598 				 parent_slot, cow_ret, search_start, 0);
1599 
1600 	trace_btrfs_cow_block(root, buf, *cow_ret);
1601 
1602 	return ret;
1603 }
1604 
1605 /*
1606  * helper function for defrag to decide if two blocks pointed to by a
1607  * node are actually close by
1608  */
1609 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1610 {
1611 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1612 		return 1;
1613 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1614 		return 1;
1615 	return 0;
1616 }
1617 
1618 /*
1619  * compare two keys in a memcmp fashion
1620  */
1621 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1622 {
1623 	struct btrfs_key k1;
1624 
1625 	btrfs_disk_key_to_cpu(&k1, disk);
1626 
1627 	return btrfs_comp_cpu_keys(&k1, k2);
1628 }
1629 
1630 /*
1631  * same as comp_keys only with two btrfs_key's
1632  */
1633 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1634 {
1635 	if (k1->objectid > k2->objectid)
1636 		return 1;
1637 	if (k1->objectid < k2->objectid)
1638 		return -1;
1639 	if (k1->type > k2->type)
1640 		return 1;
1641 	if (k1->type < k2->type)
1642 		return -1;
1643 	if (k1->offset > k2->offset)
1644 		return 1;
1645 	if (k1->offset < k2->offset)
1646 		return -1;
1647 	return 0;
1648 }
1649 
1650 /*
1651  * this is used by the defrag code to go through all the
1652  * leaves pointed to by a node and reallocate them so that
1653  * disk order is close to key order
1654  */
1655 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1656 		       struct btrfs_root *root, struct extent_buffer *parent,
1657 		       int start_slot, u64 *last_ret,
1658 		       struct btrfs_key *progress)
1659 {
1660 	struct extent_buffer *cur;
1661 	u64 blocknr;
1662 	u64 gen;
1663 	u64 search_start = *last_ret;
1664 	u64 last_block = 0;
1665 	u64 other;
1666 	u32 parent_nritems;
1667 	int end_slot;
1668 	int i;
1669 	int err = 0;
1670 	int parent_level;
1671 	int uptodate;
1672 	u32 blocksize;
1673 	int progress_passed = 0;
1674 	struct btrfs_disk_key disk_key;
1675 
1676 	parent_level = btrfs_header_level(parent);
1677 
1678 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
1679 	WARN_ON(trans->transid != root->fs_info->generation);
1680 
1681 	parent_nritems = btrfs_header_nritems(parent);
1682 	blocksize = btrfs_level_size(root, parent_level - 1);
1683 	end_slot = parent_nritems;
1684 
1685 	if (parent_nritems == 1)
1686 		return 0;
1687 
1688 	btrfs_set_lock_blocking(parent);
1689 
1690 	for (i = start_slot; i < end_slot; i++) {
1691 		int close = 1;
1692 
1693 		btrfs_node_key(parent, &disk_key, i);
1694 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1695 			continue;
1696 
1697 		progress_passed = 1;
1698 		blocknr = btrfs_node_blockptr(parent, i);
1699 		gen = btrfs_node_ptr_generation(parent, i);
1700 		if (last_block == 0)
1701 			last_block = blocknr;
1702 
1703 		if (i > 0) {
1704 			other = btrfs_node_blockptr(parent, i - 1);
1705 			close = close_blocks(blocknr, other, blocksize);
1706 		}
1707 		if (!close && i < end_slot - 2) {
1708 			other = btrfs_node_blockptr(parent, i + 1);
1709 			close = close_blocks(blocknr, other, blocksize);
1710 		}
1711 		if (close) {
1712 			last_block = blocknr;
1713 			continue;
1714 		}
1715 
1716 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
1717 		if (cur)
1718 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1719 		else
1720 			uptodate = 0;
1721 		if (!cur || !uptodate) {
1722 			if (!cur) {
1723 				cur = read_tree_block(root, blocknr,
1724 							 blocksize, gen);
1725 				if (!cur || !extent_buffer_uptodate(cur)) {
1726 					free_extent_buffer(cur);
1727 					return -EIO;
1728 				}
1729 			} else if (!uptodate) {
1730 				err = btrfs_read_buffer(cur, gen);
1731 				if (err) {
1732 					free_extent_buffer(cur);
1733 					return err;
1734 				}
1735 			}
1736 		}
1737 		if (search_start == 0)
1738 			search_start = last_block;
1739 
1740 		btrfs_tree_lock(cur);
1741 		btrfs_set_lock_blocking(cur);
1742 		err = __btrfs_cow_block(trans, root, cur, parent, i,
1743 					&cur, search_start,
1744 					min(16 * blocksize,
1745 					    (end_slot - i) * blocksize));
1746 		if (err) {
1747 			btrfs_tree_unlock(cur);
1748 			free_extent_buffer(cur);
1749 			break;
1750 		}
1751 		search_start = cur->start;
1752 		last_block = cur->start;
1753 		*last_ret = search_start;
1754 		btrfs_tree_unlock(cur);
1755 		free_extent_buffer(cur);
1756 	}
1757 	return err;
1758 }
1759 
1760 /*
1761  * The leaf data grows from end-to-front in the node.
1762  * this returns the address of the start of the last item,
1763  * which is the stop of the leaf data stack
1764  */
1765 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1766 					 struct extent_buffer *leaf)
1767 {
1768 	u32 nr = btrfs_header_nritems(leaf);
1769 	if (nr == 0)
1770 		return BTRFS_LEAF_DATA_SIZE(root);
1771 	return btrfs_item_offset_nr(leaf, nr - 1);
1772 }
1773 
1774 
1775 /*
1776  * search for key in the extent_buffer.  The items start at offset p,
1777  * and they are item_size apart.  There are 'max' items in p.
1778  *
1779  * the slot in the array is returned via slot, and it points to
1780  * the place where you would insert key if it is not found in
1781  * the array.
1782  *
1783  * slot may point to max if the key is bigger than all of the keys
1784  */
1785 static noinline int generic_bin_search(struct extent_buffer *eb,
1786 				       unsigned long p,
1787 				       int item_size, struct btrfs_key *key,
1788 				       int max, int *slot)
1789 {
1790 	int low = 0;
1791 	int high = max;
1792 	int mid;
1793 	int ret;
1794 	struct btrfs_disk_key *tmp = NULL;
1795 	struct btrfs_disk_key unaligned;
1796 	unsigned long offset;
1797 	char *kaddr = NULL;
1798 	unsigned long map_start = 0;
1799 	unsigned long map_len = 0;
1800 	int err;
1801 
1802 	while (low < high) {
1803 		mid = (low + high) / 2;
1804 		offset = p + mid * item_size;
1805 
1806 		if (!kaddr || offset < map_start ||
1807 		    (offset + sizeof(struct btrfs_disk_key)) >
1808 		    map_start + map_len) {
1809 
1810 			err = map_private_extent_buffer(eb, offset,
1811 						sizeof(struct btrfs_disk_key),
1812 						&kaddr, &map_start, &map_len);
1813 
1814 			if (!err) {
1815 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
1816 							map_start);
1817 			} else {
1818 				read_extent_buffer(eb, &unaligned,
1819 						   offset, sizeof(unaligned));
1820 				tmp = &unaligned;
1821 			}
1822 
1823 		} else {
1824 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
1825 							map_start);
1826 		}
1827 		ret = comp_keys(tmp, key);
1828 
1829 		if (ret < 0)
1830 			low = mid + 1;
1831 		else if (ret > 0)
1832 			high = mid;
1833 		else {
1834 			*slot = mid;
1835 			return 0;
1836 		}
1837 	}
1838 	*slot = low;
1839 	return 1;
1840 }
1841 
1842 /*
1843  * simple bin_search frontend that does the right thing for
1844  * leaves vs nodes
1845  */
1846 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1847 		      int level, int *slot)
1848 {
1849 	if (level == 0)
1850 		return generic_bin_search(eb,
1851 					  offsetof(struct btrfs_leaf, items),
1852 					  sizeof(struct btrfs_item),
1853 					  key, btrfs_header_nritems(eb),
1854 					  slot);
1855 	else
1856 		return generic_bin_search(eb,
1857 					  offsetof(struct btrfs_node, ptrs),
1858 					  sizeof(struct btrfs_key_ptr),
1859 					  key, btrfs_header_nritems(eb),
1860 					  slot);
1861 }
1862 
1863 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1864 		     int level, int *slot)
1865 {
1866 	return bin_search(eb, key, level, slot);
1867 }
1868 
1869 static void root_add_used(struct btrfs_root *root, u32 size)
1870 {
1871 	spin_lock(&root->accounting_lock);
1872 	btrfs_set_root_used(&root->root_item,
1873 			    btrfs_root_used(&root->root_item) + size);
1874 	spin_unlock(&root->accounting_lock);
1875 }
1876 
1877 static void root_sub_used(struct btrfs_root *root, u32 size)
1878 {
1879 	spin_lock(&root->accounting_lock);
1880 	btrfs_set_root_used(&root->root_item,
1881 			    btrfs_root_used(&root->root_item) - size);
1882 	spin_unlock(&root->accounting_lock);
1883 }
1884 
1885 /* given a node and slot number, this reads the blocks it points to.  The
1886  * extent buffer is returned with a reference taken (but unlocked).
1887  * NULL is returned on error.
1888  */
1889 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1890 				   struct extent_buffer *parent, int slot)
1891 {
1892 	int level = btrfs_header_level(parent);
1893 	struct extent_buffer *eb;
1894 
1895 	if (slot < 0)
1896 		return NULL;
1897 	if (slot >= btrfs_header_nritems(parent))
1898 		return NULL;
1899 
1900 	BUG_ON(level == 0);
1901 
1902 	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1903 			     btrfs_level_size(root, level - 1),
1904 			     btrfs_node_ptr_generation(parent, slot));
1905 	if (eb && !extent_buffer_uptodate(eb)) {
1906 		free_extent_buffer(eb);
1907 		eb = NULL;
1908 	}
1909 
1910 	return eb;
1911 }
1912 
1913 /*
1914  * node level balancing, used to make sure nodes are in proper order for
1915  * item deletion.  We balance from the top down, so we have to make sure
1916  * that a deletion won't leave an node completely empty later on.
1917  */
1918 static noinline int balance_level(struct btrfs_trans_handle *trans,
1919 			 struct btrfs_root *root,
1920 			 struct btrfs_path *path, int level)
1921 {
1922 	struct extent_buffer *right = NULL;
1923 	struct extent_buffer *mid;
1924 	struct extent_buffer *left = NULL;
1925 	struct extent_buffer *parent = NULL;
1926 	int ret = 0;
1927 	int wret;
1928 	int pslot;
1929 	int orig_slot = path->slots[level];
1930 	u64 orig_ptr;
1931 
1932 	if (level == 0)
1933 		return 0;
1934 
1935 	mid = path->nodes[level];
1936 
1937 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1938 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1939 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1940 
1941 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1942 
1943 	if (level < BTRFS_MAX_LEVEL - 1) {
1944 		parent = path->nodes[level + 1];
1945 		pslot = path->slots[level + 1];
1946 	}
1947 
1948 	/*
1949 	 * deal with the case where there is only one pointer in the root
1950 	 * by promoting the node below to a root
1951 	 */
1952 	if (!parent) {
1953 		struct extent_buffer *child;
1954 
1955 		if (btrfs_header_nritems(mid) != 1)
1956 			return 0;
1957 
1958 		/* promote the child to a root */
1959 		child = read_node_slot(root, mid, 0);
1960 		if (!child) {
1961 			ret = -EROFS;
1962 			btrfs_std_error(root->fs_info, ret);
1963 			goto enospc;
1964 		}
1965 
1966 		btrfs_tree_lock(child);
1967 		btrfs_set_lock_blocking(child);
1968 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1969 		if (ret) {
1970 			btrfs_tree_unlock(child);
1971 			free_extent_buffer(child);
1972 			goto enospc;
1973 		}
1974 
1975 		tree_mod_log_set_root_pointer(root, child, 1);
1976 		rcu_assign_pointer(root->node, child);
1977 
1978 		add_root_to_dirty_list(root);
1979 		btrfs_tree_unlock(child);
1980 
1981 		path->locks[level] = 0;
1982 		path->nodes[level] = NULL;
1983 		clean_tree_block(trans, root, mid);
1984 		btrfs_tree_unlock(mid);
1985 		/* once for the path */
1986 		free_extent_buffer(mid);
1987 
1988 		root_sub_used(root, mid->len);
1989 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1990 		/* once for the root ptr */
1991 		free_extent_buffer_stale(mid);
1992 		return 0;
1993 	}
1994 	if (btrfs_header_nritems(mid) >
1995 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1996 		return 0;
1997 
1998 	left = read_node_slot(root, parent, pslot - 1);
1999 	if (left) {
2000 		btrfs_tree_lock(left);
2001 		btrfs_set_lock_blocking(left);
2002 		wret = btrfs_cow_block(trans, root, left,
2003 				       parent, pslot - 1, &left);
2004 		if (wret) {
2005 			ret = wret;
2006 			goto enospc;
2007 		}
2008 	}
2009 	right = read_node_slot(root, parent, pslot + 1);
2010 	if (right) {
2011 		btrfs_tree_lock(right);
2012 		btrfs_set_lock_blocking(right);
2013 		wret = btrfs_cow_block(trans, root, right,
2014 				       parent, pslot + 1, &right);
2015 		if (wret) {
2016 			ret = wret;
2017 			goto enospc;
2018 		}
2019 	}
2020 
2021 	/* first, try to make some room in the middle buffer */
2022 	if (left) {
2023 		orig_slot += btrfs_header_nritems(left);
2024 		wret = push_node_left(trans, root, left, mid, 1);
2025 		if (wret < 0)
2026 			ret = wret;
2027 	}
2028 
2029 	/*
2030 	 * then try to empty the right most buffer into the middle
2031 	 */
2032 	if (right) {
2033 		wret = push_node_left(trans, root, mid, right, 1);
2034 		if (wret < 0 && wret != -ENOSPC)
2035 			ret = wret;
2036 		if (btrfs_header_nritems(right) == 0) {
2037 			clean_tree_block(trans, root, right);
2038 			btrfs_tree_unlock(right);
2039 			del_ptr(root, path, level + 1, pslot + 1);
2040 			root_sub_used(root, right->len);
2041 			btrfs_free_tree_block(trans, root, right, 0, 1);
2042 			free_extent_buffer_stale(right);
2043 			right = NULL;
2044 		} else {
2045 			struct btrfs_disk_key right_key;
2046 			btrfs_node_key(right, &right_key, 0);
2047 			tree_mod_log_set_node_key(root->fs_info, parent,
2048 						  pslot + 1, 0);
2049 			btrfs_set_node_key(parent, &right_key, pslot + 1);
2050 			btrfs_mark_buffer_dirty(parent);
2051 		}
2052 	}
2053 	if (btrfs_header_nritems(mid) == 1) {
2054 		/*
2055 		 * we're not allowed to leave a node with one item in the
2056 		 * tree during a delete.  A deletion from lower in the tree
2057 		 * could try to delete the only pointer in this node.
2058 		 * So, pull some keys from the left.
2059 		 * There has to be a left pointer at this point because
2060 		 * otherwise we would have pulled some pointers from the
2061 		 * right
2062 		 */
2063 		if (!left) {
2064 			ret = -EROFS;
2065 			btrfs_std_error(root->fs_info, ret);
2066 			goto enospc;
2067 		}
2068 		wret = balance_node_right(trans, root, mid, left);
2069 		if (wret < 0) {
2070 			ret = wret;
2071 			goto enospc;
2072 		}
2073 		if (wret == 1) {
2074 			wret = push_node_left(trans, root, left, mid, 1);
2075 			if (wret < 0)
2076 				ret = wret;
2077 		}
2078 		BUG_ON(wret == 1);
2079 	}
2080 	if (btrfs_header_nritems(mid) == 0) {
2081 		clean_tree_block(trans, root, mid);
2082 		btrfs_tree_unlock(mid);
2083 		del_ptr(root, path, level + 1, pslot);
2084 		root_sub_used(root, mid->len);
2085 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2086 		free_extent_buffer_stale(mid);
2087 		mid = NULL;
2088 	} else {
2089 		/* update the parent key to reflect our changes */
2090 		struct btrfs_disk_key mid_key;
2091 		btrfs_node_key(mid, &mid_key, 0);
2092 		tree_mod_log_set_node_key(root->fs_info, parent,
2093 					  pslot, 0);
2094 		btrfs_set_node_key(parent, &mid_key, pslot);
2095 		btrfs_mark_buffer_dirty(parent);
2096 	}
2097 
2098 	/* update the path */
2099 	if (left) {
2100 		if (btrfs_header_nritems(left) > orig_slot) {
2101 			extent_buffer_get(left);
2102 			/* left was locked after cow */
2103 			path->nodes[level] = left;
2104 			path->slots[level + 1] -= 1;
2105 			path->slots[level] = orig_slot;
2106 			if (mid) {
2107 				btrfs_tree_unlock(mid);
2108 				free_extent_buffer(mid);
2109 			}
2110 		} else {
2111 			orig_slot -= btrfs_header_nritems(left);
2112 			path->slots[level] = orig_slot;
2113 		}
2114 	}
2115 	/* double check we haven't messed things up */
2116 	if (orig_ptr !=
2117 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2118 		BUG();
2119 enospc:
2120 	if (right) {
2121 		btrfs_tree_unlock(right);
2122 		free_extent_buffer(right);
2123 	}
2124 	if (left) {
2125 		if (path->nodes[level] != left)
2126 			btrfs_tree_unlock(left);
2127 		free_extent_buffer(left);
2128 	}
2129 	return ret;
2130 }
2131 
2132 /* Node balancing for insertion.  Here we only split or push nodes around
2133  * when they are completely full.  This is also done top down, so we
2134  * have to be pessimistic.
2135  */
2136 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2137 					  struct btrfs_root *root,
2138 					  struct btrfs_path *path, int level)
2139 {
2140 	struct extent_buffer *right = NULL;
2141 	struct extent_buffer *mid;
2142 	struct extent_buffer *left = NULL;
2143 	struct extent_buffer *parent = NULL;
2144 	int ret = 0;
2145 	int wret;
2146 	int pslot;
2147 	int orig_slot = path->slots[level];
2148 
2149 	if (level == 0)
2150 		return 1;
2151 
2152 	mid = path->nodes[level];
2153 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2154 
2155 	if (level < BTRFS_MAX_LEVEL - 1) {
2156 		parent = path->nodes[level + 1];
2157 		pslot = path->slots[level + 1];
2158 	}
2159 
2160 	if (!parent)
2161 		return 1;
2162 
2163 	left = read_node_slot(root, parent, pslot - 1);
2164 
2165 	/* first, try to make some room in the middle buffer */
2166 	if (left) {
2167 		u32 left_nr;
2168 
2169 		btrfs_tree_lock(left);
2170 		btrfs_set_lock_blocking(left);
2171 
2172 		left_nr = btrfs_header_nritems(left);
2173 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2174 			wret = 1;
2175 		} else {
2176 			ret = btrfs_cow_block(trans, root, left, parent,
2177 					      pslot - 1, &left);
2178 			if (ret)
2179 				wret = 1;
2180 			else {
2181 				wret = push_node_left(trans, root,
2182 						      left, mid, 0);
2183 			}
2184 		}
2185 		if (wret < 0)
2186 			ret = wret;
2187 		if (wret == 0) {
2188 			struct btrfs_disk_key disk_key;
2189 			orig_slot += left_nr;
2190 			btrfs_node_key(mid, &disk_key, 0);
2191 			tree_mod_log_set_node_key(root->fs_info, parent,
2192 						  pslot, 0);
2193 			btrfs_set_node_key(parent, &disk_key, pslot);
2194 			btrfs_mark_buffer_dirty(parent);
2195 			if (btrfs_header_nritems(left) > orig_slot) {
2196 				path->nodes[level] = left;
2197 				path->slots[level + 1] -= 1;
2198 				path->slots[level] = orig_slot;
2199 				btrfs_tree_unlock(mid);
2200 				free_extent_buffer(mid);
2201 			} else {
2202 				orig_slot -=
2203 					btrfs_header_nritems(left);
2204 				path->slots[level] = orig_slot;
2205 				btrfs_tree_unlock(left);
2206 				free_extent_buffer(left);
2207 			}
2208 			return 0;
2209 		}
2210 		btrfs_tree_unlock(left);
2211 		free_extent_buffer(left);
2212 	}
2213 	right = read_node_slot(root, parent, pslot + 1);
2214 
2215 	/*
2216 	 * then try to empty the right most buffer into the middle
2217 	 */
2218 	if (right) {
2219 		u32 right_nr;
2220 
2221 		btrfs_tree_lock(right);
2222 		btrfs_set_lock_blocking(right);
2223 
2224 		right_nr = btrfs_header_nritems(right);
2225 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2226 			wret = 1;
2227 		} else {
2228 			ret = btrfs_cow_block(trans, root, right,
2229 					      parent, pslot + 1,
2230 					      &right);
2231 			if (ret)
2232 				wret = 1;
2233 			else {
2234 				wret = balance_node_right(trans, root,
2235 							  right, mid);
2236 			}
2237 		}
2238 		if (wret < 0)
2239 			ret = wret;
2240 		if (wret == 0) {
2241 			struct btrfs_disk_key disk_key;
2242 
2243 			btrfs_node_key(right, &disk_key, 0);
2244 			tree_mod_log_set_node_key(root->fs_info, parent,
2245 						  pslot + 1, 0);
2246 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2247 			btrfs_mark_buffer_dirty(parent);
2248 
2249 			if (btrfs_header_nritems(mid) <= orig_slot) {
2250 				path->nodes[level] = right;
2251 				path->slots[level + 1] += 1;
2252 				path->slots[level] = orig_slot -
2253 					btrfs_header_nritems(mid);
2254 				btrfs_tree_unlock(mid);
2255 				free_extent_buffer(mid);
2256 			} else {
2257 				btrfs_tree_unlock(right);
2258 				free_extent_buffer(right);
2259 			}
2260 			return 0;
2261 		}
2262 		btrfs_tree_unlock(right);
2263 		free_extent_buffer(right);
2264 	}
2265 	return 1;
2266 }
2267 
2268 /*
2269  * readahead one full node of leaves, finding things that are close
2270  * to the block in 'slot', and triggering ra on them.
2271  */
2272 static void reada_for_search(struct btrfs_root *root,
2273 			     struct btrfs_path *path,
2274 			     int level, int slot, u64 objectid)
2275 {
2276 	struct extent_buffer *node;
2277 	struct btrfs_disk_key disk_key;
2278 	u32 nritems;
2279 	u64 search;
2280 	u64 target;
2281 	u64 nread = 0;
2282 	u64 gen;
2283 	int direction = path->reada;
2284 	struct extent_buffer *eb;
2285 	u32 nr;
2286 	u32 blocksize;
2287 	u32 nscan = 0;
2288 
2289 	if (level != 1)
2290 		return;
2291 
2292 	if (!path->nodes[level])
2293 		return;
2294 
2295 	node = path->nodes[level];
2296 
2297 	search = btrfs_node_blockptr(node, slot);
2298 	blocksize = btrfs_level_size(root, level - 1);
2299 	eb = btrfs_find_tree_block(root, search, blocksize);
2300 	if (eb) {
2301 		free_extent_buffer(eb);
2302 		return;
2303 	}
2304 
2305 	target = search;
2306 
2307 	nritems = btrfs_header_nritems(node);
2308 	nr = slot;
2309 
2310 	while (1) {
2311 		if (direction < 0) {
2312 			if (nr == 0)
2313 				break;
2314 			nr--;
2315 		} else if (direction > 0) {
2316 			nr++;
2317 			if (nr >= nritems)
2318 				break;
2319 		}
2320 		if (path->reada < 0 && objectid) {
2321 			btrfs_node_key(node, &disk_key, nr);
2322 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2323 				break;
2324 		}
2325 		search = btrfs_node_blockptr(node, nr);
2326 		if ((search <= target && target - search <= 65536) ||
2327 		    (search > target && search - target <= 65536)) {
2328 			gen = btrfs_node_ptr_generation(node, nr);
2329 			readahead_tree_block(root, search, blocksize, gen);
2330 			nread += blocksize;
2331 		}
2332 		nscan++;
2333 		if ((nread > 65536 || nscan > 32))
2334 			break;
2335 	}
2336 }
2337 
2338 static noinline void reada_for_balance(struct btrfs_root *root,
2339 				       struct btrfs_path *path, int level)
2340 {
2341 	int slot;
2342 	int nritems;
2343 	struct extent_buffer *parent;
2344 	struct extent_buffer *eb;
2345 	u64 gen;
2346 	u64 block1 = 0;
2347 	u64 block2 = 0;
2348 	int blocksize;
2349 
2350 	parent = path->nodes[level + 1];
2351 	if (!parent)
2352 		return;
2353 
2354 	nritems = btrfs_header_nritems(parent);
2355 	slot = path->slots[level + 1];
2356 	blocksize = btrfs_level_size(root, level);
2357 
2358 	if (slot > 0) {
2359 		block1 = btrfs_node_blockptr(parent, slot - 1);
2360 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2361 		eb = btrfs_find_tree_block(root, block1, blocksize);
2362 		/*
2363 		 * if we get -eagain from btrfs_buffer_uptodate, we
2364 		 * don't want to return eagain here.  That will loop
2365 		 * forever
2366 		 */
2367 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2368 			block1 = 0;
2369 		free_extent_buffer(eb);
2370 	}
2371 	if (slot + 1 < nritems) {
2372 		block2 = btrfs_node_blockptr(parent, slot + 1);
2373 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2374 		eb = btrfs_find_tree_block(root, block2, blocksize);
2375 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2376 			block2 = 0;
2377 		free_extent_buffer(eb);
2378 	}
2379 
2380 	if (block1)
2381 		readahead_tree_block(root, block1, blocksize, 0);
2382 	if (block2)
2383 		readahead_tree_block(root, block2, blocksize, 0);
2384 }
2385 
2386 
2387 /*
2388  * when we walk down the tree, it is usually safe to unlock the higher layers
2389  * in the tree.  The exceptions are when our path goes through slot 0, because
2390  * operations on the tree might require changing key pointers higher up in the
2391  * tree.
2392  *
2393  * callers might also have set path->keep_locks, which tells this code to keep
2394  * the lock if the path points to the last slot in the block.  This is part of
2395  * walking through the tree, and selecting the next slot in the higher block.
2396  *
2397  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2398  * if lowest_unlock is 1, level 0 won't be unlocked
2399  */
2400 static noinline void unlock_up(struct btrfs_path *path, int level,
2401 			       int lowest_unlock, int min_write_lock_level,
2402 			       int *write_lock_level)
2403 {
2404 	int i;
2405 	int skip_level = level;
2406 	int no_skips = 0;
2407 	struct extent_buffer *t;
2408 
2409 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2410 		if (!path->nodes[i])
2411 			break;
2412 		if (!path->locks[i])
2413 			break;
2414 		if (!no_skips && path->slots[i] == 0) {
2415 			skip_level = i + 1;
2416 			continue;
2417 		}
2418 		if (!no_skips && path->keep_locks) {
2419 			u32 nritems;
2420 			t = path->nodes[i];
2421 			nritems = btrfs_header_nritems(t);
2422 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2423 				skip_level = i + 1;
2424 				continue;
2425 			}
2426 		}
2427 		if (skip_level < i && i >= lowest_unlock)
2428 			no_skips = 1;
2429 
2430 		t = path->nodes[i];
2431 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2432 			btrfs_tree_unlock_rw(t, path->locks[i]);
2433 			path->locks[i] = 0;
2434 			if (write_lock_level &&
2435 			    i > min_write_lock_level &&
2436 			    i <= *write_lock_level) {
2437 				*write_lock_level = i - 1;
2438 			}
2439 		}
2440 	}
2441 }
2442 
2443 /*
2444  * This releases any locks held in the path starting at level and
2445  * going all the way up to the root.
2446  *
2447  * btrfs_search_slot will keep the lock held on higher nodes in a few
2448  * corner cases, such as COW of the block at slot zero in the node.  This
2449  * ignores those rules, and it should only be called when there are no
2450  * more updates to be done higher up in the tree.
2451  */
2452 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2453 {
2454 	int i;
2455 
2456 	if (path->keep_locks)
2457 		return;
2458 
2459 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2460 		if (!path->nodes[i])
2461 			continue;
2462 		if (!path->locks[i])
2463 			continue;
2464 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2465 		path->locks[i] = 0;
2466 	}
2467 }
2468 
2469 /*
2470  * helper function for btrfs_search_slot.  The goal is to find a block
2471  * in cache without setting the path to blocking.  If we find the block
2472  * we return zero and the path is unchanged.
2473  *
2474  * If we can't find the block, we set the path blocking and do some
2475  * reada.  -EAGAIN is returned and the search must be repeated.
2476  */
2477 static int
2478 read_block_for_search(struct btrfs_trans_handle *trans,
2479 		       struct btrfs_root *root, struct btrfs_path *p,
2480 		       struct extent_buffer **eb_ret, int level, int slot,
2481 		       struct btrfs_key *key, u64 time_seq)
2482 {
2483 	u64 blocknr;
2484 	u64 gen;
2485 	u32 blocksize;
2486 	struct extent_buffer *b = *eb_ret;
2487 	struct extent_buffer *tmp;
2488 	int ret;
2489 
2490 	blocknr = btrfs_node_blockptr(b, slot);
2491 	gen = btrfs_node_ptr_generation(b, slot);
2492 	blocksize = btrfs_level_size(root, level - 1);
2493 
2494 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2495 	if (tmp) {
2496 		/* first we do an atomic uptodate check */
2497 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2498 			*eb_ret = tmp;
2499 			return 0;
2500 		}
2501 
2502 		/* the pages were up to date, but we failed
2503 		 * the generation number check.  Do a full
2504 		 * read for the generation number that is correct.
2505 		 * We must do this without dropping locks so
2506 		 * we can trust our generation number
2507 		 */
2508 		btrfs_set_path_blocking(p);
2509 
2510 		/* now we're allowed to do a blocking uptodate check */
2511 		ret = btrfs_read_buffer(tmp, gen);
2512 		if (!ret) {
2513 			*eb_ret = tmp;
2514 			return 0;
2515 		}
2516 		free_extent_buffer(tmp);
2517 		btrfs_release_path(p);
2518 		return -EIO;
2519 	}
2520 
2521 	/*
2522 	 * reduce lock contention at high levels
2523 	 * of the btree by dropping locks before
2524 	 * we read.  Don't release the lock on the current
2525 	 * level because we need to walk this node to figure
2526 	 * out which blocks to read.
2527 	 */
2528 	btrfs_unlock_up_safe(p, level + 1);
2529 	btrfs_set_path_blocking(p);
2530 
2531 	free_extent_buffer(tmp);
2532 	if (p->reada)
2533 		reada_for_search(root, p, level, slot, key->objectid);
2534 
2535 	btrfs_release_path(p);
2536 
2537 	ret = -EAGAIN;
2538 	tmp = read_tree_block(root, blocknr, blocksize, 0);
2539 	if (tmp) {
2540 		/*
2541 		 * If the read above didn't mark this buffer up to date,
2542 		 * it will never end up being up to date.  Set ret to EIO now
2543 		 * and give up so that our caller doesn't loop forever
2544 		 * on our EAGAINs.
2545 		 */
2546 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2547 			ret = -EIO;
2548 		free_extent_buffer(tmp);
2549 	}
2550 	return ret;
2551 }
2552 
2553 /*
2554  * helper function for btrfs_search_slot.  This does all of the checks
2555  * for node-level blocks and does any balancing required based on
2556  * the ins_len.
2557  *
2558  * If no extra work was required, zero is returned.  If we had to
2559  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2560  * start over
2561  */
2562 static int
2563 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2564 		       struct btrfs_root *root, struct btrfs_path *p,
2565 		       struct extent_buffer *b, int level, int ins_len,
2566 		       int *write_lock_level)
2567 {
2568 	int ret;
2569 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2570 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2571 		int sret;
2572 
2573 		if (*write_lock_level < level + 1) {
2574 			*write_lock_level = level + 1;
2575 			btrfs_release_path(p);
2576 			goto again;
2577 		}
2578 
2579 		btrfs_set_path_blocking(p);
2580 		reada_for_balance(root, p, level);
2581 		sret = split_node(trans, root, p, level);
2582 		btrfs_clear_path_blocking(p, NULL, 0);
2583 
2584 		BUG_ON(sret > 0);
2585 		if (sret) {
2586 			ret = sret;
2587 			goto done;
2588 		}
2589 		b = p->nodes[level];
2590 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2591 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2592 		int sret;
2593 
2594 		if (*write_lock_level < level + 1) {
2595 			*write_lock_level = level + 1;
2596 			btrfs_release_path(p);
2597 			goto again;
2598 		}
2599 
2600 		btrfs_set_path_blocking(p);
2601 		reada_for_balance(root, p, level);
2602 		sret = balance_level(trans, root, p, level);
2603 		btrfs_clear_path_blocking(p, NULL, 0);
2604 
2605 		if (sret) {
2606 			ret = sret;
2607 			goto done;
2608 		}
2609 		b = p->nodes[level];
2610 		if (!b) {
2611 			btrfs_release_path(p);
2612 			goto again;
2613 		}
2614 		BUG_ON(btrfs_header_nritems(b) == 1);
2615 	}
2616 	return 0;
2617 
2618 again:
2619 	ret = -EAGAIN;
2620 done:
2621 	return ret;
2622 }
2623 
2624 static void key_search_validate(struct extent_buffer *b,
2625 				struct btrfs_key *key,
2626 				int level)
2627 {
2628 #ifdef CONFIG_BTRFS_ASSERT
2629 	struct btrfs_disk_key disk_key;
2630 
2631 	btrfs_cpu_key_to_disk(&disk_key, key);
2632 
2633 	if (level == 0)
2634 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2635 		    offsetof(struct btrfs_leaf, items[0].key),
2636 		    sizeof(disk_key)));
2637 	else
2638 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2639 		    offsetof(struct btrfs_node, ptrs[0].key),
2640 		    sizeof(disk_key)));
2641 #endif
2642 }
2643 
2644 static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2645 		      int level, int *prev_cmp, int *slot)
2646 {
2647 	if (*prev_cmp != 0) {
2648 		*prev_cmp = bin_search(b, key, level, slot);
2649 		return *prev_cmp;
2650 	}
2651 
2652 	key_search_validate(b, key, level);
2653 	*slot = 0;
2654 
2655 	return 0;
2656 }
2657 
2658 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
2659 		u64 iobjectid, u64 ioff, u8 key_type,
2660 		struct btrfs_key *found_key)
2661 {
2662 	int ret;
2663 	struct btrfs_key key;
2664 	struct extent_buffer *eb;
2665 	struct btrfs_path *path;
2666 
2667 	key.type = key_type;
2668 	key.objectid = iobjectid;
2669 	key.offset = ioff;
2670 
2671 	if (found_path == NULL) {
2672 		path = btrfs_alloc_path();
2673 		if (!path)
2674 			return -ENOMEM;
2675 	} else
2676 		path = found_path;
2677 
2678 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2679 	if ((ret < 0) || (found_key == NULL)) {
2680 		if (path != found_path)
2681 			btrfs_free_path(path);
2682 		return ret;
2683 	}
2684 
2685 	eb = path->nodes[0];
2686 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2687 		ret = btrfs_next_leaf(fs_root, path);
2688 		if (ret)
2689 			return ret;
2690 		eb = path->nodes[0];
2691 	}
2692 
2693 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2694 	if (found_key->type != key.type ||
2695 			found_key->objectid != key.objectid)
2696 		return 1;
2697 
2698 	return 0;
2699 }
2700 
2701 /*
2702  * look for key in the tree.  path is filled in with nodes along the way
2703  * if key is found, we return zero and you can find the item in the leaf
2704  * level of the path (level 0)
2705  *
2706  * If the key isn't found, the path points to the slot where it should
2707  * be inserted, and 1 is returned.  If there are other errors during the
2708  * search a negative error number is returned.
2709  *
2710  * if ins_len > 0, nodes and leaves will be split as we walk down the
2711  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2712  * possible)
2713  */
2714 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2715 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
2716 		      ins_len, int cow)
2717 {
2718 	struct extent_buffer *b;
2719 	int slot;
2720 	int ret;
2721 	int err;
2722 	int level;
2723 	int lowest_unlock = 1;
2724 	int root_lock;
2725 	/* everything at write_lock_level or lower must be write locked */
2726 	int write_lock_level = 0;
2727 	u8 lowest_level = 0;
2728 	int min_write_lock_level;
2729 	int prev_cmp;
2730 
2731 	lowest_level = p->lowest_level;
2732 	WARN_ON(lowest_level && ins_len > 0);
2733 	WARN_ON(p->nodes[0] != NULL);
2734 	BUG_ON(!cow && ins_len);
2735 
2736 	if (ins_len < 0) {
2737 		lowest_unlock = 2;
2738 
2739 		/* when we are removing items, we might have to go up to level
2740 		 * two as we update tree pointers  Make sure we keep write
2741 		 * for those levels as well
2742 		 */
2743 		write_lock_level = 2;
2744 	} else if (ins_len > 0) {
2745 		/*
2746 		 * for inserting items, make sure we have a write lock on
2747 		 * level 1 so we can update keys
2748 		 */
2749 		write_lock_level = 1;
2750 	}
2751 
2752 	if (!cow)
2753 		write_lock_level = -1;
2754 
2755 	if (cow && (p->keep_locks || p->lowest_level))
2756 		write_lock_level = BTRFS_MAX_LEVEL;
2757 
2758 	min_write_lock_level = write_lock_level;
2759 
2760 again:
2761 	prev_cmp = -1;
2762 	/*
2763 	 * we try very hard to do read locks on the root
2764 	 */
2765 	root_lock = BTRFS_READ_LOCK;
2766 	level = 0;
2767 	if (p->search_commit_root) {
2768 		/*
2769 		 * the commit roots are read only
2770 		 * so we always do read locks
2771 		 */
2772 		b = root->commit_root;
2773 		extent_buffer_get(b);
2774 		level = btrfs_header_level(b);
2775 		if (!p->skip_locking)
2776 			btrfs_tree_read_lock(b);
2777 	} else {
2778 		if (p->skip_locking) {
2779 			b = btrfs_root_node(root);
2780 			level = btrfs_header_level(b);
2781 		} else {
2782 			/* we don't know the level of the root node
2783 			 * until we actually have it read locked
2784 			 */
2785 			b = btrfs_read_lock_root_node(root);
2786 			level = btrfs_header_level(b);
2787 			if (level <= write_lock_level) {
2788 				/* whoops, must trade for write lock */
2789 				btrfs_tree_read_unlock(b);
2790 				free_extent_buffer(b);
2791 				b = btrfs_lock_root_node(root);
2792 				root_lock = BTRFS_WRITE_LOCK;
2793 
2794 				/* the level might have changed, check again */
2795 				level = btrfs_header_level(b);
2796 			}
2797 		}
2798 	}
2799 	p->nodes[level] = b;
2800 	if (!p->skip_locking)
2801 		p->locks[level] = root_lock;
2802 
2803 	while (b) {
2804 		level = btrfs_header_level(b);
2805 
2806 		/*
2807 		 * setup the path here so we can release it under lock
2808 		 * contention with the cow code
2809 		 */
2810 		if (cow) {
2811 			/*
2812 			 * if we don't really need to cow this block
2813 			 * then we don't want to set the path blocking,
2814 			 * so we test it here
2815 			 */
2816 			if (!should_cow_block(trans, root, b))
2817 				goto cow_done;
2818 
2819 			btrfs_set_path_blocking(p);
2820 
2821 			/*
2822 			 * must have write locks on this node and the
2823 			 * parent
2824 			 */
2825 			if (level > write_lock_level ||
2826 			    (level + 1 > write_lock_level &&
2827 			    level + 1 < BTRFS_MAX_LEVEL &&
2828 			    p->nodes[level + 1])) {
2829 				write_lock_level = level + 1;
2830 				btrfs_release_path(p);
2831 				goto again;
2832 			}
2833 
2834 			err = btrfs_cow_block(trans, root, b,
2835 					      p->nodes[level + 1],
2836 					      p->slots[level + 1], &b);
2837 			if (err) {
2838 				ret = err;
2839 				goto done;
2840 			}
2841 		}
2842 cow_done:
2843 		p->nodes[level] = b;
2844 		btrfs_clear_path_blocking(p, NULL, 0);
2845 
2846 		/*
2847 		 * we have a lock on b and as long as we aren't changing
2848 		 * the tree, there is no way to for the items in b to change.
2849 		 * It is safe to drop the lock on our parent before we
2850 		 * go through the expensive btree search on b.
2851 		 *
2852 		 * If we're inserting or deleting (ins_len != 0), then we might
2853 		 * be changing slot zero, which may require changing the parent.
2854 		 * So, we can't drop the lock until after we know which slot
2855 		 * we're operating on.
2856 		 */
2857 		if (!ins_len && !p->keep_locks) {
2858 			int u = level + 1;
2859 
2860 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2861 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2862 				p->locks[u] = 0;
2863 			}
2864 		}
2865 
2866 		ret = key_search(b, key, level, &prev_cmp, &slot);
2867 
2868 		if (level != 0) {
2869 			int dec = 0;
2870 			if (ret && slot > 0) {
2871 				dec = 1;
2872 				slot -= 1;
2873 			}
2874 			p->slots[level] = slot;
2875 			err = setup_nodes_for_search(trans, root, p, b, level,
2876 					     ins_len, &write_lock_level);
2877 			if (err == -EAGAIN)
2878 				goto again;
2879 			if (err) {
2880 				ret = err;
2881 				goto done;
2882 			}
2883 			b = p->nodes[level];
2884 			slot = p->slots[level];
2885 
2886 			/*
2887 			 * slot 0 is special, if we change the key
2888 			 * we have to update the parent pointer
2889 			 * which means we must have a write lock
2890 			 * on the parent
2891 			 */
2892 			if (slot == 0 && ins_len &&
2893 			    write_lock_level < level + 1) {
2894 				write_lock_level = level + 1;
2895 				btrfs_release_path(p);
2896 				goto again;
2897 			}
2898 
2899 			unlock_up(p, level, lowest_unlock,
2900 				  min_write_lock_level, &write_lock_level);
2901 
2902 			if (level == lowest_level) {
2903 				if (dec)
2904 					p->slots[level]++;
2905 				goto done;
2906 			}
2907 
2908 			err = read_block_for_search(trans, root, p,
2909 						    &b, level, slot, key, 0);
2910 			if (err == -EAGAIN)
2911 				goto again;
2912 			if (err) {
2913 				ret = err;
2914 				goto done;
2915 			}
2916 
2917 			if (!p->skip_locking) {
2918 				level = btrfs_header_level(b);
2919 				if (level <= write_lock_level) {
2920 					err = btrfs_try_tree_write_lock(b);
2921 					if (!err) {
2922 						btrfs_set_path_blocking(p);
2923 						btrfs_tree_lock(b);
2924 						btrfs_clear_path_blocking(p, b,
2925 								  BTRFS_WRITE_LOCK);
2926 					}
2927 					p->locks[level] = BTRFS_WRITE_LOCK;
2928 				} else {
2929 					err = btrfs_try_tree_read_lock(b);
2930 					if (!err) {
2931 						btrfs_set_path_blocking(p);
2932 						btrfs_tree_read_lock(b);
2933 						btrfs_clear_path_blocking(p, b,
2934 								  BTRFS_READ_LOCK);
2935 					}
2936 					p->locks[level] = BTRFS_READ_LOCK;
2937 				}
2938 				p->nodes[level] = b;
2939 			}
2940 		} else {
2941 			p->slots[level] = slot;
2942 			if (ins_len > 0 &&
2943 			    btrfs_leaf_free_space(root, b) < ins_len) {
2944 				if (write_lock_level < 1) {
2945 					write_lock_level = 1;
2946 					btrfs_release_path(p);
2947 					goto again;
2948 				}
2949 
2950 				btrfs_set_path_blocking(p);
2951 				err = split_leaf(trans, root, key,
2952 						 p, ins_len, ret == 0);
2953 				btrfs_clear_path_blocking(p, NULL, 0);
2954 
2955 				BUG_ON(err > 0);
2956 				if (err) {
2957 					ret = err;
2958 					goto done;
2959 				}
2960 			}
2961 			if (!p->search_for_split)
2962 				unlock_up(p, level, lowest_unlock,
2963 					  min_write_lock_level, &write_lock_level);
2964 			goto done;
2965 		}
2966 	}
2967 	ret = 1;
2968 done:
2969 	/*
2970 	 * we don't really know what they plan on doing with the path
2971 	 * from here on, so for now just mark it as blocking
2972 	 */
2973 	if (!p->leave_spinning)
2974 		btrfs_set_path_blocking(p);
2975 	if (ret < 0)
2976 		btrfs_release_path(p);
2977 	return ret;
2978 }
2979 
2980 /*
2981  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2982  * current state of the tree together with the operations recorded in the tree
2983  * modification log to search for the key in a previous version of this tree, as
2984  * denoted by the time_seq parameter.
2985  *
2986  * Naturally, there is no support for insert, delete or cow operations.
2987  *
2988  * The resulting path and return value will be set up as if we called
2989  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2990  */
2991 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2992 			  struct btrfs_path *p, u64 time_seq)
2993 {
2994 	struct extent_buffer *b;
2995 	int slot;
2996 	int ret;
2997 	int err;
2998 	int level;
2999 	int lowest_unlock = 1;
3000 	u8 lowest_level = 0;
3001 	int prev_cmp = -1;
3002 
3003 	lowest_level = p->lowest_level;
3004 	WARN_ON(p->nodes[0] != NULL);
3005 
3006 	if (p->search_commit_root) {
3007 		BUG_ON(time_seq);
3008 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
3009 	}
3010 
3011 again:
3012 	b = get_old_root(root, time_seq);
3013 	level = btrfs_header_level(b);
3014 	p->locks[level] = BTRFS_READ_LOCK;
3015 
3016 	while (b) {
3017 		level = btrfs_header_level(b);
3018 		p->nodes[level] = b;
3019 		btrfs_clear_path_blocking(p, NULL, 0);
3020 
3021 		/*
3022 		 * we have a lock on b and as long as we aren't changing
3023 		 * the tree, there is no way to for the items in b to change.
3024 		 * It is safe to drop the lock on our parent before we
3025 		 * go through the expensive btree search on b.
3026 		 */
3027 		btrfs_unlock_up_safe(p, level + 1);
3028 
3029 		/*
3030 		 * Since we can unwind eb's we want to do a real search every
3031 		 * time.
3032 		 */
3033 		prev_cmp = -1;
3034 		ret = key_search(b, key, level, &prev_cmp, &slot);
3035 
3036 		if (level != 0) {
3037 			int dec = 0;
3038 			if (ret && slot > 0) {
3039 				dec = 1;
3040 				slot -= 1;
3041 			}
3042 			p->slots[level] = slot;
3043 			unlock_up(p, level, lowest_unlock, 0, NULL);
3044 
3045 			if (level == lowest_level) {
3046 				if (dec)
3047 					p->slots[level]++;
3048 				goto done;
3049 			}
3050 
3051 			err = read_block_for_search(NULL, root, p, &b, level,
3052 						    slot, key, time_seq);
3053 			if (err == -EAGAIN)
3054 				goto again;
3055 			if (err) {
3056 				ret = err;
3057 				goto done;
3058 			}
3059 
3060 			level = btrfs_header_level(b);
3061 			err = btrfs_try_tree_read_lock(b);
3062 			if (!err) {
3063 				btrfs_set_path_blocking(p);
3064 				btrfs_tree_read_lock(b);
3065 				btrfs_clear_path_blocking(p, b,
3066 							  BTRFS_READ_LOCK);
3067 			}
3068 			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3069 			if (!b) {
3070 				ret = -ENOMEM;
3071 				goto done;
3072 			}
3073 			p->locks[level] = BTRFS_READ_LOCK;
3074 			p->nodes[level] = b;
3075 		} else {
3076 			p->slots[level] = slot;
3077 			unlock_up(p, level, lowest_unlock, 0, NULL);
3078 			goto done;
3079 		}
3080 	}
3081 	ret = 1;
3082 done:
3083 	if (!p->leave_spinning)
3084 		btrfs_set_path_blocking(p);
3085 	if (ret < 0)
3086 		btrfs_release_path(p);
3087 
3088 	return ret;
3089 }
3090 
3091 /*
3092  * helper to use instead of search slot if no exact match is needed but
3093  * instead the next or previous item should be returned.
3094  * When find_higher is true, the next higher item is returned, the next lower
3095  * otherwise.
3096  * When return_any and find_higher are both true, and no higher item is found,
3097  * return the next lower instead.
3098  * When return_any is true and find_higher is false, and no lower item is found,
3099  * return the next higher instead.
3100  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3101  * < 0 on error
3102  */
3103 int btrfs_search_slot_for_read(struct btrfs_root *root,
3104 			       struct btrfs_key *key, struct btrfs_path *p,
3105 			       int find_higher, int return_any)
3106 {
3107 	int ret;
3108 	struct extent_buffer *leaf;
3109 
3110 again:
3111 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3112 	if (ret <= 0)
3113 		return ret;
3114 	/*
3115 	 * a return value of 1 means the path is at the position where the
3116 	 * item should be inserted. Normally this is the next bigger item,
3117 	 * but in case the previous item is the last in a leaf, path points
3118 	 * to the first free slot in the previous leaf, i.e. at an invalid
3119 	 * item.
3120 	 */
3121 	leaf = p->nodes[0];
3122 
3123 	if (find_higher) {
3124 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3125 			ret = btrfs_next_leaf(root, p);
3126 			if (ret <= 0)
3127 				return ret;
3128 			if (!return_any)
3129 				return 1;
3130 			/*
3131 			 * no higher item found, return the next
3132 			 * lower instead
3133 			 */
3134 			return_any = 0;
3135 			find_higher = 0;
3136 			btrfs_release_path(p);
3137 			goto again;
3138 		}
3139 	} else {
3140 		if (p->slots[0] == 0) {
3141 			ret = btrfs_prev_leaf(root, p);
3142 			if (ret < 0)
3143 				return ret;
3144 			if (!ret) {
3145 				leaf = p->nodes[0];
3146 				if (p->slots[0] == btrfs_header_nritems(leaf))
3147 					p->slots[0]--;
3148 				return 0;
3149 			}
3150 			if (!return_any)
3151 				return 1;
3152 			/*
3153 			 * no lower item found, return the next
3154 			 * higher instead
3155 			 */
3156 			return_any = 0;
3157 			find_higher = 1;
3158 			btrfs_release_path(p);
3159 			goto again;
3160 		} else {
3161 			--p->slots[0];
3162 		}
3163 	}
3164 	return 0;
3165 }
3166 
3167 /*
3168  * adjust the pointers going up the tree, starting at level
3169  * making sure the right key of each node is points to 'key'.
3170  * This is used after shifting pointers to the left, so it stops
3171  * fixing up pointers when a given leaf/node is not in slot 0 of the
3172  * higher levels
3173  *
3174  */
3175 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
3176 			   struct btrfs_disk_key *key, int level)
3177 {
3178 	int i;
3179 	struct extent_buffer *t;
3180 
3181 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3182 		int tslot = path->slots[i];
3183 		if (!path->nodes[i])
3184 			break;
3185 		t = path->nodes[i];
3186 		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
3187 		btrfs_set_node_key(t, key, tslot);
3188 		btrfs_mark_buffer_dirty(path->nodes[i]);
3189 		if (tslot != 0)
3190 			break;
3191 	}
3192 }
3193 
3194 /*
3195  * update item key.
3196  *
3197  * This function isn't completely safe. It's the caller's responsibility
3198  * that the new key won't break the order
3199  */
3200 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3201 			     struct btrfs_key *new_key)
3202 {
3203 	struct btrfs_disk_key disk_key;
3204 	struct extent_buffer *eb;
3205 	int slot;
3206 
3207 	eb = path->nodes[0];
3208 	slot = path->slots[0];
3209 	if (slot > 0) {
3210 		btrfs_item_key(eb, &disk_key, slot - 1);
3211 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3212 	}
3213 	if (slot < btrfs_header_nritems(eb) - 1) {
3214 		btrfs_item_key(eb, &disk_key, slot + 1);
3215 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3216 	}
3217 
3218 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3219 	btrfs_set_item_key(eb, &disk_key, slot);
3220 	btrfs_mark_buffer_dirty(eb);
3221 	if (slot == 0)
3222 		fixup_low_keys(root, path, &disk_key, 1);
3223 }
3224 
3225 /*
3226  * try to push data from one node into the next node left in the
3227  * tree.
3228  *
3229  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3230  * error, and > 0 if there was no room in the left hand block.
3231  */
3232 static int push_node_left(struct btrfs_trans_handle *trans,
3233 			  struct btrfs_root *root, struct extent_buffer *dst,
3234 			  struct extent_buffer *src, int empty)
3235 {
3236 	int push_items = 0;
3237 	int src_nritems;
3238 	int dst_nritems;
3239 	int ret = 0;
3240 
3241 	src_nritems = btrfs_header_nritems(src);
3242 	dst_nritems = btrfs_header_nritems(dst);
3243 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3244 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3245 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3246 
3247 	if (!empty && src_nritems <= 8)
3248 		return 1;
3249 
3250 	if (push_items <= 0)
3251 		return 1;
3252 
3253 	if (empty) {
3254 		push_items = min(src_nritems, push_items);
3255 		if (push_items < src_nritems) {
3256 			/* leave at least 8 pointers in the node if
3257 			 * we aren't going to empty it
3258 			 */
3259 			if (src_nritems - push_items < 8) {
3260 				if (push_items <= 8)
3261 					return 1;
3262 				push_items -= 8;
3263 			}
3264 		}
3265 	} else
3266 		push_items = min(src_nritems - 8, push_items);
3267 
3268 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3269 				   push_items);
3270 	if (ret) {
3271 		btrfs_abort_transaction(trans, root, ret);
3272 		return ret;
3273 	}
3274 	copy_extent_buffer(dst, src,
3275 			   btrfs_node_key_ptr_offset(dst_nritems),
3276 			   btrfs_node_key_ptr_offset(0),
3277 			   push_items * sizeof(struct btrfs_key_ptr));
3278 
3279 	if (push_items < src_nritems) {
3280 		/*
3281 		 * don't call tree_mod_log_eb_move here, key removal was already
3282 		 * fully logged by tree_mod_log_eb_copy above.
3283 		 */
3284 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3285 				      btrfs_node_key_ptr_offset(push_items),
3286 				      (src_nritems - push_items) *
3287 				      sizeof(struct btrfs_key_ptr));
3288 	}
3289 	btrfs_set_header_nritems(src, src_nritems - push_items);
3290 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3291 	btrfs_mark_buffer_dirty(src);
3292 	btrfs_mark_buffer_dirty(dst);
3293 
3294 	return ret;
3295 }
3296 
3297 /*
3298  * try to push data from one node into the next node right in the
3299  * tree.
3300  *
3301  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3302  * error, and > 0 if there was no room in the right hand block.
3303  *
3304  * this will  only push up to 1/2 the contents of the left node over
3305  */
3306 static int balance_node_right(struct btrfs_trans_handle *trans,
3307 			      struct btrfs_root *root,
3308 			      struct extent_buffer *dst,
3309 			      struct extent_buffer *src)
3310 {
3311 	int push_items = 0;
3312 	int max_push;
3313 	int src_nritems;
3314 	int dst_nritems;
3315 	int ret = 0;
3316 
3317 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3318 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3319 
3320 	src_nritems = btrfs_header_nritems(src);
3321 	dst_nritems = btrfs_header_nritems(dst);
3322 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3323 	if (push_items <= 0)
3324 		return 1;
3325 
3326 	if (src_nritems < 4)
3327 		return 1;
3328 
3329 	max_push = src_nritems / 2 + 1;
3330 	/* don't try to empty the node */
3331 	if (max_push >= src_nritems)
3332 		return 1;
3333 
3334 	if (max_push < push_items)
3335 		push_items = max_push;
3336 
3337 	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3338 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3339 				      btrfs_node_key_ptr_offset(0),
3340 				      (dst_nritems) *
3341 				      sizeof(struct btrfs_key_ptr));
3342 
3343 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3344 				   src_nritems - push_items, push_items);
3345 	if (ret) {
3346 		btrfs_abort_transaction(trans, root, ret);
3347 		return ret;
3348 	}
3349 	copy_extent_buffer(dst, src,
3350 			   btrfs_node_key_ptr_offset(0),
3351 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3352 			   push_items * sizeof(struct btrfs_key_ptr));
3353 
3354 	btrfs_set_header_nritems(src, src_nritems - push_items);
3355 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3356 
3357 	btrfs_mark_buffer_dirty(src);
3358 	btrfs_mark_buffer_dirty(dst);
3359 
3360 	return ret;
3361 }
3362 
3363 /*
3364  * helper function to insert a new root level in the tree.
3365  * A new node is allocated, and a single item is inserted to
3366  * point to the existing root
3367  *
3368  * returns zero on success or < 0 on failure.
3369  */
3370 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3371 			   struct btrfs_root *root,
3372 			   struct btrfs_path *path, int level)
3373 {
3374 	u64 lower_gen;
3375 	struct extent_buffer *lower;
3376 	struct extent_buffer *c;
3377 	struct extent_buffer *old;
3378 	struct btrfs_disk_key lower_key;
3379 
3380 	BUG_ON(path->nodes[level]);
3381 	BUG_ON(path->nodes[level-1] != root->node);
3382 
3383 	lower = path->nodes[level-1];
3384 	if (level == 1)
3385 		btrfs_item_key(lower, &lower_key, 0);
3386 	else
3387 		btrfs_node_key(lower, &lower_key, 0);
3388 
3389 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3390 				   root->root_key.objectid, &lower_key,
3391 				   level, root->node->start, 0);
3392 	if (IS_ERR(c))
3393 		return PTR_ERR(c);
3394 
3395 	root_add_used(root, root->nodesize);
3396 
3397 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3398 	btrfs_set_header_nritems(c, 1);
3399 	btrfs_set_header_level(c, level);
3400 	btrfs_set_header_bytenr(c, c->start);
3401 	btrfs_set_header_generation(c, trans->transid);
3402 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3403 	btrfs_set_header_owner(c, root->root_key.objectid);
3404 
3405 	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3406 			    BTRFS_FSID_SIZE);
3407 
3408 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3409 			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3410 
3411 	btrfs_set_node_key(c, &lower_key, 0);
3412 	btrfs_set_node_blockptr(c, 0, lower->start);
3413 	lower_gen = btrfs_header_generation(lower);
3414 	WARN_ON(lower_gen != trans->transid);
3415 
3416 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3417 
3418 	btrfs_mark_buffer_dirty(c);
3419 
3420 	old = root->node;
3421 	tree_mod_log_set_root_pointer(root, c, 0);
3422 	rcu_assign_pointer(root->node, c);
3423 
3424 	/* the super has an extra ref to root->node */
3425 	free_extent_buffer(old);
3426 
3427 	add_root_to_dirty_list(root);
3428 	extent_buffer_get(c);
3429 	path->nodes[level] = c;
3430 	path->locks[level] = BTRFS_WRITE_LOCK;
3431 	path->slots[level] = 0;
3432 	return 0;
3433 }
3434 
3435 /*
3436  * worker function to insert a single pointer in a node.
3437  * the node should have enough room for the pointer already
3438  *
3439  * slot and level indicate where you want the key to go, and
3440  * blocknr is the block the key points to.
3441  */
3442 static void insert_ptr(struct btrfs_trans_handle *trans,
3443 		       struct btrfs_root *root, struct btrfs_path *path,
3444 		       struct btrfs_disk_key *key, u64 bytenr,
3445 		       int slot, int level)
3446 {
3447 	struct extent_buffer *lower;
3448 	int nritems;
3449 	int ret;
3450 
3451 	BUG_ON(!path->nodes[level]);
3452 	btrfs_assert_tree_locked(path->nodes[level]);
3453 	lower = path->nodes[level];
3454 	nritems = btrfs_header_nritems(lower);
3455 	BUG_ON(slot > nritems);
3456 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3457 	if (slot != nritems) {
3458 		if (level)
3459 			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3460 					     slot, nritems - slot);
3461 		memmove_extent_buffer(lower,
3462 			      btrfs_node_key_ptr_offset(slot + 1),
3463 			      btrfs_node_key_ptr_offset(slot),
3464 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3465 	}
3466 	if (level) {
3467 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3468 					      MOD_LOG_KEY_ADD, GFP_NOFS);
3469 		BUG_ON(ret < 0);
3470 	}
3471 	btrfs_set_node_key(lower, key, slot);
3472 	btrfs_set_node_blockptr(lower, slot, bytenr);
3473 	WARN_ON(trans->transid == 0);
3474 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3475 	btrfs_set_header_nritems(lower, nritems + 1);
3476 	btrfs_mark_buffer_dirty(lower);
3477 }
3478 
3479 /*
3480  * split the node at the specified level in path in two.
3481  * The path is corrected to point to the appropriate node after the split
3482  *
3483  * Before splitting this tries to make some room in the node by pushing
3484  * left and right, if either one works, it returns right away.
3485  *
3486  * returns 0 on success and < 0 on failure
3487  */
3488 static noinline int split_node(struct btrfs_trans_handle *trans,
3489 			       struct btrfs_root *root,
3490 			       struct btrfs_path *path, int level)
3491 {
3492 	struct extent_buffer *c;
3493 	struct extent_buffer *split;
3494 	struct btrfs_disk_key disk_key;
3495 	int mid;
3496 	int ret;
3497 	u32 c_nritems;
3498 
3499 	c = path->nodes[level];
3500 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3501 	if (c == root->node) {
3502 		/*
3503 		 * trying to split the root, lets make a new one
3504 		 *
3505 		 * tree mod log: We don't log_removal old root in
3506 		 * insert_new_root, because that root buffer will be kept as a
3507 		 * normal node. We are going to log removal of half of the
3508 		 * elements below with tree_mod_log_eb_copy. We're holding a
3509 		 * tree lock on the buffer, which is why we cannot race with
3510 		 * other tree_mod_log users.
3511 		 */
3512 		ret = insert_new_root(trans, root, path, level + 1);
3513 		if (ret)
3514 			return ret;
3515 	} else {
3516 		ret = push_nodes_for_insert(trans, root, path, level);
3517 		c = path->nodes[level];
3518 		if (!ret && btrfs_header_nritems(c) <
3519 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3520 			return 0;
3521 		if (ret < 0)
3522 			return ret;
3523 	}
3524 
3525 	c_nritems = btrfs_header_nritems(c);
3526 	mid = (c_nritems + 1) / 2;
3527 	btrfs_node_key(c, &disk_key, mid);
3528 
3529 	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3530 					root->root_key.objectid,
3531 					&disk_key, level, c->start, 0);
3532 	if (IS_ERR(split))
3533 		return PTR_ERR(split);
3534 
3535 	root_add_used(root, root->nodesize);
3536 
3537 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3538 	btrfs_set_header_level(split, btrfs_header_level(c));
3539 	btrfs_set_header_bytenr(split, split->start);
3540 	btrfs_set_header_generation(split, trans->transid);
3541 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3542 	btrfs_set_header_owner(split, root->root_key.objectid);
3543 	write_extent_buffer(split, root->fs_info->fsid,
3544 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
3545 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3546 			    btrfs_header_chunk_tree_uuid(split),
3547 			    BTRFS_UUID_SIZE);
3548 
3549 	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3550 				   mid, c_nritems - mid);
3551 	if (ret) {
3552 		btrfs_abort_transaction(trans, root, ret);
3553 		return ret;
3554 	}
3555 	copy_extent_buffer(split, c,
3556 			   btrfs_node_key_ptr_offset(0),
3557 			   btrfs_node_key_ptr_offset(mid),
3558 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3559 	btrfs_set_header_nritems(split, c_nritems - mid);
3560 	btrfs_set_header_nritems(c, mid);
3561 	ret = 0;
3562 
3563 	btrfs_mark_buffer_dirty(c);
3564 	btrfs_mark_buffer_dirty(split);
3565 
3566 	insert_ptr(trans, root, path, &disk_key, split->start,
3567 		   path->slots[level + 1] + 1, level + 1);
3568 
3569 	if (path->slots[level] >= mid) {
3570 		path->slots[level] -= mid;
3571 		btrfs_tree_unlock(c);
3572 		free_extent_buffer(c);
3573 		path->nodes[level] = split;
3574 		path->slots[level + 1] += 1;
3575 	} else {
3576 		btrfs_tree_unlock(split);
3577 		free_extent_buffer(split);
3578 	}
3579 	return ret;
3580 }
3581 
3582 /*
3583  * how many bytes are required to store the items in a leaf.  start
3584  * and nr indicate which items in the leaf to check.  This totals up the
3585  * space used both by the item structs and the item data
3586  */
3587 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3588 {
3589 	struct btrfs_item *start_item;
3590 	struct btrfs_item *end_item;
3591 	struct btrfs_map_token token;
3592 	int data_len;
3593 	int nritems = btrfs_header_nritems(l);
3594 	int end = min(nritems, start + nr) - 1;
3595 
3596 	if (!nr)
3597 		return 0;
3598 	btrfs_init_map_token(&token);
3599 	start_item = btrfs_item_nr(start);
3600 	end_item = btrfs_item_nr(end);
3601 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3602 		btrfs_token_item_size(l, start_item, &token);
3603 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3604 	data_len += sizeof(struct btrfs_item) * nr;
3605 	WARN_ON(data_len < 0);
3606 	return data_len;
3607 }
3608 
3609 /*
3610  * The space between the end of the leaf items and
3611  * the start of the leaf data.  IOW, how much room
3612  * the leaf has left for both items and data
3613  */
3614 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3615 				   struct extent_buffer *leaf)
3616 {
3617 	int nritems = btrfs_header_nritems(leaf);
3618 	int ret;
3619 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3620 	if (ret < 0) {
3621 		btrfs_crit(root->fs_info,
3622 			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3623 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3624 		       leaf_space_used(leaf, 0, nritems), nritems);
3625 	}
3626 	return ret;
3627 }
3628 
3629 /*
3630  * min slot controls the lowest index we're willing to push to the
3631  * right.  We'll push up to and including min_slot, but no lower
3632  */
3633 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3634 				      struct btrfs_root *root,
3635 				      struct btrfs_path *path,
3636 				      int data_size, int empty,
3637 				      struct extent_buffer *right,
3638 				      int free_space, u32 left_nritems,
3639 				      u32 min_slot)
3640 {
3641 	struct extent_buffer *left = path->nodes[0];
3642 	struct extent_buffer *upper = path->nodes[1];
3643 	struct btrfs_map_token token;
3644 	struct btrfs_disk_key disk_key;
3645 	int slot;
3646 	u32 i;
3647 	int push_space = 0;
3648 	int push_items = 0;
3649 	struct btrfs_item *item;
3650 	u32 nr;
3651 	u32 right_nritems;
3652 	u32 data_end;
3653 	u32 this_item_size;
3654 
3655 	btrfs_init_map_token(&token);
3656 
3657 	if (empty)
3658 		nr = 0;
3659 	else
3660 		nr = max_t(u32, 1, min_slot);
3661 
3662 	if (path->slots[0] >= left_nritems)
3663 		push_space += data_size;
3664 
3665 	slot = path->slots[1];
3666 	i = left_nritems - 1;
3667 	while (i >= nr) {
3668 		item = btrfs_item_nr(i);
3669 
3670 		if (!empty && push_items > 0) {
3671 			if (path->slots[0] > i)
3672 				break;
3673 			if (path->slots[0] == i) {
3674 				int space = btrfs_leaf_free_space(root, left);
3675 				if (space + push_space * 2 > free_space)
3676 					break;
3677 			}
3678 		}
3679 
3680 		if (path->slots[0] == i)
3681 			push_space += data_size;
3682 
3683 		this_item_size = btrfs_item_size(left, item);
3684 		if (this_item_size + sizeof(*item) + push_space > free_space)
3685 			break;
3686 
3687 		push_items++;
3688 		push_space += this_item_size + sizeof(*item);
3689 		if (i == 0)
3690 			break;
3691 		i--;
3692 	}
3693 
3694 	if (push_items == 0)
3695 		goto out_unlock;
3696 
3697 	WARN_ON(!empty && push_items == left_nritems);
3698 
3699 	/* push left to right */
3700 	right_nritems = btrfs_header_nritems(right);
3701 
3702 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3703 	push_space -= leaf_data_end(root, left);
3704 
3705 	/* make room in the right data area */
3706 	data_end = leaf_data_end(root, right);
3707 	memmove_extent_buffer(right,
3708 			      btrfs_leaf_data(right) + data_end - push_space,
3709 			      btrfs_leaf_data(right) + data_end,
3710 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
3711 
3712 	/* copy from the left data area */
3713 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3714 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3715 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
3716 		     push_space);
3717 
3718 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3719 			      btrfs_item_nr_offset(0),
3720 			      right_nritems * sizeof(struct btrfs_item));
3721 
3722 	/* copy the items from left to right */
3723 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3724 		   btrfs_item_nr_offset(left_nritems - push_items),
3725 		   push_items * sizeof(struct btrfs_item));
3726 
3727 	/* update the item pointers */
3728 	right_nritems += push_items;
3729 	btrfs_set_header_nritems(right, right_nritems);
3730 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3731 	for (i = 0; i < right_nritems; i++) {
3732 		item = btrfs_item_nr(i);
3733 		push_space -= btrfs_token_item_size(right, item, &token);
3734 		btrfs_set_token_item_offset(right, item, push_space, &token);
3735 	}
3736 
3737 	left_nritems -= push_items;
3738 	btrfs_set_header_nritems(left, left_nritems);
3739 
3740 	if (left_nritems)
3741 		btrfs_mark_buffer_dirty(left);
3742 	else
3743 		clean_tree_block(trans, root, left);
3744 
3745 	btrfs_mark_buffer_dirty(right);
3746 
3747 	btrfs_item_key(right, &disk_key, 0);
3748 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3749 	btrfs_mark_buffer_dirty(upper);
3750 
3751 	/* then fixup the leaf pointer in the path */
3752 	if (path->slots[0] >= left_nritems) {
3753 		path->slots[0] -= left_nritems;
3754 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3755 			clean_tree_block(trans, root, path->nodes[0]);
3756 		btrfs_tree_unlock(path->nodes[0]);
3757 		free_extent_buffer(path->nodes[0]);
3758 		path->nodes[0] = right;
3759 		path->slots[1] += 1;
3760 	} else {
3761 		btrfs_tree_unlock(right);
3762 		free_extent_buffer(right);
3763 	}
3764 	return 0;
3765 
3766 out_unlock:
3767 	btrfs_tree_unlock(right);
3768 	free_extent_buffer(right);
3769 	return 1;
3770 }
3771 
3772 /*
3773  * push some data in the path leaf to the right, trying to free up at
3774  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3775  *
3776  * returns 1 if the push failed because the other node didn't have enough
3777  * room, 0 if everything worked out and < 0 if there were major errors.
3778  *
3779  * this will push starting from min_slot to the end of the leaf.  It won't
3780  * push any slot lower than min_slot
3781  */
3782 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3783 			   *root, struct btrfs_path *path,
3784 			   int min_data_size, int data_size,
3785 			   int empty, u32 min_slot)
3786 {
3787 	struct extent_buffer *left = path->nodes[0];
3788 	struct extent_buffer *right;
3789 	struct extent_buffer *upper;
3790 	int slot;
3791 	int free_space;
3792 	u32 left_nritems;
3793 	int ret;
3794 
3795 	if (!path->nodes[1])
3796 		return 1;
3797 
3798 	slot = path->slots[1];
3799 	upper = path->nodes[1];
3800 	if (slot >= btrfs_header_nritems(upper) - 1)
3801 		return 1;
3802 
3803 	btrfs_assert_tree_locked(path->nodes[1]);
3804 
3805 	right = read_node_slot(root, upper, slot + 1);
3806 	if (right == NULL)
3807 		return 1;
3808 
3809 	btrfs_tree_lock(right);
3810 	btrfs_set_lock_blocking(right);
3811 
3812 	free_space = btrfs_leaf_free_space(root, right);
3813 	if (free_space < data_size)
3814 		goto out_unlock;
3815 
3816 	/* cow and double check */
3817 	ret = btrfs_cow_block(trans, root, right, upper,
3818 			      slot + 1, &right);
3819 	if (ret)
3820 		goto out_unlock;
3821 
3822 	free_space = btrfs_leaf_free_space(root, right);
3823 	if (free_space < data_size)
3824 		goto out_unlock;
3825 
3826 	left_nritems = btrfs_header_nritems(left);
3827 	if (left_nritems == 0)
3828 		goto out_unlock;
3829 
3830 	if (path->slots[0] == left_nritems && !empty) {
3831 		/* Key greater than all keys in the leaf, right neighbor has
3832 		 * enough room for it and we're not emptying our leaf to delete
3833 		 * it, therefore use right neighbor to insert the new item and
3834 		 * no need to touch/dirty our left leaft. */
3835 		btrfs_tree_unlock(left);
3836 		free_extent_buffer(left);
3837 		path->nodes[0] = right;
3838 		path->slots[0] = 0;
3839 		path->slots[1]++;
3840 		return 0;
3841 	}
3842 
3843 	return __push_leaf_right(trans, root, path, min_data_size, empty,
3844 				right, free_space, left_nritems, min_slot);
3845 out_unlock:
3846 	btrfs_tree_unlock(right);
3847 	free_extent_buffer(right);
3848 	return 1;
3849 }
3850 
3851 /*
3852  * push some data in the path leaf to the left, trying to free up at
3853  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3854  *
3855  * max_slot can put a limit on how far into the leaf we'll push items.  The
3856  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3857  * items
3858  */
3859 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3860 				     struct btrfs_root *root,
3861 				     struct btrfs_path *path, int data_size,
3862 				     int empty, struct extent_buffer *left,
3863 				     int free_space, u32 right_nritems,
3864 				     u32 max_slot)
3865 {
3866 	struct btrfs_disk_key disk_key;
3867 	struct extent_buffer *right = path->nodes[0];
3868 	int i;
3869 	int push_space = 0;
3870 	int push_items = 0;
3871 	struct btrfs_item *item;
3872 	u32 old_left_nritems;
3873 	u32 nr;
3874 	int ret = 0;
3875 	u32 this_item_size;
3876 	u32 old_left_item_size;
3877 	struct btrfs_map_token token;
3878 
3879 	btrfs_init_map_token(&token);
3880 
3881 	if (empty)
3882 		nr = min(right_nritems, max_slot);
3883 	else
3884 		nr = min(right_nritems - 1, max_slot);
3885 
3886 	for (i = 0; i < nr; i++) {
3887 		item = btrfs_item_nr(i);
3888 
3889 		if (!empty && push_items > 0) {
3890 			if (path->slots[0] < i)
3891 				break;
3892 			if (path->slots[0] == i) {
3893 				int space = btrfs_leaf_free_space(root, right);
3894 				if (space + push_space * 2 > free_space)
3895 					break;
3896 			}
3897 		}
3898 
3899 		if (path->slots[0] == i)
3900 			push_space += data_size;
3901 
3902 		this_item_size = btrfs_item_size(right, item);
3903 		if (this_item_size + sizeof(*item) + push_space > free_space)
3904 			break;
3905 
3906 		push_items++;
3907 		push_space += this_item_size + sizeof(*item);
3908 	}
3909 
3910 	if (push_items == 0) {
3911 		ret = 1;
3912 		goto out;
3913 	}
3914 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3915 
3916 	/* push data from right to left */
3917 	copy_extent_buffer(left, right,
3918 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3919 			   btrfs_item_nr_offset(0),
3920 			   push_items * sizeof(struct btrfs_item));
3921 
3922 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
3923 		     btrfs_item_offset_nr(right, push_items - 1);
3924 
3925 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3926 		     leaf_data_end(root, left) - push_space,
3927 		     btrfs_leaf_data(right) +
3928 		     btrfs_item_offset_nr(right, push_items - 1),
3929 		     push_space);
3930 	old_left_nritems = btrfs_header_nritems(left);
3931 	BUG_ON(old_left_nritems <= 0);
3932 
3933 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3934 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3935 		u32 ioff;
3936 
3937 		item = btrfs_item_nr(i);
3938 
3939 		ioff = btrfs_token_item_offset(left, item, &token);
3940 		btrfs_set_token_item_offset(left, item,
3941 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3942 		      &token);
3943 	}
3944 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3945 
3946 	/* fixup right node */
3947 	if (push_items > right_nritems)
3948 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3949 		       right_nritems);
3950 
3951 	if (push_items < right_nritems) {
3952 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3953 						  leaf_data_end(root, right);
3954 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
3955 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3956 				      btrfs_leaf_data(right) +
3957 				      leaf_data_end(root, right), push_space);
3958 
3959 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3960 			      btrfs_item_nr_offset(push_items),
3961 			     (btrfs_header_nritems(right) - push_items) *
3962 			     sizeof(struct btrfs_item));
3963 	}
3964 	right_nritems -= push_items;
3965 	btrfs_set_header_nritems(right, right_nritems);
3966 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3967 	for (i = 0; i < right_nritems; i++) {
3968 		item = btrfs_item_nr(i);
3969 
3970 		push_space = push_space - btrfs_token_item_size(right,
3971 								item, &token);
3972 		btrfs_set_token_item_offset(right, item, push_space, &token);
3973 	}
3974 
3975 	btrfs_mark_buffer_dirty(left);
3976 	if (right_nritems)
3977 		btrfs_mark_buffer_dirty(right);
3978 	else
3979 		clean_tree_block(trans, root, right);
3980 
3981 	btrfs_item_key(right, &disk_key, 0);
3982 	fixup_low_keys(root, path, &disk_key, 1);
3983 
3984 	/* then fixup the leaf pointer in the path */
3985 	if (path->slots[0] < push_items) {
3986 		path->slots[0] += old_left_nritems;
3987 		btrfs_tree_unlock(path->nodes[0]);
3988 		free_extent_buffer(path->nodes[0]);
3989 		path->nodes[0] = left;
3990 		path->slots[1] -= 1;
3991 	} else {
3992 		btrfs_tree_unlock(left);
3993 		free_extent_buffer(left);
3994 		path->slots[0] -= push_items;
3995 	}
3996 	BUG_ON(path->slots[0] < 0);
3997 	return ret;
3998 out:
3999 	btrfs_tree_unlock(left);
4000 	free_extent_buffer(left);
4001 	return ret;
4002 }
4003 
4004 /*
4005  * push some data in the path leaf to the left, trying to free up at
4006  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4007  *
4008  * max_slot can put a limit on how far into the leaf we'll push items.  The
4009  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4010  * items
4011  */
4012 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4013 			  *root, struct btrfs_path *path, int min_data_size,
4014 			  int data_size, int empty, u32 max_slot)
4015 {
4016 	struct extent_buffer *right = path->nodes[0];
4017 	struct extent_buffer *left;
4018 	int slot;
4019 	int free_space;
4020 	u32 right_nritems;
4021 	int ret = 0;
4022 
4023 	slot = path->slots[1];
4024 	if (slot == 0)
4025 		return 1;
4026 	if (!path->nodes[1])
4027 		return 1;
4028 
4029 	right_nritems = btrfs_header_nritems(right);
4030 	if (right_nritems == 0)
4031 		return 1;
4032 
4033 	btrfs_assert_tree_locked(path->nodes[1]);
4034 
4035 	left = read_node_slot(root, path->nodes[1], slot - 1);
4036 	if (left == NULL)
4037 		return 1;
4038 
4039 	btrfs_tree_lock(left);
4040 	btrfs_set_lock_blocking(left);
4041 
4042 	free_space = btrfs_leaf_free_space(root, left);
4043 	if (free_space < data_size) {
4044 		ret = 1;
4045 		goto out;
4046 	}
4047 
4048 	/* cow and double check */
4049 	ret = btrfs_cow_block(trans, root, left,
4050 			      path->nodes[1], slot - 1, &left);
4051 	if (ret) {
4052 		/* we hit -ENOSPC, but it isn't fatal here */
4053 		if (ret == -ENOSPC)
4054 			ret = 1;
4055 		goto out;
4056 	}
4057 
4058 	free_space = btrfs_leaf_free_space(root, left);
4059 	if (free_space < data_size) {
4060 		ret = 1;
4061 		goto out;
4062 	}
4063 
4064 	return __push_leaf_left(trans, root, path, min_data_size,
4065 			       empty, left, free_space, right_nritems,
4066 			       max_slot);
4067 out:
4068 	btrfs_tree_unlock(left);
4069 	free_extent_buffer(left);
4070 	return ret;
4071 }
4072 
4073 /*
4074  * split the path's leaf in two, making sure there is at least data_size
4075  * available for the resulting leaf level of the path.
4076  */
4077 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4078 				    struct btrfs_root *root,
4079 				    struct btrfs_path *path,
4080 				    struct extent_buffer *l,
4081 				    struct extent_buffer *right,
4082 				    int slot, int mid, int nritems)
4083 {
4084 	int data_copy_size;
4085 	int rt_data_off;
4086 	int i;
4087 	struct btrfs_disk_key disk_key;
4088 	struct btrfs_map_token token;
4089 
4090 	btrfs_init_map_token(&token);
4091 
4092 	nritems = nritems - mid;
4093 	btrfs_set_header_nritems(right, nritems);
4094 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4095 
4096 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4097 			   btrfs_item_nr_offset(mid),
4098 			   nritems * sizeof(struct btrfs_item));
4099 
4100 	copy_extent_buffer(right, l,
4101 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4102 		     data_copy_size, btrfs_leaf_data(l) +
4103 		     leaf_data_end(root, l), data_copy_size);
4104 
4105 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4106 		      btrfs_item_end_nr(l, mid);
4107 
4108 	for (i = 0; i < nritems; i++) {
4109 		struct btrfs_item *item = btrfs_item_nr(i);
4110 		u32 ioff;
4111 
4112 		ioff = btrfs_token_item_offset(right, item, &token);
4113 		btrfs_set_token_item_offset(right, item,
4114 					    ioff + rt_data_off, &token);
4115 	}
4116 
4117 	btrfs_set_header_nritems(l, mid);
4118 	btrfs_item_key(right, &disk_key, 0);
4119 	insert_ptr(trans, root, path, &disk_key, right->start,
4120 		   path->slots[1] + 1, 1);
4121 
4122 	btrfs_mark_buffer_dirty(right);
4123 	btrfs_mark_buffer_dirty(l);
4124 	BUG_ON(path->slots[0] != slot);
4125 
4126 	if (mid <= slot) {
4127 		btrfs_tree_unlock(path->nodes[0]);
4128 		free_extent_buffer(path->nodes[0]);
4129 		path->nodes[0] = right;
4130 		path->slots[0] -= mid;
4131 		path->slots[1] += 1;
4132 	} else {
4133 		btrfs_tree_unlock(right);
4134 		free_extent_buffer(right);
4135 	}
4136 
4137 	BUG_ON(path->slots[0] < 0);
4138 }
4139 
4140 /*
4141  * double splits happen when we need to insert a big item in the middle
4142  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4143  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4144  *          A                 B                 C
4145  *
4146  * We avoid this by trying to push the items on either side of our target
4147  * into the adjacent leaves.  If all goes well we can avoid the double split
4148  * completely.
4149  */
4150 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4151 					  struct btrfs_root *root,
4152 					  struct btrfs_path *path,
4153 					  int data_size)
4154 {
4155 	int ret;
4156 	int progress = 0;
4157 	int slot;
4158 	u32 nritems;
4159 	int space_needed = data_size;
4160 
4161 	slot = path->slots[0];
4162 	if (slot < btrfs_header_nritems(path->nodes[0]))
4163 		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4164 
4165 	/*
4166 	 * try to push all the items after our slot into the
4167 	 * right leaf
4168 	 */
4169 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4170 	if (ret < 0)
4171 		return ret;
4172 
4173 	if (ret == 0)
4174 		progress++;
4175 
4176 	nritems = btrfs_header_nritems(path->nodes[0]);
4177 	/*
4178 	 * our goal is to get our slot at the start or end of a leaf.  If
4179 	 * we've done so we're done
4180 	 */
4181 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4182 		return 0;
4183 
4184 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4185 		return 0;
4186 
4187 	/* try to push all the items before our slot into the next leaf */
4188 	slot = path->slots[0];
4189 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4190 	if (ret < 0)
4191 		return ret;
4192 
4193 	if (ret == 0)
4194 		progress++;
4195 
4196 	if (progress)
4197 		return 0;
4198 	return 1;
4199 }
4200 
4201 /*
4202  * split the path's leaf in two, making sure there is at least data_size
4203  * available for the resulting leaf level of the path.
4204  *
4205  * returns 0 if all went well and < 0 on failure.
4206  */
4207 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4208 			       struct btrfs_root *root,
4209 			       struct btrfs_key *ins_key,
4210 			       struct btrfs_path *path, int data_size,
4211 			       int extend)
4212 {
4213 	struct btrfs_disk_key disk_key;
4214 	struct extent_buffer *l;
4215 	u32 nritems;
4216 	int mid;
4217 	int slot;
4218 	struct extent_buffer *right;
4219 	int ret = 0;
4220 	int wret;
4221 	int split;
4222 	int num_doubles = 0;
4223 	int tried_avoid_double = 0;
4224 
4225 	l = path->nodes[0];
4226 	slot = path->slots[0];
4227 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4228 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4229 		return -EOVERFLOW;
4230 
4231 	/* first try to make some room by pushing left and right */
4232 	if (data_size && path->nodes[1]) {
4233 		int space_needed = data_size;
4234 
4235 		if (slot < btrfs_header_nritems(l))
4236 			space_needed -= btrfs_leaf_free_space(root, l);
4237 
4238 		wret = push_leaf_right(trans, root, path, space_needed,
4239 				       space_needed, 0, 0);
4240 		if (wret < 0)
4241 			return wret;
4242 		if (wret) {
4243 			wret = push_leaf_left(trans, root, path, space_needed,
4244 					      space_needed, 0, (u32)-1);
4245 			if (wret < 0)
4246 				return wret;
4247 		}
4248 		l = path->nodes[0];
4249 
4250 		/* did the pushes work? */
4251 		if (btrfs_leaf_free_space(root, l) >= data_size)
4252 			return 0;
4253 	}
4254 
4255 	if (!path->nodes[1]) {
4256 		ret = insert_new_root(trans, root, path, 1);
4257 		if (ret)
4258 			return ret;
4259 	}
4260 again:
4261 	split = 1;
4262 	l = path->nodes[0];
4263 	slot = path->slots[0];
4264 	nritems = btrfs_header_nritems(l);
4265 	mid = (nritems + 1) / 2;
4266 
4267 	if (mid <= slot) {
4268 		if (nritems == 1 ||
4269 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4270 			BTRFS_LEAF_DATA_SIZE(root)) {
4271 			if (slot >= nritems) {
4272 				split = 0;
4273 			} else {
4274 				mid = slot;
4275 				if (mid != nritems &&
4276 				    leaf_space_used(l, mid, nritems - mid) +
4277 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4278 					if (data_size && !tried_avoid_double)
4279 						goto push_for_double;
4280 					split = 2;
4281 				}
4282 			}
4283 		}
4284 	} else {
4285 		if (leaf_space_used(l, 0, mid) + data_size >
4286 			BTRFS_LEAF_DATA_SIZE(root)) {
4287 			if (!extend && data_size && slot == 0) {
4288 				split = 0;
4289 			} else if ((extend || !data_size) && slot == 0) {
4290 				mid = 1;
4291 			} else {
4292 				mid = slot;
4293 				if (mid != nritems &&
4294 				    leaf_space_used(l, mid, nritems - mid) +
4295 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4296 					if (data_size && !tried_avoid_double)
4297 						goto push_for_double;
4298 					split = 2;
4299 				}
4300 			}
4301 		}
4302 	}
4303 
4304 	if (split == 0)
4305 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4306 	else
4307 		btrfs_item_key(l, &disk_key, mid);
4308 
4309 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
4310 					root->root_key.objectid,
4311 					&disk_key, 0, l->start, 0);
4312 	if (IS_ERR(right))
4313 		return PTR_ERR(right);
4314 
4315 	root_add_used(root, root->leafsize);
4316 
4317 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4318 	btrfs_set_header_bytenr(right, right->start);
4319 	btrfs_set_header_generation(right, trans->transid);
4320 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4321 	btrfs_set_header_owner(right, root->root_key.objectid);
4322 	btrfs_set_header_level(right, 0);
4323 	write_extent_buffer(right, root->fs_info->fsid,
4324 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
4325 
4326 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4327 			    btrfs_header_chunk_tree_uuid(right),
4328 			    BTRFS_UUID_SIZE);
4329 
4330 	if (split == 0) {
4331 		if (mid <= slot) {
4332 			btrfs_set_header_nritems(right, 0);
4333 			insert_ptr(trans, root, path, &disk_key, right->start,
4334 				   path->slots[1] + 1, 1);
4335 			btrfs_tree_unlock(path->nodes[0]);
4336 			free_extent_buffer(path->nodes[0]);
4337 			path->nodes[0] = right;
4338 			path->slots[0] = 0;
4339 			path->slots[1] += 1;
4340 		} else {
4341 			btrfs_set_header_nritems(right, 0);
4342 			insert_ptr(trans, root, path, &disk_key, right->start,
4343 					  path->slots[1], 1);
4344 			btrfs_tree_unlock(path->nodes[0]);
4345 			free_extent_buffer(path->nodes[0]);
4346 			path->nodes[0] = right;
4347 			path->slots[0] = 0;
4348 			if (path->slots[1] == 0)
4349 				fixup_low_keys(root, path, &disk_key, 1);
4350 		}
4351 		btrfs_mark_buffer_dirty(right);
4352 		return ret;
4353 	}
4354 
4355 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4356 
4357 	if (split == 2) {
4358 		BUG_ON(num_doubles != 0);
4359 		num_doubles++;
4360 		goto again;
4361 	}
4362 
4363 	return 0;
4364 
4365 push_for_double:
4366 	push_for_double_split(trans, root, path, data_size);
4367 	tried_avoid_double = 1;
4368 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4369 		return 0;
4370 	goto again;
4371 }
4372 
4373 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4374 					 struct btrfs_root *root,
4375 					 struct btrfs_path *path, int ins_len)
4376 {
4377 	struct btrfs_key key;
4378 	struct extent_buffer *leaf;
4379 	struct btrfs_file_extent_item *fi;
4380 	u64 extent_len = 0;
4381 	u32 item_size;
4382 	int ret;
4383 
4384 	leaf = path->nodes[0];
4385 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4386 
4387 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4388 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4389 
4390 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4391 		return 0;
4392 
4393 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4394 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4395 		fi = btrfs_item_ptr(leaf, path->slots[0],
4396 				    struct btrfs_file_extent_item);
4397 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4398 	}
4399 	btrfs_release_path(path);
4400 
4401 	path->keep_locks = 1;
4402 	path->search_for_split = 1;
4403 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4404 	path->search_for_split = 0;
4405 	if (ret < 0)
4406 		goto err;
4407 
4408 	ret = -EAGAIN;
4409 	leaf = path->nodes[0];
4410 	/* if our item isn't there or got smaller, return now */
4411 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4412 		goto err;
4413 
4414 	/* the leaf has  changed, it now has room.  return now */
4415 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4416 		goto err;
4417 
4418 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4419 		fi = btrfs_item_ptr(leaf, path->slots[0],
4420 				    struct btrfs_file_extent_item);
4421 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4422 			goto err;
4423 	}
4424 
4425 	btrfs_set_path_blocking(path);
4426 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4427 	if (ret)
4428 		goto err;
4429 
4430 	path->keep_locks = 0;
4431 	btrfs_unlock_up_safe(path, 1);
4432 	return 0;
4433 err:
4434 	path->keep_locks = 0;
4435 	return ret;
4436 }
4437 
4438 static noinline int split_item(struct btrfs_trans_handle *trans,
4439 			       struct btrfs_root *root,
4440 			       struct btrfs_path *path,
4441 			       struct btrfs_key *new_key,
4442 			       unsigned long split_offset)
4443 {
4444 	struct extent_buffer *leaf;
4445 	struct btrfs_item *item;
4446 	struct btrfs_item *new_item;
4447 	int slot;
4448 	char *buf;
4449 	u32 nritems;
4450 	u32 item_size;
4451 	u32 orig_offset;
4452 	struct btrfs_disk_key disk_key;
4453 
4454 	leaf = path->nodes[0];
4455 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4456 
4457 	btrfs_set_path_blocking(path);
4458 
4459 	item = btrfs_item_nr(path->slots[0]);
4460 	orig_offset = btrfs_item_offset(leaf, item);
4461 	item_size = btrfs_item_size(leaf, item);
4462 
4463 	buf = kmalloc(item_size, GFP_NOFS);
4464 	if (!buf)
4465 		return -ENOMEM;
4466 
4467 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4468 			    path->slots[0]), item_size);
4469 
4470 	slot = path->slots[0] + 1;
4471 	nritems = btrfs_header_nritems(leaf);
4472 	if (slot != nritems) {
4473 		/* shift the items */
4474 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4475 				btrfs_item_nr_offset(slot),
4476 				(nritems - slot) * sizeof(struct btrfs_item));
4477 	}
4478 
4479 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4480 	btrfs_set_item_key(leaf, &disk_key, slot);
4481 
4482 	new_item = btrfs_item_nr(slot);
4483 
4484 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4485 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4486 
4487 	btrfs_set_item_offset(leaf, item,
4488 			      orig_offset + item_size - split_offset);
4489 	btrfs_set_item_size(leaf, item, split_offset);
4490 
4491 	btrfs_set_header_nritems(leaf, nritems + 1);
4492 
4493 	/* write the data for the start of the original item */
4494 	write_extent_buffer(leaf, buf,
4495 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4496 			    split_offset);
4497 
4498 	/* write the data for the new item */
4499 	write_extent_buffer(leaf, buf + split_offset,
4500 			    btrfs_item_ptr_offset(leaf, slot),
4501 			    item_size - split_offset);
4502 	btrfs_mark_buffer_dirty(leaf);
4503 
4504 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4505 	kfree(buf);
4506 	return 0;
4507 }
4508 
4509 /*
4510  * This function splits a single item into two items,
4511  * giving 'new_key' to the new item and splitting the
4512  * old one at split_offset (from the start of the item).
4513  *
4514  * The path may be released by this operation.  After
4515  * the split, the path is pointing to the old item.  The
4516  * new item is going to be in the same node as the old one.
4517  *
4518  * Note, the item being split must be smaller enough to live alone on
4519  * a tree block with room for one extra struct btrfs_item
4520  *
4521  * This allows us to split the item in place, keeping a lock on the
4522  * leaf the entire time.
4523  */
4524 int btrfs_split_item(struct btrfs_trans_handle *trans,
4525 		     struct btrfs_root *root,
4526 		     struct btrfs_path *path,
4527 		     struct btrfs_key *new_key,
4528 		     unsigned long split_offset)
4529 {
4530 	int ret;
4531 	ret = setup_leaf_for_split(trans, root, path,
4532 				   sizeof(struct btrfs_item));
4533 	if (ret)
4534 		return ret;
4535 
4536 	ret = split_item(trans, root, path, new_key, split_offset);
4537 	return ret;
4538 }
4539 
4540 /*
4541  * This function duplicate a item, giving 'new_key' to the new item.
4542  * It guarantees both items live in the same tree leaf and the new item
4543  * is contiguous with the original item.
4544  *
4545  * This allows us to split file extent in place, keeping a lock on the
4546  * leaf the entire time.
4547  */
4548 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4549 			 struct btrfs_root *root,
4550 			 struct btrfs_path *path,
4551 			 struct btrfs_key *new_key)
4552 {
4553 	struct extent_buffer *leaf;
4554 	int ret;
4555 	u32 item_size;
4556 
4557 	leaf = path->nodes[0];
4558 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4559 	ret = setup_leaf_for_split(trans, root, path,
4560 				   item_size + sizeof(struct btrfs_item));
4561 	if (ret)
4562 		return ret;
4563 
4564 	path->slots[0]++;
4565 	setup_items_for_insert(root, path, new_key, &item_size,
4566 			       item_size, item_size +
4567 			       sizeof(struct btrfs_item), 1);
4568 	leaf = path->nodes[0];
4569 	memcpy_extent_buffer(leaf,
4570 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4571 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4572 			     item_size);
4573 	return 0;
4574 }
4575 
4576 /*
4577  * make the item pointed to by the path smaller.  new_size indicates
4578  * how small to make it, and from_end tells us if we just chop bytes
4579  * off the end of the item or if we shift the item to chop bytes off
4580  * the front.
4581  */
4582 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4583 			 u32 new_size, int from_end)
4584 {
4585 	int slot;
4586 	struct extent_buffer *leaf;
4587 	struct btrfs_item *item;
4588 	u32 nritems;
4589 	unsigned int data_end;
4590 	unsigned int old_data_start;
4591 	unsigned int old_size;
4592 	unsigned int size_diff;
4593 	int i;
4594 	struct btrfs_map_token token;
4595 
4596 	btrfs_init_map_token(&token);
4597 
4598 	leaf = path->nodes[0];
4599 	slot = path->slots[0];
4600 
4601 	old_size = btrfs_item_size_nr(leaf, slot);
4602 	if (old_size == new_size)
4603 		return;
4604 
4605 	nritems = btrfs_header_nritems(leaf);
4606 	data_end = leaf_data_end(root, leaf);
4607 
4608 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4609 
4610 	size_diff = old_size - new_size;
4611 
4612 	BUG_ON(slot < 0);
4613 	BUG_ON(slot >= nritems);
4614 
4615 	/*
4616 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4617 	 */
4618 	/* first correct the data pointers */
4619 	for (i = slot; i < nritems; i++) {
4620 		u32 ioff;
4621 		item = btrfs_item_nr(i);
4622 
4623 		ioff = btrfs_token_item_offset(leaf, item, &token);
4624 		btrfs_set_token_item_offset(leaf, item,
4625 					    ioff + size_diff, &token);
4626 	}
4627 
4628 	/* shift the data */
4629 	if (from_end) {
4630 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4631 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4632 			      data_end, old_data_start + new_size - data_end);
4633 	} else {
4634 		struct btrfs_disk_key disk_key;
4635 		u64 offset;
4636 
4637 		btrfs_item_key(leaf, &disk_key, slot);
4638 
4639 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4640 			unsigned long ptr;
4641 			struct btrfs_file_extent_item *fi;
4642 
4643 			fi = btrfs_item_ptr(leaf, slot,
4644 					    struct btrfs_file_extent_item);
4645 			fi = (struct btrfs_file_extent_item *)(
4646 			     (unsigned long)fi - size_diff);
4647 
4648 			if (btrfs_file_extent_type(leaf, fi) ==
4649 			    BTRFS_FILE_EXTENT_INLINE) {
4650 				ptr = btrfs_item_ptr_offset(leaf, slot);
4651 				memmove_extent_buffer(leaf, ptr,
4652 				      (unsigned long)fi,
4653 				      offsetof(struct btrfs_file_extent_item,
4654 						 disk_bytenr));
4655 			}
4656 		}
4657 
4658 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4659 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4660 			      data_end, old_data_start - data_end);
4661 
4662 		offset = btrfs_disk_key_offset(&disk_key);
4663 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4664 		btrfs_set_item_key(leaf, &disk_key, slot);
4665 		if (slot == 0)
4666 			fixup_low_keys(root, path, &disk_key, 1);
4667 	}
4668 
4669 	item = btrfs_item_nr(slot);
4670 	btrfs_set_item_size(leaf, item, new_size);
4671 	btrfs_mark_buffer_dirty(leaf);
4672 
4673 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4674 		btrfs_print_leaf(root, leaf);
4675 		BUG();
4676 	}
4677 }
4678 
4679 /*
4680  * make the item pointed to by the path bigger, data_size is the added size.
4681  */
4682 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4683 		       u32 data_size)
4684 {
4685 	int slot;
4686 	struct extent_buffer *leaf;
4687 	struct btrfs_item *item;
4688 	u32 nritems;
4689 	unsigned int data_end;
4690 	unsigned int old_data;
4691 	unsigned int old_size;
4692 	int i;
4693 	struct btrfs_map_token token;
4694 
4695 	btrfs_init_map_token(&token);
4696 
4697 	leaf = path->nodes[0];
4698 
4699 	nritems = btrfs_header_nritems(leaf);
4700 	data_end = leaf_data_end(root, leaf);
4701 
4702 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4703 		btrfs_print_leaf(root, leaf);
4704 		BUG();
4705 	}
4706 	slot = path->slots[0];
4707 	old_data = btrfs_item_end_nr(leaf, slot);
4708 
4709 	BUG_ON(slot < 0);
4710 	if (slot >= nritems) {
4711 		btrfs_print_leaf(root, leaf);
4712 		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4713 		       slot, nritems);
4714 		BUG_ON(1);
4715 	}
4716 
4717 	/*
4718 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4719 	 */
4720 	/* first correct the data pointers */
4721 	for (i = slot; i < nritems; i++) {
4722 		u32 ioff;
4723 		item = btrfs_item_nr(i);
4724 
4725 		ioff = btrfs_token_item_offset(leaf, item, &token);
4726 		btrfs_set_token_item_offset(leaf, item,
4727 					    ioff - data_size, &token);
4728 	}
4729 
4730 	/* shift the data */
4731 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4732 		      data_end - data_size, btrfs_leaf_data(leaf) +
4733 		      data_end, old_data - data_end);
4734 
4735 	data_end = old_data;
4736 	old_size = btrfs_item_size_nr(leaf, slot);
4737 	item = btrfs_item_nr(slot);
4738 	btrfs_set_item_size(leaf, item, old_size + data_size);
4739 	btrfs_mark_buffer_dirty(leaf);
4740 
4741 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4742 		btrfs_print_leaf(root, leaf);
4743 		BUG();
4744 	}
4745 }
4746 
4747 /*
4748  * this is a helper for btrfs_insert_empty_items, the main goal here is
4749  * to save stack depth by doing the bulk of the work in a function
4750  * that doesn't call btrfs_search_slot
4751  */
4752 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4753 			    struct btrfs_key *cpu_key, u32 *data_size,
4754 			    u32 total_data, u32 total_size, int nr)
4755 {
4756 	struct btrfs_item *item;
4757 	int i;
4758 	u32 nritems;
4759 	unsigned int data_end;
4760 	struct btrfs_disk_key disk_key;
4761 	struct extent_buffer *leaf;
4762 	int slot;
4763 	struct btrfs_map_token token;
4764 
4765 	btrfs_init_map_token(&token);
4766 
4767 	leaf = path->nodes[0];
4768 	slot = path->slots[0];
4769 
4770 	nritems = btrfs_header_nritems(leaf);
4771 	data_end = leaf_data_end(root, leaf);
4772 
4773 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4774 		btrfs_print_leaf(root, leaf);
4775 		btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4776 		       total_size, btrfs_leaf_free_space(root, leaf));
4777 		BUG();
4778 	}
4779 
4780 	if (slot != nritems) {
4781 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4782 
4783 		if (old_data < data_end) {
4784 			btrfs_print_leaf(root, leaf);
4785 			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4786 			       slot, old_data, data_end);
4787 			BUG_ON(1);
4788 		}
4789 		/*
4790 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4791 		 */
4792 		/* first correct the data pointers */
4793 		for (i = slot; i < nritems; i++) {
4794 			u32 ioff;
4795 
4796 			item = btrfs_item_nr( i);
4797 			ioff = btrfs_token_item_offset(leaf, item, &token);
4798 			btrfs_set_token_item_offset(leaf, item,
4799 						    ioff - total_data, &token);
4800 		}
4801 		/* shift the items */
4802 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4803 			      btrfs_item_nr_offset(slot),
4804 			      (nritems - slot) * sizeof(struct btrfs_item));
4805 
4806 		/* shift the data */
4807 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4808 			      data_end - total_data, btrfs_leaf_data(leaf) +
4809 			      data_end, old_data - data_end);
4810 		data_end = old_data;
4811 	}
4812 
4813 	/* setup the item for the new data */
4814 	for (i = 0; i < nr; i++) {
4815 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4816 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4817 		item = btrfs_item_nr(slot + i);
4818 		btrfs_set_token_item_offset(leaf, item,
4819 					    data_end - data_size[i], &token);
4820 		data_end -= data_size[i];
4821 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4822 	}
4823 
4824 	btrfs_set_header_nritems(leaf, nritems + nr);
4825 
4826 	if (slot == 0) {
4827 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4828 		fixup_low_keys(root, path, &disk_key, 1);
4829 	}
4830 	btrfs_unlock_up_safe(path, 1);
4831 	btrfs_mark_buffer_dirty(leaf);
4832 
4833 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4834 		btrfs_print_leaf(root, leaf);
4835 		BUG();
4836 	}
4837 }
4838 
4839 /*
4840  * Given a key and some data, insert items into the tree.
4841  * This does all the path init required, making room in the tree if needed.
4842  */
4843 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4844 			    struct btrfs_root *root,
4845 			    struct btrfs_path *path,
4846 			    struct btrfs_key *cpu_key, u32 *data_size,
4847 			    int nr)
4848 {
4849 	int ret = 0;
4850 	int slot;
4851 	int i;
4852 	u32 total_size = 0;
4853 	u32 total_data = 0;
4854 
4855 	for (i = 0; i < nr; i++)
4856 		total_data += data_size[i];
4857 
4858 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4859 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4860 	if (ret == 0)
4861 		return -EEXIST;
4862 	if (ret < 0)
4863 		return ret;
4864 
4865 	slot = path->slots[0];
4866 	BUG_ON(slot < 0);
4867 
4868 	setup_items_for_insert(root, path, cpu_key, data_size,
4869 			       total_data, total_size, nr);
4870 	return 0;
4871 }
4872 
4873 /*
4874  * Given a key and some data, insert an item into the tree.
4875  * This does all the path init required, making room in the tree if needed.
4876  */
4877 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4878 		      *root, struct btrfs_key *cpu_key, void *data, u32
4879 		      data_size)
4880 {
4881 	int ret = 0;
4882 	struct btrfs_path *path;
4883 	struct extent_buffer *leaf;
4884 	unsigned long ptr;
4885 
4886 	path = btrfs_alloc_path();
4887 	if (!path)
4888 		return -ENOMEM;
4889 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4890 	if (!ret) {
4891 		leaf = path->nodes[0];
4892 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4893 		write_extent_buffer(leaf, data, ptr, data_size);
4894 		btrfs_mark_buffer_dirty(leaf);
4895 	}
4896 	btrfs_free_path(path);
4897 	return ret;
4898 }
4899 
4900 /*
4901  * delete the pointer from a given node.
4902  *
4903  * the tree should have been previously balanced so the deletion does not
4904  * empty a node.
4905  */
4906 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4907 		    int level, int slot)
4908 {
4909 	struct extent_buffer *parent = path->nodes[level];
4910 	u32 nritems;
4911 	int ret;
4912 
4913 	nritems = btrfs_header_nritems(parent);
4914 	if (slot != nritems - 1) {
4915 		if (level)
4916 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4917 					     slot + 1, nritems - slot - 1);
4918 		memmove_extent_buffer(parent,
4919 			      btrfs_node_key_ptr_offset(slot),
4920 			      btrfs_node_key_ptr_offset(slot + 1),
4921 			      sizeof(struct btrfs_key_ptr) *
4922 			      (nritems - slot - 1));
4923 	} else if (level) {
4924 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4925 					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
4926 		BUG_ON(ret < 0);
4927 	}
4928 
4929 	nritems--;
4930 	btrfs_set_header_nritems(parent, nritems);
4931 	if (nritems == 0 && parent == root->node) {
4932 		BUG_ON(btrfs_header_level(root->node) != 1);
4933 		/* just turn the root into a leaf and break */
4934 		btrfs_set_header_level(root->node, 0);
4935 	} else if (slot == 0) {
4936 		struct btrfs_disk_key disk_key;
4937 
4938 		btrfs_node_key(parent, &disk_key, 0);
4939 		fixup_low_keys(root, path, &disk_key, level + 1);
4940 	}
4941 	btrfs_mark_buffer_dirty(parent);
4942 }
4943 
4944 /*
4945  * a helper function to delete the leaf pointed to by path->slots[1] and
4946  * path->nodes[1].
4947  *
4948  * This deletes the pointer in path->nodes[1] and frees the leaf
4949  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4950  *
4951  * The path must have already been setup for deleting the leaf, including
4952  * all the proper balancing.  path->nodes[1] must be locked.
4953  */
4954 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4955 				    struct btrfs_root *root,
4956 				    struct btrfs_path *path,
4957 				    struct extent_buffer *leaf)
4958 {
4959 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4960 	del_ptr(root, path, 1, path->slots[1]);
4961 
4962 	/*
4963 	 * btrfs_free_extent is expensive, we want to make sure we
4964 	 * aren't holding any locks when we call it
4965 	 */
4966 	btrfs_unlock_up_safe(path, 0);
4967 
4968 	root_sub_used(root, leaf->len);
4969 
4970 	extent_buffer_get(leaf);
4971 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4972 	free_extent_buffer_stale(leaf);
4973 }
4974 /*
4975  * delete the item at the leaf level in path.  If that empties
4976  * the leaf, remove it from the tree
4977  */
4978 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4979 		    struct btrfs_path *path, int slot, int nr)
4980 {
4981 	struct extent_buffer *leaf;
4982 	struct btrfs_item *item;
4983 	int last_off;
4984 	int dsize = 0;
4985 	int ret = 0;
4986 	int wret;
4987 	int i;
4988 	u32 nritems;
4989 	struct btrfs_map_token token;
4990 
4991 	btrfs_init_map_token(&token);
4992 
4993 	leaf = path->nodes[0];
4994 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4995 
4996 	for (i = 0; i < nr; i++)
4997 		dsize += btrfs_item_size_nr(leaf, slot + i);
4998 
4999 	nritems = btrfs_header_nritems(leaf);
5000 
5001 	if (slot + nr != nritems) {
5002 		int data_end = leaf_data_end(root, leaf);
5003 
5004 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
5005 			      data_end + dsize,
5006 			      btrfs_leaf_data(leaf) + data_end,
5007 			      last_off - data_end);
5008 
5009 		for (i = slot + nr; i < nritems; i++) {
5010 			u32 ioff;
5011 
5012 			item = btrfs_item_nr(i);
5013 			ioff = btrfs_token_item_offset(leaf, item, &token);
5014 			btrfs_set_token_item_offset(leaf, item,
5015 						    ioff + dsize, &token);
5016 		}
5017 
5018 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5019 			      btrfs_item_nr_offset(slot + nr),
5020 			      sizeof(struct btrfs_item) *
5021 			      (nritems - slot - nr));
5022 	}
5023 	btrfs_set_header_nritems(leaf, nritems - nr);
5024 	nritems -= nr;
5025 
5026 	/* delete the leaf if we've emptied it */
5027 	if (nritems == 0) {
5028 		if (leaf == root->node) {
5029 			btrfs_set_header_level(leaf, 0);
5030 		} else {
5031 			btrfs_set_path_blocking(path);
5032 			clean_tree_block(trans, root, leaf);
5033 			btrfs_del_leaf(trans, root, path, leaf);
5034 		}
5035 	} else {
5036 		int used = leaf_space_used(leaf, 0, nritems);
5037 		if (slot == 0) {
5038 			struct btrfs_disk_key disk_key;
5039 
5040 			btrfs_item_key(leaf, &disk_key, 0);
5041 			fixup_low_keys(root, path, &disk_key, 1);
5042 		}
5043 
5044 		/* delete the leaf if it is mostly empty */
5045 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5046 			/* push_leaf_left fixes the path.
5047 			 * make sure the path still points to our leaf
5048 			 * for possible call to del_ptr below
5049 			 */
5050 			slot = path->slots[1];
5051 			extent_buffer_get(leaf);
5052 
5053 			btrfs_set_path_blocking(path);
5054 			wret = push_leaf_left(trans, root, path, 1, 1,
5055 					      1, (u32)-1);
5056 			if (wret < 0 && wret != -ENOSPC)
5057 				ret = wret;
5058 
5059 			if (path->nodes[0] == leaf &&
5060 			    btrfs_header_nritems(leaf)) {
5061 				wret = push_leaf_right(trans, root, path, 1,
5062 						       1, 1, 0);
5063 				if (wret < 0 && wret != -ENOSPC)
5064 					ret = wret;
5065 			}
5066 
5067 			if (btrfs_header_nritems(leaf) == 0) {
5068 				path->slots[1] = slot;
5069 				btrfs_del_leaf(trans, root, path, leaf);
5070 				free_extent_buffer(leaf);
5071 				ret = 0;
5072 			} else {
5073 				/* if we're still in the path, make sure
5074 				 * we're dirty.  Otherwise, one of the
5075 				 * push_leaf functions must have already
5076 				 * dirtied this buffer
5077 				 */
5078 				if (path->nodes[0] == leaf)
5079 					btrfs_mark_buffer_dirty(leaf);
5080 				free_extent_buffer(leaf);
5081 			}
5082 		} else {
5083 			btrfs_mark_buffer_dirty(leaf);
5084 		}
5085 	}
5086 	return ret;
5087 }
5088 
5089 /*
5090  * search the tree again to find a leaf with lesser keys
5091  * returns 0 if it found something or 1 if there are no lesser leaves.
5092  * returns < 0 on io errors.
5093  *
5094  * This may release the path, and so you may lose any locks held at the
5095  * time you call it.
5096  */
5097 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5098 {
5099 	struct btrfs_key key;
5100 	struct btrfs_disk_key found_key;
5101 	int ret;
5102 
5103 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5104 
5105 	if (key.offset > 0) {
5106 		key.offset--;
5107 	} else if (key.type > 0) {
5108 		key.type--;
5109 		key.offset = (u64)-1;
5110 	} else if (key.objectid > 0) {
5111 		key.objectid--;
5112 		key.type = (u8)-1;
5113 		key.offset = (u64)-1;
5114 	} else {
5115 		return 1;
5116 	}
5117 
5118 	btrfs_release_path(path);
5119 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5120 	if (ret < 0)
5121 		return ret;
5122 	btrfs_item_key(path->nodes[0], &found_key, 0);
5123 	ret = comp_keys(&found_key, &key);
5124 	if (ret < 0)
5125 		return 0;
5126 	return 1;
5127 }
5128 
5129 /*
5130  * A helper function to walk down the tree starting at min_key, and looking
5131  * for nodes or leaves that are have a minimum transaction id.
5132  * This is used by the btree defrag code, and tree logging
5133  *
5134  * This does not cow, but it does stuff the starting key it finds back
5135  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5136  * key and get a writable path.
5137  *
5138  * This does lock as it descends, and path->keep_locks should be set
5139  * to 1 by the caller.
5140  *
5141  * This honors path->lowest_level to prevent descent past a given level
5142  * of the tree.
5143  *
5144  * min_trans indicates the oldest transaction that you are interested
5145  * in walking through.  Any nodes or leaves older than min_trans are
5146  * skipped over (without reading them).
5147  *
5148  * returns zero if something useful was found, < 0 on error and 1 if there
5149  * was nothing in the tree that matched the search criteria.
5150  */
5151 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5152 			 struct btrfs_path *path,
5153 			 u64 min_trans)
5154 {
5155 	struct extent_buffer *cur;
5156 	struct btrfs_key found_key;
5157 	int slot;
5158 	int sret;
5159 	u32 nritems;
5160 	int level;
5161 	int ret = 1;
5162 
5163 	WARN_ON(!path->keep_locks);
5164 again:
5165 	cur = btrfs_read_lock_root_node(root);
5166 	level = btrfs_header_level(cur);
5167 	WARN_ON(path->nodes[level]);
5168 	path->nodes[level] = cur;
5169 	path->locks[level] = BTRFS_READ_LOCK;
5170 
5171 	if (btrfs_header_generation(cur) < min_trans) {
5172 		ret = 1;
5173 		goto out;
5174 	}
5175 	while (1) {
5176 		nritems = btrfs_header_nritems(cur);
5177 		level = btrfs_header_level(cur);
5178 		sret = bin_search(cur, min_key, level, &slot);
5179 
5180 		/* at the lowest level, we're done, setup the path and exit */
5181 		if (level == path->lowest_level) {
5182 			if (slot >= nritems)
5183 				goto find_next_key;
5184 			ret = 0;
5185 			path->slots[level] = slot;
5186 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5187 			goto out;
5188 		}
5189 		if (sret && slot > 0)
5190 			slot--;
5191 		/*
5192 		 * check this node pointer against the min_trans parameters.
5193 		 * If it is too old, old, skip to the next one.
5194 		 */
5195 		while (slot < nritems) {
5196 			u64 gen;
5197 
5198 			gen = btrfs_node_ptr_generation(cur, slot);
5199 			if (gen < min_trans) {
5200 				slot++;
5201 				continue;
5202 			}
5203 			break;
5204 		}
5205 find_next_key:
5206 		/*
5207 		 * we didn't find a candidate key in this node, walk forward
5208 		 * and find another one
5209 		 */
5210 		if (slot >= nritems) {
5211 			path->slots[level] = slot;
5212 			btrfs_set_path_blocking(path);
5213 			sret = btrfs_find_next_key(root, path, min_key, level,
5214 						  min_trans);
5215 			if (sret == 0) {
5216 				btrfs_release_path(path);
5217 				goto again;
5218 			} else {
5219 				goto out;
5220 			}
5221 		}
5222 		/* save our key for returning back */
5223 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5224 		path->slots[level] = slot;
5225 		if (level == path->lowest_level) {
5226 			ret = 0;
5227 			unlock_up(path, level, 1, 0, NULL);
5228 			goto out;
5229 		}
5230 		btrfs_set_path_blocking(path);
5231 		cur = read_node_slot(root, cur, slot);
5232 		BUG_ON(!cur); /* -ENOMEM */
5233 
5234 		btrfs_tree_read_lock(cur);
5235 
5236 		path->locks[level - 1] = BTRFS_READ_LOCK;
5237 		path->nodes[level - 1] = cur;
5238 		unlock_up(path, level, 1, 0, NULL);
5239 		btrfs_clear_path_blocking(path, NULL, 0);
5240 	}
5241 out:
5242 	if (ret == 0)
5243 		memcpy(min_key, &found_key, sizeof(found_key));
5244 	btrfs_set_path_blocking(path);
5245 	return ret;
5246 }
5247 
5248 static void tree_move_down(struct btrfs_root *root,
5249 			   struct btrfs_path *path,
5250 			   int *level, int root_level)
5251 {
5252 	BUG_ON(*level == 0);
5253 	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5254 					path->slots[*level]);
5255 	path->slots[*level - 1] = 0;
5256 	(*level)--;
5257 }
5258 
5259 static int tree_move_next_or_upnext(struct btrfs_root *root,
5260 				    struct btrfs_path *path,
5261 				    int *level, int root_level)
5262 {
5263 	int ret = 0;
5264 	int nritems;
5265 	nritems = btrfs_header_nritems(path->nodes[*level]);
5266 
5267 	path->slots[*level]++;
5268 
5269 	while (path->slots[*level] >= nritems) {
5270 		if (*level == root_level)
5271 			return -1;
5272 
5273 		/* move upnext */
5274 		path->slots[*level] = 0;
5275 		free_extent_buffer(path->nodes[*level]);
5276 		path->nodes[*level] = NULL;
5277 		(*level)++;
5278 		path->slots[*level]++;
5279 
5280 		nritems = btrfs_header_nritems(path->nodes[*level]);
5281 		ret = 1;
5282 	}
5283 	return ret;
5284 }
5285 
5286 /*
5287  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5288  * or down.
5289  */
5290 static int tree_advance(struct btrfs_root *root,
5291 			struct btrfs_path *path,
5292 			int *level, int root_level,
5293 			int allow_down,
5294 			struct btrfs_key *key)
5295 {
5296 	int ret;
5297 
5298 	if (*level == 0 || !allow_down) {
5299 		ret = tree_move_next_or_upnext(root, path, level, root_level);
5300 	} else {
5301 		tree_move_down(root, path, level, root_level);
5302 		ret = 0;
5303 	}
5304 	if (ret >= 0) {
5305 		if (*level == 0)
5306 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5307 					path->slots[*level]);
5308 		else
5309 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5310 					path->slots[*level]);
5311 	}
5312 	return ret;
5313 }
5314 
5315 static int tree_compare_item(struct btrfs_root *left_root,
5316 			     struct btrfs_path *left_path,
5317 			     struct btrfs_path *right_path,
5318 			     char *tmp_buf)
5319 {
5320 	int cmp;
5321 	int len1, len2;
5322 	unsigned long off1, off2;
5323 
5324 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5325 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5326 	if (len1 != len2)
5327 		return 1;
5328 
5329 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5330 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5331 				right_path->slots[0]);
5332 
5333 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5334 
5335 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5336 	if (cmp)
5337 		return 1;
5338 	return 0;
5339 }
5340 
5341 #define ADVANCE 1
5342 #define ADVANCE_ONLY_NEXT -1
5343 
5344 /*
5345  * This function compares two trees and calls the provided callback for
5346  * every changed/new/deleted item it finds.
5347  * If shared tree blocks are encountered, whole subtrees are skipped, making
5348  * the compare pretty fast on snapshotted subvolumes.
5349  *
5350  * This currently works on commit roots only. As commit roots are read only,
5351  * we don't do any locking. The commit roots are protected with transactions.
5352  * Transactions are ended and rejoined when a commit is tried in between.
5353  *
5354  * This function checks for modifications done to the trees while comparing.
5355  * If it detects a change, it aborts immediately.
5356  */
5357 int btrfs_compare_trees(struct btrfs_root *left_root,
5358 			struct btrfs_root *right_root,
5359 			btrfs_changed_cb_t changed_cb, void *ctx)
5360 {
5361 	int ret;
5362 	int cmp;
5363 	struct btrfs_trans_handle *trans = NULL;
5364 	struct btrfs_path *left_path = NULL;
5365 	struct btrfs_path *right_path = NULL;
5366 	struct btrfs_key left_key;
5367 	struct btrfs_key right_key;
5368 	char *tmp_buf = NULL;
5369 	int left_root_level;
5370 	int right_root_level;
5371 	int left_level;
5372 	int right_level;
5373 	int left_end_reached;
5374 	int right_end_reached;
5375 	int advance_left;
5376 	int advance_right;
5377 	u64 left_blockptr;
5378 	u64 right_blockptr;
5379 	u64 left_start_ctransid;
5380 	u64 right_start_ctransid;
5381 	u64 ctransid;
5382 
5383 	left_path = btrfs_alloc_path();
5384 	if (!left_path) {
5385 		ret = -ENOMEM;
5386 		goto out;
5387 	}
5388 	right_path = btrfs_alloc_path();
5389 	if (!right_path) {
5390 		ret = -ENOMEM;
5391 		goto out;
5392 	}
5393 
5394 	tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5395 	if (!tmp_buf) {
5396 		ret = -ENOMEM;
5397 		goto out;
5398 	}
5399 
5400 	left_path->search_commit_root = 1;
5401 	left_path->skip_locking = 1;
5402 	right_path->search_commit_root = 1;
5403 	right_path->skip_locking = 1;
5404 
5405 	spin_lock(&left_root->root_item_lock);
5406 	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5407 	spin_unlock(&left_root->root_item_lock);
5408 
5409 	spin_lock(&right_root->root_item_lock);
5410 	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5411 	spin_unlock(&right_root->root_item_lock);
5412 
5413 	trans = btrfs_join_transaction(left_root);
5414 	if (IS_ERR(trans)) {
5415 		ret = PTR_ERR(trans);
5416 		trans = NULL;
5417 		goto out;
5418 	}
5419 
5420 	/*
5421 	 * Strategy: Go to the first items of both trees. Then do
5422 	 *
5423 	 * If both trees are at level 0
5424 	 *   Compare keys of current items
5425 	 *     If left < right treat left item as new, advance left tree
5426 	 *       and repeat
5427 	 *     If left > right treat right item as deleted, advance right tree
5428 	 *       and repeat
5429 	 *     If left == right do deep compare of items, treat as changed if
5430 	 *       needed, advance both trees and repeat
5431 	 * If both trees are at the same level but not at level 0
5432 	 *   Compare keys of current nodes/leafs
5433 	 *     If left < right advance left tree and repeat
5434 	 *     If left > right advance right tree and repeat
5435 	 *     If left == right compare blockptrs of the next nodes/leafs
5436 	 *       If they match advance both trees but stay at the same level
5437 	 *         and repeat
5438 	 *       If they don't match advance both trees while allowing to go
5439 	 *         deeper and repeat
5440 	 * If tree levels are different
5441 	 *   Advance the tree that needs it and repeat
5442 	 *
5443 	 * Advancing a tree means:
5444 	 *   If we are at level 0, try to go to the next slot. If that's not
5445 	 *   possible, go one level up and repeat. Stop when we found a level
5446 	 *   where we could go to the next slot. We may at this point be on a
5447 	 *   node or a leaf.
5448 	 *
5449 	 *   If we are not at level 0 and not on shared tree blocks, go one
5450 	 *   level deeper.
5451 	 *
5452 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5453 	 *   the right if possible or go up and right.
5454 	 */
5455 
5456 	left_level = btrfs_header_level(left_root->commit_root);
5457 	left_root_level = left_level;
5458 	left_path->nodes[left_level] = left_root->commit_root;
5459 	extent_buffer_get(left_path->nodes[left_level]);
5460 
5461 	right_level = btrfs_header_level(right_root->commit_root);
5462 	right_root_level = right_level;
5463 	right_path->nodes[right_level] = right_root->commit_root;
5464 	extent_buffer_get(right_path->nodes[right_level]);
5465 
5466 	if (left_level == 0)
5467 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5468 				&left_key, left_path->slots[left_level]);
5469 	else
5470 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5471 				&left_key, left_path->slots[left_level]);
5472 	if (right_level == 0)
5473 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5474 				&right_key, right_path->slots[right_level]);
5475 	else
5476 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5477 				&right_key, right_path->slots[right_level]);
5478 
5479 	left_end_reached = right_end_reached = 0;
5480 	advance_left = advance_right = 0;
5481 
5482 	while (1) {
5483 		/*
5484 		 * We need to make sure the transaction does not get committed
5485 		 * while we do anything on commit roots. This means, we need to
5486 		 * join and leave transactions for every item that we process.
5487 		 */
5488 		if (trans && btrfs_should_end_transaction(trans, left_root)) {
5489 			btrfs_release_path(left_path);
5490 			btrfs_release_path(right_path);
5491 
5492 			ret = btrfs_end_transaction(trans, left_root);
5493 			trans = NULL;
5494 			if (ret < 0)
5495 				goto out;
5496 		}
5497 		/* now rejoin the transaction */
5498 		if (!trans) {
5499 			trans = btrfs_join_transaction(left_root);
5500 			if (IS_ERR(trans)) {
5501 				ret = PTR_ERR(trans);
5502 				trans = NULL;
5503 				goto out;
5504 			}
5505 
5506 			spin_lock(&left_root->root_item_lock);
5507 			ctransid = btrfs_root_ctransid(&left_root->root_item);
5508 			spin_unlock(&left_root->root_item_lock);
5509 			if (ctransid != left_start_ctransid)
5510 				left_start_ctransid = 0;
5511 
5512 			spin_lock(&right_root->root_item_lock);
5513 			ctransid = btrfs_root_ctransid(&right_root->root_item);
5514 			spin_unlock(&right_root->root_item_lock);
5515 			if (ctransid != right_start_ctransid)
5516 				right_start_ctransid = 0;
5517 
5518 			if (!left_start_ctransid || !right_start_ctransid) {
5519 				WARN(1, KERN_WARNING
5520 					"BTRFS: btrfs_compare_tree detected "
5521 					"a change in one of the trees while "
5522 					"iterating. This is probably a "
5523 					"bug.\n");
5524 				ret = -EIO;
5525 				goto out;
5526 			}
5527 
5528 			/*
5529 			 * the commit root may have changed, so start again
5530 			 * where we stopped
5531 			 */
5532 			left_path->lowest_level = left_level;
5533 			right_path->lowest_level = right_level;
5534 			ret = btrfs_search_slot(NULL, left_root,
5535 					&left_key, left_path, 0, 0);
5536 			if (ret < 0)
5537 				goto out;
5538 			ret = btrfs_search_slot(NULL, right_root,
5539 					&right_key, right_path, 0, 0);
5540 			if (ret < 0)
5541 				goto out;
5542 		}
5543 
5544 		if (advance_left && !left_end_reached) {
5545 			ret = tree_advance(left_root, left_path, &left_level,
5546 					left_root_level,
5547 					advance_left != ADVANCE_ONLY_NEXT,
5548 					&left_key);
5549 			if (ret < 0)
5550 				left_end_reached = ADVANCE;
5551 			advance_left = 0;
5552 		}
5553 		if (advance_right && !right_end_reached) {
5554 			ret = tree_advance(right_root, right_path, &right_level,
5555 					right_root_level,
5556 					advance_right != ADVANCE_ONLY_NEXT,
5557 					&right_key);
5558 			if (ret < 0)
5559 				right_end_reached = ADVANCE;
5560 			advance_right = 0;
5561 		}
5562 
5563 		if (left_end_reached && right_end_reached) {
5564 			ret = 0;
5565 			goto out;
5566 		} else if (left_end_reached) {
5567 			if (right_level == 0) {
5568 				ret = changed_cb(left_root, right_root,
5569 						left_path, right_path,
5570 						&right_key,
5571 						BTRFS_COMPARE_TREE_DELETED,
5572 						ctx);
5573 				if (ret < 0)
5574 					goto out;
5575 			}
5576 			advance_right = ADVANCE;
5577 			continue;
5578 		} else if (right_end_reached) {
5579 			if (left_level == 0) {
5580 				ret = changed_cb(left_root, right_root,
5581 						left_path, right_path,
5582 						&left_key,
5583 						BTRFS_COMPARE_TREE_NEW,
5584 						ctx);
5585 				if (ret < 0)
5586 					goto out;
5587 			}
5588 			advance_left = ADVANCE;
5589 			continue;
5590 		}
5591 
5592 		if (left_level == 0 && right_level == 0) {
5593 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5594 			if (cmp < 0) {
5595 				ret = changed_cb(left_root, right_root,
5596 						left_path, right_path,
5597 						&left_key,
5598 						BTRFS_COMPARE_TREE_NEW,
5599 						ctx);
5600 				if (ret < 0)
5601 					goto out;
5602 				advance_left = ADVANCE;
5603 			} else if (cmp > 0) {
5604 				ret = changed_cb(left_root, right_root,
5605 						left_path, right_path,
5606 						&right_key,
5607 						BTRFS_COMPARE_TREE_DELETED,
5608 						ctx);
5609 				if (ret < 0)
5610 					goto out;
5611 				advance_right = ADVANCE;
5612 			} else {
5613 				enum btrfs_compare_tree_result cmp;
5614 
5615 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5616 				ret = tree_compare_item(left_root, left_path,
5617 						right_path, tmp_buf);
5618 				if (ret)
5619 					cmp = BTRFS_COMPARE_TREE_CHANGED;
5620 				else
5621 					cmp = BTRFS_COMPARE_TREE_SAME;
5622 				ret = changed_cb(left_root, right_root,
5623 						 left_path, right_path,
5624 						 &left_key, cmp, ctx);
5625 				if (ret < 0)
5626 					goto out;
5627 				advance_left = ADVANCE;
5628 				advance_right = ADVANCE;
5629 			}
5630 		} else if (left_level == right_level) {
5631 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5632 			if (cmp < 0) {
5633 				advance_left = ADVANCE;
5634 			} else if (cmp > 0) {
5635 				advance_right = ADVANCE;
5636 			} else {
5637 				left_blockptr = btrfs_node_blockptr(
5638 						left_path->nodes[left_level],
5639 						left_path->slots[left_level]);
5640 				right_blockptr = btrfs_node_blockptr(
5641 						right_path->nodes[right_level],
5642 						right_path->slots[right_level]);
5643 				if (left_blockptr == right_blockptr) {
5644 					/*
5645 					 * As we're on a shared block, don't
5646 					 * allow to go deeper.
5647 					 */
5648 					advance_left = ADVANCE_ONLY_NEXT;
5649 					advance_right = ADVANCE_ONLY_NEXT;
5650 				} else {
5651 					advance_left = ADVANCE;
5652 					advance_right = ADVANCE;
5653 				}
5654 			}
5655 		} else if (left_level < right_level) {
5656 			advance_right = ADVANCE;
5657 		} else {
5658 			advance_left = ADVANCE;
5659 		}
5660 	}
5661 
5662 out:
5663 	btrfs_free_path(left_path);
5664 	btrfs_free_path(right_path);
5665 	kfree(tmp_buf);
5666 
5667 	if (trans) {
5668 		if (!ret)
5669 			ret = btrfs_end_transaction(trans, left_root);
5670 		else
5671 			btrfs_end_transaction(trans, left_root);
5672 	}
5673 
5674 	return ret;
5675 }
5676 
5677 /*
5678  * this is similar to btrfs_next_leaf, but does not try to preserve
5679  * and fixup the path.  It looks for and returns the next key in the
5680  * tree based on the current path and the min_trans parameters.
5681  *
5682  * 0 is returned if another key is found, < 0 if there are any errors
5683  * and 1 is returned if there are no higher keys in the tree
5684  *
5685  * path->keep_locks should be set to 1 on the search made before
5686  * calling this function.
5687  */
5688 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5689 			struct btrfs_key *key, int level, u64 min_trans)
5690 {
5691 	int slot;
5692 	struct extent_buffer *c;
5693 
5694 	WARN_ON(!path->keep_locks);
5695 	while (level < BTRFS_MAX_LEVEL) {
5696 		if (!path->nodes[level])
5697 			return 1;
5698 
5699 		slot = path->slots[level] + 1;
5700 		c = path->nodes[level];
5701 next:
5702 		if (slot >= btrfs_header_nritems(c)) {
5703 			int ret;
5704 			int orig_lowest;
5705 			struct btrfs_key cur_key;
5706 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5707 			    !path->nodes[level + 1])
5708 				return 1;
5709 
5710 			if (path->locks[level + 1]) {
5711 				level++;
5712 				continue;
5713 			}
5714 
5715 			slot = btrfs_header_nritems(c) - 1;
5716 			if (level == 0)
5717 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5718 			else
5719 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5720 
5721 			orig_lowest = path->lowest_level;
5722 			btrfs_release_path(path);
5723 			path->lowest_level = level;
5724 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5725 						0, 0);
5726 			path->lowest_level = orig_lowest;
5727 			if (ret < 0)
5728 				return ret;
5729 
5730 			c = path->nodes[level];
5731 			slot = path->slots[level];
5732 			if (ret == 0)
5733 				slot++;
5734 			goto next;
5735 		}
5736 
5737 		if (level == 0)
5738 			btrfs_item_key_to_cpu(c, key, slot);
5739 		else {
5740 			u64 gen = btrfs_node_ptr_generation(c, slot);
5741 
5742 			if (gen < min_trans) {
5743 				slot++;
5744 				goto next;
5745 			}
5746 			btrfs_node_key_to_cpu(c, key, slot);
5747 		}
5748 		return 0;
5749 	}
5750 	return 1;
5751 }
5752 
5753 /*
5754  * search the tree again to find a leaf with greater keys
5755  * returns 0 if it found something or 1 if there are no greater leaves.
5756  * returns < 0 on io errors.
5757  */
5758 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5759 {
5760 	return btrfs_next_old_leaf(root, path, 0);
5761 }
5762 
5763 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5764 			u64 time_seq)
5765 {
5766 	int slot;
5767 	int level;
5768 	struct extent_buffer *c;
5769 	struct extent_buffer *next;
5770 	struct btrfs_key key;
5771 	u32 nritems;
5772 	int ret;
5773 	int old_spinning = path->leave_spinning;
5774 	int next_rw_lock = 0;
5775 
5776 	nritems = btrfs_header_nritems(path->nodes[0]);
5777 	if (nritems == 0)
5778 		return 1;
5779 
5780 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5781 again:
5782 	level = 1;
5783 	next = NULL;
5784 	next_rw_lock = 0;
5785 	btrfs_release_path(path);
5786 
5787 	path->keep_locks = 1;
5788 	path->leave_spinning = 1;
5789 
5790 	if (time_seq)
5791 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5792 	else
5793 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5794 	path->keep_locks = 0;
5795 
5796 	if (ret < 0)
5797 		return ret;
5798 
5799 	nritems = btrfs_header_nritems(path->nodes[0]);
5800 	/*
5801 	 * by releasing the path above we dropped all our locks.  A balance
5802 	 * could have added more items next to the key that used to be
5803 	 * at the very end of the block.  So, check again here and
5804 	 * advance the path if there are now more items available.
5805 	 */
5806 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5807 		if (ret == 0)
5808 			path->slots[0]++;
5809 		ret = 0;
5810 		goto done;
5811 	}
5812 
5813 	while (level < BTRFS_MAX_LEVEL) {
5814 		if (!path->nodes[level]) {
5815 			ret = 1;
5816 			goto done;
5817 		}
5818 
5819 		slot = path->slots[level] + 1;
5820 		c = path->nodes[level];
5821 		if (slot >= btrfs_header_nritems(c)) {
5822 			level++;
5823 			if (level == BTRFS_MAX_LEVEL) {
5824 				ret = 1;
5825 				goto done;
5826 			}
5827 			continue;
5828 		}
5829 
5830 		if (next) {
5831 			btrfs_tree_unlock_rw(next, next_rw_lock);
5832 			free_extent_buffer(next);
5833 		}
5834 
5835 		next = c;
5836 		next_rw_lock = path->locks[level];
5837 		ret = read_block_for_search(NULL, root, path, &next, level,
5838 					    slot, &key, 0);
5839 		if (ret == -EAGAIN)
5840 			goto again;
5841 
5842 		if (ret < 0) {
5843 			btrfs_release_path(path);
5844 			goto done;
5845 		}
5846 
5847 		if (!path->skip_locking) {
5848 			ret = btrfs_try_tree_read_lock(next);
5849 			if (!ret && time_seq) {
5850 				/*
5851 				 * If we don't get the lock, we may be racing
5852 				 * with push_leaf_left, holding that lock while
5853 				 * itself waiting for the leaf we've currently
5854 				 * locked. To solve this situation, we give up
5855 				 * on our lock and cycle.
5856 				 */
5857 				free_extent_buffer(next);
5858 				btrfs_release_path(path);
5859 				cond_resched();
5860 				goto again;
5861 			}
5862 			if (!ret) {
5863 				btrfs_set_path_blocking(path);
5864 				btrfs_tree_read_lock(next);
5865 				btrfs_clear_path_blocking(path, next,
5866 							  BTRFS_READ_LOCK);
5867 			}
5868 			next_rw_lock = BTRFS_READ_LOCK;
5869 		}
5870 		break;
5871 	}
5872 	path->slots[level] = slot;
5873 	while (1) {
5874 		level--;
5875 		c = path->nodes[level];
5876 		if (path->locks[level])
5877 			btrfs_tree_unlock_rw(c, path->locks[level]);
5878 
5879 		free_extent_buffer(c);
5880 		path->nodes[level] = next;
5881 		path->slots[level] = 0;
5882 		if (!path->skip_locking)
5883 			path->locks[level] = next_rw_lock;
5884 		if (!level)
5885 			break;
5886 
5887 		ret = read_block_for_search(NULL, root, path, &next, level,
5888 					    0, &key, 0);
5889 		if (ret == -EAGAIN)
5890 			goto again;
5891 
5892 		if (ret < 0) {
5893 			btrfs_release_path(path);
5894 			goto done;
5895 		}
5896 
5897 		if (!path->skip_locking) {
5898 			ret = btrfs_try_tree_read_lock(next);
5899 			if (!ret) {
5900 				btrfs_set_path_blocking(path);
5901 				btrfs_tree_read_lock(next);
5902 				btrfs_clear_path_blocking(path, next,
5903 							  BTRFS_READ_LOCK);
5904 			}
5905 			next_rw_lock = BTRFS_READ_LOCK;
5906 		}
5907 	}
5908 	ret = 0;
5909 done:
5910 	unlock_up(path, 0, 1, 0, NULL);
5911 	path->leave_spinning = old_spinning;
5912 	if (!old_spinning)
5913 		btrfs_set_path_blocking(path);
5914 
5915 	return ret;
5916 }
5917 
5918 /*
5919  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5920  * searching until it gets past min_objectid or finds an item of 'type'
5921  *
5922  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5923  */
5924 int btrfs_previous_item(struct btrfs_root *root,
5925 			struct btrfs_path *path, u64 min_objectid,
5926 			int type)
5927 {
5928 	struct btrfs_key found_key;
5929 	struct extent_buffer *leaf;
5930 	u32 nritems;
5931 	int ret;
5932 
5933 	while (1) {
5934 		if (path->slots[0] == 0) {
5935 			btrfs_set_path_blocking(path);
5936 			ret = btrfs_prev_leaf(root, path);
5937 			if (ret != 0)
5938 				return ret;
5939 		} else {
5940 			path->slots[0]--;
5941 		}
5942 		leaf = path->nodes[0];
5943 		nritems = btrfs_header_nritems(leaf);
5944 		if (nritems == 0)
5945 			return 1;
5946 		if (path->slots[0] == nritems)
5947 			path->slots[0]--;
5948 
5949 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5950 		if (found_key.objectid < min_objectid)
5951 			break;
5952 		if (found_key.type == type)
5953 			return 0;
5954 		if (found_key.objectid == min_objectid &&
5955 		    found_key.type < type)
5956 			break;
5957 	}
5958 	return 1;
5959 }
5960 
5961 /*
5962  * search in extent tree to find a previous Metadata/Data extent item with
5963  * min objecitd.
5964  *
5965  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5966  */
5967 int btrfs_previous_extent_item(struct btrfs_root *root,
5968 			struct btrfs_path *path, u64 min_objectid)
5969 {
5970 	struct btrfs_key found_key;
5971 	struct extent_buffer *leaf;
5972 	u32 nritems;
5973 	int ret;
5974 
5975 	while (1) {
5976 		if (path->slots[0] == 0) {
5977 			btrfs_set_path_blocking(path);
5978 			ret = btrfs_prev_leaf(root, path);
5979 			if (ret != 0)
5980 				return ret;
5981 		} else {
5982 			path->slots[0]--;
5983 		}
5984 		leaf = path->nodes[0];
5985 		nritems = btrfs_header_nritems(leaf);
5986 		if (nritems == 0)
5987 			return 1;
5988 		if (path->slots[0] == nritems)
5989 			path->slots[0]--;
5990 
5991 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5992 		if (found_key.objectid < min_objectid)
5993 			break;
5994 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5995 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5996 			return 0;
5997 		if (found_key.objectid == min_objectid &&
5998 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5999 			break;
6000 	}
6001 	return 1;
6002 }
6003