xref: /openbmc/linux/fs/btrfs/ctree.c (revision b6bec26c)
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27 
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29 		      *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31 		      *root, struct btrfs_key *ins_key,
32 		      struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34 			  struct btrfs_root *root, struct extent_buffer *dst,
35 			  struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37 			      struct btrfs_root *root,
38 			      struct extent_buffer *dst_buf,
39 			      struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 		    struct btrfs_path *path, int level, int slot);
42 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43 				 struct extent_buffer *eb);
44 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
45 					  u32 blocksize, u64 parent_transid,
46 					  u64 time_seq);
47 struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
48 						u64 bytenr, u32 blocksize,
49 						u64 time_seq);
50 
51 struct btrfs_path *btrfs_alloc_path(void)
52 {
53 	struct btrfs_path *path;
54 	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
55 	return path;
56 }
57 
58 /*
59  * set all locked nodes in the path to blocking locks.  This should
60  * be done before scheduling
61  */
62 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
63 {
64 	int i;
65 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
66 		if (!p->nodes[i] || !p->locks[i])
67 			continue;
68 		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
69 		if (p->locks[i] == BTRFS_READ_LOCK)
70 			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
71 		else if (p->locks[i] == BTRFS_WRITE_LOCK)
72 			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
73 	}
74 }
75 
76 /*
77  * reset all the locked nodes in the path to spinning locks.
78  *
79  * held is used to keep lockdep happy: when lockdep is enabled,
80  * we set held to a blocking lock before we go around and
81  * retake all the spinlocks in the path.  You can safely pass
82  * NULL for held.
83  */
84 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
85 					struct extent_buffer *held, int held_rw)
86 {
87 	int i;
88 
89 #ifdef CONFIG_DEBUG_LOCK_ALLOC
90 	/* lockdep really cares that we take all of these spinlocks
91 	 * in the right order.  If any of the locks in the path are not
92 	 * currently blocking, it is going to complain.  So, make really
93 	 * really sure by forcing the path to blocking before we clear
94 	 * the path blocking.
95 	 */
96 	if (held) {
97 		btrfs_set_lock_blocking_rw(held, held_rw);
98 		if (held_rw == BTRFS_WRITE_LOCK)
99 			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
100 		else if (held_rw == BTRFS_READ_LOCK)
101 			held_rw = BTRFS_READ_LOCK_BLOCKING;
102 	}
103 	btrfs_set_path_blocking(p);
104 #endif
105 
106 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
107 		if (p->nodes[i] && p->locks[i]) {
108 			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
109 			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
110 				p->locks[i] = BTRFS_WRITE_LOCK;
111 			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
112 				p->locks[i] = BTRFS_READ_LOCK;
113 		}
114 	}
115 
116 #ifdef CONFIG_DEBUG_LOCK_ALLOC
117 	if (held)
118 		btrfs_clear_lock_blocking_rw(held, held_rw);
119 #endif
120 }
121 
122 /* this also releases the path */
123 void btrfs_free_path(struct btrfs_path *p)
124 {
125 	if (!p)
126 		return;
127 	btrfs_release_path(p);
128 	kmem_cache_free(btrfs_path_cachep, p);
129 }
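
/*
 * Illustrative usage sketch (not part of the original file): the typical
 * lifecycle of a path object.  btrfs_search_slot() is the usual tree search
 * entry point declared in ctree.h; 'root' and the key below are assumed to
 * come from the caller.
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key = { .objectid = 256,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	... on ret >= 0, inspect path->nodes[0] / path->slots[0] ...
 *	btrfs_free_path(path);	releases locks, buffer refs and the path
 */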
130 
131 /*
132  * path release drops references on the extent buffers in the path
133  * and it drops any locks held by this path
134  *
135  * It is safe to call this on paths that hold no locks or extent buffers.
136  */
137 noinline void btrfs_release_path(struct btrfs_path *p)
138 {
139 	int i;
140 
141 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
142 		p->slots[i] = 0;
143 		if (!p->nodes[i])
144 			continue;
145 		if (p->locks[i]) {
146 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
147 			p->locks[i] = 0;
148 		}
149 		free_extent_buffer(p->nodes[i]);
150 		p->nodes[i] = NULL;
151 	}
152 }
153 
154 /*
155  * safely gets a reference on the root node of a tree.  A lock
156  * is not taken, so a concurrent writer may put a different node
157  * at the root of the tree.  See btrfs_lock_root_node for the
158  * looping required.
159  *
160  * The extent buffer returned by this has a reference taken, so
161  * it won't disappear.  It may stop being the root of the tree
162  * at any time because there are no locks held.
163  */
164 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
165 {
166 	struct extent_buffer *eb;
167 
168 	while (1) {
169 		rcu_read_lock();
170 		eb = rcu_dereference(root->node);
171 
172 		/*
173 		 * RCU really hurts here: we could free up the root node because
174 		 * it was cow'ed, but we may not get the new root node yet, so do
175 		 * the inc_not_zero dance.  If that doesn't work, then
176 		 * synchronize_rcu and try again.
177 		 */
178 		if (atomic_inc_not_zero(&eb->refs)) {
179 			rcu_read_unlock();
180 			break;
181 		}
182 		rcu_read_unlock();
183 		synchronize_rcu();
184 	}
185 	return eb;
186 }
187 
188 /* loop around taking references on and locking the root node of the
189  * tree until you end up with a lock on the root.  A locked buffer
190  * is returned, with a reference held.
191  */
192 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
193 {
194 	struct extent_buffer *eb;
195 
196 	while (1) {
197 		eb = btrfs_root_node(root);
198 		btrfs_tree_lock(eb);
199 		if (eb == root->node)
200 			break;
201 		btrfs_tree_unlock(eb);
202 		free_extent_buffer(eb);
203 	}
204 	return eb;
205 }
206 
207 /* loop around taking references on and read locking the root node of
208  * the tree until you end up with a read lock on the root.  A read
209  * locked buffer is returned, with a reference held.
210  */
211 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
212 {
213 	struct extent_buffer *eb;
214 
215 	while (1) {
216 		eb = btrfs_root_node(root);
217 		btrfs_tree_read_lock(eb);
218 		if (eb == root->node)
219 			break;
220 		btrfs_tree_read_unlock(eb);
221 		free_extent_buffer(eb);
222 	}
223 	return eb;
224 }
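
/*
 * Illustrative note on the retry loops above: between taking the reference
 * in btrfs_root_node() and acquiring the lock, a concurrent writer may cow
 * the root and install a new node.  The eb == root->node check catches
 * that; we then drop the stale buffer and loop to lock the current root.
 */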
225 
226 /* cowonly roots (everything not a reference counted cow subvolume) just get
227  * put onto a simple dirty list.  transaction.c walks this list to make sure
228  * they get properly updated on disk.
229  */
230 static void add_root_to_dirty_list(struct btrfs_root *root)
231 {
232 	spin_lock(&root->fs_info->trans_lock);
233 	if (root->track_dirty && list_empty(&root->dirty_list)) {
234 		list_add(&root->dirty_list,
235 			 &root->fs_info->dirty_cowonly_roots);
236 	}
237 	spin_unlock(&root->fs_info->trans_lock);
238 }
239 
240 /*
241  * used by snapshot creation to make a copy of a root for a tree with
242  * a given objectid.  The buffer with the new root node is returned in
243  * cow_ret, and this function returns zero on success or a negative error code.
244  */
245 int btrfs_copy_root(struct btrfs_trans_handle *trans,
246 		      struct btrfs_root *root,
247 		      struct extent_buffer *buf,
248 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
249 {
250 	struct extent_buffer *cow;
251 	int ret = 0;
252 	int level;
253 	struct btrfs_disk_key disk_key;
254 
255 	WARN_ON(root->ref_cows && trans->transid !=
256 		root->fs_info->running_transaction->transid);
257 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
258 
259 	level = btrfs_header_level(buf);
260 	if (level == 0)
261 		btrfs_item_key(buf, &disk_key, 0);
262 	else
263 		btrfs_node_key(buf, &disk_key, 0);
264 
265 	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
266 				     new_root_objectid, &disk_key, level,
267 				     buf->start, 0);
268 	if (IS_ERR(cow))
269 		return PTR_ERR(cow);
270 
271 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
272 	btrfs_set_header_bytenr(cow, cow->start);
273 	btrfs_set_header_generation(cow, trans->transid);
274 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
275 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
276 				     BTRFS_HEADER_FLAG_RELOC);
277 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
278 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
279 	else
280 		btrfs_set_header_owner(cow, new_root_objectid);
281 
282 	write_extent_buffer(cow, root->fs_info->fsid,
283 			    (unsigned long)btrfs_header_fsid(cow),
284 			    BTRFS_FSID_SIZE);
285 
286 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
287 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
288 		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
289 	else
290 		ret = btrfs_inc_ref(trans, root, cow, 0, 1);
291 
292 	if (ret)
293 		return ret;
294 
295 	btrfs_mark_buffer_dirty(cow);
296 	*cow_ret = cow;
297 	return 0;
298 }
299 
300 enum mod_log_op {
301 	MOD_LOG_KEY_REPLACE,
302 	MOD_LOG_KEY_ADD,
303 	MOD_LOG_KEY_REMOVE,
304 	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
305 	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
306 	MOD_LOG_MOVE_KEYS,
307 	MOD_LOG_ROOT_REPLACE,
308 };
309 
310 struct tree_mod_move {
311 	int dst_slot;
312 	int nr_items;
313 };
314 
315 struct tree_mod_root {
316 	u64 logical;
317 	u8 level;
318 };
319 
320 struct tree_mod_elem {
321 	struct rb_node node;
322 	u64 index;		/* shifted logical */
323 	u64 seq;
324 	enum mod_log_op op;
325 
326 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
327 	int slot;
328 
329 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_ROOT_REPLACE */
330 	u64 generation;
331 
332 	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
333 	struct btrfs_disk_key key;
334 	u64 blockptr;
335 
336 	/* this is used for op == MOD_LOG_MOVE_KEYS */
337 	struct tree_mod_move move;
338 
339 	/* this is used for op == MOD_LOG_ROOT_REPLACE */
340 	struct tree_mod_root old_root;
341 };
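
/*
 * Example (illustrative): moving 3 key pointers from slot 5 to slot 2 in
 * the node at logical address L is recorded as a single element:
 *
 *	tm->index	= L >> PAGE_CACHE_SHIFT;
 *	tm->op		= MOD_LOG_MOVE_KEYS;
 *	tm->slot	= 5;			(the source slot)
 *	tm->move	= { .dst_slot = 2, .nr_items = 3 };
 *
 * plus one MOD_LOG_KEY_REMOVE_WHILE_MOVING element for every slot the
 * move overwrites; see tree_mod_log_insert_move() below.
 */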
342 
343 static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
344 {
345 	read_lock(&fs_info->tree_mod_log_lock);
346 }
347 
348 static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
349 {
350 	read_unlock(&fs_info->tree_mod_log_lock);
351 }
352 
353 static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
354 {
355 	write_lock(&fs_info->tree_mod_log_lock);
356 }
357 
358 static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
359 {
360 	write_unlock(&fs_info->tree_mod_log_lock);
361 }
362 
363 /*
364  * This adds a new blocker to the tree mod log's blocker list if the @elem
365  * passed does not already have a sequence number set. So when a caller expects
366  * to record tree modifications, it should set elem->seq to zero
367  * before calling btrfs_get_tree_mod_seq.
368  * Returns a fresh, unused tree mod log sequence number, even if no new
369  * blocker was added.
370  */
371 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
372 			   struct seq_list *elem)
373 {
374 	u64 seq;
375 
376 	tree_mod_log_write_lock(fs_info);
377 	spin_lock(&fs_info->tree_mod_seq_lock);
378 	if (!elem->seq) {
379 		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
380 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
381 	}
382 	seq = btrfs_inc_tree_mod_seq(fs_info);
383 	spin_unlock(&fs_info->tree_mod_seq_lock);
384 	tree_mod_log_write_unlock(fs_info);
385 
386 	return seq;
387 }
388 
389 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
390 			    struct seq_list *elem)
391 {
392 	struct rb_root *tm_root;
393 	struct rb_node *node;
394 	struct rb_node *next;
395 	struct seq_list *cur_elem;
396 	struct tree_mod_elem *tm;
397 	u64 min_seq = (u64)-1;
398 	u64 seq_putting = elem->seq;
399 
400 	if (!seq_putting)
401 		return;
402 
403 	spin_lock(&fs_info->tree_mod_seq_lock);
404 	list_del(&elem->list);
405 	elem->seq = 0;
406 
407 	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
408 		if (cur_elem->seq < min_seq) {
409 			if (seq_putting > cur_elem->seq) {
410 				/*
411 				 * blocker with lower sequence number exists, we
412 				 * cannot remove anything from the log
413 				 */
414 				spin_unlock(&fs_info->tree_mod_seq_lock);
415 				return;
416 			}
417 			min_seq = cur_elem->seq;
418 		}
419 	}
420 	spin_unlock(&fs_info->tree_mod_seq_lock);
421 
422 	/*
423 	 * anything that's lower than the lowest existing (read: blocked)
424 	 * sequence number can be removed from the tree.
425 	 */
426 	tree_mod_log_write_lock(fs_info);
427 	tm_root = &fs_info->tree_mod_log;
428 	for (node = rb_first(tm_root); node; node = next) {
429 		next = rb_next(node);
430 		tm = container_of(node, struct tree_mod_elem, node);
431 		if (tm->seq > min_seq)
432 			continue;
433 		rb_erase(node, tm_root);
434 		kfree(tm);
435 	}
436 	tree_mod_log_write_unlock(fs_info);
437 }
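
/*
 * Illustrative usage of the blocker API above (a sketch, not a caller in
 * this file):
 *
 *	struct seq_list elem = {};	(elem.seq must start out zero)
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk tree history at sequence number 'seq' ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While elem sits on tree_mod_seq_list, log elements newer than the oldest
 * blocker's sequence number are kept in the rb tree.
 */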
438 
439 /*
440  * key order of the log:
441  *       index -> sequence
442  *
443  * the index is the shifted logical of the *new* root node for root replace
444  * operations, or the shifted logical of the affected block for all other
445  * operations.
446  */
447 static noinline int
448 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
449 {
450 	struct rb_root *tm_root;
451 	struct rb_node **new;
452 	struct rb_node *parent = NULL;
453 	struct tree_mod_elem *cur;
454 
455 	BUG_ON(!tm || !tm->seq);
456 
457 	tm_root = &fs_info->tree_mod_log;
458 	new = &tm_root->rb_node;
459 	while (*new) {
460 		cur = container_of(*new, struct tree_mod_elem, node);
461 		parent = *new;
462 		if (cur->index < tm->index)
463 			new = &((*new)->rb_left);
464 		else if (cur->index > tm->index)
465 			new = &((*new)->rb_right);
466 		else if (cur->seq < tm->seq)
467 			new = &((*new)->rb_left);
468 		else if (cur->seq > tm->seq)
469 			new = &((*new)->rb_right);
470 		else {
471 			kfree(tm);
472 			return -EEXIST;
473 		}
474 	}
475 
476 	rb_link_node(&tm->node, parent, new);
477 	rb_insert_color(&tm->node, tm_root);
478 	return 0;
479 }
480 
481 /*
482  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
483  * returns zero with the tree_mod_log_lock acquired. The caller must hold
484  * the lock until all tree mod log insertions are recorded in the rb tree and
485  * then call tree_mod_log_write_unlock() to release it.
486  */
487 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
488 				    struct extent_buffer *eb) {
489 	smp_mb();
490 	if (list_empty(&fs_info->tree_mod_seq_list))
491 		return 1;
492 	if (eb && btrfs_header_level(eb) == 0)
493 		return 1;
494 
495 	tree_mod_log_write_lock(fs_info);
496 	if (list_empty(&fs_info->tree_mod_seq_list)) {
497 		/*
498 		 * someone emptied the list while we were waiting for the lock.
499 		 * we must not add to the list when no blocker exists.
500 		 */
501 		tree_mod_log_write_unlock(fs_info);
502 		return 1;
503 	}
504 
505 	return 0;
506 }
507 
508 /*
509  * This allocates memory and gets a tree modification sequence number.
510  *
511  * Returns <0 on error.
512  * Returns >0 (the added sequence number) on success.
513  */
514 static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
515 				 struct tree_mod_elem **tm_ret)
516 {
517 	struct tree_mod_elem *tm;
518 
519 	/*
520 	 * once we switch from spin locks to something different, we should
521 	 * honor the flags parameter here.
522 	 */
523 	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
524 	if (!tm)
525 		return -ENOMEM;
526 
527 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
528 	return tm->seq;
529 }
530 
531 static inline int
532 __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
533 			  struct extent_buffer *eb, int slot,
534 			  enum mod_log_op op, gfp_t flags)
535 {
536 	int ret;
537 	struct tree_mod_elem *tm;
538 
539 	ret = tree_mod_alloc(fs_info, flags, &tm);
540 	if (ret < 0)
541 		return ret;
542 
543 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
544 	if (op != MOD_LOG_KEY_ADD) {
545 		btrfs_node_key(eb, &tm->key, slot);
546 		tm->blockptr = btrfs_node_blockptr(eb, slot);
547 	}
548 	tm->op = op;
549 	tm->slot = slot;
550 	tm->generation = btrfs_node_ptr_generation(eb, slot);
551 
552 	return __tree_mod_log_insert(fs_info, tm);
553 }
554 
555 static noinline int
556 tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
557 			     struct extent_buffer *eb, int slot,
558 			     enum mod_log_op op, gfp_t flags)
559 {
560 	int ret;
561 
562 	if (tree_mod_dont_log(fs_info, eb))
563 		return 0;
564 
565 	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
566 
567 	tree_mod_log_write_unlock(fs_info);
568 	return ret;
569 }
570 
571 static noinline int
572 tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
573 			int slot, enum mod_log_op op)
574 {
575 	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
576 }
577 
578 static noinline int
579 tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
580 			     struct extent_buffer *eb, int slot,
581 			     enum mod_log_op op)
582 {
583 	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
584 }
585 
586 static noinline int
587 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
588 			 struct extent_buffer *eb, int dst_slot, int src_slot,
589 			 int nr_items, gfp_t flags)
590 {
591 	struct tree_mod_elem *tm;
592 	int ret;
593 	int i;
594 
595 	if (tree_mod_dont_log(fs_info, eb))
596 		return 0;
597 
598 	/*
599 	 * When we overwrite something during the move, we log these removals.
600 	 * This can only happen when we move towards the beginning of the
601 	 * buffer, i.e. dst_slot < src_slot.
602 	 */
603 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
604 		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
605 					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
606 		BUG_ON(ret < 0);
607 	}
608 
609 	ret = tree_mod_alloc(fs_info, flags, &tm);
610 	if (ret < 0)
611 		goto out;
612 
613 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
614 	tm->slot = src_slot;
615 	tm->move.dst_slot = dst_slot;
616 	tm->move.nr_items = nr_items;
617 	tm->op = MOD_LOG_MOVE_KEYS;
618 
619 	ret = __tree_mod_log_insert(fs_info, tm);
620 out:
621 	tree_mod_log_write_unlock(fs_info);
622 	return ret;
623 }
624 
625 static inline void
626 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
627 {
628 	int i;
629 	u32 nritems;
630 	int ret;
631 
632 	if (btrfs_header_level(eb) == 0)
633 		return;
634 
635 	nritems = btrfs_header_nritems(eb);
636 	for (i = nritems - 1; i >= 0; i--) {
637 		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
638 					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
639 		BUG_ON(ret < 0);
640 	}
641 }
642 
643 static noinline int
644 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
645 			 struct extent_buffer *old_root,
646 			 struct extent_buffer *new_root, gfp_t flags)
647 {
648 	struct tree_mod_elem *tm;
649 	int ret;
650 
651 	if (tree_mod_dont_log(fs_info, NULL))
652 		return 0;
653 
654 	ret = tree_mod_alloc(fs_info, flags, &tm);
655 	if (ret < 0)
656 		goto out;
657 
658 	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
659 	tm->old_root.logical = old_root->start;
660 	tm->old_root.level = btrfs_header_level(old_root);
661 	tm->generation = btrfs_header_generation(old_root);
662 	tm->op = MOD_LOG_ROOT_REPLACE;
663 
664 	ret = __tree_mod_log_insert(fs_info, tm);
665 out:
666 	tree_mod_log_write_unlock(fs_info);
667 	return ret;
668 }
669 
670 static struct tree_mod_elem *
671 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
672 		      int smallest)
673 {
674 	struct rb_root *tm_root;
675 	struct rb_node *node;
676 	struct tree_mod_elem *cur = NULL;
677 	struct tree_mod_elem *found = NULL;
678 	u64 index = start >> PAGE_CACHE_SHIFT;
679 
680 	tree_mod_log_read_lock(fs_info);
681 	tm_root = &fs_info->tree_mod_log;
682 	node = tm_root->rb_node;
683 	while (node) {
684 		cur = container_of(node, struct tree_mod_elem, node);
685 		if (cur->index < index) {
686 			node = node->rb_left;
687 		} else if (cur->index > index) {
688 			node = node->rb_right;
689 		} else if (cur->seq < min_seq) {
690 			node = node->rb_left;
691 		} else if (!smallest) {
692 			/* we want the node with the highest seq */
693 			if (found)
694 				BUG_ON(found->seq > cur->seq);
695 			found = cur;
696 			node = node->rb_left;
697 		} else if (cur->seq > min_seq) {
698 			/* we want the node with the smallest seq */
699 			if (found)
700 				BUG_ON(found->seq < cur->seq);
701 			found = cur;
702 			node = node->rb_right;
703 		} else {
704 			found = cur;
705 			break;
706 		}
707 	}
708 	tree_mod_log_read_unlock(fs_info);
709 
710 	return found;
711 }
712 
713 /*
714  * this returns the element from the log with the smallest time sequence
715  * value that's in the log (the oldest log item). any element with a time
716  * sequence lower than min_seq will be ignored.
717  */
718 static struct tree_mod_elem *
719 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
720 			   u64 min_seq)
721 {
722 	return __tree_mod_log_search(fs_info, start, min_seq, 1);
723 }
724 
725 /*
726  * this returns the element from the log with the largest time sequence
727  * value that's in the log (the most recent log item). any element with
728  * a time sequence lower than min_seq will be ignored.
729  */
730 static struct tree_mod_elem *
731 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
732 {
733 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
734 }
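
/*
 * Worked example (illustrative): if the log holds elements with sequence
 * numbers {4, 7, 9} for one block and min_seq is 5, then
 * tree_mod_log_search_oldest() returns the element with seq 7 (the
 * smallest seq not below min_seq) and tree_mod_log_search() returns the
 * one with seq 9 (the largest seq).
 */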
735 
736 static noinline void
737 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
738 		     struct extent_buffer *src, unsigned long dst_offset,
739 		     unsigned long src_offset, int nr_items)
740 {
741 	int ret;
742 	int i;
743 
744 	if (tree_mod_dont_log(fs_info, NULL))
745 		return;
746 
747 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
748 		tree_mod_log_write_unlock(fs_info);
749 		return;
750 	}
751 
752 	for (i = 0; i < nr_items; i++) {
753 		ret = tree_mod_log_insert_key_locked(fs_info, src,
754 						     i + src_offset,
755 						     MOD_LOG_KEY_REMOVE);
756 		BUG_ON(ret < 0);
757 		ret = tree_mod_log_insert_key_locked(fs_info, dst,
758 						     i + dst_offset,
759 						     MOD_LOG_KEY_ADD);
760 		BUG_ON(ret < 0);
761 	}
762 
763 	tree_mod_log_write_unlock(fs_info);
764 }
765 
766 static inline void
767 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
768 		     int dst_offset, int src_offset, int nr_items)
769 {
770 	int ret;
771 	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
772 				       nr_items, GFP_NOFS);
773 	BUG_ON(ret < 0);
774 }
775 
776 static noinline void
777 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
778 			  struct extent_buffer *eb, int slot, int atomic)
779 {
780 	int ret;
781 
782 	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
783 					   MOD_LOG_KEY_REPLACE,
784 					   atomic ? GFP_ATOMIC : GFP_NOFS);
785 	BUG_ON(ret < 0);
786 }
787 
788 static noinline void
789 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
790 {
791 	if (tree_mod_dont_log(fs_info, eb))
792 		return;
793 
794 	__tree_mod_log_free_eb(fs_info, eb);
795 
796 	tree_mod_log_write_unlock(fs_info);
797 }
798 
799 static noinline void
800 tree_mod_log_set_root_pointer(struct btrfs_root *root,
801 			      struct extent_buffer *new_root_node)
802 {
803 	int ret;
804 	ret = tree_mod_log_insert_root(root->fs_info, root->node,
805 				       new_root_node, GFP_NOFS);
806 	BUG_ON(ret < 0);
807 }
808 
809 /*
810  * check if the tree block can be shared by multiple trees
811  */
812 int btrfs_block_can_be_shared(struct btrfs_root *root,
813 			      struct extent_buffer *buf)
814 {
815 	/*
816 	 * Tree blocks not in reference counted trees and tree roots
817 	 * are never shared. If a block was allocated after the last
818 	 * snapshot and the block was not allocated by tree relocation,
819 	 * we know the block is not shared.
820 	 */
821 	if (root->ref_cows &&
822 	    buf != root->node && buf != root->commit_root &&
823 	    (btrfs_header_generation(buf) <=
824 	     btrfs_root_last_snapshot(&root->root_item) ||
825 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
826 		return 1;
827 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
828 	if (root->ref_cows &&
829 	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
830 		return 1;
831 #endif
832 	return 0;
833 }
834 
835 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
836 				       struct btrfs_root *root,
837 				       struct extent_buffer *buf,
838 				       struct extent_buffer *cow,
839 				       int *last_ref)
840 {
841 	u64 refs;
842 	u64 owner;
843 	u64 flags;
844 	u64 new_flags = 0;
845 	int ret;
846 
847 	/*
848 	 * Backrefs update rules:
849 	 *
850 	 * Always use full backrefs for extent pointers in tree blocks
851 	 * allocated by tree relocation.
852 	 *
853 	 * If a shared tree block is no longer referenced by its owner
854 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
855 	 * use full backrefs for extent pointers in the tree block.
856 	 *
857 	 * If a tree block is being relocated
858 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
859 	 * use full backrefs for extent pointers in the tree block.
860 	 * The reason for this is that some operations (such as dropping
861 	 * a tree) are only allowed for blocks that use full backrefs.
862 	 */
863 
864 	if (btrfs_block_can_be_shared(root, buf)) {
865 		ret = btrfs_lookup_extent_info(trans, root, buf->start,
866 					       buf->len, &refs, &flags);
867 		if (ret)
868 			return ret;
869 		if (refs == 0) {
870 			ret = -EROFS;
871 			btrfs_std_error(root->fs_info, ret);
872 			return ret;
873 		}
874 	} else {
875 		refs = 1;
876 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
877 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
878 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
879 		else
880 			flags = 0;
881 	}
882 
883 	owner = btrfs_header_owner(buf);
884 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
885 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
886 
887 	if (refs > 1) {
888 		if ((owner == root->root_key.objectid ||
889 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
890 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
891 			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
892 			BUG_ON(ret); /* -ENOMEM */
893 
894 			if (root->root_key.objectid ==
895 			    BTRFS_TREE_RELOC_OBJECTID) {
896 				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
897 				BUG_ON(ret); /* -ENOMEM */
898 				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
899 				BUG_ON(ret); /* -ENOMEM */
900 			}
901 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
902 		} else {
903 
904 			if (root->root_key.objectid ==
905 			    BTRFS_TREE_RELOC_OBJECTID)
906 				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
907 			else
908 				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
909 			BUG_ON(ret); /* -ENOMEM */
910 		}
911 		if (new_flags != 0) {
912 			ret = btrfs_set_disk_extent_flags(trans, root,
913 							  buf->start,
914 							  buf->len,
915 							  new_flags, 0);
916 			if (ret)
917 				return ret;
918 		}
919 	} else {
920 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
921 			if (root->root_key.objectid ==
922 			    BTRFS_TREE_RELOC_OBJECTID)
923 				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
924 			else
925 				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
926 			BUG_ON(ret); /* -ENOMEM */
927 			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
928 			BUG_ON(ret); /* -ENOMEM */
929 		}
930 		tree_mod_log_free_eb(root->fs_info, buf);
931 		clean_tree_block(trans, root, buf);
932 		*last_ref = 1;
933 	}
934 	return 0;
935 }
936 
937 /*
938  * does the dirty work of cowing a single block.  The parent block (if
939  * supplied) is updated to point to the new cow copy.  The new buffer is marked
940  * dirty and returned locked.  If you modify the block it needs to be marked
941  * dirty again.
942  *
943  * search_start -- an allocation hint for the new block
944  *
945  * empty_size -- a hint that you plan on doing more cow.  This is the size in
946  * bytes the allocator should try to find free next to the block it returns.
947  * This is just a hint and may be ignored by the allocator.
948  */
949 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
950 			     struct btrfs_root *root,
951 			     struct extent_buffer *buf,
952 			     struct extent_buffer *parent, int parent_slot,
953 			     struct extent_buffer **cow_ret,
954 			     u64 search_start, u64 empty_size)
955 {
956 	struct btrfs_disk_key disk_key;
957 	struct extent_buffer *cow;
958 	int level, ret;
959 	int last_ref = 0;
960 	int unlock_orig = 0;
961 	u64 parent_start;
962 
963 	if (*cow_ret == buf)
964 		unlock_orig = 1;
965 
966 	btrfs_assert_tree_locked(buf);
967 
968 	WARN_ON(root->ref_cows && trans->transid !=
969 		root->fs_info->running_transaction->transid);
970 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
971 
972 	level = btrfs_header_level(buf);
973 
974 	if (level == 0)
975 		btrfs_item_key(buf, &disk_key, 0);
976 	else
977 		btrfs_node_key(buf, &disk_key, 0);
978 
979 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
980 		if (parent)
981 			parent_start = parent->start;
982 		else
983 			parent_start = 0;
984 	} else
985 		parent_start = 0;
986 
987 	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
988 				     root->root_key.objectid, &disk_key,
989 				     level, search_start, empty_size);
990 	if (IS_ERR(cow))
991 		return PTR_ERR(cow);
992 
993 	/* cow is set to blocking by btrfs_init_new_buffer */
994 
995 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
996 	btrfs_set_header_bytenr(cow, cow->start);
997 	btrfs_set_header_generation(cow, trans->transid);
998 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
999 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1000 				     BTRFS_HEADER_FLAG_RELOC);
1001 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1002 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1003 	else
1004 		btrfs_set_header_owner(cow, root->root_key.objectid);
1005 
1006 	write_extent_buffer(cow, root->fs_info->fsid,
1007 			    (unsigned long)btrfs_header_fsid(cow),
1008 			    BTRFS_FSID_SIZE);
1009 
1010 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1011 	if (ret) {
1012 		btrfs_abort_transaction(trans, root, ret);
1013 		return ret;
1014 	}
1015 
1016 	if (root->ref_cows)
1017 		btrfs_reloc_cow_block(trans, root, buf, cow);
1018 
1019 	if (buf == root->node) {
1020 		WARN_ON(parent && parent != buf);
1021 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1022 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1023 			parent_start = buf->start;
1024 		else
1025 			parent_start = 0;
1026 
1027 		extent_buffer_get(cow);
1028 		tree_mod_log_set_root_pointer(root, cow);
1029 		rcu_assign_pointer(root->node, cow);
1030 
1031 		btrfs_free_tree_block(trans, root, buf, parent_start,
1032 				      last_ref);
1033 		free_extent_buffer(buf);
1034 		add_root_to_dirty_list(root);
1035 	} else {
1036 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1037 			parent_start = parent->start;
1038 		else
1039 			parent_start = 0;
1040 
1041 		WARN_ON(trans->transid != btrfs_header_generation(parent));
1042 		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1043 					MOD_LOG_KEY_REPLACE);
1044 		btrfs_set_node_blockptr(parent, parent_slot,
1045 					cow->start);
1046 		btrfs_set_node_ptr_generation(parent, parent_slot,
1047 					      trans->transid);
1048 		btrfs_mark_buffer_dirty(parent);
1049 		btrfs_free_tree_block(trans, root, buf, parent_start,
1050 				      last_ref);
1051 	}
1052 	if (unlock_orig)
1053 		btrfs_tree_unlock(buf);
1054 	free_extent_buffer_stale(buf);
1055 	btrfs_mark_buffer_dirty(cow);
1056 	*cow_ret = cow;
1057 	return 0;
1058 }
1059 
1060 /*
1061  * returns the tree mod log element describing the oldest predecessor of the
1062  * given root. entries older than time_seq are ignored.
1063  */
1064 static struct tree_mod_elem *
1065 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1066 			   struct btrfs_root *root, u64 time_seq)
1067 {
1068 	struct tree_mod_elem *tm;
1069 	struct tree_mod_elem *found = NULL;
1070 	u64 root_logical = root->node->start;
1071 	int looped = 0;
1072 
1073 	if (!time_seq)
1074 		return NULL;
1075 
1076 	/*
1077 	 * the very last operation that's logged for a root is the replacement
1078 	 * operation (if it is replaced at all). this has the index of the *new*
1079 	 * root, making it the very first operation that's logged for this root.
1080 	 */
1081 	while (1) {
1082 		tm = tree_mod_log_search_oldest(fs_info, root_logical,
1083 						time_seq);
1084 		if (!looped && !tm)
1085 			return NULL;
1086 		/*
1087 		 * if there are no tree operations for the oldest root, we simply
1088 		 * return it. this should only happen if that (old) root is at
1089 		 * level 0.
1090 		 */
1091 		if (!tm)
1092 			break;
1093 
1094 		/*
1095 		 * if there's an operation that's not a root replacement, we
1096 		 * found the oldest version of our root. normally, we'll find a
1097 		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1098 		 */
1099 		if (tm->op != MOD_LOG_ROOT_REPLACE)
1100 			break;
1101 
1102 		found = tm;
1103 		root_logical = tm->old_root.logical;
1104 		BUG_ON(root_logical == root->node->start);
1105 		looped = 1;
1106 	}
1107 
1108 	/* if there's no old root to return, return what we found instead */
1109 	if (!found)
1110 		found = tm;
1111 
1112 	return found;
1113 }
1114 
1115 /*
1116  * tm is a pointer to the first operation to rewind within eb. then, all
1117  * previous operations will be rewound (until we reach something older than
1118  * time_seq).
1119  */
1120 static void
1121 __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1122 		      struct tree_mod_elem *first_tm)
1123 {
1124 	u32 n;
1125 	struct rb_node *next;
1126 	struct tree_mod_elem *tm = first_tm;
1127 	unsigned long o_dst;
1128 	unsigned long o_src;
1129 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
1130 
1131 	n = btrfs_header_nritems(eb);
1132 	while (tm && tm->seq >= time_seq) {
1133 		/*
1134 		 * all the operations are recorded with the operator used for
1135 		 * the modification. as we're going backwards, we do the
1136 		 * opposite of each operation here.
1137 		 */
1138 		switch (tm->op) {
1139 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1140 			BUG_ON(tm->slot < n); /* fall through to restore the key */
1141 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1142 		case MOD_LOG_KEY_REMOVE:
1143 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1144 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1145 			btrfs_set_node_ptr_generation(eb, tm->slot,
1146 						      tm->generation);
1147 			n++;
1148 			break;
1149 		case MOD_LOG_KEY_REPLACE:
1150 			BUG_ON(tm->slot >= n);
1151 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1152 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1153 			btrfs_set_node_ptr_generation(eb, tm->slot,
1154 						      tm->generation);
1155 			break;
1156 		case MOD_LOG_KEY_ADD:
1157 			/* if a move operation is needed it's in the log */
1158 			n--;
1159 			break;
1160 		case MOD_LOG_MOVE_KEYS:
1161 			o_dst = btrfs_node_key_ptr_offset(tm->slot);
1162 			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1163 			memmove_extent_buffer(eb, o_dst, o_src,
1164 					      tm->move.nr_items * p_size);
1165 			break;
1166 		case MOD_LOG_ROOT_REPLACE:
1167 			/*
1168 			 * this operation is special. for roots, this must be
1169 			 * handled explicitly before rewinding.
1170 			 * for non-roots, this operation may exist if the node
1171 			 * was a root: root A -> child B; then A gets empty and
1172 			 * B is promoted to the new root. in the mod log, we'll
1173 			 * have a root-replace operation for B, a tree block
1174 			 * that is not a root. we simply ignore that operation.
1175 			 */
1176 			break;
1177 		}
1178 		next = rb_next(&tm->node);
1179 		if (!next)
1180 			break;
1181 		tm = container_of(next, struct tree_mod_elem, node);
1182 		if (tm->index != first_tm->index)
1183 			break;
1184 	}
1185 	btrfs_set_header_nritems(eb, n);
1186 }
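
/*
 * Rewind cheat sheet (illustrative): walking backwards in time, each
 * logged operation is undone by its inverse:
 *
 *	MOD_LOG_KEY_REMOVE*	->	put the saved key/blockptr back, n++
 *	MOD_LOG_KEY_REPLACE	->	restore the saved key/blockptr
 *	MOD_LOG_KEY_ADD		->	n-- (forget the added slot)
 *	MOD_LOG_MOVE_KEYS	->	memmove the items back to the source
 */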
1187 
1188 static struct extent_buffer *
1189 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1190 		    u64 time_seq)
1191 {
1192 	struct extent_buffer *eb_rewin;
1193 	struct tree_mod_elem *tm;
1194 
1195 	if (!time_seq)
1196 		return eb;
1197 
1198 	if (btrfs_header_level(eb) == 0)
1199 		return eb;
1200 
1201 	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1202 	if (!tm)
1203 		return eb;
1204 
1205 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1206 		BUG_ON(tm->slot != 0);
1207 		eb_rewin = alloc_dummy_extent_buffer(eb->start,
1208 						fs_info->tree_root->nodesize);
1209 		BUG_ON(!eb_rewin);
1210 		btrfs_set_header_bytenr(eb_rewin, eb->start);
1211 		btrfs_set_header_backref_rev(eb_rewin,
1212 					     btrfs_header_backref_rev(eb));
1213 		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1214 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1215 	} else {
1216 		eb_rewin = btrfs_clone_extent_buffer(eb);
1217 		BUG_ON(!eb_rewin);
1218 	}
1219 
1220 	extent_buffer_get(eb_rewin);
1221 	free_extent_buffer(eb);
1222 
1223 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
1224 	WARN_ON(btrfs_header_nritems(eb_rewin) >
1225 		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
1226 
1227 	return eb_rewin;
1228 }
1229 
1230 /*
1231  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1232  * value. If there are no changes, the current root->node is returned. If
1233  * anything changed in between, there's a fresh buffer allocated on which the
1234  * rewind operations are done. In any case, the returned buffer is read locked.
1235  * Returns NULL on error (with no locks held).
1236  */
1237 static inline struct extent_buffer *
1238 get_old_root(struct btrfs_root *root, u64 time_seq)
1239 {
1240 	struct tree_mod_elem *tm;
1241 	struct extent_buffer *eb;
1242 	struct extent_buffer *old;
1243 	struct tree_mod_root *old_root = NULL;
1244 	u64 old_generation = 0;
1245 	u64 logical;
1246 	u32 blocksize;
1247 
1248 	eb = btrfs_read_lock_root_node(root);
1249 	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1250 	if (!tm)
1251 		return root->node;
1252 
1253 	if (tm->op == MOD_LOG_ROOT_REPLACE) {
1254 		old_root = &tm->old_root;
1255 		old_generation = tm->generation;
1256 		logical = old_root->logical;
1257 	} else {
1258 		logical = root->node->start;
1259 	}
1260 
1261 	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1262 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1263 		btrfs_tree_read_unlock(root->node);
1264 		free_extent_buffer(root->node);
1265 		blocksize = btrfs_level_size(root, old_root->level);
1266 		old = read_tree_block(root, logical, blocksize, 0);
1267 		if (!old) {
1268 			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
1269 				logical);
1270 			WARN_ON(1);
1271 		} else {
1272 			eb = btrfs_clone_extent_buffer(old);
1273 			free_extent_buffer(old);
1274 		}
1275 	} else if (old_root) {
1276 		btrfs_tree_read_unlock(root->node);
1277 		free_extent_buffer(root->node);
1278 		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1279 	} else {
1280 		eb = btrfs_clone_extent_buffer(root->node);
1281 		btrfs_tree_read_unlock(root->node);
1282 		free_extent_buffer(root->node);
1283 	}
1284 
1285 	if (!eb)
1286 		return NULL;
1287 	extent_buffer_get(eb);
1288 	btrfs_tree_read_lock(eb);
1289 	if (old_root) {
1290 		btrfs_set_header_bytenr(eb, eb->start);
1291 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1292 		btrfs_set_header_owner(eb, root->root_key.objectid);
1293 		btrfs_set_header_level(eb, old_root->level);
1294 		btrfs_set_header_generation(eb, old_generation);
1295 	}
1296 	if (tm)
1297 		__tree_mod_log_rewind(eb, time_seq, tm);
1298 	else
1299 		WARN_ON(btrfs_header_level(eb) != 0);
1300 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1301 
1302 	return eb;
1303 }
1304 
1305 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1306 {
1307 	struct tree_mod_elem *tm;
1308 	int level;
1309 
1310 	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1311 	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1312 		level = tm->old_root.level;
1313 	} else {
1314 		rcu_read_lock();
1315 		level = btrfs_header_level(root->node);
1316 		rcu_read_unlock();
1317 	}
1318 
1319 	return level;
1320 }
1321 
1322 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1323 				   struct btrfs_root *root,
1324 				   struct extent_buffer *buf)
1325 {
1326 	/* ensure we can see the force_cow */
1327 	smp_rmb();
1328 
1329 	/*
1330 	 * We do not need to cow a block if
1331 	 * 1) this block is not created or changed in this transaction;
1332 	 * 2) this block does not belong to TREE_RELOC tree;
1333 	 * 3) the root is not forced COW.
1334 	 *
1335 	 * What is forced COW:
1336 	 *    when we create a snapshot during committing the transaction,
1337 	 *    after we've finished copying the src root, we must COW the shared
1338 	 *    block to ensure the metadata consistency.
1339 	 */
1340 	if (btrfs_header_generation(buf) == trans->transid &&
1341 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1342 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1343 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1344 	    !root->force_cow)
1345 		return 0;
1346 	return 1;
1347 }
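
/*
 * Illustrative example: right after btrfs_cow_block() below cows a block,
 * btrfs_header_generation(buf) == trans->transid and the WRITTEN flag is
 * still clear, so should_cow_block() returns 0 and later modifications in
 * the same transaction can reuse the buffer in place.
 */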
1348 
1349 /*
1350  * cows a single block, see __btrfs_cow_block for the real work.
1351  * This version of it has extra checks so that a block isn't cow'd more than
1352  * once per transaction, as long as it hasn't been written yet
1353  */
1354 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1355 		    struct btrfs_root *root, struct extent_buffer *buf,
1356 		    struct extent_buffer *parent, int parent_slot,
1357 		    struct extent_buffer **cow_ret)
1358 {
1359 	u64 search_start;
1360 	int ret;
1361 
1362 	if (trans->transaction != root->fs_info->running_transaction)
1363 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1364 		       (unsigned long long)trans->transid,
1365 		       (unsigned long long)
1366 		       root->fs_info->running_transaction->transid);
1367 
1368 	if (trans->transid != root->fs_info->generation)
1369 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1370 		       (unsigned long long)trans->transid,
1371 		       (unsigned long long)root->fs_info->generation);
1372 
1373 	if (!should_cow_block(trans, root, buf)) {
1374 		*cow_ret = buf;
1375 		return 0;
1376 	}
1377 
1378 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1379 
1380 	if (parent)
1381 		btrfs_set_lock_blocking(parent);
1382 	btrfs_set_lock_blocking(buf);
1383 
1384 	ret = __btrfs_cow_block(trans, root, buf, parent,
1385 				 parent_slot, cow_ret, search_start, 0);
1386 
1387 	trace_btrfs_cow_block(root, buf, *cow_ret);
1388 
1389 	return ret;
1390 }
1391 
1392 /*
1393  * helper function for defrag to decide if two blocks pointed to by a
1394  * node are actually close by
1395  */
1396 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1397 {
1398 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1399 		return 1;
1400 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1401 		return 1;
1402 	return 0;
1403 }
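
/*
 * Worked example (illustrative, 16K blocksize): blocknr = 102400 and
 * other = 143360 gives other - (blocknr + blocksize) = 24576 < 32768, so
 * the blocks count as close; at other = 163840 the gap is 45056 and they
 * do not.
 */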
1404 
1405 /*
1406  * compare two keys in a memcmp fashion
1407  */
1408 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1409 {
1410 	struct btrfs_key k1;
1411 
1412 	btrfs_disk_key_to_cpu(&k1, disk);
1413 
1414 	return btrfs_comp_cpu_keys(&k1, k2);
1415 }
1416 
1417 /*
1418  * same as comp_keys only with two btrfs_key's
1419  */
1420 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1421 {
1422 	if (k1->objectid > k2->objectid)
1423 		return 1;
1424 	if (k1->objectid < k2->objectid)
1425 		return -1;
1426 	if (k1->type > k2->type)
1427 		return 1;
1428 	if (k1->type < k2->type)
1429 		return -1;
1430 	if (k1->offset > k2->offset)
1431 		return 1;
1432 	if (k1->offset < k2->offset)
1433 		return -1;
1434 	return 0;
1435 }
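
/*
 * Example (illustrative): keys sort by objectid first, then type, then
 * offset, so (256, BTRFS_INODE_ITEM_KEY, 0) sorts before
 * (256, BTRFS_INODE_REF_KEY, 0) regardless of offset, and both sort
 * before any key with objectid 257.
 */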
1436 
1437 /*
1438  * this is used by the defrag code to go through all the
1439  * leaves pointed to by a node and reallocate them so that
1440  * disk order is close to key order
1441  */
1442 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1443 		       struct btrfs_root *root, struct extent_buffer *parent,
1444 		       int start_slot, int cache_only, u64 *last_ret,
1445 		       struct btrfs_key *progress)
1446 {
1447 	struct extent_buffer *cur;
1448 	u64 blocknr;
1449 	u64 gen;
1450 	u64 search_start = *last_ret;
1451 	u64 last_block = 0;
1452 	u64 other;
1453 	u32 parent_nritems;
1454 	int end_slot;
1455 	int i;
1456 	int err = 0;
1457 	int parent_level;
1458 	int uptodate;
1459 	u32 blocksize;
1460 	int progress_passed = 0;
1461 	struct btrfs_disk_key disk_key;
1462 
1463 	parent_level = btrfs_header_level(parent);
1464 	if (cache_only && parent_level != 1)
1465 		return 0;
1466 
1467 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
1468 	WARN_ON(trans->transid != root->fs_info->generation);
1469 
1470 	parent_nritems = btrfs_header_nritems(parent);
1471 	blocksize = btrfs_level_size(root, parent_level - 1);
1472 	end_slot = parent_nritems;
1473 
1474 	if (parent_nritems == 1)
1475 		return 0;
1476 
1477 	btrfs_set_lock_blocking(parent);
1478 
1479 	for (i = start_slot; i < end_slot; i++) {
1480 		int close = 1;
1481 
1482 		btrfs_node_key(parent, &disk_key, i);
1483 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1484 			continue;
1485 
1486 		progress_passed = 1;
1487 		blocknr = btrfs_node_blockptr(parent, i);
1488 		gen = btrfs_node_ptr_generation(parent, i);
1489 		if (last_block == 0)
1490 			last_block = blocknr;
1491 
1492 		if (i > 0) {
1493 			other = btrfs_node_blockptr(parent, i - 1);
1494 			close = close_blocks(blocknr, other, blocksize);
1495 		}
1496 		if (!close && i < end_slot - 2) {
1497 			other = btrfs_node_blockptr(parent, i + 1);
1498 			close = close_blocks(blocknr, other, blocksize);
1499 		}
1500 		if (close) {
1501 			last_block = blocknr;
1502 			continue;
1503 		}
1504 
1505 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
1506 		if (cur)
1507 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1508 		else
1509 			uptodate = 0;
1510 		if (!cur || !uptodate) {
1511 			if (cache_only) {
1512 				free_extent_buffer(cur);
1513 				continue;
1514 			}
1515 			if (!cur) {
1516 				cur = read_tree_block(root, blocknr,
1517 							 blocksize, gen);
1518 				if (!cur)
1519 					return -EIO;
1520 			} else if (!uptodate) {
1521 				err = btrfs_read_buffer(cur, gen);
1522 				if (err) {
1523 					free_extent_buffer(cur);
1524 					return err;
1525 				}
1526 			}
1527 		}
1528 		if (search_start == 0)
1529 			search_start = last_block;
1530 
1531 		btrfs_tree_lock(cur);
1532 		btrfs_set_lock_blocking(cur);
1533 		err = __btrfs_cow_block(trans, root, cur, parent, i,
1534 					&cur, search_start,
1535 					min(16 * blocksize,
1536 					    (end_slot - i) * blocksize));
1537 		if (err) {
1538 			btrfs_tree_unlock(cur);
1539 			free_extent_buffer(cur);
1540 			break;
1541 		}
1542 		search_start = cur->start;
1543 		last_block = cur->start;
1544 		*last_ret = search_start;
1545 		btrfs_tree_unlock(cur);
1546 		free_extent_buffer(cur);
1547 	}
1548 	return err;
1549 }
1550 
1551 /*
1552  * The leaf data grows from end-to-front in the node.
1553  * this returns the address of the start of the last item,
1554  * which is the stop of the leaf data stack
1555  */
1556 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1557 					 struct extent_buffer *leaf)
1558 {
1559 	u32 nr = btrfs_header_nritems(leaf);
1560 	if (nr == 0)
1561 		return BTRFS_LEAF_DATA_SIZE(root);
1562 	return btrfs_item_offset_nr(leaf, nr - 1);
1563 }
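
/*
 * Leaf layout sketch (illustrative): item headers grow from the front of
 * the block while item data grows backwards from the end, so the data of
 * the last item marks the end of the used data space:
 *
 *	[header][item 0][item 1][ ... free space ... ][data 1][data 0]
 *	                                              ^
 *	                                              leaf_data_end()
 */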
1564 
1565 
1566 /*
1567  * search for key in the extent_buffer.  The items start at offset p,
1568  * and they are item_size apart.  There are 'max' items in p.
1569  *
1570  * the slot in the array is returned via slot, and it points to
1571  * the place where you would insert key if it is not found in
1572  * the array.
1573  *
1574  * slot may point to max if the key is bigger than all of the keys
1575  */
1576 static noinline int generic_bin_search(struct extent_buffer *eb,
1577 				       unsigned long p,
1578 				       int item_size, struct btrfs_key *key,
1579 				       int max, int *slot)
1580 {
1581 	int low = 0;
1582 	int high = max;
1583 	int mid;
1584 	int ret;
1585 	struct btrfs_disk_key *tmp = NULL;
1586 	struct btrfs_disk_key unaligned;
1587 	unsigned long offset;
1588 	char *kaddr = NULL;
1589 	unsigned long map_start = 0;
1590 	unsigned long map_len = 0;
1591 	int err;
1592 
1593 	while (low < high) {
1594 		mid = (low + high) / 2;
1595 		offset = p + mid * item_size;
1596 
1597 		if (!kaddr || offset < map_start ||
1598 		    (offset + sizeof(struct btrfs_disk_key)) >
1599 		    map_start + map_len) {
1600 
1601 			err = map_private_extent_buffer(eb, offset,
1602 						sizeof(struct btrfs_disk_key),
1603 						&kaddr, &map_start, &map_len);
1604 
1605 			if (!err) {
1606 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
1607 							map_start);
1608 			} else {
1609 				read_extent_buffer(eb, &unaligned,
1610 						   offset, sizeof(unaligned));
1611 				tmp = &unaligned;
1612 			}
1613 
1614 		} else {
1615 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
1616 							map_start);
1617 		}
1618 		ret = comp_keys(tmp, key);
1619 
1620 		if (ret < 0)
1621 			low = mid + 1;
1622 		else if (ret > 0)
1623 			high = mid;
1624 		else {
1625 			*slot = mid;
1626 			return 0;
1627 		}
1628 	}
1629 	*slot = low;
1630 	return 1;
1631 }
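
/*
 * Worked example (illustrative): binary searching items with keys
 * {2, 5, 9} for key 5 returns 0 with *slot = 1; searching for key 6
 * returns 1 with *slot = 2, the position at which 6 would have to be
 * inserted to keep the array sorted.
 */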
1632 
1633 /*
1634  * simple bin_search frontend that does the right thing for
1635  * leaves vs nodes
1636  */
1637 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1638 		      int level, int *slot)
1639 {
1640 	if (level == 0)
1641 		return generic_bin_search(eb,
1642 					  offsetof(struct btrfs_leaf, items),
1643 					  sizeof(struct btrfs_item),
1644 					  key, btrfs_header_nritems(eb),
1645 					  slot);
1646 	else
1647 		return generic_bin_search(eb,
1648 					  offsetof(struct btrfs_node, ptrs),
1649 					  sizeof(struct btrfs_key_ptr),
1650 					  key, btrfs_header_nritems(eb),
1651 					  slot);
1652 }
1653 
1654 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1655 		     int level, int *slot)
1656 {
1657 	return bin_search(eb, key, level, slot);
1658 }
1659 
1660 static void root_add_used(struct btrfs_root *root, u32 size)
1661 {
1662 	spin_lock(&root->accounting_lock);
1663 	btrfs_set_root_used(&root->root_item,
1664 			    btrfs_root_used(&root->root_item) + size);
1665 	spin_unlock(&root->accounting_lock);
1666 }
1667 
1668 static void root_sub_used(struct btrfs_root *root, u32 size)
1669 {
1670 	spin_lock(&root->accounting_lock);
1671 	btrfs_set_root_used(&root->root_item,
1672 			    btrfs_root_used(&root->root_item) - size);
1673 	spin_unlock(&root->accounting_lock);
1674 }
1675 
1676 /* given a node and slot number, this reads the blocks it points to.  The
1677  * extent buffer is returned with a reference taken (but unlocked).
1678  * NULL is returned on error.
1679  */
1680 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1681 				   struct extent_buffer *parent, int slot)
1682 {
1683 	int level = btrfs_header_level(parent);
1684 	if (slot < 0)
1685 		return NULL;
1686 	if (slot >= btrfs_header_nritems(parent))
1687 		return NULL;
1688 
1689 	BUG_ON(level == 0);
1690 
1691 	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
1692 		       btrfs_level_size(root, level - 1),
1693 		       btrfs_node_ptr_generation(parent, slot));
1694 }
1695 
1696 /*
1697  * node level balancing, used to make sure nodes are in proper order for
1698  * item deletion.  We balance from the top down, so we have to make sure
1699  * that a deletion won't leave an node completely empty later on.
1700  */
1701 static noinline int balance_level(struct btrfs_trans_handle *trans,
1702 			 struct btrfs_root *root,
1703 			 struct btrfs_path *path, int level)
1704 {
1705 	struct extent_buffer *right = NULL;
1706 	struct extent_buffer *mid;
1707 	struct extent_buffer *left = NULL;
1708 	struct extent_buffer *parent = NULL;
1709 	int ret = 0;
1710 	int wret;
1711 	int pslot;
1712 	int orig_slot = path->slots[level];
1713 	u64 orig_ptr;
1714 
1715 	if (level == 0)
1716 		return 0;
1717 
1718 	mid = path->nodes[level];
1719 
1720 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1721 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1722 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1723 
1724 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1725 
1726 	if (level < BTRFS_MAX_LEVEL - 1) {
1727 		parent = path->nodes[level + 1];
1728 		pslot = path->slots[level + 1];
1729 	}
1730 
1731 	/*
1732 	 * deal with the case where there is only one pointer in the root
1733 	 * by promoting the node below to a root
1734 	 */
1735 	if (!parent) {
1736 		struct extent_buffer *child;
1737 
1738 		if (btrfs_header_nritems(mid) != 1)
1739 			return 0;
1740 
1741 		/* promote the child to a root */
1742 		child = read_node_slot(root, mid, 0);
1743 		if (!child) {
1744 			ret = -EROFS;
1745 			btrfs_std_error(root->fs_info, ret);
1746 			goto enospc;
1747 		}
1748 
1749 		btrfs_tree_lock(child);
1750 		btrfs_set_lock_blocking(child);
1751 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1752 		if (ret) {
1753 			btrfs_tree_unlock(child);
1754 			free_extent_buffer(child);
1755 			goto enospc;
1756 		}
1757 
1758 		tree_mod_log_free_eb(root->fs_info, root->node);
1759 		tree_mod_log_set_root_pointer(root, child);
1760 		rcu_assign_pointer(root->node, child);
1761 
1762 		add_root_to_dirty_list(root);
1763 		btrfs_tree_unlock(child);
1764 
1765 		path->locks[level] = 0;
1766 		path->nodes[level] = NULL;
1767 		clean_tree_block(trans, root, mid);
1768 		btrfs_tree_unlock(mid);
1769 		/* once for the path */
1770 		free_extent_buffer(mid);
1771 
1772 		root_sub_used(root, mid->len);
1773 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1774 		/* once for the root ptr */
1775 		free_extent_buffer_stale(mid);
1776 		return 0;
1777 	}
1778 	if (btrfs_header_nritems(mid) >
1779 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1780 		return 0;
1781 
1782 	left = read_node_slot(root, parent, pslot - 1);
1783 	if (left) {
1784 		btrfs_tree_lock(left);
1785 		btrfs_set_lock_blocking(left);
1786 		wret = btrfs_cow_block(trans, root, left,
1787 				       parent, pslot - 1, &left);
1788 		if (wret) {
1789 			ret = wret;
1790 			goto enospc;
1791 		}
1792 	}
1793 	right = read_node_slot(root, parent, pslot + 1);
1794 	if (right) {
1795 		btrfs_tree_lock(right);
1796 		btrfs_set_lock_blocking(right);
1797 		wret = btrfs_cow_block(trans, root, right,
1798 				       parent, pslot + 1, &right);
1799 		if (wret) {
1800 			ret = wret;
1801 			goto enospc;
1802 		}
1803 	}
1804 
1805 	/* first, try to make some room in the middle buffer */
1806 	if (left) {
1807 		orig_slot += btrfs_header_nritems(left);
1808 		wret = push_node_left(trans, root, left, mid, 1);
1809 		if (wret < 0)
1810 			ret = wret;
1811 	}
1812 
1813 	/*
1814 	 * then try to empty the right most buffer into the middle
1815 	 */
1816 	if (right) {
1817 		wret = push_node_left(trans, root, mid, right, 1);
1818 		if (wret < 0 && wret != -ENOSPC)
1819 			ret = wret;
1820 		if (btrfs_header_nritems(right) == 0) {
1821 			clean_tree_block(trans, root, right);
1822 			btrfs_tree_unlock(right);
1823 			del_ptr(trans, root, path, level + 1, pslot + 1);
1824 			root_sub_used(root, right->len);
1825 			btrfs_free_tree_block(trans, root, right, 0, 1);
1826 			free_extent_buffer_stale(right);
1827 			right = NULL;
1828 		} else {
1829 			struct btrfs_disk_key right_key;
1830 			btrfs_node_key(right, &right_key, 0);
1831 			tree_mod_log_set_node_key(root->fs_info, parent,
1832 						  pslot + 1, 0);
1833 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1834 			btrfs_mark_buffer_dirty(parent);
1835 		}
1836 	}
1837 	if (btrfs_header_nritems(mid) == 1) {
1838 		/*
1839 		 * we're not allowed to leave a node with one item in the
1840 		 * tree during a delete.  A deletion from lower in the tree
1841 		 * could try to delete the only pointer in this node.
1842 		 * So, pull some keys from the left.
1843 		 * There has to be a left pointer at this point because
1844 		 * otherwise we would have pulled some pointers from the
1845 		 * right
1846 		 */
1847 		if (!left) {
1848 			ret = -EROFS;
1849 			btrfs_std_error(root->fs_info, ret);
1850 			goto enospc;
1851 		}
1852 		wret = balance_node_right(trans, root, mid, left);
1853 		if (wret < 0) {
1854 			ret = wret;
1855 			goto enospc;
1856 		}
1857 		if (wret == 1) {
1858 			wret = push_node_left(trans, root, left, mid, 1);
1859 			if (wret < 0)
1860 				ret = wret;
1861 		}
1862 		BUG_ON(wret == 1);
1863 	}
1864 	if (btrfs_header_nritems(mid) == 0) {
1865 		clean_tree_block(trans, root, mid);
1866 		btrfs_tree_unlock(mid);
1867 		del_ptr(trans, root, path, level + 1, pslot);
1868 		root_sub_used(root, mid->len);
1869 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1870 		free_extent_buffer_stale(mid);
1871 		mid = NULL;
1872 	} else {
1873 		/* update the parent key to reflect our changes */
1874 		struct btrfs_disk_key mid_key;
1875 		btrfs_node_key(mid, &mid_key, 0);
1876 		tree_mod_log_set_node_key(root->fs_info, parent,
1877 					  pslot, 0);
1878 		btrfs_set_node_key(parent, &mid_key, pslot);
1879 		btrfs_mark_buffer_dirty(parent);
1880 	}
1881 
1882 	/* update the path */
1883 	if (left) {
1884 		if (btrfs_header_nritems(left) > orig_slot) {
1885 			extent_buffer_get(left);
1886 			/* left was locked after cow */
1887 			path->nodes[level] = left;
1888 			path->slots[level + 1] -= 1;
1889 			path->slots[level] = orig_slot;
1890 			if (mid) {
1891 				btrfs_tree_unlock(mid);
1892 				free_extent_buffer(mid);
1893 			}
1894 		} else {
1895 			orig_slot -= btrfs_header_nritems(left);
1896 			path->slots[level] = orig_slot;
1897 		}
1898 	}
1899 	/* double check we haven't messed things up */
1900 	if (orig_ptr !=
1901 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1902 		BUG();
1903 enospc:
1904 	if (right) {
1905 		btrfs_tree_unlock(right);
1906 		free_extent_buffer(right);
1907 	}
1908 	if (left) {
1909 		if (path->nodes[level] != left)
1910 			btrfs_tree_unlock(left);
1911 		free_extent_buffer(left);
1912 	}
1913 	return ret;
1914 }
1915 
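/*
 * Editorial sketch, not part of ctree.c: balance_level() above bails out
 * while a node is still more than a quarter full, so only nodes that
 * deletions have made sparse get merged or refilled.  The early return
 * reduces to the predicate below; the helper name is hypothetical and
 * only illustrates the threshold (u32 comes from this file's includes).
 */
static inline int node_needs_balance_sketch(u32 nritems, u32 ptrs_per_block)
{
	/* mirrors: btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4 */
	return nritems <= ptrs_per_block / 4;
}
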
1916 /* Node balancing for insertion.  Here we only split or push nodes around
1917  * when they are completely full.  This is also done top down, so we
1918  * have to be pessimistic.
1919  */
1920 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1921 					  struct btrfs_root *root,
1922 					  struct btrfs_path *path, int level)
1923 {
1924 	struct extent_buffer *right = NULL;
1925 	struct extent_buffer *mid;
1926 	struct extent_buffer *left = NULL;
1927 	struct extent_buffer *parent = NULL;
1928 	int ret = 0;
1929 	int wret;
1930 	int pslot;
1931 	int orig_slot = path->slots[level];
1932 
1933 	if (level == 0)
1934 		return 1;
1935 
1936 	mid = path->nodes[level];
1937 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1938 
1939 	if (level < BTRFS_MAX_LEVEL - 1) {
1940 		parent = path->nodes[level + 1];
1941 		pslot = path->slots[level + 1];
1942 	}
1943 
1944 	if (!parent)
1945 		return 1;
1946 
1947 	left = read_node_slot(root, parent, pslot - 1);
1948 
1949 	/* first, try to make some room in the middle buffer */
1950 	if (left) {
1951 		u32 left_nr;
1952 
1953 		btrfs_tree_lock(left);
1954 		btrfs_set_lock_blocking(left);
1955 
1956 		left_nr = btrfs_header_nritems(left);
1957 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1958 			wret = 1;
1959 		} else {
1960 			ret = btrfs_cow_block(trans, root, left, parent,
1961 					      pslot - 1, &left);
1962 			if (ret)
1963 				wret = 1;
1964 			else {
1965 				wret = push_node_left(trans, root,
1966 						      left, mid, 0);
1967 			}
1968 		}
1969 		if (wret < 0)
1970 			ret = wret;
1971 		if (wret == 0) {
1972 			struct btrfs_disk_key disk_key;
1973 			orig_slot += left_nr;
1974 			btrfs_node_key(mid, &disk_key, 0);
1975 			tree_mod_log_set_node_key(root->fs_info, parent,
1976 						  pslot, 0);
1977 			btrfs_set_node_key(parent, &disk_key, pslot);
1978 			btrfs_mark_buffer_dirty(parent);
1979 			if (btrfs_header_nritems(left) > orig_slot) {
1980 				path->nodes[level] = left;
1981 				path->slots[level + 1] -= 1;
1982 				path->slots[level] = orig_slot;
1983 				btrfs_tree_unlock(mid);
1984 				free_extent_buffer(mid);
1985 			} else {
1986 				orig_slot -=
1987 					btrfs_header_nritems(left);
1988 				path->slots[level] = orig_slot;
1989 				btrfs_tree_unlock(left);
1990 				free_extent_buffer(left);
1991 			}
1992 			return 0;
1993 		}
1994 		btrfs_tree_unlock(left);
1995 		free_extent_buffer(left);
1996 	}
1997 	right = read_node_slot(root, parent, pslot + 1);
1998 
1999 	/*
2000 	 * then try to empty the right most buffer into the middle
2001 	 */
2002 	if (right) {
2003 		u32 right_nr;
2004 
2005 		btrfs_tree_lock(right);
2006 		btrfs_set_lock_blocking(right);
2007 
2008 		right_nr = btrfs_header_nritems(right);
2009 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2010 			wret = 1;
2011 		} else {
2012 			ret = btrfs_cow_block(trans, root, right,
2013 					      parent, pslot + 1,
2014 					      &right);
2015 			if (ret)
2016 				wret = 1;
2017 			else {
2018 				wret = balance_node_right(trans, root,
2019 							  right, mid);
2020 			}
2021 		}
2022 		if (wret < 0)
2023 			ret = wret;
2024 		if (wret == 0) {
2025 			struct btrfs_disk_key disk_key;
2026 
2027 			btrfs_node_key(right, &disk_key, 0);
2028 			tree_mod_log_set_node_key(root->fs_info, parent,
2029 						  pslot + 1, 0);
2030 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2031 			btrfs_mark_buffer_dirty(parent);
2032 
2033 			if (btrfs_header_nritems(mid) <= orig_slot) {
2034 				path->nodes[level] = right;
2035 				path->slots[level + 1] += 1;
2036 				path->slots[level] = orig_slot -
2037 					btrfs_header_nritems(mid);
2038 				btrfs_tree_unlock(mid);
2039 				free_extent_buffer(mid);
2040 			} else {
2041 				btrfs_tree_unlock(right);
2042 				free_extent_buffer(right);
2043 			}
2044 			return 0;
2045 		}
2046 		btrfs_tree_unlock(right);
2047 		free_extent_buffer(right);
2048 	}
2049 	return 1;
2050 }
2051 
2052 /*
2053  * readahead one full node of leaves, finding things that are close
2054  * to the block in 'slot', and triggering readahead on them.
2055  */
2056 static void reada_for_search(struct btrfs_root *root,
2057 			     struct btrfs_path *path,
2058 			     int level, int slot, u64 objectid)
2059 {
2060 	struct extent_buffer *node;
2061 	struct btrfs_disk_key disk_key;
2062 	u32 nritems;
2063 	u64 search;
2064 	u64 target;
2065 	u64 nread = 0;
2066 	u64 gen;
2067 	int direction = path->reada;
2068 	struct extent_buffer *eb;
2069 	u32 nr;
2070 	u32 blocksize;
2071 	u32 nscan = 0;
2072 
2073 	if (level != 1)
2074 		return;
2075 
2076 	if (!path->nodes[level])
2077 		return;
2078 
2079 	node = path->nodes[level];
2080 
2081 	search = btrfs_node_blockptr(node, slot);
2082 	blocksize = btrfs_level_size(root, level - 1);
2083 	eb = btrfs_find_tree_block(root, search, blocksize);
2084 	if (eb) {
2085 		free_extent_buffer(eb);
2086 		return;
2087 	}
2088 
2089 	target = search;
2090 
2091 	nritems = btrfs_header_nritems(node);
2092 	nr = slot;
2093 
2094 	while (1) {
2095 		if (direction < 0) {
2096 			if (nr == 0)
2097 				break;
2098 			nr--;
2099 		} else if (direction > 0) {
2100 			nr++;
2101 			if (nr >= nritems)
2102 				break;
2103 		}
2104 		if (path->reada < 0 && objectid) {
2105 			btrfs_node_key(node, &disk_key, nr);
2106 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2107 				break;
2108 		}
2109 		search = btrfs_node_blockptr(node, nr);
2110 		if ((search <= target && target - search <= 65536) ||
2111 		    (search > target && search - target <= 65536)) {
2112 			gen = btrfs_node_ptr_generation(node, nr);
2113 			readahead_tree_block(root, search, blocksize, gen);
2114 			nread += blocksize;
2115 		}
2116 		nscan++;
2117 		if (nread > 65536 || nscan > 32)
2118 			break;
2119 	}
2120 }
2121 
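/*
 * Editorial sketch, not part of ctree.c: reada_for_search() above only
 * reads ahead blocks whose pointer lies within 64KiB of the target, and
 * gives up after 64KiB of readahead or 32 scanned slots.  The distance
 * test, written without signed arithmetic, is just the following
 * (hypothetical helper name):
 */
static inline int reada_within_window_sketch(u64 search, u64 target)
{
	return (search <= target && target - search <= 65536) ||
	       (search > target && search - target <= 65536);
}
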
2122 /*
2123  * returns -EAGAIN if it had to drop the path, or zero if everything was in
2124  * cache
2125  */
2126 static noinline int reada_for_balance(struct btrfs_root *root,
2127 				      struct btrfs_path *path, int level)
2128 {
2129 	int slot;
2130 	int nritems;
2131 	struct extent_buffer *parent;
2132 	struct extent_buffer *eb;
2133 	u64 gen;
2134 	u64 block1 = 0;
2135 	u64 block2 = 0;
2136 	int ret = 0;
2137 	int blocksize;
2138 
2139 	parent = path->nodes[level + 1];
2140 	if (!parent)
2141 		return 0;
2142 
2143 	nritems = btrfs_header_nritems(parent);
2144 	slot = path->slots[level + 1];
2145 	blocksize = btrfs_level_size(root, level);
2146 
2147 	if (slot > 0) {
2148 		block1 = btrfs_node_blockptr(parent, slot - 1);
2149 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2150 		eb = btrfs_find_tree_block(root, block1, blocksize);
2151 		/*
2152 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2153 		 * don't want to return -EAGAIN here.  That would loop
2154 		 * forever
2155 		 */
2156 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2157 			block1 = 0;
2158 		free_extent_buffer(eb);
2159 	}
2160 	if (slot + 1 < nritems) {
2161 		block2 = btrfs_node_blockptr(parent, slot + 1);
2162 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2163 		eb = btrfs_find_tree_block(root, block2, blocksize);
2164 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2165 			block2 = 0;
2166 		free_extent_buffer(eb);
2167 	}
2168 	if (block1 || block2) {
2169 		ret = -EAGAIN;
2170 
2171 		/* release the whole path */
2172 		btrfs_release_path(path);
2173 
2174 		/* read the blocks */
2175 		if (block1)
2176 			readahead_tree_block(root, block1, blocksize, 0);
2177 		if (block2)
2178 			readahead_tree_block(root, block2, blocksize, 0);
2179 
2180 		if (block1) {
2181 			eb = read_tree_block(root, block1, blocksize, 0);
2182 			free_extent_buffer(eb);
2183 		}
2184 		if (block2) {
2185 			eb = read_tree_block(root, block2, blocksize, 0);
2186 			free_extent_buffer(eb);
2187 		}
2188 	}
2189 	return ret;
2190 }
2191 
2192 
2193 /*
2194  * when we walk down the tree, it is usually safe to unlock the higher layers
2195  * in the tree.  The exceptions are when our path goes through slot 0, because
2196  * operations on the tree might require changing key pointers higher up in the
2197  * tree.
2198  *
2199  * callers might also have set path->keep_locks, which tells this code to keep
2200  * the lock if the path points to the last slot in the block.  This is part of
2201  * walking through the tree, and selecting the next slot in the higher block.
2202  *
2203  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
2204  * if lowest_unlock is 1, level 0 won't be unlocked
2205  */
2206 static noinline void unlock_up(struct btrfs_path *path, int level,
2207 			       int lowest_unlock, int min_write_lock_level,
2208 			       int *write_lock_level)
2209 {
2210 	int i;
2211 	int skip_level = level;
2212 	int no_skips = 0;
2213 	struct extent_buffer *t;
2214 
2215 	if (path->really_keep_locks)
2216 		return;
2217 
2218 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2219 		if (!path->nodes[i])
2220 			break;
2221 		if (!path->locks[i])
2222 			break;
2223 		if (!no_skips && path->slots[i] == 0) {
2224 			skip_level = i + 1;
2225 			continue;
2226 		}
2227 		if (!no_skips && path->keep_locks) {
2228 			u32 nritems;
2229 			t = path->nodes[i];
2230 			nritems = btrfs_header_nritems(t);
2231 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2232 				skip_level = i + 1;
2233 				continue;
2234 			}
2235 		}
2236 		if (skip_level < i && i >= lowest_unlock)
2237 			no_skips = 1;
2238 
2239 		t = path->nodes[i];
2240 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2241 			btrfs_tree_unlock_rw(t, path->locks[i]);
2242 			path->locks[i] = 0;
2243 			if (write_lock_level &&
2244 			    i > min_write_lock_level &&
2245 			    i <= *write_lock_level) {
2246 				*write_lock_level = i - 1;
2247 			}
2248 		}
2249 	}
2250 }
2251 
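/*
 * Editorial sketch, not part of ctree.c: unlock_up() above keeps a node
 * locked when the path sits in slot 0 (a key change there could
 * propagate into the parent) or, with keep_locks set, when the path
 * sits in the last slot (the caller may step into the next sibling).
 * A hypothetical predicate form of that rule:
 */
static inline int must_keep_lock_sketch(int slot, u32 nritems, int keep_locks)
{
	if (slot == 0)
		return 1;
	if (keep_locks && (nritems < 1 || slot >= (int)nritems - 1))
		return 1;
	return 0;
}
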
2252 /*
2253  * This releases any locks held in the path starting at level and
2254  * going all the way up to the root.
2255  *
2256  * btrfs_search_slot will keep the lock held on higher nodes in a few
2257  * corner cases, such as COW of the block at slot zero in the node.  This
2258  * ignores those rules, and it should only be called when there are no
2259  * more updates to be done higher up in the tree.
2260  */
2261 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2262 {
2263 	int i;
2264 
2265 	if (path->keep_locks || path->really_keep_locks)
2266 		return;
2267 
2268 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2269 		if (!path->nodes[i])
2270 			continue;
2271 		if (!path->locks[i])
2272 			continue;
2273 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2274 		path->locks[i] = 0;
2275 	}
2276 }
2277 
2278 /*
2279  * helper function for btrfs_search_slot.  The goal is to find a block
2280  * in cache without setting the path to blocking.  If we find the block
2281  * we return zero and the path is unchanged.
2282  *
2283  * If we can't find the block, we set the path blocking and do some
2284  * reada.  -EAGAIN is returned and the search must be repeated.
2285  */
2286 static int
2287 read_block_for_search(struct btrfs_trans_handle *trans,
2288 		       struct btrfs_root *root, struct btrfs_path *p,
2289 		       struct extent_buffer **eb_ret, int level, int slot,
2290 		       struct btrfs_key *key, u64 time_seq)
2291 {
2292 	u64 blocknr;
2293 	u64 gen;
2294 	u32 blocksize;
2295 	struct extent_buffer *b = *eb_ret;
2296 	struct extent_buffer *tmp;
2297 	int ret;
2298 
2299 	blocknr = btrfs_node_blockptr(b, slot);
2300 	gen = btrfs_node_ptr_generation(b, slot);
2301 	blocksize = btrfs_level_size(root, level - 1);
2302 
2303 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2304 	if (tmp) {
2305 		/* first we do an atomic uptodate check */
2306 		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
2307 			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2308 				/*
2309 				 * we found an up to date block without
2310 				 * sleeping, return
2311 				 * right away
2312 				 */
2313 				*eb_ret = tmp;
2314 				return 0;
2315 			}
2316 			/* the pages were up to date, but we failed
2317 			 * the generation number check.  Do a full
2318 			 * read for the generation number that is correct.
2319 			 * We must do this without dropping locks so
2320 			 * we can trust our generation number
2321 			 */
2322 			free_extent_buffer(tmp);
2323 			btrfs_set_path_blocking(p);
2324 
2325 			/* now we're allowed to do a blocking uptodate check */
2326 			tmp = read_tree_block(root, blocknr, blocksize, gen);
2327 			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
2328 				*eb_ret = tmp;
2329 				return 0;
2330 			}
2331 			free_extent_buffer(tmp);
2332 			btrfs_release_path(p);
2333 			return -EIO;
2334 		}
2335 	}
2336 
2337 	/*
2338 	 * reduce lock contention at high levels
2339 	 * of the btree by dropping locks before
2340 	 * we read.  Don't release the lock on the current
2341 	 * level because we need to walk this node to figure
2342 	 * out which blocks to read.
2343 	 */
2344 	btrfs_unlock_up_safe(p, level + 1);
2345 	btrfs_set_path_blocking(p);
2346 
2347 	free_extent_buffer(tmp);
2348 	if (p->reada)
2349 		reada_for_search(root, p, level, slot, key->objectid);
2350 
2351 	btrfs_release_path(p);
2352 
2353 	ret = -EAGAIN;
2354 	tmp = read_tree_block(root, blocknr, blocksize, 0);
2355 	if (tmp) {
2356 		/*
2357 		 * If the read above didn't mark this buffer up to date,
2358 		 * it will never end up being up to date.  Set ret to EIO now
2359 		 * and give up so that our caller doesn't loop forever
2360 		 * on our EAGAINs.
2361 		 */
2362 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2363 			ret = -EIO;
2364 		free_extent_buffer(tmp);
2365 	}
2366 	return ret;
2367 }
2368 
2369 /*
2370  * helper function for btrfs_search_slot.  This does all of the checks
2371  * for node-level blocks and does any balancing required based on
2372  * the ins_len.
2373  *
2374  * If no extra work was required, zero is returned.  If we had to
2375  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2376  * start over
2377  */
2378 static int
2379 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2380 		       struct btrfs_root *root, struct btrfs_path *p,
2381 		       struct extent_buffer *b, int level, int ins_len,
2382 		       int *write_lock_level)
2383 {
2384 	int ret;
2385 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2386 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2387 		int sret;
2388 
2389 		if (*write_lock_level < level + 1) {
2390 			*write_lock_level = level + 1;
2391 			btrfs_release_path(p);
2392 			goto again;
2393 		}
2394 
2395 		sret = reada_for_balance(root, p, level);
2396 		if (sret)
2397 			goto again;
2398 
2399 		btrfs_set_path_blocking(p);
2400 		sret = split_node(trans, root, p, level);
2401 		btrfs_clear_path_blocking(p, NULL, 0);
2402 
2403 		BUG_ON(sret > 0);
2404 		if (sret) {
2405 			ret = sret;
2406 			goto done;
2407 		}
2408 		b = p->nodes[level];
2409 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2410 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2411 		int sret;
2412 
2413 		if (*write_lock_level < level + 1) {
2414 			*write_lock_level = level + 1;
2415 			btrfs_release_path(p);
2416 			goto again;
2417 		}
2418 
2419 		sret = reada_for_balance(root, p, level);
2420 		if (sret)
2421 			goto again;
2422 
2423 		btrfs_set_path_blocking(p);
2424 		sret = balance_level(trans, root, p, level);
2425 		btrfs_clear_path_blocking(p, NULL, 0);
2426 
2427 		if (sret) {
2428 			ret = sret;
2429 			goto done;
2430 		}
2431 		b = p->nodes[level];
2432 		if (!b) {
2433 			btrfs_release_path(p);
2434 			goto again;
2435 		}
2436 		BUG_ON(btrfs_header_nritems(b) == 1);
2437 	}
2438 	return 0;
2439 
2440 again:
2441 	ret = -EAGAIN;
2442 done:
2443 	return ret;
2444 }
2445 
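/*
 * Editorial sketch, not part of ctree.c: setup_nodes_for_search() above
 * splits a node that is within three pointers of full when inserting
 * (or when search_for_split is set), and balances one that has fallen
 * below half full when deleting.  The decision in isolation, as a
 * hypothetical helper:
 */
static inline int node_search_action_sketch(u32 nritems, u32 ptrs_per_block,
					    int ins_len)
{
	if (ins_len > 0 && nritems >= ptrs_per_block - 3)
		return 1;	/* nearly full: split before descending */
	if (ins_len < 0 && nritems < ptrs_per_block / 2)
		return -1;	/* sparse: balance before descending */
	return 0;		/* no extra work needed */
}
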
2446 /*
2447  * look for key in the tree.  path is filled in with nodes along the way.
2448  * If key is found, we return zero and you can find the item in the leaf
2449  * level of the path (level 0)
2450  *
2451  * If the key isn't found, the path points to the slot where it should
2452  * be inserted, and 1 is returned.  If there are other errors during the
2453  * search a negative error number is returned.
2454  *
2455  * if ins_len > 0, nodes and leaves will be split as we walk down the
2456  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2457  * possible)
2458  */
2459 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2460 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
2461 		      ins_len, int cow)
2462 {
2463 	struct extent_buffer *b;
2464 	int slot;
2465 	int ret;
2466 	int err;
2467 	int level;
2468 	int lowest_unlock = 1;
2469 	int root_lock;
2470 	/* everything at write_lock_level or lower must be write locked */
2471 	int write_lock_level = 0;
2472 	u8 lowest_level = 0;
2473 	int min_write_lock_level;
2474 
2475 	lowest_level = p->lowest_level;
2476 	WARN_ON(lowest_level && ins_len > 0);
2477 	WARN_ON(p->nodes[0] != NULL);
2478 
2479 	if (ins_len < 0) {
2480 		lowest_unlock = 2;
2481 
2482 		/* when we are removing items, we might have to go up to level
2483 		 * two as we update tree pointers.  Make sure we keep write
2484 		 * locks for those levels as well
2485 		 */
2486 		write_lock_level = 2;
2487 	} else if (ins_len > 0) {
2488 		/*
2489 		 * for inserting items, make sure we have a write lock on
2490 		 * level 1 so we can update keys
2491 		 */
2492 		write_lock_level = 1;
2493 	}
2494 
2495 	if (!cow)
2496 		write_lock_level = -1;
2497 
2498 	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
2499 		write_lock_level = BTRFS_MAX_LEVEL;
2500 
2501 	min_write_lock_level = write_lock_level;
2502 
2503 again:
2504 	/*
2505 	 * we try very hard to do read locks on the root
2506 	 */
2507 	root_lock = BTRFS_READ_LOCK;
2508 	level = 0;
2509 	if (p->search_commit_root) {
2510 		/*
2511 		 * the commit roots are read only
2512 		 * so we always do read locks
2513 		 */
2514 		b = root->commit_root;
2515 		extent_buffer_get(b);
2516 		level = btrfs_header_level(b);
2517 		if (!p->skip_locking)
2518 			btrfs_tree_read_lock(b);
2519 	} else {
2520 		if (p->skip_locking) {
2521 			b = btrfs_root_node(root);
2522 			level = btrfs_header_level(b);
2523 		} else {
2524 			/* we don't know the level of the root node
2525 			 * until we actually have it read locked
2526 			 */
2527 			b = btrfs_read_lock_root_node(root);
2528 			level = btrfs_header_level(b);
2529 			if (level <= write_lock_level) {
2530 				/* whoops, must trade for write lock */
2531 				btrfs_tree_read_unlock(b);
2532 				free_extent_buffer(b);
2533 				b = btrfs_lock_root_node(root);
2534 				root_lock = BTRFS_WRITE_LOCK;
2535 
2536 				/* the level might have changed, check again */
2537 				level = btrfs_header_level(b);
2538 			}
2539 		}
2540 	}
2541 	p->nodes[level] = b;
2542 	if (!p->skip_locking)
2543 		p->locks[level] = root_lock;
2544 
2545 	while (b) {
2546 		level = btrfs_header_level(b);
2547 
2548 		/*
2549 		 * setup the path here so we can release it under lock
2550 		 * contention with the cow code
2551 		 */
2552 		if (cow) {
2553 			/*
2554 			 * if we don't really need to cow this block
2555 			 * then we don't want to set the path blocking,
2556 			 * so we test it here
2557 			 */
2558 			if (!should_cow_block(trans, root, b))
2559 				goto cow_done;
2560 
2561 			btrfs_set_path_blocking(p);
2562 
2563 			/*
2564 			 * must have write locks on this node and the
2565 			 * parent
2566 			 */
2567 			if (level > write_lock_level ||
2568 			    (level + 1 > write_lock_level &&
2569 			    level + 1 < BTRFS_MAX_LEVEL &&
2570 			    p->nodes[level + 1])) {
2571 				write_lock_level = level + 1;
2572 				btrfs_release_path(p);
2573 				goto again;
2574 			}
2575 
2576 			err = btrfs_cow_block(trans, root, b,
2577 					      p->nodes[level + 1],
2578 					      p->slots[level + 1], &b);
2579 			if (err) {
2580 				ret = err;
2581 				goto done;
2582 			}
2583 		}
2584 cow_done:
2585 		BUG_ON(!cow && ins_len);
2586 
2587 		p->nodes[level] = b;
2588 		btrfs_clear_path_blocking(p, NULL, 0);
2589 
2590 		/*
2591 		 * we have a lock on b and as long as we aren't changing
2592 		 * the tree, there is no way for the items in b to change.
2593 		 * It is safe to drop the lock on our parent before we
2594 		 * go through the expensive btree search on b.
2595 		 *
2596 		 * If cow is true, then we might be changing slot zero,
2597 		 * which may require changing the parent.  So, we can't
2598 		 * drop the lock until after we know which slot we're
2599 		 * operating on.
2600 		 */
2601 		if (!cow)
2602 			btrfs_unlock_up_safe(p, level + 1);
2603 
2604 		ret = bin_search(b, key, level, &slot);
2605 
2606 		if (level != 0) {
2607 			int dec = 0;
2608 			if (ret && slot > 0) {
2609 				dec = 1;
2610 				slot -= 1;
2611 			}
2612 			p->slots[level] = slot;
2613 			err = setup_nodes_for_search(trans, root, p, b, level,
2614 					     ins_len, &write_lock_level);
2615 			if (err == -EAGAIN)
2616 				goto again;
2617 			if (err) {
2618 				ret = err;
2619 				goto done;
2620 			}
2621 			b = p->nodes[level];
2622 			slot = p->slots[level];
2623 
2624 			/*
2625 			 * slot 0 is special, if we change the key
2626 			 * we have to update the parent pointer
2627 			 * which means we must have a write lock
2628 			 * on the parent
2629 			 */
2630 			if (slot == 0 && cow &&
2631 			    write_lock_level < level + 1) {
2632 				write_lock_level = level + 1;
2633 				btrfs_release_path(p);
2634 				goto again;
2635 			}
2636 
2637 			unlock_up(p, level, lowest_unlock,
2638 				  min_write_lock_level, &write_lock_level);
2639 
2640 			if (level == lowest_level) {
2641 				if (dec)
2642 					p->slots[level]++;
2643 				goto done;
2644 			}
2645 
2646 			err = read_block_for_search(trans, root, p,
2647 						    &b, level, slot, key, 0);
2648 			if (err == -EAGAIN)
2649 				goto again;
2650 			if (err) {
2651 				ret = err;
2652 				goto done;
2653 			}
2654 
2655 			if (!p->skip_locking) {
2656 				level = btrfs_header_level(b);
2657 				if (level <= write_lock_level) {
2658 					err = btrfs_try_tree_write_lock(b);
2659 					if (!err) {
2660 						btrfs_set_path_blocking(p);
2661 						btrfs_tree_lock(b);
2662 						btrfs_clear_path_blocking(p, b,
2663 								  BTRFS_WRITE_LOCK);
2664 					}
2665 					p->locks[level] = BTRFS_WRITE_LOCK;
2666 				} else {
2667 					err = btrfs_try_tree_read_lock(b);
2668 					if (!err) {
2669 						btrfs_set_path_blocking(p);
2670 						btrfs_tree_read_lock(b);
2671 						btrfs_clear_path_blocking(p, b,
2672 								  BTRFS_READ_LOCK);
2673 					}
2674 					p->locks[level] = BTRFS_READ_LOCK;
2675 				}
2676 				p->nodes[level] = b;
2677 			}
2678 		} else {
2679 			p->slots[level] = slot;
2680 			if (ins_len > 0 &&
2681 			    btrfs_leaf_free_space(root, b) < ins_len) {
2682 				if (write_lock_level < 1) {
2683 					write_lock_level = 1;
2684 					btrfs_release_path(p);
2685 					goto again;
2686 				}
2687 
2688 				btrfs_set_path_blocking(p);
2689 				err = split_leaf(trans, root, key,
2690 						 p, ins_len, ret == 0);
2691 				btrfs_clear_path_blocking(p, NULL, 0);
2692 
2693 				BUG_ON(err > 0);
2694 				if (err) {
2695 					ret = err;
2696 					goto done;
2697 				}
2698 			}
2699 			if (!p->search_for_split)
2700 				unlock_up(p, level, lowest_unlock,
2701 					  min_write_lock_level, &write_lock_level);
2702 			goto done;
2703 		}
2704 	}
2705 	ret = 1;
2706 done:
2707 	/*
2708 	 * we don't really know what they plan on doing with the path
2709 	 * from here on, so for now just mark it as blocking
2710 	 */
2711 	if (!p->leave_spinning)
2712 		btrfs_set_path_blocking(p);
2713 	if (ret < 0)
2714 		btrfs_release_path(p);
2715 	return ret;
2716 }
2717 
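/*
 * Editorial usage sketch, not part of ctree.c: the return contract of
 * btrfs_search_slot() (0 = exact match, 1 = not found but positioned at
 * the insertion slot, < 0 = error) is typically consumed like this.
 * Everything except the btrfs_* calls is a placeholder.
 */
static int lookup_example_sketch(struct btrfs_root *root,
				 struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* ins_len == 0, cow == 0: plain read-only lookup */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* item found at path->nodes[0], slot path->slots[0] */
	} else if (ret == 1) {
		/* not found; path points at the would-be insert slot */
	}
	btrfs_free_path(path);
	return ret;
}
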
2718 /*
2719  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2720  * current state of the tree together with the operations recorded in the tree
2721  * modification log to search for the key in a previous version of this tree, as
2722  * denoted by the time_seq parameter.
2723  *
2724  * Naturally, there is no support for insert, delete or cow operations.
2725  *
2726  * The resulting path and return value will be set up as if we called
2727  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2728  */
2729 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2730 			  struct btrfs_path *p, u64 time_seq)
2731 {
2732 	struct extent_buffer *b;
2733 	int slot;
2734 	int ret;
2735 	int err;
2736 	int level;
2737 	int lowest_unlock = 1;
2738 	u8 lowest_level = 0;
2739 
2740 	lowest_level = p->lowest_level;
2741 	WARN_ON(p->nodes[0] != NULL);
2742 
2743 	if (p->search_commit_root) {
2744 		BUG_ON(time_seq);
2745 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2746 	}
2747 
2748 again:
2749 	b = get_old_root(root, time_seq);
2750 	level = btrfs_header_level(b);
2751 	p->locks[level] = BTRFS_READ_LOCK;
2752 
2753 	while (b) {
2754 		level = btrfs_header_level(b);
2755 		p->nodes[level] = b;
2756 		btrfs_clear_path_blocking(p, NULL, 0);
2757 
2758 		/*
2759 		 * we have a lock on b and as long as we aren't changing
2760 		 * the tree, there is no way for the items in b to change.
2761 		 * It is safe to drop the lock on our parent before we
2762 		 * go through the expensive btree search on b.
2763 		 */
2764 		btrfs_unlock_up_safe(p, level + 1);
2765 
2766 		ret = bin_search(b, key, level, &slot);
2767 
2768 		if (level != 0) {
2769 			int dec = 0;
2770 			if (ret && slot > 0) {
2771 				dec = 1;
2772 				slot -= 1;
2773 			}
2774 			p->slots[level] = slot;
2775 			unlock_up(p, level, lowest_unlock, 0, NULL);
2776 
2777 			if (level == lowest_level) {
2778 				if (dec)
2779 					p->slots[level]++;
2780 				goto done;
2781 			}
2782 
2783 			err = read_block_for_search(NULL, root, p, &b, level,
2784 						    slot, key, time_seq);
2785 			if (err == -EAGAIN)
2786 				goto again;
2787 			if (err) {
2788 				ret = err;
2789 				goto done;
2790 			}
2791 
2792 			level = btrfs_header_level(b);
2793 			err = btrfs_try_tree_read_lock(b);
2794 			if (!err) {
2795 				btrfs_set_path_blocking(p);
2796 				btrfs_tree_read_lock(b);
2797 				btrfs_clear_path_blocking(p, b,
2798 							  BTRFS_READ_LOCK);
2799 			}
2800 			p->locks[level] = BTRFS_READ_LOCK;
2801 			p->nodes[level] = b;
2802 			b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2803 			if (b != p->nodes[level]) {
2804 				btrfs_tree_unlock_rw(p->nodes[level],
2805 						     p->locks[level]);
2806 				p->locks[level] = 0;
2807 				p->nodes[level] = b;
2808 			}
2809 		} else {
2810 			p->slots[level] = slot;
2811 			unlock_up(p, level, lowest_unlock, 0, NULL);
2812 			goto done;
2813 		}
2814 	}
2815 	ret = 1;
2816 done:
2817 	if (!p->leave_spinning)
2818 		btrfs_set_path_blocking(p);
2819 	if (ret < 0)
2820 		btrfs_release_path(p);
2821 
2822 	return ret;
2823 }
2824 
2825 /*
2826  * helper to use instead of search slot if no exact match is needed but
2827  * instead the next or previous item should be returned.
2828  * When find_higher is true, the next higher item is returned, the next lower
2829  * otherwise.
2830  * When return_any and find_higher are both true, and no higher item is found,
2831  * return the next lower instead.
2832  * When return_any is true and find_higher is false, and no lower item is found,
2833  * return the next higher instead.
2834  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2835  * < 0 on error
2836  */
2837 int btrfs_search_slot_for_read(struct btrfs_root *root,
2838 			       struct btrfs_key *key, struct btrfs_path *p,
2839 			       int find_higher, int return_any)
2840 {
2841 	int ret;
2842 	struct extent_buffer *leaf;
2843 
2844 again:
2845 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2846 	if (ret <= 0)
2847 		return ret;
2848 	/*
2849 	 * a return value of 1 means the path is at the position where the
2850 	 * item should be inserted. Normally this is the next bigger item,
2851 	 * but in case the previous item is the last in a leaf, path points
2852 	 * to the first free slot in the previous leaf, i.e. at an invalid
2853 	 * item.
2854 	 */
2855 	leaf = p->nodes[0];
2856 
2857 	if (find_higher) {
2858 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2859 			ret = btrfs_next_leaf(root, p);
2860 			if (ret <= 0)
2861 				return ret;
2862 			if (!return_any)
2863 				return 1;
2864 			/*
2865 			 * no higher item found, return the next
2866 			 * lower instead
2867 			 */
2868 			return_any = 0;
2869 			find_higher = 0;
2870 			btrfs_release_path(p);
2871 			goto again;
2872 		}
2873 	} else {
2874 		if (p->slots[0] == 0) {
2875 			ret = btrfs_prev_leaf(root, p);
2876 			if (ret < 0)
2877 				return ret;
2878 			if (!ret) {
2879 				p->slots[0] = btrfs_header_nritems(leaf) - 1;
2880 				return 0;
2881 			}
2882 			if (!return_any)
2883 				return 1;
2884 			/*
2885 			 * no lower item found, return the next
2886 			 * higher instead
2887 			 */
2888 			return_any = 0;
2889 			find_higher = 1;
2890 			btrfs_release_path(p);
2891 			goto again;
2892 		} else {
2893 			--p->slots[0];
2894 		}
2895 	}
2896 	return 0;
2897 }
2898 
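/*
 * Editorial usage sketch, not part of ctree.c: with find_higher and
 * return_any both set, btrfs_search_slot_for_read() above returns the
 * first item at or after 'key', falling back to the closest lower item
 * when nothing higher exists.  Hypothetical wrapper:
 */
static int find_closest_item_sketch(struct btrfs_root *root,
				    struct btrfs_key *key,
				    struct btrfs_path *path)
{
	/* find_higher = 1, return_any = 1 */
	return btrfs_search_slot_for_read(root, key, path, 1, 1);
}
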
2899 /*
2900  * adjust the pointers going up the tree, starting at level,
2901  * making sure the right key of each node points to 'key'.
2902  * This is used after shifting pointers to the left, so it stops
2903  * fixing up pointers when a given leaf/node is not in slot 0 of the
2904  * higher levels
2905  *
2906  */
2907 static void fixup_low_keys(struct btrfs_trans_handle *trans,
2908 			   struct btrfs_root *root, struct btrfs_path *path,
2909 			   struct btrfs_disk_key *key, int level)
2910 {
2911 	int i;
2912 	struct extent_buffer *t;
2913 
2914 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2915 		int tslot = path->slots[i];
2916 		if (!path->nodes[i])
2917 			break;
2918 		t = path->nodes[i];
2919 		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2920 		btrfs_set_node_key(t, key, tslot);
2921 		btrfs_mark_buffer_dirty(path->nodes[i]);
2922 		if (tslot != 0)
2923 			break;
2924 	}
2925 }
2926 
2927 /*
2928  * update item key.
2929  *
2930  * This function isn't completely safe. It's the caller's responsibility
2931  * to ensure that the new key won't break the key ordering
2932  */
2933 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2934 			     struct btrfs_root *root, struct btrfs_path *path,
2935 			     struct btrfs_key *new_key)
2936 {
2937 	struct btrfs_disk_key disk_key;
2938 	struct extent_buffer *eb;
2939 	int slot;
2940 
2941 	eb = path->nodes[0];
2942 	slot = path->slots[0];
2943 	if (slot > 0) {
2944 		btrfs_item_key(eb, &disk_key, slot - 1);
2945 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2946 	}
2947 	if (slot < btrfs_header_nritems(eb) - 1) {
2948 		btrfs_item_key(eb, &disk_key, slot + 1);
2949 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2950 	}
2951 
2952 	btrfs_cpu_key_to_disk(&disk_key, new_key);
2953 	btrfs_set_item_key(eb, &disk_key, slot);
2954 	btrfs_mark_buffer_dirty(eb);
2955 	if (slot == 0)
2956 		fixup_low_keys(trans, root, path, &disk_key, 1);
2957 }
2958 
2959 /*
2960  * try to push data from one node into the next node left in the
2961  * tree.
2962  *
2963  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2964  * error, and > 0 if there was no room in the left hand block.
2965  */
2966 static int push_node_left(struct btrfs_trans_handle *trans,
2967 			  struct btrfs_root *root, struct extent_buffer *dst,
2968 			  struct extent_buffer *src, int empty)
2969 {
2970 	int push_items = 0;
2971 	int src_nritems;
2972 	int dst_nritems;
2973 	int ret = 0;
2974 
2975 	src_nritems = btrfs_header_nritems(src);
2976 	dst_nritems = btrfs_header_nritems(dst);
2977 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2978 	WARN_ON(btrfs_header_generation(src) != trans->transid);
2979 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
2980 
2981 	if (!empty && src_nritems <= 8)
2982 		return 1;
2983 
2984 	if (push_items <= 0)
2985 		return 1;
2986 
2987 	if (empty) {
2988 		push_items = min(src_nritems, push_items);
2989 		if (push_items < src_nritems) {
2990 			/* leave at least 8 pointers in the node if
2991 			 * we aren't going to empty it
2992 			 */
2993 			if (src_nritems - push_items < 8) {
2994 				if (push_items <= 8)
2995 					return 1;
2996 				push_items -= 8;
2997 			}
2998 		}
2999 	} else
3000 		push_items = min(src_nritems - 8, push_items);
3001 
3002 	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3003 			     push_items);
3004 	copy_extent_buffer(dst, src,
3005 			   btrfs_node_key_ptr_offset(dst_nritems),
3006 			   btrfs_node_key_ptr_offset(0),
3007 			   push_items * sizeof(struct btrfs_key_ptr));
3008 
3009 	if (push_items < src_nritems) {
3010 		/*
3011 		 * don't call tree_mod_log_eb_move here, key removal was already
3012 		 * fully logged by tree_mod_log_eb_copy above.
3013 		 */
3014 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3015 				      btrfs_node_key_ptr_offset(push_items),
3016 				      (src_nritems - push_items) *
3017 				      sizeof(struct btrfs_key_ptr));
3018 	}
3019 	btrfs_set_header_nritems(src, src_nritems - push_items);
3020 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3021 	btrfs_mark_buffer_dirty(src);
3022 	btrfs_mark_buffer_dirty(dst);
3023 
3024 	return ret;
3025 }
3026 
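/*
 * Editorial sketch, not part of ctree.c: the amount push_node_left()
 * above moves is bounded by the free slots in dst, and unless the
 * caller asked to empty src completely, at least 8 pointers are left
 * behind.  The sizing logic, pulled out into a hypothetical helper:
 */
static inline int push_left_count_sketch(int src_nritems, int dst_nritems,
					 int ptrs_per_block, int empty)
{
	int push_items = ptrs_per_block - dst_nritems;

	if (push_items <= 0 || (!empty && src_nritems <= 8))
		return 0;		/* no room, or src would get too small */
	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems && src_nritems - push_items < 8) {
			if (push_items <= 8)
				return 0;
			push_items -= 8;	/* keep at least 8 in src */
		}
	} else {
		push_items = min(src_nritems - 8, push_items);
	}
	return push_items;
}
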
3027 /*
3028  * try to push data from one node into the next node right in the
3029  * tree.
3030  *
3031  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3032  * error, and > 0 if there was no room in the right hand block.
3033  *
3034  * this will only push up to 1/2 the contents of the left node over
3035  */
3036 static int balance_node_right(struct btrfs_trans_handle *trans,
3037 			      struct btrfs_root *root,
3038 			      struct extent_buffer *dst,
3039 			      struct extent_buffer *src)
3040 {
3041 	int push_items = 0;
3042 	int max_push;
3043 	int src_nritems;
3044 	int dst_nritems;
3045 	int ret = 0;
3046 
3047 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3048 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3049 
3050 	src_nritems = btrfs_header_nritems(src);
3051 	dst_nritems = btrfs_header_nritems(dst);
3052 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3053 	if (push_items <= 0)
3054 		return 1;
3055 
3056 	if (src_nritems < 4)
3057 		return 1;
3058 
3059 	max_push = src_nritems / 2 + 1;
3060 	/* don't try to empty the node */
3061 	if (max_push >= src_nritems)
3062 		return 1;
3063 
3064 	if (max_push < push_items)
3065 		push_items = max_push;
3066 
3067 	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3068 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3069 				      btrfs_node_key_ptr_offset(0),
3070 				      (dst_nritems) *
3071 				      sizeof(struct btrfs_key_ptr));
3072 
3073 	tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3074 			     src_nritems - push_items, push_items);
3075 	copy_extent_buffer(dst, src,
3076 			   btrfs_node_key_ptr_offset(0),
3077 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3078 			   push_items * sizeof(struct btrfs_key_ptr));
3079 
3080 	btrfs_set_header_nritems(src, src_nritems - push_items);
3081 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3082 
3083 	btrfs_mark_buffer_dirty(src);
3084 	btrfs_mark_buffer_dirty(dst);
3085 
3086 	return ret;
3087 }
3088 
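/*
 * Editorial sketch, not part of ctree.c: balance_node_right() above
 * refuses to move more than half of src (plus one) and never empties
 * it.  A hypothetical helper for the count it settles on, where
 * dst_free is BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems:
 */
static inline int push_right_count_sketch(int src_nritems, int dst_free)
{
	int max_push = src_nritems / 2 + 1;

	if (dst_free <= 0 || src_nritems < 4 || max_push >= src_nritems)
		return 0;	/* no room, src too small, or would empty src */
	return min(max_push, dst_free);
}
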
3089 /*
3090  * helper function to insert a new root level in the tree.
3091  * A new node is allocated, and a single item is inserted to
3092  * point to the existing root
3093  *
3094  * returns zero on success or < 0 on failure.
3095  */
3096 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3097 			   struct btrfs_root *root,
3098 			   struct btrfs_path *path, int level)
3099 {
3100 	u64 lower_gen;
3101 	struct extent_buffer *lower;
3102 	struct extent_buffer *c;
3103 	struct extent_buffer *old;
3104 	struct btrfs_disk_key lower_key;
3105 
3106 	BUG_ON(path->nodes[level]);
3107 	BUG_ON(path->nodes[level-1] != root->node);
3108 
3109 	lower = path->nodes[level-1];
3110 	if (level == 1)
3111 		btrfs_item_key(lower, &lower_key, 0);
3112 	else
3113 		btrfs_node_key(lower, &lower_key, 0);
3114 
3115 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3116 				   root->root_key.objectid, &lower_key,
3117 				   level, root->node->start, 0);
3118 	if (IS_ERR(c))
3119 		return PTR_ERR(c);
3120 
3121 	root_add_used(root, root->nodesize);
3122 
3123 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3124 	btrfs_set_header_nritems(c, 1);
3125 	btrfs_set_header_level(c, level);
3126 	btrfs_set_header_bytenr(c, c->start);
3127 	btrfs_set_header_generation(c, trans->transid);
3128 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3129 	btrfs_set_header_owner(c, root->root_key.objectid);
3130 
3131 	write_extent_buffer(c, root->fs_info->fsid,
3132 			    (unsigned long)btrfs_header_fsid(c),
3133 			    BTRFS_FSID_SIZE);
3134 
3135 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3136 			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
3137 			    BTRFS_UUID_SIZE);
3138 
3139 	btrfs_set_node_key(c, &lower_key, 0);
3140 	btrfs_set_node_blockptr(c, 0, lower->start);
3141 	lower_gen = btrfs_header_generation(lower);
3142 	WARN_ON(lower_gen != trans->transid);
3143 
3144 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3145 
3146 	btrfs_mark_buffer_dirty(c);
3147 
3148 	old = root->node;
3149 	tree_mod_log_set_root_pointer(root, c);
3150 	rcu_assign_pointer(root->node, c);
3151 
3152 	/* the super has an extra ref to root->node */
3153 	free_extent_buffer(old);
3154 
3155 	add_root_to_dirty_list(root);
3156 	extent_buffer_get(c);
3157 	path->nodes[level] = c;
3158 	path->locks[level] = BTRFS_WRITE_LOCK;
3159 	path->slots[level] = 0;
3160 	return 0;
3161 }
3162 
3163 /*
3164  * worker function to insert a single pointer in a node.
3165  * the node should have enough room for the pointer already
3166  *
3167  * slot and level indicate where you want the key to go, and
3168  * bytenr is the block the key points to.
3169  */
3170 static void insert_ptr(struct btrfs_trans_handle *trans,
3171 		       struct btrfs_root *root, struct btrfs_path *path,
3172 		       struct btrfs_disk_key *key, u64 bytenr,
3173 		       int slot, int level)
3174 {
3175 	struct extent_buffer *lower;
3176 	int nritems;
3177 	int ret;
3178 
3179 	BUG_ON(!path->nodes[level]);
3180 	btrfs_assert_tree_locked(path->nodes[level]);
3181 	lower = path->nodes[level];
3182 	nritems = btrfs_header_nritems(lower);
3183 	BUG_ON(slot > nritems);
3184 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3185 	if (slot != nritems) {
3186 		if (level)
3187 			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3188 					     slot, nritems - slot);
3189 		memmove_extent_buffer(lower,
3190 			      btrfs_node_key_ptr_offset(slot + 1),
3191 			      btrfs_node_key_ptr_offset(slot),
3192 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3193 	}
3194 	if (level) {
3195 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3196 					      MOD_LOG_KEY_ADD);
3197 		BUG_ON(ret < 0);
3198 	}
3199 	btrfs_set_node_key(lower, key, slot);
3200 	btrfs_set_node_blockptr(lower, slot, bytenr);
3201 	WARN_ON(trans->transid == 0);
3202 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3203 	btrfs_set_header_nritems(lower, nritems + 1);
3204 	btrfs_mark_buffer_dirty(lower);
3205 }
3206 
3207 /*
3208  * split the node at the specified level in path in two.
3209  * The path is corrected to point to the appropriate node after the split
3210  *
3211  * Before splitting this tries to make some room in the node by pushing
3212  * left and right; if either one works, it returns right away.
3213  *
3214  * returns 0 on success and < 0 on failure
3215  */
3216 static noinline int split_node(struct btrfs_trans_handle *trans,
3217 			       struct btrfs_root *root,
3218 			       struct btrfs_path *path, int level)
3219 {
3220 	struct extent_buffer *c;
3221 	struct extent_buffer *split;
3222 	struct btrfs_disk_key disk_key;
3223 	int mid;
3224 	int ret;
3225 	u32 c_nritems;
3226 
3227 	c = path->nodes[level];
3228 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3229 	if (c == root->node) {
3230 		/* trying to split the root, let's make a new one */
3231 		ret = insert_new_root(trans, root, path, level + 1);
3232 		if (ret)
3233 			return ret;
3234 	} else {
3235 		ret = push_nodes_for_insert(trans, root, path, level);
3236 		c = path->nodes[level];
3237 		if (!ret && btrfs_header_nritems(c) <
3238 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3239 			return 0;
3240 		if (ret < 0)
3241 			return ret;
3242 	}
3243 
3244 	c_nritems = btrfs_header_nritems(c);
3245 	mid = (c_nritems + 1) / 2;
3246 	btrfs_node_key(c, &disk_key, mid);
3247 
3248 	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3249 					root->root_key.objectid,
3250 					&disk_key, level, c->start, 0);
3251 	if (IS_ERR(split))
3252 		return PTR_ERR(split);
3253 
3254 	root_add_used(root, root->nodesize);
3255 
3256 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3257 	btrfs_set_header_level(split, btrfs_header_level(c));
3258 	btrfs_set_header_bytenr(split, split->start);
3259 	btrfs_set_header_generation(split, trans->transid);
3260 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3261 	btrfs_set_header_owner(split, root->root_key.objectid);
3262 	write_extent_buffer(split, root->fs_info->fsid,
3263 			    (unsigned long)btrfs_header_fsid(split),
3264 			    BTRFS_FSID_SIZE);
3265 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3266 			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
3267 			    BTRFS_UUID_SIZE);
3268 
3269 	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
3270 	copy_extent_buffer(split, c,
3271 			   btrfs_node_key_ptr_offset(0),
3272 			   btrfs_node_key_ptr_offset(mid),
3273 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3274 	btrfs_set_header_nritems(split, c_nritems - mid);
3275 	btrfs_set_header_nritems(c, mid);
3276 	ret = 0;
3277 
3278 	btrfs_mark_buffer_dirty(c);
3279 	btrfs_mark_buffer_dirty(split);
3280 
3281 	insert_ptr(trans, root, path, &disk_key, split->start,
3282 		   path->slots[level + 1] + 1, level + 1);
3283 
3284 	if (path->slots[level] >= mid) {
3285 		path->slots[level] -= mid;
3286 		btrfs_tree_unlock(c);
3287 		free_extent_buffer(c);
3288 		path->nodes[level] = split;
3289 		path->slots[level + 1] += 1;
3290 	} else {
3291 		btrfs_tree_unlock(split);
3292 		free_extent_buffer(split);
3293 	}
3294 	return ret;
3295 }
3296 
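/*
 * Editorial sketch, not part of ctree.c: split_node() above moves the
 * upper half [mid, nritems) into the new right sibling, and the path
 * must then follow whichever half holds its original slot.  The fixup,
 * as a hypothetical helper:
 */
static inline int split_follow_right_sketch(int *slot, int *parent_slot,
					    int mid)
{
	if (*slot >= mid) {
		*slot -= mid;		/* slot now lives in the right node */
		*parent_slot += 1;
		return 1;		/* caller keeps 'split', drops 'c' */
	}
	return 0;			/* caller keeps 'c', drops 'split' */
}
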
3297 /*
3298  * how many bytes are required to store the items in a leaf.  start
3299  * and nr indicate which items in the leaf to check.  This totals up the
3300  * space used both by the item structs and the item data
3301  */
3302 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3303 {
3304 	struct btrfs_item *start_item;
3305 	struct btrfs_item *end_item;
3306 	struct btrfs_map_token token;
3307 	int data_len;
3308 	int nritems = btrfs_header_nritems(l);
3309 	int end = min(nritems, start + nr) - 1;
3310 
3311 	if (!nr)
3312 		return 0;
3313 	btrfs_init_map_token(&token);
3314 	start_item = btrfs_item_nr(l, start);
3315 	end_item = btrfs_item_nr(l, end);
3316 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3317 		btrfs_token_item_size(l, start_item, &token);
3318 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3319 	data_len += sizeof(struct btrfs_item) * nr;
3320 	WARN_ON(data_len < 0);
3321 	return data_len;
3322 }
3323 
3324 /*
3325  * The space between the end of the leaf items and
3326  * the start of the leaf data.  IOW, how much room
3327  * the leaf has left for both items and data
3328  */
3329 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3330 				   struct extent_buffer *leaf)
3331 {
3332 	int nritems = btrfs_header_nritems(leaf);
3333 	int ret;
3334 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3335 	if (ret < 0) {
3336 		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3337 		       "used %d nritems %d\n",
3338 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3339 		       leaf_space_used(leaf, 0, nritems), nritems);
3340 	}
3341 	return ret;
3342 }
3343 
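/*
 * Editorial sketch, not part of ctree.c: in a leaf the item headers
 * grow forward from the block header while item data grows backward
 * from the end of the block, so free space is simply what is left in
 * between.  The arithmetic behind leaf_space_used() and
 * btrfs_leaf_free_space() reduces to the following (hypothetical
 * helper):
 */
static inline int leaf_free_space_sketch(int leaf_data_size,
					 int nritems, int data_bytes)
{
	/* data_bytes: total payload size of all nritems items */
	int used = nritems * (int)sizeof(struct btrfs_item) + data_bytes;

	return leaf_data_size - used;
}
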
3344 /*
3345  * min_slot controls the lowest index we're willing to push to the
3346  * right.  We'll push up to and including min_slot, but no lower
3347  */
3348 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3349 				      struct btrfs_root *root,
3350 				      struct btrfs_path *path,
3351 				      int data_size, int empty,
3352 				      struct extent_buffer *right,
3353 				      int free_space, u32 left_nritems,
3354 				      u32 min_slot)
3355 {
3356 	struct extent_buffer *left = path->nodes[0];
3357 	struct extent_buffer *upper = path->nodes[1];
3358 	struct btrfs_map_token token;
3359 	struct btrfs_disk_key disk_key;
3360 	int slot;
3361 	u32 i;
3362 	int push_space = 0;
3363 	int push_items = 0;
3364 	struct btrfs_item *item;
3365 	u32 nr;
3366 	u32 right_nritems;
3367 	u32 data_end;
3368 	u32 this_item_size;
3369 
3370 	btrfs_init_map_token(&token);
3371 
3372 	if (empty)
3373 		nr = 0;
3374 	else
3375 		nr = max_t(u32, 1, min_slot);
3376 
3377 	if (path->slots[0] >= left_nritems)
3378 		push_space += data_size;
3379 
3380 	slot = path->slots[1];
3381 	i = left_nritems - 1;
3382 	while (i >= nr) {
3383 		item = btrfs_item_nr(left, i);
3384 
3385 		if (!empty && push_items > 0) {
3386 			if (path->slots[0] > i)
3387 				break;
3388 			if (path->slots[0] == i) {
3389 				int space = btrfs_leaf_free_space(root, left);
3390 				if (space + push_space * 2 > free_space)
3391 					break;
3392 			}
3393 		}
3394 
3395 		if (path->slots[0] == i)
3396 			push_space += data_size;
3397 
3398 		this_item_size = btrfs_item_size(left, item);
3399 		if (this_item_size + sizeof(*item) + push_space > free_space)
3400 			break;
3401 
3402 		push_items++;
3403 		push_space += this_item_size + sizeof(*item);
3404 		if (i == 0)
3405 			break;
3406 		i--;
3407 	}
3408 
3409 	if (push_items == 0)
3410 		goto out_unlock;
3411 
3412 	WARN_ON(!empty && push_items == left_nritems);
3413 
3414 	/* push left to right */
3415 	right_nritems = btrfs_header_nritems(right);
3416 
3417 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3418 	push_space -= leaf_data_end(root, left);
3419 
3420 	/* make room in the right data area */
3421 	data_end = leaf_data_end(root, right);
3422 	memmove_extent_buffer(right,
3423 			      btrfs_leaf_data(right) + data_end - push_space,
3424 			      btrfs_leaf_data(right) + data_end,
3425 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
3426 
3427 	/* copy from the left data area */
3428 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3429 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3430 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
3431 		     push_space);
3432 
3433 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3434 			      btrfs_item_nr_offset(0),
3435 			      right_nritems * sizeof(struct btrfs_item));
3436 
3437 	/* copy the items from left to right */
3438 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3439 		   btrfs_item_nr_offset(left_nritems - push_items),
3440 		   push_items * sizeof(struct btrfs_item));
3441 
3442 	/* update the item pointers */
3443 	right_nritems += push_items;
3444 	btrfs_set_header_nritems(right, right_nritems);
3445 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3446 	for (i = 0; i < right_nritems; i++) {
3447 		item = btrfs_item_nr(right, i);
3448 		push_space -= btrfs_token_item_size(right, item, &token);
3449 		btrfs_set_token_item_offset(right, item, push_space, &token);
3450 	}
3451 
3452 	left_nritems -= push_items;
3453 	btrfs_set_header_nritems(left, left_nritems);
3454 
3455 	if (left_nritems)
3456 		btrfs_mark_buffer_dirty(left);
3457 	else
3458 		clean_tree_block(trans, root, left);
3459 
3460 	btrfs_mark_buffer_dirty(right);
3461 
3462 	btrfs_item_key(right, &disk_key, 0);
3463 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3464 	btrfs_mark_buffer_dirty(upper);
3465 
3466 	/* then fixup the leaf pointer in the path */
3467 	if (path->slots[0] >= left_nritems) {
3468 		path->slots[0] -= left_nritems;
3469 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3470 			clean_tree_block(trans, root, path->nodes[0]);
3471 		btrfs_tree_unlock(path->nodes[0]);
3472 		free_extent_buffer(path->nodes[0]);
3473 		path->nodes[0] = right;
3474 		path->slots[1] += 1;
3475 	} else {
3476 		btrfs_tree_unlock(right);
3477 		free_extent_buffer(right);
3478 	}
3479 	return 0;
3480 
3481 out_unlock:
3482 	btrfs_tree_unlock(right);
3483 	free_extent_buffer(right);
3484 	return 1;
3485 }
3486 
3487 /*
3488  * push some data in the path leaf to the right, trying to free up at
3489  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3490  *
3491  * returns 1 if the push failed because the other node didn't have enough
3492  * room, 0 if everything worked out and < 0 if there were major errors.
3493  *
3494  * this will push starting from min_slot to the end of the leaf.  It won't
3495  * push any slot lower than min_slot
3496  */
3497 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3498 			   *root, struct btrfs_path *path,
3499 			   int min_data_size, int data_size,
3500 			   int empty, u32 min_slot)
3501 {
3502 	struct extent_buffer *left = path->nodes[0];
3503 	struct extent_buffer *right;
3504 	struct extent_buffer *upper;
3505 	int slot;
3506 	int free_space;
3507 	u32 left_nritems;
3508 	int ret;
3509 
3510 	if (!path->nodes[1])
3511 		return 1;
3512 
3513 	slot = path->slots[1];
3514 	upper = path->nodes[1];
3515 	if (slot >= btrfs_header_nritems(upper) - 1)
3516 		return 1;
3517 
3518 	btrfs_assert_tree_locked(path->nodes[1]);
3519 
3520 	right = read_node_slot(root, upper, slot + 1);
3521 	if (right == NULL)
3522 		return 1;
3523 
3524 	btrfs_tree_lock(right);
3525 	btrfs_set_lock_blocking(right);
3526 
3527 	free_space = btrfs_leaf_free_space(root, right);
3528 	if (free_space < data_size)
3529 		goto out_unlock;
3530 
3531 	/* cow and double check */
3532 	ret = btrfs_cow_block(trans, root, right, upper,
3533 			      slot + 1, &right);
3534 	if (ret)
3535 		goto out_unlock;
3536 
3537 	free_space = btrfs_leaf_free_space(root, right);
3538 	if (free_space < data_size)
3539 		goto out_unlock;
3540 
3541 	left_nritems = btrfs_header_nritems(left);
3542 	if (left_nritems == 0)
3543 		goto out_unlock;
3544 
3545 	return __push_leaf_right(trans, root, path, min_data_size, empty,
3546 				right, free_space, left_nritems, min_slot);
3547 out_unlock:
3548 	btrfs_tree_unlock(right);
3549 	free_extent_buffer(right);
3550 	return 1;
3551 }
3552 
3553 /*
3554  * push some data in the path leaf to the left, trying to free up at
3555  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3556  *
3557  * max_slot can put a limit on how far into the leaf we'll push items.  The
3558  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3559  * items
3560  */
3561 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3562 				     struct btrfs_root *root,
3563 				     struct btrfs_path *path, int data_size,
3564 				     int empty, struct extent_buffer *left,
3565 				     int free_space, u32 right_nritems,
3566 				     u32 max_slot)
3567 {
3568 	struct btrfs_disk_key disk_key;
3569 	struct extent_buffer *right = path->nodes[0];
3570 	int i;
3571 	int push_space = 0;
3572 	int push_items = 0;
3573 	struct btrfs_item *item;
3574 	u32 old_left_nritems;
3575 	u32 nr;
3576 	int ret = 0;
3577 	u32 this_item_size;
3578 	u32 old_left_item_size;
3579 	struct btrfs_map_token token;
3580 
3581 	btrfs_init_map_token(&token);
3582 
3583 	if (empty)
3584 		nr = min(right_nritems, max_slot);
3585 	else
3586 		nr = min(right_nritems - 1, max_slot);
3587 
3588 	for (i = 0; i < nr; i++) {
3589 		item = btrfs_item_nr(right, i);
3590 
3591 		if (!empty && push_items > 0) {
3592 			if (path->slots[0] < i)
3593 				break;
3594 			if (path->slots[0] == i) {
3595 				int space = btrfs_leaf_free_space(root, right);
3596 				if (space + push_space * 2 > free_space)
3597 					break;
3598 			}
3599 		}
3600 
3601 		if (path->slots[0] == i)
3602 			push_space += data_size;
3603 
3604 		this_item_size = btrfs_item_size(right, item);
3605 		if (this_item_size + sizeof(*item) + push_space > free_space)
3606 			break;
3607 
3608 		push_items++;
3609 		push_space += this_item_size + sizeof(*item);
3610 	}
3611 
3612 	if (push_items == 0) {
3613 		ret = 1;
3614 		goto out;
3615 	}
3616 	if (!empty && push_items == btrfs_header_nritems(right))
3617 		WARN_ON(1);
3618 
3619 	/* push data from right to left */
3620 	copy_extent_buffer(left, right,
3621 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3622 			   btrfs_item_nr_offset(0),
3623 			   push_items * sizeof(struct btrfs_item));
3624 
3625 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
3626 		     btrfs_item_offset_nr(right, push_items - 1);
3627 
3628 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3629 		     leaf_data_end(root, left) - push_space,
3630 		     btrfs_leaf_data(right) +
3631 		     btrfs_item_offset_nr(right, push_items - 1),
3632 		     push_space);
3633 	old_left_nritems = btrfs_header_nritems(left);
3634 	BUG_ON(old_left_nritems <= 0);
3635 
3636 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3637 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3638 		u32 ioff;
3639 
3640 		item = btrfs_item_nr(left, i);
3641 
3642 		ioff = btrfs_token_item_offset(left, item, &token);
3643 		btrfs_set_token_item_offset(left, item,
3644 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3645 		      &token);
3646 	}
3647 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3648 
3649 	/* fixup right node */
3650 	if (push_items > right_nritems)
3651 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3652 		       right_nritems);
3653 
3654 	if (push_items < right_nritems) {
3655 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3656 						  leaf_data_end(root, right);
3657 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
3658 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3659 				      btrfs_leaf_data(right) +
3660 				      leaf_data_end(root, right), push_space);
3661 
3662 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3663 			      btrfs_item_nr_offset(push_items),
3664 			     (btrfs_header_nritems(right) - push_items) *
3665 			     sizeof(struct btrfs_item));
3666 	}
3667 	right_nritems -= push_items;
3668 	btrfs_set_header_nritems(right, right_nritems);
3669 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3670 	for (i = 0; i < right_nritems; i++) {
3671 		item = btrfs_item_nr(right, i);
3672 
3673 		push_space = push_space - btrfs_token_item_size(right,
3674 								item, &token);
3675 		btrfs_set_token_item_offset(right, item, push_space, &token);
3676 	}
3677 
3678 	btrfs_mark_buffer_dirty(left);
3679 	if (right_nritems)
3680 		btrfs_mark_buffer_dirty(right);
3681 	else
3682 		clean_tree_block(trans, root, right);
3683 
3684 	btrfs_item_key(right, &disk_key, 0);
3685 	fixup_low_keys(trans, root, path, &disk_key, 1);
3686 
3687 	/* then fixup the leaf pointer in the path */
3688 	if (path->slots[0] < push_items) {
3689 		path->slots[0] += old_left_nritems;
3690 		btrfs_tree_unlock(path->nodes[0]);
3691 		free_extent_buffer(path->nodes[0]);
3692 		path->nodes[0] = left;
3693 		path->slots[1] -= 1;
3694 	} else {
3695 		btrfs_tree_unlock(left);
3696 		free_extent_buffer(left);
3697 		path->slots[0] -= push_items;
3698 	}
3699 	BUG_ON(path->slots[0] < 0);
3700 	return ret;
3701 out:
3702 	btrfs_tree_unlock(left);
3703 	free_extent_buffer(left);
3704 	return ret;
3705 }
3706 
3707 /*
3708  * push some data in the path leaf to the left, trying to free up at
3709  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3710  *
3711  * max_slot can put a limit on how far into the leaf we'll push items.  The
3712  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3713  * items
3714  */
3715 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3716 			  *root, struct btrfs_path *path, int min_data_size,
3717 			  int data_size, int empty, u32 max_slot)
3718 {
3719 	struct extent_buffer *right = path->nodes[0];
3720 	struct extent_buffer *left;
3721 	int slot;
3722 	int free_space;
3723 	u32 right_nritems;
3724 	int ret = 0;
3725 
3726 	slot = path->slots[1];
3727 	if (slot == 0)
3728 		return 1;
3729 	if (!path->nodes[1])
3730 		return 1;
3731 
3732 	right_nritems = btrfs_header_nritems(right);
3733 	if (right_nritems == 0)
3734 		return 1;
3735 
3736 	btrfs_assert_tree_locked(path->nodes[1]);
3737 
3738 	left = read_node_slot(root, path->nodes[1], slot - 1);
3739 	if (left == NULL)
3740 		return 1;
3741 
3742 	btrfs_tree_lock(left);
3743 	btrfs_set_lock_blocking(left);
3744 
3745 	free_space = btrfs_leaf_free_space(root, left);
3746 	if (free_space < data_size) {
3747 		ret = 1;
3748 		goto out;
3749 	}
3750 
3751 	/* cow and double check */
3752 	ret = btrfs_cow_block(trans, root, left,
3753 			      path->nodes[1], slot - 1, &left);
3754 	if (ret) {
3755 		/* we hit -ENOSPC, but it isn't fatal here */
3756 		if (ret == -ENOSPC)
3757 			ret = 1;
3758 		goto out;
3759 	}
3760 
3761 	free_space = btrfs_leaf_free_space(root, left);
3762 	if (free_space < data_size) {
3763 		ret = 1;
3764 		goto out;
3765 	}
3766 
3767 	return __push_leaf_left(trans, root, path, min_data_size,
3768 			       empty, left, free_space, right_nritems,
3769 			       max_slot);
3770 out:
3771 	btrfs_tree_unlock(left);
3772 	free_extent_buffer(left);
3773 	return ret;
3774 }
3775 
3776 /*
3777  * split the path's leaf in two, making sure there is at least data_size
3778  * available for the resulting leaf level of the path.
3779  */
3780 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3781 				    struct btrfs_root *root,
3782 				    struct btrfs_path *path,
3783 				    struct extent_buffer *l,
3784 				    struct extent_buffer *right,
3785 				    int slot, int mid, int nritems)
3786 {
3787 	int data_copy_size;
3788 	int rt_data_off;
3789 	int i;
3790 	struct btrfs_disk_key disk_key;
3791 	struct btrfs_map_token token;
3792 
3793 	btrfs_init_map_token(&token);
3794 
3795 	nritems = nritems - mid;
3796 	btrfs_set_header_nritems(right, nritems);
3797 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
3798 
3799 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3800 			   btrfs_item_nr_offset(mid),
3801 			   nritems * sizeof(struct btrfs_item));
3802 
3803 	copy_extent_buffer(right, l,
3804 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
3805 		     data_copy_size, btrfs_leaf_data(l) +
3806 		     leaf_data_end(root, l), data_copy_size);
3807 
3808 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
3809 		      btrfs_item_end_nr(l, mid);
3810 
3811 	for (i = 0; i < nritems; i++) {
3812 		struct btrfs_item *item = btrfs_item_nr(right, i);
3813 		u32 ioff;
3814 
3815 		ioff = btrfs_token_item_offset(right, item, &token);
3816 		btrfs_set_token_item_offset(right, item,
3817 					    ioff + rt_data_off, &token);
3818 	}
3819 
3820 	btrfs_set_header_nritems(l, mid);
3821 	btrfs_item_key(right, &disk_key, 0);
3822 	insert_ptr(trans, root, path, &disk_key, right->start,
3823 		   path->slots[1] + 1, 1);
3824 
3825 	btrfs_mark_buffer_dirty(right);
3826 	btrfs_mark_buffer_dirty(l);
3827 	BUG_ON(path->slots[0] != slot);
3828 
3829 	if (mid <= slot) {
3830 		btrfs_tree_unlock(path->nodes[0]);
3831 		free_extent_buffer(path->nodes[0]);
3832 		path->nodes[0] = right;
3833 		path->slots[0] -= mid;
3834 		path->slots[1] += 1;
3835 	} else {
3836 		btrfs_tree_unlock(right);
3837 		free_extent_buffer(right);
3838 	}
3839 
3840 	BUG_ON(path->slots[0] < 0);
3841 }
3842 
3843 /*
3844  * double splits happen when we need to insert a big item in the middle
3845  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3846  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3847  *          A                 B                 C
3848  *
3849  * We avoid this by trying to push the items on either side of our target
3850  * into the adjacent leaves.  If all goes well we can avoid the double split
3851  * completely.
3852  */
3853 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3854 					  struct btrfs_root *root,
3855 					  struct btrfs_path *path,
3856 					  int data_size)
3857 {
3858 	int ret;
3859 	int progress = 0;
3860 	int slot;
3861 	u32 nritems;
3862 
3863 	slot = path->slots[0];
3864 
3865 	/*
3866 	 * try to push all the items after our slot into the
3867 	 * right leaf
3868 	 */
3869 	ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3870 	if (ret < 0)
3871 		return ret;
3872 
3873 	if (ret == 0)
3874 		progress++;
3875 
3876 	nritems = btrfs_header_nritems(path->nodes[0]);
3877 	/*
3878 	 * our goal is to get our slot at the start or end of a leaf.  If
3879 	 * we've done so we're done
3880 	 */
3881 	if (path->slots[0] == 0 || path->slots[0] == nritems)
3882 		return 0;
3883 
3884 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3885 		return 0;
3886 
3887 	/* try to push all the items before our slot into the next leaf */
3888 	slot = path->slots[0];
3889 	ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3890 	if (ret < 0)
3891 		return ret;
3892 
3893 	if (ret == 0)
3894 		progress++;
3895 
3896 	if (progress)
3897 		return 0;
3898 	return 1;
3899 }
3900 
3901 /*
3902  * split the path's leaf in two, making sure there is at least data_size
3903  * available for the resulting leaf level of the path.
3904  *
3905  * returns 0 if all went well and < 0 on failure.
3906  */
3907 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3908 			       struct btrfs_root *root,
3909 			       struct btrfs_key *ins_key,
3910 			       struct btrfs_path *path, int data_size,
3911 			       int extend)
3912 {
3913 	struct btrfs_disk_key disk_key;
3914 	struct extent_buffer *l;
3915 	u32 nritems;
3916 	int mid;
3917 	int slot;
3918 	struct extent_buffer *right;
3919 	int ret = 0;
3920 	int wret;
3921 	int split;
3922 	int num_doubles = 0;
3923 	int tried_avoid_double = 0;
3924 
3925 	l = path->nodes[0];
3926 	slot = path->slots[0];
3927 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
3928 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
3929 		return -EOVERFLOW;
3930 
3931 	/* first try to make some room by pushing left and right */
3932 	if (data_size) {
3933 		wret = push_leaf_right(trans, root, path, data_size,
3934 				       data_size, 0, 0);
3935 		if (wret < 0)
3936 			return wret;
3937 		if (wret) {
3938 			wret = push_leaf_left(trans, root, path, data_size,
3939 					      data_size, 0, (u32)-1);
3940 			if (wret < 0)
3941 				return wret;
3942 		}
3943 		l = path->nodes[0];
3944 
3945 		/* did the pushes work? */
3946 		if (btrfs_leaf_free_space(root, l) >= data_size)
3947 			return 0;
3948 	}
3949 
3950 	if (!path->nodes[1]) {
3951 		ret = insert_new_root(trans, root, path, 1);
3952 		if (ret)
3953 			return ret;
3954 	}
3955 again:
3956 	split = 1;
3957 	l = path->nodes[0];
3958 	slot = path->slots[0];
3959 	nritems = btrfs_header_nritems(l);
3960 	mid = (nritems + 1) / 2;
3961 
3962 	if (mid <= slot) {
3963 		if (nritems == 1 ||
3964 		    leaf_space_used(l, mid, nritems - mid) + data_size >
3965 			BTRFS_LEAF_DATA_SIZE(root)) {
3966 			if (slot >= nritems) {
3967 				split = 0;
3968 			} else {
3969 				mid = slot;
3970 				if (mid != nritems &&
3971 				    leaf_space_used(l, mid, nritems - mid) +
3972 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3973 					if (data_size && !tried_avoid_double)
3974 						goto push_for_double;
3975 					split = 2;
3976 				}
3977 			}
3978 		}
3979 	} else {
3980 		if (leaf_space_used(l, 0, mid) + data_size >
3981 			BTRFS_LEAF_DATA_SIZE(root)) {
3982 			if (!extend && data_size && slot == 0) {
3983 				split = 0;
3984 			} else if ((extend || !data_size) && slot == 0) {
3985 				mid = 1;
3986 			} else {
3987 				mid = slot;
3988 				if (mid != nritems &&
3989 				    leaf_space_used(l, mid, nritems - mid) +
3990 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3991 					if (data_size && !tried_avoid_double)
3992 						goto push_for_double;
3993 					split = 2;
3994 				}
3995 			}
3996 		}
3997 	}
3998 
3999 	if (split == 0)
4000 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4001 	else
4002 		btrfs_item_key(l, &disk_key, mid);
4003 
4004 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
4005 					root->root_key.objectid,
4006 					&disk_key, 0, l->start, 0);
4007 	if (IS_ERR(right))
4008 		return PTR_ERR(right);
4009 
4010 	root_add_used(root, root->leafsize);
4011 
4012 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4013 	btrfs_set_header_bytenr(right, right->start);
4014 	btrfs_set_header_generation(right, trans->transid);
4015 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4016 	btrfs_set_header_owner(right, root->root_key.objectid);
4017 	btrfs_set_header_level(right, 0);
4018 	write_extent_buffer(right, root->fs_info->fsid,
4019 			    (unsigned long)btrfs_header_fsid(right),
4020 			    BTRFS_FSID_SIZE);
4021 
4022 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4023 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
4024 			    BTRFS_UUID_SIZE);
4025 
4026 	if (split == 0) {
4027 		if (mid <= slot) {
4028 			btrfs_set_header_nritems(right, 0);
4029 			insert_ptr(trans, root, path, &disk_key, right->start,
4030 				   path->slots[1] + 1, 1);
4031 			btrfs_tree_unlock(path->nodes[0]);
4032 			free_extent_buffer(path->nodes[0]);
4033 			path->nodes[0] = right;
4034 			path->slots[0] = 0;
4035 			path->slots[1] += 1;
4036 		} else {
4037 			btrfs_set_header_nritems(right, 0);
4038 			insert_ptr(trans, root, path, &disk_key, right->start,
4039 					  path->slots[1], 1);
4040 			btrfs_tree_unlock(path->nodes[0]);
4041 			free_extent_buffer(path->nodes[0]);
4042 			path->nodes[0] = right;
4043 			path->slots[0] = 0;
4044 			if (path->slots[1] == 0)
4045 				fixup_low_keys(trans, root, path,
4046 					       &disk_key, 1);
4047 		}
4048 		btrfs_mark_buffer_dirty(right);
4049 		return ret;
4050 	}
4051 
4052 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4053 
4054 	if (split == 2) {
4055 		BUG_ON(num_doubles != 0);
4056 		num_doubles++;
4057 		goto again;
4058 	}
4059 
4060 	return 0;
4061 
4062 push_for_double:
4063 	push_for_double_split(trans, root, path, data_size);
4064 	tried_avoid_double = 1;
4065 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4066 		return 0;
4067 	goto again;
4068 }
4069 
4070 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4071 					 struct btrfs_root *root,
4072 					 struct btrfs_path *path, int ins_len)
4073 {
4074 	struct btrfs_key key;
4075 	struct extent_buffer *leaf;
4076 	struct btrfs_file_extent_item *fi;
4077 	u64 extent_len = 0;
4078 	u32 item_size;
4079 	int ret;
4080 
4081 	leaf = path->nodes[0];
4082 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4083 
4084 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4085 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4086 
4087 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4088 		return 0;
4089 
4090 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4091 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4092 		fi = btrfs_item_ptr(leaf, path->slots[0],
4093 				    struct btrfs_file_extent_item);
4094 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4095 	}
4096 	btrfs_release_path(path);
4097 
4098 	path->keep_locks = 1;
4099 	path->search_for_split = 1;
4100 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4101 	path->search_for_split = 0;
4102 	if (ret > 0)
4103 		ret = -EAGAIN;
4104 	if (ret < 0)
4105 		goto err;
4106 
4107 	ret = -EAGAIN;
4108 	leaf = path->nodes[0];
4109 	/* if our item isn't there or got smaller, return now */
4110 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4111 		goto err;
4110 
4111 	/* the leaf has changed, it now has room.  return now */
4112 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4113 		goto err;
4114 
4115 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4116 		fi = btrfs_item_ptr(leaf, path->slots[0],
4117 				    struct btrfs_file_extent_item);
4118 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4119 			goto err;
4120 	}
4121 
4122 	btrfs_set_path_blocking(path);
4123 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4124 	if (ret)
4125 		goto err;
4126 
4127 	path->keep_locks = 0;
4128 	btrfs_unlock_up_safe(path, 1);
4129 	return 0;
4130 err:
4131 	path->keep_locks = 0;
4132 	return ret;
4133 }
4134 
4135 static noinline int split_item(struct btrfs_trans_handle *trans,
4136 			       struct btrfs_root *root,
4137 			       struct btrfs_path *path,
4138 			       struct btrfs_key *new_key,
4139 			       unsigned long split_offset)
4140 {
4141 	struct extent_buffer *leaf;
4142 	struct btrfs_item *item;
4143 	struct btrfs_item *new_item;
4144 	int slot;
4145 	char *buf;
4146 	u32 nritems;
4147 	u32 item_size;
4148 	u32 orig_offset;
4149 	struct btrfs_disk_key disk_key;
4150 
4151 	leaf = path->nodes[0];
4152 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4153 
4154 	btrfs_set_path_blocking(path);
4155 
4156 	item = btrfs_item_nr(leaf, path->slots[0]);
4157 	orig_offset = btrfs_item_offset(leaf, item);
4158 	item_size = btrfs_item_size(leaf, item);
4159 
4160 	buf = kmalloc(item_size, GFP_NOFS);
4161 	if (!buf)
4162 		return -ENOMEM;
4163 
4164 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4165 			    path->slots[0]), item_size);
4166 
4167 	slot = path->slots[0] + 1;
4168 	nritems = btrfs_header_nritems(leaf);
4169 	if (slot != nritems) {
4170 		/* shift the items */
4171 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4172 				btrfs_item_nr_offset(slot),
4173 				(nritems - slot) * sizeof(struct btrfs_item));
4174 	}
4175 
4176 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4177 	btrfs_set_item_key(leaf, &disk_key, slot);
4178 
4179 	new_item = btrfs_item_nr(leaf, slot);
4180 
4181 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4182 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4183 
4184 	btrfs_set_item_offset(leaf, item,
4185 			      orig_offset + item_size - split_offset);
4186 	btrfs_set_item_size(leaf, item, split_offset);
4187 
4188 	btrfs_set_header_nritems(leaf, nritems + 1);
4189 
4190 	/* write the data for the start of the original item */
4191 	write_extent_buffer(leaf, buf,
4192 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4193 			    split_offset);
4194 
4195 	/* write the data for the new item */
4196 	write_extent_buffer(leaf, buf + split_offset,
4197 			    btrfs_item_ptr_offset(leaf, slot),
4198 			    item_size - split_offset);
4199 	btrfs_mark_buffer_dirty(leaf);
4200 
4201 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4202 	kfree(buf);
4203 	return 0;
4204 }
4205 
4206 /*
4207  * This function splits a single item into two items,
4208  * giving 'new_key' to the new item and splitting the
4209  * old one at split_offset (from the start of the item).
4210  *
4211  * The path may be released by this operation.  After
4212  * the split, the path is pointing to the old item.  The
4213  * new item is going to be in the same node as the old one.
4214  *
4215  * Note: the item being split must be small enough to live alone on
4216  * a tree block with room for one extra struct btrfs_item
4217  *
4218  * This allows us to split the item in place, keeping a lock on the
4219  * leaf the entire time.
4220  */
4221 int btrfs_split_item(struct btrfs_trans_handle *trans,
4222 		     struct btrfs_root *root,
4223 		     struct btrfs_path *path,
4224 		     struct btrfs_key *new_key,
4225 		     unsigned long split_offset)
4226 {
4227 	int ret;
4228 	ret = setup_leaf_for_split(trans, root, path,
4229 				   sizeof(struct btrfs_item));
4230 	if (ret)
4231 		return ret;
4232 
4233 	ret = split_item(trans, root, path, new_key, split_offset);
4234 	return ret;
4235 }
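
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a hypothetical caller of btrfs_split_item().  Note that
 * setup_leaf_for_split() only accepts extent data and csum items, and
 * that the whole operation can return -EAGAIN if the item changed while
 * the leaf was being prepared, in which case callers typically retry.
 * The key values are placeholders.
 */
static int example_split_at(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_key *key,
			    struct btrfs_key *tail_key,
			    unsigned long split_offset)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret == 0) {
		/*
		 * on success the path still points at the front half;
		 * the tail half sits at path->slots[0] + 1 under
		 * *tail_key
		 */
		ret = btrfs_split_item(trans, root, path, tail_key,
				       split_offset);
	}
	btrfs_free_path(path);
	return ret;
}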
4236 
4237 /*
4238  * This function duplicates an item, giving 'new_key' to the new item.
4239  * It guarantees both items live in the same tree leaf and the new item
4240  * is contiguous with the original item.
4241  *
4242  * This allows us to split a file extent in place, keeping a lock on the
4243  * leaf the entire time.
4244  */
4245 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4246 			 struct btrfs_root *root,
4247 			 struct btrfs_path *path,
4248 			 struct btrfs_key *new_key)
4249 {
4250 	struct extent_buffer *leaf;
4251 	int ret;
4252 	u32 item_size;
4253 
4254 	leaf = path->nodes[0];
4255 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4256 	ret = setup_leaf_for_split(trans, root, path,
4257 				   item_size + sizeof(struct btrfs_item));
4258 	if (ret)
4259 		return ret;
4260 
4261 	path->slots[0]++;
4262 	setup_items_for_insert(trans, root, path, new_key, &item_size,
4263 			       item_size, item_size +
4264 			       sizeof(struct btrfs_item), 1);
4265 	leaf = path->nodes[0];
4266 	memcpy_extent_buffer(leaf,
4267 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4268 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4269 			     item_size);
4270 	return 0;
4271 }
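
/*
 * Illustrative sketch (editor's addition): the usual shape of a file
 * extent split built on btrfs_duplicate_item().  The path is assumed to
 * already point at the extent item for 'key'; 'split' is the byte offset
 * within the extent where the second half should begin.  The fixups of
 * the two btrfs_file_extent_item payloads are elided.
 */
static int example_dup_extent_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct btrfs_key *key, u64 split)
{
	struct btrfs_key new_key;
	int ret;

	new_key.objectid = key->objectid;
	new_key.type = key->type;
	new_key.offset = key->offset + split;

	/* may return -EAGAIN if the leaf changed; callers re-search */
	ret = btrfs_duplicate_item(trans, root, path, &new_key);
	if (ret)
		return ret;

	/*
	 * path->slots[0] now points at the copy, the original is one
	 * slot earlier; both still carry the full extent payload and
	 * must be trimmed by the caller.
	 */
	return 0;
}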
4272 
4273 /*
4274  * make the item pointed to by the path smaller.  new_size indicates
4275  * how small to make it, and from_end tells us if we just chop bytes
4276  * off the end of the item or if we shift the item to chop bytes off
4277  * the front.
4278  */
4279 void btrfs_truncate_item(struct btrfs_trans_handle *trans,
4280 			 struct btrfs_root *root,
4281 			 struct btrfs_path *path,
4282 			 u32 new_size, int from_end)
4283 {
4284 	int slot;
4285 	struct extent_buffer *leaf;
4286 	struct btrfs_item *item;
4287 	u32 nritems;
4288 	unsigned int data_end;
4289 	unsigned int old_data_start;
4290 	unsigned int old_size;
4291 	unsigned int size_diff;
4292 	int i;
4293 	struct btrfs_map_token token;
4294 
4295 	btrfs_init_map_token(&token);
4296 
4297 	leaf = path->nodes[0];
4298 	slot = path->slots[0];
4299 
4300 	old_size = btrfs_item_size_nr(leaf, slot);
4301 	if (old_size == new_size)
4302 		return;
4303 
4304 	nritems = btrfs_header_nritems(leaf);
4305 	data_end = leaf_data_end(root, leaf);
4306 
4307 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4308 
4309 	size_diff = old_size - new_size;
4310 
4311 	BUG_ON(slot < 0);
4312 	BUG_ON(slot >= nritems);
4313 
4314 	/*
4315 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4316 	 */
4317 	/* first correct the data pointers */
4318 	for (i = slot; i < nritems; i++) {
4319 		u32 ioff;
4320 		item = btrfs_item_nr(leaf, i);
4321 
4322 		ioff = btrfs_token_item_offset(leaf, item, &token);
4323 		btrfs_set_token_item_offset(leaf, item,
4324 					    ioff + size_diff, &token);
4325 	}
4326 
4327 	/* shift the data */
4328 	if (from_end) {
4329 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4330 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4331 			      data_end, old_data_start + new_size - data_end);
4332 	} else {
4333 		struct btrfs_disk_key disk_key;
4334 		u64 offset;
4335 
4336 		btrfs_item_key(leaf, &disk_key, slot);
4337 
4338 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4339 			unsigned long ptr;
4340 			struct btrfs_file_extent_item *fi;
4341 
4342 			fi = btrfs_item_ptr(leaf, slot,
4343 					    struct btrfs_file_extent_item);
4344 			fi = (struct btrfs_file_extent_item *)(
4345 			     (unsigned long)fi - size_diff);
4346 
4347 			if (btrfs_file_extent_type(leaf, fi) ==
4348 			    BTRFS_FILE_EXTENT_INLINE) {
4349 				ptr = btrfs_item_ptr_offset(leaf, slot);
4350 				memmove_extent_buffer(leaf, ptr,
4351 				      (unsigned long)fi,
4352 				      offsetof(struct btrfs_file_extent_item,
4353 						 disk_bytenr));
4354 			}
4355 		}
4356 
4357 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4358 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4359 			      data_end, old_data_start - data_end);
4360 
4361 		offset = btrfs_disk_key_offset(&disk_key);
4362 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4363 		btrfs_set_item_key(leaf, &disk_key, slot);
4364 		if (slot == 0)
4365 			fixup_low_keys(trans, root, path, &disk_key, 1);
4366 	}
4367 
4368 	item = btrfs_item_nr(leaf, slot);
4369 	btrfs_set_item_size(leaf, item, new_size);
4370 	btrfs_mark_buffer_dirty(leaf);
4371 
4372 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4373 		btrfs_print_leaf(root, leaf);
4374 		BUG();
4375 	}
4376 }
4377 
4378 /*
4379  * make the item pointed to by the path bigger, data_size is the new size.
4380  */
4381 void btrfs_extend_item(struct btrfs_trans_handle *trans,
4382 		       struct btrfs_root *root, struct btrfs_path *path,
4383 		       u32 data_size)
4384 {
4385 	int slot;
4386 	struct extent_buffer *leaf;
4387 	struct btrfs_item *item;
4388 	u32 nritems;
4389 	unsigned int data_end;
4390 	unsigned int old_data;
4391 	unsigned int old_size;
4392 	int i;
4393 	struct btrfs_map_token token;
4394 
4395 	btrfs_init_map_token(&token);
4396 
4397 	leaf = path->nodes[0];
4398 
4399 	nritems = btrfs_header_nritems(leaf);
4400 	data_end = leaf_data_end(root, leaf);
4401 
4402 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4403 		btrfs_print_leaf(root, leaf);
4404 		BUG();
4405 	}
4406 	slot = path->slots[0];
4407 	old_data = btrfs_item_end_nr(leaf, slot);
4408 
4409 	BUG_ON(slot < 0);
4410 	if (slot >= nritems) {
4411 		btrfs_print_leaf(root, leaf);
4412 		printk(KERN_CRIT "slot %d too large, nritems %d\n",
4413 		       slot, nritems);
4414 		BUG_ON(1);
4415 	}
4416 
4417 	/*
4418 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4419 	 */
4420 	/* first correct the data pointers */
4421 	for (i = slot; i < nritems; i++) {
4422 		u32 ioff;
4423 		item = btrfs_item_nr(leaf, i);
4424 
4425 		ioff = btrfs_token_item_offset(leaf, item, &token);
4426 		btrfs_set_token_item_offset(leaf, item,
4427 					    ioff - data_size, &token);
4428 	}
4429 
4430 	/* shift the data */
4431 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4432 		      data_end - data_size, btrfs_leaf_data(leaf) +
4433 		      data_end, old_data - data_end);
4434 
4435 	data_end = old_data;
4436 	old_size = btrfs_item_size_nr(leaf, slot);
4437 	item = btrfs_item_nr(leaf, slot);
4438 	btrfs_set_item_size(leaf, item, old_size + data_size);
4439 	btrfs_mark_buffer_dirty(leaf);
4440 
4441 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4442 		btrfs_print_leaf(root, leaf);
4443 		BUG();
4444 	}
4445 }
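
/*
 * Illustrative sketch (editor's addition): appending 'grow' bytes to an
 * existing item with btrfs_extend_item().  The caller is assumed to have
 * searched with cow=1 and reserved enough ins_len that the leaf is
 * guaranteed to have room, since btrfs_extend_item() BUGs otherwise.
 */
static void example_append_bytes(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 const void *data, u32 grow)
{
	struct extent_buffer *leaf = path->nodes[0];
	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);
	unsigned long ptr;

	btrfs_extend_item(trans, root, path, grow);

	/*
	 * the existing payload stays at the start of the item; the new
	 * bytes open up at its end
	 */
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, data, ptr + old_size, grow);
	btrfs_mark_buffer_dirty(leaf);
}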
4446 
4447 /*
4448  * this is a helper for btrfs_insert_empty_items, the main goal here is
4449  * to save stack depth by doing the bulk of the work in a function
4450  * that doesn't call btrfs_search_slot
4451  */
4452 void setup_items_for_insert(struct btrfs_trans_handle *trans,
4453 			    struct btrfs_root *root, struct btrfs_path *path,
4454 			    struct btrfs_key *cpu_key, u32 *data_size,
4455 			    u32 total_data, u32 total_size, int nr)
4456 {
4457 	struct btrfs_item *item;
4458 	int i;
4459 	u32 nritems;
4460 	unsigned int data_end;
4461 	struct btrfs_disk_key disk_key;
4462 	struct extent_buffer *leaf;
4463 	int slot;
4464 	struct btrfs_map_token token;
4465 
4466 	btrfs_init_map_token(&token);
4467 
4468 	leaf = path->nodes[0];
4469 	slot = path->slots[0];
4470 
4471 	nritems = btrfs_header_nritems(leaf);
4472 	data_end = leaf_data_end(root, leaf);
4473 
4474 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4475 		btrfs_print_leaf(root, leaf);
4476 		printk(KERN_CRIT "not enough freespace need %u have %d\n",
4477 		       total_size, btrfs_leaf_free_space(root, leaf));
4478 		BUG();
4479 	}
4480 
4481 	if (slot != nritems) {
4482 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4483 
4484 		if (old_data < data_end) {
4485 			btrfs_print_leaf(root, leaf);
4486 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4487 			       slot, old_data, data_end);
4488 			BUG_ON(1);
4489 		}
4490 		/*
4491 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4492 		 */
4493 		/* first correct the data pointers */
4494 		for (i = slot; i < nritems; i++) {
4495 			u32 ioff;
4496 
4497 			item = btrfs_item_nr(leaf, i);
4498 			ioff = btrfs_token_item_offset(leaf, item, &token);
4499 			btrfs_set_token_item_offset(leaf, item,
4500 						    ioff - total_data, &token);
4501 		}
4502 		/* shift the items */
4503 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4504 			      btrfs_item_nr_offset(slot),
4505 			      (nritems - slot) * sizeof(struct btrfs_item));
4506 
4507 		/* shift the data */
4508 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4509 			      data_end - total_data, btrfs_leaf_data(leaf) +
4510 			      data_end, old_data - data_end);
4511 		data_end = old_data;
4512 	}
4513 
4514 	/* setup the item for the new data */
4515 	for (i = 0; i < nr; i++) {
4516 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4517 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4518 		item = btrfs_item_nr(leaf, slot + i);
4519 		btrfs_set_token_item_offset(leaf, item,
4520 					    data_end - data_size[i], &token);
4521 		data_end -= data_size[i];
4522 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4523 	}
4524 
4525 	btrfs_set_header_nritems(leaf, nritems + nr);
4526 
4527 	if (slot == 0) {
4528 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4529 		fixup_low_keys(trans, root, path, &disk_key, 1);
4530 	}
4531 	btrfs_unlock_up_safe(path, 1);
4532 	btrfs_mark_buffer_dirty(leaf);
4533 
4534 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4535 		btrfs_print_leaf(root, leaf);
4536 		BUG();
4537 	}
4538 }
4539 
4540 /*
4541  * Given a key and some data, insert items into the tree.
4542  * This does all the path init required, making room in the tree if needed.
4543  */
4544 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4545 			    struct btrfs_root *root,
4546 			    struct btrfs_path *path,
4547 			    struct btrfs_key *cpu_key, u32 *data_size,
4548 			    int nr)
4549 {
4550 	int ret = 0;
4551 	int slot;
4552 	int i;
4553 	u32 total_size = 0;
4554 	u32 total_data = 0;
4555 
4556 	for (i = 0; i < nr; i++)
4557 		total_data += data_size[i];
4558 
4559 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4560 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4561 	if (ret == 0)
4562 		return -EEXIST;
4563 	if (ret < 0)
4564 		return ret;
4565 
4566 	slot = path->slots[0];
4567 	BUG_ON(slot < 0);
4568 
4569 	setup_items_for_insert(trans, root, path, cpu_key, data_size,
4570 			       total_data, total_size, nr);
4571 	return 0;
4572 }
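
/*
 * Illustrative sketch (editor's addition): inserting two items with
 * consecutive keys through a single search and a single leaf
 * reservation.  The key type and payload sizes are placeholders; keys
 * must be in ascending order and everything must fit in one leaf.
 */
static int example_insert_two(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      u64 objectid, u8 type)
{
	struct btrfs_path *path;
	struct btrfs_key keys[2];
	u32 sizes[2] = { 16, 32 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	keys[0].objectid = objectid;
	keys[0].type = type;
	keys[0].offset = 0;
	keys[1] = keys[0];
	keys[1].offset = 1;

	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (ret == 0) {
		/*
		 * path points at the first new item; fill both bodies
		 * with write_extent_buffer() before releasing the path
		 */
		btrfs_mark_buffer_dirty(path->nodes[0]);
	}
	btrfs_free_path(path);
	return ret;
}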
4573 
4574 /*
4575  * Given a key and some data, insert an item into the tree.
4576  * This does all the path init required, making room in the tree if needed.
4577  */
4578 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4579 		      *root, struct btrfs_key *cpu_key, void *data, u32
4580 		      data_size)
4581 {
4582 	int ret = 0;
4583 	struct btrfs_path *path;
4584 	struct extent_buffer *leaf;
4585 	unsigned long ptr;
4586 
4587 	path = btrfs_alloc_path();
4588 	if (!path)
4589 		return -ENOMEM;
4590 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4591 	if (!ret) {
4592 		leaf = path->nodes[0];
4593 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4594 		write_extent_buffer(leaf, data, ptr, data_size);
4595 		btrfs_mark_buffer_dirty(leaf);
4596 	}
4597 	btrfs_free_path(path);
4598 	return ret;
4599 }
4600 
4601 /*
4602  * delete the pointer from a given node.
4603  *
4604  * the tree should have been previously balanced so the deletion does not
4605  * empty a node.
4606  */
4607 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4608 		    struct btrfs_path *path, int level, int slot)
4609 {
4610 	struct extent_buffer *parent = path->nodes[level];
4611 	u32 nritems;
4612 	int ret;
4613 
4614 	nritems = btrfs_header_nritems(parent);
4615 	if (slot != nritems - 1) {
4616 		if (level)
4617 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4618 					     slot + 1, nritems - slot - 1);
4619 		memmove_extent_buffer(parent,
4620 			      btrfs_node_key_ptr_offset(slot),
4621 			      btrfs_node_key_ptr_offset(slot + 1),
4622 			      sizeof(struct btrfs_key_ptr) *
4623 			      (nritems - slot - 1));
4624 	} else if (level) {
4625 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4626 					      MOD_LOG_KEY_REMOVE);
4627 		BUG_ON(ret < 0);
4628 	}
4629 
4630 	nritems--;
4631 	btrfs_set_header_nritems(parent, nritems);
4632 	if (nritems == 0 && parent == root->node) {
4633 		BUG_ON(btrfs_header_level(root->node) != 1);
4634 		/* just turn the root into a leaf and break */
4635 		btrfs_set_header_level(root->node, 0);
4636 	} else if (slot == 0) {
4637 		struct btrfs_disk_key disk_key;
4638 
4639 		btrfs_node_key(parent, &disk_key, 0);
4640 		fixup_low_keys(trans, root, path, &disk_key, level + 1);
4641 	}
4642 	btrfs_mark_buffer_dirty(parent);
4643 }
4644 
4645 /*
4646  * a helper function to delete the leaf pointed to by path->slots[1] and
4647  * path->nodes[1].
4648  *
4649  * This deletes the pointer in path->nodes[1] and frees the leaf
4650  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4651  *
4652  * The path must have already been setup for deleting the leaf, including
4653  * all the proper balancing.  path->nodes[1] must be locked.
4654  */
4655 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4656 				    struct btrfs_root *root,
4657 				    struct btrfs_path *path,
4658 				    struct extent_buffer *leaf)
4659 {
4660 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4661 	del_ptr(trans, root, path, 1, path->slots[1]);
4662 
4663 	/*
4664 	 * btrfs_free_extent is expensive, we want to make sure we
4665 	 * aren't holding any locks when we call it
4666 	 */
4667 	btrfs_unlock_up_safe(path, 0);
4668 
4669 	root_sub_used(root, leaf->len);
4670 
4671 	extent_buffer_get(leaf);
4672 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4673 	free_extent_buffer_stale(leaf);
4674 }

4675 /*
4676  * delete the item at the leaf level in path.  If that empties
4677  * the leaf, remove it from the tree
4678  */
4679 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4680 		    struct btrfs_path *path, int slot, int nr)
4681 {
4682 	struct extent_buffer *leaf;
4683 	struct btrfs_item *item;
4684 	int last_off;
4685 	int dsize = 0;
4686 	int ret = 0;
4687 	int wret;
4688 	int i;
4689 	u32 nritems;
4690 	struct btrfs_map_token token;
4691 
4692 	btrfs_init_map_token(&token);
4693 
4694 	leaf = path->nodes[0];
4695 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4696 
4697 	for (i = 0; i < nr; i++)
4698 		dsize += btrfs_item_size_nr(leaf, slot + i);
4699 
4700 	nritems = btrfs_header_nritems(leaf);
4701 
4702 	if (slot + nr != nritems) {
4703 		int data_end = leaf_data_end(root, leaf);
4704 
4705 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4706 			      data_end + dsize,
4707 			      btrfs_leaf_data(leaf) + data_end,
4708 			      last_off - data_end);
4709 
4710 		for (i = slot + nr; i < nritems; i++) {
4711 			u32 ioff;
4712 
4713 			item = btrfs_item_nr(leaf, i);
4714 			ioff = btrfs_token_item_offset(leaf, item, &token);
4715 			btrfs_set_token_item_offset(leaf, item,
4716 						    ioff + dsize, &token);
4717 		}
4718 
4719 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4720 			      btrfs_item_nr_offset(slot + nr),
4721 			      sizeof(struct btrfs_item) *
4722 			      (nritems - slot - nr));
4723 	}
4724 	btrfs_set_header_nritems(leaf, nritems - nr);
4725 	nritems -= nr;
4726 
4727 	/* delete the leaf if we've emptied it */
4728 	if (nritems == 0) {
4729 		if (leaf == root->node) {
4730 			btrfs_set_header_level(leaf, 0);
4731 		} else {
4732 			btrfs_set_path_blocking(path);
4733 			clean_tree_block(trans, root, leaf);
4734 			btrfs_del_leaf(trans, root, path, leaf);
4735 		}
4736 	} else {
4737 		int used = leaf_space_used(leaf, 0, nritems);
4738 		if (slot == 0) {
4739 			struct btrfs_disk_key disk_key;
4740 
4741 			btrfs_item_key(leaf, &disk_key, 0);
4742 			fixup_low_keys(trans, root, path, &disk_key, 1);
4743 		}
4744 
4745 		/* delete the leaf if it is mostly empty */
4746 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4747 			/* push_leaf_left fixes the path.
4748 			 * make sure the path still points to our leaf
4749 			 * for possible call to del_ptr below
4750 			 */
4751 			slot = path->slots[1];
4752 			extent_buffer_get(leaf);
4753 
4754 			btrfs_set_path_blocking(path);
4755 			wret = push_leaf_left(trans, root, path, 1, 1,
4756 					      1, (u32)-1);
4757 			if (wret < 0 && wret != -ENOSPC)
4758 				ret = wret;
4759 
4760 			if (path->nodes[0] == leaf &&
4761 			    btrfs_header_nritems(leaf)) {
4762 				wret = push_leaf_right(trans, root, path, 1,
4763 						       1, 1, 0);
4764 				if (wret < 0 && wret != -ENOSPC)
4765 					ret = wret;
4766 			}
4767 
4768 			if (btrfs_header_nritems(leaf) == 0) {
4769 				path->slots[1] = slot;
4770 				btrfs_del_leaf(trans, root, path, leaf);
4771 				free_extent_buffer(leaf);
4772 				ret = 0;
4773 			} else {
4774 				/* if we're still in the path, make sure
4775 				 * we're dirty.  Otherwise, one of the
4776 				 * push_leaf functions must have already
4777 				 * dirtied this buffer
4778 				 */
4779 				if (path->nodes[0] == leaf)
4780 					btrfs_mark_buffer_dirty(leaf);
4781 				free_extent_buffer(leaf);
4782 			}
4783 		} else {
4784 			btrfs_mark_buffer_dirty(leaf);
4785 		}
4786 	}
4787 	return ret;
4788 }
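
/*
 * Illustrative sketch (editor's addition): deleting a single item by
 * key.  Passing ins_len = -1 to btrfs_search_slot() asks it to
 * pre-balance the path for a deletion; nr > 1 would drop several
 * adjacent items in one pass.
 */
static int example_delete_one(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret == 0)
		ret = btrfs_del_items(trans, root, path,
				      path->slots[0], 1);
	btrfs_free_path(path);
	return ret;
}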
4789 
4790 /*
4791  * search the tree again to find a leaf with lesser keys
4792  * returns 0 if it found something or 1 if there are no lesser leaves.
4793  * returns < 0 on io errors.
4794  *
4795  * This may release the path, and so you may lose any locks held at the
4796  * time you call it.
4797  */
4798 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4799 {
4800 	struct btrfs_key key;
4801 	struct btrfs_disk_key found_key;
4802 	int ret;
4803 
4804 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4805 
4806 	if (key.offset > 0)
4807 		key.offset--;
4808 	else if (key.type > 0)
4809 		key.type--;
4810 	else if (key.objectid > 0)
4811 		key.objectid--;
4812 	else
4813 		return 1;
4814 
4815 	btrfs_release_path(path);
4816 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4817 	if (ret < 0)
4818 		return ret;
4819 	btrfs_item_key(path->nodes[0], &found_key, 0);
4820 	ret = comp_keys(&found_key, &key);
4821 	if (ret < 0)
4822 		return 0;
4823 	return 1;
4824 }
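
/*
 * Illustrative sketch (editor's addition): moving one item backwards, in
 * the spirit of btrfs_previous_item().  Empty-leaf corner cases are
 * ignored here; see the real helper for the full treatment.
 */
static int example_step_back(struct btrfs_root *root,
			     struct btrfs_path *path)
{
	int ret;

	if (path->slots[0] == 0) {
		/* 0 on success, 1 if no lesser leaf, < 0 on IO error */
		ret = btrfs_prev_leaf(root, path);
		if (ret)
			return ret;
		/* land on the last item of the lesser leaf */
		path->slots[0] =
			btrfs_header_nritems(path->nodes[0]) - 1;
	} else {
		path->slots[0]--;
	}
	return 0;
}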
4825 
4826 /*
4827  * A helper function to walk down the tree starting at min_key, looking
4828  * for nodes or leaves that are either in cache or have a minimum
4829  * transaction id.  This is used by the btree defrag code and tree logging.
4830  *
4831  * This does not cow, but it does stuff the starting key it finds back
4832  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4833  * key and get a writable path.
4834  *
4835  * This does lock as it descends, and path->keep_locks should be set
4836  * to 1 by the caller.
4837  *
4838  * This honors path->lowest_level to prevent descent past a given level
4839  * of the tree.
4840  *
4841  * min_trans indicates the oldest transaction that you are interested
4842  * in walking through.  Any nodes or leaves older than min_trans are
4843  * skipped over (without reading them).
4844  *
4845  * returns zero if something useful was found, < 0 on error and 1 if there
4846  * was nothing in the tree that matched the search criteria.
4847  */
4848 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4849 			 struct btrfs_key *max_key,
4850 			 struct btrfs_path *path, int cache_only,
4851 			 u64 min_trans)
4852 {
4853 	struct extent_buffer *cur;
4854 	struct btrfs_key found_key;
4855 	int slot;
4856 	int sret;
4857 	u32 nritems;
4858 	int level;
4859 	int ret = 1;
4860 
4861 	WARN_ON(!path->keep_locks);
4862 again:
4863 	cur = btrfs_read_lock_root_node(root);
4864 	level = btrfs_header_level(cur);
4865 	WARN_ON(path->nodes[level]);
4866 	path->nodes[level] = cur;
4867 	path->locks[level] = BTRFS_READ_LOCK;
4868 
4869 	if (btrfs_header_generation(cur) < min_trans) {
4870 		ret = 1;
4871 		goto out;
4872 	}
4873 	while (1) {
4874 		nritems = btrfs_header_nritems(cur);
4875 		level = btrfs_header_level(cur);
4876 		sret = bin_search(cur, min_key, level, &slot);
4877 
4878 		/* at the lowest level, we're done, setup the path and exit */
4879 		if (level == path->lowest_level) {
4880 			if (slot >= nritems)
4881 				goto find_next_key;
4882 			ret = 0;
4883 			path->slots[level] = slot;
4884 			btrfs_item_key_to_cpu(cur, &found_key, slot);
4885 			goto out;
4886 		}
4887 		if (sret && slot > 0)
4888 			slot--;
4889 		/*
4890 		 * check this node pointer against the cache_only and
4891 		 * min_trans parameters.  If it isn't in cache or is too
4892 		 * old, skip to the next one.
4893 		 */
4894 		while (slot < nritems) {
4895 			u64 blockptr;
4896 			u64 gen;
4897 			struct extent_buffer *tmp;
4898 			struct btrfs_disk_key disk_key;
4899 
4900 			blockptr = btrfs_node_blockptr(cur, slot);
4901 			gen = btrfs_node_ptr_generation(cur, slot);
4902 			if (gen < min_trans) {
4903 				slot++;
4904 				continue;
4905 			}
4906 			if (!cache_only)
4907 				break;
4908 
4909 			if (max_key) {
4910 				btrfs_node_key(cur, &disk_key, slot);
4911 				if (comp_keys(&disk_key, max_key) >= 0) {
4912 					ret = 1;
4913 					goto out;
4914 				}
4915 			}
4916 
4917 			tmp = btrfs_find_tree_block(root, blockptr,
4918 					    btrfs_level_size(root, level - 1));
4919 
4920 			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
4921 				free_extent_buffer(tmp);
4922 				break;
4923 			}
4924 			if (tmp)
4925 				free_extent_buffer(tmp);
4926 			slot++;
4927 		}
4928 find_next_key:
4929 		/*
4930 		 * we didn't find a candidate key in this node, walk forward
4931 		 * and find another one
4932 		 */
4933 		if (slot >= nritems) {
4934 			path->slots[level] = slot;
4935 			btrfs_set_path_blocking(path);
4936 			sret = btrfs_find_next_key(root, path, min_key, level,
4937 						  cache_only, min_trans);
4938 			if (sret == 0) {
4939 				btrfs_release_path(path);
4940 				goto again;
4941 			} else {
4942 				goto out;
4943 			}
4944 		}
4945 		/* save our key for returning back */
4946 		btrfs_node_key_to_cpu(cur, &found_key, slot);
4947 		path->slots[level] = slot;
4948 		if (level == path->lowest_level) {
4949 			ret = 0;
4950 			unlock_up(path, level, 1, 0, NULL);
4951 			goto out;
4952 		}
4953 		btrfs_set_path_blocking(path);
4954 		cur = read_node_slot(root, cur, slot);
4955 		BUG_ON(!cur); /* -ENOMEM */
4956 
4957 		btrfs_tree_read_lock(cur);
4958 
4959 		path->locks[level - 1] = BTRFS_READ_LOCK;
4960 		path->nodes[level - 1] = cur;
4961 		unlock_up(path, level, 1, 0, NULL);
4962 		btrfs_clear_path_blocking(path, NULL, 0);
4963 	}
4964 out:
4965 	if (ret == 0)
4966 		memcpy(min_key, &found_key, sizeof(found_key));
4967 	btrfs_set_path_blocking(path);
4968 	return ret;
4969 }
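
/*
 * Illustrative sketch (editor's addition): visiting every item that
 * lives in a block at least as new as 'min_trans', the way the defrag
 * and tree-log code drive btrfs_search_forward().  Advancing by bumping
 * key.offset is a simplification that ignores offset wrap-around.
 */
static int example_scan_newer(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->keep_locks = 1;

	min_key.objectid = 0;
	min_key.type = 0;
	min_key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, NULL, path,
					   0, min_trans);
		if (ret)	/* 1: nothing left, < 0: error */
			break;

		/* min_key now holds the key that was found */
		/* ... process path->nodes[0], path->slots[0] ... */

		btrfs_release_path(path);
		min_key.offset++;
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}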
4970 
4971 static void tree_move_down(struct btrfs_root *root,
4972 			   struct btrfs_path *path,
4973 			   int *level, int root_level)
4974 {
4975 	BUG_ON(*level == 0);
4976 	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
4977 					path->slots[*level]);
4978 	path->slots[*level - 1] = 0;
4979 	(*level)--;
4980 }
4981 
4982 static int tree_move_next_or_upnext(struct btrfs_root *root,
4983 				    struct btrfs_path *path,
4984 				    int *level, int root_level)
4985 {
4986 	int ret = 0;
4987 	int nritems;
4988 	nritems = btrfs_header_nritems(path->nodes[*level]);
4989 
4990 	path->slots[*level]++;
4991 
4992 	while (path->slots[*level] >= nritems) {
4993 		if (*level == root_level)
4994 			return -1;
4995 
4996 		/* move upnext */
4997 		path->slots[*level] = 0;
4998 		free_extent_buffer(path->nodes[*level]);
4999 		path->nodes[*level] = NULL;
5000 		(*level)++;
5001 		path->slots[*level]++;
5002 
5003 		nritems = btrfs_header_nritems(path->nodes[*level]);
5004 		ret = 1;
5005 	}
5006 	return ret;
5007 }
5008 
5009 /*
5010  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5011  * or down.
5012  */
5013 static int tree_advance(struct btrfs_root *root,
5014 			struct btrfs_path *path,
5015 			int *level, int root_level,
5016 			int allow_down,
5017 			struct btrfs_key *key)
5018 {
5019 	int ret;
5020 
5021 	if (*level == 0 || !allow_down) {
5022 		ret = tree_move_next_or_upnext(root, path, level, root_level);
5023 	} else {
5024 		tree_move_down(root, path, level, root_level);
5025 		ret = 0;
5026 	}
5027 	if (ret >= 0) {
5028 		if (*level == 0)
5029 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5030 					path->slots[*level]);
5031 		else
5032 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5033 					path->slots[*level]);
5034 	}
5035 	return ret;
5036 }
5037 
5038 static int tree_compare_item(struct btrfs_root *left_root,
5039 			     struct btrfs_path *left_path,
5040 			     struct btrfs_path *right_path,
5041 			     char *tmp_buf)
5042 {
5043 	int cmp;
5044 	int len1, len2;
5045 	unsigned long off1, off2;
5046 
5047 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5048 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5049 	if (len1 != len2)
5050 		return 1;
5051 
5052 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5053 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5054 				right_path->slots[0]);
5055 
5056 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5057 
5058 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5059 	if (cmp)
5060 		return 1;
5061 	return 0;
5062 }
5063 
5064 #define ADVANCE 1
5065 #define ADVANCE_ONLY_NEXT -1
5066 
5067 /*
5068  * This function compares two trees and calls the provided callback for
5069  * every changed/new/deleted item it finds.
5070  * If shared tree blocks are encountered, whole subtrees are skipped, making
5071  * the compare pretty fast on snapshotted subvolumes.
5072  *
5073  * This currently works on commit roots only. As commit roots are read only,
5074  * we don't do any locking. The commit roots are protected with transactions.
5075  * Transactions are ended and rejoined when a commit is tried in between.
5076  *
5077  * This function checks for modifications done to the trees while comparing.
5078  * If it detects a change, it aborts immediately.
5079  */
5080 int btrfs_compare_trees(struct btrfs_root *left_root,
5081 			struct btrfs_root *right_root,
5082 			btrfs_changed_cb_t changed_cb, void *ctx)
5083 {
5084 	int ret;
5085 	int cmp;
5086 	struct btrfs_trans_handle *trans = NULL;
5087 	struct btrfs_path *left_path = NULL;
5088 	struct btrfs_path *right_path = NULL;
5089 	struct btrfs_key left_key;
5090 	struct btrfs_key right_key;
5091 	char *tmp_buf = NULL;
5092 	int left_root_level;
5093 	int right_root_level;
5094 	int left_level;
5095 	int right_level;
5096 	int left_end_reached;
5097 	int right_end_reached;
5098 	int advance_left;
5099 	int advance_right;
5100 	u64 left_blockptr;
5101 	u64 right_blockptr;
5102 	u64 left_start_ctransid;
5103 	u64 right_start_ctransid;
5104 	u64 ctransid;
5105 
5106 	left_path = btrfs_alloc_path();
5107 	if (!left_path) {
5108 		ret = -ENOMEM;
5109 		goto out;
5110 	}
5111 	right_path = btrfs_alloc_path();
5112 	if (!right_path) {
5113 		ret = -ENOMEM;
5114 		goto out;
5115 	}
5116 
5117 	tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5118 	if (!tmp_buf) {
5119 		ret = -ENOMEM;
5120 		goto out;
5121 	}
5122 
5123 	left_path->search_commit_root = 1;
5124 	left_path->skip_locking = 1;
5125 	right_path->search_commit_root = 1;
5126 	right_path->skip_locking = 1;
5127 
5128 	spin_lock(&left_root->root_item_lock);
5129 	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5130 	spin_unlock(&left_root->root_item_lock);
5131 
5132 	spin_lock(&right_root->root_item_lock);
5133 	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5134 	spin_unlock(&right_root->root_item_lock);
5135 
5136 	trans = btrfs_join_transaction(left_root);
5137 	if (IS_ERR(trans)) {
5138 		ret = PTR_ERR(trans);
5139 		trans = NULL;
5140 		goto out;
5141 	}
5142 
5143 	/*
5144 	 * Strategy: Go to the first items of both trees. Then do
5145 	 *
5146 	 * If both trees are at level 0
5147 	 *   Compare keys of current items
5148 	 *     If left < right treat left item as new, advance left tree
5149 	 *       and repeat
5150 	 *     If left > right treat right item as deleted, advance right tree
5151 	 *       and repeat
5152 	 *     If left == right do deep compare of items, treat as changed if
5153 	 *       needed, advance both trees and repeat
5154 	 * If both trees are at the same level but not at level 0
5155 	 *   Compare keys of current nodes/leaves
5156 	 *     If left < right advance left tree and repeat
5157 	 *     If left > right advance right tree and repeat
5158 	 *     If left == right compare blockptrs of the next nodes/leaves
5159 	 *       If they match advance both trees but stay at the same level
5160 	 *         and repeat
5161 	 *       If they don't match advance both trees while allowing them
5162 	 *         to go deeper and repeat
5163 	 * If tree levels are different
5164 	 *   Advance the tree that needs it and repeat
5165 	 *
5166 	 * Advancing a tree means:
5167 	 *   If we are at level 0, try to go to the next slot. If that's not
5168 	 *   possible, go one level up and repeat. Stop when we found a level
5169 	 *   where we could go to the next slot. We may at this point be on a
5170 	 *   node or a leaf.
5171 	 *
5172 	 *   If we are not at level 0 and not on shared tree blocks, go one
5173 	 *   level deeper.
5174 	 *
5175 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5176 	 *   the right if possible or go up and right.
5177 	 */
5178 
5179 	left_level = btrfs_header_level(left_root->commit_root);
5180 	left_root_level = left_level;
5181 	left_path->nodes[left_level] = left_root->commit_root;
5182 	extent_buffer_get(left_path->nodes[left_level]);
5183 
5184 	right_level = btrfs_header_level(right_root->commit_root);
5185 	right_root_level = right_level;
5186 	right_path->nodes[right_level] = right_root->commit_root;
5187 	extent_buffer_get(right_path->nodes[right_level]);
5188 
5189 	if (left_level == 0)
5190 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5191 				&left_key, left_path->slots[left_level]);
5192 	else
5193 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5194 				&left_key, left_path->slots[left_level]);
5195 	if (right_level == 0)
5196 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5197 				&right_key, right_path->slots[right_level]);
5198 	else
5199 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5200 				&right_key, right_path->slots[right_level]);
5201 
5202 	left_end_reached = right_end_reached = 0;
5203 	advance_left = advance_right = 0;
5204 
5205 	while (1) {
5206 		/*
5207 		 * We need to make sure the transaction does not get committed
5208 		 * while we do anything on commit roots. This means, we need to
5209 		 * join and leave transactions for every item that we process.
5210 		 */
5211 		if (trans && btrfs_should_end_transaction(trans, left_root)) {
5212 			btrfs_release_path(left_path);
5213 			btrfs_release_path(right_path);
5214 
5215 			ret = btrfs_end_transaction(trans, left_root);
5216 			trans = NULL;
5217 			if (ret < 0)
5218 				goto out;
5219 		}
5220 		/* now rejoin the transaction */
5221 		if (!trans) {
5222 			trans = btrfs_join_transaction(left_root);
5223 			if (IS_ERR(trans)) {
5224 				ret = PTR_ERR(trans);
5225 				trans = NULL;
5226 				goto out;
5227 			}
5228 
5229 			spin_lock(&left_root->root_item_lock);
5230 			ctransid = btrfs_root_ctransid(&left_root->root_item);
5231 			spin_unlock(&left_root->root_item_lock);
5232 			if (ctransid != left_start_ctransid)
5233 				left_start_ctransid = 0;
5234 
5235 			spin_lock(&right_root->root_item_lock);
5236 			ctransid = btrfs_root_ctransid(&right_root->root_item);
5237 			spin_unlock(&right_root->root_item_lock);
5238 			if (ctransid != right_start_ctransid)
5239 				right_start_ctransid = 0;
5240 
5241 			if (!left_start_ctransid || !right_start_ctransid) {
5242 				WARN(1, KERN_WARNING
5243 					"btrfs: btrfs_compare_tree detected "
5244 					"a change in one of the trees while "
5245 					"iterating. This is probably a "
5246 					"bug.\n");
5247 				ret = -EIO;
5248 				goto out;
5249 			}
5250 
5251 			/*
5252 			 * the commit root may have changed, so start again
5253 			 * where we stopped
5254 			 */
5255 			left_path->lowest_level = left_level;
5256 			right_path->lowest_level = right_level;
5257 			ret = btrfs_search_slot(NULL, left_root,
5258 					&left_key, left_path, 0, 0);
5259 			if (ret < 0)
5260 				goto out;
5261 			ret = btrfs_search_slot(NULL, right_root,
5262 					&right_key, right_path, 0, 0);
5263 			if (ret < 0)
5264 				goto out;
5265 		}
5266 
5267 		if (advance_left && !left_end_reached) {
5268 			ret = tree_advance(left_root, left_path, &left_level,
5269 					left_root_level,
5270 					advance_left != ADVANCE_ONLY_NEXT,
5271 					&left_key);
5272 			if (ret < 0)
5273 				left_end_reached = ADVANCE;
5274 			advance_left = 0;
5275 		}
5276 		if (advance_right && !right_end_reached) {
5277 			ret = tree_advance(right_root, right_path, &right_level,
5278 					right_root_level,
5279 					advance_right != ADVANCE_ONLY_NEXT,
5280 					&right_key);
5281 			if (ret < 0)
5282 				right_end_reached = ADVANCE;
5283 			advance_right = 0;
5284 		}
5285 
5286 		if (left_end_reached && right_end_reached) {
5287 			ret = 0;
5288 			goto out;
5289 		} else if (left_end_reached) {
5290 			if (right_level == 0) {
5291 				ret = changed_cb(left_root, right_root,
5292 						left_path, right_path,
5293 						&right_key,
5294 						BTRFS_COMPARE_TREE_DELETED,
5295 						ctx);
5296 				if (ret < 0)
5297 					goto out;
5298 			}
5299 			advance_right = ADVANCE;
5300 			continue;
5301 		} else if (right_end_reached) {
5302 			if (left_level == 0) {
5303 				ret = changed_cb(left_root, right_root,
5304 						left_path, right_path,
5305 						&left_key,
5306 						BTRFS_COMPARE_TREE_NEW,
5307 						ctx);
5308 				if (ret < 0)
5309 					goto out;
5310 			}
5311 			advance_left = ADVANCE;
5312 			continue;
5313 		}
5314 
5315 		if (left_level == 0 && right_level == 0) {
5316 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5317 			if (cmp < 0) {
5318 				ret = changed_cb(left_root, right_root,
5319 						left_path, right_path,
5320 						&left_key,
5321 						BTRFS_COMPARE_TREE_NEW,
5322 						ctx);
5323 				if (ret < 0)
5324 					goto out;
5325 				advance_left = ADVANCE;
5326 			} else if (cmp > 0) {
5327 				ret = changed_cb(left_root, right_root,
5328 						left_path, right_path,
5329 						&right_key,
5330 						BTRFS_COMPARE_TREE_DELETED,
5331 						ctx);
5332 				if (ret < 0)
5333 					goto out;
5334 				advance_right = ADVANCE;
5335 			} else {
5336 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5337 				ret = tree_compare_item(left_root, left_path,
5338 						right_path, tmp_buf);
5339 				if (ret) {
5340 					WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5341 					ret = changed_cb(left_root, right_root,
5342 						left_path, right_path,
5343 						&left_key,
5344 						BTRFS_COMPARE_TREE_CHANGED,
5345 						ctx);
5346 					if (ret < 0)
5347 						goto out;
5348 				}
5349 				advance_left = ADVANCE;
5350 				advance_right = ADVANCE;
5351 			}
5352 		} else if (left_level == right_level) {
5353 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5354 			if (cmp < 0) {
5355 				advance_left = ADVANCE;
5356 			} else if (cmp > 0) {
5357 				advance_right = ADVANCE;
5358 			} else {
5359 				left_blockptr = btrfs_node_blockptr(
5360 						left_path->nodes[left_level],
5361 						left_path->slots[left_level]);
5362 				right_blockptr = btrfs_node_blockptr(
5363 						right_path->nodes[right_level],
5364 						right_path->slots[right_level]);
5365 				if (left_blockptr == right_blockptr) {
5366 					/*
5367 					 * As we're on a shared block, don't
5368 					 * allow to go deeper.
5369 					 */
5370 					advance_left = ADVANCE_ONLY_NEXT;
5371 					advance_right = ADVANCE_ONLY_NEXT;
5372 				} else {
5373 					advance_left = ADVANCE;
5374 					advance_right = ADVANCE;
5375 				}
5376 			}
5377 		} else if (left_level < right_level) {
5378 			advance_right = ADVANCE;
5379 		} else {
5380 			advance_left = ADVANCE;
5381 		}
5382 	}
5383 
5384 out:
5385 	btrfs_free_path(left_path);
5386 	btrfs_free_path(right_path);
5387 	kfree(tmp_buf);
5388 
5389 	if (trans) {
5390 		if (!ret)
5391 			ret = btrfs_end_transaction(trans, left_root);
5392 		else
5393 			btrfs_end_transaction(trans, left_root);
5394 	}
5395 
5396 	return ret;
5397 }
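
/*
 * Illustrative sketch (editor's addition): a minimal btrfs_changed_cb_t
 * and a wrapper that diffs two snapshot roots with it, roughly how the
 * send code consumes this interface.  The counter payload is made up.
 */
static int example_changed_cb(struct btrfs_root *left_root,
			      struct btrfs_root *right_root,
			      struct btrfs_path *left_path,
			      struct btrfs_path *right_path,
			      struct btrfs_key *key,
			      enum btrfs_compare_tree_result result,
			      void *ctx)
{
	u64 *count = ctx;

	if (result == BTRFS_COMPARE_TREE_NEW ||
	    result == BTRFS_COMPARE_TREE_DELETED ||
	    result == BTRFS_COMPARE_TREE_CHANGED)
		(*count)++;
	return 0;	/* returning < 0 aborts the compare */
}

static int example_count_changes(struct btrfs_root *left_root,
				 struct btrfs_root *right_root,
				 u64 *count)
{
	*count = 0;
	return btrfs_compare_trees(left_root, right_root,
				   example_changed_cb, count);
}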
5398 
5399 /*
5400  * this is similar to btrfs_next_leaf, but does not try to preserve
5401  * and fix up the path.  It looks for and returns the next key in the
5402  * tree based on the current path and the cache_only and min_trans
5403  * parameters.
5404  *
5405  * 0 is returned if another key is found, < 0 if there are any errors
5406  * and 1 is returned if there are no higher keys in the tree
5407  *
5408  * path->keep_locks should be set to 1 on the search made before
5409  * calling this function.
5410  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level,
			int cache_only, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur ||
				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
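
/*
 * Usage sketch, not part of this file: find the key that follows 'key'
 * at the leaf level.  'root' and 'key' are hypothetical caller state;
 * keep_locks must be set on the preceding search, as the WARN_ON above
 * enforces.
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key next_key;
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret >= 0)
 *		ret = btrfs_find_next_key(root, path, &next_key, 0, 0, 0);
 *	btrfs_free_path(path);
 *
 * A final ret of 0 means next_key is valid; 1 means no higher key
 * exists.
 */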

/*
 * search the tree again to find a leaf with greater keys.
 * returns 0 if it found something, or 1 if there are no greater leaves.
 * returns < 0 on I/O errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	return btrfs_next_old_leaf(root, path, 0);
}
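
/*
 * Usage sketch, not part of this file: the common pattern for visiting
 * every item in a tree with btrfs_next_leaf().  'root' is hypothetical
 * and 'path' is assumed to come from btrfs_alloc_path().
 *
 *	struct btrfs_key key = {0};	(start from the smallest key)
 *	struct extent_buffer *leaf;
 *	int ret;
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	while (ret >= 0) {
 *		leaf = path->nodes[0];
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)
 *				break;	(1: no more leaves, < 0: error)
 *			continue;
 *		}
 *		(handle the item at path->slots[0] here)
 *		path->slots[0]++;
 *	}
 */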

/* Release the path up to but not including the given level */
static void btrfs_release_level(struct btrfs_path *path, int level)
{
	int i;

	for (i = 0; i < level; i++) {
		path->slots[i] = 0;
		if (!path->nodes[i])
			continue;
		if (path->locks[i]) {
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
}

/*
 * This function assumes 2 things
 *
 * 1) You are using path->keep_locks
 * 2) You are not inserting items.
 *
 * If either of these is not true, do not use this function.  If you need a
 * next leaf with either of these not being true, this function could easily
 * be adapted to handle that, but at the moment these are the limitations.
 */
int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  int del)
{
	struct extent_buffer *b;
	struct btrfs_key key;
	u32 nritems;
	int level = 1;
	int slot;
	int ret = 1;
	int write_lock_level = BTRFS_MAX_LEVEL;
	int ins_len = del ? -1 : 0;

	WARN_ON(!(path->keep_locks || path->really_keep_locks));

	nritems = btrfs_header_nritems(path->nodes[0]);
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	while (path->nodes[level]) {
		nritems = btrfs_header_nritems(path->nodes[level]);
		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
search:
			btrfs_release_path(path);
			ret = btrfs_search_slot(trans, root, &key, path,
						ins_len, 1);
			if (ret < 0)
				goto out;
			level = 1;
			continue;
		}

		if (path->slots[level] >= nritems - 1) {
			level++;
			continue;
		}

		btrfs_release_level(path, level);
		break;
	}

	if (!path->nodes[level]) {
		ret = 1;
		goto out;
	}

	path->slots[level]++;
	b = path->nodes[level];

	while (b) {
		level = btrfs_header_level(b);

		if (!should_cow_block(trans, root, b))
			goto cow_done;

		btrfs_set_path_blocking(path);
		ret = btrfs_cow_block(trans, root, b,
				      path->nodes[level + 1],
				      path->slots[level + 1], &b);
		if (ret)
			goto out;
cow_done:
		path->nodes[level] = b;
		btrfs_clear_path_blocking(path, NULL, 0);
		if (level != 0) {
			ret = setup_nodes_for_search(trans, root, path, b,
						     level, ins_len,
						     &write_lock_level);
			if (ret == -EAGAIN)
				goto search;
			if (ret)
				goto out;

			b = path->nodes[level];
			slot = path->slots[level];

			ret = read_block_for_search(trans, root, path,
						    &b, level, slot, &key, 0);
			if (ret == -EAGAIN)
				goto search;
			if (ret)
				goto out;
			level = btrfs_header_level(b);
			if (!btrfs_try_tree_write_lock(b)) {
				btrfs_set_path_blocking(path);
				btrfs_tree_lock(b);
				btrfs_clear_path_blocking(path, b,
							  BTRFS_WRITE_LOCK);
			}
			path->locks[level] = BTRFS_WRITE_LOCK;
			path->nodes[level] = b;
			path->slots[level] = 0;
		} else {
			path->slots[level] = 0;
			ret = 0;
			break;
		}
	}

out:
	if (ret)
		btrfs_release_path(path);

	return ret;
}
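
/*
 * Usage sketch, not part of this file: stepping to the next leaf while
 * keeping write locks, e.g. when deleting items during a forward walk.
 * 'trans', 'root', 'path' and 'key' are hypothetical caller state;
 * path->keep_locks must be set and no items may be inserted, per the
 * comment above.
 *
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret < 0)
 *		goto out;
 *	(delete items in the current leaf, then:)
 *	ret = btrfs_next_leaf_write(trans, root, path, 1);
 *	(0: next leaf is write locked in the path, 1: no next leaf,
 *	 < 0: error)
 */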

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, which holds that lock
				 * while itself waiting for the leaf we have
				 * currently locked.  To break the deadlock,
				 * we give up our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}
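
/*
 * Usage sketch, not part of this file: back up from a search position
 * to the closest previous item of a given type.  'root', 'objectid' and
 * 'type' are hypothetical caller-supplied values.
 *
 *	key.objectid = objectid;
 *	key.type = type;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_item(root, path, objectid, type);
 *	(0: path points at the item, 1: nothing found, < 0: error)
 */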
5831