/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled,
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths on which no locks or extent
 * buffers are held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

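/*
 * A minimal usage sketch of the path lifecycle above (hypothetical
 * caller, not part of the btrfs API): btrfs_search_slot() fills in the
 * path, btrfs_release_path() drops all locks and references so the
 * path can be reused, and btrfs_free_path() releases and frees it in
 * one call.  Error handling beyond allocation is elided.
 */
static int __maybe_unused example_path_lookup(struct btrfs_root *root,
					      const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* read-only lookup: no transaction (NULL) and cow == 0 */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);

	/* ... inspect path->nodes[0] / path->slots[0] here ... */

	btrfs_free_path(path);	/* also releases the path */
	return ret;
}
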
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a lock on the root.  A read locked
 * buffer is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

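/*
 * Sketch of the locking discipline the helpers above provide
 * (hypothetical caller): the buffer that comes back holds both a
 * reference and a read lock, so it must be paired with an unlock and a
 * free_extent_buffer() when the caller is done with it.
 */
static void __maybe_unused example_peek_root(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	eb = btrfs_read_lock_root_node(root);
	/* eb is guaranteed to stay the root while the lock is held */
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);
}
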
/* cow-only roots (everything that is not a reference counted cow
 * subvolume) just get put onto a simple dirty list.  transaction.c
 * walks this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

234 
235 /*
236  * used by snapshot creation to make a copy of a root for a tree with
237  * a given objectid.  The buffer with the new root node is returned in
238  * cow_ret, and this func returns zero on success or a negative error code.
239  */
240 int btrfs_copy_root(struct btrfs_trans_handle *trans,
241 		      struct btrfs_root *root,
242 		      struct extent_buffer *buf,
243 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
244 {
245 	struct btrfs_fs_info *fs_info = root->fs_info;
246 	struct extent_buffer *cow;
247 	int ret = 0;
248 	int level;
249 	struct btrfs_disk_key disk_key;
250 
251 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
252 		trans->transid != fs_info->running_transaction->transid);
253 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
254 		trans->transid != root->last_trans);
255 
256 	level = btrfs_header_level(buf);
257 	if (level == 0)
258 		btrfs_item_key(buf, &disk_key, 0);
259 	else
260 		btrfs_node_key(buf, &disk_key, 0);
261 
262 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
263 			&disk_key, level, buf->start, 0);
264 	if (IS_ERR(cow))
265 		return PTR_ERR(cow);
266 
267 	copy_extent_buffer_full(cow, buf);
268 	btrfs_set_header_bytenr(cow, cow->start);
269 	btrfs_set_header_generation(cow, trans->transid);
270 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
271 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
272 				     BTRFS_HEADER_FLAG_RELOC);
273 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
274 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
275 	else
276 		btrfs_set_header_owner(cow, new_root_objectid);
277 
278 	write_extent_buffer_fsid(cow, fs_info->fsid);
279 
280 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
281 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
282 		ret = btrfs_inc_ref(trans, root, cow, 1);
283 	else
284 		ret = btrfs_inc_ref(trans, root, cow, 0);
285 
286 	if (ret)
287 		return ret;
288 
289 	btrfs_mark_buffer_dirty(cow);
290 	*cow_ret = cow;
291 	return 0;
292 }
293 
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should set elem->seq to zero before
 * calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}

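/*
 * Sketch of the blocker pattern served by the two functions above
 * (hypothetical caller): while elem sits on tree_mod_seq_list, no log
 * entries with a sequence number >= elem->seq may be removed, so tree
 * state at that point in time can still be replayed.  elem->seq must
 * start out zero, as the comment on btrfs_get_tree_mod_seq() notes.
 */
static void __maybe_unused example_tree_mod_seq_blocker(
					struct btrfs_fs_info *fs_info)
{
	struct seq_list elem = SEQ_LIST_INIT(elem);
	u64 seq;

	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
	btrfs_debug(fs_info, "tree mod log blocked at seq %llu", seq);

	/* ... walk old tree versions at sequence number 'seq' ... */

	btrfs_put_tree_mod_seq(fs_info, &elem);
}
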
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key(fs_info, eb, slot,
					MOD_LOG_KEY_REPLACE,
					atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop
	 * tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work of cowing a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by, i.e. the gap between them on disk is
 * less than 32KiB
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

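/*
 * Worked example for the comparison order above (hypothetical values):
 * keys sort by objectid first, then type, then offset.  An inode item
 * (type 1) therefore always sorts before an inode ref (type 12) of the
 * same objectid, no matter what the offsets are.
 */
static void __maybe_unused example_key_order(void)
{
	struct btrfs_key a = { .objectid = 256,
			       .type = BTRFS_INODE_ITEM_KEY,
			       .offset = 0 };
	struct btrfs_key b = { .objectid = 256,
			       .type = BTRFS_INODE_REF_KEY,
			       .offset = 100 };

	WARN_ON(btrfs_comp_cpu_keys(&a, &b) != -1);
	WARN_ON(btrfs_comp_cpu_keys(&b, &a) != 1);
}
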
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}

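/*
 * Usage sketch for btrfs_bin_search() (hypothetical caller): an exact
 * match returns 0 with *slot set to the matching index; otherwise it
 * returns 1 and *slot is the insertion point, which may equal the
 * number of items when the key is bigger than everything in eb.
 */
static int __maybe_unused example_bin_search(struct extent_buffer *eb,
					     const struct btrfs_key *key)
{
	int slot;
	int ret;

	ret = btrfs_bin_search(eb, key, btrfs_header_level(eb), &slot);
	if (ret == 0)
		return slot;	/* key found at slot */
	return -ENOENT;		/* *slot is where key would be inserted */
}
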
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
static noinline struct extent_buffer *
read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
	       int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot));
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

1874 /*
1875  * node level balancing, used to make sure nodes are in proper order for
1876  * item deletion.  We balance from the top down, so we have to make sure
1877  * that a deletion won't leave a node completely empty later on.
1878  */
1879 static noinline int balance_level(struct btrfs_trans_handle *trans,
1880 			 struct btrfs_root *root,
1881 			 struct btrfs_path *path, int level)
1882 {
1883 	struct btrfs_fs_info *fs_info = root->fs_info;
1884 	struct extent_buffer *right = NULL;
1885 	struct extent_buffer *mid;
1886 	struct extent_buffer *left = NULL;
1887 	struct extent_buffer *parent = NULL;
1888 	int ret = 0;
1889 	int wret;
1890 	int pslot;
1891 	int orig_slot = path->slots[level];
1892 	u64 orig_ptr;
1893 
1894 	if (level == 0)
1895 		return 0;
1896 
1897 	mid = path->nodes[level];
1898 
1899 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1900 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1901 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1902 
1903 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1904 
1905 	if (level < BTRFS_MAX_LEVEL - 1) {
1906 		parent = path->nodes[level + 1];
1907 		pslot = path->slots[level + 1];
1908 	}
1909 
1910 	/*
1911 	 * deal with the case where there is only one pointer in the root
1912 	 * by promoting the node below to a root
1913 	 */
1914 	if (!parent) {
1915 		struct extent_buffer *child;
1916 
1917 		if (btrfs_header_nritems(mid) != 1)
1918 			return 0;
1919 
1920 		/* promote the child to a root */
1921 		child = read_node_slot(fs_info, mid, 0);
1922 		if (IS_ERR(child)) {
1923 			ret = PTR_ERR(child);
1924 			btrfs_handle_fs_error(fs_info, ret, NULL);
1925 			goto enospc;
1926 		}
1927 
1928 		btrfs_tree_lock(child);
1929 		btrfs_set_lock_blocking(child);
1930 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1931 		if (ret) {
1932 			btrfs_tree_unlock(child);
1933 			free_extent_buffer(child);
1934 			goto enospc;
1935 		}
1936 
1937 		tree_mod_log_set_root_pointer(root, child, 1);
1938 		rcu_assign_pointer(root->node, child);
1939 
1940 		add_root_to_dirty_list(root);
1941 		btrfs_tree_unlock(child);
1942 
1943 		path->locks[level] = 0;
1944 		path->nodes[level] = NULL;
1945 		clean_tree_block(fs_info, mid);
1946 		btrfs_tree_unlock(mid);
1947 		/* once for the path */
1948 		free_extent_buffer(mid);
1949 
1950 		root_sub_used(root, mid->len);
1951 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1952 		/* once for the root ptr */
1953 		free_extent_buffer_stale(mid);
1954 		return 0;
1955 	}
1956 	if (btrfs_header_nritems(mid) >
1957 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1958 		return 0;
1959 
1960 	left = read_node_slot(fs_info, parent, pslot - 1);
1961 	if (IS_ERR(left))
1962 		left = NULL;
1963 
1964 	if (left) {
1965 		btrfs_tree_lock(left);
1966 		btrfs_set_lock_blocking(left);
1967 		wret = btrfs_cow_block(trans, root, left,
1968 				       parent, pslot - 1, &left);
1969 		if (wret) {
1970 			ret = wret;
1971 			goto enospc;
1972 		}
1973 	}
1974 
1975 	right = read_node_slot(fs_info, parent, pslot + 1);
1976 	if (IS_ERR(right))
1977 		right = NULL;
1978 
1979 	if (right) {
1980 		btrfs_tree_lock(right);
1981 		btrfs_set_lock_blocking(right);
1982 		wret = btrfs_cow_block(trans, root, right,
1983 				       parent, pslot + 1, &right);
1984 		if (wret) {
1985 			ret = wret;
1986 			goto enospc;
1987 		}
1988 	}
1989 
1990 	/* first, try to make some room in the middle buffer */
1991 	if (left) {
1992 		orig_slot += btrfs_header_nritems(left);
1993 		wret = push_node_left(trans, fs_info, left, mid, 1);
1994 		if (wret < 0)
1995 			ret = wret;
1996 	}
1997 
1998 	/*
1999 	 * then try to empty the right most buffer into the middle
2000 	 */
2001 	if (right) {
2002 		wret = push_node_left(trans, fs_info, mid, right, 1);
2003 		if (wret < 0 && wret != -ENOSPC)
2004 			ret = wret;
2005 		if (btrfs_header_nritems(right) == 0) {
2006 			clean_tree_block(fs_info, right);
2007 			btrfs_tree_unlock(right);
2008 			del_ptr(root, path, level + 1, pslot + 1);
2009 			root_sub_used(root, right->len);
2010 			btrfs_free_tree_block(trans, root, right, 0, 1);
2011 			free_extent_buffer_stale(right);
2012 			right = NULL;
2013 		} else {
2014 			struct btrfs_disk_key right_key;
2015 			btrfs_node_key(right, &right_key, 0);
2016 			tree_mod_log_set_node_key(fs_info, parent,
2017 						  pslot + 1, 0);
2018 			btrfs_set_node_key(parent, &right_key, pslot + 1);
2019 			btrfs_mark_buffer_dirty(parent);
2020 		}
2021 	}
2022 	if (btrfs_header_nritems(mid) == 1) {
2023 		/*
2024 		 * we're not allowed to leave a node with one item in the
2025 		 * tree during a delete.  A deletion from lower in the tree
2026 		 * could try to delete the only pointer in this node.
2027 		 * So, pull some keys from the left.
2028 		 * There has to be a left pointer at this point because
2029 		 * otherwise we would have pulled some pointers from the
2030 		 * right
2031 		 */
2032 		if (!left) {
2033 			ret = -EROFS;
2034 			btrfs_handle_fs_error(fs_info, ret, NULL);
2035 			goto enospc;
2036 		}
2037 		wret = balance_node_right(trans, fs_info, mid, left);
2038 		if (wret < 0) {
2039 			ret = wret;
2040 			goto enospc;
2041 		}
2042 		if (wret == 1) {
2043 			wret = push_node_left(trans, fs_info, left, mid, 1);
2044 			if (wret < 0)
2045 				ret = wret;
2046 		}
2047 		BUG_ON(wret == 1);
2048 	}
2049 	if (btrfs_header_nritems(mid) == 0) {
2050 		clean_tree_block(fs_info, mid);
2051 		btrfs_tree_unlock(mid);
2052 		del_ptr(root, path, level + 1, pslot);
2053 		root_sub_used(root, mid->len);
2054 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2055 		free_extent_buffer_stale(mid);
2056 		mid = NULL;
2057 	} else {
2058 		/* update the parent key to reflect our changes */
2059 		struct btrfs_disk_key mid_key;
2060 		btrfs_node_key(mid, &mid_key, 0);
2061 		tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
2062 		btrfs_set_node_key(parent, &mid_key, pslot);
2063 		btrfs_mark_buffer_dirty(parent);
2064 	}
2065 
2066 	/* update the path */
2067 	if (left) {
2068 		if (btrfs_header_nritems(left) > orig_slot) {
2069 			extent_buffer_get(left);
2070 			/* left was locked after cow */
2071 			path->nodes[level] = left;
2072 			path->slots[level + 1] -= 1;
2073 			path->slots[level] = orig_slot;
2074 			if (mid) {
2075 				btrfs_tree_unlock(mid);
2076 				free_extent_buffer(mid);
2077 			}
2078 		} else {
2079 			orig_slot -= btrfs_header_nritems(left);
2080 			path->slots[level] = orig_slot;
2081 		}
2082 	}
2083 	/* double check we haven't messed things up */
2084 	if (orig_ptr !=
2085 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2086 		BUG();
2087 enospc:
2088 	if (right) {
2089 		btrfs_tree_unlock(right);
2090 		free_extent_buffer(right);
2091 	}
2092 	if (left) {
2093 		if (path->nodes[level] != left)
2094 			btrfs_tree_unlock(left);
2095 		free_extent_buffer(left);
2096 	}
2097 	return ret;
2098 }
2099 
2100 /* Node balancing for insertion.  Here we only split or push nodes around
2101  * when they are completely full.  This is also done top down, so we
2102  * have to be pessimistic.
2103  */
2104 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2105 					  struct btrfs_root *root,
2106 					  struct btrfs_path *path, int level)
2107 {
2108 	struct btrfs_fs_info *fs_info = root->fs_info;
2109 	struct extent_buffer *right = NULL;
2110 	struct extent_buffer *mid;
2111 	struct extent_buffer *left = NULL;
2112 	struct extent_buffer *parent = NULL;
2113 	int ret = 0;
2114 	int wret;
2115 	int pslot;
2116 	int orig_slot = path->slots[level];
2117 
2118 	if (level == 0)
2119 		return 1;
2120 
2121 	mid = path->nodes[level];
2122 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2123 
2124 	if (level < BTRFS_MAX_LEVEL - 1) {
2125 		parent = path->nodes[level + 1];
2126 		pslot = path->slots[level + 1];
2127 	}
2128 
2129 	if (!parent)
2130 		return 1;
2131 
2132 	left = read_node_slot(fs_info, parent, pslot - 1);
2133 	if (IS_ERR(left))
2134 		left = NULL;
2135 
2136 	/* first, try to make some room in the middle buffer */
2137 	if (left) {
2138 		u32 left_nr;
2139 
2140 		btrfs_tree_lock(left);
2141 		btrfs_set_lock_blocking(left);
2142 
2143 		left_nr = btrfs_header_nritems(left);
2144 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2145 			wret = 1;
2146 		} else {
2147 			ret = btrfs_cow_block(trans, root, left, parent,
2148 					      pslot - 1, &left);
2149 			if (ret)
2150 				wret = 1;
2151 			else {
2152 				wret = push_node_left(trans, fs_info,
2153 						      left, mid, 0);
2154 			}
2155 		}
2156 		if (wret < 0)
2157 			ret = wret;
2158 		if (wret == 0) {
2159 			struct btrfs_disk_key disk_key;
2160 			orig_slot += left_nr;
2161 			btrfs_node_key(mid, &disk_key, 0);
2162 			tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
2163 			btrfs_set_node_key(parent, &disk_key, pslot);
2164 			btrfs_mark_buffer_dirty(parent);
2165 			if (btrfs_header_nritems(left) > orig_slot) {
2166 				path->nodes[level] = left;
2167 				path->slots[level + 1] -= 1;
2168 				path->slots[level] = orig_slot;
2169 				btrfs_tree_unlock(mid);
2170 				free_extent_buffer(mid);
2171 			} else {
2172 				orig_slot -=
2173 					btrfs_header_nritems(left);
2174 				path->slots[level] = orig_slot;
2175 				btrfs_tree_unlock(left);
2176 				free_extent_buffer(left);
2177 			}
2178 			return 0;
2179 		}
2180 		btrfs_tree_unlock(left);
2181 		free_extent_buffer(left);
2182 	}
2183 	right = read_node_slot(fs_info, parent, pslot + 1);
2184 	if (IS_ERR(right))
2185 		right = NULL;
2186 
2187 	/*
2188 	 * then try to empty the right most buffer into the middle
2189 	 */
2190 	if (right) {
2191 		u32 right_nr;
2192 
2193 		btrfs_tree_lock(right);
2194 		btrfs_set_lock_blocking(right);
2195 
2196 		right_nr = btrfs_header_nritems(right);
2197 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2198 			wret = 1;
2199 		} else {
2200 			ret = btrfs_cow_block(trans, root, right,
2201 					      parent, pslot + 1,
2202 					      &right);
2203 			if (ret)
2204 				wret = 1;
2205 			else {
2206 				wret = balance_node_right(trans, fs_info,
2207 							  right, mid);
2208 			}
2209 		}
2210 		if (wret < 0)
2211 			ret = wret;
2212 		if (wret == 0) {
2213 			struct btrfs_disk_key disk_key;
2214 
2215 			btrfs_node_key(right, &disk_key, 0);
2216 			tree_mod_log_set_node_key(fs_info, parent,
2217 						  pslot + 1, 0);
2218 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2219 			btrfs_mark_buffer_dirty(parent);
2220 
2221 			if (btrfs_header_nritems(mid) <= orig_slot) {
2222 				path->nodes[level] = right;
2223 				path->slots[level + 1] += 1;
2224 				path->slots[level] = orig_slot -
2225 					btrfs_header_nritems(mid);
2226 				btrfs_tree_unlock(mid);
2227 				free_extent_buffer(mid);
2228 			} else {
2229 				btrfs_tree_unlock(right);
2230 				free_extent_buffer(right);
2231 			}
2232 			return 0;
2233 		}
2234 		btrfs_tree_unlock(right);
2235 		free_extent_buffer(right);
2236 	}
2237 	return 1;
2238 }
2239 
2240 /*
2241  * readahead one full node of leaves, finding things that are close
2242  * to the block in 'slot', and triggering ra on them.
2243  */
2244 static void reada_for_search(struct btrfs_fs_info *fs_info,
2245 			     struct btrfs_path *path,
2246 			     int level, int slot, u64 objectid)
2247 {
2248 	struct extent_buffer *node;
2249 	struct btrfs_disk_key disk_key;
2250 	u32 nritems;
2251 	u64 search;
2252 	u64 target;
2253 	u64 nread = 0;
2254 	struct extent_buffer *eb;
2255 	u32 nr;
2256 	u32 blocksize;
2257 	u32 nscan = 0;
2258 
2259 	if (level != 1)
2260 		return;
2261 
2262 	if (!path->nodes[level])
2263 		return;
2264 
2265 	node = path->nodes[level];
2266 
2267 	search = btrfs_node_blockptr(node, slot);
2268 	blocksize = fs_info->nodesize;
2269 	eb = find_extent_buffer(fs_info, search);
2270 	if (eb) {
2271 		free_extent_buffer(eb);
2272 		return;
2273 	}
2274 
2275 	target = search;
2276 
2277 	nritems = btrfs_header_nritems(node);
2278 	nr = slot;
2279 
2280 	while (1) {
2281 		if (path->reada == READA_BACK) {
2282 			if (nr == 0)
2283 				break;
2284 			nr--;
2285 		} else if (path->reada == READA_FORWARD) {
2286 			nr++;
2287 			if (nr >= nritems)
2288 				break;
2289 		}
2290 		if (path->reada == READA_BACK && objectid) {
2291 			btrfs_node_key(node, &disk_key, nr);
2292 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2293 				break;
2294 		}
2295 		search = btrfs_node_blockptr(node, nr);
2296 		if ((search <= target && target - search <= 65536) ||
2297 		    (search > target && search - target <= 65536)) {
2298 			readahead_tree_block(fs_info, search);
2299 			nread += blocksize;
2300 		}
2301 		nscan++;
2302 		if ((nread > 65536 || nscan > 32))
2303 			break;
2304 	}
2305 }
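
/*
 * Worked example of the window above (a sketch, assuming 16K nodes): with
 * blocksize = 16384 the 65536-byte distance test admits blocks within four
 * node sizes of the target, and the nread > 65536 cap ends the scan once
 * four to five blocks have been queued for readahead.
 */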
2306 
2307 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2308 				       struct btrfs_path *path, int level)
2309 {
2310 	int slot;
2311 	int nritems;
2312 	struct extent_buffer *parent;
2313 	struct extent_buffer *eb;
2314 	u64 gen;
2315 	u64 block1 = 0;
2316 	u64 block2 = 0;
2317 
2318 	parent = path->nodes[level + 1];
2319 	if (!parent)
2320 		return;
2321 
2322 	nritems = btrfs_header_nritems(parent);
2323 	slot = path->slots[level + 1];
2324 
2325 	if (slot > 0) {
2326 		block1 = btrfs_node_blockptr(parent, slot - 1);
2327 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2328 		eb = find_extent_buffer(fs_info, block1);
2329 		/*
2330 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2331 		 * don't want to return -EAGAIN here.  That will loop
2332 		 * forever
2333 		 */
2334 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2335 			block1 = 0;
2336 		free_extent_buffer(eb);
2337 	}
2338 	if (slot + 1 < nritems) {
2339 		block2 = btrfs_node_blockptr(parent, slot + 1);
2340 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2341 		eb = find_extent_buffer(fs_info, block2);
2342 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2343 			block2 = 0;
2344 		free_extent_buffer(eb);
2345 	}
2346 
2347 	if (block1)
2348 		readahead_tree_block(fs_info, block1);
2349 	if (block2)
2350 		readahead_tree_block(fs_info, block2);
2351 }
2352 
2353 
2354 /*
2355  * when we walk down the tree, it is usually safe to unlock the higher layers
2356  * in the tree.  The exceptions are when our path goes through slot 0, because
2357  * operations on the tree might require changing key pointers higher up in the
2358  * tree.
2359  *
2360  * callers might also have set path->keep_locks, which tells this code to keep
2361  * the lock if the path points to the last slot in the block.  This is part of
2362  * walking through the tree, and selecting the next slot in the higher block.
2363  *
2364  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2365  * if lowest_unlock is 1, level 0 won't be unlocked
2366  */
2367 static noinline void unlock_up(struct btrfs_path *path, int level,
2368 			       int lowest_unlock, int min_write_lock_level,
2369 			       int *write_lock_level)
2370 {
2371 	int i;
2372 	int skip_level = level;
2373 	int no_skips = 0;
2374 	struct extent_buffer *t;
2375 
2376 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2377 		if (!path->nodes[i])
2378 			break;
2379 		if (!path->locks[i])
2380 			break;
2381 		if (!no_skips && path->slots[i] == 0) {
2382 			skip_level = i + 1;
2383 			continue;
2384 		}
2385 		if (!no_skips && path->keep_locks) {
2386 			u32 nritems;
2387 			t = path->nodes[i];
2388 			nritems = btrfs_header_nritems(t);
2389 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2390 				skip_level = i + 1;
2391 				continue;
2392 			}
2393 		}
2394 		if (skip_level < i && i >= lowest_unlock)
2395 			no_skips = 1;
2396 
2397 		t = path->nodes[i];
2398 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2399 			btrfs_tree_unlock_rw(t, path->locks[i]);
2400 			path->locks[i] = 0;
2401 			if (write_lock_level &&
2402 			    i > min_write_lock_level &&
2403 			    i <= *write_lock_level) {
2404 				*write_lock_level = i - 1;
2405 			}
2406 		}
2407 	}
2408 }
2409 
2410 /*
2411  * This releases any locks held in the path starting at level and
2412  * going all the way up to the root.
2413  *
2414  * btrfs_search_slot will keep the lock held on higher nodes in a few
2415  * corner cases, such as COW of the block at slot zero in the node.  This
2416  * ignores those rules, and it should only be called when there are no
2417  * more updates to be done higher up in the tree.
2418  */
2419 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2420 {
2421 	int i;
2422 
2423 	if (path->keep_locks)
2424 		return;
2425 
2426 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2427 		if (!path->nodes[i])
2428 			continue;
2429 		if (!path->locks[i])
2430 			continue;
2431 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2432 		path->locks[i] = 0;
2433 	}
2434 }
2435 
2436 /*
2437  * helper function for btrfs_search_slot.  The goal is to find a block
2438  * in cache without setting the path to blocking.  If we find the block
2439  * we return zero and the path is unchanged.
2440  *
2441  * If we can't find the block, we set the path blocking and do some
2442  * reada.  -EAGAIN is returned and the search must be repeated.
2443  */
2444 static int
2445 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2446 		      struct extent_buffer **eb_ret, int level, int slot,
2447 		      const struct btrfs_key *key)
2448 {
2449 	struct btrfs_fs_info *fs_info = root->fs_info;
2450 	u64 blocknr;
2451 	u64 gen;
2452 	struct extent_buffer *b = *eb_ret;
2453 	struct extent_buffer *tmp;
2454 	int ret;
2455 
2456 	blocknr = btrfs_node_blockptr(b, slot);
2457 	gen = btrfs_node_ptr_generation(b, slot);
2458 
2459 	tmp = find_extent_buffer(fs_info, blocknr);
2460 	if (tmp) {
2461 		/* first we do an atomic uptodate check */
2462 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2463 			*eb_ret = tmp;
2464 			return 0;
2465 		}
2466 
2467 		/* the pages were up to date, but we failed
2468 		 * the generation number check.  Do a full
2469 		 * read for the generation number that is correct.
2470 		 * We must do this without dropping locks so
2471 		 * we can trust our generation number
2472 		 */
2473 		btrfs_set_path_blocking(p);
2474 
2475 		/* now we're allowed to do a blocking uptodate check */
2476 		ret = btrfs_read_buffer(tmp, gen);
2477 		if (!ret) {
2478 			*eb_ret = tmp;
2479 			return 0;
2480 		}
2481 		free_extent_buffer(tmp);
2482 		btrfs_release_path(p);
2483 		return -EIO;
2484 	}
2485 
2486 	/*
2487 	 * reduce lock contention at high levels
2488 	 * of the btree by dropping locks before
2489 	 * we read.  Don't release the lock on the current
2490 	 * level because we need to walk this node to figure
2491 	 * out which blocks to read.
2492 	 */
2493 	btrfs_unlock_up_safe(p, level + 1);
2494 	btrfs_set_path_blocking(p);
2495 
2496 	free_extent_buffer(tmp);
2497 	if (p->reada != READA_NONE)
2498 		reada_for_search(fs_info, p, level, slot, key->objectid);
2499 
2500 	btrfs_release_path(p);
2501 
2502 	ret = -EAGAIN;
2503 	tmp = read_tree_block(fs_info, blocknr, 0);
2504 	if (!IS_ERR(tmp)) {
2505 		/*
2506 		 * If the read above didn't mark this buffer up to date,
2507 		 * it will never end up being up to date.  Set ret to -EIO now
2508 		 * and give up so that our caller doesn't loop forever
2509 		 * on our EAGAINs.
2510 		 */
2511 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2512 			ret = -EIO;
2513 		free_extent_buffer(tmp);
2514 	} else {
2515 		ret = PTR_ERR(tmp);
2516 	}
2517 	return ret;
2518 }
2519 
2520 /*
2521  * helper function for btrfs_search_slot.  This does all of the checks
2522  * for node-level blocks and does any balancing required based on
2523  * the ins_len.
2524  *
2525  * If no extra work was required, zero is returned.  If we had to
2526  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2527  * start over
2528  */
2529 static int
2530 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2531 		       struct btrfs_root *root, struct btrfs_path *p,
2532 		       struct extent_buffer *b, int level, int ins_len,
2533 		       int *write_lock_level)
2534 {
2535 	struct btrfs_fs_info *fs_info = root->fs_info;
2536 	int ret;
2537 
2538 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2539 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2540 		int sret;
2541 
2542 		if (*write_lock_level < level + 1) {
2543 			*write_lock_level = level + 1;
2544 			btrfs_release_path(p);
2545 			goto again;
2546 		}
2547 
2548 		btrfs_set_path_blocking(p);
2549 		reada_for_balance(fs_info, p, level);
2550 		sret = split_node(trans, root, p, level);
2551 		btrfs_clear_path_blocking(p, NULL, 0);
2552 
2553 		BUG_ON(sret > 0);
2554 		if (sret) {
2555 			ret = sret;
2556 			goto done;
2557 		}
2558 		b = p->nodes[level];
2559 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2560 		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2561 		int sret;
2562 
2563 		if (*write_lock_level < level + 1) {
2564 			*write_lock_level = level + 1;
2565 			btrfs_release_path(p);
2566 			goto again;
2567 		}
2568 
2569 		btrfs_set_path_blocking(p);
2570 		reada_for_balance(fs_info, p, level);
2571 		sret = balance_level(trans, root, p, level);
2572 		btrfs_clear_path_blocking(p, NULL, 0);
2573 
2574 		if (sret) {
2575 			ret = sret;
2576 			goto done;
2577 		}
2578 		b = p->nodes[level];
2579 		if (!b) {
2580 			btrfs_release_path(p);
2581 			goto again;
2582 		}
2583 		BUG_ON(btrfs_header_nritems(b) == 1);
2584 	}
2585 	return 0;
2586 
2587 again:
2588 	ret = -EAGAIN;
2589 done:
2590 	return ret;
2591 }
2592 
2593 static void key_search_validate(struct extent_buffer *b,
2594 				const struct btrfs_key *key,
2595 				int level)
2596 {
2597 #ifdef CONFIG_BTRFS_ASSERT
2598 	struct btrfs_disk_key disk_key;
2599 
2600 	btrfs_cpu_key_to_disk(&disk_key, key);
2601 
2602 	if (level == 0)
2603 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2604 		    offsetof(struct btrfs_leaf, items[0].key),
2605 		    sizeof(disk_key)));
2606 	else
2607 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2608 		    offsetof(struct btrfs_node, ptrs[0].key),
2609 		    sizeof(disk_key)));
2610 #endif
2611 }
2612 
2613 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2614 		      int level, int *prev_cmp, int *slot)
2615 {
2616 	if (*prev_cmp != 0) {
2617 		*prev_cmp = bin_search(b, key, level, slot);
2618 		return *prev_cmp;
2619 	}
2620 
2621 	key_search_validate(b, key, level);
2622 	*slot = 0;
2623 
2624 	return 0;
2625 }
2626 
2627 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2628 		u64 iobjectid, u64 ioff, u8 key_type,
2629 		struct btrfs_key *found_key)
2630 {
2631 	int ret;
2632 	struct btrfs_key key;
2633 	struct extent_buffer *eb;
2634 
2635 	ASSERT(path);
2636 	ASSERT(found_key);
2637 
2638 	key.type = key_type;
2639 	key.objectid = iobjectid;
2640 	key.offset = ioff;
2641 
2642 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2643 	if (ret < 0)
2644 		return ret;
2645 
2646 	eb = path->nodes[0];
2647 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2648 		ret = btrfs_next_leaf(fs_root, path);
2649 		if (ret)
2650 			return ret;
2651 		eb = path->nodes[0];
2652 	}
2653 
2654 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2655 	if (found_key->type != key.type ||
2656 			found_key->objectid != key.objectid)
2657 		return 1;
2658 
2659 	return 0;
2660 }
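
/*
 * Illustrative only (not part of the original file): a sketch of a
 * btrfs_find_item() lookup.  The helper name and the key type shown are
 * hypothetical example values.
 */
static int example_find_inode_item(struct btrfs_root *fs_root, u64 ino)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_find_item(fs_root, path, ino, 0,
			      BTRFS_INODE_ITEM_KEY, &found_key);
	/* ret == 0: found_key describes the item at path->slots[0] */
	btrfs_free_path(path);
	return ret;
}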
2661 
2662 /*
2663  * look for key in the tree.  path is filled in with nodes along the way.
2664  * if key is found, we return zero and you can find the item in the leaf
2665  * level of the path (level 0)
2666  *
2667  * If the key isn't found, the path points to the slot where it should
2668  * be inserted, and 1 is returned.  If there are other errors during the
2669  * search a negative error number is returned.
2670  *
2671  * if ins_len > 0, nodes and leaves will be split as we walk down the
2672  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2673  * possible)
2674  */
2675 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2676 		      const struct btrfs_key *key, struct btrfs_path *p,
2677 		      int ins_len, int cow)
2678 {
2679 	struct btrfs_fs_info *fs_info = root->fs_info;
2680 	struct extent_buffer *b;
2681 	int slot;
2682 	int ret;
2683 	int err;
2684 	int level;
2685 	int lowest_unlock = 1;
2686 	int root_lock;
2687 	/* everything at write_lock_level or lower must be write locked */
2688 	int write_lock_level = 0;
2689 	u8 lowest_level = 0;
2690 	int min_write_lock_level;
2691 	int prev_cmp;
2692 
2693 	lowest_level = p->lowest_level;
2694 	WARN_ON(lowest_level && ins_len > 0);
2695 	WARN_ON(p->nodes[0] != NULL);
2696 	BUG_ON(!cow && ins_len);
2697 
2698 	if (ins_len < 0) {
2699 		lowest_unlock = 2;
2700 
2701 		/* when we are removing items, we might have to go up to level
2702 		 * two as we update tree pointers.  Make sure we keep write
2703 		 * locks for those levels as well
2704 		 */
2705 		write_lock_level = 2;
2706 	} else if (ins_len > 0) {
2707 		/*
2708 		 * for inserting items, make sure we have a write lock on
2709 		 * level 1 so we can update keys
2710 		 */
2711 		write_lock_level = 1;
2712 	}
2713 
2714 	if (!cow)
2715 		write_lock_level = -1;
2716 
2717 	if (cow && (p->keep_locks || p->lowest_level))
2718 		write_lock_level = BTRFS_MAX_LEVEL;
2719 
2720 	min_write_lock_level = write_lock_level;
2721 
2722 again:
2723 	prev_cmp = -1;
2724 	/*
2725 	 * we try very hard to do read locks on the root
2726 	 */
2727 	root_lock = BTRFS_READ_LOCK;
2728 	level = 0;
2729 	if (p->search_commit_root) {
2730 		/*
2731 		 * the commit roots are read only
2732 		 * so we always do read locks
2733 		 */
2734 		if (p->need_commit_sem)
2735 			down_read(&fs_info->commit_root_sem);
2736 		b = root->commit_root;
2737 		extent_buffer_get(b);
2738 		level = btrfs_header_level(b);
2739 		if (p->need_commit_sem)
2740 			up_read(&fs_info->commit_root_sem);
2741 		if (!p->skip_locking)
2742 			btrfs_tree_read_lock(b);
2743 	} else {
2744 		if (p->skip_locking) {
2745 			b = btrfs_root_node(root);
2746 			level = btrfs_header_level(b);
2747 		} else {
2748 			/* we don't know the level of the root node
2749 			 * until we actually have it read locked
2750 			 */
2751 			b = btrfs_read_lock_root_node(root);
2752 			level = btrfs_header_level(b);
2753 			if (level <= write_lock_level) {
2754 				/* whoops, must trade for write lock */
2755 				btrfs_tree_read_unlock(b);
2756 				free_extent_buffer(b);
2757 				b = btrfs_lock_root_node(root);
2758 				root_lock = BTRFS_WRITE_LOCK;
2759 
2760 				/* the level might have changed, check again */
2761 				level = btrfs_header_level(b);
2762 			}
2763 		}
2764 	}
2765 	p->nodes[level] = b;
2766 	if (!p->skip_locking)
2767 		p->locks[level] = root_lock;
2768 
2769 	while (b) {
2770 		level = btrfs_header_level(b);
2771 
2772 		/*
2773 		 * setup the path here so we can release it under lock
2774 		 * contention with the cow code
2775 		 */
2776 		if (cow) {
2777 			/*
2778 			 * if we don't really need to cow this block
2779 			 * then we don't want to set the path blocking,
2780 			 * so we test it here
2781 			 */
2782 			if (!should_cow_block(trans, root, b)) {
2783 				trans->dirty = true;
2784 				goto cow_done;
2785 			}
2786 
2787 			/*
2788 			 * must have write locks on this node and the
2789 			 * parent
2790 			 */
2791 			if (level > write_lock_level ||
2792 			    (level + 1 > write_lock_level &&
2793 			    level + 1 < BTRFS_MAX_LEVEL &&
2794 			    p->nodes[level + 1])) {
2795 				write_lock_level = level + 1;
2796 				btrfs_release_path(p);
2797 				goto again;
2798 			}
2799 
2800 			btrfs_set_path_blocking(p);
2801 			err = btrfs_cow_block(trans, root, b,
2802 					      p->nodes[level + 1],
2803 					      p->slots[level + 1], &b);
2804 			if (err) {
2805 				ret = err;
2806 				goto done;
2807 			}
2808 		}
2809 cow_done:
2810 		p->nodes[level] = b;
2811 		btrfs_clear_path_blocking(p, NULL, 0);
2812 
2813 		/*
2814 		 * we have a lock on b and as long as we aren't changing
2815 		 * the tree, there is no way for the items in b to change.
2816 		 * It is safe to drop the lock on our parent before we
2817 		 * go through the expensive btree search on b.
2818 		 *
2819 		 * If we're inserting or deleting (ins_len != 0), then we might
2820 		 * be changing slot zero, which may require changing the parent.
2821 		 * So, we can't drop the lock until after we know which slot
2822 		 * we're operating on.
2823 		 */
2824 		if (!ins_len && !p->keep_locks) {
2825 			int u = level + 1;
2826 
2827 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2828 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2829 				p->locks[u] = 0;
2830 			}
2831 		}
2832 
2833 		ret = key_search(b, key, level, &prev_cmp, &slot);
2834 		if (ret < 0)
2835 			goto done;
2836 
2837 		if (level != 0) {
2838 			int dec = 0;
2839 			if (ret && slot > 0) {
2840 				dec = 1;
2841 				slot -= 1;
2842 			}
2843 			p->slots[level] = slot;
2844 			err = setup_nodes_for_search(trans, root, p, b, level,
2845 					     ins_len, &write_lock_level);
2846 			if (err == -EAGAIN)
2847 				goto again;
2848 			if (err) {
2849 				ret = err;
2850 				goto done;
2851 			}
2852 			b = p->nodes[level];
2853 			slot = p->slots[level];
2854 
2855 			/*
2856 			 * slot 0 is special: if we change the key
2857 			 * we have to update the parent pointer
2858 			 * which means we must have a write lock
2859 			 * on the parent
2860 			 */
2861 			if (slot == 0 && ins_len &&
2862 			    write_lock_level < level + 1) {
2863 				write_lock_level = level + 1;
2864 				btrfs_release_path(p);
2865 				goto again;
2866 			}
2867 
2868 			unlock_up(p, level, lowest_unlock,
2869 				  min_write_lock_level, &write_lock_level);
2870 
2871 			if (level == lowest_level) {
2872 				if (dec)
2873 					p->slots[level]++;
2874 				goto done;
2875 			}
2876 
2877 			err = read_block_for_search(root, p, &b, level,
2878 						    slot, key);
2879 			if (err == -EAGAIN)
2880 				goto again;
2881 			if (err) {
2882 				ret = err;
2883 				goto done;
2884 			}
2885 
2886 			if (!p->skip_locking) {
2887 				level = btrfs_header_level(b);
2888 				if (level <= write_lock_level) {
2889 					err = btrfs_try_tree_write_lock(b);
2890 					if (!err) {
2891 						btrfs_set_path_blocking(p);
2892 						btrfs_tree_lock(b);
2893 						btrfs_clear_path_blocking(p, b,
2894 								  BTRFS_WRITE_LOCK);
2895 					}
2896 					p->locks[level] = BTRFS_WRITE_LOCK;
2897 				} else {
2898 					err = btrfs_tree_read_lock_atomic(b);
2899 					if (!err) {
2900 						btrfs_set_path_blocking(p);
2901 						btrfs_tree_read_lock(b);
2902 						btrfs_clear_path_blocking(p, b,
2903 								  BTRFS_READ_LOCK);
2904 					}
2905 					p->locks[level] = BTRFS_READ_LOCK;
2906 				}
2907 				p->nodes[level] = b;
2908 			}
2909 		} else {
2910 			p->slots[level] = slot;
2911 			if (ins_len > 0 &&
2912 			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
2913 				if (write_lock_level < 1) {
2914 					write_lock_level = 1;
2915 					btrfs_release_path(p);
2916 					goto again;
2917 				}
2918 
2919 				btrfs_set_path_blocking(p);
2920 				err = split_leaf(trans, root, key,
2921 						 p, ins_len, ret == 0);
2922 				btrfs_clear_path_blocking(p, NULL, 0);
2923 
2924 				BUG_ON(err > 0);
2925 				if (err) {
2926 					ret = err;
2927 					goto done;
2928 				}
2929 			}
2930 			if (!p->search_for_split)
2931 				unlock_up(p, level, lowest_unlock,
2932 					  min_write_lock_level, &write_lock_level);
2933 			goto done;
2934 		}
2935 	}
2936 	ret = 1;
2937 done:
2938 	/*
2939 	 * we don't really know what they plan on doing with the path
2940 	 * from here on, so for now just mark it as blocking
2941 	 */
2942 	if (!p->leave_spinning)
2943 		btrfs_set_path_blocking(p);
2944 	if (ret < 0 && !p->skip_release_on_error)
2945 		btrfs_release_path(p);
2946 	return ret;
2947 }
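
/*
 * Illustrative only (not part of the original file): the canonical
 * read-only calling pattern for btrfs_search_slot(), as a minimal
 * sketch.  The helper name and key values are hypothetical.
 */
static int example_lookup(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	/* no transaction, ins_len = 0 and cow = 0 for a plain lookup */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match at path->nodes[0], path->slots[0] */
	} else if (ret == 1) {
		/* not found; the slot is where the key would be inserted */
	}
	btrfs_free_path(path);
	return ret;
}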
2948 
2949 /*
2950  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2951  * current state of the tree together with the operations recorded in the tree
2952  * modification log to search for the key in a previous version of this tree, as
2953  * denoted by the time_seq parameter.
2954  *
2955  * Naturally, there is no support for insert, delete or cow operations.
2956  *
2957  * The resulting path and return value will be set up as if we called
2958  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2959  */
2960 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2961 			  struct btrfs_path *p, u64 time_seq)
2962 {
2963 	struct btrfs_fs_info *fs_info = root->fs_info;
2964 	struct extent_buffer *b;
2965 	int slot;
2966 	int ret;
2967 	int err;
2968 	int level;
2969 	int lowest_unlock = 1;
2970 	u8 lowest_level = 0;
2971 	int prev_cmp = -1;
2972 
2973 	lowest_level = p->lowest_level;
2974 	WARN_ON(p->nodes[0] != NULL);
2975 
2976 	if (p->search_commit_root) {
2977 		BUG_ON(time_seq);
2978 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2979 	}
2980 
2981 again:
2982 	b = get_old_root(root, time_seq);
2983 	level = btrfs_header_level(b);
2984 	p->locks[level] = BTRFS_READ_LOCK;
2985 
2986 	while (b) {
2987 		level = btrfs_header_level(b);
2988 		p->nodes[level] = b;
2989 		btrfs_clear_path_blocking(p, NULL, 0);
2990 
2991 		/*
2992 		 * we have a lock on b and as long as we aren't changing
2993 		 * the tree, there is no way for the items in b to change.
2994 		 * It is safe to drop the lock on our parent before we
2995 		 * go through the expensive btree search on b.
2996 		 */
2997 		btrfs_unlock_up_safe(p, level + 1);
2998 
2999 		/*
3000 		 * Since we can unwind ebs we want to do a real search every
3001 		 * time.
3002 		 */
3003 		prev_cmp = -1;
3004 		ret = key_search(b, key, level, &prev_cmp, &slot);
3005 
3006 		if (level != 0) {
3007 			int dec = 0;
3008 			if (ret && slot > 0) {
3009 				dec = 1;
3010 				slot -= 1;
3011 			}
3012 			p->slots[level] = slot;
3013 			unlock_up(p, level, lowest_unlock, 0, NULL);
3014 
3015 			if (level == lowest_level) {
3016 				if (dec)
3017 					p->slots[level]++;
3018 				goto done;
3019 			}
3020 
3021 			err = read_block_for_search(root, p, &b, level,
3022 						    slot, key);
3023 			if (err == -EAGAIN)
3024 				goto again;
3025 			if (err) {
3026 				ret = err;
3027 				goto done;
3028 			}
3029 
3030 			level = btrfs_header_level(b);
3031 			err = btrfs_tree_read_lock_atomic(b);
3032 			if (!err) {
3033 				btrfs_set_path_blocking(p);
3034 				btrfs_tree_read_lock(b);
3035 				btrfs_clear_path_blocking(p, b,
3036 							  BTRFS_READ_LOCK);
3037 			}
3038 			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3039 			if (!b) {
3040 				ret = -ENOMEM;
3041 				goto done;
3042 			}
3043 			p->locks[level] = BTRFS_READ_LOCK;
3044 			p->nodes[level] = b;
3045 		} else {
3046 			p->slots[level] = slot;
3047 			unlock_up(p, level, lowest_unlock, 0, NULL);
3048 			goto done;
3049 		}
3050 	}
3051 	ret = 1;
3052 done:
3053 	if (!p->leave_spinning)
3054 		btrfs_set_path_blocking(p);
3055 	if (ret < 0)
3056 		btrfs_release_path(p);
3057 
3058 	return ret;
3059 }
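
/*
 * Illustrative only (not part of the original file): a sketch of a
 * time-travel lookup.  Obtaining time_seq (a tree mod log sequence
 * number) is out of scope here and assumed.
 */
static int example_old_lookup(struct btrfs_root *root,
			      const struct btrfs_key *key, u64 time_seq)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* same return convention as btrfs_search_slot with ins_len/cow of 0 */
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	btrfs_free_path(path);
	return ret;
}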
3060 
3061 /*
3062  * helper to use instead of search slot if no exact match is needed but
3063  * instead the next or previous item should be returned.
3064  * When find_higher is true, the next higher item is returned, the next lower
3065  * otherwise.
3066  * When return_any and find_higher are both true, and no higher item is found,
3067  * return the next lower instead.
3068  * When return_any is true and find_higher is false, and no lower item is found,
3069  * return the next higher instead.
3070  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3071  * < 0 on error
3072  */
3073 int btrfs_search_slot_for_read(struct btrfs_root *root,
3074 			       const struct btrfs_key *key,
3075 			       struct btrfs_path *p, int find_higher,
3076 			       int return_any)
3077 {
3078 	int ret;
3079 	struct extent_buffer *leaf;
3080 
3081 again:
3082 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3083 	if (ret <= 0)
3084 		return ret;
3085 	/*
3086 	 * a return value of 1 means the path is at the position where the
3087 	 * item should be inserted. Normally this is the next bigger item,
3088 	 * but in case the previous item is the last in a leaf, path points
3089 	 * to the first free slot in the previous leaf, i.e. at an invalid
3090 	 * item.
3091 	 */
3092 	leaf = p->nodes[0];
3093 
3094 	if (find_higher) {
3095 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3096 			ret = btrfs_next_leaf(root, p);
3097 			if (ret <= 0)
3098 				return ret;
3099 			if (!return_any)
3100 				return 1;
3101 			/*
3102 			 * no higher item found, return the next
3103 			 * lower instead
3104 			 */
3105 			return_any = 0;
3106 			find_higher = 0;
3107 			btrfs_release_path(p);
3108 			goto again;
3109 		}
3110 	} else {
3111 		if (p->slots[0] == 0) {
3112 			ret = btrfs_prev_leaf(root, p);
3113 			if (ret < 0)
3114 				return ret;
3115 			if (!ret) {
3116 				leaf = p->nodes[0];
3117 				if (p->slots[0] == btrfs_header_nritems(leaf))
3118 					p->slots[0]--;
3119 				return 0;
3120 			}
3121 			if (!return_any)
3122 				return 1;
3123 			/*
3124 			 * no lower item found, return the next
3125 			 * higher instead
3126 			 */
3127 			return_any = 0;
3128 			find_higher = 1;
3129 			btrfs_release_path(p);
3130 			goto again;
3131 		} else {
3132 			--p->slots[0];
3133 		}
3134 	}
3135 	return 0;
3136 }
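
/*
 * Illustrative only (not part of the original file): find the item at
 * or after a hypothetical key, falling back to the next lower item if
 * nothing higher exists (find_higher = 1, return_any = 1).
 */
static int example_find_nearest(struct btrfs_root *root, u64 objectid,
				struct btrfs_key *found)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
	btrfs_free_path(path);
	return ret;
}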
3137 
3138 /*
3139  * adjust the pointers going up the tree, starting at level,
3140  * making sure the right key of each node points to 'key'.
3141  * This is used after shifting pointers to the left, so it stops
3142  * fixing up pointers when a given leaf/node is not in slot 0 of the
3143  * higher levels
3144  *
3145  */
3146 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3147 			   struct btrfs_path *path,
3148 			   struct btrfs_disk_key *key, int level)
3149 {
3150 	int i;
3151 	struct extent_buffer *t;
3152 
3153 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3154 		int tslot = path->slots[i];
3155 		if (!path->nodes[i])
3156 			break;
3157 		t = path->nodes[i];
3158 		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3159 		btrfs_set_node_key(t, key, tslot);
3160 		btrfs_mark_buffer_dirty(path->nodes[i]);
3161 		if (tslot != 0)
3162 			break;
3163 	}
3164 }
3165 
3166 /*
3167  * update item key.
3168  *
3169  * This function isn't completely safe. It's the caller's responsibility
3170  * to ensure that the new key won't break the order
3171  */
3172 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3173 			     struct btrfs_path *path,
3174 			     const struct btrfs_key *new_key)
3175 {
3176 	struct btrfs_disk_key disk_key;
3177 	struct extent_buffer *eb;
3178 	int slot;
3179 
3180 	eb = path->nodes[0];
3181 	slot = path->slots[0];
3182 	if (slot > 0) {
3183 		btrfs_item_key(eb, &disk_key, slot - 1);
3184 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3185 	}
3186 	if (slot < btrfs_header_nritems(eb) - 1) {
3187 		btrfs_item_key(eb, &disk_key, slot + 1);
3188 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3189 	}
3190 
3191 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3192 	btrfs_set_item_key(eb, &disk_key, slot);
3193 	btrfs_mark_buffer_dirty(eb);
3194 	if (slot == 0)
3195 		fixup_low_keys(fs_info, path, &disk_key, 1);
3196 }
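
/*
 * Illustrative only (not part of the original file): the intended use,
 * as a sketch.  The path is already positioned at the item whose key
 * changes; keeping the leaf ordered is the caller's job.
 */
static void example_change_key_offset(struct btrfs_fs_info *fs_info,
				      struct btrfs_path *path,
				      u64 new_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;	/* must not reorder the leaf */
	btrfs_set_item_key_safe(fs_info, path, &new_key);
}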
3197 
3198 /*
3199  * try to push data from one node into the next node left in the
3200  * tree.
3201  *
3202  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3203  * error, and > 0 if there was no room in the left hand block.
3204  */
3205 static int push_node_left(struct btrfs_trans_handle *trans,
3206 			  struct btrfs_fs_info *fs_info,
3207 			  struct extent_buffer *dst,
3208 			  struct extent_buffer *src, int empty)
3209 {
3210 	int push_items = 0;
3211 	int src_nritems;
3212 	int dst_nritems;
3213 	int ret = 0;
3214 
3215 	src_nritems = btrfs_header_nritems(src);
3216 	dst_nritems = btrfs_header_nritems(dst);
3217 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3218 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3219 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3220 
3221 	if (!empty && src_nritems <= 8)
3222 		return 1;
3223 
3224 	if (push_items <= 0)
3225 		return 1;
3226 
3227 	if (empty) {
3228 		push_items = min(src_nritems, push_items);
3229 		if (push_items < src_nritems) {
3230 			/* leave at least 8 pointers in the node if
3231 			 * we aren't going to empty it
3232 			 */
3233 			if (src_nritems - push_items < 8) {
3234 				if (push_items <= 8)
3235 					return 1;
3236 				push_items -= 8;
3237 			}
3238 		}
3239 	} else
3240 		push_items = min(src_nritems - 8, push_items);
3241 
3242 	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3243 				   push_items);
3244 	if (ret) {
3245 		btrfs_abort_transaction(trans, ret);
3246 		return ret;
3247 	}
3248 	copy_extent_buffer(dst, src,
3249 			   btrfs_node_key_ptr_offset(dst_nritems),
3250 			   btrfs_node_key_ptr_offset(0),
3251 			   push_items * sizeof(struct btrfs_key_ptr));
3252 
3253 	if (push_items < src_nritems) {
3254 		/*
3255 		 * don't call tree_mod_log_eb_move here, key removal was already
3256 		 * fully logged by tree_mod_log_eb_copy above.
3257 		 */
3258 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3259 				      btrfs_node_key_ptr_offset(push_items),
3260 				      (src_nritems - push_items) *
3261 				      sizeof(struct btrfs_key_ptr));
3262 	}
3263 	btrfs_set_header_nritems(src, src_nritems - push_items);
3264 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3265 	btrfs_mark_buffer_dirty(src);
3266 	btrfs_mark_buffer_dirty(dst);
3267 
3268 	return ret;
3269 }
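
/*
 * Worked example of the !empty clamp above (a sketch): with
 * src_nritems = 20 and room in dst for 15 pointers, push_items becomes
 * min(20 - 8, 15) = 12, leaving the required 8 pointers in src.
 */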
3270 
3271 /*
3272  * try to push data from one node into the next node right in the
3273  * tree.
3274  *
3275  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3276  * error, and > 0 if there was no room in the right hand block.
3277  *
3278  * this will only push up to 1/2 the contents of the left node over
3279  */
3280 static int balance_node_right(struct btrfs_trans_handle *trans,
3281 			      struct btrfs_fs_info *fs_info,
3282 			      struct extent_buffer *dst,
3283 			      struct extent_buffer *src)
3284 {
3285 	int push_items = 0;
3286 	int max_push;
3287 	int src_nritems;
3288 	int dst_nritems;
3289 	int ret = 0;
3290 
3291 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3292 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3293 
3294 	src_nritems = btrfs_header_nritems(src);
3295 	dst_nritems = btrfs_header_nritems(dst);
3296 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3297 	if (push_items <= 0)
3298 		return 1;
3299 
3300 	if (src_nritems < 4)
3301 		return 1;
3302 
3303 	max_push = src_nritems / 2 + 1;
3304 	/* don't try to empty the node */
3305 	if (max_push >= src_nritems)
3306 		return 1;
3307 
3308 	if (max_push < push_items)
3309 		push_items = max_push;
3310 
3311 	tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems);
3312 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3313 				      btrfs_node_key_ptr_offset(0),
3314 				      (dst_nritems) *
3315 				      sizeof(struct btrfs_key_ptr));
3316 
3317 	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3318 				   src_nritems - push_items, push_items);
3319 	if (ret) {
3320 		btrfs_abort_transaction(trans, ret);
3321 		return ret;
3322 	}
3323 	copy_extent_buffer(dst, src,
3324 			   btrfs_node_key_ptr_offset(0),
3325 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3326 			   push_items * sizeof(struct btrfs_key_ptr));
3327 
3328 	btrfs_set_header_nritems(src, src_nritems - push_items);
3329 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3330 
3331 	btrfs_mark_buffer_dirty(src);
3332 	btrfs_mark_buffer_dirty(dst);
3333 
3334 	return ret;
3335 }
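
/*
 * Worked example of the cap above (a sketch): with src_nritems = 10,
 * max_push = 10 / 2 + 1 = 6, so at most 6 pointers move right and the
 * source node keeps at least 4.
 */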
3336 
3337 /*
3338  * helper function to insert a new root level in the tree.
3339  * A new node is allocated, and a single item is inserted to
3340  * point to the existing root
3341  *
3342  * returns zero on success or < 0 on failure.
3343  */
3344 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3345 			   struct btrfs_root *root,
3346 			   struct btrfs_path *path, int level)
3347 {
3348 	struct btrfs_fs_info *fs_info = root->fs_info;
3349 	u64 lower_gen;
3350 	struct extent_buffer *lower;
3351 	struct extent_buffer *c;
3352 	struct extent_buffer *old;
3353 	struct btrfs_disk_key lower_key;
3354 
3355 	BUG_ON(path->nodes[level]);
3356 	BUG_ON(path->nodes[level-1] != root->node);
3357 
3358 	lower = path->nodes[level-1];
3359 	if (level == 1)
3360 		btrfs_item_key(lower, &lower_key, 0);
3361 	else
3362 		btrfs_node_key(lower, &lower_key, 0);
3363 
3364 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3365 				   &lower_key, level, root->node->start, 0);
3366 	if (IS_ERR(c))
3367 		return PTR_ERR(c);
3368 
3369 	root_add_used(root, fs_info->nodesize);
3370 
3371 	memzero_extent_buffer(c, 0, sizeof(struct btrfs_header));
3372 	btrfs_set_header_nritems(c, 1);
3373 	btrfs_set_header_level(c, level);
3374 	btrfs_set_header_bytenr(c, c->start);
3375 	btrfs_set_header_generation(c, trans->transid);
3376 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3377 	btrfs_set_header_owner(c, root->root_key.objectid);
3378 
3379 	write_extent_buffer_fsid(c, fs_info->fsid);
3380 	write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid);
3381 
3382 	btrfs_set_node_key(c, &lower_key, 0);
3383 	btrfs_set_node_blockptr(c, 0, lower->start);
3384 	lower_gen = btrfs_header_generation(lower);
3385 	WARN_ON(lower_gen != trans->transid);
3386 
3387 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3388 
3389 	btrfs_mark_buffer_dirty(c);
3390 
3391 	old = root->node;
3392 	tree_mod_log_set_root_pointer(root, c, 0);
3393 	rcu_assign_pointer(root->node, c);
3394 
3395 	/* the super has an extra ref to root->node */
3396 	free_extent_buffer(old);
3397 
3398 	add_root_to_dirty_list(root);
3399 	extent_buffer_get(c);
3400 	path->nodes[level] = c;
3401 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3402 	path->slots[level] = 0;
3403 	return 0;
3404 }
3405 
3406 /*
3407  * worker function to insert a single pointer in a node.
3408  * the node should have enough room for the pointer already
3409  *
3410  * slot and level indicate where you want the key to go, and
3411  * blocknr is the block the key points to.
3412  */
3413 static void insert_ptr(struct btrfs_trans_handle *trans,
3414 		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3415 		       struct btrfs_disk_key *key, u64 bytenr,
3416 		       int slot, int level)
3417 {
3418 	struct extent_buffer *lower;
3419 	int nritems;
3420 	int ret;
3421 
3422 	BUG_ON(!path->nodes[level]);
3423 	btrfs_assert_tree_locked(path->nodes[level]);
3424 	lower = path->nodes[level];
3425 	nritems = btrfs_header_nritems(lower);
3426 	BUG_ON(slot > nritems);
3427 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3428 	if (slot != nritems) {
3429 		if (level)
3430 			tree_mod_log_eb_move(fs_info, lower, slot + 1,
3431 					     slot, nritems - slot);
3432 		memmove_extent_buffer(lower,
3433 			      btrfs_node_key_ptr_offset(slot + 1),
3434 			      btrfs_node_key_ptr_offset(slot),
3435 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3436 	}
3437 	if (level) {
3438 		ret = tree_mod_log_insert_key(fs_info, lower, slot,
3439 					      MOD_LOG_KEY_ADD, GFP_NOFS);
3440 		BUG_ON(ret < 0);
3441 	}
3442 	btrfs_set_node_key(lower, key, slot);
3443 	btrfs_set_node_blockptr(lower, slot, bytenr);
3444 	WARN_ON(trans->transid == 0);
3445 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3446 	btrfs_set_header_nritems(lower, nritems + 1);
3447 	btrfs_mark_buffer_dirty(lower);
3448 }
3449 
3450 /*
3451  * split the node at the specified level in path in two.
3452  * The path is corrected to point to the appropriate node after the split
3453  *
3454  * Before splitting, this tries to make some room in the node by pushing
3455  * left and right.  If either one works, it returns right away.
3456  *
3457  * returns 0 on success and < 0 on failure
3458  */
3459 static noinline int split_node(struct btrfs_trans_handle *trans,
3460 			       struct btrfs_root *root,
3461 			       struct btrfs_path *path, int level)
3462 {
3463 	struct btrfs_fs_info *fs_info = root->fs_info;
3464 	struct extent_buffer *c;
3465 	struct extent_buffer *split;
3466 	struct btrfs_disk_key disk_key;
3467 	int mid;
3468 	int ret;
3469 	u32 c_nritems;
3470 
3471 	c = path->nodes[level];
3472 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3473 	if (c == root->node) {
3474 		/*
3475 		 * trying to split the root, let's make a new one
3476 		 *
3477 		 * tree mod log: We don't log removal of the old root in
3478 		 * insert_new_root, because that root buffer will be kept as a
3479 		 * normal node. We are going to log removal of half of the
3480 		 * elements below with tree_mod_log_eb_copy. We're holding a
3481 		 * tree lock on the buffer, which is why we cannot race with
3482 		 * other tree_mod_log users.
3483 		 */
3484 		ret = insert_new_root(trans, root, path, level + 1);
3485 		if (ret)
3486 			return ret;
3487 	} else {
3488 		ret = push_nodes_for_insert(trans, root, path, level);
3489 		c = path->nodes[level];
3490 		if (!ret && btrfs_header_nritems(c) <
3491 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3492 			return 0;
3493 		if (ret < 0)
3494 			return ret;
3495 	}
3496 
3497 	c_nritems = btrfs_header_nritems(c);
3498 	mid = (c_nritems + 1) / 2;
3499 	btrfs_node_key(c, &disk_key, mid);
3500 
3501 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3502 			&disk_key, level, c->start, 0);
3503 	if (IS_ERR(split))
3504 		return PTR_ERR(split);
3505 
3506 	root_add_used(root, fs_info->nodesize);
3507 
3508 	memzero_extent_buffer(split, 0, sizeof(struct btrfs_header));
3509 	btrfs_set_header_level(split, btrfs_header_level(c));
3510 	btrfs_set_header_bytenr(split, split->start);
3511 	btrfs_set_header_generation(split, trans->transid);
3512 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3513 	btrfs_set_header_owner(split, root->root_key.objectid);
3514 	write_extent_buffer_fsid(split, fs_info->fsid);
3515 	write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid);
3516 
3517 	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3518 	if (ret) {
3519 		btrfs_abort_transaction(trans, ret);
3520 		return ret;
3521 	}
3522 	copy_extent_buffer(split, c,
3523 			   btrfs_node_key_ptr_offset(0),
3524 			   btrfs_node_key_ptr_offset(mid),
3525 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3526 	btrfs_set_header_nritems(split, c_nritems - mid);
3527 	btrfs_set_header_nritems(c, mid);
3528 	ret = 0;
3529 
3530 	btrfs_mark_buffer_dirty(c);
3531 	btrfs_mark_buffer_dirty(split);
3532 
3533 	insert_ptr(trans, fs_info, path, &disk_key, split->start,
3534 		   path->slots[level + 1] + 1, level + 1);
3535 
3536 	if (path->slots[level] >= mid) {
3537 		path->slots[level] -= mid;
3538 		btrfs_tree_unlock(c);
3539 		free_extent_buffer(c);
3540 		path->nodes[level] = split;
3541 		path->slots[level + 1] += 1;
3542 	} else {
3543 		btrfs_tree_unlock(split);
3544 		free_extent_buffer(split);
3545 	}
3546 	return ret;
3547 }
3548 
3549 /*
3550  * how many bytes are required to store the items in a leaf.  start
3551  * and nr indicate which items in the leaf to check.  This totals up the
3552  * space used both by the item structs and the item data
3553  */
3554 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3555 {
3556 	struct btrfs_item *start_item;
3557 	struct btrfs_item *end_item;
3558 	struct btrfs_map_token token;
3559 	int data_len;
3560 	int nritems = btrfs_header_nritems(l);
3561 	int end = min(nritems, start + nr) - 1;
3562 
3563 	if (!nr)
3564 		return 0;
3565 	btrfs_init_map_token(&token);
3566 	start_item = btrfs_item_nr(start);
3567 	end_item = btrfs_item_nr(end);
3568 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3569 		btrfs_token_item_size(l, start_item, &token);
3570 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3571 	data_len += sizeof(struct btrfs_item) * nr;
3572 	WARN_ON(data_len < 0);
3573 	return data_len;
3574 }
3575 
3576 /*
3577  * The space between the end of the leaf items and
3578  * the start of the leaf data.  IOW, how much room
3579  * the leaf has left for both items and data
3580  */
3581 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3582 				   struct extent_buffer *leaf)
3583 {
3584 	int nritems = btrfs_header_nritems(leaf);
3585 	int ret;
3586 
3587 	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3588 	if (ret < 0) {
3589 		btrfs_crit(fs_info,
3590 			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3591 			   ret,
3592 			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3593 			   leaf_space_used(leaf, 0, nritems), nritems);
3594 	}
3595 	return ret;
3596 }
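
/*
 * Worked example (a sketch): in a leaf with nritems = 100 and 4000
 * bytes of item data, leaf_space_used() is 100 * sizeof(struct
 * btrfs_item) + 4000, and the free space is BTRFS_LEAF_DATA_SIZE(fs_info)
 * minus that total.
 */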
3597 
3598 /*
3599  * min slot controls the lowest index we're willing to push to the
3600  * right.  We'll push up to and including min_slot, but no lower
3601  */
3602 static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3603 				      struct btrfs_path *path,
3604 				      int data_size, int empty,
3605 				      struct extent_buffer *right,
3606 				      int free_space, u32 left_nritems,
3607 				      u32 min_slot)
3608 {
3609 	struct extent_buffer *left = path->nodes[0];
3610 	struct extent_buffer *upper = path->nodes[1];
3611 	struct btrfs_map_token token;
3612 	struct btrfs_disk_key disk_key;
3613 	int slot;
3614 	u32 i;
3615 	int push_space = 0;
3616 	int push_items = 0;
3617 	struct btrfs_item *item;
3618 	u32 nr;
3619 	u32 right_nritems;
3620 	u32 data_end;
3621 	u32 this_item_size;
3622 
3623 	btrfs_init_map_token(&token);
3624 
3625 	if (empty)
3626 		nr = 0;
3627 	else
3628 		nr = max_t(u32, 1, min_slot);
3629 
3630 	if (path->slots[0] >= left_nritems)
3631 		push_space += data_size;
3632 
3633 	slot = path->slots[1];
3634 	i = left_nritems - 1;
3635 	while (i >= nr) {
3636 		item = btrfs_item_nr(i);
3637 
3638 		if (!empty && push_items > 0) {
3639 			if (path->slots[0] > i)
3640 				break;
3641 			if (path->slots[0] == i) {
3642 				int space = btrfs_leaf_free_space(fs_info, left);
3643 				if (space + push_space * 2 > free_space)
3644 					break;
3645 			}
3646 		}
3647 
3648 		if (path->slots[0] == i)
3649 			push_space += data_size;
3650 
3651 		this_item_size = btrfs_item_size(left, item);
3652 		if (this_item_size + sizeof(*item) + push_space > free_space)
3653 			break;
3654 
3655 		push_items++;
3656 		push_space += this_item_size + sizeof(*item);
3657 		if (i == 0)
3658 			break;
3659 		i--;
3660 	}
3661 
3662 	if (push_items == 0)
3663 		goto out_unlock;
3664 
3665 	WARN_ON(!empty && push_items == left_nritems);
3666 
3667 	/* push left to right */
3668 	right_nritems = btrfs_header_nritems(right);
3669 
3670 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3671 	push_space -= leaf_data_end(fs_info, left);
3672 
3673 	/* make room in the right data area */
3674 	data_end = leaf_data_end(fs_info, right);
3675 	memmove_extent_buffer(right,
3676 			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3677 			      BTRFS_LEAF_DATA_OFFSET + data_end,
3678 			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3679 
3680 	/* copy from the left data area */
3681 	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3682 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3683 		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3684 		     push_space);
3685 
3686 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3687 			      btrfs_item_nr_offset(0),
3688 			      right_nritems * sizeof(struct btrfs_item));
3689 
3690 	/* copy the items from left to right */
3691 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3692 		   btrfs_item_nr_offset(left_nritems - push_items),
3693 		   push_items * sizeof(struct btrfs_item));
3694 
3695 	/* update the item pointers */
3696 	right_nritems += push_items;
3697 	btrfs_set_header_nritems(right, right_nritems);
3698 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3699 	for (i = 0; i < right_nritems; i++) {
3700 		item = btrfs_item_nr(i);
3701 		push_space -= btrfs_token_item_size(right, item, &token);
3702 		btrfs_set_token_item_offset(right, item, push_space, &token);
3703 	}
3704 
3705 	left_nritems -= push_items;
3706 	btrfs_set_header_nritems(left, left_nritems);
3707 
3708 	if (left_nritems)
3709 		btrfs_mark_buffer_dirty(left);
3710 	else
3711 		clean_tree_block(fs_info, left);
3712 
3713 	btrfs_mark_buffer_dirty(right);
3714 
3715 	btrfs_item_key(right, &disk_key, 0);
3716 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3717 	btrfs_mark_buffer_dirty(upper);
3718 
3719 	/* then fixup the leaf pointer in the path */
3720 	if (path->slots[0] >= left_nritems) {
3721 		path->slots[0] -= left_nritems;
3722 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3723 			clean_tree_block(fs_info, path->nodes[0]);
3724 		btrfs_tree_unlock(path->nodes[0]);
3725 		free_extent_buffer(path->nodes[0]);
3726 		path->nodes[0] = right;
3727 		path->slots[1] += 1;
3728 	} else {
3729 		btrfs_tree_unlock(right);
3730 		free_extent_buffer(right);
3731 	}
3732 	return 0;
3733 
3734 out_unlock:
3735 	btrfs_tree_unlock(right);
3736 	free_extent_buffer(right);
3737 	return 1;
3738 }
3739 
3740 /*
3741  * push some data in the path leaf to the right, trying to free up at
3742  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3743  *
3744  * returns 1 if the push failed because the other node didn't have enough
3745  * room, 0 if everything worked out and < 0 if there were major errors.
3746  *
3747  * this will push starting from min_slot to the end of the leaf.  It won't
3748  * push any slot lower than min_slot
3749  */
3750 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3751 			   *root, struct btrfs_path *path,
3752 			   int min_data_size, int data_size,
3753 			   int empty, u32 min_slot)
3754 {
3755 	struct btrfs_fs_info *fs_info = root->fs_info;
3756 	struct extent_buffer *left = path->nodes[0];
3757 	struct extent_buffer *right;
3758 	struct extent_buffer *upper;
3759 	int slot;
3760 	int free_space;
3761 	u32 left_nritems;
3762 	int ret;
3763 
3764 	if (!path->nodes[1])
3765 		return 1;
3766 
3767 	slot = path->slots[1];
3768 	upper = path->nodes[1];
3769 	if (slot >= btrfs_header_nritems(upper) - 1)
3770 		return 1;
3771 
3772 	btrfs_assert_tree_locked(path->nodes[1]);
3773 
3774 	right = read_node_slot(fs_info, upper, slot + 1);
3775 	/*
3776 	 * slot + 1 is not valid or we fail to read the right node,
3777 	 * no big deal, just return.
3778 	 */
3779 	if (IS_ERR(right))
3780 		return 1;
3781 
3782 	btrfs_tree_lock(right);
3783 	btrfs_set_lock_blocking(right);
3784 
3785 	free_space = btrfs_leaf_free_space(fs_info, right);
3786 	if (free_space < data_size)
3787 		goto out_unlock;
3788 
3789 	/* cow and double check */
3790 	ret = btrfs_cow_block(trans, root, right, upper,
3791 			      slot + 1, &right);
3792 	if (ret)
3793 		goto out_unlock;
3794 
3795 	free_space = btrfs_leaf_free_space(fs_info, right);
3796 	if (free_space < data_size)
3797 		goto out_unlock;
3798 
3799 	left_nritems = btrfs_header_nritems(left);
3800 	if (left_nritems == 0)
3801 		goto out_unlock;
3802 
3803 	if (path->slots[0] == left_nritems && !empty) {
3804 		/* Key greater than all keys in the leaf, right neighbor has
3805 		 * enough room for it and we're not emptying our leaf to delete
3806 		 * it, therefore use the right neighbor to insert the new item;
3807 		 * no need to touch/dirty our left leaf. */
3808 		btrfs_tree_unlock(left);
3809 		free_extent_buffer(left);
3810 		path->nodes[0] = right;
3811 		path->slots[0] = 0;
3812 		path->slots[1]++;
3813 		return 0;
3814 	}
3815 
3816 	return __push_leaf_right(fs_info, path, min_data_size, empty,
3817 				right, free_space, left_nritems, min_slot);
3818 out_unlock:
3819 	btrfs_tree_unlock(right);
3820 	free_extent_buffer(right);
3821 	return 1;
3822 }
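
/*
 * How callers consume the return values of the push helpers, sketched
 * from the pattern split_leaf() below actually uses (variable names are
 * illustrative):
 *
 *	wret = push_leaf_right(trans, root, path, space, space, 0, 0);
 *	if (wret < 0)
 *		return wret;			hard error, propagate it
 *	if (wret) {
 *		1 means "no room on the right", so try the left side
 *		wret = push_leaf_left(trans, root, path, space, space,
 *				      0, (u32)-1);
 *		if (wret < 0)
 *			return wret;
 *	}
 */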
3823 
3824 /*
3825  * push some data in the path leaf to the left, trying to free up at
3826  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3827  *
3828  * max_slot can put a limit on how far into the leaf we'll push items.  The
3829  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3830  * items
3831  */
3832 static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
3833 				     struct btrfs_path *path, int data_size,
3834 				     int empty, struct extent_buffer *left,
3835 				     int free_space, u32 right_nritems,
3836 				     u32 max_slot)
3837 {
3838 	struct btrfs_disk_key disk_key;
3839 	struct extent_buffer *right = path->nodes[0];
3840 	int i;
3841 	int push_space = 0;
3842 	int push_items = 0;
3843 	struct btrfs_item *item;
3844 	u32 old_left_nritems;
3845 	u32 nr;
3846 	int ret = 0;
3847 	u32 this_item_size;
3848 	u32 old_left_item_size;
3849 	struct btrfs_map_token token;
3850 
3851 	btrfs_init_map_token(&token);
3852 
3853 	if (empty)
3854 		nr = min(right_nritems, max_slot);
3855 	else
3856 		nr = min(right_nritems - 1, max_slot);
3857 
3858 	for (i = 0; i < nr; i++) {
3859 		item = btrfs_item_nr(i);
3860 
3861 		if (!empty && push_items > 0) {
3862 			if (path->slots[0] < i)
3863 				break;
3864 			if (path->slots[0] == i) {
3865 				int space = btrfs_leaf_free_space(fs_info, right);
3866 				if (space + push_space * 2 > free_space)
3867 					break;
3868 			}
3869 		}
3870 
3871 		if (path->slots[0] == i)
3872 			push_space += data_size;
3873 
3874 		this_item_size = btrfs_item_size(right, item);
3875 		if (this_item_size + sizeof(*item) + push_space > free_space)
3876 			break;
3877 
3878 		push_items++;
3879 		push_space += this_item_size + sizeof(*item);
3880 	}
3881 
3882 	if (push_items == 0) {
3883 		ret = 1;
3884 		goto out;
3885 	}
3886 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3887 
3888 	/* push data from right to left */
3889 	copy_extent_buffer(left, right,
3890 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3891 			   btrfs_item_nr_offset(0),
3892 			   push_items * sizeof(struct btrfs_item));
3893 
3894 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3895 		     btrfs_item_offset_nr(right, push_items - 1);
3896 
3897 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3898 		     leaf_data_end(fs_info, left) - push_space,
3899 		     BTRFS_LEAF_DATA_OFFSET +
3900 		     btrfs_item_offset_nr(right, push_items - 1),
3901 		     push_space);
3902 	old_left_nritems = btrfs_header_nritems(left);
3903 	BUG_ON(old_left_nritems <= 0);
3904 
3905 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3906 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3907 		u32 ioff;
3908 
3909 		item = btrfs_item_nr(i);
3910 
3911 		ioff = btrfs_token_item_offset(left, item, &token);
3912 		btrfs_set_token_item_offset(left, item,
3913 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3914 		      &token);
3915 	}
3916 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3917 
3918 	/* fixup right node */
3919 	if (push_items > right_nritems)
3920 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3921 		       right_nritems);
3922 
3923 	if (push_items < right_nritems) {
3924 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3925 						  leaf_data_end(fs_info, right);
3926 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3927 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3928 				      BTRFS_LEAF_DATA_OFFSET +
3929 				      leaf_data_end(fs_info, right), push_space);
3930 
3931 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3932 			      btrfs_item_nr_offset(push_items),
3933 			     (btrfs_header_nritems(right) - push_items) *
3934 			     sizeof(struct btrfs_item));
3935 	}
3936 	right_nritems -= push_items;
3937 	btrfs_set_header_nritems(right, right_nritems);
3938 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3939 	for (i = 0; i < right_nritems; i++) {
3940 		item = btrfs_item_nr(i);
3941 
3942 		push_space = push_space - btrfs_token_item_size(right,
3943 								item, &token);
3944 		btrfs_set_token_item_offset(right, item, push_space, &token);
3945 	}
3946 
3947 	btrfs_mark_buffer_dirty(left);
3948 	if (right_nritems)
3949 		btrfs_mark_buffer_dirty(right);
3950 	else
3951 		clean_tree_block(fs_info, right);
3952 
3953 	btrfs_item_key(right, &disk_key, 0);
3954 	fixup_low_keys(fs_info, path, &disk_key, 1);
3955 
3956 	/* then fixup the leaf pointer in the path */
3957 	if (path->slots[0] < push_items) {
3958 		path->slots[0] += old_left_nritems;
3959 		btrfs_tree_unlock(path->nodes[0]);
3960 		free_extent_buffer(path->nodes[0]);
3961 		path->nodes[0] = left;
3962 		path->slots[1] -= 1;
3963 	} else {
3964 		btrfs_tree_unlock(left);
3965 		free_extent_buffer(left);
3966 		path->slots[0] -= push_items;
3967 	}
3968 	BUG_ON(path->slots[0] < 0);
3969 	return ret;
3970 out:
3971 	btrfs_tree_unlock(left);
3972 	free_extent_buffer(left);
3973 	return ret;
3974 }
3975 
3976 /*
3977  * push some data in the path leaf to the left, trying to free up at
3978  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3979  *
3980  * max_slot can put a limit on how far into the leaf we'll push items.  The
3981  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3982  * items
3983  */
3984 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3985 			  *root, struct btrfs_path *path, int min_data_size,
3986 			  int data_size, int empty, u32 max_slot)
3987 {
3988 	struct btrfs_fs_info *fs_info = root->fs_info;
3989 	struct extent_buffer *right = path->nodes[0];
3990 	struct extent_buffer *left;
3991 	int slot;
3992 	int free_space;
3993 	u32 right_nritems;
3994 	int ret = 0;
3995 
3996 	slot = path->slots[1];
3997 	if (slot == 0)
3998 		return 1;
3999 	if (!path->nodes[1])
4000 		return 1;
4001 
4002 	right_nritems = btrfs_header_nritems(right);
4003 	if (right_nritems == 0)
4004 		return 1;
4005 
4006 	btrfs_assert_tree_locked(path->nodes[1]);
4007 
4008 	left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4009 	/*
4010 	 * slot - 1 is not valid or we fail to read the left node,
4011 	 * no big deal, just return.
4012 	 */
4013 	if (IS_ERR(left))
4014 		return 1;
4015 
4016 	btrfs_tree_lock(left);
4017 	btrfs_set_lock_blocking(left);
4018 
4019 	free_space = btrfs_leaf_free_space(fs_info, left);
4020 	if (free_space < data_size) {
4021 		ret = 1;
4022 		goto out;
4023 	}
4024 
4025 	/* cow and double check */
4026 	ret = btrfs_cow_block(trans, root, left,
4027 			      path->nodes[1], slot - 1, &left);
4028 	if (ret) {
4029 		/* we hit -ENOSPC, but it isn't fatal here */
4030 		if (ret == -ENOSPC)
4031 			ret = 1;
4032 		goto out;
4033 	}
4034 
4035 	free_space = btrfs_leaf_free_space(fs_info, left);
4036 	if (free_space < data_size) {
4037 		ret = 1;
4038 		goto out;
4039 	}
4040 
4041 	return __push_leaf_left(fs_info, path, min_data_size,
4042 			       empty, left, free_space, right_nritems,
4043 			       max_slot);
4044 out:
4045 	btrfs_tree_unlock(left);
4046 	free_extent_buffer(left);
4047 	return ret;
4048 }
4049 
4050 /*
4051  * split the path's leaf in two, making sure there is at least data_size
4052  * available for the resulting leaf level of the path.
4053  */
4054 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4055 				    struct btrfs_fs_info *fs_info,
4056 				    struct btrfs_path *path,
4057 				    struct extent_buffer *l,
4058 				    struct extent_buffer *right,
4059 				    int slot, int mid, int nritems)
4060 {
4061 	int data_copy_size;
4062 	int rt_data_off;
4063 	int i;
4064 	struct btrfs_disk_key disk_key;
4065 	struct btrfs_map_token token;
4066 
4067 	btrfs_init_map_token(&token);
4068 
4069 	nritems = nritems - mid;
4070 	btrfs_set_header_nritems(right, nritems);
4071 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4072 
4073 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4074 			   btrfs_item_nr_offset(mid),
4075 			   nritems * sizeof(struct btrfs_item));
4076 
4077 	copy_extent_buffer(right, l,
4078 		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4079 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4080 		     leaf_data_end(fs_info, l), data_copy_size);
4081 
4082 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4083 
4084 	for (i = 0; i < nritems; i++) {
4085 		struct btrfs_item *item = btrfs_item_nr(i);
4086 		u32 ioff;
4087 
4088 		ioff = btrfs_token_item_offset(right, item, &token);
4089 		btrfs_set_token_item_offset(right, item,
4090 					    ioff + rt_data_off, &token);
4091 	}
4092 
4093 	btrfs_set_header_nritems(l, mid);
4094 	btrfs_item_key(right, &disk_key, 0);
4095 	insert_ptr(trans, fs_info, path, &disk_key, right->start,
4096 		   path->slots[1] + 1, 1);
4097 
4098 	btrfs_mark_buffer_dirty(right);
4099 	btrfs_mark_buffer_dirty(l);
4100 	BUG_ON(path->slots[0] != slot);
4101 
4102 	if (mid <= slot) {
4103 		btrfs_tree_unlock(path->nodes[0]);
4104 		free_extent_buffer(path->nodes[0]);
4105 		path->nodes[0] = right;
4106 		path->slots[0] -= mid;
4107 		path->slots[1] += 1;
4108 	} else {
4109 		btrfs_tree_unlock(right);
4110 		free_extent_buffer(right);
4111 	}
4112 
4113 	BUG_ON(path->slots[0] < 0);
4114 }
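
/*
 * A worked example of the offset fixup above, with assumed numbers: say
 * BTRFS_LEAF_DATA_SIZE(fs_info) is 16283 (a 16K node) and the items
 * being moved occupy data [7000, 10000) in the old leaf, so
 * data_copy_size is 3000.  The copy slides that block to the very end
 * of the new leaf, into [13283, 16283), so rt_data_off is
 * 16283 - 10000 = 6283 and an item whose payload started at offset
 * 7000 now starts at 13283.
 */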
4115 
4116 /*
4117  * double splits happen when we need to insert a big item in the middle
4118  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4119  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4120  *          A                 B                 C
4121  *
4122  * We avoid this by trying to push the items on either side of our target
4123  * into the adjacent leaves.  If all goes well we can avoid the double split
4124  * completely.
4125  */
4126 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4127 					  struct btrfs_root *root,
4128 					  struct btrfs_path *path,
4129 					  int data_size)
4130 {
4131 	struct btrfs_fs_info *fs_info = root->fs_info;
4132 	int ret;
4133 	int progress = 0;
4134 	int slot;
4135 	u32 nritems;
4136 	int space_needed = data_size;
4137 
4138 	slot = path->slots[0];
4139 	if (slot < btrfs_header_nritems(path->nodes[0]))
4140 		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4141 
4142 	/*
4143 	 * try to push all the items after our slot into the
4144 	 * right leaf
4145 	 */
4146 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4147 	if (ret < 0)
4148 		return ret;
4149 
4150 	if (ret == 0)
4151 		progress++;
4152 
4153 	nritems = btrfs_header_nritems(path->nodes[0]);
4154 	/*
4155 	 * our goal is to get our slot at the start or end of a leaf.  If
4156 	 * we've done so we're done
4157 	 */
4158 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4159 		return 0;
4160 
4161 	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4162 		return 0;
4163 
4164 	/* try to push all the items before our slot into the left leaf */
4165 	slot = path->slots[0];
4166 	space_needed = data_size;
4167 	if (slot > 0)
4168 		space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4169 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4170 	if (ret < 0)
4171 		return ret;
4172 
4173 	if (ret == 0)
4174 		progress++;
4175 
4176 	if (progress)
4177 		return 0;
4178 	return 1;
4179 }
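
/*
 * Return convention: 0 means we made progress (a push worked, the slot
 * ended up at a leaf boundary, or enough room appeared), 1 means
 * neither direction helped and split_leaf() has to do the real double
 * split.  Errors from the push helpers are returned as-is.
 */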
4180 
4181 /*
4182  * split the path's leaf in two, making sure there is at least data_size
4183  * available for the resulting leaf level of the path.
4184  *
4185  * returns 0 if all went well and < 0 on failure.
4186  */
4187 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4188 			       struct btrfs_root *root,
4189 			       const struct btrfs_key *ins_key,
4190 			       struct btrfs_path *path, int data_size,
4191 			       int extend)
4192 {
4193 	struct btrfs_disk_key disk_key;
4194 	struct extent_buffer *l;
4195 	u32 nritems;
4196 	int mid;
4197 	int slot;
4198 	struct extent_buffer *right;
4199 	struct btrfs_fs_info *fs_info = root->fs_info;
4200 	int ret = 0;
4201 	int wret;
4202 	int split;
4203 	int num_doubles = 0;
4204 	int tried_avoid_double = 0;
4205 
4206 	l = path->nodes[0];
4207 	slot = path->slots[0];
4208 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4209 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4210 		return -EOVERFLOW;
4211 
4212 	/* first try to make some room by pushing left and right */
4213 	if (data_size && path->nodes[1]) {
4214 		int space_needed = data_size;
4215 
4216 		if (slot < btrfs_header_nritems(l))
4217 			space_needed -= btrfs_leaf_free_space(fs_info, l);
4218 
4219 		wret = push_leaf_right(trans, root, path, space_needed,
4220 				       space_needed, 0, 0);
4221 		if (wret < 0)
4222 			return wret;
4223 		if (wret) {
4224 			space_needed = data_size;
4225 			if (slot > 0)
4226 				space_needed -= btrfs_leaf_free_space(fs_info,
4227 								      l);
4228 			wret = push_leaf_left(trans, root, path, space_needed,
4229 					      space_needed, 0, (u32)-1);
4230 			if (wret < 0)
4231 				return wret;
4232 		}
4233 		l = path->nodes[0];
4234 
4235 		/* did the pushes work? */
4236 		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4237 			return 0;
4238 	}
4239 
4240 	if (!path->nodes[1]) {
4241 		ret = insert_new_root(trans, root, path, 1);
4242 		if (ret)
4243 			return ret;
4244 	}
4245 again:
4246 	split = 1;
4247 	l = path->nodes[0];
4248 	slot = path->slots[0];
4249 	nritems = btrfs_header_nritems(l);
4250 	mid = (nritems + 1) / 2;
4251 
4252 	if (mid <= slot) {
4253 		if (nritems == 1 ||
4254 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4255 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4256 			if (slot >= nritems) {
4257 				split = 0;
4258 			} else {
4259 				mid = slot;
4260 				if (mid != nritems &&
4261 				    leaf_space_used(l, mid, nritems - mid) +
4262 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4263 					if (data_size && !tried_avoid_double)
4264 						goto push_for_double;
4265 					split = 2;
4266 				}
4267 			}
4268 		}
4269 	} else {
4270 		if (leaf_space_used(l, 0, mid) + data_size >
4271 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4272 			if (!extend && data_size && slot == 0) {
4273 				split = 0;
4274 			} else if ((extend || !data_size) && slot == 0) {
4275 				mid = 1;
4276 			} else {
4277 				mid = slot;
4278 				if (mid != nritems &&
4279 				    leaf_space_used(l, mid, nritems - mid) +
4280 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4281 					if (data_size && !tried_avoid_double)
4282 						goto push_for_double;
4283 					split = 2;
4284 				}
4285 			}
4286 		}
4287 	}
4288 
4289 	if (split == 0)
4290 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4291 	else
4292 		btrfs_item_key(l, &disk_key, mid);
4293 
4294 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4295 			&disk_key, 0, l->start, 0);
4296 	if (IS_ERR(right))
4297 		return PTR_ERR(right);
4298 
4299 	root_add_used(root, fs_info->nodesize);
4300 
4301 	memzero_extent_buffer(right, 0, sizeof(struct btrfs_header));
4302 	btrfs_set_header_bytenr(right, right->start);
4303 	btrfs_set_header_generation(right, trans->transid);
4304 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4305 	btrfs_set_header_owner(right, root->root_key.objectid);
4306 	btrfs_set_header_level(right, 0);
4307 	write_extent_buffer_fsid(right, fs_info->fsid);
4308 	write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid);
4309 
4310 	if (split == 0) {
4311 		if (mid <= slot) {
4312 			btrfs_set_header_nritems(right, 0);
4313 			insert_ptr(trans, fs_info, path, &disk_key,
4314 				   right->start, path->slots[1] + 1, 1);
4315 			btrfs_tree_unlock(path->nodes[0]);
4316 			free_extent_buffer(path->nodes[0]);
4317 			path->nodes[0] = right;
4318 			path->slots[0] = 0;
4319 			path->slots[1] += 1;
4320 		} else {
4321 			btrfs_set_header_nritems(right, 0);
4322 			insert_ptr(trans, fs_info, path, &disk_key,
4323 				   right->start, path->slots[1], 1);
4324 			btrfs_tree_unlock(path->nodes[0]);
4325 			free_extent_buffer(path->nodes[0]);
4326 			path->nodes[0] = right;
4327 			path->slots[0] = 0;
4328 			if (path->slots[1] == 0)
4329 				fixup_low_keys(fs_info, path, &disk_key, 1);
4330 		}
4331 		/*
4332 		 * We create a new leaf 'right' for the required ins_len and
4333 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4334 		 * the new item's content (ins_len bytes) into 'right'.
4335 		 */
4336 		return ret;
4337 	}
4338 
4339 	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4340 
4341 	if (split == 2) {
4342 		BUG_ON(num_doubles != 0);
4343 		num_doubles++;
4344 		goto again;
4345 	}
4346 
4347 	return 0;
4348 
4349 push_for_double:
4350 	push_for_double_split(trans, root, path, data_size);
4351 	tried_avoid_double = 1;
4352 	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4353 		return 0;
4354 	goto again;
4355 }
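
/*
 * Summary of the 'split' variants chosen above:
 *
 *   split == 1: the normal case, move items [mid, nritems) into the new
 *               right leaf via copy_for_split().
 *   split == 0: the new item won't fit next to the existing items no
 *               matter where we cut, but it lands exactly on a leaf
 *               boundary, so it gets the brand-new (empty) leaf all to
 *               itself and nothing is copied.
 *   split == 2: even after cutting at the target slot one half still
 *               can't take the new item, so after copy_for_split() we
 *               loop back to 'again' and split once more (at most one
 *               extra pass, as BUG_ON(num_doubles != 0) enforces).
 */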
4356 
4357 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4358 					 struct btrfs_root *root,
4359 					 struct btrfs_path *path, int ins_len)
4360 {
4361 	struct btrfs_fs_info *fs_info = root->fs_info;
4362 	struct btrfs_key key;
4363 	struct extent_buffer *leaf;
4364 	struct btrfs_file_extent_item *fi;
4365 	u64 extent_len = 0;
4366 	u32 item_size;
4367 	int ret;
4368 
4369 	leaf = path->nodes[0];
4370 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4371 
4372 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4373 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4374 
4375 	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4376 		return 0;
4377 
4378 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4379 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4380 		fi = btrfs_item_ptr(leaf, path->slots[0],
4381 				    struct btrfs_file_extent_item);
4382 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4383 	}
4384 	btrfs_release_path(path);
4385 
4386 	path->keep_locks = 1;
4387 	path->search_for_split = 1;
4388 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4389 	path->search_for_split = 0;
4390 	if (ret > 0)
4391 		ret = -EAGAIN;
4392 	if (ret < 0)
4393 		goto err;
4394 
4395 	ret = -EAGAIN;
4396 	leaf = path->nodes[0];
4397 	/* if our item isn't there, return now */
4398 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4399 		goto err;
4400 
4401 	/* the leaf has changed, it now has room.  return now */
4402 	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4403 		goto err;
4404 
4405 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4406 		fi = btrfs_item_ptr(leaf, path->slots[0],
4407 				    struct btrfs_file_extent_item);
4408 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4409 			goto err;
4410 	}
4411 
4412 	btrfs_set_path_blocking(path);
4413 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4414 	if (ret)
4415 		goto err;
4416 
4417 	path->keep_locks = 0;
4418 	btrfs_unlock_up_safe(path, 1);
4419 	return 0;
4420 err:
4421 	path->keep_locks = 0;
4422 	return ret;
4423 }
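
/*
 * The -EAGAIN returns above are deliberate: between releasing the path
 * and re-searching, another task may have resized or moved the target
 * item, or the leaf may have gained enough free space on its own.  In
 * all of those cases acting on the stale state would be wrong, so the
 * caller is asked to start over.
 */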
4424 
4425 static noinline int split_item(struct btrfs_fs_info *fs_info,
4426 			       struct btrfs_path *path,
4427 			       const struct btrfs_key *new_key,
4428 			       unsigned long split_offset)
4429 {
4430 	struct extent_buffer *leaf;
4431 	struct btrfs_item *item;
4432 	struct btrfs_item *new_item;
4433 	int slot;
4434 	char *buf;
4435 	u32 nritems;
4436 	u32 item_size;
4437 	u32 orig_offset;
4438 	struct btrfs_disk_key disk_key;
4439 
4440 	leaf = path->nodes[0];
4441 	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4442 
4443 	btrfs_set_path_blocking(path);
4444 
4445 	item = btrfs_item_nr(path->slots[0]);
4446 	orig_offset = btrfs_item_offset(leaf, item);
4447 	item_size = btrfs_item_size(leaf, item);
4448 
4449 	buf = kmalloc(item_size, GFP_NOFS);
4450 	if (!buf)
4451 		return -ENOMEM;
4452 
4453 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4454 			    path->slots[0]), item_size);
4455 
4456 	slot = path->slots[0] + 1;
4457 	nritems = btrfs_header_nritems(leaf);
4458 	if (slot != nritems) {
4459 		/* shift the items */
4460 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4461 				btrfs_item_nr_offset(slot),
4462 				(nritems - slot) * sizeof(struct btrfs_item));
4463 	}
4464 
4465 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4466 	btrfs_set_item_key(leaf, &disk_key, slot);
4467 
4468 	new_item = btrfs_item_nr(slot);
4469 
4470 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4471 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4472 
4473 	btrfs_set_item_offset(leaf, item,
4474 			      orig_offset + item_size - split_offset);
4475 	btrfs_set_item_size(leaf, item, split_offset);
4476 
4477 	btrfs_set_header_nritems(leaf, nritems + 1);
4478 
4479 	/* write the data for the start of the original item */
4480 	write_extent_buffer(leaf, buf,
4481 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4482 			    split_offset);
4483 
4484 	/* write the data for the new item */
4485 	write_extent_buffer(leaf, buf + split_offset,
4486 			    btrfs_item_ptr_offset(leaf, slot),
4487 			    item_size - split_offset);
4488 	btrfs_mark_buffer_dirty(leaf);
4489 
4490 	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4491 	kfree(buf);
4492 	return 0;
4493 }
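
/*
 * A worked example of the arithmetic above, with assumed numbers: take
 * an item with orig_offset 9000 and item_size 100 (payload occupying
 * [9000, 9100)) split at split_offset 40.  Afterwards the original item
 * has offset 9060 and size 40 and holds the first 40 payload bytes,
 * while the new item in the next slot has offset 9000 and size 60 and
 * holds the remaining bytes.  Total space used is unchanged; the
 * payload is rewritten within the same [9000, 9100) window, with the
 * two halves swapped to match the new headers.
 */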
4494 
4495 /*
4496  * This function splits a single item into two items,
4497  * giving 'new_key' to the new item and splitting the
4498  * old one at split_offset (from the start of the item).
4499  *
4500  * The path may be released by this operation.  After
4501  * the split, the path is pointing to the old item.  The
4502  * new item is going to be in the same node as the old one.
4503  *
4504  * Note, the item being split must be small enough to live alone on
4505  * a tree block with room for one extra struct btrfs_item
4506  *
4507  * This allows us to split the item in place, keeping a lock on the
4508  * leaf the entire time.
4509  */
4510 int btrfs_split_item(struct btrfs_trans_handle *trans,
4511 		     struct btrfs_root *root,
4512 		     struct btrfs_path *path,
4513 		     const struct btrfs_key *new_key,
4514 		     unsigned long split_offset)
4515 {
4516 	int ret;
4517 	ret = setup_leaf_for_split(trans, root, path,
4518 				   sizeof(struct btrfs_item));
4519 	if (ret)
4520 		return ret;
4521 
4522 	ret = split_item(root->fs_info, path, new_key, split_offset);
4523 	return ret;
4524 }
4525 
4526 /*
4527  * This function duplicates an item, giving 'new_key' to the new item.
4528  * It guarantees both items live in the same tree leaf and the new item
4529  * is contiguous with the original item.
4530  *
4531  * This allows us to split a file extent in place, keeping a lock on the
4532  * leaf the entire time.
4533  */
4534 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4535 			 struct btrfs_root *root,
4536 			 struct btrfs_path *path,
4537 			 const struct btrfs_key *new_key)
4538 {
4539 	struct extent_buffer *leaf;
4540 	int ret;
4541 	u32 item_size;
4542 
4543 	leaf = path->nodes[0];
4544 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4545 	ret = setup_leaf_for_split(trans, root, path,
4546 				   item_size + sizeof(struct btrfs_item));
4547 	if (ret)
4548 		return ret;
4549 
4550 	path->slots[0]++;
4551 	setup_items_for_insert(root, path, new_key, &item_size,
4552 			       item_size, item_size +
4553 			       sizeof(struct btrfs_item), 1);
4554 	leaf = path->nodes[0];
4555 	memcpy_extent_buffer(leaf,
4556 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4557 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4558 			     item_size);
4559 	return 0;
4560 }
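
/*
 * A hedged usage sketch (the key fields are assumptions, not a specific
 * in-tree caller): duplicating a file extent item so the two halves of
 * a split extent can then be adjusted independently:
 *
 *	new_key.objectid = key.objectid;
 *	new_key.type = BTRFS_EXTENT_DATA_KEY;
 *	new_key.offset = split_pos;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *	if (ret)
 *		return ret;
 *	path->slots[0] now points at the fresh copy, with the
 *	original item one slot before it
 */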
4561 
4562 /*
4563  * make the item pointed to by the path smaller.  new_size indicates
4564  * how small to make it, and from_end tells us if we just chop bytes
4565  * off the end of the item or if we shift the item to chop bytes off
4566  * the front.
4567  */
4568 void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4569 			 struct btrfs_path *path, u32 new_size, int from_end)
4570 {
4571 	int slot;
4572 	struct extent_buffer *leaf;
4573 	struct btrfs_item *item;
4574 	u32 nritems;
4575 	unsigned int data_end;
4576 	unsigned int old_data_start;
4577 	unsigned int old_size;
4578 	unsigned int size_diff;
4579 	int i;
4580 	struct btrfs_map_token token;
4581 
4582 	btrfs_init_map_token(&token);
4583 
4584 	leaf = path->nodes[0];
4585 	slot = path->slots[0];
4586 
4587 	old_size = btrfs_item_size_nr(leaf, slot);
4588 	if (old_size == new_size)
4589 		return;
4590 
4591 	nritems = btrfs_header_nritems(leaf);
4592 	data_end = leaf_data_end(fs_info, leaf);
4593 
4594 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4595 
4596 	size_diff = old_size - new_size;
4597 
4598 	BUG_ON(slot < 0);
4599 	BUG_ON(slot >= nritems);
4600 
4601 	/*
4602 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4603 	 */
4604 	/* first correct the data pointers */
4605 	for (i = slot; i < nritems; i++) {
4606 		u32 ioff;
4607 		item = btrfs_item_nr(i);
4608 
4609 		ioff = btrfs_token_item_offset(leaf, item, &token);
4610 		btrfs_set_token_item_offset(leaf, item,
4611 					    ioff + size_diff, &token);
4612 	}
4613 
4614 	/* shift the data */
4615 	if (from_end) {
4616 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4617 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4618 			      data_end, old_data_start + new_size - data_end);
4619 	} else {
4620 		struct btrfs_disk_key disk_key;
4621 		u64 offset;
4622 
4623 		btrfs_item_key(leaf, &disk_key, slot);
4624 
4625 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4626 			unsigned long ptr;
4627 			struct btrfs_file_extent_item *fi;
4628 
4629 			fi = btrfs_item_ptr(leaf, slot,
4630 					    struct btrfs_file_extent_item);
4631 			fi = (struct btrfs_file_extent_item *)(
4632 			     (unsigned long)fi - size_diff);
4633 
4634 			if (btrfs_file_extent_type(leaf, fi) ==
4635 			    BTRFS_FILE_EXTENT_INLINE) {
4636 				ptr = btrfs_item_ptr_offset(leaf, slot);
4637 				memmove_extent_buffer(leaf, ptr,
4638 				      (unsigned long)fi,
4639 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4640 			}
4641 		}
4642 
4643 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4644 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4645 			      data_end, old_data_start - data_end);
4646 
4647 		offset = btrfs_disk_key_offset(&disk_key);
4648 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4649 		btrfs_set_item_key(leaf, &disk_key, slot);
4650 		if (slot == 0)
4651 			fixup_low_keys(fs_info, path, &disk_key, 1);
4652 	}
4653 
4654 	item = btrfs_item_nr(slot);
4655 	btrfs_set_item_size(leaf, item, new_size);
4656 	btrfs_mark_buffer_dirty(leaf);
4657 
4658 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4659 		btrfs_print_leaf(leaf);
4660 		BUG();
4661 	}
4662 }
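
/*
 * A worked example with assumed numbers: an item of old_size 100 whose
 * payload starts at old_data_start 8000, truncated to new_size 60,
 * gives size_diff 40.  With from_end set, the payloads of this and all
 * higher-numbered slots move 40 bytes toward the end of the leaf (their
 * offsets grow by 40) and the last 40 bytes of the item are cut off.
 * Without from_end the first 40 bytes are cut instead, and the item's
 * key offset is also bumped by 40 so the key keeps describing the bytes
 * that remain (for an inline file extent the fixed header portion is
 * moved first, so the cut removes inline data bytes, not the header).
 */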
4663 
4664 /*
4665  * make the item pointed to by the path bigger, data_size is the added size.
4666  */
4667 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4668 		       u32 data_size)
4669 {
4670 	int slot;
4671 	struct extent_buffer *leaf;
4672 	struct btrfs_item *item;
4673 	u32 nritems;
4674 	unsigned int data_end;
4675 	unsigned int old_data;
4676 	unsigned int old_size;
4677 	int i;
4678 	struct btrfs_map_token token;
4679 
4680 	btrfs_init_map_token(&token);
4681 
4682 	leaf = path->nodes[0];
4683 
4684 	nritems = btrfs_header_nritems(leaf);
4685 	data_end = leaf_data_end(fs_info, leaf);
4686 
4687 	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4688 		btrfs_print_leaf(leaf);
4689 		BUG();
4690 	}
4691 	slot = path->slots[0];
4692 	old_data = btrfs_item_end_nr(leaf, slot);
4693 
4694 	BUG_ON(slot < 0);
4695 	if (slot >= nritems) {
4696 		btrfs_print_leaf(leaf);
4697 		btrfs_crit(fs_info, "slot %d too large, nritems %d",
4698 			   slot, nritems);
4699 		BUG_ON(1);
4700 	}
4701 
4702 	/*
4703 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4704 	 */
4705 	/* first correct the data pointers */
4706 	for (i = slot; i < nritems; i++) {
4707 		u32 ioff;
4708 		item = btrfs_item_nr(i);
4709 
4710 		ioff = btrfs_token_item_offset(leaf, item, &token);
4711 		btrfs_set_token_item_offset(leaf, item,
4712 					    ioff - data_size, &token);
4713 	}
4714 
4715 	/* shift the data */
4716 	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4717 		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4718 		      data_end, old_data - data_end);
4719 
4720 	data_end = old_data;
4721 	old_size = btrfs_item_size_nr(leaf, slot);
4722 	item = btrfs_item_nr(slot);
4723 	btrfs_set_item_size(leaf, item, old_size + data_size);
4724 	btrfs_mark_buffer_dirty(leaf);
4725 
4726 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4727 		btrfs_print_leaf(leaf);
4728 		BUG();
4729 	}
4730 }
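
/*
 * The inverse of the truncation above, with the same assumed layout: to
 * grow an item by data_size 40, the payloads of this and all
 * higher-numbered slots slide 40 bytes toward the start of the leaf
 * (their offsets shrink by 40), leaving a 40-byte hole at the end of
 * this item's payload for the caller to fill.  The leaf must already
 * have at least data_size bytes free or the BUG() above fires.
 */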
4731 
4732 /*
4733  * this is a helper for btrfs_insert_empty_items, the main goal here is
4734  * to save stack depth by doing the bulk of the work in a function
4735  * that doesn't call btrfs_search_slot
4736  */
4737 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4738 			    const struct btrfs_key *cpu_key, u32 *data_size,
4739 			    u32 total_data, u32 total_size, int nr)
4740 {
4741 	struct btrfs_fs_info *fs_info = root->fs_info;
4742 	struct btrfs_item *item;
4743 	int i;
4744 	u32 nritems;
4745 	unsigned int data_end;
4746 	struct btrfs_disk_key disk_key;
4747 	struct extent_buffer *leaf;
4748 	int slot;
4749 	struct btrfs_map_token token;
4750 
4751 	if (path->slots[0] == 0) {
4752 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4753 		fixup_low_keys(fs_info, path, &disk_key, 1);
4754 	}
4755 	btrfs_unlock_up_safe(path, 1);
4756 
4757 	btrfs_init_map_token(&token);
4758 
4759 	leaf = path->nodes[0];
4760 	slot = path->slots[0];
4761 
4762 	nritems = btrfs_header_nritems(leaf);
4763 	data_end = leaf_data_end(fs_info, leaf);
4764 
4765 	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4766 		btrfs_print_leaf(leaf);
4767 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
4768 			   total_size, btrfs_leaf_free_space(fs_info, leaf));
4769 		BUG();
4770 	}
4771 
4772 	if (slot != nritems) {
4773 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4774 
4775 		if (old_data < data_end) {
4776 			btrfs_print_leaf(leaf);
4777 			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4778 				   slot, old_data, data_end);
4779 			BUG_ON(1);
4780 		}
4781 		/*
4782 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4783 		 */
4784 		/* first correct the data pointers */
4785 		for (i = slot; i < nritems; i++) {
4786 			u32 ioff;
4787 
4788 			item = btrfs_item_nr(i);
4789 			ioff = btrfs_token_item_offset(leaf, item, &token);
4790 			btrfs_set_token_item_offset(leaf, item,
4791 						    ioff - total_data, &token);
4792 		}
4793 		/* shift the items */
4794 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4795 			      btrfs_item_nr_offset(slot),
4796 			      (nritems - slot) * sizeof(struct btrfs_item));
4797 
4798 		/* shift the data */
4799 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4800 			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4801 			      data_end, old_data - data_end);
4802 		data_end = old_data;
4803 	}
4804 
4805 	/* setup the item for the new data */
4806 	for (i = 0; i < nr; i++) {
4807 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4808 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4809 		item = btrfs_item_nr(slot + i);
4810 		btrfs_set_token_item_offset(leaf, item,
4811 					    data_end - data_size[i], &token);
4812 		data_end -= data_size[i];
4813 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4814 	}
4815 
4816 	btrfs_set_header_nritems(leaf, nritems + nr);
4817 	btrfs_mark_buffer_dirty(leaf);
4818 
4819 	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4820 		btrfs_print_leaf(leaf);
4821 		BUG();
4822 	}
4823 }
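
/*
 * A worked example of the setup loop above, with assumed numbers:
 * inserting nr = 2 items with data_size = {30, 20} when data_end is
 * 8000.  Item 0 gets offset 8000 - 30 = 7970 and size 30, then item 1
 * gets offset 7970 - 20 = 7950 and size 20, matching the backward
 * growth of payloads from the end of the leaf.  Any pre-existing items
 * at or after the insertion slot were already slid over beforehand, by
 * total_data (50) bytes of payload and nr header slots.
 */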
4824 
4825 /*
4826  * Given a key and some data, insert items into the tree.
4827  * This does all the path init required, making room in the tree if needed.
4828  */
4829 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4830 			    struct btrfs_root *root,
4831 			    struct btrfs_path *path,
4832 			    const struct btrfs_key *cpu_key, u32 *data_size,
4833 			    int nr)
4834 {
4835 	int ret = 0;
4836 	int slot;
4837 	int i;
4838 	u32 total_size = 0;
4839 	u32 total_data = 0;
4840 
4841 	for (i = 0; i < nr; i++)
4842 		total_data += data_size[i];
4843 
4844 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4845 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4846 	if (ret == 0)
4847 		return -EEXIST;
4848 	if (ret < 0)
4849 		return ret;
4850 
4851 	slot = path->slots[0];
4852 	BUG_ON(slot < 0);
4853 
4854 	setup_items_for_insert(root, path, cpu_key, data_size,
4855 			       total_data, total_size, nr);
4856 	return 0;
4857 }
4858 
4859 /*
4860  * Given a key and some data, insert an item into the tree.
4861  * This does all the path init required, making room in the tree if needed.
4862  */
4863 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4864 		      const struct btrfs_key *cpu_key, void *data,
4865 		      u32 data_size)
4866 {
4867 	int ret = 0;
4868 	struct btrfs_path *path;
4869 	struct extent_buffer *leaf;
4870 	unsigned long ptr;
4871 
4872 	path = btrfs_alloc_path();
4873 	if (!path)
4874 		return -ENOMEM;
4875 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4876 	if (!ret) {
4877 		leaf = path->nodes[0];
4878 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4879 		write_extent_buffer(leaf, data, ptr, data_size);
4880 		btrfs_mark_buffer_dirty(leaf);
4881 	}
4882 	btrfs_free_path(path);
4883 	return ret;
4884 }
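
/*
 * A minimal caller sketch, assuming 'trans' and 'root' are valid and
 * 'buf'/'len' describe the item payload (the key fields here are purely
 * illustrative):
 *
 *	struct btrfs_key key;
 *
 *	key.objectid = objectid;
 *	key.type = BTRFS_ROOT_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_insert_item(trans, root, &key, buf, len);
 *	if (ret == -EEXIST)
 *		an item with that exact key already exists
 */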
4885 
4886 /*
4887  * delete the pointer from a given node.
4888  *
4889  * the tree should have been previously balanced so the deletion does not
4890  * empty a node.
4891  */
4892 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4893 		    int level, int slot)
4894 {
4895 	struct btrfs_fs_info *fs_info = root->fs_info;
4896 	struct extent_buffer *parent = path->nodes[level];
4897 	u32 nritems;
4898 	int ret;
4899 
4900 	nritems = btrfs_header_nritems(parent);
4901 	if (slot != nritems - 1) {
4902 		if (level)
4903 			tree_mod_log_eb_move(fs_info, parent, slot,
4904 					     slot + 1, nritems - slot - 1);
4905 		memmove_extent_buffer(parent,
4906 			      btrfs_node_key_ptr_offset(slot),
4907 			      btrfs_node_key_ptr_offset(slot + 1),
4908 			      sizeof(struct btrfs_key_ptr) *
4909 			      (nritems - slot - 1));
4910 	} else if (level) {
4911 		ret = tree_mod_log_insert_key(fs_info, parent, slot,
4912 					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
4913 		BUG_ON(ret < 0);
4914 	}
4915 
4916 	nritems--;
4917 	btrfs_set_header_nritems(parent, nritems);
4918 	if (nritems == 0 && parent == root->node) {
4919 		BUG_ON(btrfs_header_level(root->node) != 1);
4920 		/* just turn the root into a leaf and break */
4921 		btrfs_set_header_level(root->node, 0);
4922 	} else if (slot == 0) {
4923 		struct btrfs_disk_key disk_key;
4924 
4925 		btrfs_node_key(parent, &disk_key, 0);
4926 		fixup_low_keys(fs_info, path, &disk_key, level + 1);
4927 	}
4928 	btrfs_mark_buffer_dirty(parent);
4929 }
4930 
4931 /*
4932  * a helper function to delete the leaf pointed to by path->slots[1] and
4933  * path->nodes[1].
4934  *
4935  * This deletes the pointer in path->nodes[1] and frees the leaf
4936  * block extent.
4937  *
4938  * The path must have already been setup for deleting the leaf, including
4939  * all the proper balancing.  path->nodes[1] must be locked.
4940  */
4941 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4942 				    struct btrfs_root *root,
4943 				    struct btrfs_path *path,
4944 				    struct extent_buffer *leaf)
4945 {
4946 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4947 	del_ptr(root, path, 1, path->slots[1]);
4948 
4949 	/*
4950 	 * btrfs_free_extent is expensive, we want to make sure we
4951 	 * aren't holding any locks when we call it
4952 	 */
4953 	btrfs_unlock_up_safe(path, 0);
4954 
4955 	root_sub_used(root, leaf->len);
4956 
4957 	extent_buffer_get(leaf);
4958 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4959 	free_extent_buffer_stale(leaf);
4960 }
4961 /*
4962  * delete the item at the leaf level in path.  If that empties
4963  * the leaf, remove it from the tree
4964  */
4965 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4966 		    struct btrfs_path *path, int slot, int nr)
4967 {
4968 	struct btrfs_fs_info *fs_info = root->fs_info;
4969 	struct extent_buffer *leaf;
4970 	struct btrfs_item *item;
4971 	u32 last_off;
4972 	u32 dsize = 0;
4973 	int ret = 0;
4974 	int wret;
4975 	int i;
4976 	u32 nritems;
4977 	struct btrfs_map_token token;
4978 
4979 	btrfs_init_map_token(&token);
4980 
4981 	leaf = path->nodes[0];
4982 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4983 
4984 	for (i = 0; i < nr; i++)
4985 		dsize += btrfs_item_size_nr(leaf, slot + i);
4986 
4987 	nritems = btrfs_header_nritems(leaf);
4988 
4989 	if (slot + nr != nritems) {
4990 		int data_end = leaf_data_end(fs_info, leaf);
4991 
4992 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4993 			      data_end + dsize,
4994 			      BTRFS_LEAF_DATA_OFFSET + data_end,
4995 			      last_off - data_end);
4996 
4997 		for (i = slot + nr; i < nritems; i++) {
4998 			u32 ioff;
4999 
5000 			item = btrfs_item_nr(i);
5001 			ioff = btrfs_token_item_offset(leaf, item, &token);
5002 			btrfs_set_token_item_offset(leaf, item,
5003 						    ioff + dsize, &token);
5004 		}
5005 
5006 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5007 			      btrfs_item_nr_offset(slot + nr),
5008 			      sizeof(struct btrfs_item) *
5009 			      (nritems - slot - nr));
5010 	}
5011 	btrfs_set_header_nritems(leaf, nritems - nr);
5012 	nritems -= nr;
5013 
5014 	/* delete the leaf if we've emptied it */
5015 	if (nritems == 0) {
5016 		if (leaf == root->node) {
5017 			btrfs_set_header_level(leaf, 0);
5018 		} else {
5019 			btrfs_set_path_blocking(path);
5020 			clean_tree_block(fs_info, leaf);
5021 			btrfs_del_leaf(trans, root, path, leaf);
5022 		}
5023 	} else {
5024 		int used = leaf_space_used(leaf, 0, nritems);
5025 		if (slot == 0) {
5026 			struct btrfs_disk_key disk_key;
5027 
5028 			btrfs_item_key(leaf, &disk_key, 0);
5029 			fixup_low_keys(fs_info, path, &disk_key, 1);
5030 		}
5031 
5032 		/* delete the leaf if it is mostly empty */
5033 		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5034 			/* push_leaf_left fixes the path.
5035 			 * make sure the path still points to our leaf
5036 			 * for possible call to del_ptr below
5037 			 */
5038 			slot = path->slots[1];
5039 			extent_buffer_get(leaf);
5040 
5041 			btrfs_set_path_blocking(path);
5042 			wret = push_leaf_left(trans, root, path, 1, 1,
5043 					      1, (u32)-1);
5044 			if (wret < 0 && wret != -ENOSPC)
5045 				ret = wret;
5046 
5047 			if (path->nodes[0] == leaf &&
5048 			    btrfs_header_nritems(leaf)) {
5049 				wret = push_leaf_right(trans, root, path, 1,
5050 						       1, 1, 0);
5051 				if (wret < 0 && wret != -ENOSPC)
5052 					ret = wret;
5053 			}
5054 
5055 			if (btrfs_header_nritems(leaf) == 0) {
5056 				path->slots[1] = slot;
5057 				btrfs_del_leaf(trans, root, path, leaf);
5058 				free_extent_buffer(leaf);
5059 				ret = 0;
5060 			} else {
5061 				/* if we're still in the path, make sure
5062 				 * we're dirty.  Otherwise, one of the
5063 				 * push_leaf functions must have already
5064 				 * dirtied this buffer
5065 				 */
5066 				if (path->nodes[0] == leaf)
5067 					btrfs_mark_buffer_dirty(leaf);
5068 				free_extent_buffer(leaf);
5069 			}
5070 		} else {
5071 			btrfs_mark_buffer_dirty(leaf);
5072 		}
5073 	}
5074 	return ret;
5075 }
5076 
5077 /*
5078  * search the tree again to find a leaf with lesser keys
5079  * returns 0 if it found something or 1 if there are no lesser leaves.
5080  * returns < 0 on I/O errors.
5081  *
5082  * This may release the path, and so you may lose any locks held at the
5083  * time you call it.
5084  */
5085 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5086 {
5087 	struct btrfs_key key;
5088 	struct btrfs_disk_key found_key;
5089 	int ret;
5090 
5091 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5092 
5093 	if (key.offset > 0) {
5094 		key.offset--;
5095 	} else if (key.type > 0) {
5096 		key.type--;
5097 		key.offset = (u64)-1;
5098 	} else if (key.objectid > 0) {
5099 		key.objectid--;
5100 		key.type = (u8)-1;
5101 		key.offset = (u64)-1;
5102 	} else {
5103 		return 1;
5104 	}
5105 
5106 	btrfs_release_path(path);
5107 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5108 	if (ret < 0)
5109 		return ret;
5110 	btrfs_item_key(path->nodes[0], &found_key, 0);
5111 	ret = comp_keys(&found_key, &key);
5112 	/*
5113 	 * We might have had an item with the previous key in the tree right
5114 	 * before we released our path. And after we released our path, that
5115 	 * item might have been pushed to the first slot (0) of the leaf we
5116 	 * were holding due to a tree balance. Alternatively, an item with the
5117 	 * previous key can exist as the only element of a leaf (big fat item).
5118 	 * Therefore account for these 2 cases, so that our callers (like
5119 	 * btrfs_previous_item) don't miss an existing item with a key matching
5120 	 * the previous key we computed above.
5121 	 */
5122 	if (ret <= 0)
5123 		return 0;
5124 	return 1;
5125 }
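
/*
 * The key decrement above in concrete terms: searching back from
 * (256, BTRFS_DIR_ITEM_KEY, 5) targets (256, BTRFS_DIR_ITEM_KEY, 4),
 * while (256, BTRFS_DIR_ITEM_KEY, 0) rolls over to
 * (256, BTRFS_DIR_ITEM_KEY - 1, (u64)-1), i.e. the largest possible
 * key that still sorts strictly before the original.
 */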
5126 
5127 /*
5128  * A helper function to walk down the tree starting at min_key, and looking
5129  * for nodes or leaves that have a minimum transaction id.
5130  * This is used by the btree defrag code and by tree logging.
5131  *
5132  * This does not cow, but it does stuff the starting key it finds back
5133  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5134  * key and get a writable path.
5135  *
5136  * This does lock as it descends, and path->keep_locks should be set
5137  * to 1 by the caller.
5138  *
5139  * This honors path->lowest_level to prevent descent past a given level
5140  * of the tree.
5141  *
5142  * min_trans indicates the oldest transaction that you are interested
5143  * in walking through.  Any nodes or leaves older than min_trans are
5144  * skipped over (without reading them).
5145  *
5146  * returns zero if something useful was found, < 0 on error and 1 if there
5147  * was nothing in the tree that matched the search criteria.
5148  */
5149 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5150 			 struct btrfs_path *path,
5151 			 u64 min_trans)
5152 {
5153 	struct btrfs_fs_info *fs_info = root->fs_info;
5154 	struct extent_buffer *cur;
5155 	struct btrfs_key found_key;
5156 	int slot;
5157 	int sret;
5158 	u32 nritems;
5159 	int level;
5160 	int ret = 1;
5161 	int keep_locks = path->keep_locks;
5162 
5163 	path->keep_locks = 1;
5164 again:
5165 	cur = btrfs_read_lock_root_node(root);
5166 	level = btrfs_header_level(cur);
5167 	WARN_ON(path->nodes[level]);
5168 	path->nodes[level] = cur;
5169 	path->locks[level] = BTRFS_READ_LOCK;
5170 
5171 	if (btrfs_header_generation(cur) < min_trans) {
5172 		ret = 1;
5173 		goto out;
5174 	}
5175 	while (1) {
5176 		nritems = btrfs_header_nritems(cur);
5177 		level = btrfs_header_level(cur);
5178 		sret = bin_search(cur, min_key, level, &slot);
5179 
5180 		/* at the lowest level, we're done, setup the path and exit */
5181 		if (level == path->lowest_level) {
5182 			if (slot >= nritems)
5183 				goto find_next_key;
5184 			ret = 0;
5185 			path->slots[level] = slot;
5186 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5187 			goto out;
5188 		}
5189 		if (sret && slot > 0)
5190 			slot--;
5191 		/*
5192 		 * check this node pointer against the min_trans parameter.
5193 		 * If it is too old, skip to the next one.
5194 		 */
5195 		while (slot < nritems) {
5196 			u64 gen;
5197 
5198 			gen = btrfs_node_ptr_generation(cur, slot);
5199 			if (gen < min_trans) {
5200 				slot++;
5201 				continue;
5202 			}
5203 			break;
5204 		}
5205 find_next_key:
5206 		/*
5207 		 * we didn't find a candidate key in this node, walk forward
5208 		 * and find another one
5209 		 */
5210 		if (slot >= nritems) {
5211 			path->slots[level] = slot;
5212 			btrfs_set_path_blocking(path);
5213 			sret = btrfs_find_next_key(root, path, min_key, level,
5214 						  min_trans);
5215 			if (sret == 0) {
5216 				btrfs_release_path(path);
5217 				goto again;
5218 			} else {
5219 				goto out;
5220 			}
5221 		}
5222 		/* save our key for returning back */
5223 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5224 		path->slots[level] = slot;
5225 		if (level == path->lowest_level) {
5226 			ret = 0;
5227 			goto out;
5228 		}
5229 		btrfs_set_path_blocking(path);
5230 		cur = read_node_slot(fs_info, cur, slot);
5231 		if (IS_ERR(cur)) {
5232 			ret = PTR_ERR(cur);
5233 			goto out;
5234 		}
5235 
5236 		btrfs_tree_read_lock(cur);
5237 
5238 		path->locks[level - 1] = BTRFS_READ_LOCK;
5239 		path->nodes[level - 1] = cur;
5240 		unlock_up(path, level, 1, 0, NULL);
5241 		btrfs_clear_path_blocking(path, NULL, 0);
5242 	}
5243 out:
5244 	path->keep_locks = keep_locks;
5245 	if (ret == 0) {
5246 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5247 		btrfs_set_path_blocking(path);
5248 		memcpy(min_key, &found_key, sizeof(found_key));
5249 	}
5250 	return ret;
5251 }
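
/*
 * A hedged caller sketch in the style of the defrag/tree-log users
 * (names illustrative; real callers also roll over type/objectid when
 * the offset overflows):
 *
 *	memset(&min_key, 0, sizeof(min_key));
 *	path->keep_locks = 1;
 *	while (1) {
 *		ret = btrfs_search_forward(root, &min_key, path, min_trans);
 *		if (ret)
 *			break;		1 means exhausted, < 0 means error
 *		process the slot at path->nodes[path->lowest_level]
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;
 *		min_key.offset++;	step past the returned key
 *	}
 */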
5252 
5253 static int tree_move_down(struct btrfs_fs_info *fs_info,
5254 			   struct btrfs_path *path,
5255 			   int *level)
5256 {
5257 	struct extent_buffer *eb;
5258 
5259 	BUG_ON(*level == 0);
5260 	eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5261 	if (IS_ERR(eb))
5262 		return PTR_ERR(eb);
5263 
5264 	path->nodes[*level - 1] = eb;
5265 	path->slots[*level - 1] = 0;
5266 	(*level)--;
5267 	return 0;
5268 }
5269 
5270 static int tree_move_next_or_upnext(struct btrfs_path *path,
5271 				    int *level, int root_level)
5272 {
5273 	int ret = 0;
5274 	int nritems;
5275 	nritems = btrfs_header_nritems(path->nodes[*level]);
5276 
5277 	path->slots[*level]++;
5278 
5279 	while (path->slots[*level] >= nritems) {
5280 		if (*level == root_level)
5281 			return -1;
5282 
5283 		/* move upnext */
5284 		path->slots[*level] = 0;
5285 		free_extent_buffer(path->nodes[*level]);
5286 		path->nodes[*level] = NULL;
5287 		(*level)++;
5288 		path->slots[*level]++;
5289 
5290 		nritems = btrfs_header_nritems(path->nodes[*level]);
5291 		ret = 1;
5292 	}
5293 	return ret;
5294 }
5295 
5296 /*
5297  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5298  * or down.
5299  */
5300 static int tree_advance(struct btrfs_fs_info *fs_info,
5301 			struct btrfs_path *path,
5302 			int *level, int root_level,
5303 			int allow_down,
5304 			struct btrfs_key *key)
5305 {
5306 	int ret;
5307 
5308 	if (*level == 0 || !allow_down) {
5309 		ret = tree_move_next_or_upnext(path, level, root_level);
5310 	} else {
5311 		ret = tree_move_down(fs_info, path, level);
5312 	}
5313 	if (ret >= 0) {
5314 		if (*level == 0)
5315 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5316 					path->slots[*level]);
5317 		else
5318 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5319 					path->slots[*level]);
5320 	}
5321 	return ret;
5322 }
5323 
5324 static int tree_compare_item(struct btrfs_path *left_path,
5325 			     struct btrfs_path *right_path,
5326 			     char *tmp_buf)
5327 {
5328 	int cmp;
5329 	int len1, len2;
5330 	unsigned long off1, off2;
5331 
5332 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5333 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5334 	if (len1 != len2)
5335 		return 1;
5336 
5337 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5338 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5339 				right_path->slots[0]);
5340 
5341 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5342 
5343 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5344 	if (cmp)
5345 		return 1;
5346 	return 0;
5347 }
5348 
5349 #define ADVANCE 1
5350 #define ADVANCE_ONLY_NEXT -1
5351 
5352 /*
5353  * This function compares two trees and calls the provided callback for
5354  * every changed/new/deleted item it finds.
5355  * If shared tree blocks are encountered, whole subtrees are skipped, making
5356  * the compare pretty fast on snapshotted subvolumes.
5357  *
5358  * This currently works on commit roots only. As commit roots are read only,
5359  * we don't do any locking. The commit roots are protected with transactions.
5360  * Transactions are ended and rejoined when a commit is tried in between.
5361  *
5362  * This function checks for modifications done to the trees while comparing.
5363  * If it detects a change, it aborts immediately.
5364  */
5365 int btrfs_compare_trees(struct btrfs_root *left_root,
5366 			struct btrfs_root *right_root,
5367 			btrfs_changed_cb_t changed_cb, void *ctx)
5368 {
5369 	struct btrfs_fs_info *fs_info = left_root->fs_info;
5370 	int ret;
5371 	int cmp;
5372 	struct btrfs_path *left_path = NULL;
5373 	struct btrfs_path *right_path = NULL;
5374 	struct btrfs_key left_key;
5375 	struct btrfs_key right_key;
5376 	char *tmp_buf = NULL;
5377 	int left_root_level;
5378 	int right_root_level;
5379 	int left_level;
5380 	int right_level;
5381 	int left_end_reached;
5382 	int right_end_reached;
5383 	int advance_left;
5384 	int advance_right;
5385 	u64 left_blockptr;
5386 	u64 right_blockptr;
5387 	u64 left_gen;
5388 	u64 right_gen;
5389 
5390 	left_path = btrfs_alloc_path();
5391 	if (!left_path) {
5392 		ret = -ENOMEM;
5393 		goto out;
5394 	}
5395 	right_path = btrfs_alloc_path();
5396 	if (!right_path) {
5397 		ret = -ENOMEM;
5398 		goto out;
5399 	}
5400 
5401 	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5402 	if (!tmp_buf) {
5403 		ret = -ENOMEM;
5404 		goto out;
5405 	}
5406 
5407 	left_path->search_commit_root = 1;
5408 	left_path->skip_locking = 1;
5409 	right_path->search_commit_root = 1;
5410 	right_path->skip_locking = 1;
5411 
5412 	/*
5413 	 * Strategy: Go to the first items of both trees. Then do
5414 	 *
5415 	 * If both trees are at level 0
5416 	 *   Compare keys of current items
5417 	 *     If left < right treat left item as new, advance left tree
5418 	 *       and repeat
5419 	 *     If left > right treat right item as deleted, advance right tree
5420 	 *       and repeat
5421 	 *     If left == right do deep compare of items, treat as changed if
5422 	 *       needed, advance both trees and repeat
5423 	 * If both trees are at the same level but not at level 0
5424 	 *   Compare keys of current nodes/leaves
5425 	 *     If left < right advance left tree and repeat
5426 	 *     If left > right advance right tree and repeat
5427 	 *     If left == right compare blockptrs of the next nodes/leaves
5428 	 *       If they match advance both trees but stay at the same level
5429 	 *         and repeat
5430 	 *       If they don't match advance both trees while allowing to go
5431 	 *         deeper and repeat
5432 	 * If tree levels are different
5433 	 *   Advance the tree that needs it and repeat
5434 	 *
5435 	 * Advancing a tree means:
5436 	 *   If we are at level 0, try to go to the next slot. If that's not
5437 	 *   possible, go one level up and repeat.  Stop when we find a level
5438 	 *   where we can go to the next slot.  We may at this point be on a
5439 	 *   node or a leaf.
5440 	 *
5441 	 *   If we are not at level 0 and not on shared tree blocks, go one
5442 	 *   level deeper.
5443 	 *
5444 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5445 	 *   the right if possible or go up and right.
5446 	 */
5447 
5448 	down_read(&fs_info->commit_root_sem);
5449 	left_level = btrfs_header_level(left_root->commit_root);
5450 	left_root_level = left_level;
5451 	left_path->nodes[left_level] = left_root->commit_root;
5452 	extent_buffer_get(left_path->nodes[left_level]);
5453 
5454 	right_level = btrfs_header_level(right_root->commit_root);
5455 	right_root_level = right_level;
5456 	right_path->nodes[right_level] = right_root->commit_root;
5457 	extent_buffer_get(right_path->nodes[right_level]);
5458 	up_read(&fs_info->commit_root_sem);
5459 
5460 	if (left_level == 0)
5461 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5462 				&left_key, left_path->slots[left_level]);
5463 	else
5464 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5465 				&left_key, left_path->slots[left_level]);
5466 	if (right_level == 0)
5467 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5468 				&right_key, right_path->slots[right_level]);
5469 	else
5470 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5471 				&right_key, right_path->slots[right_level]);
5472 
5473 	left_end_reached = right_end_reached = 0;
5474 	advance_left = advance_right = 0;
5475 
5476 	while (1) {
5477 		if (advance_left && !left_end_reached) {
5478 			ret = tree_advance(fs_info, left_path, &left_level,
5479 					left_root_level,
5480 					advance_left != ADVANCE_ONLY_NEXT,
5481 					&left_key);
5482 			if (ret == -1)
5483 				left_end_reached = ADVANCE;
5484 			else if (ret < 0)
5485 				goto out;
5486 			advance_left = 0;
5487 		}
5488 		if (advance_right && !right_end_reached) {
5489 			ret = tree_advance(fs_info, right_path, &right_level,
5490 					right_root_level,
5491 					advance_right != ADVANCE_ONLY_NEXT,
5492 					&right_key);
5493 			if (ret == -1)
5494 				right_end_reached = ADVANCE;
5495 			else if (ret < 0)
5496 				goto out;
5497 			advance_right = 0;
5498 		}
5499 
5500 		if (left_end_reached && right_end_reached) {
5501 			ret = 0;
5502 			goto out;
5503 		} else if (left_end_reached) {
5504 			if (right_level == 0) {
5505 				ret = changed_cb(left_path, right_path,
5506 						&right_key,
5507 						BTRFS_COMPARE_TREE_DELETED,
5508 						ctx);
5509 				if (ret < 0)
5510 					goto out;
5511 			}
5512 			advance_right = ADVANCE;
5513 			continue;
5514 		} else if (right_end_reached) {
5515 			if (left_level == 0) {
5516 				ret = changed_cb(left_path, right_path,
5517 						&left_key,
5518 						BTRFS_COMPARE_TREE_NEW,
5519 						ctx);
5520 				if (ret < 0)
5521 					goto out;
5522 			}
5523 			advance_left = ADVANCE;
5524 			continue;
5525 		}
5526 
5527 		if (left_level == 0 && right_level == 0) {
5528 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5529 			if (cmp < 0) {
5530 				ret = changed_cb(left_path, right_path,
5531 						&left_key,
5532 						BTRFS_COMPARE_TREE_NEW,
5533 						ctx);
5534 				if (ret < 0)
5535 					goto out;
5536 				advance_left = ADVANCE;
5537 			} else if (cmp > 0) {
5538 				ret = changed_cb(left_path, right_path,
5539 						&right_key,
5540 						BTRFS_COMPARE_TREE_DELETED,
5541 						ctx);
5542 				if (ret < 0)
5543 					goto out;
5544 				advance_right = ADVANCE;
5545 			} else {
5546 				enum btrfs_compare_tree_result result;
5547 
5548 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5549 				ret = tree_compare_item(left_path, right_path,
5550 							tmp_buf);
5551 				if (ret)
5552 					result = BTRFS_COMPARE_TREE_CHANGED;
5553 				else
5554 					result = BTRFS_COMPARE_TREE_SAME;
5555 				ret = changed_cb(left_path, right_path,
5556 						 &left_key, result, ctx);
5557 				if (ret < 0)
5558 					goto out;
5559 				advance_left = ADVANCE;
5560 				advance_right = ADVANCE;
5561 			}
5562 		} else if (left_level == right_level) {
5563 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5564 			if (cmp < 0) {
5565 				advance_left = ADVANCE;
5566 			} else if (cmp > 0) {
5567 				advance_right = ADVANCE;
5568 			} else {
5569 				left_blockptr = btrfs_node_blockptr(
5570 						left_path->nodes[left_level],
5571 						left_path->slots[left_level]);
5572 				right_blockptr = btrfs_node_blockptr(
5573 						right_path->nodes[right_level],
5574 						right_path->slots[right_level]);
5575 				left_gen = btrfs_node_ptr_generation(
5576 						left_path->nodes[left_level],
5577 						left_path->slots[left_level]);
5578 				right_gen = btrfs_node_ptr_generation(
5579 						right_path->nodes[right_level],
5580 						right_path->slots[right_level]);
5581 				if (left_blockptr == right_blockptr &&
5582 				    left_gen == right_gen) {
5583 					/*
5584 					 * As we're on a shared block, don't
5585 					 * allow to go deeper.
5586 					 * allow going any deeper.
5587 					advance_left = ADVANCE_ONLY_NEXT;
5588 					advance_right = ADVANCE_ONLY_NEXT;
5589 				} else {
5590 					advance_left = ADVANCE;
5591 					advance_right = ADVANCE;
5592 				}
5593 			}
5594 		} else if (left_level < right_level) {
5595 			advance_right = ADVANCE;
5596 		} else {
5597 			advance_left = ADVANCE;
5598 		}
5599 	}
5600 
5601 out:
5602 	btrfs_free_path(left_path);
5603 	btrfs_free_path(right_path);
5604 	kvfree(tmp_buf);
5605 	return ret;
5606 }
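
/*
 * A minimal, hypothetical changed_cb sketch showing how btrfs_compare_trees()
 * reports differences between two commit roots.  count_changes_cb and
 * nr_changes are illustrative names, not part of this file; the real
 * in-tree user is the send code.  Guarded out since it is a sketch only.
 */
#if 0
static int count_changes_cb(struct btrfs_path *left_path,
			    struct btrfs_path *right_path,
			    struct btrfs_key *key,
			    enum btrfs_compare_tree_result result,
			    void *ctx)
{
	u64 *nr_changes = ctx;

	/* NEW, DELETED and CHANGED all count as a difference */
	if (result != BTRFS_COMPARE_TREE_SAME)
		(*nr_changes)++;
	return 0;	/* returning < 0 aborts the comparison */
}

/*
 * usage (the left tree is the newer one, the right tree the older one):
 *	u64 nr_changes = 0;
 *	ret = btrfs_compare_trees(send_root, parent_root,
 *				  count_changes_cb, &nr_changes);
 */
#endif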
5607 
5608 /*
5609  * this is similar to btrfs_next_leaf, but does not try to preserve
5610  * and fix up the path.  It looks for and returns the next key in the
5611  * tree based on the current path and the min_trans parameter.
5612  *
5613  * 0 is returned if another key is found, < 0 if there are any errors,
5614  * and 1 is returned if there are no higher keys in the tree.
5615  *
5616  * path->keep_locks should be set to 1 on the search made before
5617  * calling this function.
5618  */
5619 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5620 			struct btrfs_key *key, int level, u64 min_trans)
5621 {
5622 	int slot;
5623 	struct extent_buffer *c;
5624 
5625 	WARN_ON(!path->keep_locks);
5626 	while (level < BTRFS_MAX_LEVEL) {
5627 		if (!path->nodes[level])
5628 			return 1;
5629 
5630 		slot = path->slots[level] + 1;
5631 		c = path->nodes[level];
5632 next:
5633 		if (slot >= btrfs_header_nritems(c)) {
5634 			int ret;
5635 			int orig_lowest;
5636 			struct btrfs_key cur_key;
5637 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5638 			    !path->nodes[level + 1])
5639 				return 1;
5640 
5641 			if (path->locks[level + 1]) {
5642 				level++;
5643 				continue;
5644 			}
5645 
5646 			slot = btrfs_header_nritems(c) - 1;
5647 			if (level == 0)
5648 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5649 			else
5650 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5651 
5652 			orig_lowest = path->lowest_level;
5653 			btrfs_release_path(path);
5654 			path->lowest_level = level;
5655 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5656 						0, 0);
5657 			path->lowest_level = orig_lowest;
5658 			if (ret < 0)
5659 				return ret;
5660 
5661 			c = path->nodes[level];
5662 			slot = path->slots[level];
5663 			if (ret == 0)
5664 				slot++;
5665 			goto next;
5666 		}
5667 
5668 		if (level == 0)
5669 			btrfs_item_key_to_cpu(c, key, slot);
5670 		else {
5671 			u64 gen = btrfs_node_ptr_generation(c, slot);
5672 
5673 			if (gen < min_trans) {
5674 				slot++;
5675 				goto next;
5676 			}
5677 			btrfs_node_key_to_cpu(c, key, slot);
5678 		}
5679 		return 0;
5680 	}
5681 	return 1;
5682 }
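
/*
 * A minimal sketch, in the style of the defrag code, of scanning a tree
 * for node keys whose generation is at least min_trans.  scan_newer_keys
 * and its arguments are illustrative assumptions, not part of this file.
 * Guarded out since it is a sketch only.
 */
#if 0
static int scan_newer_keys(struct btrfs_root *root, struct btrfs_key *key,
			   u64 min_trans)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		/* keep_locks must be set on the search preceding the call */
		path->keep_locks = 1;
		ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		if (ret < 0)
			break;

		/* ... process the block the search landed on ... */

		/* find the next key at level 1 with generation >= min_trans */
		ret = btrfs_find_next_key(root, path, key, 1, min_trans);
		btrfs_release_path(path);
		if (ret) {	/* 1: no more keys, < 0: error */
			if (ret == 1)
				ret = 0;
			break;
		}
	}
	btrfs_free_path(path);
	return ret;
}
#endif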
5683 
5684 /*
5685  * search the tree again to find a leaf with greater keys.
5686  * returns 0 if it found something, or 1 if there are no greater leaves.
5687  * returns < 0 on I/O errors.
5688  */
5689 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5690 {
5691 	return btrfs_next_old_leaf(root, path, 0);
5692 }
5693 
5694 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5695 			u64 time_seq)
5696 {
5697 	int slot;
5698 	int level;
5699 	struct extent_buffer *c;
5700 	struct extent_buffer *next;
5701 	struct btrfs_key key;
5702 	u32 nritems;
5703 	int ret;
5704 	int old_spinning = path->leave_spinning;
5705 	int next_rw_lock = 0;
5706 
5707 	nritems = btrfs_header_nritems(path->nodes[0]);
5708 	if (nritems == 0)
5709 		return 1;
5710 
5711 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5712 again:
5713 	level = 1;
5714 	next = NULL;
5715 	next_rw_lock = 0;
5716 	btrfs_release_path(path);
5717 
5718 	path->keep_locks = 1;
5719 	path->leave_spinning = 1;
5720 
5721 	if (time_seq)
5722 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5723 	else
5724 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5725 	path->keep_locks = 0;
5726 
5727 	if (ret < 0)
5728 		return ret;
5729 
5730 	nritems = btrfs_header_nritems(path->nodes[0]);
5731 	/*
5732 	 * by releasing the path above we dropped all our locks.  A balance
5733 	 * could have added more items next to the key that used to be
5734 	 * at the very end of the block.  So, check again here and
5735 	 * advance the path if there are now more items available.
5736 	 */
5737 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5738 		if (ret == 0)
5739 			path->slots[0]++;
5740 		ret = 0;
5741 		goto done;
5742 	}
5743 	/*
5744 	 * So the above check misses one case:
5745 	 * - after releasing the path above, someone has removed the item that
5746 	 *   used to be at the very end of the block, and balance between leaves
5747 	 *   gets another one with a bigger key.offset to replace it.
5748 	 *
5749 	 * This one should be returned as well, or we can get leaf corruption
5750 	 * later (esp. in __btrfs_drop_extents()).
5751 	 *
5752 	 * A bit more explanation about this check:
5753 	 * with ret > 0, the key isn't found, the path points to the slot
5754 	 * where it should be inserted, so the path->slots[0] item must be the
5755 	 * bigger one.
5756 	 */
5757 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5758 		ret = 0;
5759 		goto done;
5760 	}
5761 
5762 	while (level < BTRFS_MAX_LEVEL) {
5763 		if (!path->nodes[level]) {
5764 			ret = 1;
5765 			goto done;
5766 		}
5767 
5768 		slot = path->slots[level] + 1;
5769 		c = path->nodes[level];
5770 		if (slot >= btrfs_header_nritems(c)) {
5771 			level++;
5772 			if (level == BTRFS_MAX_LEVEL) {
5773 				ret = 1;
5774 				goto done;
5775 			}
5776 			continue;
5777 		}
5778 
5779 		if (next) {
5780 			btrfs_tree_unlock_rw(next, next_rw_lock);
5781 			free_extent_buffer(next);
5782 		}
5783 
5784 		next = c;
5785 		next_rw_lock = path->locks[level];
5786 		ret = read_block_for_search(root, path, &next, level,
5787 					    slot, &key);
5788 		if (ret == -EAGAIN)
5789 			goto again;
5790 
5791 		if (ret < 0) {
5792 			btrfs_release_path(path);
5793 			goto done;
5794 		}
5795 
5796 		if (!path->skip_locking) {
5797 			ret = btrfs_try_tree_read_lock(next);
5798 			if (!ret && time_seq) {
5799 				/*
5800 				 * If we don't get the lock, we may be racing
5801 				 * with push_leaf_left, which holds that lock
5802 				 * while itself waiting for the leaf we've
5803 				 * currently locked. To solve this situation,
5804 				 * we give up on our lock and cycle.
5805 				 */
5806 				free_extent_buffer(next);
5807 				btrfs_release_path(path);
5808 				cond_resched();
5809 				goto again;
5810 			}
5811 			if (!ret) {
5812 				btrfs_set_path_blocking(path);
5813 				btrfs_tree_read_lock(next);
5814 				btrfs_clear_path_blocking(path, next,
5815 							  BTRFS_READ_LOCK);
5816 			}
5817 			next_rw_lock = BTRFS_READ_LOCK;
5818 		}
5819 		break;
5820 	}
5821 	path->slots[level] = slot;
5822 	while (1) {
5823 		level--;
5824 		c = path->nodes[level];
5825 		if (path->locks[level])
5826 			btrfs_tree_unlock_rw(c, path->locks[level]);
5827 
5828 		free_extent_buffer(c);
5829 		path->nodes[level] = next;
5830 		path->slots[level] = 0;
5831 		if (!path->skip_locking)
5832 			path->locks[level] = next_rw_lock;
5833 		if (!level)
5834 			break;
5835 
5836 		ret = read_block_for_search(root, path, &next, level,
5837 					    0, &key);
5838 		if (ret == -EAGAIN)
5839 			goto again;
5840 
5841 		if (ret < 0) {
5842 			btrfs_release_path(path);
5843 			goto done;
5844 		}
5845 
5846 		if (!path->skip_locking) {
5847 			ret = btrfs_try_tree_read_lock(next);
5848 			if (!ret) {
5849 				btrfs_set_path_blocking(path);
5850 				btrfs_tree_read_lock(next);
5851 				btrfs_clear_path_blocking(path, next,
5852 							  BTRFS_READ_LOCK);
5853 			}
5854 			next_rw_lock = BTRFS_READ_LOCK;
5855 		}
5856 	}
5857 	ret = 0;
5858 done:
5859 	unlock_up(path, 0, 1, 0, NULL);
5860 	path->leave_spinning = old_spinning;
5861 	if (!old_spinning)
5862 		btrfs_set_path_blocking(path);
5863 
5864 	return ret;
5865 }
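
/*
 * A minimal sketch of the canonical forward iteration loop built on
 * btrfs_search_slot() and btrfs_next_leaf().  walk_items_from and the
 * empty "process" step are illustrative assumptions, not part of this
 * file.  Guarded out since it is a sketch only.
 */
#if 0
static int walk_items_from(struct btrfs_root *root, struct btrfs_key *first)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* 1: no more leaves, < 0: error */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* ... process the item at (leaf, path->slots[0]) ... */

		path->slots[0]++;
	}
	if (ret > 0)
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
#endif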
5866 
5867 /*
5868  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5869  * searching until it gets past min_objectid or finds an item of 'type'
5870  *
5871  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5872  */
5873 int btrfs_previous_item(struct btrfs_root *root,
5874 			struct btrfs_path *path, u64 min_objectid,
5875 			int type)
5876 {
5877 	struct btrfs_key found_key;
5878 	struct extent_buffer *leaf;
5879 	u32 nritems;
5880 	int ret;
5881 
5882 	while (1) {
5883 		if (path->slots[0] == 0) {
5884 			btrfs_set_path_blocking(path);
5885 			ret = btrfs_prev_leaf(root, path);
5886 			if (ret != 0)
5887 				return ret;
5888 		} else {
5889 			path->slots[0]--;
5890 		}
5891 		leaf = path->nodes[0];
5892 		nritems = btrfs_header_nritems(leaf);
5893 		if (nritems == 0)
5894 			return 1;
5895 		if (path->slots[0] == nritems)
5896 			path->slots[0]--;
5897 
5898 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5899 		if (found_key.objectid < min_objectid)
5900 			break;
5901 		if (found_key.type == type)
5902 			return 0;
5903 		if (found_key.objectid == min_objectid &&
5904 		    found_key.type < type)
5905 			break;
5906 	}
5907 	return 1;
5908 }
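
/*
 * A minimal sketch of the usual calling pattern, assuming 'root', 'path',
 * 'ret', 'ino' and an 'out' label exist in the surrounding function:
 * search past the end of the inode's EXTENT_DATA key space, then step
 * backwards to the closest preceding item.  Guarded out since it is a
 * sketch only.
 */
#if 0
	struct btrfs_key key;
	struct btrfs_key found_key;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = (u64)-1;

	/* offset (u64)-1 never exists, so we land just past the last item */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
	if (ret == 0)	/* path now points at the previous matching item */
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
#endif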
5909 
5910 /*
5911  * search in the extent tree to find a previous Metadata/Data extent item
5912  * with min objectid.
5913  *
5914  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5915  */
5916 int btrfs_previous_extent_item(struct btrfs_root *root,
5917 			struct btrfs_path *path, u64 min_objectid)
5918 {
5919 	struct btrfs_key found_key;
5920 	struct extent_buffer *leaf;
5921 	u32 nritems;
5922 	int ret;
5923 
5924 	while (1) {
5925 		if (path->slots[0] == 0) {
5926 			btrfs_set_path_blocking(path);
5927 			ret = btrfs_prev_leaf(root, path);
5928 			if (ret != 0)
5929 				return ret;
5930 		} else {
5931 			path->slots[0]--;
5932 		}
5933 		leaf = path->nodes[0];
5934 		nritems = btrfs_header_nritems(leaf);
5935 		if (nritems == 0)
5936 			return 1;
5937 		if (path->slots[0] == nritems)
5938 			path->slots[0]--;
5939 
5940 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5941 		if (found_key.objectid < min_objectid)
5942 			break;
5943 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5944 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5945 			return 0;
5946 		if (found_key.objectid == min_objectid &&
5947 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5948 			break;
5949 	}
5950 	return 1;
5951 }
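
/*
 * A minimal sketch in the style of the scrub code, assuming 'extent_root',
 * 'path', 'ret', 'bytenr' and an 'out' label in the surrounding function:
 * search for an extent at 'bytenr' and, when there is no exact hit, step
 * back to the extent item that may still cover it.  Note that for
 * EXTENT_ITEM keys the offset is the extent length, while for (skinny)
 * METADATA_ITEM keys it is the tree block level.  Guarded out since it is
 * a sketch only.
 */
#if 0
	struct btrfs_key key;

	key.objectid = bytenr;
	key.type = BTRFS_METADATA_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret > 0) {
		/* no exact hit: the covering extent, if any, is before us */
		ret = btrfs_previous_extent_item(extent_root, path, 0);
		if (ret < 0)
			goto out;
		/* ret == 1 means nothing found; ret == 0 means path is set */
	}
#endif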
5952