/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held.
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
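
/*
 * Illustrative sketch (not called from this file): the two helpers
 * above pair up around a point where a task may sleep.  Passing
 * NULL/0 for held/held_rw is fine, per the comment above.
 *
 *	btrfs_set_path_blocking(path);		(spinning -> blocking)
 *	...code that can schedule or block on IO...
 *	btrfs_clear_path_blocking(path, NULL, 0);	(back to spinning)
 */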

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent
 * buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
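
/*
 * Sketch of the usual btrfs_path lifecycle combining the helpers
 * above (illustrative only; btrfs_search_slot() is defined later in
 * this file, and the key values here are placeholders):
 *
 *	struct btrfs_key key = { .objectid = 256,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...inspect path->nodes[0] / path->slots[0]...
 *	btrfs_release_path(path);	(drop locks/refs, path reusable)
 *	btrfs_free_path(path);		(release and free)
 */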

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here: we could free up the root node
		 * because it was COWed but we may not get the new root
		 * node yet, so do the inc_not_zero dance; if it doesn't
		 * work then synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a read lock on the root.  A
 * read-locked buffer is returned, with a reference held.
 */
static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cowonly roots (everything not a reference counted cow subvolume) just
 * get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&root->fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &root->fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, root->fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
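
/*
 * Example of how a move is encoded (see tree_mod_log_insert_move()
 * below): moving 3 key pointers from slot 5 to slot 2 within one node
 * is logged as a single MOD_LOG_MOVE_KEYS element with slot == 5,
 * move.dst_slot == 2 and move.nr_items == 3, plus one
 * MOD_LOG_KEY_REMOVE_WHILE_MOVING element for each of the slots 2, 3
 * and 4 whose old contents the move overwrites.
 */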

static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
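
/*
 * Sketch of the blocker pattern the two functions above implement
 * (illustrative; this is how e.g. the backref walking code pins the
 * log, and SEQ_LIST_INIT is the initializer from ctree.h):
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	...look at past tree state via tree_mod_log_search(..., seq)...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While at least one blocker is queued, every log element newer than
 * the oldest queued sequence number is kept around.
 */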

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * the lock until all tree mod log insertions are recorded in the rb tree
 * and then release it by calling tree_mod_log_write_unlock().
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}
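
/*
 * All the log-insertion helpers below follow the same shape; sketched
 * here once so the repeated pattern is easier to follow:
 *
 *	if (!tree_mod_need_log(fs_info, eb))
 *		return 0;		(fast path, no blockers)
 *	...allocate the tree_mod_elem(s) without any lock held...
 *	if (tree_mod_dont_log(fs_info, eb))
 *		goto free_tms;		(raced with the last put)
 *	...__tree_mod_log_insert() calls, write lock held...
 *	tree_mod_log_write_unlock(fs_info);
 */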

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
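
/*
 * Worked example for the two search variants above: if the log holds
 * elements with seq 5, 7 and 9 for some block and min_seq is 6, then
 * tree_mod_log_search() returns the seq 9 element (most recent) and
 * tree_mod_log_search_oldest() returns the seq 7 element; the seq 5
 * element is too old for both.
 */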

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key(fs_info, eb, slot,
					MOD_LOG_KEY_REPLACE,
					atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as dropping
	 * a tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(root->fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root->fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, root->fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(root->fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
						eb->len);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(root, logical, 0);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(root->fs_info,
				"failed to read tree block %llu from get_old_root", logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
					root->nodesize);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}
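
/*
 * Illustrative sketch of a time-travel query (roughly what
 * btrfs_search_old_slot(), later in this file, relies on; error
 * handling omitted):
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *	u64 seq = btrfs_get_tree_mod_seq(root->fs_info, &elem);
 *	int level = btrfs_old_root_level(root, seq);
 *	...walk the rewound tree via get_old_root() and
 *	   tree_mod_log_rewind()...
 *	btrfs_put_tree_mod_seq(root->fs_info, &elem);
 */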

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
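
/*
 * Example: a block already COWed earlier in this transaction
 * (generation == trans->transid, WRITTEN and RELOC flags clear) in an
 * ordinary subvolume is not COWed a second time, while the same block
 * in a root with BTRFS_ROOT_FORCE_COW set is COWed again.
 */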

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
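
/*
 * Sketch of how callers typically drive btrfs_cow_block() (this
 * mirrors the child COW in balance_level() below; illustrative only):
 *
 *	btrfs_tree_lock(buf);
 *	btrfs_set_lock_blocking(buf);
 *	ret = btrfs_cow_block(trans, root, buf, parent, slot, &buf);
 *
 * On success 'buf' points to the writable copy, which may still be
 * the original buffer when should_cow_block() decided no COW was
 * needed.
 */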

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
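
/*
 * Example: with a 16KiB blocksize, blocknr 0 and other 40960 count as
 * close (gap of 40960 - 16384 = 24576, below 32768), while other
 * 65536 does not (gap of 49152).
 */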

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
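
/*
 * Example: keys order by (objectid, type, offset), so with
 * BTRFS_INODE_ITEM_KEY (1) and BTRFS_INODE_REF_KEY (12),
 * {256, BTRFS_INODE_ITEM_KEY, 0} sorts before
 * {256, BTRFS_INODE_REF_KEY, 0}, which sorts before
 * {257, BTRFS_INODE_ITEM_KEY, 0}.  btrfs_comp_cpu_keys() returns -1,
 * 0 or 1 accordingly.
 */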

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = root->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(root->fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr, gen);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
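
/*
 * Sketch of a lookup with the helper above (illustrative):
 *
 *	int slot;
 *	int ret = btrfs_bin_search(eb, &key, btrfs_header_level(eb),
 *				   &slot);
 *
 * ret == 0 means the key was found at 'slot'; ret == 1 means it was
 * not found and 'slot' is the insertion position (possibly equal to
 * the item count); ret < 0 is an error from the underlying search.
 */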

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
				   struct extent_buffer *parent, int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot));
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

1861 /*
1862  * node level balancing, used to make sure nodes are in proper order for
1863  * item deletion.  We balance from the top down, so we have to make sure
1864  * that a deletion won't leave a node completely empty later on.
1865  */
1866 static noinline int balance_level(struct btrfs_trans_handle *trans,
1867 			 struct btrfs_root *root,
1868 			 struct btrfs_path *path, int level)
1869 {
1870 	struct extent_buffer *right = NULL;
1871 	struct extent_buffer *mid;
1872 	struct extent_buffer *left = NULL;
1873 	struct extent_buffer *parent = NULL;
1874 	int ret = 0;
1875 	int wret;
1876 	int pslot;
1877 	int orig_slot = path->slots[level];
1878 	u64 orig_ptr;
1879 
1880 	if (level == 0)
1881 		return 0;
1882 
1883 	mid = path->nodes[level];
1884 
1885 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1886 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1887 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1888 
1889 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1890 
1891 	if (level < BTRFS_MAX_LEVEL - 1) {
1892 		parent = path->nodes[level + 1];
1893 		pslot = path->slots[level + 1];
1894 	}
1895 
1896 	/*
1897 	 * deal with the case where there is only one pointer in the root
1898 	 * by promoting the node below to a root
1899 	 */
1900 	if (!parent) {
1901 		struct extent_buffer *child;
1902 
1903 		if (btrfs_header_nritems(mid) != 1)
1904 			return 0;
1905 
1906 		/* promote the child to a root */
1907 		child = read_node_slot(root, mid, 0);
1908 		if (IS_ERR(child)) {
1909 			ret = PTR_ERR(child);
1910 			btrfs_handle_fs_error(root->fs_info, ret, NULL);
1911 			goto enospc;
1912 		}
1913 
1914 		btrfs_tree_lock(child);
1915 		btrfs_set_lock_blocking(child);
1916 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1917 		if (ret) {
1918 			btrfs_tree_unlock(child);
1919 			free_extent_buffer(child);
1920 			goto enospc;
1921 		}
1922 
1923 		tree_mod_log_set_root_pointer(root, child, 1);
1924 		rcu_assign_pointer(root->node, child);
1925 
1926 		add_root_to_dirty_list(root);
1927 		btrfs_tree_unlock(child);
1928 
1929 		path->locks[level] = 0;
1930 		path->nodes[level] = NULL;
1931 		clean_tree_block(trans, root->fs_info, mid);
1932 		btrfs_tree_unlock(mid);
1933 		/* once for the path */
1934 		free_extent_buffer(mid);
1935 
1936 		root_sub_used(root, mid->len);
1937 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1938 		/* once for the root ptr */
1939 		free_extent_buffer_stale(mid);
1940 		return 0;
1941 	}
1942 	if (btrfs_header_nritems(mid) >
1943 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1944 		return 0;
1945 
1946 	left = read_node_slot(root, parent, pslot - 1);
1947 	if (IS_ERR(left))
1948 		left = NULL;
1949 
1950 	if (left) {
1951 		btrfs_tree_lock(left);
1952 		btrfs_set_lock_blocking(left);
1953 		wret = btrfs_cow_block(trans, root, left,
1954 				       parent, pslot - 1, &left);
1955 		if (wret) {
1956 			ret = wret;
1957 			goto enospc;
1958 		}
1959 	}
1960 
1961 	right = read_node_slot(root, parent, pslot + 1);
1962 	if (IS_ERR(right))
1963 		right = NULL;
1964 
1965 	if (right) {
1966 		btrfs_tree_lock(right);
1967 		btrfs_set_lock_blocking(right);
1968 		wret = btrfs_cow_block(trans, root, right,
1969 				       parent, pslot + 1, &right);
1970 		if (wret) {
1971 			ret = wret;
1972 			goto enospc;
1973 		}
1974 	}
1975 
1976 	/* first, try to make some room in the middle buffer */
1977 	if (left) {
1978 		orig_slot += btrfs_header_nritems(left);
1979 		wret = push_node_left(trans, root, left, mid, 1);
1980 		if (wret < 0)
1981 			ret = wret;
1982 	}
1983 
1984 	/*
1985 	 * then try to empty the right most buffer into the middle
1986 	 */
1987 	if (right) {
1988 		wret = push_node_left(trans, root, mid, right, 1);
1989 		if (wret < 0 && wret != -ENOSPC)
1990 			ret = wret;
1991 		if (btrfs_header_nritems(right) == 0) {
1992 			clean_tree_block(trans, root->fs_info, right);
1993 			btrfs_tree_unlock(right);
1994 			del_ptr(root, path, level + 1, pslot + 1);
1995 			root_sub_used(root, right->len);
1996 			btrfs_free_tree_block(trans, root, right, 0, 1);
1997 			free_extent_buffer_stale(right);
1998 			right = NULL;
1999 		} else {
2000 			struct btrfs_disk_key right_key;
2001 			btrfs_node_key(right, &right_key, 0);
2002 			tree_mod_log_set_node_key(root->fs_info, parent,
2003 						  pslot + 1, 0);
2004 			btrfs_set_node_key(parent, &right_key, pslot + 1);
2005 			btrfs_mark_buffer_dirty(parent);
2006 		}
2007 	}
2008 	if (btrfs_header_nritems(mid) == 1) {
2009 		/*
2010 		 * we're not allowed to leave a node with one item in the
2011 		 * tree during a delete.  A deletion from lower in the tree
2012 		 * could try to delete the only pointer in this node.
2013 		 * So, pull some keys from the left.
2014 		 * There has to be a left pointer at this point because
2015 		 * otherwise we would have pulled some pointers from the
2016 		 * right
2017 		 */
2018 		if (!left) {
2019 			ret = -EROFS;
2020 			btrfs_handle_fs_error(root->fs_info, ret, NULL);
2021 			goto enospc;
2022 		}
2023 		wret = balance_node_right(trans, root, mid, left);
2024 		if (wret < 0) {
2025 			ret = wret;
2026 			goto enospc;
2027 		}
2028 		if (wret == 1) {
2029 			wret = push_node_left(trans, root, left, mid, 1);
2030 			if (wret < 0)
2031 				ret = wret;
2032 		}
2033 		BUG_ON(wret == 1);
2034 	}
2035 	if (btrfs_header_nritems(mid) == 0) {
2036 		clean_tree_block(trans, root->fs_info, mid);
2037 		btrfs_tree_unlock(mid);
2038 		del_ptr(root, path, level + 1, pslot);
2039 		root_sub_used(root, mid->len);
2040 		btrfs_free_tree_block(trans, root, mid, 0, 1);
2041 		free_extent_buffer_stale(mid);
2042 		mid = NULL;
2043 	} else {
2044 		/* update the parent key to reflect our changes */
2045 		struct btrfs_disk_key mid_key;
2046 		btrfs_node_key(mid, &mid_key, 0);
2047 		tree_mod_log_set_node_key(root->fs_info, parent,
2048 					  pslot, 0);
2049 		btrfs_set_node_key(parent, &mid_key, pslot);
2050 		btrfs_mark_buffer_dirty(parent);
2051 	}
2052 
2053 	/* update the path */
2054 	if (left) {
2055 		if (btrfs_header_nritems(left) > orig_slot) {
2056 			extent_buffer_get(left);
2057 			/* left was locked after cow */
2058 			path->nodes[level] = left;
2059 			path->slots[level + 1] -= 1;
2060 			path->slots[level] = orig_slot;
2061 			if (mid) {
2062 				btrfs_tree_unlock(mid);
2063 				free_extent_buffer(mid);
2064 			}
2065 		} else {
2066 			orig_slot -= btrfs_header_nritems(left);
2067 			path->slots[level] = orig_slot;
2068 		}
2069 	}
2070 	/* double check we haven't messed things up */
2071 	if (orig_ptr !=
2072 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2073 		BUG();
2074 enospc:
2075 	if (right) {
2076 		btrfs_tree_unlock(right);
2077 		free_extent_buffer(right);
2078 	}
2079 	if (left) {
2080 		if (path->nodes[level] != left)
2081 			btrfs_tree_unlock(left);
2082 		free_extent_buffer(left);
2083 	}
2084 	return ret;
2085 }
2086 
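/*
 * Illustrative helper (not used by the code above): the shape of the
 * balance_level() trigger.  A node is left alone while it stays more
 * than a quarter full; with a 16KB nodesize a node holds a few hundred
 * key pointers, so balancing only kicks in once deletions have drained
 * it well below capacity.
 */
static inline int example_node_needs_balance(struct btrfs_root *root,
					     struct extent_buffer *eb)
{
	/* true once the node has dropped to 1/4 full or less */
	return btrfs_header_nritems(eb) <=
	       BTRFS_NODEPTRS_PER_BLOCK(root) / 4;
}
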
2087 /* Node balancing for insertion.  Here we only split or push nodes around
2088  * when they are completely full.  This is also done top down, so we
2089  * have to be pessimistic.
2090  */
2091 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2092 					  struct btrfs_root *root,
2093 					  struct btrfs_path *path, int level)
2094 {
2095 	struct extent_buffer *right = NULL;
2096 	struct extent_buffer *mid;
2097 	struct extent_buffer *left = NULL;
2098 	struct extent_buffer *parent = NULL;
2099 	int ret = 0;
2100 	int wret;
2101 	int pslot;
2102 	int orig_slot = path->slots[level];
2103 
2104 	if (level == 0)
2105 		return 1;
2106 
2107 	mid = path->nodes[level];
2108 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2109 
2110 	if (level < BTRFS_MAX_LEVEL - 1) {
2111 		parent = path->nodes[level + 1];
2112 		pslot = path->slots[level + 1];
2113 	}
2114 
2115 	if (!parent)
2116 		return 1;
2117 
2118 	left = read_node_slot(root, parent, pslot - 1);
2119 	if (IS_ERR(left))
2120 		left = NULL;
2121 
2122 	/* first, try to make some room in the middle buffer */
2123 	if (left) {
2124 		u32 left_nr;
2125 
2126 		btrfs_tree_lock(left);
2127 		btrfs_set_lock_blocking(left);
2128 
2129 		left_nr = btrfs_header_nritems(left);
2130 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2131 			wret = 1;
2132 		} else {
2133 			ret = btrfs_cow_block(trans, root, left, parent,
2134 					      pslot - 1, &left);
2135 			if (ret)
2136 				wret = 1;
2137 			else {
2138 				wret = push_node_left(trans, root,
2139 						      left, mid, 0);
2140 			}
2141 		}
2142 		if (wret < 0)
2143 			ret = wret;
2144 		if (wret == 0) {
2145 			struct btrfs_disk_key disk_key;
2146 			orig_slot += left_nr;
2147 			btrfs_node_key(mid, &disk_key, 0);
2148 			tree_mod_log_set_node_key(root->fs_info, parent,
2149 						  pslot, 0);
2150 			btrfs_set_node_key(parent, &disk_key, pslot);
2151 			btrfs_mark_buffer_dirty(parent);
2152 			if (btrfs_header_nritems(left) > orig_slot) {
2153 				path->nodes[level] = left;
2154 				path->slots[level + 1] -= 1;
2155 				path->slots[level] = orig_slot;
2156 				btrfs_tree_unlock(mid);
2157 				free_extent_buffer(mid);
2158 			} else {
2159 				orig_slot -=
2160 					btrfs_header_nritems(left);
2161 				path->slots[level] = orig_slot;
2162 				btrfs_tree_unlock(left);
2163 				free_extent_buffer(left);
2164 			}
2165 			return 0;
2166 		}
2167 		btrfs_tree_unlock(left);
2168 		free_extent_buffer(left);
2169 	}
2170 	right = read_node_slot(root, parent, pslot + 1);
2171 	if (IS_ERR(right))
2172 		right = NULL;
2173 
2174 	/*
2175 	 * then try to empty the right most buffer into the middle
2176 	 */
2177 	if (right) {
2178 		u32 right_nr;
2179 
2180 		btrfs_tree_lock(right);
2181 		btrfs_set_lock_blocking(right);
2182 
2183 		right_nr = btrfs_header_nritems(right);
2184 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2185 			wret = 1;
2186 		} else {
2187 			ret = btrfs_cow_block(trans, root, right,
2188 					      parent, pslot + 1,
2189 					      &right);
2190 			if (ret)
2191 				wret = 1;
2192 			else {
2193 				wret = balance_node_right(trans, root,
2194 							  right, mid);
2195 			}
2196 		}
2197 		if (wret < 0)
2198 			ret = wret;
2199 		if (wret == 0) {
2200 			struct btrfs_disk_key disk_key;
2201 
2202 			btrfs_node_key(right, &disk_key, 0);
2203 			tree_mod_log_set_node_key(root->fs_info, parent,
2204 						  pslot + 1, 0);
2205 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2206 			btrfs_mark_buffer_dirty(parent);
2207 
2208 			if (btrfs_header_nritems(mid) <= orig_slot) {
2209 				path->nodes[level] = right;
2210 				path->slots[level + 1] += 1;
2211 				path->slots[level] = orig_slot -
2212 					btrfs_header_nritems(mid);
2213 				btrfs_tree_unlock(mid);
2214 				free_extent_buffer(mid);
2215 			} else {
2216 				btrfs_tree_unlock(right);
2217 				free_extent_buffer(right);
2218 			}
2219 			return 0;
2220 		}
2221 		btrfs_tree_unlock(right);
2222 		free_extent_buffer(right);
2223 	}
2224 	return 1;
2225 }
2226 
2227 /*
2228  * readahead one full node of leaves, finding things that are close
2229  * to the block in 'slot', and triggering readahead on them.
2230  */
2231 static void reada_for_search(struct btrfs_root *root,
2232 			     struct btrfs_path *path,
2233 			     int level, int slot, u64 objectid)
2234 {
2235 	struct extent_buffer *node;
2236 	struct btrfs_disk_key disk_key;
2237 	u32 nritems;
2238 	u64 search;
2239 	u64 target;
2240 	u64 nread = 0;
2241 	struct extent_buffer *eb;
2242 	u32 nr;
2243 	u32 blocksize;
2244 	u32 nscan = 0;
2245 
2246 	if (level != 1)
2247 		return;
2248 
2249 	if (!path->nodes[level])
2250 		return;
2251 
2252 	node = path->nodes[level];
2253 
2254 	search = btrfs_node_blockptr(node, slot);
2255 	blocksize = root->nodesize;
2256 	eb = find_extent_buffer(root->fs_info, search);
2257 	if (eb) {
2258 		free_extent_buffer(eb);
2259 		return;
2260 	}
2261 
2262 	target = search;
2263 
2264 	nritems = btrfs_header_nritems(node);
2265 	nr = slot;
2266 
2267 	while (1) {
2268 		if (path->reada == READA_BACK) {
2269 			if (nr == 0)
2270 				break;
2271 			nr--;
2272 		} else if (path->reada == READA_FORWARD) {
2273 			nr++;
2274 			if (nr >= nritems)
2275 				break;
2276 		}
2277 		if (path->reada == READA_BACK && objectid) {
2278 			btrfs_node_key(node, &disk_key, nr);
2279 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2280 				break;
2281 		}
2282 		search = btrfs_node_blockptr(node, nr);
2283 		if ((search <= target && target - search <= 65536) ||
2284 		    (search > target && search - target <= 65536)) {
2285 			readahead_tree_block(root, search);
2286 			nread += blocksize;
2287 		}
2288 		nscan++;
2289 		if (nread > 65536 || nscan > 32)
2290 			break;
2291 	}
2292 }
2293 
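/*
 * The loop above only issues readahead for blocks within 64KB of the
 * original target, and stops after reading ~64KB or scanning 32 slots.
 * An illustrative predicate for the distance check (not a helper the
 * code above actually calls):
 */
static inline bool example_within_ra_window(u64 search, u64 target)
{
	return (search <= target && target - search <= 65536) ||
	       (search > target && search - target <= 65536);
}
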
2294 static noinline void reada_for_balance(struct btrfs_root *root,
2295 				       struct btrfs_path *path, int level)
2296 {
2297 	int slot;
2298 	int nritems;
2299 	struct extent_buffer *parent;
2300 	struct extent_buffer *eb;
2301 	u64 gen;
2302 	u64 block1 = 0;
2303 	u64 block2 = 0;
2304 
2305 	parent = path->nodes[level + 1];
2306 	if (!parent)
2307 		return;
2308 
2309 	nritems = btrfs_header_nritems(parent);
2310 	slot = path->slots[level + 1];
2311 
2312 	if (slot > 0) {
2313 		block1 = btrfs_node_blockptr(parent, slot - 1);
2314 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2315 		eb = find_extent_buffer(root->fs_info, block1);
2316 		/*
2317 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2318 		 * don't want to return -EAGAIN here.  That will loop
2319 		 * forever
2320 		 */
2321 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2322 			block1 = 0;
2323 		free_extent_buffer(eb);
2324 	}
2325 	if (slot + 1 < nritems) {
2326 		block2 = btrfs_node_blockptr(parent, slot + 1);
2327 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2328 		eb = find_extent_buffer(root->fs_info, block2);
2329 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2330 			block2 = 0;
2331 		free_extent_buffer(eb);
2332 	}
2333 
2334 	if (block1)
2335 		readahead_tree_block(root, block1);
2336 	if (block2)
2337 		readahead_tree_block(root, block2);
2338 }
2339 
2340 
2341 /*
2342  * when we walk down the tree, it is usually safe to unlock the higher layers
2343  * in the tree.  The exceptions are when our path goes through slot 0, because
2344  * operations on the tree might require changing key pointers higher up in the
2345  * tree.
2346  *
2347  * callers might also have set path->keep_locks, which tells this code to keep
2348  * the lock if the path points to the last slot in the block.  This is part of
2349  * walking through the tree, and selecting the next slot in the higher block.
2350  *
2351  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
2352  * if lowest_unlock is 1, level 0 won't be unlocked.
2353  */
2354 static noinline void unlock_up(struct btrfs_path *path, int level,
2355 			       int lowest_unlock, int min_write_lock_level,
2356 			       int *write_lock_level)
2357 {
2358 	int i;
2359 	int skip_level = level;
2360 	int no_skips = 0;
2361 	struct extent_buffer *t;
2362 
2363 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2364 		if (!path->nodes[i])
2365 			break;
2366 		if (!path->locks[i])
2367 			break;
2368 		if (!no_skips && path->slots[i] == 0) {
2369 			skip_level = i + 1;
2370 			continue;
2371 		}
2372 		if (!no_skips && path->keep_locks) {
2373 			u32 nritems;
2374 			t = path->nodes[i];
2375 			nritems = btrfs_header_nritems(t);
2376 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2377 				skip_level = i + 1;
2378 				continue;
2379 			}
2380 		}
2381 		if (skip_level < i && i >= lowest_unlock)
2382 			no_skips = 1;
2383 
2384 		t = path->nodes[i];
2385 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2386 			btrfs_tree_unlock_rw(t, path->locks[i]);
2387 			path->locks[i] = 0;
2388 			if (write_lock_level &&
2389 			    i > min_write_lock_level &&
2390 			    i <= *write_lock_level) {
2391 				*write_lock_level = i - 1;
2392 			}
2393 		}
2394 	}
2395 }
2396 
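/*
 * Worked example for unlock_up(): called with level == 1 and
 * lowest_unlock == 1, with keep_locks unset and no slot along the path
 * at 0, level 1 stays locked (i > skip_level never holds there) while
 * levels 2 and up are unlocked.  If the slot at some level is 0,
 * skip_level is raised past it so that level and its parent keep their
 * locks -- changing a key in slot 0 may require updating the parent.
 */
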
2397 /*
2398  * This releases any locks held in the path starting at level and
2399  * going all the way up to the root.
2400  *
2401  * btrfs_search_slot will keep the lock held on higher nodes in a few
2402  * corner cases, such as COW of the block at slot zero in the node.  This
2403  * ignores those rules, and it should only be called when there are no
2404  * more updates to be done higher up in the tree.
2405  */
2406 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2407 {
2408 	int i;
2409 
2410 	if (path->keep_locks)
2411 		return;
2412 
2413 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2414 		if (!path->nodes[i])
2415 			continue;
2416 		if (!path->locks[i])
2417 			continue;
2418 		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2419 		path->locks[i] = 0;
2420 	}
2421 }
2422 
2423 /*
2424  * helper function for btrfs_search_slot.  The goal is to find a block
2425  * in cache without setting the path to blocking.  If we find the block
2426  * we return zero and the path is unchanged.
2427  *
2428  * If we can't find the block, we set the path blocking and do some
2429  * reada.  -EAGAIN is returned and the search must be repeated.
2430  */
2431 static int
2432 read_block_for_search(struct btrfs_trans_handle *trans,
2433 		       struct btrfs_root *root, struct btrfs_path *p,
2434 		       struct extent_buffer **eb_ret, int level, int slot,
2435 		       struct btrfs_key *key, u64 time_seq)
2436 {
2437 	u64 blocknr;
2438 	u64 gen;
2439 	struct extent_buffer *b = *eb_ret;
2440 	struct extent_buffer *tmp;
2441 	int ret;
2442 
2443 	blocknr = btrfs_node_blockptr(b, slot);
2444 	gen = btrfs_node_ptr_generation(b, slot);
2445 
2446 	tmp = find_extent_buffer(root->fs_info, blocknr);
2447 	if (tmp) {
2448 		/* first we do an atomic uptodate check */
2449 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2450 			*eb_ret = tmp;
2451 			return 0;
2452 		}
2453 
2454 		/* the pages were up to date, but we failed
2455 		 * the generation number check.  Do a full
2456 		 * read for the generation number that is correct.
2457 		 * We must do this without dropping locks so
2458 		 * we can trust our generation number
2459 		 */
2460 		btrfs_set_path_blocking(p);
2461 
2462 		/* now we're allowed to do a blocking uptodate check */
2463 		ret = btrfs_read_buffer(tmp, gen);
2464 		if (!ret) {
2465 			*eb_ret = tmp;
2466 			return 0;
2467 		}
2468 		free_extent_buffer(tmp);
2469 		btrfs_release_path(p);
2470 		return -EIO;
2471 	}
2472 
2473 	/*
2474 	 * reduce lock contention at high levels
2475 	 * of the btree by dropping locks before
2476 	 * we read.  Don't release the lock on the current
2477 	 * level because we need to walk this node to figure
2478 	 * out which blocks to read.
2479 	 */
2480 	btrfs_unlock_up_safe(p, level + 1);
2481 	btrfs_set_path_blocking(p);
2482 
2483 	free_extent_buffer(tmp);
2484 	if (p->reada != READA_NONE)
2485 		reada_for_search(root, p, level, slot, key->objectid);
2486 
2487 	btrfs_release_path(p);
2488 
2489 	ret = -EAGAIN;
2490 	tmp = read_tree_block(root, blocknr, 0);
2491 	if (!IS_ERR(tmp)) {
2492 		/*
2493 		 * If the read above didn't mark this buffer up to date,
2494 		 * it will never end up being up to date.  Set ret to -EIO now
2495 		 * and give up so that our caller doesn't loop forever
2496 		 * on our -EAGAINs.
2497 		 */
2498 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
2499 			ret = -EIO;
2500 		free_extent_buffer(tmp);
2501 	} else {
2502 		ret = PTR_ERR(tmp);
2503 	}
2504 	return ret;
2505 }
2506 
2507 /*
2508  * helper function for btrfs_search_slot.  This does all of the checks
2509  * for node-level blocks and does any balancing required based on
2510  * the ins_len.
2511  *
2512  * If no extra work was required, zero is returned.  If we had to
2513  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2514  * start over
2515  */
2516 static int
2517 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2518 		       struct btrfs_root *root, struct btrfs_path *p,
2519 		       struct extent_buffer *b, int level, int ins_len,
2520 		       int *write_lock_level)
2521 {
2522 	int ret;
2523 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2524 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2525 		int sret;
2526 
2527 		if (*write_lock_level < level + 1) {
2528 			*write_lock_level = level + 1;
2529 			btrfs_release_path(p);
2530 			goto again;
2531 		}
2532 
2533 		btrfs_set_path_blocking(p);
2534 		reada_for_balance(root, p, level);
2535 		sret = split_node(trans, root, p, level);
2536 		btrfs_clear_path_blocking(p, NULL, 0);
2537 
2538 		BUG_ON(sret > 0);
2539 		if (sret) {
2540 			ret = sret;
2541 			goto done;
2542 		}
2543 		b = p->nodes[level];
2544 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2545 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2546 		int sret;
2547 
2548 		if (*write_lock_level < level + 1) {
2549 			*write_lock_level = level + 1;
2550 			btrfs_release_path(p);
2551 			goto again;
2552 		}
2553 
2554 		btrfs_set_path_blocking(p);
2555 		reada_for_balance(root, p, level);
2556 		sret = balance_level(trans, root, p, level);
2557 		btrfs_clear_path_blocking(p, NULL, 0);
2558 
2559 		if (sret) {
2560 			ret = sret;
2561 			goto done;
2562 		}
2563 		b = p->nodes[level];
2564 		if (!b) {
2565 			btrfs_release_path(p);
2566 			goto again;
2567 		}
2568 		BUG_ON(btrfs_header_nritems(b) == 1);
2569 	}
2570 	return 0;
2571 
2572 again:
2573 	ret = -EAGAIN;
2574 done:
2575 	return ret;
2576 }
2577 
2578 static void key_search_validate(struct extent_buffer *b,
2579 				struct btrfs_key *key,
2580 				int level)
2581 {
2582 #ifdef CONFIG_BTRFS_ASSERT
2583 	struct btrfs_disk_key disk_key;
2584 
2585 	btrfs_cpu_key_to_disk(&disk_key, key);
2586 
2587 	if (level == 0)
2588 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2589 		    offsetof(struct btrfs_leaf, items[0].key),
2590 		    sizeof(disk_key)));
2591 	else
2592 		ASSERT(!memcmp_extent_buffer(b, &disk_key,
2593 		    offsetof(struct btrfs_node, ptrs[0].key),
2594 		    sizeof(disk_key)));
2595 #endif
2596 }
2597 
2598 static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2599 		      int level, int *prev_cmp, int *slot)
2600 {
2601 	if (*prev_cmp != 0) {
2602 		*prev_cmp = bin_search(b, key, level, slot);
2603 		return *prev_cmp;
2604 	}
2605 
2606 	key_search_validate(b, key, level);
2607 	*slot = 0;
2608 
2609 	return 0;
2610 }
2611 
2612 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2613 		u64 iobjectid, u64 ioff, u8 key_type,
2614 		struct btrfs_key *found_key)
2615 {
2616 	int ret;
2617 	struct btrfs_key key;
2618 	struct extent_buffer *eb;
2619 
2620 	ASSERT(path);
2621 	ASSERT(found_key);
2622 
2623 	key.type = key_type;
2624 	key.objectid = iobjectid;
2625 	key.offset = ioff;
2626 
2627 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2628 	if (ret < 0)
2629 		return ret;
2630 
2631 	eb = path->nodes[0];
2632 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2633 		ret = btrfs_next_leaf(fs_root, path);
2634 		if (ret)
2635 			return ret;
2636 		eb = path->nodes[0];
2637 	}
2638 
2639 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2640 	if (found_key->type != key.type ||
2641 			found_key->objectid != key.objectid)
2642 		return 1;
2643 
2644 	return 0;
2645 }
2646 
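/*
 * A minimal usage sketch for btrfs_find_item() (hypothetical caller;
 * the key type is just an example): find the first item matching an
 * objectid/type pair while ignoring the offset.
 */
static int example_find_inode_item(struct btrfs_root *fs_root,
				   struct btrfs_path *path, u64 ino)
{
	struct btrfs_key found_key;
	int ret;

	ret = btrfs_find_item(fs_root, path, ino, 0,
			      BTRFS_INODE_ITEM_KEY, &found_key);
	if (ret == 0) {
		/* found: path->nodes[0] / path->slots[0] point at the item */
	} else if (ret == 1) {
		/* no item with this objectid and type exists */
	}
	return ret;
}
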
2647 /*
2648  * look for key in the tree.  path is filled in with nodes along the way
2649  * if key is found, we return zero and you can find the item in the leaf
2650  * level of the path (level 0)
2651  *
2652  * If the key isn't found, the path points to the slot where it should
2653  * be inserted, and 1 is returned.  If there are other errors during the
2654  * search a negative error number is returned.
2655  *
2656  * if ins_len > 0, nodes and leaves will be split as we walk down the
2657  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2658  * possible)
2659  */
2660 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2661 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
2662 		      ins_len, int cow)
2663 {
2664 	struct extent_buffer *b;
2665 	int slot;
2666 	int ret;
2667 	int err;
2668 	int level;
2669 	int lowest_unlock = 1;
2670 	int root_lock;
2671 	/* everything at write_lock_level or lower must be write locked */
2672 	int write_lock_level = 0;
2673 	u8 lowest_level = 0;
2674 	int min_write_lock_level;
2675 	int prev_cmp;
2676 
2677 	lowest_level = p->lowest_level;
2678 	WARN_ON(lowest_level && ins_len > 0);
2679 	WARN_ON(p->nodes[0] != NULL);
2680 	BUG_ON(!cow && ins_len);
2681 
2682 	if (ins_len < 0) {
2683 		lowest_unlock = 2;
2684 
2685 		/* when we are removing items, we might have to go up to level
2686 		 * two as we update tree pointers.  Make sure we keep write
2687 		 * locks for those levels as well
2688 		 */
2689 		write_lock_level = 2;
2690 	} else if (ins_len > 0) {
2691 		/*
2692 		 * for inserting items, make sure we have a write lock on
2693 		 * level 1 so we can update keys
2694 		 */
2695 		write_lock_level = 1;
2696 	}
2697 
2698 	if (!cow)
2699 		write_lock_level = -1;
2700 
2701 	if (cow && (p->keep_locks || p->lowest_level))
2702 		write_lock_level = BTRFS_MAX_LEVEL;
2703 
2704 	min_write_lock_level = write_lock_level;
2705 
2706 again:
2707 	prev_cmp = -1;
2708 	/*
2709 	 * we try very hard to do read locks on the root
2710 	 */
2711 	root_lock = BTRFS_READ_LOCK;
2712 	level = 0;
2713 	if (p->search_commit_root) {
2714 		/*
2715 		 * the commit roots are read only
2716 		 * so we always do read locks
2717 		 */
2718 		if (p->need_commit_sem)
2719 			down_read(&root->fs_info->commit_root_sem);
2720 		b = root->commit_root;
2721 		extent_buffer_get(b);
2722 		level = btrfs_header_level(b);
2723 		if (p->need_commit_sem)
2724 			up_read(&root->fs_info->commit_root_sem);
2725 		if (!p->skip_locking)
2726 			btrfs_tree_read_lock(b);
2727 	} else {
2728 		if (p->skip_locking) {
2729 			b = btrfs_root_node(root);
2730 			level = btrfs_header_level(b);
2731 		} else {
2732 			/* we don't know the level of the root node
2733 			 * until we actually have it read locked
2734 			 */
2735 			b = btrfs_read_lock_root_node(root);
2736 			level = btrfs_header_level(b);
2737 			if (level <= write_lock_level) {
2738 				/* whoops, must trade for write lock */
2739 				btrfs_tree_read_unlock(b);
2740 				free_extent_buffer(b);
2741 				b = btrfs_lock_root_node(root);
2742 				root_lock = BTRFS_WRITE_LOCK;
2743 
2744 				/* the level might have changed, check again */
2745 				level = btrfs_header_level(b);
2746 			}
2747 		}
2748 	}
2749 	p->nodes[level] = b;
2750 	if (!p->skip_locking)
2751 		p->locks[level] = root_lock;
2752 
2753 	while (b) {
2754 		level = btrfs_header_level(b);
2755 
2756 		/*
2757 		 * setup the path here so we can release it under lock
2758 		 * contention with the cow code
2759 		 */
2760 		if (cow) {
2761 			/*
2762 			 * if we don't really need to cow this block
2763 			 * then we don't want to set the path blocking,
2764 			 * so we test it here
2765 			 */
2766 			if (!should_cow_block(trans, root, b)) {
2767 				trans->dirty = true;
2768 				goto cow_done;
2769 			}
2770 
2771 			/*
2772 			 * must have write locks on this node and the
2773 			 * parent
2774 			 */
2775 			if (level > write_lock_level ||
2776 			    (level + 1 > write_lock_level &&
2777 			    level + 1 < BTRFS_MAX_LEVEL &&
2778 			    p->nodes[level + 1])) {
2779 				write_lock_level = level + 1;
2780 				btrfs_release_path(p);
2781 				goto again;
2782 			}
2783 
2784 			btrfs_set_path_blocking(p);
2785 			err = btrfs_cow_block(trans, root, b,
2786 					      p->nodes[level + 1],
2787 					      p->slots[level + 1], &b);
2788 			if (err) {
2789 				ret = err;
2790 				goto done;
2791 			}
2792 		}
2793 cow_done:
2794 		p->nodes[level] = b;
2795 		btrfs_clear_path_blocking(p, NULL, 0);
2796 
2797 		/*
2798 		 * we have a lock on b and as long as we aren't changing
2799 		 * the tree, there is no way for the items in b to change.
2800 		 * It is safe to drop the lock on our parent before we
2801 		 * go through the expensive btree search on b.
2802 		 *
2803 		 * If we're inserting or deleting (ins_len != 0), then we might
2804 		 * be changing slot zero, which may require changing the parent.
2805 		 * So, we can't drop the lock until after we know which slot
2806 		 * we're operating on.
2807 		 */
2808 		if (!ins_len && !p->keep_locks) {
2809 			int u = level + 1;
2810 
2811 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2812 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2813 				p->locks[u] = 0;
2814 			}
2815 		}
2816 
2817 		ret = key_search(b, key, level, &prev_cmp, &slot);
2818 		if (ret < 0)
2819 			goto done;
2820 
2821 		if (level != 0) {
2822 			int dec = 0;
2823 			if (ret && slot > 0) {
2824 				dec = 1;
2825 				slot -= 1;
2826 			}
2827 			p->slots[level] = slot;
2828 			err = setup_nodes_for_search(trans, root, p, b, level,
2829 					     ins_len, &write_lock_level);
2830 			if (err == -EAGAIN)
2831 				goto again;
2832 			if (err) {
2833 				ret = err;
2834 				goto done;
2835 			}
2836 			b = p->nodes[level];
2837 			slot = p->slots[level];
2838 
2839 			/*
2840 			 * slot 0 is special, if we change the key
2841 			 * we have to update the parent pointer
2842 			 * which means we must have a write lock
2843 			 * on the parent
2844 			 */
2845 			if (slot == 0 && ins_len &&
2846 			    write_lock_level < level + 1) {
2847 				write_lock_level = level + 1;
2848 				btrfs_release_path(p);
2849 				goto again;
2850 			}
2851 
2852 			unlock_up(p, level, lowest_unlock,
2853 				  min_write_lock_level, &write_lock_level);
2854 
2855 			if (level == lowest_level) {
2856 				if (dec)
2857 					p->slots[level]++;
2858 				goto done;
2859 			}
2860 
2861 			err = read_block_for_search(trans, root, p,
2862 						    &b, level, slot, key, 0);
2863 			if (err == -EAGAIN)
2864 				goto again;
2865 			if (err) {
2866 				ret = err;
2867 				goto done;
2868 			}
2869 
2870 			if (!p->skip_locking) {
2871 				level = btrfs_header_level(b);
2872 				if (level <= write_lock_level) {
2873 					err = btrfs_try_tree_write_lock(b);
2874 					if (!err) {
2875 						btrfs_set_path_blocking(p);
2876 						btrfs_tree_lock(b);
2877 						btrfs_clear_path_blocking(p, b,
2878 								  BTRFS_WRITE_LOCK);
2879 					}
2880 					p->locks[level] = BTRFS_WRITE_LOCK;
2881 				} else {
2882 					err = btrfs_tree_read_lock_atomic(b);
2883 					if (!err) {
2884 						btrfs_set_path_blocking(p);
2885 						btrfs_tree_read_lock(b);
2886 						btrfs_clear_path_blocking(p, b,
2887 								  BTRFS_READ_LOCK);
2888 					}
2889 					p->locks[level] = BTRFS_READ_LOCK;
2890 				}
2891 				p->nodes[level] = b;
2892 			}
2893 		} else {
2894 			p->slots[level] = slot;
2895 			if (ins_len > 0 &&
2896 			    btrfs_leaf_free_space(root, b) < ins_len) {
2897 				if (write_lock_level < 1) {
2898 					write_lock_level = 1;
2899 					btrfs_release_path(p);
2900 					goto again;
2901 				}
2902 
2903 				btrfs_set_path_blocking(p);
2904 				err = split_leaf(trans, root, key,
2905 						 p, ins_len, ret == 0);
2906 				btrfs_clear_path_blocking(p, NULL, 0);
2907 
2908 				BUG_ON(err > 0);
2909 				if (err) {
2910 					ret = err;
2911 					goto done;
2912 				}
2913 			}
2914 			if (!p->search_for_split)
2915 				unlock_up(p, level, lowest_unlock,
2916 					  min_write_lock_level, &write_lock_level);
2917 			goto done;
2918 		}
2919 	}
2920 	ret = 1;
2921 done:
2922 	/*
2923 	 * we don't really know what they plan on doing with the path
2924 	 * from here on, so for now just mark it as blocking
2925 	 */
2926 	if (!p->leave_spinning)
2927 		btrfs_set_path_blocking(p);
2928 	if (ret < 0 && !p->skip_release_on_error)
2929 		btrfs_release_path(p);
2930 	return ret;
2931 }
2932 
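/*
 * A minimal usage sketch for btrfs_search_slot() (hypothetical read-only
 * caller): a NULL transaction with ins_len and cow both 0 performs a
 * plain lookup, and the return value distinguishes an exact match from
 * the would-be insertion position.
 */
static int example_lookup(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* exact match at path->nodes[0], path->slots[0] */
	} else if (ret > 0) {
		/* not found; the path points at the insertion slot */
	}
	btrfs_free_path(path);
	return ret;
}
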
2933 /*
2934  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2935  * current state of the tree together with the operations recorded in the tree
2936  * modification log to search for the key in a previous version of this tree, as
2937  * denoted by the time_seq parameter.
2938  *
2939  * Naturally, there is no support for insert, delete or cow operations.
2940  *
2941  * The resulting path and return value will be set up as if we called
2942  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2943  */
2944 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2945 			  struct btrfs_path *p, u64 time_seq)
2946 {
2947 	struct extent_buffer *b;
2948 	int slot;
2949 	int ret;
2950 	int err;
2951 	int level;
2952 	int lowest_unlock = 1;
2953 	u8 lowest_level = 0;
2954 	int prev_cmp = -1;
2955 
2956 	lowest_level = p->lowest_level;
2957 	WARN_ON(p->nodes[0] != NULL);
2958 
2959 	if (p->search_commit_root) {
2960 		BUG_ON(time_seq);
2961 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2962 	}
2963 
2964 again:
2965 	b = get_old_root(root, time_seq);
2966 	level = btrfs_header_level(b);
2967 	p->locks[level] = BTRFS_READ_LOCK;
2968 
2969 	while (b) {
2970 		level = btrfs_header_level(b);
2971 		p->nodes[level] = b;
2972 		btrfs_clear_path_blocking(p, NULL, 0);
2973 
2974 		/*
2975 		 * we have a lock on b and as long as we aren't changing
2976 		 * the tree, there is no way for the items in b to change.
2977 		 * It is safe to drop the lock on our parent before we
2978 		 * go through the expensive btree search on b.
2979 		 */
2980 		btrfs_unlock_up_safe(p, level + 1);
2981 
2982 		/*
2983 		 * Since we can unwind extent buffers, we want to do a real
2984 		 * search every time.
2985 		 */
2986 		prev_cmp = -1;
2987 		ret = key_search(b, key, level, &prev_cmp, &slot);
2988 
2989 		if (level != 0) {
2990 			int dec = 0;
2991 			if (ret && slot > 0) {
2992 				dec = 1;
2993 				slot -= 1;
2994 			}
2995 			p->slots[level] = slot;
2996 			unlock_up(p, level, lowest_unlock, 0, NULL);
2997 
2998 			if (level == lowest_level) {
2999 				if (dec)
3000 					p->slots[level]++;
3001 				goto done;
3002 			}
3003 
3004 			err = read_block_for_search(NULL, root, p, &b, level,
3005 						    slot, key, time_seq);
3006 			if (err == -EAGAIN)
3007 				goto again;
3008 			if (err) {
3009 				ret = err;
3010 				goto done;
3011 			}
3012 
3013 			level = btrfs_header_level(b);
3014 			err = btrfs_tree_read_lock_atomic(b);
3015 			if (!err) {
3016 				btrfs_set_path_blocking(p);
3017 				btrfs_tree_read_lock(b);
3018 				btrfs_clear_path_blocking(p, b,
3019 							  BTRFS_READ_LOCK);
3020 			}
3021 			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3022 			if (!b) {
3023 				ret = -ENOMEM;
3024 				goto done;
3025 			}
3026 			p->locks[level] = BTRFS_READ_LOCK;
3027 			p->nodes[level] = b;
3028 		} else {
3029 			p->slots[level] = slot;
3030 			unlock_up(p, level, lowest_unlock, 0, NULL);
3031 			goto done;
3032 		}
3033 	}
3034 	ret = 1;
3035 done:
3036 	if (!p->leave_spinning)
3037 		btrfs_set_path_blocking(p);
3038 	if (ret < 0)
3039 		btrfs_release_path(p);
3040 
3041 	return ret;
3042 }
3043 
3044 /*
3045  * helper to use instead of btrfs_search_slot() if no exact match is needed but
3046  * instead the next or previous item should be returned.
3047  * When find_higher is true, the next higher item is returned, the next lower
3048  * otherwise.
3049  * When return_any and find_higher are both true, and no higher item is found,
3050  * return the next lower instead.
3051  * When return_any is true and find_higher is false, and no lower item is found,
3052  * return the next higher instead.
3053  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3054  * < 0 on error
3055  */
3056 int btrfs_search_slot_for_read(struct btrfs_root *root,
3057 			       struct btrfs_key *key, struct btrfs_path *p,
3058 			       int find_higher, int return_any)
3059 {
3060 	int ret;
3061 	struct extent_buffer *leaf;
3062 
3063 again:
3064 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3065 	if (ret <= 0)
3066 		return ret;
3067 	/*
3068 	 * a return value of 1 means the path is at the position where the
3069 	 * item should be inserted. Normally this is the next bigger item,
3070 	 * but in case the previous item is the last in a leaf, path points
3071 	 * to the first free slot in the previous leaf, i.e. at an invalid
3072 	 * item.
3073 	 */
3074 	leaf = p->nodes[0];
3075 
3076 	if (find_higher) {
3077 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3078 			ret = btrfs_next_leaf(root, p);
3079 			if (ret <= 0)
3080 				return ret;
3081 			if (!return_any)
3082 				return 1;
3083 			/*
3084 			 * no higher item found, return the next
3085 			 * lower instead
3086 			 */
3087 			return_any = 0;
3088 			find_higher = 0;
3089 			btrfs_release_path(p);
3090 			goto again;
3091 		}
3092 	} else {
3093 		if (p->slots[0] == 0) {
3094 			ret = btrfs_prev_leaf(root, p);
3095 			if (ret < 0)
3096 				return ret;
3097 			if (!ret) {
3098 				leaf = p->nodes[0];
3099 				if (p->slots[0] == btrfs_header_nritems(leaf))
3100 					p->slots[0]--;
3101 				return 0;
3102 			}
3103 			if (!return_any)
3104 				return 1;
3105 			/*
3106 			 * no lower item found, return the next
3107 			 * higher instead
3108 			 */
3109 			return_any = 0;
3110 			find_higher = 1;
3111 			btrfs_release_path(p);
3112 			goto again;
3113 		} else {
3114 			--p->slots[0];
3115 		}
3116 	}
3117 	return 0;
3118 }
3119 
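/*
 * Usage sketch for btrfs_search_slot_for_read() (hypothetical caller):
 * position the path at the first item greater than or equal to 'key',
 * falling back to the next lower item when nothing higher exists.
 */
static int example_find_next_or_prev(struct btrfs_root *root,
				     struct btrfs_key *key,
				     struct btrfs_path *path)
{
	/* find_higher = 1, return_any = 1 */
	return btrfs_search_slot_for_read(root, key, path, 1, 1);
}
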
3120 /*
3121  * adjust the pointers going up the tree, starting at level
3122  * making sure the right key of each node points to 'key'.
3123  * This is used after shifting pointers to the left, so it stops
3124  * fixing up pointers when a given leaf/node is not in slot 0 of the
3125  * higher levels
3126  *
3127  */
3128 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3129 			   struct btrfs_path *path,
3130 			   struct btrfs_disk_key *key, int level)
3131 {
3132 	int i;
3133 	struct extent_buffer *t;
3134 
3135 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3136 		int tslot = path->slots[i];
3137 		if (!path->nodes[i])
3138 			break;
3139 		t = path->nodes[i];
3140 		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3141 		btrfs_set_node_key(t, key, tslot);
3142 		btrfs_mark_buffer_dirty(path->nodes[i]);
3143 		if (tslot != 0)
3144 			break;
3145 	}
3146 }
3147 
3148 /*
3149  * update item key.
3150  *
3151  * This function isn't completely safe. It's the caller's responsibility
3152  * to ensure that the new key won't break the ordering
3153  */
3154 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3155 			     struct btrfs_path *path,
3156 			     struct btrfs_key *new_key)
3157 {
3158 	struct btrfs_disk_key disk_key;
3159 	struct extent_buffer *eb;
3160 	int slot;
3161 
3162 	eb = path->nodes[0];
3163 	slot = path->slots[0];
3164 	if (slot > 0) {
3165 		btrfs_item_key(eb, &disk_key, slot - 1);
3166 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3167 	}
3168 	if (slot < btrfs_header_nritems(eb) - 1) {
3169 		btrfs_item_key(eb, &disk_key, slot + 1);
3170 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3171 	}
3172 
3173 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3174 	btrfs_set_item_key(eb, &disk_key, slot);
3175 	btrfs_mark_buffer_dirty(eb);
3176 	if (slot == 0)
3177 		fixup_low_keys(fs_info, path, &disk_key, 1);
3178 }
3179 
3180 /*
3181  * try to push data from one node into the next node left in the
3182  * tree.
3183  *
3184  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3185  * error, and > 0 if there was no room in the left hand block.
3186  */
3187 static int push_node_left(struct btrfs_trans_handle *trans,
3188 			  struct btrfs_root *root, struct extent_buffer *dst,
3189 			  struct extent_buffer *src, int empty)
3190 {
3191 	int push_items = 0;
3192 	int src_nritems;
3193 	int dst_nritems;
3194 	int ret = 0;
3195 
3196 	src_nritems = btrfs_header_nritems(src);
3197 	dst_nritems = btrfs_header_nritems(dst);
3198 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3199 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3200 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3201 
3202 	if (!empty && src_nritems <= 8)
3203 		return 1;
3204 
3205 	if (push_items <= 0)
3206 		return 1;
3207 
3208 	if (empty) {
3209 		push_items = min(src_nritems, push_items);
3210 		if (push_items < src_nritems) {
3211 			/* leave at least 8 pointers in the node if
3212 			 * we aren't going to empty it
3213 			 */
3214 			if (src_nritems - push_items < 8) {
3215 				if (push_items <= 8)
3216 					return 1;
3217 				push_items -= 8;
3218 			}
3219 		}
3220 	} else
3221 		push_items = min(src_nritems - 8, push_items);
3222 
3223 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3224 				   push_items);
3225 	if (ret) {
3226 		btrfs_abort_transaction(trans, ret);
3227 		return ret;
3228 	}
3229 	copy_extent_buffer(dst, src,
3230 			   btrfs_node_key_ptr_offset(dst_nritems),
3231 			   btrfs_node_key_ptr_offset(0),
3232 			   push_items * sizeof(struct btrfs_key_ptr));
3233 
3234 	if (push_items < src_nritems) {
3235 		/*
3236 		 * don't call tree_mod_log_eb_move here, key removal was already
3237 		 * fully logged by tree_mod_log_eb_copy above.
3238 		 */
3239 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3240 				      btrfs_node_key_ptr_offset(push_items),
3241 				      (src_nritems - push_items) *
3242 				      sizeof(struct btrfs_key_ptr));
3243 	}
3244 	btrfs_set_header_nritems(src, src_nritems - push_items);
3245 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3246 	btrfs_mark_buffer_dirty(src);
3247 	btrfs_mark_buffer_dirty(dst);
3248 
3249 	return ret;
3250 }
3251 
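/*
 * Worked example for the empty case above: with src_nritems == 20 and
 * room for 15 more pointers in dst, push_items starts at 15; pushing
 * that many would leave only 5 (< 8) pointers behind without emptying
 * the node, so 8 is subtracted and just 7 pointers move, leaving 13
 * in src.
 */
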
3252 /*
3253  * try to push data from one node into the next node right in the
3254  * tree.
3255  *
3256  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3257  * error, and > 0 if there was no room in the right hand block.
3258  *
3259  * this will only push up to 1/2 the contents of the left node over
3260  */
3261 static int balance_node_right(struct btrfs_trans_handle *trans,
3262 			      struct btrfs_root *root,
3263 			      struct extent_buffer *dst,
3264 			      struct extent_buffer *src)
3265 {
3266 	int push_items = 0;
3267 	int max_push;
3268 	int src_nritems;
3269 	int dst_nritems;
3270 	int ret = 0;
3271 
3272 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3273 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3274 
3275 	src_nritems = btrfs_header_nritems(src);
3276 	dst_nritems = btrfs_header_nritems(dst);
3277 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3278 	if (push_items <= 0)
3279 		return 1;
3280 
3281 	if (src_nritems < 4)
3282 		return 1;
3283 
3284 	max_push = src_nritems / 2 + 1;
3285 	/* don't try to empty the node */
3286 	if (max_push >= src_nritems)
3287 		return 1;
3288 
3289 	if (max_push < push_items)
3290 		push_items = max_push;
3291 
3292 	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3293 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3294 				      btrfs_node_key_ptr_offset(0),
3295 				      (dst_nritems) *
3296 				      sizeof(struct btrfs_key_ptr));
3297 
3298 	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3299 				   src_nritems - push_items, push_items);
3300 	if (ret) {
3301 		btrfs_abort_transaction(trans, ret);
3302 		return ret;
3303 	}
3304 	copy_extent_buffer(dst, src,
3305 			   btrfs_node_key_ptr_offset(0),
3306 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3307 			   push_items * sizeof(struct btrfs_key_ptr));
3308 
3309 	btrfs_set_header_nritems(src, src_nritems - push_items);
3310 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3311 
3312 	btrfs_mark_buffer_dirty(src);
3313 	btrfs_mark_buffer_dirty(dst);
3314 
3315 	return ret;
3316 }
3317 
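/*
 * Worked example for balance_node_right(): with src_nritems == 10,
 * max_push is 10 / 2 + 1 == 6, so at most 6 pointers move right even if
 * dst has more room.  A source with fewer than 4 pointers, or a push
 * that would empty the source entirely, is refused with a return of 1.
 */
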
3318 /*
3319  * helper function to insert a new root level in the tree.
3320  * A new node is allocated, and a single item is inserted to
3321  * point to the existing root
3322  *
3323  * returns zero on success or < 0 on failure.
3324  */
3325 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3326 			   struct btrfs_root *root,
3327 			   struct btrfs_path *path, int level)
3328 {
3329 	u64 lower_gen;
3330 	struct extent_buffer *lower;
3331 	struct extent_buffer *c;
3332 	struct extent_buffer *old;
3333 	struct btrfs_disk_key lower_key;
3334 
3335 	BUG_ON(path->nodes[level]);
3336 	BUG_ON(path->nodes[level-1] != root->node);
3337 
3338 	lower = path->nodes[level-1];
3339 	if (level == 1)
3340 		btrfs_item_key(lower, &lower_key, 0);
3341 	else
3342 		btrfs_node_key(lower, &lower_key, 0);
3343 
3344 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3345 				   &lower_key, level, root->node->start, 0);
3346 	if (IS_ERR(c))
3347 		return PTR_ERR(c);
3348 
3349 	root_add_used(root, root->nodesize);
3350 
3351 	memzero_extent_buffer(c, 0, sizeof(struct btrfs_header));
3352 	btrfs_set_header_nritems(c, 1);
3353 	btrfs_set_header_level(c, level);
3354 	btrfs_set_header_bytenr(c, c->start);
3355 	btrfs_set_header_generation(c, trans->transid);
3356 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3357 	btrfs_set_header_owner(c, root->root_key.objectid);
3358 
3359 	write_extent_buffer_fsid(c, root->fs_info->fsid);
3360 	write_extent_buffer_chunk_tree_uuid(c, root->fs_info->chunk_tree_uuid);
3361 
3362 	btrfs_set_node_key(c, &lower_key, 0);
3363 	btrfs_set_node_blockptr(c, 0, lower->start);
3364 	lower_gen = btrfs_header_generation(lower);
3365 	WARN_ON(lower_gen != trans->transid);
3366 
3367 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3368 
3369 	btrfs_mark_buffer_dirty(c);
3370 
3371 	old = root->node;
3372 	tree_mod_log_set_root_pointer(root, c, 0);
3373 	rcu_assign_pointer(root->node, c);
3374 
3375 	/* the super has an extra ref to root->node */
3376 	free_extent_buffer(old);
3377 
3378 	add_root_to_dirty_list(root);
3379 	extent_buffer_get(c);
3380 	path->nodes[level] = c;
3381 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3382 	path->slots[level] = 0;
3383 	return 0;
3384 }
3385 
3386 /*
3387  * worker function to insert a single pointer in a node.
3388  * the node should have enough room for the pointer already
3389  *
3390  * slot and level indicate where you want the key to go, and
3391  * blocknr is the block the key points to.
3392  */
3393 static void insert_ptr(struct btrfs_trans_handle *trans,
3394 		       struct btrfs_root *root, struct btrfs_path *path,
3395 		       struct btrfs_disk_key *key, u64 bytenr,
3396 		       int slot, int level)
3397 {
3398 	struct extent_buffer *lower;
3399 	int nritems;
3400 	int ret;
3401 
3402 	BUG_ON(!path->nodes[level]);
3403 	btrfs_assert_tree_locked(path->nodes[level]);
3404 	lower = path->nodes[level];
3405 	nritems = btrfs_header_nritems(lower);
3406 	BUG_ON(slot > nritems);
3407 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3408 	if (slot != nritems) {
3409 		if (level)
3410 			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3411 					     slot, nritems - slot);
3412 		memmove_extent_buffer(lower,
3413 			      btrfs_node_key_ptr_offset(slot + 1),
3414 			      btrfs_node_key_ptr_offset(slot),
3415 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3416 	}
3417 	if (level) {
3418 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3419 					      MOD_LOG_KEY_ADD, GFP_NOFS);
3420 		BUG_ON(ret < 0);
3421 	}
3422 	btrfs_set_node_key(lower, key, slot);
3423 	btrfs_set_node_blockptr(lower, slot, bytenr);
3424 	WARN_ON(trans->transid == 0);
3425 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3426 	btrfs_set_header_nritems(lower, nritems + 1);
3427 	btrfs_mark_buffer_dirty(lower);
3428 }
3429 
3430 /*
3431  * split the node at the specified level in path in two.
3432  * The path is corrected to point to the appropriate node after the split
3433  *
3434  * Before splitting this tries to make some room in the node by pushing
3435  * left and right, if either one works, it returns right away.
3436  *
3437  * returns 0 on success and < 0 on failure
3438  */
3439 static noinline int split_node(struct btrfs_trans_handle *trans,
3440 			       struct btrfs_root *root,
3441 			       struct btrfs_path *path, int level)
3442 {
3443 	struct extent_buffer *c;
3444 	struct extent_buffer *split;
3445 	struct btrfs_disk_key disk_key;
3446 	int mid;
3447 	int ret;
3448 	u32 c_nritems;
3449 
3450 	c = path->nodes[level];
3451 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3452 	if (c == root->node) {
3453 		/*
3454 		 * trying to split the root, let's make a new one
3455 		 *
3456 		 * tree mod log: We don't log removal of the old root in
3457 		 * insert_new_root, because that root buffer will be kept as a
3458 		 * normal node. We are going to log removal of half of the
3459 		 * elements below with tree_mod_log_eb_copy. We're holding a
3460 		 * tree lock on the buffer, which is why we cannot race with
3461 		 * other tree_mod_log users.
3462 		 */
3463 		ret = insert_new_root(trans, root, path, level + 1);
3464 		if (ret)
3465 			return ret;
3466 	} else {
3467 		ret = push_nodes_for_insert(trans, root, path, level);
3468 		c = path->nodes[level];
3469 		if (!ret && btrfs_header_nritems(c) <
3470 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3471 			return 0;
3472 		if (ret < 0)
3473 			return ret;
3474 	}
3475 
3476 	c_nritems = btrfs_header_nritems(c);
3477 	mid = (c_nritems + 1) / 2;
3478 	btrfs_node_key(c, &disk_key, mid);
3479 
3480 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3481 			&disk_key, level, c->start, 0);
3482 	if (IS_ERR(split))
3483 		return PTR_ERR(split);
3484 
3485 	root_add_used(root, root->nodesize);
3486 
3487 	memzero_extent_buffer(split, 0, sizeof(struct btrfs_header));
3488 	btrfs_set_header_level(split, btrfs_header_level(c));
3489 	btrfs_set_header_bytenr(split, split->start);
3490 	btrfs_set_header_generation(split, trans->transid);
3491 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3492 	btrfs_set_header_owner(split, root->root_key.objectid);
3493 	write_extent_buffer_fsid(split, root->fs_info->fsid);
3494 	write_extent_buffer_chunk_tree_uuid(split,
3495 			root->fs_info->chunk_tree_uuid);
3496 
3497 	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3498 				   mid, c_nritems - mid);
3499 	if (ret) {
3500 		btrfs_abort_transaction(trans, ret);
3501 		return ret;
3502 	}
3503 	copy_extent_buffer(split, c,
3504 			   btrfs_node_key_ptr_offset(0),
3505 			   btrfs_node_key_ptr_offset(mid),
3506 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3507 	btrfs_set_header_nritems(split, c_nritems - mid);
3508 	btrfs_set_header_nritems(c, mid);
3509 	ret = 0;
3510 
3511 	btrfs_mark_buffer_dirty(c);
3512 	btrfs_mark_buffer_dirty(split);
3513 
3514 	insert_ptr(trans, root, path, &disk_key, split->start,
3515 		   path->slots[level + 1] + 1, level + 1);
3516 
3517 	if (path->slots[level] >= mid) {
3518 		path->slots[level] -= mid;
3519 		btrfs_tree_unlock(c);
3520 		free_extent_buffer(c);
3521 		path->nodes[level] = split;
3522 		path->slots[level + 1] += 1;
3523 	} else {
3524 		btrfs_tree_unlock(split);
3525 		free_extent_buffer(split);
3526 	}
3527 	return ret;
3528 }
3529 
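/*
 * Worked example for the split above: with c_nritems == 121,
 * mid == (121 + 1) / 2 == 61, so c keeps the 61 items in slots
 * [0, 61) and the new 'split' node receives the 60 items in
 * [61, 121); the path then shifts by mid if its slot landed in
 * the new node.
 */
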
3530 /*
3531  * how many bytes are required to store the items in a leaf.  start
3532  * and nr indicate which items in the leaf to check.  This totals up the
3533  * space used both by the item structs and the item data
3534  */
3535 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3536 {
3537 	struct btrfs_item *start_item;
3538 	struct btrfs_item *end_item;
3539 	struct btrfs_map_token token;
3540 	int data_len;
3541 	int nritems = btrfs_header_nritems(l);
3542 	int end = min(nritems, start + nr) - 1;
3543 
3544 	if (!nr)
3545 		return 0;
3546 	btrfs_init_map_token(&token);
3547 	start_item = btrfs_item_nr(start);
3548 	end_item = btrfs_item_nr(end);
3549 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3550 		btrfs_token_item_size(l, start_item, &token);
3551 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3552 	data_len += sizeof(struct btrfs_item) * nr;
3553 	WARN_ON(data_len < 0);
3554 	return data_len;
3555 }
3556 
3557 /*
3558  * The space between the end of the leaf items and
3559  * the start of the leaf data.  IOW, how much room
3560  * the leaf has left for both items and data
3561  */
3562 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3563 				   struct extent_buffer *leaf)
3564 {
3565 	int nritems = btrfs_header_nritems(leaf);
3566 	int ret;
3567 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3568 	if (ret < 0) {
3569 		btrfs_crit(root->fs_info,
3570 			"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3571 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3572 		       leaf_space_used(leaf, 0, nritems), nritems);
3573 	}
3574 	return ret;
3575 }
3576 
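/*
 * Worked example for btrfs_leaf_free_space(): BTRFS_LEAF_DATA_SIZE is
 * the nodesize minus the header, e.g. 16384 - 101 = 16283 bytes with a
 * 16KB nodesize.  Each item consumes sizeof(struct btrfs_item)
 * (25 bytes) of metadata plus its data, so a leaf whose items and data
 * total 16000 bytes reports 283 bytes free.
 */
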
3577 /*
3578  * min slot controls the lowest index we're willing to push to the
3579  * right.  We'll push up to and including min_slot, but no lower
3580  */
3581 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3582 				      struct btrfs_root *root,
3583 				      struct btrfs_path *path,
3584 				      int data_size, int empty,
3585 				      struct extent_buffer *right,
3586 				      int free_space, u32 left_nritems,
3587 				      u32 min_slot)
3588 {
3589 	struct extent_buffer *left = path->nodes[0];
3590 	struct extent_buffer *upper = path->nodes[1];
3591 	struct btrfs_map_token token;
3592 	struct btrfs_disk_key disk_key;
3593 	int slot;
3594 	u32 i;
3595 	int push_space = 0;
3596 	int push_items = 0;
3597 	struct btrfs_item *item;
3598 	u32 nr;
3599 	u32 right_nritems;
3600 	u32 data_end;
3601 	u32 this_item_size;
3602 
3603 	btrfs_init_map_token(&token);
3604 
3605 	if (empty)
3606 		nr = 0;
3607 	else
3608 		nr = max_t(u32, 1, min_slot);
3609 
3610 	if (path->slots[0] >= left_nritems)
3611 		push_space += data_size;
3612 
3613 	slot = path->slots[1];
3614 	i = left_nritems - 1;
3615 	while (i >= nr) {
3616 		item = btrfs_item_nr(i);
3617 
3618 		if (!empty && push_items > 0) {
3619 			if (path->slots[0] > i)
3620 				break;
3621 			if (path->slots[0] == i) {
3622 				int space = btrfs_leaf_free_space(root, left);
3623 				if (space + push_space * 2 > free_space)
3624 					break;
3625 			}
3626 		}
3627 
3628 		if (path->slots[0] == i)
3629 			push_space += data_size;
3630 
3631 		this_item_size = btrfs_item_size(left, item);
3632 		if (this_item_size + sizeof(*item) + push_space > free_space)
3633 			break;
3634 
3635 		push_items++;
3636 		push_space += this_item_size + sizeof(*item);
3637 		if (i == 0)
3638 			break;
3639 		i--;
3640 	}
3641 
3642 	if (push_items == 0)
3643 		goto out_unlock;
3644 
3645 	WARN_ON(!empty && push_items == left_nritems);
3646 
3647 	/* push left to right */
3648 	right_nritems = btrfs_header_nritems(right);
3649 
3650 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3651 	push_space -= leaf_data_end(root, left);
3652 
3653 	/* make room in the right data area */
3654 	data_end = leaf_data_end(root, right);
3655 	memmove_extent_buffer(right,
3656 			      btrfs_leaf_data(right) + data_end - push_space,
3657 			      btrfs_leaf_data(right) + data_end,
3658 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
3659 
3660 	/* copy from the left data area */
3661 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3662 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3663 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
3664 		     push_space);
3665 
3666 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3667 			      btrfs_item_nr_offset(0),
3668 			      right_nritems * sizeof(struct btrfs_item));
3669 
3670 	/* copy the items from left to right */
3671 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3672 		   btrfs_item_nr_offset(left_nritems - push_items),
3673 		   push_items * sizeof(struct btrfs_item));
3674 
3675 	/* update the item pointers */
3676 	right_nritems += push_items;
3677 	btrfs_set_header_nritems(right, right_nritems);
3678 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3679 	for (i = 0; i < right_nritems; i++) {
3680 		item = btrfs_item_nr(i);
3681 		push_space -= btrfs_token_item_size(right, item, &token);
3682 		btrfs_set_token_item_offset(right, item, push_space, &token);
3683 	}
3684 
3685 	left_nritems -= push_items;
3686 	btrfs_set_header_nritems(left, left_nritems);
3687 
3688 	if (left_nritems)
3689 		btrfs_mark_buffer_dirty(left);
3690 	else
3691 		clean_tree_block(trans, root->fs_info, left);
3692 
3693 	btrfs_mark_buffer_dirty(right);
3694 
3695 	btrfs_item_key(right, &disk_key, 0);
3696 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3697 	btrfs_mark_buffer_dirty(upper);
3698 
3699 	/* then fixup the leaf pointer in the path */
3700 	if (path->slots[0] >= left_nritems) {
3701 		path->slots[0] -= left_nritems;
3702 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3703 			clean_tree_block(trans, root->fs_info, path->nodes[0]);
3704 		btrfs_tree_unlock(path->nodes[0]);
3705 		free_extent_buffer(path->nodes[0]);
3706 		path->nodes[0] = right;
3707 		path->slots[1] += 1;
3708 	} else {
3709 		btrfs_tree_unlock(right);
3710 		free_extent_buffer(right);
3711 	}
3712 	return 0;
3713 
3714 out_unlock:
3715 	btrfs_tree_unlock(right);
3716 	free_extent_buffer(right);
3717 	return 1;
3718 }
3719 
3720 /*
3721  * push some data in the path leaf to the right, trying to free up at
3722  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3723  *
3724  * returns 1 if the push failed because the other node didn't have enough
3725  * room, 0 if everything worked out and < 0 if there were major errors.
3726  *
3727  * this will push starting from min_slot to the end of the leaf.  It won't
3728  * push any slot lower than min_slot
3729  */
3730 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3731 			   *root, struct btrfs_path *path,
3732 			   int min_data_size, int data_size,
3733 			   int empty, u32 min_slot)
3734 {
3735 	struct extent_buffer *left = path->nodes[0];
3736 	struct extent_buffer *right;
3737 	struct extent_buffer *upper;
3738 	int slot;
3739 	int free_space;
3740 	u32 left_nritems;
3741 	int ret;
3742 
3743 	if (!path->nodes[1])
3744 		return 1;
3745 
3746 	slot = path->slots[1];
3747 	upper = path->nodes[1];
3748 	if (slot >= btrfs_header_nritems(upper) - 1)
3749 		return 1;
3750 
3751 	btrfs_assert_tree_locked(path->nodes[1]);
3752 
3753 	right = read_node_slot(root, upper, slot + 1);
3754 	/*
3755 	 * If slot + 1 is not valid or we fail to read the right node,
3756 	 * it's no big deal, just return.
3757 	 */
3758 	if (IS_ERR(right))
3759 		return 1;
3760 
3761 	btrfs_tree_lock(right);
3762 	btrfs_set_lock_blocking(right);
3763 
3764 	free_space = btrfs_leaf_free_space(root, right);
3765 	if (free_space < data_size)
3766 		goto out_unlock;
3767 
3768 	/* cow and double check */
3769 	ret = btrfs_cow_block(trans, root, right, upper,
3770 			      slot + 1, &right);
3771 	if (ret)
3772 		goto out_unlock;
3773 
3774 	free_space = btrfs_leaf_free_space(root, right);
3775 	if (free_space < data_size)
3776 		goto out_unlock;
3777 
3778 	left_nritems = btrfs_header_nritems(left);
3779 	if (left_nritems == 0)
3780 		goto out_unlock;
3781 
3782 	if (path->slots[0] == left_nritems && !empty) {
3783 		/* Key greater than all keys in the leaf, right neighbor has
3784 		 * enough room for it and we're not emptying our leaf to delete
3785 		 * it, therefore use the right neighbor to insert the new item
3786 		 * and avoid touching/dirtying our left leaf. */
3787 		btrfs_tree_unlock(left);
3788 		free_extent_buffer(left);
3789 		path->nodes[0] = right;
3790 		path->slots[0] = 0;
3791 		path->slots[1]++;
3792 		return 0;
3793 	}
3794 
3795 	return __push_leaf_right(trans, root, path, min_data_size, empty,
3796 				right, free_space, left_nritems, min_slot);
3797 out_unlock:
3798 	btrfs_tree_unlock(right);
3799 	free_extent_buffer(right);
3800 	return 1;
3801 }
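
/*
 * Illustrative sketch, not part of the original source: how a caller
 * such as split_leaf() below consumes the return convention, assuming
 * 'trans', 'root' and 'path' are set up and 'space_needed' is the
 * number of bytes that must be freed:
 *
 *	wret = push_leaf_right(trans, root, path, space_needed,
 *			       space_needed, 0, 0);
 *	if (wret < 0)
 *		return wret;		(hard error, propagate)
 *	if (wret == 0)
 *		return 0;		(the push freed enough room)
 *	wret == 1 means the right neighbor had no room; try the left side.
 */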
3802 
3803 /*
3804  * push some data in the path leaf to the left, trying to free up at
3805  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3806  *
3807  * max_slot can put a limit on how far into the leaf we'll push items.  The
3808  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3809  * items
3810  */
3811 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3812 				     struct btrfs_root *root,
3813 				     struct btrfs_path *path, int data_size,
3814 				     int empty, struct extent_buffer *left,
3815 				     int free_space, u32 right_nritems,
3816 				     u32 max_slot)
3817 {
3818 	struct btrfs_disk_key disk_key;
3819 	struct extent_buffer *right = path->nodes[0];
3820 	int i;
3821 	int push_space = 0;
3822 	int push_items = 0;
3823 	struct btrfs_item *item;
3824 	u32 old_left_nritems;
3825 	u32 nr;
3826 	int ret = 0;
3827 	u32 this_item_size;
3828 	u32 old_left_item_size;
3829 	struct btrfs_map_token token;
3830 
3831 	btrfs_init_map_token(&token);
3832 
3833 	if (empty)
3834 		nr = min(right_nritems, max_slot);
3835 	else
3836 		nr = min(right_nritems - 1, max_slot);
3837 
3838 	for (i = 0; i < nr; i++) {
3839 		item = btrfs_item_nr(i);
3840 
3841 		if (!empty && push_items > 0) {
3842 			if (path->slots[0] < i)
3843 				break;
3844 			if (path->slots[0] == i) {
3845 				int space = btrfs_leaf_free_space(root, right);
3846 				if (space + push_space * 2 > free_space)
3847 					break;
3848 			}
3849 		}
3850 
3851 		if (path->slots[0] == i)
3852 			push_space += data_size;
3853 
3854 		this_item_size = btrfs_item_size(right, item);
3855 		if (this_item_size + sizeof(*item) + push_space > free_space)
3856 			break;
3857 
3858 		push_items++;
3859 		push_space += this_item_size + sizeof(*item);
3860 	}
3861 
3862 	if (push_items == 0) {
3863 		ret = 1;
3864 		goto out;
3865 	}
3866 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3867 
3868 	/* push data from right to left */
3869 	copy_extent_buffer(left, right,
3870 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3871 			   btrfs_item_nr_offset(0),
3872 			   push_items * sizeof(struct btrfs_item));
3873 
3874 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
3875 		     btrfs_item_offset_nr(right, push_items - 1);
3876 
3877 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3878 		     leaf_data_end(root, left) - push_space,
3879 		     btrfs_leaf_data(right) +
3880 		     btrfs_item_offset_nr(right, push_items - 1),
3881 		     push_space);
3882 	old_left_nritems = btrfs_header_nritems(left);
3883 	BUG_ON(old_left_nritems <= 0);
3884 
3885 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3886 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3887 		u32 ioff;
3888 
3889 		item = btrfs_item_nr(i);
3890 
3891 		ioff = btrfs_token_item_offset(left, item, &token);
3892 		btrfs_set_token_item_offset(left, item,
3893 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3894 		      &token);
3895 	}
3896 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3897 
3898 	/* fixup right node */
3899 	if (push_items > right_nritems)
3900 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3901 		       right_nritems);
3902 
3903 	if (push_items < right_nritems) {
3904 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3905 						  leaf_data_end(root, right);
3906 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
3907 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3908 				      btrfs_leaf_data(right) +
3909 				      leaf_data_end(root, right), push_space);
3910 
3911 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3912 			      btrfs_item_nr_offset(push_items),
3913 			     (btrfs_header_nritems(right) - push_items) *
3914 			     sizeof(struct btrfs_item));
3915 	}
3916 	right_nritems -= push_items;
3917 	btrfs_set_header_nritems(right, right_nritems);
3918 	push_space = BTRFS_LEAF_DATA_SIZE(root);
3919 	for (i = 0; i < right_nritems; i++) {
3920 		item = btrfs_item_nr(i);
3921 
3922 		push_space = push_space - btrfs_token_item_size(right,
3923 								item, &token);
3924 		btrfs_set_token_item_offset(right, item, push_space, &token);
3925 	}
3926 
3927 	btrfs_mark_buffer_dirty(left);
3928 	if (right_nritems)
3929 		btrfs_mark_buffer_dirty(right);
3930 	else
3931 		clean_tree_block(trans, root->fs_info, right);
3932 
3933 	btrfs_item_key(right, &disk_key, 0);
3934 	fixup_low_keys(root->fs_info, path, &disk_key, 1);
3935 
3936 	/* then fixup the leaf pointer in the path */
3937 	if (path->slots[0] < push_items) {
3938 		path->slots[0] += old_left_nritems;
3939 		btrfs_tree_unlock(path->nodes[0]);
3940 		free_extent_buffer(path->nodes[0]);
3941 		path->nodes[0] = left;
3942 		path->slots[1] -= 1;
3943 	} else {
3944 		btrfs_tree_unlock(left);
3945 		free_extent_buffer(left);
3946 		path->slots[0] -= push_items;
3947 	}
3948 	BUG_ON(path->slots[0] < 0);
3949 	return ret;
3950 out:
3951 	btrfs_tree_unlock(left);
3952 	free_extent_buffer(left);
3953 	return ret;
3954 }
3955 
3956 /*
3957  * push some data in the path leaf to the left, trying to free up at
3958  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3959  *
3960  * max_slot can put a limit on how far into the leaf we'll push items.  The
3961  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3962  * items
3963  */
3964 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3965 			  *root, struct btrfs_path *path, int min_data_size,
3966 			  int data_size, int empty, u32 max_slot)
3967 {
3968 	struct extent_buffer *right = path->nodes[0];
3969 	struct extent_buffer *left;
3970 	int slot;
3971 	int free_space;
3972 	u32 right_nritems;
3973 	int ret = 0;
3974 
3975 	slot = path->slots[1];
3976 	if (slot == 0)
3977 		return 1;
3978 	if (!path->nodes[1])
3979 		return 1;
3980 
3981 	right_nritems = btrfs_header_nritems(right);
3982 	if (right_nritems == 0)
3983 		return 1;
3984 
3985 	btrfs_assert_tree_locked(path->nodes[1]);
3986 
3987 	left = read_node_slot(root, path->nodes[1], slot - 1);
3988 	/*
3989 	 * If slot - 1 is not valid or we fail to read the left node,
3990 	 * it's no big deal, just return.
3991 	 */
3992 	if (IS_ERR(left))
3993 		return 1;
3994 
3995 	btrfs_tree_lock(left);
3996 	btrfs_set_lock_blocking(left);
3997 
3998 	free_space = btrfs_leaf_free_space(root, left);
3999 	if (free_space < data_size) {
4000 		ret = 1;
4001 		goto out;
4002 	}
4003 
4004 	/* cow and double check */
4005 	ret = btrfs_cow_block(trans, root, left,
4006 			      path->nodes[1], slot - 1, &left);
4007 	if (ret) {
4008 		/* we hit -ENOSPC, but it isn't fatal here */
4009 		if (ret == -ENOSPC)
4010 			ret = 1;
4011 		goto out;
4012 	}
4013 
4014 	free_space = btrfs_leaf_free_space(root, left);
4015 	if (free_space < data_size) {
4016 		ret = 1;
4017 		goto out;
4018 	}
4019 
4020 	return __push_leaf_left(trans, root, path, min_data_size,
4021 			       empty, left, free_space, right_nritems,
4022 			       max_slot);
4023 out:
4024 	btrfs_tree_unlock(left);
4025 	free_extent_buffer(left);
4026 	return ret;
4027 }
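
/*
 * Illustrative sketch, not part of the original source: the fallback
 * call made from split_leaf() below, where max_slot == (u32)-1 asks us
 * to consider every item in the leaf:
 *
 *	wret = push_leaf_left(trans, root, path, space_needed,
 *			      space_needed, 0, (u32)-1);
 *	if (wret < 0)
 *		return wret;
 *	wret == 1 just means the left neighbor could not take the items.
 */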
4028 
4029 /*
4030  * split the path's leaf in two, making sure there is at least data_size
4031  * available for the resulting leaf level of the path.
4032  */
4033 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4034 				    struct btrfs_root *root,
4035 				    struct btrfs_path *path,
4036 				    struct extent_buffer *l,
4037 				    struct extent_buffer *right,
4038 				    int slot, int mid, int nritems)
4039 {
4040 	int data_copy_size;
4041 	int rt_data_off;
4042 	int i;
4043 	struct btrfs_disk_key disk_key;
4044 	struct btrfs_map_token token;
4045 
4046 	btrfs_init_map_token(&token);
4047 
4048 	nritems = nritems - mid;
4049 	btrfs_set_header_nritems(right, nritems);
4050 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4051 
4052 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4053 			   btrfs_item_nr_offset(mid),
4054 			   nritems * sizeof(struct btrfs_item));
4055 
4056 	copy_extent_buffer(right, l,
4057 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4058 		     data_copy_size, btrfs_leaf_data(l) +
4059 		     leaf_data_end(root, l), data_copy_size);
4060 
4061 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4062 		      btrfs_item_end_nr(l, mid);
4063 
4064 	for (i = 0; i < nritems; i++) {
4065 		struct btrfs_item *item = btrfs_item_nr(i);
4066 		u32 ioff;
4067 
4068 		ioff = btrfs_token_item_offset(right, item, &token);
4069 		btrfs_set_token_item_offset(right, item,
4070 					    ioff + rt_data_off, &token);
4071 	}
4072 
4073 	btrfs_set_header_nritems(l, mid);
4074 	btrfs_item_key(right, &disk_key, 0);
4075 	insert_ptr(trans, root, path, &disk_key, right->start,
4076 		   path->slots[1] + 1, 1);
4077 
4078 	btrfs_mark_buffer_dirty(right);
4079 	btrfs_mark_buffer_dirty(l);
4080 	BUG_ON(path->slots[0] != slot);
4081 
4082 	if (mid <= slot) {
4083 		btrfs_tree_unlock(path->nodes[0]);
4084 		free_extent_buffer(path->nodes[0]);
4085 		path->nodes[0] = right;
4086 		path->slots[0] -= mid;
4087 		path->slots[1] += 1;
4088 	} else {
4089 		btrfs_tree_unlock(right);
4090 		free_extent_buffer(right);
4091 	}
4092 
4093 	BUG_ON(path->slots[0] < 0);
4094 }
4095 
4096 /*
4097  * double splits happen when we need to insert a big item in the middle
4098  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4099  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4100  *          A                 B                 C
4101  *
4102  * We avoid this by trying to push the items on either side of our target
4103  * into the adjacent leaves.  If all goes well we can avoid the double split
4104  * completely.
4105  */
4106 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4107 					  struct btrfs_root *root,
4108 					  struct btrfs_path *path,
4109 					  int data_size)
4110 {
4111 	int ret;
4112 	int progress = 0;
4113 	int slot;
4114 	u32 nritems;
4115 	int space_needed = data_size;
4116 
4117 	slot = path->slots[0];
4118 	if (slot < btrfs_header_nritems(path->nodes[0]))
4119 		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4120 
4121 	/*
4122 	 * try to push all the items after our slot into the
4123 	 * right leaf
4124 	 */
4125 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4126 	if (ret < 0)
4127 		return ret;
4128 
4129 	if (ret == 0)
4130 		progress++;
4131 
4132 	nritems = btrfs_header_nritems(path->nodes[0]);
4133 	/*
4134 	 * our goal is to get our slot at the start or end of a leaf.  If
4135 	 * we've done so, we're done
4136 	 */
4137 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4138 		return 0;
4139 
4140 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4141 		return 0;
4142 
4143 	/* try to push all the items before our slot into the left leaf */
4144 	slot = path->slots[0];
4145 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4146 	if (ret < 0)
4147 		return ret;
4148 
4149 	if (ret == 0)
4150 		progress++;
4151 
4152 	if (progress)
4153 		return 0;
4154 	return 1;
4155 }
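
/*
 * Worked example, illustrative only: with our target at slot 5 of a
 * 12 item leaf, the first push asks push_leaf_right() to move slots
 * 5..11 (min_slot == 5) and the second asks push_leaf_left() to move
 * slots 0..4 (max_slot == 5, which itself stays untouched).  If either
 * push succeeds, path->slots[0] ends up at 0 or at nritems, so a
 * single split suffices and the three-leaf outcome described above is
 * avoided.
 */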
4156 
4157 /*
4158  * split the path's leaf in two, making sure there is at least data_size
4159  * available for the resulting leaf level of the path.
4160  *
4161  * returns 0 if all went well and < 0 on failure.
4162  */
4163 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4164 			       struct btrfs_root *root,
4165 			       struct btrfs_key *ins_key,
4166 			       struct btrfs_path *path, int data_size,
4167 			       int extend)
4168 {
4169 	struct btrfs_disk_key disk_key;
4170 	struct extent_buffer *l;
4171 	u32 nritems;
4172 	int mid;
4173 	int slot;
4174 	struct extent_buffer *right;
4175 	struct btrfs_fs_info *fs_info = root->fs_info;
4176 	int ret = 0;
4177 	int wret;
4178 	int split;
4179 	int num_doubles = 0;
4180 	int tried_avoid_double = 0;
4181 
4182 	l = path->nodes[0];
4183 	slot = path->slots[0];
4184 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4185 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4186 		return -EOVERFLOW;
4187 
4188 	/* first try to make some room by pushing left and right */
4189 	if (data_size && path->nodes[1]) {
4190 		int space_needed = data_size;
4191 
4192 		if (slot < btrfs_header_nritems(l))
4193 			space_needed -= btrfs_leaf_free_space(root, l);
4194 
4195 		wret = push_leaf_right(trans, root, path, space_needed,
4196 				       space_needed, 0, 0);
4197 		if (wret < 0)
4198 			return wret;
4199 		if (wret) {
4200 			wret = push_leaf_left(trans, root, path, space_needed,
4201 					      space_needed, 0, (u32)-1);
4202 			if (wret < 0)
4203 				return wret;
4204 		}
4205 		l = path->nodes[0];
4206 
4207 		/* did the pushes work? */
4208 		if (btrfs_leaf_free_space(root, l) >= data_size)
4209 			return 0;
4210 	}
4211 
4212 	if (!path->nodes[1]) {
4213 		ret = insert_new_root(trans, root, path, 1);
4214 		if (ret)
4215 			return ret;
4216 	}
4217 again:
4218 	split = 1;
4219 	l = path->nodes[0];
4220 	slot = path->slots[0];
4221 	nritems = btrfs_header_nritems(l);
4222 	mid = (nritems + 1) / 2;
4223 
4224 	if (mid <= slot) {
4225 		if (nritems == 1 ||
4226 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4227 			BTRFS_LEAF_DATA_SIZE(root)) {
4228 			if (slot >= nritems) {
4229 				split = 0;
4230 			} else {
4231 				mid = slot;
4232 				if (mid != nritems &&
4233 				    leaf_space_used(l, mid, nritems - mid) +
4234 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4235 					if (data_size && !tried_avoid_double)
4236 						goto push_for_double;
4237 					split = 2;
4238 				}
4239 			}
4240 		}
4241 	} else {
4242 		if (leaf_space_used(l, 0, mid) + data_size >
4243 			BTRFS_LEAF_DATA_SIZE(root)) {
4244 			if (!extend && data_size && slot == 0) {
4245 				split = 0;
4246 			} else if ((extend || !data_size) && slot == 0) {
4247 				mid = 1;
4248 			} else {
4249 				mid = slot;
4250 				if (mid != nritems &&
4251 				    leaf_space_used(l, mid, nritems - mid) +
4252 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4253 					if (data_size && !tried_avoid_double)
4254 						goto push_for_double;
4255 					split = 2;
4256 				}
4257 			}
4258 		}
4259 	}
4260 
4261 	if (split == 0)
4262 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4263 	else
4264 		btrfs_item_key(l, &disk_key, mid);
4265 
4266 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4267 			&disk_key, 0, l->start, 0);
4268 	if (IS_ERR(right))
4269 		return PTR_ERR(right);
4270 
4271 	root_add_used(root, root->nodesize);
4272 
4273 	memzero_extent_buffer(right, 0, sizeof(struct btrfs_header));
4274 	btrfs_set_header_bytenr(right, right->start);
4275 	btrfs_set_header_generation(right, trans->transid);
4276 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4277 	btrfs_set_header_owner(right, root->root_key.objectid);
4278 	btrfs_set_header_level(right, 0);
4279 	write_extent_buffer_fsid(right, fs_info->fsid);
4280 	write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid);
4281 
4282 	if (split == 0) {
4283 		if (mid <= slot) {
4284 			btrfs_set_header_nritems(right, 0);
4285 			insert_ptr(trans, root, path, &disk_key, right->start,
4286 				   path->slots[1] + 1, 1);
4287 			btrfs_tree_unlock(path->nodes[0]);
4288 			free_extent_buffer(path->nodes[0]);
4289 			path->nodes[0] = right;
4290 			path->slots[0] = 0;
4291 			path->slots[1] += 1;
4292 		} else {
4293 			btrfs_set_header_nritems(right, 0);
4294 			insert_ptr(trans, root, path, &disk_key, right->start,
4295 					  path->slots[1], 1);
4296 			btrfs_tree_unlock(path->nodes[0]);
4297 			free_extent_buffer(path->nodes[0]);
4298 			path->nodes[0] = right;
4299 			path->slots[0] = 0;
4300 			if (path->slots[1] == 0)
4301 				fixup_low_keys(fs_info, path, &disk_key, 1);
4302 		}
4303 		/*
4304 		 * We create a new leaf 'right' for the required ins_len and
4305 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4306 		 * the new item's content into 'right'.
4307 		 */
4308 		return ret;
4309 	}
4310 
4311 	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4312 
4313 	if (split == 2) {
4314 		BUG_ON(num_doubles != 0);
4315 		num_doubles++;
4316 		goto again;
4317 	}
4318 
4319 	return 0;
4320 
4321 push_for_double:
4322 	push_for_double_split(trans, root, path, data_size);
4323 	tried_avoid_double = 1;
4324 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4325 		return 0;
4326 	goto again;
4327 }
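
/*
 * Worked example, illustrative only: with nritems == 9 the initial
 * split point is mid == (9 + 1) / 2 == 5.  If the insertion slot is
 * >= 5 but the right half plus data_size would still overflow
 * BTRFS_LEAF_DATA_SIZE(root), mid is moved to the slot itself; if even
 * that overflows we first try push_for_double_split() and only then
 * fall back to a second split (split == 2).  split == 0 means the new
 * item gets a brand new, empty right leaf of its own.
 */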
4328 
4329 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4330 					 struct btrfs_root *root,
4331 					 struct btrfs_path *path, int ins_len)
4332 {
4333 	struct btrfs_key key;
4334 	struct extent_buffer *leaf;
4335 	struct btrfs_file_extent_item *fi;
4336 	u64 extent_len = 0;
4337 	u32 item_size;
4338 	int ret;
4339 
4340 	leaf = path->nodes[0];
4341 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4342 
4343 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4344 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4345 
4346 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4347 		return 0;
4348 
4349 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4350 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4351 		fi = btrfs_item_ptr(leaf, path->slots[0],
4352 				    struct btrfs_file_extent_item);
4353 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4354 	}
4355 	btrfs_release_path(path);
4356 
4357 	path->keep_locks = 1;
4358 	path->search_for_split = 1;
4359 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4360 	path->search_for_split = 0;
4361 	if (ret > 0)
4362 		ret = -EAGAIN;
4363 	if (ret < 0)
4364 		goto err;
4365 
4366 	ret = -EAGAIN;
4367 	leaf = path->nodes[0];
4368 	/* if our item isn't there, return now */
4369 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4370 		goto err;
4371 
4372 	/* the leaf has changed, it now has room.  return now */
4373 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4374 		goto err;
4375 
4376 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4377 		fi = btrfs_item_ptr(leaf, path->slots[0],
4378 				    struct btrfs_file_extent_item);
4379 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4380 			goto err;
4381 	}
4382 
4383 	btrfs_set_path_blocking(path);
4384 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4385 	if (ret)
4386 		goto err;
4387 
4388 	path->keep_locks = 0;
4389 	btrfs_unlock_up_safe(path, 1);
4390 	return 0;
4391 err:
4392 	path->keep_locks = 0;
4393 	return ret;
4394 }
4395 
4396 static noinline int split_item(struct btrfs_trans_handle *trans,
4397 			       struct btrfs_root *root,
4398 			       struct btrfs_path *path,
4399 			       struct btrfs_key *new_key,
4400 			       unsigned long split_offset)
4401 {
4402 	struct extent_buffer *leaf;
4403 	struct btrfs_item *item;
4404 	struct btrfs_item *new_item;
4405 	int slot;
4406 	char *buf;
4407 	u32 nritems;
4408 	u32 item_size;
4409 	u32 orig_offset;
4410 	struct btrfs_disk_key disk_key;
4411 
4412 	leaf = path->nodes[0];
4413 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4414 
4415 	btrfs_set_path_blocking(path);
4416 
4417 	item = btrfs_item_nr(path->slots[0]);
4418 	orig_offset = btrfs_item_offset(leaf, item);
4419 	item_size = btrfs_item_size(leaf, item);
4420 
4421 	buf = kmalloc(item_size, GFP_NOFS);
4422 	if (!buf)
4423 		return -ENOMEM;
4424 
4425 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4426 			    path->slots[0]), item_size);
4427 
4428 	slot = path->slots[0] + 1;
4429 	nritems = btrfs_header_nritems(leaf);
4430 	if (slot != nritems) {
4431 		/* shift the items */
4432 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4433 				btrfs_item_nr_offset(slot),
4434 				(nritems - slot) * sizeof(struct btrfs_item));
4435 	}
4436 
4437 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4438 	btrfs_set_item_key(leaf, &disk_key, slot);
4439 
4440 	new_item = btrfs_item_nr(slot);
4441 
4442 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4443 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4444 
4445 	btrfs_set_item_offset(leaf, item,
4446 			      orig_offset + item_size - split_offset);
4447 	btrfs_set_item_size(leaf, item, split_offset);
4448 
4449 	btrfs_set_header_nritems(leaf, nritems + 1);
4450 
4451 	/* write the data for the start of the original item */
4452 	write_extent_buffer(leaf, buf,
4453 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4454 			    split_offset);
4455 
4456 	/* write the data for the new item */
4457 	write_extent_buffer(leaf, buf + split_offset,
4458 			    btrfs_item_ptr_offset(leaf, slot),
4459 			    item_size - split_offset);
4460 	btrfs_mark_buffer_dirty(leaf);
4461 
4462 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4463 	kfree(buf);
4464 	return 0;
4465 }
4466 
4467 /*
4468  * This function splits a single item into two items,
4469  * giving 'new_key' to the new item and splitting the
4470  * old one at split_offset (from the start of the item).
4471  *
4472  * The path may be released by this operation.  After
4473  * the split, the path is pointing to the old item.  The
4474  * new item is going to be in the same node as the old one.
4475  *
4476  * Note, the item being split must be small enough to live alone in
4477  * a tree block with room for one extra struct btrfs_item
4478  *
4479  * This allows us to split the item in place, keeping a lock on the
4480  * leaf the entire time.
4481  */
4482 int btrfs_split_item(struct btrfs_trans_handle *trans,
4483 		     struct btrfs_root *root,
4484 		     struct btrfs_path *path,
4485 		     struct btrfs_key *new_key,
4486 		     unsigned long split_offset)
4487 {
4488 	int ret;
4489 	ret = setup_leaf_for_split(trans, root, path,
4490 				   sizeof(struct btrfs_item));
4491 	if (ret)
4492 		return ret;
4493 
4494 	ret = split_item(trans, root, path, new_key, split_offset);
4495 	return ret;
4496 }
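
/*
 * Worked example, illustrative only: an item with data at offset 100
 * and size 60, split at split_offset 24, becomes two items sharing the
 * same 60 bytes of data area.  The original item keeps the first 24
 * bytes of data (now at offset 136, size 24) and the new item, carrying
 * 'new_key', gets the remaining 36 bytes (offset 100, size 36).
 */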
4497 
4498 /*
4499  * This function duplicates an item, giving 'new_key' to the new item.
4500  * It guarantees both items live in the same tree leaf and the new item
4501  * is contiguous with the original item.
4502  *
4503  * This allows us to split a file extent in place, keeping a lock on the
4504  * leaf the entire time.
4505  */
4506 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4507 			 struct btrfs_root *root,
4508 			 struct btrfs_path *path,
4509 			 struct btrfs_key *new_key)
4510 {
4511 	struct extent_buffer *leaf;
4512 	int ret;
4513 	u32 item_size;
4514 
4515 	leaf = path->nodes[0];
4516 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4517 	ret = setup_leaf_for_split(trans, root, path,
4518 				   item_size + sizeof(struct btrfs_item));
4519 	if (ret)
4520 		return ret;
4521 
4522 	path->slots[0]++;
4523 	setup_items_for_insert(root, path, new_key, &item_size,
4524 			       item_size, item_size +
4525 			       sizeof(struct btrfs_item), 1);
4526 	leaf = path->nodes[0];
4527 	memcpy_extent_buffer(leaf,
4528 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4529 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4530 			     item_size);
4531 	return 0;
4532 }
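
/*
 * Illustrative sketch, not part of the original source: duplicating an
 * extent item so the copy can be trimmed independently.  'key' and
 * 'split_offset' are hypothetical caller-side values:
 *
 *	struct btrfs_key new_key = key;
 *	new_key.offset = split_offset;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *	if (ret)
 *		return ret;	(-EAGAIN if the leaf changed under us)
 */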
4533 
4534 /*
4535  * make the item pointed to by the path smaller.  new_size indicates
4536  * how small to make it, and from_end tells us if we just chop bytes
4537  * off the end of the item or if we shift the item to chop bytes off
4538  * the front.
4539  */
4540 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4541 			 u32 new_size, int from_end)
4542 {
4543 	int slot;
4544 	struct extent_buffer *leaf;
4545 	struct btrfs_item *item;
4546 	u32 nritems;
4547 	unsigned int data_end;
4548 	unsigned int old_data_start;
4549 	unsigned int old_size;
4550 	unsigned int size_diff;
4551 	int i;
4552 	struct btrfs_map_token token;
4553 
4554 	btrfs_init_map_token(&token);
4555 
4556 	leaf = path->nodes[0];
4557 	slot = path->slots[0];
4558 
4559 	old_size = btrfs_item_size_nr(leaf, slot);
4560 	if (old_size == new_size)
4561 		return;
4562 
4563 	nritems = btrfs_header_nritems(leaf);
4564 	data_end = leaf_data_end(root, leaf);
4565 
4566 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4567 
4568 	size_diff = old_size - new_size;
4569 
4570 	BUG_ON(slot < 0);
4571 	BUG_ON(slot >= nritems);
4572 
4573 	/*
4574 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4575 	 */
4576 	/* first correct the data pointers */
4577 	for (i = slot; i < nritems; i++) {
4578 		u32 ioff;
4579 		item = btrfs_item_nr(i);
4580 
4581 		ioff = btrfs_token_item_offset(leaf, item, &token);
4582 		btrfs_set_token_item_offset(leaf, item,
4583 					    ioff + size_diff, &token);
4584 	}
4585 
4586 	/* shift the data */
4587 	if (from_end) {
4588 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4589 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4590 			      data_end, old_data_start + new_size - data_end);
4591 	} else {
4592 		struct btrfs_disk_key disk_key;
4593 		u64 offset;
4594 
4595 		btrfs_item_key(leaf, &disk_key, slot);
4596 
4597 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4598 			unsigned long ptr;
4599 			struct btrfs_file_extent_item *fi;
4600 
4601 			fi = btrfs_item_ptr(leaf, slot,
4602 					    struct btrfs_file_extent_item);
4603 			fi = (struct btrfs_file_extent_item *)(
4604 			     (unsigned long)fi - size_diff);
4605 
4606 			if (btrfs_file_extent_type(leaf, fi) ==
4607 			    BTRFS_FILE_EXTENT_INLINE) {
4608 				ptr = btrfs_item_ptr_offset(leaf, slot);
4609 				memmove_extent_buffer(leaf, ptr,
4610 				      (unsigned long)fi,
4611 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4612 			}
4613 		}
4614 
4615 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4616 			      data_end + size_diff, btrfs_leaf_data(leaf) +
4617 			      data_end, old_data_start - data_end);
4618 
4619 		offset = btrfs_disk_key_offset(&disk_key);
4620 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4621 		btrfs_set_item_key(leaf, &disk_key, slot);
4622 		if (slot == 0)
4623 			fixup_low_keys(root->fs_info, path, &disk_key, 1);
4624 	}
4625 
4626 	item = btrfs_item_nr(slot);
4627 	btrfs_set_item_size(leaf, item, new_size);
4628 	btrfs_mark_buffer_dirty(leaf);
4629 
4630 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4631 		btrfs_print_leaf(root, leaf);
4632 		BUG();
4633 	}
4634 }
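
/*
 * Worked example, illustrative only: an item with data at offset 200
 * and size 50 truncated to new_size 30.  With from_end == 1 the last 20
 * bytes are dropped and the remaining data shifts to offset 220; with
 * from_end == 0 the first 20 bytes are dropped instead and, because the
 * data start moved, the key's offset is bumped by size_diff (20).
 */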
4635 
4636 /*
4637  * make the item pointed to by the path bigger, data_size is the added size.
4638  */
4639 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4640 		       u32 data_size)
4641 {
4642 	int slot;
4643 	struct extent_buffer *leaf;
4644 	struct btrfs_item *item;
4645 	u32 nritems;
4646 	unsigned int data_end;
4647 	unsigned int old_data;
4648 	unsigned int old_size;
4649 	int i;
4650 	struct btrfs_map_token token;
4651 
4652 	btrfs_init_map_token(&token);
4653 
4654 	leaf = path->nodes[0];
4655 
4656 	nritems = btrfs_header_nritems(leaf);
4657 	data_end = leaf_data_end(root, leaf);
4658 
4659 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
4660 		btrfs_print_leaf(root, leaf);
4661 		BUG();
4662 	}
4663 	slot = path->slots[0];
4664 	old_data = btrfs_item_end_nr(leaf, slot);
4665 
4666 	BUG_ON(slot < 0);
4667 	if (slot >= nritems) {
4668 		btrfs_print_leaf(root, leaf);
4669 		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4670 		       slot, nritems);
4671 		BUG_ON(1);
4672 	}
4673 
4674 	/*
4675 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4676 	 */
4677 	/* first correct the data pointers */
4678 	for (i = slot; i < nritems; i++) {
4679 		u32 ioff;
4680 		item = btrfs_item_nr(i);
4681 
4682 		ioff = btrfs_token_item_offset(leaf, item, &token);
4683 		btrfs_set_token_item_offset(leaf, item,
4684 					    ioff - data_size, &token);
4685 	}
4686 
4687 	/* shift the data */
4688 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4689 		      data_end - data_size, btrfs_leaf_data(leaf) +
4690 		      data_end, old_data - data_end);
4691 
4692 	data_end = old_data;
4693 	old_size = btrfs_item_size_nr(leaf, slot);
4694 	item = btrfs_item_nr(slot);
4695 	btrfs_set_item_size(leaf, item, old_size + data_size);
4696 	btrfs_mark_buffer_dirty(leaf);
4697 
4698 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4699 		btrfs_print_leaf(root, leaf);
4700 		BUG();
4701 	}
4702 }
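
/*
 * Worked example, illustrative only: extending an item of size 30 at
 * offset 220 by data_size 16 shifts everything at or below the item's
 * data end down by 16 bytes, leaving the item at offset 204 with size
 * 46; the 16 fresh bytes land at the end of the item, uninitialized,
 * for the caller to fill in.
 */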
4703 
4704 /*
4705  * this is a helper for btrfs_insert_empty_items, the main goal here is
4706  * to save stack depth by doing the bulk of the work in a function
4707  * that doesn't call btrfs_search_slot
4708  */
4709 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4710 			    struct btrfs_key *cpu_key, u32 *data_size,
4711 			    u32 total_data, u32 total_size, int nr)
4712 {
4713 	struct btrfs_item *item;
4714 	int i;
4715 	u32 nritems;
4716 	unsigned int data_end;
4717 	struct btrfs_disk_key disk_key;
4718 	struct extent_buffer *leaf;
4719 	int slot;
4720 	struct btrfs_map_token token;
4721 
4722 	if (path->slots[0] == 0) {
4723 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4724 		fixup_low_keys(root->fs_info, path, &disk_key, 1);
4725 	}
4726 	btrfs_unlock_up_safe(path, 1);
4727 
4728 	btrfs_init_map_token(&token);
4729 
4730 	leaf = path->nodes[0];
4731 	slot = path->slots[0];
4732 
4733 	nritems = btrfs_header_nritems(leaf);
4734 	data_end = leaf_data_end(root, leaf);
4735 
4736 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
4737 		btrfs_print_leaf(root, leaf);
4738 		btrfs_crit(root->fs_info,
4739 			   "not enough freespace need %u have %d",
4740 			   total_size, btrfs_leaf_free_space(root, leaf));
4741 		BUG();
4742 	}
4743 
4744 	if (slot != nritems) {
4745 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4746 
4747 		if (old_data < data_end) {
4748 			btrfs_print_leaf(root, leaf);
4749 			btrfs_crit(root->fs_info,
4750 				   "slot %d old_data %d data_end %d",
4751 				   slot, old_data, data_end);
4752 			BUG_ON(1);
4753 		}
4754 		/*
4755 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4756 		 */
4757 		/* first correct the data pointers */
4758 		for (i = slot; i < nritems; i++) {
4759 			u32 ioff;
4760 
4761 			item = btrfs_item_nr(i);
4762 			ioff = btrfs_token_item_offset(leaf, item, &token);
4763 			btrfs_set_token_item_offset(leaf, item,
4764 						    ioff - total_data, &token);
4765 		}
4766 		/* shift the items */
4767 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4768 			      btrfs_item_nr_offset(slot),
4769 			      (nritems - slot) * sizeof(struct btrfs_item));
4770 
4771 		/* shift the data */
4772 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4773 			      data_end - total_data, btrfs_leaf_data(leaf) +
4774 			      data_end, old_data - data_end);
4775 		data_end = old_data;
4776 	}
4777 
4778 	/* setup the item for the new data */
4779 	for (i = 0; i < nr; i++) {
4780 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4781 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4782 		item = btrfs_item_nr(slot + i);
4783 		btrfs_set_token_item_offset(leaf, item,
4784 					    data_end - data_size[i], &token);
4785 		data_end -= data_size[i];
4786 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4787 	}
4788 
4789 	btrfs_set_header_nritems(leaf, nritems + nr);
4790 	btrfs_mark_buffer_dirty(leaf);
4791 
4792 	if (btrfs_leaf_free_space(root, leaf) < 0) {
4793 		btrfs_print_leaf(root, leaf);
4794 		BUG();
4795 	}
4796 }
4797 
4798 /*
4799  * Given a key and some data, insert items into the tree.
4800  * This does all the path init required, making room in the tree if needed.
4801  */
4802 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4803 			    struct btrfs_root *root,
4804 			    struct btrfs_path *path,
4805 			    struct btrfs_key *cpu_key, u32 *data_size,
4806 			    int nr)
4807 {
4808 	int ret = 0;
4809 	int slot;
4810 	int i;
4811 	u32 total_size = 0;
4812 	u32 total_data = 0;
4813 
4814 	for (i = 0; i < nr; i++)
4815 		total_data += data_size[i];
4816 
4817 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4818 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4819 	if (ret == 0)
4820 		return -EEXIST;
4821 	if (ret < 0)
4822 		return ret;
4823 
4824 	slot = path->slots[0];
4825 	BUG_ON(slot < 0);
4826 
4827 	setup_items_for_insert(root, path, cpu_key, data_size,
4828 			       total_data, total_size, nr);
4829 	return 0;
4830 }
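
/*
 * Illustrative sketch, not part of the original source: reserving room
 * for two consecutive items with a single tree search.  'keys' and the
 * sizes are hypothetical caller-side values:
 *
 *	u32 sizes[2] = { first_size, second_size };
 *	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
 *	if (ret)
 *		return ret;	(-EEXIST if keys[0] is already present)
 *	path->slots[0] now points at the first of the two new items and
 *	the caller fills them in via btrfs_item_ptr() before releasing.
 */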
4831 
4832 /*
4833  * Given a key and some data, insert an item into the tree.
4834  * This does all the path init required, making room in the tree if needed.
4835  */
4836 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4837 		      *root, struct btrfs_key *cpu_key, void *data, u32
4838 		      data_size)
4839 {
4840 	int ret = 0;
4841 	struct btrfs_path *path;
4842 	struct extent_buffer *leaf;
4843 	unsigned long ptr;
4844 
4845 	path = btrfs_alloc_path();
4846 	if (!path)
4847 		return -ENOMEM;
4848 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4849 	if (!ret) {
4850 		leaf = path->nodes[0];
4851 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4852 		write_extent_buffer(leaf, data, ptr, data_size);
4853 		btrfs_mark_buffer_dirty(leaf);
4854 	}
4855 	btrfs_free_path(path);
4856 	return ret;
4857 }
4858 
4859 /*
4860  * delete the pointer from a given node.
4861  *
4862  * the tree should have been previously balanced so the deletion does not
4863  * empty a node.
4864  */
4865 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4866 		    int level, int slot)
4867 {
4868 	struct extent_buffer *parent = path->nodes[level];
4869 	u32 nritems;
4870 	int ret;
4871 
4872 	nritems = btrfs_header_nritems(parent);
4873 	if (slot != nritems - 1) {
4874 		if (level)
4875 			tree_mod_log_eb_move(root->fs_info, parent, slot,
4876 					     slot + 1, nritems - slot - 1);
4877 		memmove_extent_buffer(parent,
4878 			      btrfs_node_key_ptr_offset(slot),
4879 			      btrfs_node_key_ptr_offset(slot + 1),
4880 			      sizeof(struct btrfs_key_ptr) *
4881 			      (nritems - slot - 1));
4882 	} else if (level) {
4883 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4884 					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
4885 		BUG_ON(ret < 0);
4886 	}
4887 
4888 	nritems--;
4889 	btrfs_set_header_nritems(parent, nritems);
4890 	if (nritems == 0 && parent == root->node) {
4891 		BUG_ON(btrfs_header_level(root->node) != 1);
4892 		/* just turn the root into a leaf and break */
4893 		btrfs_set_header_level(root->node, 0);
4894 	} else if (slot == 0) {
4895 		struct btrfs_disk_key disk_key;
4896 
4897 		btrfs_node_key(parent, &disk_key, 0);
4898 		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
4899 	}
4900 	btrfs_mark_buffer_dirty(parent);
4901 }
4902 
4903 /*
4904  * a helper function to delete the leaf pointed to by path->slots[1] and
4905  * path->nodes[1].
4906  *
4907  * This deletes the pointer in path->nodes[1] and frees the leaf
4908  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4909  *
4910  * The path must have already been setup for deleting the leaf, including
4911  * all the proper balancing.  path->nodes[1] must be locked.
4912  */
4913 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4914 				    struct btrfs_root *root,
4915 				    struct btrfs_path *path,
4916 				    struct extent_buffer *leaf)
4917 {
4918 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4919 	del_ptr(root, path, 1, path->slots[1]);
4920 
4921 	/*
4922 	 * btrfs_free_tree_block() is expensive, so we want to make sure
4923 	 * we aren't holding any locks when we call it
4924 	 */
4925 	btrfs_unlock_up_safe(path, 0);
4926 
4927 	root_sub_used(root, leaf->len);
4928 
4929 	extent_buffer_get(leaf);
4930 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4931 	free_extent_buffer_stale(leaf);
4932 }

4933 /*
4934  * delete the item at the leaf level in path.  If that empties
4935  * the leaf, remove it from the tree
4936  */
4937 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4938 		    struct btrfs_path *path, int slot, int nr)
4939 {
4940 	struct extent_buffer *leaf;
4941 	struct btrfs_item *item;
4942 	u32 last_off;
4943 	u32 dsize = 0;
4944 	int ret = 0;
4945 	int wret;
4946 	int i;
4947 	u32 nritems;
4948 	struct btrfs_map_token token;
4949 
4950 	btrfs_init_map_token(&token);
4951 
4952 	leaf = path->nodes[0];
4953 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4954 
4955 	for (i = 0; i < nr; i++)
4956 		dsize += btrfs_item_size_nr(leaf, slot + i);
4957 
4958 	nritems = btrfs_header_nritems(leaf);
4959 
4960 	if (slot + nr != nritems) {
4961 		int data_end = leaf_data_end(root, leaf);
4962 
4963 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4964 			      data_end + dsize,
4965 			      btrfs_leaf_data(leaf) + data_end,
4966 			      last_off - data_end);
4967 
4968 		for (i = slot + nr; i < nritems; i++) {
4969 			u32 ioff;
4970 
4971 			item = btrfs_item_nr(i);
4972 			ioff = btrfs_token_item_offset(leaf, item, &token);
4973 			btrfs_set_token_item_offset(leaf, item,
4974 						    ioff + dsize, &token);
4975 		}
4976 
4977 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4978 			      btrfs_item_nr_offset(slot + nr),
4979 			      sizeof(struct btrfs_item) *
4980 			      (nritems - slot - nr));
4981 	}
4982 	btrfs_set_header_nritems(leaf, nritems - nr);
4983 	nritems -= nr;
4984 
4985 	/* delete the leaf if we've emptied it */
4986 	if (nritems == 0) {
4987 		if (leaf == root->node) {
4988 			btrfs_set_header_level(leaf, 0);
4989 		} else {
4990 			btrfs_set_path_blocking(path);
4991 			clean_tree_block(trans, root->fs_info, leaf);
4992 			btrfs_del_leaf(trans, root, path, leaf);
4993 		}
4994 	} else {
4995 		int used = leaf_space_used(leaf, 0, nritems);
4996 		if (slot == 0) {
4997 			struct btrfs_disk_key disk_key;
4998 
4999 			btrfs_item_key(leaf, &disk_key, 0);
5000 			fixup_low_keys(root->fs_info, path, &disk_key, 1);
5001 		}
5002 
5003 		/* delete the leaf if it is mostly empty */
5004 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5005 			/* push_leaf_left fixes the path.
5006 			 * make sure the path still points to our leaf
5007 			 * for possible call to del_ptr below
5008 			 */
5009 			slot = path->slots[1];
5010 			extent_buffer_get(leaf);
5011 
5012 			btrfs_set_path_blocking(path);
5013 			wret = push_leaf_left(trans, root, path, 1, 1,
5014 					      1, (u32)-1);
5015 			if (wret < 0 && wret != -ENOSPC)
5016 				ret = wret;
5017 
5018 			if (path->nodes[0] == leaf &&
5019 			    btrfs_header_nritems(leaf)) {
5020 				wret = push_leaf_right(trans, root, path, 1,
5021 						       1, 1, 0);
5022 				if (wret < 0 && wret != -ENOSPC)
5023 					ret = wret;
5024 			}
5025 
5026 			if (btrfs_header_nritems(leaf) == 0) {
5027 				path->slots[1] = slot;
5028 				btrfs_del_leaf(trans, root, path, leaf);
5029 				free_extent_buffer(leaf);
5030 				ret = 0;
5031 			} else {
5032 				/* if we're still in the path, make sure
5033 				 * we're dirty.  Otherwise, one of the
5034 				 * push_leaf functions must have already
5035 				 * dirtied this buffer
5036 				 */
5037 				if (path->nodes[0] == leaf)
5038 					btrfs_mark_buffer_dirty(leaf);
5039 				free_extent_buffer(leaf);
5040 			}
5041 		} else {
5042 			btrfs_mark_buffer_dirty(leaf);
5043 		}
5044 	}
5045 	return ret;
5046 }
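
/*
 * Illustrative sketch, not part of the original source: the common
 * single item case, which btrfs_del_item() in ctree.h wraps:
 *
 *	ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 *	if (ret)
 *		goto out;
 *	btrfs_release_path(path);	(the leaf may have been rebalanced)
 */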
5047 
5048 /*
5049  * search the tree again to find a leaf with lesser keys
5050  * returns 0 if it found something or 1 if there are no lesser leaves.
5051  * returns < 0 on io errors.
5052  *
5053  * This may release the path, and so you may lose any locks held at the
5054  * time you call it.
5055  */
5056 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5057 {
5058 	struct btrfs_key key;
5059 	struct btrfs_disk_key found_key;
5060 	int ret;
5061 
5062 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5063 
5064 	if (key.offset > 0) {
5065 		key.offset--;
5066 	} else if (key.type > 0) {
5067 		key.type--;
5068 		key.offset = (u64)-1;
5069 	} else if (key.objectid > 0) {
5070 		key.objectid--;
5071 		key.type = (u8)-1;
5072 		key.offset = (u64)-1;
5073 	} else {
5074 		return 1;
5075 	}
5076 
5077 	btrfs_release_path(path);
5078 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5079 	if (ret < 0)
5080 		return ret;
5081 	btrfs_item_key(path->nodes[0], &found_key, 0);
5082 	ret = comp_keys(&found_key, &key);
5083 	/*
5084 	 * We might have had an item with the previous key in the tree right
5085 	 * before we released our path. And after we released our path, that
5086 	 * item might have been pushed to the first slot (0) of the leaf we
5087 	 * were holding due to a tree balance. Alternatively, an item with the
5088 	 * previous key can exist as the only element of a leaf (big fat item).
5089 	 * Therefore account for these 2 cases, so that our callers (like
5090 	 * btrfs_previous_item) don't miss an existing item with a key matching
5091 	 * the previous key we computed above.
5092 	 */
5093 	if (ret <= 0)
5094 		return 0;
5095 	return 1;
5096 }
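
/*
 * Worked example, illustrative only: if the first key in the current
 * leaf is (256, BTRFS_EXTENT_DATA_KEY, 0), the offset is already zero,
 * so the search key becomes (256, BTRFS_EXTENT_DATA_KEY - 1, (u64)-1),
 * the largest key strictly smaller than the one we started from.
 */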
5097 
5098 /*
5099  * A helper function to walk down the tree starting at min_key, and looking
5100  * for nodes or leaves that have a minimum transaction id.
5101  * This is used by the btree defrag code and by tree logging.
5102  *
5103  * This does not cow, but it does stuff the starting key it finds back
5104  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5105  * key and get a writable path.
5106  *
5107  * This does lock as it descends, and path->keep_locks should be set
5108  * to 1 by the caller.
5109  *
5110  * This honors path->lowest_level to prevent descent past a given level
5111  * of the tree.
5112  *
5113  * min_trans indicates the oldest transaction that you are interested
5114  * in walking through.  Any nodes or leaves older than min_trans are
5115  * skipped over (without reading them).
5116  *
5117  * returns zero if something useful was found, < 0 on error and 1 if there
5118  * was nothing in the tree that matched the search criteria.
5119  */
5120 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5121 			 struct btrfs_path *path,
5122 			 u64 min_trans)
5123 {
5124 	struct extent_buffer *cur;
5125 	struct btrfs_key found_key;
5126 	int slot;
5127 	int sret;
5128 	u32 nritems;
5129 	int level;
5130 	int ret = 1;
5131 	int keep_locks = path->keep_locks;
5132 
5133 	path->keep_locks = 1;
5134 again:
5135 	cur = btrfs_read_lock_root_node(root);
5136 	level = btrfs_header_level(cur);
5137 	WARN_ON(path->nodes[level]);
5138 	path->nodes[level] = cur;
5139 	path->locks[level] = BTRFS_READ_LOCK;
5140 
5141 	if (btrfs_header_generation(cur) < min_trans) {
5142 		ret = 1;
5143 		goto out;
5144 	}
5145 	while (1) {
5146 		nritems = btrfs_header_nritems(cur);
5147 		level = btrfs_header_level(cur);
5148 		sret = bin_search(cur, min_key, level, &slot);
5149 
5150 		/* at the lowest level, we're done, setup the path and exit */
5151 		if (level == path->lowest_level) {
5152 			if (slot >= nritems)
5153 				goto find_next_key;
5154 			ret = 0;
5155 			path->slots[level] = slot;
5156 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5157 			goto out;
5158 		}
5159 		if (sret && slot > 0)
5160 			slot--;
5161 		/*
5162 		 * check this node pointer against the min_trans parameter.
5163 		 * If it is too old, skip to the next one.
5164 		 */
5165 		while (slot < nritems) {
5166 			u64 gen;
5167 
5168 			gen = btrfs_node_ptr_generation(cur, slot);
5169 			if (gen < min_trans) {
5170 				slot++;
5171 				continue;
5172 			}
5173 			break;
5174 		}
5175 find_next_key:
5176 		/*
5177 		 * we didn't find a candidate key in this node, walk forward
5178 		 * and find another one
5179 		 */
5180 		if (slot >= nritems) {
5181 			path->slots[level] = slot;
5182 			btrfs_set_path_blocking(path);
5183 			sret = btrfs_find_next_key(root, path, min_key, level,
5184 						  min_trans);
5185 			if (sret == 0) {
5186 				btrfs_release_path(path);
5187 				goto again;
5188 			} else {
5189 				goto out;
5190 			}
5191 		}
5192 		/* save our key for returning back */
5193 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5194 		path->slots[level] = slot;
5195 		if (level == path->lowest_level) {
5196 			ret = 0;
5197 			goto out;
5198 		}
5199 		btrfs_set_path_blocking(path);
5200 		cur = read_node_slot(root, cur, slot);
5201 		if (IS_ERR(cur)) {
5202 			ret = PTR_ERR(cur);
5203 			goto out;
5204 		}
5205 
5206 		btrfs_tree_read_lock(cur);
5207 
5208 		path->locks[level - 1] = BTRFS_READ_LOCK;
5209 		path->nodes[level - 1] = cur;
5210 		unlock_up(path, level, 1, 0, NULL);
5211 		btrfs_clear_path_blocking(path, NULL, 0);
5212 	}
5213 out:
5214 	path->keep_locks = keep_locks;
5215 	if (ret == 0) {
5216 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5217 		btrfs_set_path_blocking(path);
5218 		memcpy(min_key, &found_key, sizeof(found_key));
5219 	}
5220 	return ret;
5221 }
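
/*
 * Illustrative sketch, not part of the original source: the typical
 * caller loop, in the style of the defrag and tree-log code.
 * process_item() is a hypothetical helper:
 *
 *	while (1) {
 *		ret = btrfs_search_forward(root, &min_key, path, min_trans);
 *		if (ret)
 *			break;	(< 0 on error, 1 when the walk is done)
 *		process_item(path, &min_key);
 *		min_key.offset++;	(step past what we just found)
 *		btrfs_release_path(path);
 *	}
 */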
5222 
5223 static int tree_move_down(struct btrfs_root *root,
5224 			   struct btrfs_path *path,
5225 			   int *level, int root_level)
5226 {
5227 	struct extent_buffer *eb;
5228 
5229 	BUG_ON(*level == 0);
5230 	eb = read_node_slot(root, path->nodes[*level], path->slots[*level]);
5231 	if (IS_ERR(eb))
5232 		return PTR_ERR(eb);
5233 
5234 	path->nodes[*level - 1] = eb;
5235 	path->slots[*level - 1] = 0;
5236 	(*level)--;
5237 	return 0;
5238 }
5239 
5240 static int tree_move_next_or_upnext(struct btrfs_root *root,
5241 				    struct btrfs_path *path,
5242 				    int *level, int root_level)
5243 {
5244 	int ret = 0;
5245 	int nritems;
5246 	nritems = btrfs_header_nritems(path->nodes[*level]);
5247 
5248 	path->slots[*level]++;
5249 
5250 	while (path->slots[*level] >= nritems) {
5251 		if (*level == root_level)
5252 			return -1;
5253 
5254 		/* move upnext */
5255 		path->slots[*level] = 0;
5256 		free_extent_buffer(path->nodes[*level]);
5257 		path->nodes[*level] = NULL;
5258 		(*level)++;
5259 		path->slots[*level]++;
5260 
5261 		nritems = btrfs_header_nritems(path->nodes[*level]);
5262 		ret = 1;
5263 	}
5264 	return ret;
5265 }
5266 
5267 /*
5268  * Returns 1 if it had to move up and next, 0 if it moved only next or down;
5269  * -1 means the end of the tree was reached, any other negative is an error.
5270  */
5271 static int tree_advance(struct btrfs_root *root,
5272 			struct btrfs_path *path,
5273 			int *level, int root_level,
5274 			int allow_down,
5275 			struct btrfs_key *key)
5276 {
5277 	int ret;
5278 
5279 	if (*level == 0 || !allow_down) {
5280 		ret = tree_move_next_or_upnext(root, path, level, root_level);
5281 	} else {
5282 		ret = tree_move_down(root, path, level, root_level);
5283 	}
5284 	if (ret >= 0) {
5285 		if (*level == 0)
5286 			btrfs_item_key_to_cpu(path->nodes[*level], key,
5287 					path->slots[*level]);
5288 		else
5289 			btrfs_node_key_to_cpu(path->nodes[*level], key,
5290 					path->slots[*level]);
5291 	}
5292 	return ret;
5293 }
5294 
5295 static int tree_compare_item(struct btrfs_root *left_root,
5296 			     struct btrfs_path *left_path,
5297 			     struct btrfs_path *right_path,
5298 			     char *tmp_buf)
5299 {
5300 	int cmp;
5301 	int len1, len2;
5302 	unsigned long off1, off2;
5303 
5304 	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5305 	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5306 	if (len1 != len2)
5307 		return 1;
5308 
5309 	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5310 	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5311 				right_path->slots[0]);
5312 
5313 	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5314 
5315 	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5316 	if (cmp)
5317 		return 1;
5318 	return 0;
5319 }
5320 
5321 #define ADVANCE 1
5322 #define ADVANCE_ONLY_NEXT -1
5323 
5324 /*
5325  * This function compares two trees and calls the provided callback for
5326  * every changed/new/deleted item it finds.
5327  * If shared tree blocks are encountered, whole subtrees are skipped, making
5328  * the compare pretty fast on snapshotted subvolumes.
5329  *
5330  * This currently works on commit roots only. As commit roots are read only,
5331  * we don't do any locking. The commit roots are protected with transactions.
5332  * Transactions are ended and rejoined when a commit is tried in between.
5333  *
5334  * This function checks for modifications done to the trees while comparing.
5335  * If it detects a change, it aborts immediately.
5336  */
5337 int btrfs_compare_trees(struct btrfs_root *left_root,
5338 			struct btrfs_root *right_root,
5339 			btrfs_changed_cb_t changed_cb, void *ctx)
5340 {
5341 	int ret;
5342 	int cmp;
5343 	struct btrfs_path *left_path = NULL;
5344 	struct btrfs_path *right_path = NULL;
5345 	struct btrfs_key left_key;
5346 	struct btrfs_key right_key;
5347 	char *tmp_buf = NULL;
5348 	int left_root_level;
5349 	int right_root_level;
5350 	int left_level;
5351 	int right_level;
5352 	int left_end_reached;
5353 	int right_end_reached;
5354 	int advance_left;
5355 	int advance_right;
5356 	u64 left_blockptr;
5357 	u64 right_blockptr;
5358 	u64 left_gen;
5359 	u64 right_gen;
5360 
5361 	left_path = btrfs_alloc_path();
5362 	if (!left_path) {
5363 		ret = -ENOMEM;
5364 		goto out;
5365 	}
5366 	right_path = btrfs_alloc_path();
5367 	if (!right_path) {
5368 		ret = -ENOMEM;
5369 		goto out;
5370 	}
5371 
5372 	tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
5373 	if (!tmp_buf) {
5374 		tmp_buf = vmalloc(left_root->nodesize);
5375 		if (!tmp_buf) {
5376 			ret = -ENOMEM;
5377 			goto out;
5378 		}
5379 	}
5380 
5381 	left_path->search_commit_root = 1;
5382 	left_path->skip_locking = 1;
5383 	right_path->search_commit_root = 1;
5384 	right_path->skip_locking = 1;
5385 
5386 	/*
5387 	 * Strategy: Go to the first items of both trees. Then do
5388 	 *
5389 	 * If both trees are at level 0
5390 	 *   Compare keys of current items
5391 	 *     If left < right treat left item as new, advance left tree
5392 	 *       and repeat
5393 	 *     If left > right treat right item as deleted, advance right tree
5394 	 *       and repeat
5395 	 *     If left == right do deep compare of items, treat as changed if
5396 	 *       needed, advance both trees and repeat
5397 	 * If both trees are at the same level but not at level 0
5398 	 *   Compare keys of current nodes/leafs
5399 	 *   Compare keys of current nodes/leaves
5400 	 *     If left > right advance right tree and repeat
5401 	 *     If left == right compare blockptrs of the next nodes/leaves
5402 	 *       If they match advance both trees but stay at the same level
5403 	 *         and repeat
5404 	 *       If they don't match advance both trees while allowing to go
5405 	 *         deeper and repeat
5406 	 * If tree levels are different
5407 	 *   Advance the tree that needs it and repeat
5408 	 *
5409 	 * Advancing a tree means:
5410 	 *   If we are at level 0, try to go to the next slot. If that's not
5411 	 *   possible, go one level up and repeat. Stop when we found a level
5412 	 *   where we could go to the next slot. We may at this point be on a
5413 	 *   node or a leaf.
5414 	 *
5415 	 *   If we are not at level 0 and not on shared tree blocks, go one
5416 	 *   level deeper.
5417 	 *
5418 	 *   If we are not at level 0 and on shared tree blocks, go one slot to
5419 	 *   the right if possible or go up and right.
5420 	 */
5421 
5422 	down_read(&left_root->fs_info->commit_root_sem);
5423 	left_level = btrfs_header_level(left_root->commit_root);
5424 	left_root_level = left_level;
5425 	left_path->nodes[left_level] = left_root->commit_root;
5426 	extent_buffer_get(left_path->nodes[left_level]);
5427 
5428 	right_level = btrfs_header_level(right_root->commit_root);
5429 	right_root_level = right_level;
5430 	right_path->nodes[right_level] = right_root->commit_root;
5431 	extent_buffer_get(right_path->nodes[right_level]);
5432 	up_read(&left_root->fs_info->commit_root_sem);
5433 
5434 	if (left_level == 0)
5435 		btrfs_item_key_to_cpu(left_path->nodes[left_level],
5436 				&left_key, left_path->slots[left_level]);
5437 	else
5438 		btrfs_node_key_to_cpu(left_path->nodes[left_level],
5439 				&left_key, left_path->slots[left_level]);
5440 	if (right_level == 0)
5441 		btrfs_item_key_to_cpu(right_path->nodes[right_level],
5442 				&right_key, right_path->slots[right_level]);
5443 	else
5444 		btrfs_node_key_to_cpu(right_path->nodes[right_level],
5445 				&right_key, right_path->slots[right_level]);
5446 
5447 	left_end_reached = right_end_reached = 0;
5448 	advance_left = advance_right = 0;
5449 
5450 	while (1) {
5451 		if (advance_left && !left_end_reached) {
5452 			ret = tree_advance(left_root, left_path, &left_level,
5453 					left_root_level,
5454 					advance_left != ADVANCE_ONLY_NEXT,
5455 					&left_key);
5456 			if (ret == -1)
5457 				left_end_reached = ADVANCE;
5458 			else if (ret < 0)
5459 				goto out;
5460 			advance_left = 0;
5461 		}
5462 		if (advance_right && !right_end_reached) {
5463 			ret = tree_advance(right_root, right_path, &right_level,
5464 					right_root_level,
5465 					advance_right != ADVANCE_ONLY_NEXT,
5466 					&right_key);
5467 			if (ret == -1)
5468 				right_end_reached = ADVANCE;
5469 			else if (ret < 0)
5470 				goto out;
5471 			advance_right = 0;
5472 		}
5473 
5474 		if (left_end_reached && right_end_reached) {
5475 			ret = 0;
5476 			goto out;
5477 		} else if (left_end_reached) {
5478 			if (right_level == 0) {
5479 				ret = changed_cb(left_root, right_root,
5480 						left_path, right_path,
5481 						&right_key,
5482 						BTRFS_COMPARE_TREE_DELETED,
5483 						ctx);
5484 				if (ret < 0)
5485 					goto out;
5486 			}
5487 			advance_right = ADVANCE;
5488 			continue;
5489 		} else if (right_end_reached) {
5490 			if (left_level == 0) {
5491 				ret = changed_cb(left_root, right_root,
5492 						left_path, right_path,
5493 						&left_key,
5494 						BTRFS_COMPARE_TREE_NEW,
5495 						ctx);
5496 				if (ret < 0)
5497 					goto out;
5498 			}
5499 			advance_left = ADVANCE;
5500 			continue;
5501 		}
5502 
5503 		if (left_level == 0 && right_level == 0) {
5504 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5505 			if (cmp < 0) {
5506 				ret = changed_cb(left_root, right_root,
5507 						left_path, right_path,
5508 						&left_key,
5509 						BTRFS_COMPARE_TREE_NEW,
5510 						ctx);
5511 				if (ret < 0)
5512 					goto out;
5513 				advance_left = ADVANCE;
5514 			} else if (cmp > 0) {
5515 				ret = changed_cb(left_root, right_root,
5516 						left_path, right_path,
5517 						&right_key,
5518 						BTRFS_COMPARE_TREE_DELETED,
5519 						ctx);
5520 				if (ret < 0)
5521 					goto out;
5522 				advance_right = ADVANCE;
5523 			} else {
5524 				enum btrfs_compare_tree_result result;
5525 
5526 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5527 				ret = tree_compare_item(left_root, left_path,
5528 						right_path, tmp_buf);
5529 				if (ret)
5530 					result = BTRFS_COMPARE_TREE_CHANGED;
5531 				else
5532 					result = BTRFS_COMPARE_TREE_SAME;
5533 				ret = changed_cb(left_root, right_root,
5534 						 left_path, right_path,
5535 						 &left_key, result, ctx);
5536 				if (ret < 0)
5537 					goto out;
5538 				advance_left = ADVANCE;
5539 				advance_right = ADVANCE;
5540 			}
5541 		} else if (left_level == right_level) {
5542 			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5543 			if (cmp < 0) {
5544 				advance_left = ADVANCE;
5545 			} else if (cmp > 0) {
5546 				advance_right = ADVANCE;
5547 			} else {
5548 				left_blockptr = btrfs_node_blockptr(
5549 						left_path->nodes[left_level],
5550 						left_path->slots[left_level]);
5551 				right_blockptr = btrfs_node_blockptr(
5552 						right_path->nodes[right_level],
5553 						right_path->slots[right_level]);
5554 				left_gen = btrfs_node_ptr_generation(
5555 						left_path->nodes[left_level],
5556 						left_path->slots[left_level]);
5557 				right_gen = btrfs_node_ptr_generation(
5558 						right_path->nodes[right_level],
5559 						right_path->slots[right_level]);
5560 				if (left_blockptr == right_blockptr &&
5561 				    left_gen == right_gen) {
5562 					/*
5563 					 * As we're on a shared block, don't
5564 					 * allow going any deeper.
5565 					 */
5566 					advance_left = ADVANCE_ONLY_NEXT;
5567 					advance_right = ADVANCE_ONLY_NEXT;
5568 				} else {
5569 					advance_left = ADVANCE;
5570 					advance_right = ADVANCE;
5571 				}
5572 			}
5573 		} else if (left_level < right_level) {
5574 			advance_right = ADVANCE;
5575 		} else {
5576 			advance_left = ADVANCE;
5577 		}
5578 	}
5579 
5580 out:
5581 	btrfs_free_path(left_path);
5582 	btrfs_free_path(right_path);
5583 	kvfree(tmp_buf);
5584 	return ret;
5585 }
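
/*
 * Usage sketch for btrfs_compare_trees() (illustrative only, not part of
 * the original file). The callback below is hypothetical; send.c drives
 * its incremental-send logic from a callback with this signature:
 *
 *	static int count_changes_cb(struct btrfs_root *left_root,
 *				    struct btrfs_root *right_root,
 *				    struct btrfs_path *left_path,
 *				    struct btrfs_path *right_path,
 *				    struct btrfs_key *key,
 *				    enum btrfs_compare_tree_result result,
 *				    void *ctx)
 *	{
 *		u64 *counts = ctx;
 *
 *		counts[result]++;	// NEW, DELETED, CHANGED or SAME
 *		return 0;		// a negative return aborts the walk
 *	}
 *
 *	u64 counts[4] = { 0 };
 *	int ret = btrfs_compare_trees(left_root, right_root,
 *				      count_changes_cb, counts);
 */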
5586 
5587 /*
5588  * this is similar to btrfs_next_leaf, but does not try to preserve
5589  * and fix up the path.  It looks for and returns the next key in the
5590  * tree based on the current path and the min_trans parameter.
5591  *
5592  * 0 is returned if another key is found, < 0 if there are any errors,
5593  * and 1 is returned if there are no higher keys in the tree.
5594  *
5595  * path->keep_locks should be set to 1 on the search made before
5596  * calling this function.
5597  */
5598 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5599 			struct btrfs_key *key, int level, u64 min_trans)
5600 {
5601 	int slot;
5602 	struct extent_buffer *c;
5603 
5604 	WARN_ON(!path->keep_locks);
5605 	while (level < BTRFS_MAX_LEVEL) {
5606 		if (!path->nodes[level])
5607 			return 1;
5608 
5609 		slot = path->slots[level] + 1;
5610 		c = path->nodes[level];
5611 next:
5612 		if (slot >= btrfs_header_nritems(c)) {
5613 			int ret;
5614 			int orig_lowest;
5615 			struct btrfs_key cur_key;
5616 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5617 			    !path->nodes[level + 1])
5618 				return 1;
5619 
5620 			if (path->locks[level + 1]) {
5621 				level++;
5622 				continue;
5623 			}
5624 
5625 			slot = btrfs_header_nritems(c) - 1;
5626 			if (level == 0)
5627 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5628 			else
5629 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5630 
5631 			orig_lowest = path->lowest_level;
5632 			btrfs_release_path(path);
5633 			path->lowest_level = level;
5634 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5635 						0, 0);
5636 			path->lowest_level = orig_lowest;
5637 			if (ret < 0)
5638 				return ret;
5639 
5640 			c = path->nodes[level];
5641 			slot = path->slots[level];
5642 			if (ret == 0)
5643 				slot++;
5644 			goto next;
5645 		}
5646 
5647 		if (level == 0)
5648 			btrfs_item_key_to_cpu(c, key, slot);
5649 		else {
5650 			u64 gen = btrfs_node_ptr_generation(c, slot);
5651 
5652 			if (gen < min_trans) {
5653 				slot++;
5654 				goto next;
5655 			}
5656 			btrfs_node_key_to_cpu(c, key, slot);
5657 		}
5658 		return 0;
5659 	}
5660 	return 1;
5661 }
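
/*
 * Usage sketch (illustrative only, assuming a caller-supplied 'key',
 * 'level' and 'min_trans'); tree defrag uses a pattern like this to peek
 * at the next key without re-walking the whole tree:
 *
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret >= 0)
 *		ret = btrfs_find_next_key(root, path, &key, level, min_trans);
 *	btrfs_release_path(path);
 *	if (ret == 0) {
 *		// 'key' now holds the next key at or above 'level' whose
 *		// pointer generation is >= min_trans
 *	}
 */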
5662 
5663 /*
5664  * search the tree again to find a leaf with greater keys.
5665  * returns 0 if it found something or 1 if there are no greater leaves.
5666  * returns < 0 on I/O errors.
5667  */
5668 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5669 {
5670 	return btrfs_next_old_leaf(root, path, 0);
5671 }
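
/*
 * Typical iteration pattern (illustrative sketch, assuming the path was
 * positioned by a prior btrfs_search_slot() and 'found_key' is a local
 * struct btrfs_key):
 *
 *	while (1) {
 *		struct extent_buffer *leaf = path->nodes[0];
 *
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret < 0)
 *				goto out;	// error, using caller's label
 *			if (ret > 0)
 *				break;		// no more leaves
 *			continue;
 *		}
 *		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 *		// ... process the item at path->slots[0] ...
 *		path->slots[0]++;
 *	}
 */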
5672 
5673 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5674 			u64 time_seq)
5675 {
5676 	int slot;
5677 	int level;
5678 	struct extent_buffer *c;
5679 	struct extent_buffer *next;
5680 	struct btrfs_key key;
5681 	u32 nritems;
5682 	int ret;
5683 	int old_spinning = path->leave_spinning;
5684 	int next_rw_lock = 0;
5685 
5686 	nritems = btrfs_header_nritems(path->nodes[0]);
5687 	if (nritems == 0)
5688 		return 1;
5689 
5690 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5691 again:
5692 	level = 1;
5693 	next = NULL;
5694 	next_rw_lock = 0;
5695 	btrfs_release_path(path);
5696 
5697 	path->keep_locks = 1;
5698 	path->leave_spinning = 1;
5699 
5700 	if (time_seq)
5701 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5702 	else
5703 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5704 	path->keep_locks = 0;
5705 
5706 	if (ret < 0)
5707 		return ret;
5708 
5709 	nritems = btrfs_header_nritems(path->nodes[0]);
5710 	/*
5711 	 * by releasing the path above we dropped all our locks.  A balance
5712 	 * could have added more items next to the key that used to be
5713 	 * at the very end of the block.  So, check again here and
5714 	 * advance the path if there are now more items available.
5715 	 */
5716 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5717 		if (ret == 0)
5718 			path->slots[0]++;
5719 		ret = 0;
5720 		goto done;
5721 	}
5722 	/*
5723 	 * So the above check misses one case:
5724 	 * - after releasing the path above, someone has removed the item that
5725 	 *   used to be at the very end of the block, and balance between
5726 	 *   leaves gets another one with a bigger key.offset to replace it.
5727 	 *
5728 	 * This one should be returned as well, or we can get leaf corruption
5729 	 * later (especially in __btrfs_drop_extents()).
5730 	 *
5731 	 * A bit more explanation about this check:
5732 	 * with ret > 0, the key isn't found, the path points to the slot
5733 	 * where it should be inserted, so the path->slots[0] item must be the
5734 	 * bigger one.
5735 	 */
5736 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5737 		ret = 0;
5738 		goto done;
5739 	}
5740 
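	/*
	 * Walk up the path until we find a level with a slot to the right
	 * of the current position; that sibling leads to the next leaf.
	 */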
5741 	while (level < BTRFS_MAX_LEVEL) {
5742 		if (!path->nodes[level]) {
5743 			ret = 1;
5744 			goto done;
5745 		}
5746 
5747 		slot = path->slots[level] + 1;
5748 		c = path->nodes[level];
5749 		if (slot >= btrfs_header_nritems(c)) {
5750 			level++;
5751 			if (level == BTRFS_MAX_LEVEL) {
5752 				ret = 1;
5753 				goto done;
5754 			}
5755 			continue;
5756 		}
5757 
5758 		if (next) {
5759 			btrfs_tree_unlock_rw(next, next_rw_lock);
5760 			free_extent_buffer(next);
5761 		}
5762 
5763 		next = c;
5764 		next_rw_lock = path->locks[level];
5765 		ret = read_block_for_search(NULL, root, path, &next, level,
5766 					    slot, &key, 0);
5767 		if (ret == -EAGAIN)
5768 			goto again;
5769 
5770 		if (ret < 0) {
5771 			btrfs_release_path(path);
5772 			goto done;
5773 		}
5774 
5775 		if (!path->skip_locking) {
5776 			ret = btrfs_try_tree_read_lock(next);
5777 			if (!ret && time_seq) {
5778 				/*
5779 				 * If we don't get the lock, we may be racing
5780 				 * with push_leaf_left, which holds that lock
5781 				 * while itself waiting for the leaf we've
5782 				 * currently locked. To break this deadlock,
5783 				 * we give up our lock and cycle.
5784 				 */
5785 				free_extent_buffer(next);
5786 				btrfs_release_path(path);
5787 				cond_resched();
5788 				goto again;
5789 			}
5790 			if (!ret) {
5791 				btrfs_set_path_blocking(path);
5792 				btrfs_tree_read_lock(next);
5793 				btrfs_clear_path_blocking(path, next,
5794 							  BTRFS_READ_LOCK);
5795 			}
5796 			next_rw_lock = BTRFS_READ_LOCK;
5797 		}
5798 		break;
5799 	}
5800 	path->slots[level] = slot;
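	/*
	 * Walk back down to level 0, installing the new child at each level
	 * and taking slot 0 all the way, which lands on the next leaf.
	 */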
5801 	while (1) {
5802 		level--;
5803 		c = path->nodes[level];
5804 		if (path->locks[level])
5805 			btrfs_tree_unlock_rw(c, path->locks[level]);
5806 
5807 		free_extent_buffer(c);
5808 		path->nodes[level] = next;
5809 		path->slots[level] = 0;
5810 		if (!path->skip_locking)
5811 			path->locks[level] = next_rw_lock;
5812 		if (!level)
5813 			break;
5814 
5815 		ret = read_block_for_search(NULL, root, path, &next, level,
5816 					    0, &key, 0);
5817 		if (ret == -EAGAIN)
5818 			goto again;
5819 
5820 		if (ret < 0) {
5821 			btrfs_release_path(path);
5822 			goto done;
5823 		}
5824 
5825 		if (!path->skip_locking) {
5826 			ret = btrfs_try_tree_read_lock(next);
5827 			if (!ret) {
5828 				btrfs_set_path_blocking(path);
5829 				btrfs_tree_read_lock(next);
5830 				btrfs_clear_path_blocking(path, next,
5831 							  BTRFS_READ_LOCK);
5832 			}
5833 			next_rw_lock = BTRFS_READ_LOCK;
5834 		}
5835 	}
5836 	ret = 0;
5837 done:
5838 	unlock_up(path, 0, 1, 0, NULL);
5839 	path->leave_spinning = old_spinning;
5840 	if (!old_spinning)
5841 		btrfs_set_path_blocking(path);
5842 
5843 	return ret;
5844 }
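
/*
 * Illustrative sketch (not part of the original file, API names per this
 * kernel's ctree.h): callers that need a time-travel view, e.g. backref
 * walking, pin a tree mod log sequence number and pass it as time_seq:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	ret = btrfs_search_old_slot(root, &key, path, elem.seq);
 *	// ... iterate with btrfs_next_old_leaf(root, path, elem.seq) ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * With time_seq == 0, btrfs_next_old_leaf() behaves like btrfs_next_leaf().
 */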
5845 
5846 /*
5847  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5848  * searching until it gets past min_objectid or finds an item of 'type'.
5849  *
5850  * returns 0 if something is found, 1 if nothing was found, and < 0 on error.
5851  */
5852 int btrfs_previous_item(struct btrfs_root *root,
5853 			struct btrfs_path *path, u64 min_objectid,
5854 			int type)
5855 {
5856 	struct btrfs_key found_key;
5857 	struct extent_buffer *leaf;
5858 	u32 nritems;
5859 	int ret;
5860 
5861 	while (1) {
5862 		if (path->slots[0] == 0) {
5863 			btrfs_set_path_blocking(path);
5864 			ret = btrfs_prev_leaf(root, path);
5865 			if (ret != 0)
5866 				return ret;
5867 		} else {
5868 			path->slots[0]--;
5869 		}
5870 		leaf = path->nodes[0];
5871 		nritems = btrfs_header_nritems(leaf);
5872 		if (nritems == 0)
5873 			return 1;
5874 		if (path->slots[0] == nritems)
5875 			path->slots[0]--;
5876 
5877 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5878 		if (found_key.objectid < min_objectid)
5879 			break;
5880 		if (found_key.type == type)
5881 			return 0;
5882 		if (found_key.objectid == min_objectid &&
5883 		    found_key.type < type)
5884 			break;
5885 	}
5886 	return 1;
5887 }
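
/*
 * Usage sketch (illustrative only, 'ino' is a hypothetical inode number):
 * position the path just past the range of interest, then step backwards
 * to the closest preceding item of the wanted type:
 *
 *	key.objectid = ino;
 *	key.type = (u8)-1;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_item(root, path, ino,
 *					  BTRFS_INODE_ITEM_KEY);
 *	if (ret == 0) {
 *		// path->slots[0] now points at the inode item for 'ino'
 *	}
 */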
5888 
5889 /*
5890  * search in the extent tree to find a previous Metadata/Data extent item
5891  * with min objectid.
5892  *
5893  * returns 0 if something is found, 1 if nothing was found, and < 0 on error.
5894  */
5895 int btrfs_previous_extent_item(struct btrfs_root *root,
5896 			struct btrfs_path *path, u64 min_objectid)
5897 {
5898 	struct btrfs_key found_key;
5899 	struct extent_buffer *leaf;
5900 	u32 nritems;
5901 	int ret;
5902 
5903 	while (1) {
5904 		if (path->slots[0] == 0) {
5905 			btrfs_set_path_blocking(path);
5906 			ret = btrfs_prev_leaf(root, path);
5907 			if (ret != 0)
5908 				return ret;
5909 		} else {
5910 			path->slots[0]--;
5911 		}
5912 		leaf = path->nodes[0];
5913 		nritems = btrfs_header_nritems(leaf);
5914 		if (nritems == 0)
5915 			return 1;
5916 		if (path->slots[0] == nritems)
5917 			path->slots[0]--;
5918 
5919 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5920 		if (found_key.objectid < min_objectid)
5921 			break;
5922 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5923 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5924 			return 0;
5925 		if (found_key.objectid == min_objectid &&
5926 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5927 			break;
5928 	}
5929 	return 1;
5930 }
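
/*
 * Usage sketch (illustrative only), modeled on how scrub backs up to the
 * extent item covering a given 'logical' address:
 *
 *	key.objectid = logical;
 *	key.type = BTRFS_METADATA_ITEM_KEY;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_extent_item(extent_root, path, 0);
 *	// ret == 0: path points at an EXTENT_ITEM or METADATA_ITEM at or
 *	// before 'logical'; ret == 1: nothing found
 */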
5931