// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/* csum type is validated at mount time */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
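
/*
 * Illustrative sketch, not part of the original file: how the helpers above
 * are typically combined when setting up the checksum machinery at mount
 * time.  The helper name is hypothetical and the superblock is assumed to
 * have been validated already.
 */
static void __maybe_unused example_csum_setup(const struct btrfs_super_block *sb)
{
	u16 csum_type = btrfs_super_csum_type(sb);

	/* size of one checksum in bytes, e.g. 4 for crc32c */
	int csum_size = btrfs_super_csum_size(sb);

	/*
	 * Human readable name vs. the string handed to the crypto API; they
	 * differ only when .driver is set (e.g. blake2b-256).
	 */
	pr_info("csum %s (driver %s), %d bytes\n",
		btrfs_super_csum_name(csum_type),
		btrfs_super_csum_driver(csum_type), csum_size);
}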

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
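
/*
 * Illustrative sketch, not part of the original file: the usual
 * allocate/search/release lifecycle of a btrfs_path.  The helper name is
 * hypothetical; the root and key are assumed to come from the caller and
 * error handling is abbreviated.
 */
static int __maybe_unused example_path_lifecycle(struct btrfs_root *root,
						 const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* a read-only search: trans is NULL, ins_len and cow are 0 */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);

	/* ... examine path->nodes[0] / path->slots[0] here ... */

	/* drops locks and extent buffer references, then frees the path */
	btrfs_free_path(path);
	return ret;
}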

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
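
/*
 * Illustrative sketch, not part of the original file: the looping that a
 * caller such as btrfs_lock_root_node() layers on top of btrfs_root_node().
 * Because no lock is held while the reference is taken, the buffer may stop
 * being the root before we manage to lock it, in which case we retry.  The
 * helper name is hypothetical.
 */
static struct extent_buffer *__maybe_unused example_lock_root(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)	/* still the root, we're done */
			return eb;
		/* lost a race with a COW of the root, try again */
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
}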

/*
 * Cowonly roots (not-shareable trees: everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list.  The transaction walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* these are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should make sure elem->seq is set to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we
			 * cannot remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
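
/*
 * Illustrative sketch, not part of the original file: how the two functions
 * above are paired by users such as the backref walking code.  While the
 * elem is on the list, no tree mod log entries with a sequence number at or
 * above elem.seq are pruned, so rewinds back to elem.seq stay valid.  The
 * helper name is hypothetical.
 */
static void __maybe_unused example_tree_mod_seq_usage(struct btrfs_fs_info *fs_info)
{
	struct seq_list elem = SEQ_LIST_INIT(elem);

	/* block log pruning and get our snapshot sequence number */
	btrfs_get_tree_mod_seq(fs_info, &elem);

	/* ... walk backrefs / rewind tree blocks using elem.seq ... */

	/* unblock pruning; entries older than the oldest blocker go away */
	btrfs_put_tree_mod_seq(fs_info, &elem);
}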

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}
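
/*
 * Illustrative sketch, not part of the original file: the calling convention
 * around tree_mod_dont_log().  On a 0 return the write lock is held and the
 * caller both inserts and unlocks, as the insertion helpers below do:
 *
 *	if (tree_mod_dont_log(fs_info, eb))
 *		return 0;				(nothing to log)
 *	ret = __tree_mod_log_insert(fs_info, tm);
 *	write_unlock(&fs_info->tree_mod_log_lock);
 */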

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size,
					  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size, nest);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A becomes empty
			 * and B is promoted to the new root. in the mod log,
			 * we'll have a root-replace operation for B, a tree
			 * block that is not a root. we simply ignore that
			 * operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_read(eb);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_read(eb_root);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if:
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to the TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking_write(parent);
	btrfs_set_lock_blocking_write(buf);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
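
/*
 * Worked example (illustrative, not part of the original file): with a
 * 16KiB nodesize, blocknr = 1048576 and other = 1081344 gives
 * other - (blocknr + blocksize) = 1081344 - 1064960 = 16384, which is less
 * than 32768, so the two blocks count as close and defrag leaves them alone.
 */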

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
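
/*
 * Illustrative sketch, not part of the original file: keys order first by
 * objectid, then type, then offset.  The helper name is hypothetical.
 */
static int __maybe_unused example_key_cmp(void)
{
	struct btrfs_key a = { .objectid = 257, .type = BTRFS_DIR_ITEM_KEY,
			       .offset = 0 };
	struct btrfs_key b = { .objectid = 257, .type = BTRFS_EXTENT_DATA_KEY,
			       .offset = 0 };

	/*
	 * -1: equal objectids, but BTRFS_DIR_ITEM_KEY (84) is less than
	 * BTRFS_EXTENT_DATA_KEY (108), so 'a' sorts first.
	 */
	return btrfs_comp_cpu_keys(&a, &b);
}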

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking_write(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1, &first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking_write(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = offset >> PAGE_SHIFT;
			char *kaddr = page_address(eb->pages[idx]);

			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	if (btrfs_header_level(eb) == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}
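
/*
 * Illustrative sketch, not part of the original file: interpreting the
 * result of btrfs_bin_search().  Zero means an exact match at *slot; one
 * means the key was not found and *slot is where it would be inserted
 * (possibly equal to nritems).  The helper name is hypothetical.
 */
static int __maybe_unused example_bin_search(struct extent_buffer *eb,
					     const struct btrfs_key *key)
{
	int slot;
	int ret;

	ret = btrfs_bin_search(eb, key, &slot);
	if (ret < 0)
		return ret;	/* e.g. a corrupted buffer */
	/* ret == 0: exact match; ret == 1: 'slot' is the insertion point */
	return slot;
}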

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}
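
/*
 * Illustrative sketch, not part of the original file: reading and locking
 * the child at a given slot, as done when walking down a tree.  The helper
 * name is hypothetical.
 */
static int __maybe_unused example_read_child(struct extent_buffer *parent,
					     int slot)
{
	struct extent_buffer *child;

	child = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(child))
		return PTR_ERR(child);	/* -ENOENT, -EIO, ... */

	btrfs_tree_lock(child);
	/* ... use the child ... */
	btrfs_tree_unlock(child);
	free_extent_buffer(child);
	return 0;
}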
1813 
1814 /*
1815  * node level balancing, used to make sure nodes are in proper order for
1816  * item deletion.  We balance from the top down, so we have to make sure
1817  * that a deletion won't leave an node completely empty later on.
1818  */
1819 static noinline int balance_level(struct btrfs_trans_handle *trans,
1820 			 struct btrfs_root *root,
1821 			 struct btrfs_path *path, int level)
1822 {
1823 	struct btrfs_fs_info *fs_info = root->fs_info;
1824 	struct extent_buffer *right = NULL;
1825 	struct extent_buffer *mid;
1826 	struct extent_buffer *left = NULL;
1827 	struct extent_buffer *parent = NULL;
1828 	int ret = 0;
1829 	int wret;
1830 	int pslot;
1831 	int orig_slot = path->slots[level];
1832 	u64 orig_ptr;
1833 
1834 	ASSERT(level > 0);
1835 
1836 	mid = path->nodes[level];
1837 
1838 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1839 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1840 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1841 
1842 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1843 
1844 	if (level < BTRFS_MAX_LEVEL - 1) {
1845 		parent = path->nodes[level + 1];
1846 		pslot = path->slots[level + 1];
1847 	}
1848 
1849 	/*
1850 	 * deal with the case where there is only one pointer in the root
1851 	 * by promoting the node below to a root
1852 	 */
1853 	if (!parent) {
1854 		struct extent_buffer *child;
1855 
1856 		if (btrfs_header_nritems(mid) != 1)
1857 			return 0;
1858 
1859 		/* promote the child to a root */
1860 		child = btrfs_read_node_slot(mid, 0);
1861 		if (IS_ERR(child)) {
1862 			ret = PTR_ERR(child);
1863 			btrfs_handle_fs_error(fs_info, ret, NULL);
1864 			goto enospc;
1865 		}
1866 
1867 		btrfs_tree_lock(child);
1868 		btrfs_set_lock_blocking_write(child);
1869 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1870 				      BTRFS_NESTING_COW);
1871 		if (ret) {
1872 			btrfs_tree_unlock(child);
1873 			free_extent_buffer(child);
1874 			goto enospc;
1875 		}
1876 
1877 		ret = tree_mod_log_insert_root(root->node, child, 1);
1878 		BUG_ON(ret < 0);
1879 		rcu_assign_pointer(root->node, child);
1880 
1881 		add_root_to_dirty_list(root);
1882 		btrfs_tree_unlock(child);
1883 
1884 		path->locks[level] = 0;
1885 		path->nodes[level] = NULL;
1886 		btrfs_clean_tree_block(mid);
1887 		btrfs_tree_unlock(mid);
1888 		/* once for the path */
1889 		free_extent_buffer(mid);
1890 
1891 		root_sub_used(root, mid->len);
1892 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1893 		/* once for the root ptr */
1894 		free_extent_buffer_stale(mid);
1895 		return 0;
1896 	}
1897 	if (btrfs_header_nritems(mid) >
1898 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1899 		return 0;
1900 
1901 	left = btrfs_read_node_slot(parent, pslot - 1);
1902 	if (IS_ERR(left))
1903 		left = NULL;
1904 
1905 	if (left) {
1906 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1907 		btrfs_set_lock_blocking_write(left);
1908 		wret = btrfs_cow_block(trans, root, left,
1909 				       parent, pslot - 1, &left,
1910 				       BTRFS_NESTING_LEFT_COW);
1911 		if (wret) {
1912 			ret = wret;
1913 			goto enospc;
1914 		}
1915 	}
1916 
1917 	right = btrfs_read_node_slot(parent, pslot + 1);
1918 	if (IS_ERR(right))
1919 		right = NULL;
1920 
1921 	if (right) {
1922 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1923 		btrfs_set_lock_blocking_write(right);
1924 		wret = btrfs_cow_block(trans, root, right,
1925 				       parent, pslot + 1, &right,
1926 				       BTRFS_NESTING_RIGHT_COW);
1927 		if (wret) {
1928 			ret = wret;
1929 			goto enospc;
1930 		}
1931 	}
1932 
1933 	/* first, try to make some room in the middle buffer */
1934 	if (left) {
1935 		orig_slot += btrfs_header_nritems(left);
1936 		wret = push_node_left(trans, left, mid, 1);
1937 		if (wret < 0)
1938 			ret = wret;
1939 	}
1940 
1941 	/*
1942 	 * then try to empty the rightmost buffer into the middle
1943 	 */
1944 	if (right) {
1945 		wret = push_node_left(trans, mid, right, 1);
1946 		if (wret < 0 && wret != -ENOSPC)
1947 			ret = wret;
1948 		if (btrfs_header_nritems(right) == 0) {
1949 			btrfs_clean_tree_block(right);
1950 			btrfs_tree_unlock(right);
1951 			del_ptr(root, path, level + 1, pslot + 1);
1952 			root_sub_used(root, right->len);
1953 			btrfs_free_tree_block(trans, root, right, 0, 1);
1954 			free_extent_buffer_stale(right);
1955 			right = NULL;
1956 		} else {
1957 			struct btrfs_disk_key right_key;
1958 			btrfs_node_key(right, &right_key, 0);
1959 			ret = tree_mod_log_insert_key(parent, pslot + 1,
1960 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1961 			BUG_ON(ret < 0);
1962 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1963 			btrfs_mark_buffer_dirty(parent);
1964 		}
1965 	}
1966 	if (btrfs_header_nritems(mid) == 1) {
1967 		/*
1968 		 * we're not allowed to leave a node with one item in the
1969 		 * tree during a delete.  A deletion from lower in the tree
1970 		 * could try to delete the only pointer in this node.
1971 		 * So, pull some keys from the left.
1972 		 * There has to be a left pointer at this point because
1973 		 * otherwise we would have pulled some pointers from the
1974 		 * right
1975 		 */
1976 		if (!left) {
1977 			ret = -EROFS;
1978 			btrfs_handle_fs_error(fs_info, ret, NULL);
1979 			goto enospc;
1980 		}
1981 		wret = balance_node_right(trans, mid, left);
1982 		if (wret < 0) {
1983 			ret = wret;
1984 			goto enospc;
1985 		}
1986 		if (wret == 1) {
1987 			wret = push_node_left(trans, left, mid, 1);
1988 			if (wret < 0)
1989 				ret = wret;
1990 		}
1991 		BUG_ON(wret == 1);
1992 	}
1993 	if (btrfs_header_nritems(mid) == 0) {
1994 		btrfs_clean_tree_block(mid);
1995 		btrfs_tree_unlock(mid);
1996 		del_ptr(root, path, level + 1, pslot);
1997 		root_sub_used(root, mid->len);
1998 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1999 		free_extent_buffer_stale(mid);
2000 		mid = NULL;
2001 	} else {
2002 		/* update the parent key to reflect our changes */
2003 		struct btrfs_disk_key mid_key;
2004 		btrfs_node_key(mid, &mid_key, 0);
2005 		ret = tree_mod_log_insert_key(parent, pslot,
2006 				MOD_LOG_KEY_REPLACE, GFP_NOFS);
2007 		BUG_ON(ret < 0);
2008 		btrfs_set_node_key(parent, &mid_key, pslot);
2009 		btrfs_mark_buffer_dirty(parent);
2010 	}
2011 
2012 	/* update the path */
2013 	if (left) {
2014 		if (btrfs_header_nritems(left) > orig_slot) {
2015 			atomic_inc(&left->refs);
2016 			/* left was locked after cow */
2017 			path->nodes[level] = left;
2018 			path->slots[level + 1] -= 1;
2019 			path->slots[level] = orig_slot;
2020 			if (mid) {
2021 				btrfs_tree_unlock(mid);
2022 				free_extent_buffer(mid);
2023 			}
2024 		} else {
2025 			orig_slot -= btrfs_header_nritems(left);
2026 			path->slots[level] = orig_slot;
2027 		}
2028 	}
2029 	/* double check we haven't messed things up */
2030 	if (orig_ptr !=
2031 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2032 		BUG();
2033 enospc:
2034 	if (right) {
2035 		btrfs_tree_unlock(right);
2036 		free_extent_buffer(right);
2037 	}
2038 	if (left) {
2039 		if (path->nodes[level] != left)
2040 			btrfs_tree_unlock(left);
2041 		free_extent_buffer(left);
2042 	}
2043 	return ret;
2044 }
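
/*
 * Worked example of the threshold above (illustrative, assuming a 16KiB
 * nodesize where BTRFS_NODEPTRS_PER_BLOCK works out to 493): balance_level()
 * returns early while a node holds more than 493 / 4 = 123 pointers, so
 * merging and pushing only start once a node is a quarter full or less.
 */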
2045 
2046 /* Node balancing for insertion.  Here we only split or push nodes around
2047  * when they are completely full.  This is also done top down, so we
2048  * have to be pessimistic.
2049  */
2050 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2051 					  struct btrfs_root *root,
2052 					  struct btrfs_path *path, int level)
2053 {
2054 	struct btrfs_fs_info *fs_info = root->fs_info;
2055 	struct extent_buffer *right = NULL;
2056 	struct extent_buffer *mid;
2057 	struct extent_buffer *left = NULL;
2058 	struct extent_buffer *parent = NULL;
2059 	int ret = 0;
2060 	int wret;
2061 	int pslot;
2062 	int orig_slot = path->slots[level];
2063 
2064 	if (level == 0)
2065 		return 1;
2066 
2067 	mid = path->nodes[level];
2068 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2069 
2070 	if (level < BTRFS_MAX_LEVEL - 1) {
2071 		parent = path->nodes[level + 1];
2072 		pslot = path->slots[level + 1];
2073 	}
2074 
2075 	if (!parent)
2076 		return 1;
2077 
2078 	left = btrfs_read_node_slot(parent, pslot - 1);
2079 	if (IS_ERR(left))
2080 		left = NULL;
2081 
2082 	/* first, try to make some room in the middle buffer */
2083 	if (left) {
2084 		u32 left_nr;
2085 
2086 		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
2087 		btrfs_set_lock_blocking_write(left);
2088 
2089 		left_nr = btrfs_header_nritems(left);
2090 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2091 			wret = 1;
2092 		} else {
2093 			ret = btrfs_cow_block(trans, root, left, parent,
2094 					      pslot - 1, &left,
2095 					      BTRFS_NESTING_LEFT_COW);
2096 			if (ret)
2097 				wret = 1;
2098 			else {
2099 				wret = push_node_left(trans, left, mid, 0);
2100 			}
2101 		}
2102 		if (wret < 0)
2103 			ret = wret;
2104 		if (wret == 0) {
2105 			struct btrfs_disk_key disk_key;
2106 			orig_slot += left_nr;
2107 			btrfs_node_key(mid, &disk_key, 0);
2108 			ret = tree_mod_log_insert_key(parent, pslot,
2109 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2110 			BUG_ON(ret < 0);
2111 			btrfs_set_node_key(parent, &disk_key, pslot);
2112 			btrfs_mark_buffer_dirty(parent);
2113 			if (btrfs_header_nritems(left) > orig_slot) {
2114 				path->nodes[level] = left;
2115 				path->slots[level + 1] -= 1;
2116 				path->slots[level] = orig_slot;
2117 				btrfs_tree_unlock(mid);
2118 				free_extent_buffer(mid);
2119 			} else {
2120 				orig_slot -=
2121 					btrfs_header_nritems(left);
2122 				path->slots[level] = orig_slot;
2123 				btrfs_tree_unlock(left);
2124 				free_extent_buffer(left);
2125 			}
2126 			return 0;
2127 		}
2128 		btrfs_tree_unlock(left);
2129 		free_extent_buffer(left);
2130 	}
2131 	right = btrfs_read_node_slot(parent, pslot + 1);
2132 	if (IS_ERR(right))
2133 		right = NULL;
2134 
2135 	/*
2136 	 * then try to empty the rightmost buffer into the middle
2137 	 */
2138 	if (right) {
2139 		u32 right_nr;
2140 
2141 		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2142 		btrfs_set_lock_blocking_write(right);
2143 
2144 		right_nr = btrfs_header_nritems(right);
2145 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2146 			wret = 1;
2147 		} else {
2148 			ret = btrfs_cow_block(trans, root, right,
2149 					      parent, pslot + 1,
2150 					      &right, BTRFS_NESTING_RIGHT_COW);
2151 			if (ret)
2152 				wret = 1;
2153 			else {
2154 				wret = balance_node_right(trans, right, mid);
2155 			}
2156 		}
2157 		if (wret < 0)
2158 			ret = wret;
2159 		if (wret == 0) {
2160 			struct btrfs_disk_key disk_key;
2161 
2162 			btrfs_node_key(right, &disk_key, 0);
2163 			ret = tree_mod_log_insert_key(parent, pslot + 1,
2164 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2165 			BUG_ON(ret < 0);
2166 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2167 			btrfs_mark_buffer_dirty(parent);
2168 
2169 			if (btrfs_header_nritems(mid) <= orig_slot) {
2170 				path->nodes[level] = right;
2171 				path->slots[level + 1] += 1;
2172 				path->slots[level] = orig_slot -
2173 					btrfs_header_nritems(mid);
2174 				btrfs_tree_unlock(mid);
2175 				free_extent_buffer(mid);
2176 			} else {
2177 				btrfs_tree_unlock(right);
2178 				free_extent_buffer(right);
2179 			}
2180 			return 0;
2181 		}
2182 		btrfs_tree_unlock(right);
2183 		free_extent_buffer(right);
2184 	}
2185 	return 1;
2186 }
2187 
2188 /*
2189  * readahead one full node of leaves, finding things that are close
2190  * to the block in 'slot', and triggering readahead on them.
2191  */
2192 static void reada_for_search(struct btrfs_fs_info *fs_info,
2193 			     struct btrfs_path *path,
2194 			     int level, int slot, u64 objectid)
2195 {
2196 	struct extent_buffer *node;
2197 	struct btrfs_disk_key disk_key;
2198 	u32 nritems;
2199 	u64 search;
2200 	u64 target;
2201 	u64 nread = 0;
2202 	struct extent_buffer *eb;
2203 	u32 nr;
2204 	u32 blocksize;
2205 	u32 nscan = 0;
2206 
2207 	if (level != 1)
2208 		return;
2209 
2210 	if (!path->nodes[level])
2211 		return;
2212 
2213 	node = path->nodes[level];
2214 
2215 	search = btrfs_node_blockptr(node, slot);
2216 	blocksize = fs_info->nodesize;
2217 	eb = find_extent_buffer(fs_info, search);
2218 	if (eb) {
2219 		free_extent_buffer(eb);
2220 		return;
2221 	}
2222 
2223 	target = search;
2224 
2225 	nritems = btrfs_header_nritems(node);
2226 	nr = slot;
2227 
2228 	while (1) {
2229 		if (path->reada == READA_BACK) {
2230 			if (nr == 0)
2231 				break;
2232 			nr--;
2233 		} else if (path->reada == READA_FORWARD) {
2234 			nr++;
2235 			if (nr >= nritems)
2236 				break;
2237 		}
2238 		if (path->reada == READA_BACK && objectid) {
2239 			btrfs_node_key(node, &disk_key, nr);
2240 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2241 				break;
2242 		}
2243 		search = btrfs_node_blockptr(node, nr);
2244 		if ((search <= target && target - search <= 65536) ||
2245 		    (search > target && search - target <= 65536)) {
2246 			readahead_tree_block(fs_info, search);
2247 			nread += blocksize;
2248 		}
2249 		nscan++;
2250 		if ((nread > 65536 || nscan > 32))
2251 			break;
2252 	}
2253 }
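
/*
 * A minimal sketch of the distance test used above (illustration only, not
 * part of the original source): a sibling block qualifies for readahead
 * only when it sits within 64KiB of the block we are about to read, in
 * either direction.
 */
static bool within_reada_window(u64 search, u64 target)
{
	if (search <= target)
		return target - search <= 65536;
	return search - target <= 65536;
}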
2254 
2255 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2256 				       struct btrfs_path *path, int level)
2257 {
2258 	int slot;
2259 	int nritems;
2260 	struct extent_buffer *parent;
2261 	struct extent_buffer *eb;
2262 	u64 gen;
2263 	u64 block1 = 0;
2264 	u64 block2 = 0;
2265 
2266 	parent = path->nodes[level + 1];
2267 	if (!parent)
2268 		return;
2269 
2270 	nritems = btrfs_header_nritems(parent);
2271 	slot = path->slots[level + 1];
2272 
2273 	if (slot > 0) {
2274 		block1 = btrfs_node_blockptr(parent, slot - 1);
2275 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2276 		eb = find_extent_buffer(fs_info, block1);
2277 		/*
2278 		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2279 		 * don't want to return -EAGAIN here.  That would loop
2280 		 * forever.
2281 		 */
2282 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2283 			block1 = 0;
2284 		free_extent_buffer(eb);
2285 	}
2286 	if (slot + 1 < nritems) {
2287 		block2 = btrfs_node_blockptr(parent, slot + 1);
2288 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2289 		eb = find_extent_buffer(fs_info, block2);
2290 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2291 			block2 = 0;
2292 		free_extent_buffer(eb);
2293 	}
2294 
2295 	if (block1)
2296 		readahead_tree_block(fs_info, block1);
2297 	if (block2)
2298 		readahead_tree_block(fs_info, block2);
2299 }
2300 
2301 
2302 /*
2303  * when we walk down the tree, it is usually safe to unlock the higher layers
2304  * in the tree.  The exceptions are when our path goes through slot 0, because
2305  * operations on the tree might require changing key pointers higher up in the
2306  * tree.
2307  *
2308  * callers might also have set path->keep_locks, which tells this code to keep
2309  * the lock if the path points to the last slot in the block.  This is part of
2310  * walking through the tree, and selecting the next slot in the higher block.
2311  *
2312  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
2313  * if lowest_unlock is 1, level 0 won't be unlocked
2314  */
2315 static noinline void unlock_up(struct btrfs_path *path, int level,
2316 			       int lowest_unlock, int min_write_lock_level,
2317 			       int *write_lock_level)
2318 {
2319 	int i;
2320 	int skip_level = level;
2321 	int no_skips = 0;
2322 	struct extent_buffer *t;
2323 
2324 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2325 		if (!path->nodes[i])
2326 			break;
2327 		if (!path->locks[i])
2328 			break;
2329 		if (!no_skips && path->slots[i] == 0) {
2330 			skip_level = i + 1;
2331 			continue;
2332 		}
2333 		if (!no_skips && path->keep_locks) {
2334 			u32 nritems;
2335 			t = path->nodes[i];
2336 			nritems = btrfs_header_nritems(t);
2337 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2338 				skip_level = i + 1;
2339 				continue;
2340 			}
2341 		}
2342 		if (skip_level < i && i >= lowest_unlock)
2343 			no_skips = 1;
2344 
2345 		t = path->nodes[i];
2346 		if (i >= lowest_unlock && i > skip_level) {
2347 			btrfs_tree_unlock_rw(t, path->locks[i]);
2348 			path->locks[i] = 0;
2349 			if (write_lock_level &&
2350 			    i > min_write_lock_level &&
2351 			    i <= *write_lock_level) {
2352 				*write_lock_level = i - 1;
2353 			}
2354 		}
2355 	}
2356 }
2357 
2358 /*
2359  * helper function for btrfs_search_slot.  The goal is to find a block
2360  * in cache without setting the path to blocking.  If we find the block
2361  * we return zero and the path is unchanged.
2362  *
2363  * If we can't find the block, we set the path blocking and do some
2364  * readahead.  -EAGAIN is returned and the search must be repeated.
2365  */
2366 static int
2367 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2368 		      struct extent_buffer **eb_ret, int level, int slot,
2369 		      const struct btrfs_key *key)
2370 {
2371 	struct btrfs_fs_info *fs_info = root->fs_info;
2372 	u64 blocknr;
2373 	u64 gen;
2374 	struct extent_buffer *tmp;
2375 	struct btrfs_key first_key;
2376 	int ret;
2377 	int parent_level;
2378 
2379 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
2380 	gen = btrfs_node_ptr_generation(*eb_ret, slot);
2381 	parent_level = btrfs_header_level(*eb_ret);
2382 	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2383 
2384 	tmp = find_extent_buffer(fs_info, blocknr);
2385 	if (tmp) {
2386 		/* first we do an atomic uptodate check */
2387 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2388 			/*
2389 			 * Do extra check for first_key, eb can be stale due to
2390 			 * being cached, read from scrub, or have multiple
2391 			 * parents (shared tree blocks).
2392 			 */
2393 			if (btrfs_verify_level_key(tmp,
2394 					parent_level - 1, &first_key, gen)) {
2395 				free_extent_buffer(tmp);
2396 				return -EUCLEAN;
2397 			}
2398 			*eb_ret = tmp;
2399 			return 0;
2400 		}
2401 
2402 		/* the pages were up to date, but we failed
2403 		 * the generation number check.  Do a full
2404 		 * read for the generation number that is correct.
2405 		 * We must do this without dropping locks so
2406 		 * we can trust our generation number
2407 		 */
2408 		btrfs_set_path_blocking(p);
2409 
2410 		/* now we're allowed to do a blocking uptodate check */
2411 		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2412 		if (!ret) {
2413 			*eb_ret = tmp;
2414 			return 0;
2415 		}
2416 		free_extent_buffer(tmp);
2417 		btrfs_release_path(p);
2418 		return -EIO;
2419 	}
2420 
2421 	/*
2422 	 * reduce lock contention at high levels
2423 	 * of the btree by dropping locks before
2424 	 * we read.  Don't release the lock on the current
2425 	 * level because we need to walk this node to figure
2426 	 * out which blocks to read.
2427 	 */
2428 	btrfs_unlock_up_safe(p, level + 1);
2429 	btrfs_set_path_blocking(p);
2430 
2431 	if (p->reada != READA_NONE)
2432 		reada_for_search(fs_info, p, level, slot, key->objectid);
2433 
2434 	ret = -EAGAIN;
2435 	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2436 			      &first_key);
2437 	if (!IS_ERR(tmp)) {
2438 		/*
2439 		 * If the read above didn't mark this buffer up to date,
2440 		 * it will never end up being up to date.  Set ret to EIO now
2441 		 * and give up so that our caller doesn't loop forever
2442 		 * on our EAGAINs.
2443 		 */
2444 		if (!extent_buffer_uptodate(tmp))
2445 			ret = -EIO;
2446 		free_extent_buffer(tmp);
2447 	} else {
2448 		ret = PTR_ERR(tmp);
2449 	}
2450 
2451 	btrfs_release_path(p);
2452 	return ret;
2453 }
2454 
2455 /*
2456  * helper function for btrfs_search_slot.  This does all of the checks
2457  * for node-level blocks and does any balancing required based on
2458  * the ins_len.
2459  *
2460  * If no extra work was required, zero is returned.  If we had to
2461  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2462  * start over
2463  */
2464 static int
2465 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2466 		       struct btrfs_root *root, struct btrfs_path *p,
2467 		       struct extent_buffer *b, int level, int ins_len,
2468 		       int *write_lock_level)
2469 {
2470 	struct btrfs_fs_info *fs_info = root->fs_info;
2471 	int ret;
2472 
2473 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2474 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2475 		int sret;
2476 
2477 		if (*write_lock_level < level + 1) {
2478 			*write_lock_level = level + 1;
2479 			btrfs_release_path(p);
2480 			goto again;
2481 		}
2482 
2483 		btrfs_set_path_blocking(p);
2484 		reada_for_balance(fs_info, p, level);
2485 		sret = split_node(trans, root, p, level);
2486 
2487 		BUG_ON(sret > 0);
2488 		if (sret) {
2489 			ret = sret;
2490 			goto done;
2491 		}
2492 		b = p->nodes[level];
2493 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2494 		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2495 		int sret;
2496 
2497 		if (*write_lock_level < level + 1) {
2498 			*write_lock_level = level + 1;
2499 			btrfs_release_path(p);
2500 			goto again;
2501 		}
2502 
2503 		btrfs_set_path_blocking(p);
2504 		reada_for_balance(fs_info, p, level);
2505 		sret = balance_level(trans, root, p, level);
2506 
2507 		if (sret) {
2508 			ret = sret;
2509 			goto done;
2510 		}
2511 		b = p->nodes[level];
2512 		if (!b) {
2513 			btrfs_release_path(p);
2514 			goto again;
2515 		}
2516 		BUG_ON(btrfs_header_nritems(b) == 1);
2517 	}
2518 	return 0;
2519 
2520 again:
2521 	ret = -EAGAIN;
2522 done:
2523 	return ret;
2524 }
2525 
2526 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2527 		u64 iobjectid, u64 ioff, u8 key_type,
2528 		struct btrfs_key *found_key)
2529 {
2530 	int ret;
2531 	struct btrfs_key key;
2532 	struct extent_buffer *eb;
2533 
2534 	ASSERT(path);
2535 	ASSERT(found_key);
2536 
2537 	key.type = key_type;
2538 	key.objectid = iobjectid;
2539 	key.offset = ioff;
2540 
2541 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2542 	if (ret < 0)
2543 		return ret;
2544 
2545 	eb = path->nodes[0];
2546 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2547 		ret = btrfs_next_leaf(fs_root, path);
2548 		if (ret)
2549 			return ret;
2550 		eb = path->nodes[0];
2551 	}
2552 
2553 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2554 	if (found_key->type != key.type ||
2555 			found_key->objectid != key.objectid)
2556 		return 1;
2557 
2558 	return 0;
2559 }
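
/*
 * Usage sketch for btrfs_find_item() (illustration only, not a callsite in
 * this file; the key type and objectid are made up for the example): look
 * up an inode item and report whether a matching key exists.
 */
static int example_find_inode_item(struct btrfs_root *fs_root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key found_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_find_item(fs_root, path, objectid, 0,
			      BTRFS_INODE_ITEM_KEY, &found_key);
	/* 0: found, found_key is filled in; 1: no match; < 0: error */
	btrfs_free_path(path);
	return ret;
}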
2560 
2561 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2562 							struct btrfs_path *p,
2563 							int write_lock_level)
2564 {
2565 	struct btrfs_fs_info *fs_info = root->fs_info;
2566 	struct extent_buffer *b;
2567 	int root_lock;
2568 	int level = 0;
2569 
2570 	/* We try very hard to do read locks on the root */
2571 	root_lock = BTRFS_READ_LOCK;
2572 
2573 	if (p->search_commit_root) {
2574 		/*
2575 		 * The commit roots are read only so we always do read locks,
2576 		 * and we always must hold the commit_root_sem when doing
2577 		 * searches on them, the only exception is send where we don't
2578 		 * want to block transaction commits for a long time, so
2579 		 * we need to clone the commit root in order to avoid races
2580 		 * with transaction commits that create a snapshot of one of
2581 		 * the roots used by a send operation.
2582 		 */
2583 		if (p->need_commit_sem) {
2584 			down_read(&fs_info->commit_root_sem);
2585 			b = btrfs_clone_extent_buffer(root->commit_root);
2586 			up_read(&fs_info->commit_root_sem);
2587 			if (!b)
2588 				return ERR_PTR(-ENOMEM);
2589 
2590 		} else {
2591 			b = root->commit_root;
2592 			atomic_inc(&b->refs);
2593 		}
2594 		level = btrfs_header_level(b);
2595 		/*
2596 		 * Ensure that all callers have set skip_locking when
2597 		 * p->search_commit_root = 1.
2598 		 */
2599 		ASSERT(p->skip_locking == 1);
2600 
2601 		goto out;
2602 	}
2603 
2604 	if (p->skip_locking) {
2605 		b = btrfs_root_node(root);
2606 		level = btrfs_header_level(b);
2607 		goto out;
2608 	}
2609 
2610 	/*
2611 	 * If the level is set to maximum, we can skip trying to get the read
2612 	 * lock.
2613 	 */
2614 	if (write_lock_level < BTRFS_MAX_LEVEL) {
2615 		/*
2616 		 * We don't know the level of the root node until we actually
2617 		 * have it read locked
2618 		 */
2619 		b = __btrfs_read_lock_root_node(root, p->recurse);
2620 		level = btrfs_header_level(b);
2621 		if (level > write_lock_level)
2622 			goto out;
2623 
2624 		/* Whoops, must trade for write lock */
2625 		btrfs_tree_read_unlock(b);
2626 		free_extent_buffer(b);
2627 	}
2628 
2629 	b = btrfs_lock_root_node(root);
2630 	root_lock = BTRFS_WRITE_LOCK;
2631 
2632 	/* The level might have changed, check again */
2633 	level = btrfs_header_level(b);
2634 
2635 out:
2636 	p->nodes[level] = b;
2637 	if (!p->skip_locking)
2638 		p->locks[level] = root_lock;
2639 	/*
2640 	 * Callers are responsible for dropping b's references.
2641 	 */
2642 	return b;
2643 }
2644 
2645 
2646 /*
2647  * btrfs_search_slot - look for a key in a tree and perform necessary
2648  * modifications to preserve tree invariants.
2649  *
2650  * @trans:	Handle of transaction, used when modifying the tree
2651  * @p:		Holds all btree nodes along the search path
2652  * @root:	The root node of the tree
2653  * @key:	The key we are looking for
2654  * @ins_len:	Indicates purpose of search: > 0 for inserts (the byte size
2655  *		needed for the new item), -1 for deletions, 0 for plain searches
2656  * @cow:	boolean indicating whether CoW operations should be performed.
2657  *		Must always be 1 when modifying the tree.
2658  *
2659  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2660  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2661  *
2662  * If @key is found, 0 is returned and you can find the item in the leaf level
2663  * of the path (level 0)
2664  *
2665  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2666  * points to the slot where it should be inserted
2667  *
2668  * If an error is encountered while searching the tree, a negative error number
2669  * is returned
2670  */
2671 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2672 		      const struct btrfs_key *key, struct btrfs_path *p,
2673 		      int ins_len, int cow)
2674 {
2675 	struct extent_buffer *b;
2676 	int slot;
2677 	int ret;
2678 	int err;
2679 	int level;
2680 	int lowest_unlock = 1;
2681 	/* everything at write_lock_level or lower must be write locked */
2682 	int write_lock_level = 0;
2683 	u8 lowest_level = 0;
2684 	int min_write_lock_level;
2685 	int prev_cmp;
2686 
2687 	lowest_level = p->lowest_level;
2688 	WARN_ON(lowest_level && ins_len > 0);
2689 	WARN_ON(p->nodes[0] != NULL);
2690 	BUG_ON(!cow && ins_len);
2691 
2692 	if (ins_len < 0) {
2693 		lowest_unlock = 2;
2694 
2695 		/* when we are removing items, we might have to go up to level
2696 		 * two as we update tree pointers.  Make sure we keep write
2697 		 * locks for those levels as well
2698 		 */
2699 		write_lock_level = 2;
2700 	} else if (ins_len > 0) {
2701 		/*
2702 		 * for inserting items, make sure we have a write lock on
2703 		 * level 1 so we can update keys
2704 		 */
2705 		write_lock_level = 1;
2706 	}
2707 
2708 	if (!cow)
2709 		write_lock_level = -1;
2710 
2711 	if (cow && (p->keep_locks || p->lowest_level))
2712 		write_lock_level = BTRFS_MAX_LEVEL;
2713 
2714 	min_write_lock_level = write_lock_level;
2715 
2716 again:
2717 	prev_cmp = -1;
2718 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
2719 	if (IS_ERR(b)) {
2720 		ret = PTR_ERR(b);
2721 		goto done;
2722 	}
2723 
2724 	while (b) {
2725 		int dec = 0;
2726 
2727 		level = btrfs_header_level(b);
2728 
2729 		if (cow) {
2730 			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2731 
2732 			/*
2733 			 * if we don't really need to cow this block
2734 			 * then we don't want to set the path blocking,
2735 			 * so we test it here
2736 			 */
2737 			if (!should_cow_block(trans, root, b)) {
2738 				trans->dirty = true;
2739 				goto cow_done;
2740 			}
2741 
2742 			/*
2743 			 * must have write locks on this node and the
2744 			 * parent
2745 			 */
2746 			if (level > write_lock_level ||
2747 			    (level + 1 > write_lock_level &&
2748 			    level + 1 < BTRFS_MAX_LEVEL &&
2749 			    p->nodes[level + 1])) {
2750 				write_lock_level = level + 1;
2751 				btrfs_release_path(p);
2752 				goto again;
2753 			}
2754 
2755 			btrfs_set_path_blocking(p);
2756 			if (last_level)
2757 				err = btrfs_cow_block(trans, root, b, NULL, 0,
2758 						      &b,
2759 						      BTRFS_NESTING_COW);
2760 			else
2761 				err = btrfs_cow_block(trans, root, b,
2762 						      p->nodes[level + 1],
2763 						      p->slots[level + 1], &b,
2764 						      BTRFS_NESTING_COW);
2765 			if (err) {
2766 				ret = err;
2767 				goto done;
2768 			}
2769 		}
2770 cow_done:
2771 		p->nodes[level] = b;
2772 		/*
2773 		 * Leave path with blocking locks to avoid massive
2774 		 * lock context switches; this is done on purpose.
2775 		 */
2776 
2777 		/*
2778 		 * we have a lock on b and as long as we aren't changing
2779 		 * the tree, there is no way for the items in b to change.
2780 		 * It is safe to drop the lock on our parent before we
2781 		 * go through the expensive btree search on b.
2782 		 *
2783 		 * If we're inserting or deleting (ins_len != 0), then we might
2784 		 * be changing slot zero, which may require changing the parent.
2785 		 * So, we can't drop the lock until after we know which slot
2786 		 * we're operating on.
2787 		 */
2788 		if (!ins_len && !p->keep_locks) {
2789 			int u = level + 1;
2790 
2791 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2792 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2793 				p->locks[u] = 0;
2794 			}
2795 		}
2796 
2797 		/*
2798 		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2799 		 * we can safely assume the target key will always be in slot 0
2800 		 * on lower levels due to the invariants BTRFS' btree provides,
2801 		 * namely that a btrfs_key_ptr entry always points to the
2802 		 * lowest key in the child node, thus we can skip searching
2803 		 * lower levels
2804 		 */
2805 		if (prev_cmp == 0) {
2806 			slot = 0;
2807 			ret = 0;
2808 		} else {
2809 			ret = btrfs_bin_search(b, key, &slot);
2810 			prev_cmp = ret;
2811 			if (ret < 0)
2812 				goto done;
2813 		}
2814 
2815 		if (level == 0) {
2816 			p->slots[level] = slot;
2817 			if (ins_len > 0 &&
2818 			    btrfs_leaf_free_space(b) < ins_len) {
2819 				if (write_lock_level < 1) {
2820 					write_lock_level = 1;
2821 					btrfs_release_path(p);
2822 					goto again;
2823 				}
2824 
2825 				btrfs_set_path_blocking(p);
2826 				err = split_leaf(trans, root, key,
2827 						 p, ins_len, ret == 0);
2828 
2829 				BUG_ON(err > 0);
2830 				if (err) {
2831 					ret = err;
2832 					goto done;
2833 				}
2834 			}
2835 			if (!p->search_for_split)
2836 				unlock_up(p, level, lowest_unlock,
2837 					  min_write_lock_level, NULL);
2838 			goto done;
2839 		}
2840 		if (ret && slot > 0) {
2841 			dec = 1;
2842 			slot--;
2843 		}
2844 		p->slots[level] = slot;
2845 		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2846 					     &write_lock_level);
2847 		if (err == -EAGAIN)
2848 			goto again;
2849 		if (err) {
2850 			ret = err;
2851 			goto done;
2852 		}
2853 		b = p->nodes[level];
2854 		slot = p->slots[level];
2855 
2856 		/*
2857 		 * Slot 0 is special, if we change the key we have to update
2858 		 * the parent pointer which means we must have a write lock on
2859 		 * the parent
2860 		 */
2861 		if (slot == 0 && ins_len && write_lock_level < level + 1) {
2862 			write_lock_level = level + 1;
2863 			btrfs_release_path(p);
2864 			goto again;
2865 		}
2866 
2867 		unlock_up(p, level, lowest_unlock, min_write_lock_level,
2868 			  &write_lock_level);
2869 
2870 		if (level == lowest_level) {
2871 			if (dec)
2872 				p->slots[level]++;
2873 			goto done;
2874 		}
2875 
2876 		err = read_block_for_search(root, p, &b, level, slot, key);
2877 		if (err == -EAGAIN)
2878 			goto again;
2879 		if (err) {
2880 			ret = err;
2881 			goto done;
2882 		}
2883 
2884 		if (!p->skip_locking) {
2885 			level = btrfs_header_level(b);
2886 			if (level <= write_lock_level) {
2887 				if (!btrfs_try_tree_write_lock(b)) {
2888 					btrfs_set_path_blocking(p);
2889 					btrfs_tree_lock(b);
2890 				}
2891 				p->locks[level] = BTRFS_WRITE_LOCK;
2892 			} else {
2893 				if (!btrfs_tree_read_lock_atomic(b)) {
2894 					btrfs_set_path_blocking(p);
2895 					__btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
2896 							       p->recurse);
2897 				}
2898 				p->locks[level] = BTRFS_READ_LOCK;
2899 			}
2900 			p->nodes[level] = b;
2901 		}
2902 	}
2903 	ret = 1;
2904 done:
2905 	/*
2906 	 * we don't really know what they plan on doing with the path
2907 	 * from here on, so for now just mark it as blocking
2908 	 */
2909 	if (!p->leave_spinning)
2910 		btrfs_set_path_blocking(p);
2911 	if (ret < 0 && !p->skip_release_on_error)
2912 		btrfs_release_path(p);
2913 	return ret;
2914 }
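
/*
 * Usage sketch for btrfs_search_slot() (illustration only, not a callsite
 * in this file): a plain read-only lookup, distinguishing the "found" and
 * "insertion point" return values documented above.
 */
static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* NULL transaction, ins_len == 0 and cow == 0: nothing is modified */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* exact match: the item is at path->nodes[0], path->slots[0] */
	} else if (ret > 0) {
		/* not found: path->slots[0] is where the key would be inserted */
	}
	btrfs_free_path(path);
	return ret;
}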
2915 
2916 /*
2917  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2918  * current state of the tree together with the operations recorded in the tree
2919  * modification log to search for the key in a previous version of this tree, as
2920  * denoted by the time_seq parameter.
2921  *
2922  * Naturally, there is no support for insert, delete or cow operations.
2923  *
2924  * The resulting path and return value will be set up as if we called
2925  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2926  */
2927 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2928 			  struct btrfs_path *p, u64 time_seq)
2929 {
2930 	struct btrfs_fs_info *fs_info = root->fs_info;
2931 	struct extent_buffer *b;
2932 	int slot;
2933 	int ret;
2934 	int err;
2935 	int level;
2936 	int lowest_unlock = 1;
2937 	u8 lowest_level = 0;
2938 
2939 	lowest_level = p->lowest_level;
2940 	WARN_ON(p->nodes[0] != NULL);
2941 
2942 	if (p->search_commit_root) {
2943 		BUG_ON(time_seq);
2944 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2945 	}
2946 
2947 again:
2948 	b = get_old_root(root, time_seq);
2949 	if (!b) {
2950 		ret = -EIO;
2951 		goto done;
2952 	}
2953 	level = btrfs_header_level(b);
2954 	p->locks[level] = BTRFS_READ_LOCK;
2955 
2956 	while (b) {
2957 		int dec = 0;
2958 
2959 		level = btrfs_header_level(b);
2960 		p->nodes[level] = b;
2961 
2962 		/*
2963 		 * we have a lock on b and as long as we aren't changing
2964 		 * the tree, there is no way for the items in b to change.
2965 		 * It is safe to drop the lock on our parent before we
2966 		 * go through the expensive btree search on b.
2967 		 */
2968 		btrfs_unlock_up_safe(p, level + 1);
2969 
2970 		ret = btrfs_bin_search(b, key, &slot);
2971 		if (ret < 0)
2972 			goto done;
2973 
2974 		if (level == 0) {
2975 			p->slots[level] = slot;
2976 			unlock_up(p, level, lowest_unlock, 0, NULL);
2977 			goto done;
2978 		}
2979 
2980 		if (ret && slot > 0) {
2981 			dec = 1;
2982 			slot--;
2983 		}
2984 		p->slots[level] = slot;
2985 		unlock_up(p, level, lowest_unlock, 0, NULL);
2986 
2987 		if (level == lowest_level) {
2988 			if (dec)
2989 				p->slots[level]++;
2990 			goto done;
2991 		}
2992 
2993 		err = read_block_for_search(root, p, &b, level, slot, key);
2994 		if (err == -EAGAIN)
2995 			goto again;
2996 		if (err) {
2997 			ret = err;
2998 			goto done;
2999 		}
3000 
3001 		level = btrfs_header_level(b);
3002 		if (!btrfs_tree_read_lock_atomic(b)) {
3003 			btrfs_set_path_blocking(p);
3004 			btrfs_tree_read_lock(b);
3005 		}
3006 		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3007 		if (!b) {
3008 			ret = -ENOMEM;
3009 			goto done;
3010 		}
3011 		p->locks[level] = BTRFS_READ_LOCK;
3012 		p->nodes[level] = b;
3013 	}
3014 	ret = 1;
3015 done:
3016 	if (!p->leave_spinning)
3017 		btrfs_set_path_blocking(p);
3018 	if (ret < 0)
3019 		btrfs_release_path(p);
3020 
3021 	return ret;
3022 }
3023 
3024 /*
3025  * helper to use instead of btrfs_search_slot() if no exact match is needed but
3026  * instead the next or previous item should be returned.
3027  * When find_higher is true, the next higher item is returned, the next lower
3028  * otherwise.
3029  * When return_any and find_higher are both true, and no higher item is found,
3030  * return the next lower instead.
3031  * When return_any is true and find_higher is false, and no lower item is found,
3032  * return the next higher instead.
3033  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3034  * < 0 on error
3035  */
3036 int btrfs_search_slot_for_read(struct btrfs_root *root,
3037 			       const struct btrfs_key *key,
3038 			       struct btrfs_path *p, int find_higher,
3039 			       int return_any)
3040 {
3041 	int ret;
3042 	struct extent_buffer *leaf;
3043 
3044 again:
3045 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3046 	if (ret <= 0)
3047 		return ret;
3048 	/*
3049 	 * a return value of 1 means the path is at the position where the
3050 	 * item should be inserted. Normally this is the next bigger item,
3051 	 * but in case the previous item is the last in a leaf, path points
3052 	 * to the first free slot in the previous leaf, i.e. at an invalid
3053 	 * item.
3054 	 */
3055 	leaf = p->nodes[0];
3056 
3057 	if (find_higher) {
3058 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3059 			ret = btrfs_next_leaf(root, p);
3060 			if (ret <= 0)
3061 				return ret;
3062 			if (!return_any)
3063 				return 1;
3064 			/*
3065 			 * no higher item found, return the next
3066 			 * lower instead
3067 			 */
3068 			return_any = 0;
3069 			find_higher = 0;
3070 			btrfs_release_path(p);
3071 			goto again;
3072 		}
3073 	} else {
3074 		if (p->slots[0] == 0) {
3075 			ret = btrfs_prev_leaf(root, p);
3076 			if (ret < 0)
3077 				return ret;
3078 			if (!ret) {
3079 				leaf = p->nodes[0];
3080 				if (p->slots[0] == btrfs_header_nritems(leaf))
3081 					p->slots[0]--;
3082 				return 0;
3083 			}
3084 			if (!return_any)
3085 				return 1;
3086 			/*
3087 			 * no lower item found, return the next
3088 			 * higher instead
3089 			 */
3090 			return_any = 0;
3091 			find_higher = 1;
3092 			btrfs_release_path(p);
3093 			goto again;
3094 		} else {
3095 			--p->slots[0];
3096 		}
3097 	}
3098 	return 0;
3099 }
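
/*
 * Usage sketch for btrfs_search_slot_for_read() (illustration only, not a
 * callsite in this file): position the path at the first item at or after
 * @key, falling back to the closest lower item if nothing higher exists.
 */
static int example_find_nearest(struct btrfs_root *root,
				struct btrfs_key *key,
				struct btrfs_path *path)
{
	int ret;

	ret = btrfs_search_slot_for_read(root, key, path, 1 /* find_higher */,
					 1 /* return_any */);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
	return ret;	/* 0: item found, 1: tree empty, < 0: error */
}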
3100 
3101 /*
3102  * adjust the pointers going up the tree, starting at level
3103  * making sure the right key of each node points to 'key'.
3104  * This is used after shifting pointers to the left, so it stops
3105  * fixing up pointers when a given leaf/node is not in slot 0 of the
3106  * higher levels
3107  */
3109 static void fixup_low_keys(struct btrfs_path *path,
3110 			   struct btrfs_disk_key *key, int level)
3111 {
3112 	int i;
3113 	struct extent_buffer *t;
3114 	int ret;
3115 
3116 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3117 		int tslot = path->slots[i];
3118 
3119 		if (!path->nodes[i])
3120 			break;
3121 		t = path->nodes[i];
3122 		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3123 				GFP_ATOMIC);
3124 		BUG_ON(ret < 0);
3125 		btrfs_set_node_key(t, key, tslot);
3126 		btrfs_mark_buffer_dirty(path->nodes[i]);
3127 		if (tslot != 0)
3128 			break;
3129 	}
3130 }
3131 
3132 /*
3133  * update item key.
3134  *
3135  * This function isn't completely safe.  It is the caller's responsibility
3136  * to ensure that the new key won't break the key order.
3137  */
3138 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3139 			     struct btrfs_path *path,
3140 			     const struct btrfs_key *new_key)
3141 {
3142 	struct btrfs_disk_key disk_key;
3143 	struct extent_buffer *eb;
3144 	int slot;
3145 
3146 	eb = path->nodes[0];
3147 	slot = path->slots[0];
3148 	if (slot > 0) {
3149 		btrfs_item_key(eb, &disk_key, slot - 1);
3150 		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3151 			btrfs_crit(fs_info,
3152 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3153 				   slot, btrfs_disk_key_objectid(&disk_key),
3154 				   btrfs_disk_key_type(&disk_key),
3155 				   btrfs_disk_key_offset(&disk_key),
3156 				   new_key->objectid, new_key->type,
3157 				   new_key->offset);
3158 			btrfs_print_leaf(eb);
3159 			BUG();
3160 		}
3161 	}
3162 	if (slot < btrfs_header_nritems(eb) - 1) {
3163 		btrfs_item_key(eb, &disk_key, slot + 1);
3164 		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3165 			btrfs_crit(fs_info,
3166 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3167 				   slot, btrfs_disk_key_objectid(&disk_key),
3168 				   btrfs_disk_key_type(&disk_key),
3169 				   btrfs_disk_key_offset(&disk_key),
3170 				   new_key->objectid, new_key->type,
3171 				   new_key->offset);
3172 			btrfs_print_leaf(eb);
3173 			BUG();
3174 		}
3175 	}
3176 
3177 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3178 	btrfs_set_item_key(eb, &disk_key, slot);
3179 	btrfs_mark_buffer_dirty(eb);
3180 	if (slot == 0)
3181 		fixup_low_keys(path, &disk_key, 1);
3182 }
3183 
3184 /*
3185  * Check key order of two sibling extent buffers.
3186  *
3187  * Return true if something is wrong.
3188  * Return false if everything is fine.
3189  *
3190  * Tree-checker only works inside one tree block, thus the following
3191  * corruption cannot be detected by tree-checker:
3192  *
3193  * Leaf @left			| Leaf @right
3194  * --------------------------------------------------------------
3195  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
3196  *
3197  * Key f6 in leaf @left itself is valid, but not valid when the next
3198  * key in leaf @right is 7.
3199  * This can only be checked at tree block merge time.
3200  * And since tree checker has ensured all key order in each tree block
3201  * is correct, we only need to bother the last key of @left and the first
3202  * key of @right.
3203  */
3204 static bool check_sibling_keys(struct extent_buffer *left,
3205 			       struct extent_buffer *right)
3206 {
3207 	struct btrfs_key left_last;
3208 	struct btrfs_key right_first;
3209 	int level = btrfs_header_level(left);
3210 	int nr_left = btrfs_header_nritems(left);
3211 	int nr_right = btrfs_header_nritems(right);
3212 
3213 	/* No key to check in one of the tree blocks */
3214 	if (!nr_left || !nr_right)
3215 		return false;
3216 
3217 	if (level) {
3218 		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
3219 		btrfs_node_key_to_cpu(right, &right_first, 0);
3220 	} else {
3221 		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
3222 		btrfs_item_key_to_cpu(right, &right_first, 0);
3223 	}
3224 
3225 	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
3226 		btrfs_crit(left->fs_info,
3227 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
3228 			   left_last.objectid, left_last.type,
3229 			   left_last.offset, right_first.objectid,
3230 			   right_first.type, right_first.offset);
3231 		return true;
3232 	}
3233 	return false;
3234 }
3235 
3236 /*
3237  * try to push data from one node into the next node left in the
3238  * tree.
3239  *
3240  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3241  * error, and > 0 if there was no room in the left hand block.
3242  */
3243 static int push_node_left(struct btrfs_trans_handle *trans,
3244 			  struct extent_buffer *dst,
3245 			  struct extent_buffer *src, int empty)
3246 {
3247 	struct btrfs_fs_info *fs_info = trans->fs_info;
3248 	int push_items = 0;
3249 	int src_nritems;
3250 	int dst_nritems;
3251 	int ret = 0;
3252 
3253 	src_nritems = btrfs_header_nritems(src);
3254 	dst_nritems = btrfs_header_nritems(dst);
3255 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3256 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3257 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3258 
3259 	if (!empty && src_nritems <= 8)
3260 		return 1;
3261 
3262 	if (push_items <= 0)
3263 		return 1;
3264 
3265 	if (empty) {
3266 		push_items = min(src_nritems, push_items);
3267 		if (push_items < src_nritems) {
3268 			/* leave at least 8 pointers in the node if
3269 			 * we aren't going to empty it
3270 			 */
3271 			if (src_nritems - push_items < 8) {
3272 				if (push_items <= 8)
3273 					return 1;
3274 				push_items -= 8;
3275 			}
3276 		}
3277 	} else
3278 		push_items = min(src_nritems - 8, push_items);
3279 
3280 	/* dst is the left eb, src is the middle eb */
3281 	if (check_sibling_keys(dst, src)) {
3282 		ret = -EUCLEAN;
3283 		btrfs_abort_transaction(trans, ret);
3284 		return ret;
3285 	}
3286 	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3287 	if (ret) {
3288 		btrfs_abort_transaction(trans, ret);
3289 		return ret;
3290 	}
3291 	copy_extent_buffer(dst, src,
3292 			   btrfs_node_key_ptr_offset(dst_nritems),
3293 			   btrfs_node_key_ptr_offset(0),
3294 			   push_items * sizeof(struct btrfs_key_ptr));
3295 
3296 	if (push_items < src_nritems) {
3297 		/*
3298 		 * Don't call tree_mod_log_insert_move here, key removal was
3299 		 * already fully logged by tree_mod_log_eb_copy above.
3300 		 */
3301 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3302 				      btrfs_node_key_ptr_offset(push_items),
3303 				      (src_nritems - push_items) *
3304 				      sizeof(struct btrfs_key_ptr));
3305 	}
3306 	btrfs_set_header_nritems(src, src_nritems - push_items);
3307 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3308 	btrfs_mark_buffer_dirty(src);
3309 	btrfs_mark_buffer_dirty(dst);
3310 
3311 	return ret;
3312 }
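
/*
 * A minimal model of the push budgeting above for the non-empty case
 * (illustration only, not part of the original source): given the free
 * slots in the destination and the source item count, this computes how
 * many pointers would move while leaving at least 8 behind in the source.
 */
static int model_push_left_items(int dst_free_slots, int src_nritems)
{
	int push_items = dst_free_slots;

	if (src_nritems <= 8 || push_items <= 0)
		return 0;	/* source too small, or no room on the left */
	if (push_items > src_nritems - 8)
		push_items = src_nritems - 8;	/* keep >= 8 in the source */
	return push_items;
}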
3313 
3314 /*
3315  * try to push data from one node into the next node right in the
3316  * tree.
3317  *
3318  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3319  * error, and > 0 if there was no room in the right hand block.
3320  *
3321  * this will only push up to 1/2 the contents of the left node over
3322  */
3323 static int balance_node_right(struct btrfs_trans_handle *trans,
3324 			      struct extent_buffer *dst,
3325 			      struct extent_buffer *src)
3326 {
3327 	struct btrfs_fs_info *fs_info = trans->fs_info;
3328 	int push_items = 0;
3329 	int max_push;
3330 	int src_nritems;
3331 	int dst_nritems;
3332 	int ret = 0;
3333 
3334 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3335 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3336 
3337 	src_nritems = btrfs_header_nritems(src);
3338 	dst_nritems = btrfs_header_nritems(dst);
3339 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3340 	if (push_items <= 0)
3341 		return 1;
3342 
3343 	if (src_nritems < 4)
3344 		return 1;
3345 
3346 	max_push = src_nritems / 2 + 1;
3347 	/* don't try to empty the node */
3348 	if (max_push >= src_nritems)
3349 		return 1;
3350 
3351 	if (max_push < push_items)
3352 		push_items = max_push;
3353 
3354 	/* dst is the right eb, src is the middle eb */
3355 	if (check_sibling_keys(src, dst)) {
3356 		ret = -EUCLEAN;
3357 		btrfs_abort_transaction(trans, ret);
3358 		return ret;
3359 	}
3360 	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3361 	BUG_ON(ret < 0);
3362 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3363 				      btrfs_node_key_ptr_offset(0),
3364 				      (dst_nritems) *
3365 				      sizeof(struct btrfs_key_ptr));
3366 
3367 	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3368 				   push_items);
3369 	if (ret) {
3370 		btrfs_abort_transaction(trans, ret);
3371 		return ret;
3372 	}
3373 	copy_extent_buffer(dst, src,
3374 			   btrfs_node_key_ptr_offset(0),
3375 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3376 			   push_items * sizeof(struct btrfs_key_ptr));
3377 
3378 	btrfs_set_header_nritems(src, src_nritems - push_items);
3379 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3380 
3381 	btrfs_mark_buffer_dirty(src);
3382 	btrfs_mark_buffer_dirty(dst);
3383 
3384 	return ret;
3385 }
3386 
3387 /*
3388  * helper function to insert a new root level in the tree.
3389  * A new node is allocated, and a single item is inserted to
3390  * point to the existing root
3391  *
3392  * returns zero on success or < 0 on failure.
3393  */
3394 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3395 			   struct btrfs_root *root,
3396 			   struct btrfs_path *path, int level)
3397 {
3398 	struct btrfs_fs_info *fs_info = root->fs_info;
3399 	u64 lower_gen;
3400 	struct extent_buffer *lower;
3401 	struct extent_buffer *c;
3402 	struct extent_buffer *old;
3403 	struct btrfs_disk_key lower_key;
3404 	int ret;
3405 
3406 	BUG_ON(path->nodes[level]);
3407 	BUG_ON(path->nodes[level-1] != root->node);
3408 
3409 	lower = path->nodes[level-1];
3410 	if (level == 1)
3411 		btrfs_item_key(lower, &lower_key, 0);
3412 	else
3413 		btrfs_node_key(lower, &lower_key, 0);
3414 
3415 	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3416 					 root->node->start, 0,
3417 					 BTRFS_NESTING_NEW_ROOT);
3418 	if (IS_ERR(c))
3419 		return PTR_ERR(c);
3420 
3421 	root_add_used(root, fs_info->nodesize);
3422 
3423 	btrfs_set_header_nritems(c, 1);
3424 	btrfs_set_node_key(c, &lower_key, 0);
3425 	btrfs_set_node_blockptr(c, 0, lower->start);
3426 	lower_gen = btrfs_header_generation(lower);
3427 	WARN_ON(lower_gen != trans->transid);
3428 
3429 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3430 
3431 	btrfs_mark_buffer_dirty(c);
3432 
3433 	old = root->node;
3434 	ret = tree_mod_log_insert_root(root->node, c, 0);
3435 	BUG_ON(ret < 0);
3436 	rcu_assign_pointer(root->node, c);
3437 
3438 	/* the super has an extra ref to root->node */
3439 	free_extent_buffer(old);
3440 
3441 	add_root_to_dirty_list(root);
3442 	atomic_inc(&c->refs);
3443 	path->nodes[level] = c;
3444 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3445 	path->slots[level] = 0;
3446 	return 0;
3447 }
3448 
3449 /*
3450  * worker function to insert a single pointer in a node.
3451  * the node should have enough room for the pointer already
3452  *
3453  * slot and level indicate where you want the key to go, and
3454  * bytenr is the block the key points to.
3455  */
3456 static void insert_ptr(struct btrfs_trans_handle *trans,
3457 		       struct btrfs_path *path,
3458 		       struct btrfs_disk_key *key, u64 bytenr,
3459 		       int slot, int level)
3460 {
3461 	struct extent_buffer *lower;
3462 	int nritems;
3463 	int ret;
3464 
3465 	BUG_ON(!path->nodes[level]);
3466 	btrfs_assert_tree_locked(path->nodes[level]);
3467 	lower = path->nodes[level];
3468 	nritems = btrfs_header_nritems(lower);
3469 	BUG_ON(slot > nritems);
3470 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3471 	if (slot != nritems) {
3472 		if (level) {
3473 			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3474 					nritems - slot);
3475 			BUG_ON(ret < 0);
3476 		}
3477 		memmove_extent_buffer(lower,
3478 			      btrfs_node_key_ptr_offset(slot + 1),
3479 			      btrfs_node_key_ptr_offset(slot),
3480 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3481 	}
3482 	if (level) {
3483 		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3484 				GFP_NOFS);
3485 		BUG_ON(ret < 0);
3486 	}
3487 	btrfs_set_node_key(lower, key, slot);
3488 	btrfs_set_node_blockptr(lower, slot, bytenr);
3489 	WARN_ON(trans->transid == 0);
3490 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3491 	btrfs_set_header_nritems(lower, nritems + 1);
3492 	btrfs_mark_buffer_dirty(lower);
3493 }
3494 
3495 /*
3496  * split the node at the specified level in path in two.
3497  * The path is corrected to point to the appropriate node after the split
3498  *
3499  * Before splitting this tries to make some room in the node by pushing
3500  * left and right, if either one works, it returns right away.
3501  *
3502  * returns 0 on success and < 0 on failure
3503  */
3504 static noinline int split_node(struct btrfs_trans_handle *trans,
3505 			       struct btrfs_root *root,
3506 			       struct btrfs_path *path, int level)
3507 {
3508 	struct btrfs_fs_info *fs_info = root->fs_info;
3509 	struct extent_buffer *c;
3510 	struct extent_buffer *split;
3511 	struct btrfs_disk_key disk_key;
3512 	int mid;
3513 	int ret;
3514 	u32 c_nritems;
3515 
3516 	c = path->nodes[level];
3517 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3518 	if (c == root->node) {
3519 		/*
3520 		 * trying to split the root, let's make a new one
3521 		 *
3522 		 * tree mod log: we don't log removal of the old root in
3523 		 * insert_new_root, because that root buffer will be kept as a
3524 		 * normal node. We are going to log removal of half of the
3525 		 * elements below with tree_mod_log_eb_copy. We're holding a
3526 		 * tree lock on the buffer, which is why we cannot race with
3527 		 * other tree_mod_log users.
3528 		 */
3529 		ret = insert_new_root(trans, root, path, level + 1);
3530 		if (ret)
3531 			return ret;
3532 	} else {
3533 		ret = push_nodes_for_insert(trans, root, path, level);
3534 		c = path->nodes[level];
3535 		if (!ret && btrfs_header_nritems(c) <
3536 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3537 			return 0;
3538 		if (ret < 0)
3539 			return ret;
3540 	}
3541 
3542 	c_nritems = btrfs_header_nritems(c);
3543 	mid = (c_nritems + 1) / 2;
3544 	btrfs_node_key(c, &disk_key, mid);
3545 
3546 	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3547 					     c->start, 0, BTRFS_NESTING_SPLIT);
3548 	if (IS_ERR(split))
3549 		return PTR_ERR(split);
3550 
3551 	root_add_used(root, fs_info->nodesize);
3552 	ASSERT(btrfs_header_level(c) == level);
3553 
3554 	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3555 	if (ret) {
3556 		btrfs_abort_transaction(trans, ret);
3557 		return ret;
3558 	}
3559 	copy_extent_buffer(split, c,
3560 			   btrfs_node_key_ptr_offset(0),
3561 			   btrfs_node_key_ptr_offset(mid),
3562 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3563 	btrfs_set_header_nritems(split, c_nritems - mid);
3564 	btrfs_set_header_nritems(c, mid);
3565 	ret = 0;
3566 
3567 	btrfs_mark_buffer_dirty(c);
3568 	btrfs_mark_buffer_dirty(split);
3569 
3570 	insert_ptr(trans, path, &disk_key, split->start,
3571 		   path->slots[level + 1] + 1, level + 1);
3572 
3573 	if (path->slots[level] >= mid) {
3574 		path->slots[level] -= mid;
3575 		btrfs_tree_unlock(c);
3576 		free_extent_buffer(c);
3577 		path->nodes[level] = split;
3578 		path->slots[level + 1] += 1;
3579 	} else {
3580 		btrfs_tree_unlock(split);
3581 		free_extent_buffer(split);
3582 	}
3583 	return ret;
3584 }
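
/*
 * Worked example of the split point above (illustrative numbers): with
 * c_nritems == 121, mid == (121 + 1) / 2 == 61, so the original node keeps
 * the first 61 pointers and the new 'split' node receives the other 60.
 */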
3585 
3586 /*
3587  * how many bytes are required to store the items in a leaf.  start
3588  * and nr indicate which items in the leaf to check.  This totals up the
3589  * space used both by the item structs and the item data
3590  */
3591 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3592 {
3593 	struct btrfs_item *start_item;
3594 	struct btrfs_item *end_item;
3595 	int data_len;
3596 	int nritems = btrfs_header_nritems(l);
3597 	int end = min(nritems, start + nr) - 1;
3598 
3599 	if (!nr)
3600 		return 0;
3601 	start_item = btrfs_item_nr(start);
3602 	end_item = btrfs_item_nr(end);
3603 	data_len = btrfs_item_offset(l, start_item) +
3604 		   btrfs_item_size(l, start_item);
3605 	data_len = data_len - btrfs_item_offset(l, end_item);
3606 	data_len += sizeof(struct btrfs_item) * nr;
3607 	WARN_ON(data_len < 0);
3608 	return data_len;
3609 }
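
/*
 * Worked example of the accounting above (illustrative numbers): for three
 * items carrying 100, 200 and 300 bytes of data, leaf_space_used(l, 0, 3)
 * is 600 + 3 * sizeof(struct btrfs_item): the packed item data at the back
 * of the leaf plus one item header per item at the front.
 */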
3610 
3611 /*
3612  * The space between the end of the leaf items and
3613  * the start of the leaf data.  IOW, how much room
3614  * the leaf has left for both items and data
3615  */
3616 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3617 {
3618 	struct btrfs_fs_info *fs_info = leaf->fs_info;
3619 	int nritems = btrfs_header_nritems(leaf);
3620 	int ret;
3621 
3622 	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3623 	if (ret < 0) {
3624 		btrfs_crit(fs_info,
3625 			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3626 			   ret,
3627 			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3628 			   leaf_space_used(leaf, 0, nritems), nritems);
3629 	}
3630 	return ret;
3631 }
3632 
3633 /*
3634  * min slot controls the lowest index we're willing to push to the
3635  * right.  We'll push up to and including min_slot, but no lower
3636  */
3637 static noinline int __push_leaf_right(struct btrfs_path *path,
3638 				      int data_size, int empty,
3639 				      struct extent_buffer *right,
3640 				      int free_space, u32 left_nritems,
3641 				      u32 min_slot)
3642 {
3643 	struct btrfs_fs_info *fs_info = right->fs_info;
3644 	struct extent_buffer *left = path->nodes[0];
3645 	struct extent_buffer *upper = path->nodes[1];
3646 	struct btrfs_map_token token;
3647 	struct btrfs_disk_key disk_key;
3648 	int slot;
3649 	u32 i;
3650 	int push_space = 0;
3651 	int push_items = 0;
3652 	struct btrfs_item *item;
3653 	u32 nr;
3654 	u32 right_nritems;
3655 	u32 data_end;
3656 	u32 this_item_size;
3657 
3658 	if (empty)
3659 		nr = 0;
3660 	else
3661 		nr = max_t(u32, 1, min_slot);
3662 
3663 	if (path->slots[0] >= left_nritems)
3664 		push_space += data_size;
3665 
3666 	slot = path->slots[1];
3667 	i = left_nritems - 1;
3668 	while (i >= nr) {
3669 		item = btrfs_item_nr(i);
3670 
3671 		if (!empty && push_items > 0) {
3672 			if (path->slots[0] > i)
3673 				break;
3674 			if (path->slots[0] == i) {
3675 				int space = btrfs_leaf_free_space(left);
3676 
3677 				if (space + push_space * 2 > free_space)
3678 					break;
3679 			}
3680 		}
3681 
3682 		if (path->slots[0] == i)
3683 			push_space += data_size;
3684 
3685 		this_item_size = btrfs_item_size(left, item);
3686 		if (this_item_size + sizeof(*item) + push_space > free_space)
3687 			break;
3688 
3689 		push_items++;
3690 		push_space += this_item_size + sizeof(*item);
3691 		if (i == 0)
3692 			break;
3693 		i--;
3694 	}
3695 
3696 	if (push_items == 0)
3697 		goto out_unlock;
3698 
3699 	WARN_ON(!empty && push_items == left_nritems);
3700 
3701 	/* push left to right */
3702 	right_nritems = btrfs_header_nritems(right);
3703 
3704 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3705 	push_space -= leaf_data_end(left);
3706 
3707 	/* make room in the right data area */
3708 	data_end = leaf_data_end(right);
3709 	memmove_extent_buffer(right,
3710 			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3711 			      BTRFS_LEAF_DATA_OFFSET + data_end,
3712 			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3713 
3714 	/* copy from the left data area */
3715 	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3716 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3717 		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3718 		     push_space);
3719 
3720 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3721 			      btrfs_item_nr_offset(0),
3722 			      right_nritems * sizeof(struct btrfs_item));
3723 
3724 	/* copy the items from left to right */
3725 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3726 		   btrfs_item_nr_offset(left_nritems - push_items),
3727 		   push_items * sizeof(struct btrfs_item));
3728 
3729 	/* update the item pointers */
3730 	btrfs_init_map_token(&token, right);
3731 	right_nritems += push_items;
3732 	btrfs_set_header_nritems(right, right_nritems);
3733 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3734 	for (i = 0; i < right_nritems; i++) {
3735 		item = btrfs_item_nr(i);
3736 		push_space -= btrfs_token_item_size(&token, item);
3737 		btrfs_set_token_item_offset(&token, item, push_space);
3738 	}
3739 
3740 	left_nritems -= push_items;
3741 	btrfs_set_header_nritems(left, left_nritems);
3742 
3743 	if (left_nritems)
3744 		btrfs_mark_buffer_dirty(left);
3745 	else
3746 		btrfs_clean_tree_block(left);
3747 
3748 	btrfs_mark_buffer_dirty(right);
3749 
3750 	btrfs_item_key(right, &disk_key, 0);
3751 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3752 	btrfs_mark_buffer_dirty(upper);
3753 
3754 	/* then fixup the leaf pointer in the path */
3755 	if (path->slots[0] >= left_nritems) {
3756 		path->slots[0] -= left_nritems;
3757 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3758 			btrfs_clean_tree_block(path->nodes[0]);
3759 		btrfs_tree_unlock(path->nodes[0]);
3760 		free_extent_buffer(path->nodes[0]);
3761 		path->nodes[0] = right;
3762 		path->slots[1] += 1;
3763 	} else {
3764 		btrfs_tree_unlock(right);
3765 		free_extent_buffer(right);
3766 	}
3767 	return 0;
3768 
3769 out_unlock:
3770 	btrfs_tree_unlock(right);
3771 	free_extent_buffer(right);
3772 	return 1;
3773 }
3774 
3775 /*
3776  * push some data in the path leaf to the right, trying to free up at
3777  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3778  *
3779  * returns 1 if the push failed because the other node didn't have enough
3780  * room, 0 if everything worked out and < 0 if there were major errors.
3781  *
3782  * this will push starting from min_slot to the end of the leaf.  It won't
3783  * push any slot lower than min_slot
3784  */
3785 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3786 			   *root, struct btrfs_path *path,
3787 			   int min_data_size, int data_size,
3788 			   int empty, u32 min_slot)
3789 {
3790 	struct extent_buffer *left = path->nodes[0];
3791 	struct extent_buffer *right;
3792 	struct extent_buffer *upper;
3793 	int slot;
3794 	int free_space;
3795 	u32 left_nritems;
3796 	int ret;
3797 
3798 	if (!path->nodes[1])
3799 		return 1;
3800 
3801 	slot = path->slots[1];
3802 	upper = path->nodes[1];
3803 	if (slot >= btrfs_header_nritems(upper) - 1)
3804 		return 1;
3805 
3806 	btrfs_assert_tree_locked(path->nodes[1]);
3807 
3808 	right = btrfs_read_node_slot(upper, slot + 1);
3809 	/*
3810 	 * slot + 1 is not valid or we fail to read the right node,
3811 	 * no big deal, just return.
3812 	 */
3813 	if (IS_ERR(right))
3814 		return 1;
3815 
3816 	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3817 	btrfs_set_lock_blocking_write(right);
3818 
3819 	free_space = btrfs_leaf_free_space(right);
3820 	if (free_space < data_size)
3821 		goto out_unlock;
3822 
3823 	/* cow and double check */
3824 	ret = btrfs_cow_block(trans, root, right, upper,
3825 			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3826 	if (ret)
3827 		goto out_unlock;
3828 
3829 	free_space = btrfs_leaf_free_space(right);
3830 	if (free_space < data_size)
3831 		goto out_unlock;
3832 
3833 	left_nritems = btrfs_header_nritems(left);
3834 	if (left_nritems == 0)
3835 		goto out_unlock;
3836 
3837 	if (check_sibling_keys(left, right)) {
3838 		ret = -EUCLEAN;
3839 		btrfs_tree_unlock(right);
3840 		free_extent_buffer(right);
3841 		return ret;
3842 	}
3843 	if (path->slots[0] == left_nritems && !empty) {
3844 		/* Key greater than all keys in the leaf, right neighbor has
3845 		 * enough room for it and we're not emptying our leaf to delete
3846 		 * it, therefore use right neighbor to insert the new item and
3847 		 * no need to touch/dirty our left leaf. */
3848 		btrfs_tree_unlock(left);
3849 		free_extent_buffer(left);
3850 		path->nodes[0] = right;
3851 		path->slots[0] = 0;
3852 		path->slots[1]++;
3853 		return 0;
3854 	}
3855 
3856 	return __push_leaf_right(path, min_data_size, empty,
3857 				right, free_space, left_nritems, min_slot);
3858 out_unlock:
3859 	btrfs_tree_unlock(right);
3860 	free_extent_buffer(right);
3861 	return 1;
3862 }
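
/*
 * A sketch of how callers consume the 1/0/<0 convention above, modeled
 * on split_leaf() further down; try_make_room() is a hypothetical name,
 * not a function in this file.
 */
#if 0	/* standalone sketch, not built with the kernel */
static int try_make_room(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct btrfs_path *path,
			 int data_size)
{
	int wret;

	wret = push_leaf_right(trans, root, path, data_size, data_size,
			       0, 0);
	if (wret < 0)
		return wret;		/* hard error, propagate */
	if (wret == 0 &&
	    btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;		/* the push freed enough room */
	/* wret == 1: the right sibling was full, try the left one */
	wret = push_leaf_left(trans, root, path, data_size, data_size,
			      0, (u32)-1);
	if (wret < 0)
		return wret;
	return btrfs_leaf_free_space(path->nodes[0]) >= data_size ? 0 : 1;
}
#endif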
3863 
3864 /*
3865  * push some data in the path leaf to the left, trying to free up at
3866  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3867  *
3868  * max_slot can put a limit on how far into the leaf we'll push items.  The
3869  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3870  * items
3871  */
3872 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3873 				     int empty, struct extent_buffer *left,
3874 				     int free_space, u32 right_nritems,
3875 				     u32 max_slot)
3876 {
3877 	struct btrfs_fs_info *fs_info = left->fs_info;
3878 	struct btrfs_disk_key disk_key;
3879 	struct extent_buffer *right = path->nodes[0];
3880 	int i;
3881 	int push_space = 0;
3882 	int push_items = 0;
3883 	struct btrfs_item *item;
3884 	u32 old_left_nritems;
3885 	u32 nr;
3886 	int ret = 0;
3887 	u32 this_item_size;
3888 	u32 old_left_item_size;
3889 	struct btrfs_map_token token;
3890 
3891 	if (empty)
3892 		nr = min(right_nritems, max_slot);
3893 	else
3894 		nr = min(right_nritems - 1, max_slot);
3895 
3896 	for (i = 0; i < nr; i++) {
3897 		item = btrfs_item_nr(i);
3898 
3899 		if (!empty && push_items > 0) {
3900 			if (path->slots[0] < i)
3901 				break;
3902 			if (path->slots[0] == i) {
3903 				int space = btrfs_leaf_free_space(right);
3904 
3905 				if (space + push_space * 2 > free_space)
3906 					break;
3907 			}
3908 		}
3909 
3910 		if (path->slots[0] == i)
3911 			push_space += data_size;
3912 
3913 		this_item_size = btrfs_item_size(right, item);
3914 		if (this_item_size + sizeof(*item) + push_space > free_space)
3915 			break;
3916 
3917 		push_items++;
3918 		push_space += this_item_size + sizeof(*item);
3919 	}
3920 
3921 	if (push_items == 0) {
3922 		ret = 1;
3923 		goto out;
3924 	}
3925 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3926 
3927 	/* push data from right to left */
3928 	copy_extent_buffer(left, right,
3929 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3930 			   btrfs_item_nr_offset(0),
3931 			   push_items * sizeof(struct btrfs_item));
3932 
3933 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3934 		     btrfs_item_offset_nr(right, push_items - 1);
3935 
3936 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3937 		     leaf_data_end(left) - push_space,
3938 		     BTRFS_LEAF_DATA_OFFSET +
3939 		     btrfs_item_offset_nr(right, push_items - 1),
3940 		     push_space);
3941 	old_left_nritems = btrfs_header_nritems(left);
3942 	BUG_ON(old_left_nritems <= 0);
3943 
3944 	btrfs_init_map_token(&token, left);
3945 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3946 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3947 		u32 ioff;
3948 
3949 		item = btrfs_item_nr(i);
3950 
3951 		ioff = btrfs_token_item_offset(&token, item);
3952 		btrfs_set_token_item_offset(&token, item,
3953 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3954 	}
3955 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3956 
3957 	/* fixup right node */
3958 	if (push_items > right_nritems)
3959 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3960 		       right_nritems);
3961 
3962 	if (push_items < right_nritems) {
3963 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3964 						  leaf_data_end(right);
3965 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3966 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3967 				      BTRFS_LEAF_DATA_OFFSET +
3968 				      leaf_data_end(right), push_space);
3969 
3970 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3971 			      btrfs_item_nr_offset(push_items),
3972 			     (btrfs_header_nritems(right) - push_items) *
3973 			     sizeof(struct btrfs_item));
3974 	}
3975 
3976 	btrfs_init_map_token(&token, right);
3977 	right_nritems -= push_items;
3978 	btrfs_set_header_nritems(right, right_nritems);
3979 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3980 	for (i = 0; i < right_nritems; i++) {
3981 		item = btrfs_item_nr(i);
3982 
3983 		push_space = push_space - btrfs_token_item_size(&token, item);
3984 		btrfs_set_token_item_offset(&token, item, push_space);
3985 	}
3986 
3987 	btrfs_mark_buffer_dirty(left);
3988 	if (right_nritems)
3989 		btrfs_mark_buffer_dirty(right);
3990 	else
3991 		btrfs_clean_tree_block(right);
3992 
3993 	btrfs_item_key(right, &disk_key, 0);
3994 	fixup_low_keys(path, &disk_key, 1);
3995 
3996 	/* then fixup the leaf pointer in the path */
3997 	if (path->slots[0] < push_items) {
3998 		path->slots[0] += old_left_nritems;
3999 		btrfs_tree_unlock(path->nodes[0]);
4000 		free_extent_buffer(path->nodes[0]);
4001 		path->nodes[0] = left;
4002 		path->slots[1] -= 1;
4003 	} else {
4004 		btrfs_tree_unlock(left);
4005 		free_extent_buffer(left);
4006 		path->slots[0] -= push_items;
4007 	}
4008 	BUG_ON(path->slots[0] < 0);
4009 	return ret;
4010 out:
4011 	btrfs_tree_unlock(left);
4012 	free_extent_buffer(left);
4013 	return ret;
4014 }
4015 
4016 /*
4017  * push some data in the path leaf to the left, trying to free up at
4018  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4019  *
4020  * max_slot can put a limit on how far into the leaf we'll push items.  The
4021  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4022  * items
4023  */
4024 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4025 			  *root, struct btrfs_path *path, int min_data_size,
4026 			  int data_size, int empty, u32 max_slot)
4027 {
4028 	struct extent_buffer *right = path->nodes[0];
4029 	struct extent_buffer *left;
4030 	int slot;
4031 	int free_space;
4032 	u32 right_nritems;
4033 	int ret = 0;
4034 
4035 	slot = path->slots[1];
4036 	if (slot == 0)
4037 		return 1;
4038 	if (!path->nodes[1])
4039 		return 1;
4040 
4041 	right_nritems = btrfs_header_nritems(right);
4042 	if (right_nritems == 0)
4043 		return 1;
4044 
4045 	btrfs_assert_tree_locked(path->nodes[1]);
4046 
4047 	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
4048 	/*
4049 	 * slot - 1 is not valid or we fail to read the left node,
4050 	 * no big deal, just return.
4051 	 */
4052 	if (IS_ERR(left))
4053 		return 1;
4054 
4055 	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
4056 	btrfs_set_lock_blocking_write(left);
4057 
4058 	free_space = btrfs_leaf_free_space(left);
4059 	if (free_space < data_size) {
4060 		ret = 1;
4061 		goto out;
4062 	}
4063 
4064 	/* cow and double check */
4065 	ret = btrfs_cow_block(trans, root, left,
4066 			      path->nodes[1], slot - 1, &left,
4067 			      BTRFS_NESTING_LEFT_COW);
4068 	if (ret) {
4069 		/* we hit -ENOSPC, but it isn't fatal here */
4070 		if (ret == -ENOSPC)
4071 			ret = 1;
4072 		goto out;
4073 	}
4074 
4075 	free_space = btrfs_leaf_free_space(left);
4076 	if (free_space < data_size) {
4077 		ret = 1;
4078 		goto out;
4079 	}
4080 
4081 	if (check_sibling_keys(left, right)) {
4082 		ret = -EUCLEAN;
4083 		goto out;
4084 	}
4085 	return __push_leaf_left(path, min_data_size,
4086 			       empty, left, free_space, right_nritems,
4087 			       max_slot);
4088 out:
4089 	btrfs_tree_unlock(left);
4090 	free_extent_buffer(left);
4091 	return ret;
4092 }
4093 
4094 /*
4095  * Helper for split_leaf(): copy the items from 'mid' onward out of leaf 'l'
4096  * into the new leaf 'right', and insert a pointer to 'right' in the parent.
4097  */
4098 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4099 				    struct btrfs_path *path,
4100 				    struct extent_buffer *l,
4101 				    struct extent_buffer *right,
4102 				    int slot, int mid, int nritems)
4103 {
4104 	struct btrfs_fs_info *fs_info = trans->fs_info;
4105 	int data_copy_size;
4106 	int rt_data_off;
4107 	int i;
4108 	struct btrfs_disk_key disk_key;
4109 	struct btrfs_map_token token;
4110 
4111 	nritems = nritems - mid;
4112 	btrfs_set_header_nritems(right, nritems);
4113 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
4114 
4115 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4116 			   btrfs_item_nr_offset(mid),
4117 			   nritems * sizeof(struct btrfs_item));
4118 
4119 	copy_extent_buffer(right, l,
4120 		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4121 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4122 		     leaf_data_end(l), data_copy_size);
4123 
4124 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4125 
4126 	btrfs_init_map_token(&token, right);
4127 	for (i = 0; i < nritems; i++) {
4128 		struct btrfs_item *item = btrfs_item_nr(i);
4129 		u32 ioff;
4130 
4131 		ioff = btrfs_token_item_offset(&token, item);
4132 		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
4133 	}
4134 
4135 	btrfs_set_header_nritems(l, mid);
4136 	btrfs_item_key(right, &disk_key, 0);
4137 	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4138 
4139 	btrfs_mark_buffer_dirty(right);
4140 	btrfs_mark_buffer_dirty(l);
4141 	BUG_ON(path->slots[0] != slot);
4142 
4143 	if (mid <= slot) {
4144 		btrfs_tree_unlock(path->nodes[0]);
4145 		free_extent_buffer(path->nodes[0]);
4146 		path->nodes[0] = right;
4147 		path->slots[0] -= mid;
4148 		path->slots[1] += 1;
4149 	} else {
4150 		btrfs_tree_unlock(right);
4151 		free_extent_buffer(right);
4152 	}
4153 
4154 	BUG_ON(path->slots[0] < 0);
4155 }
4156 
4157 /*
4158  * double splits happen when we need to insert a big item in the middle
4159  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4160  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4161  *          A                 B                 C
4162  *
4163  * We avoid this by trying to push the items on either side of our target
4164  * into the adjacent leaves.  If all goes well we can avoid the double split
4165  * completely.
4166  */
4167 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4168 					  struct btrfs_root *root,
4169 					  struct btrfs_path *path,
4170 					  int data_size)
4171 {
4172 	int ret;
4173 	int progress = 0;
4174 	int slot;
4175 	u32 nritems;
4176 	int space_needed = data_size;
4177 
4178 	slot = path->slots[0];
4179 	if (slot < btrfs_header_nritems(path->nodes[0]))
4180 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4181 
4182 	/*
4183 	 * try to push all the items after our slot into the
4184 	 * right leaf
4185 	 */
4186 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4187 	if (ret < 0)
4188 		return ret;
4189 
4190 	if (ret == 0)
4191 		progress++;
4192 
4193 	nritems = btrfs_header_nritems(path->nodes[0]);
4194 	/*
4195 	 * our goal is to get our slot at the start or end of a leaf.  If
4196 	 * we've done so we're done
4197 	 */
4198 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4199 		return 0;
4200 
4201 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4202 		return 0;
4203 
4204 	/* try to push all the items before our slot into the left leaf */
4205 	slot = path->slots[0];
4206 	space_needed = data_size;
4207 	if (slot > 0)
4208 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4209 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4210 	if (ret < 0)
4211 		return ret;
4212 
4213 	if (ret == 0)
4214 		progress++;
4215 
4216 	if (progress)
4217 		return 0;
4218 	return 1;
4219 }
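
/*
 * A toy with made-up numbers for the goal above: once everything after
 * the target slot has been pushed right, the slot lands at the end of
 * the now-smaller leaf, so one split (or none) is enough.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdio.h>

int main(void)
{
	int nritems = 20, slot = 10;	/* big item aimed mid-leaf */
	int pushed_right = 10;		/* items [10, 20) went right */

	nritems -= pushed_right;
	printf("slot %d of %d: %s split\n", slot, nritems,
	       (slot == 0 || slot == nritems) ? "single" : "double");
	return 0;
}
#endif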
4220 
4221 /*
4222  * split the path's leaf in two, making sure there is at least data_size
4223  * available for the resulting leaf level of the path.
4224  *
4225  * returns 0 if all went well and < 0 on failure.
4226  */
4227 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4228 			       struct btrfs_root *root,
4229 			       const struct btrfs_key *ins_key,
4230 			       struct btrfs_path *path, int data_size,
4231 			       int extend)
4232 {
4233 	struct btrfs_disk_key disk_key;
4234 	struct extent_buffer *l;
4235 	u32 nritems;
4236 	int mid;
4237 	int slot;
4238 	struct extent_buffer *right;
4239 	struct btrfs_fs_info *fs_info = root->fs_info;
4240 	int ret = 0;
4241 	int wret;
4242 	int split;
4243 	int num_doubles = 0;
4244 	int tried_avoid_double = 0;
4245 
4246 	l = path->nodes[0];
4247 	slot = path->slots[0];
4248 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4249 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4250 		return -EOVERFLOW;
4251 
4252 	/* first try to make some room by pushing left and right */
4253 	if (data_size && path->nodes[1]) {
4254 		int space_needed = data_size;
4255 
4256 		if (slot < btrfs_header_nritems(l))
4257 			space_needed -= btrfs_leaf_free_space(l);
4258 
4259 		wret = push_leaf_right(trans, root, path, space_needed,
4260 				       space_needed, 0, 0);
4261 		if (wret < 0)
4262 			return wret;
4263 		if (wret) {
4264 			space_needed = data_size;
4265 			if (slot > 0)
4266 				space_needed -= btrfs_leaf_free_space(l);
4267 			wret = push_leaf_left(trans, root, path, space_needed,
4268 					      space_needed, 0, (u32)-1);
4269 			if (wret < 0)
4270 				return wret;
4271 		}
4272 		l = path->nodes[0];
4273 
4274 		/* did the pushes work? */
4275 		if (btrfs_leaf_free_space(l) >= data_size)
4276 			return 0;
4277 	}
4278 
4279 	if (!path->nodes[1]) {
4280 		ret = insert_new_root(trans, root, path, 1);
4281 		if (ret)
4282 			return ret;
4283 	}
4284 again:
4285 	split = 1;
4286 	l = path->nodes[0];
4287 	slot = path->slots[0];
4288 	nritems = btrfs_header_nritems(l);
4289 	mid = (nritems + 1) / 2;
4290 
4291 	if (mid <= slot) {
4292 		if (nritems == 1 ||
4293 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4294 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4295 			if (slot >= nritems) {
4296 				split = 0;
4297 			} else {
4298 				mid = slot;
4299 				if (mid != nritems &&
4300 				    leaf_space_used(l, mid, nritems - mid) +
4301 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4302 					if (data_size && !tried_avoid_double)
4303 						goto push_for_double;
4304 					split = 2;
4305 				}
4306 			}
4307 		}
4308 	} else {
4309 		if (leaf_space_used(l, 0, mid) + data_size >
4310 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4311 			if (!extend && data_size && slot == 0) {
4312 				split = 0;
4313 			} else if ((extend || !data_size) && slot == 0) {
4314 				mid = 1;
4315 			} else {
4316 				mid = slot;
4317 				if (mid != nritems &&
4318 				    leaf_space_used(l, mid, nritems - mid) +
4319 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4320 					if (data_size && !tried_avoid_double)
4321 						goto push_for_double;
4322 					split = 2;
4323 				}
4324 			}
4325 		}
4326 	}
4327 
4328 	if (split == 0)
4329 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4330 	else
4331 		btrfs_item_key(l, &disk_key, mid);
4332 
4333 	/*
4334 	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
4335 	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
4336 	 * subclasses, which is 8 at the time of this patch, and we've maxed it
4337 	 * out.  In the future we could add a
4338 	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
4339 	 * use BTRFS_NESTING_NEW_ROOT.
4340 	 */
4341 	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4342 					     l->start, 0, num_doubles ?
4343 					     BTRFS_NESTING_NEW_ROOT :
4344 					     BTRFS_NESTING_SPLIT);
4345 	if (IS_ERR(right))
4346 		return PTR_ERR(right);
4347 
4348 	root_add_used(root, fs_info->nodesize);
4349 
4350 	if (split == 0) {
4351 		if (mid <= slot) {
4352 			btrfs_set_header_nritems(right, 0);
4353 			insert_ptr(trans, path, &disk_key,
4354 				   right->start, path->slots[1] + 1, 1);
4355 			btrfs_tree_unlock(path->nodes[0]);
4356 			free_extent_buffer(path->nodes[0]);
4357 			path->nodes[0] = right;
4358 			path->slots[0] = 0;
4359 			path->slots[1] += 1;
4360 		} else {
4361 			btrfs_set_header_nritems(right, 0);
4362 			insert_ptr(trans, path, &disk_key,
4363 				   right->start, path->slots[1], 1);
4364 			btrfs_tree_unlock(path->nodes[0]);
4365 			free_extent_buffer(path->nodes[0]);
4366 			path->nodes[0] = right;
4367 			path->slots[0] = 0;
4368 			if (path->slots[1] == 0)
4369 				fixup_low_keys(path, &disk_key, 1);
4370 		}
4371 		/*
4372 		 * We create a new leaf 'right' for the required ins_len and
4373 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4374 		 * the content of ins_len to 'right'.
4375 		 */
4376 		return ret;
4377 	}
4378 
4379 	copy_for_split(trans, path, l, right, slot, mid, nritems);
4380 
4381 	if (split == 2) {
4382 		BUG_ON(num_doubles != 0);
4383 		num_doubles++;
4384 		goto again;
4385 	}
4386 
4387 	return 0;
4388 
4389 push_for_double:
4390 	push_for_double_split(trans, root, path, data_size);
4391 	tried_avoid_double = 1;
4392 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4393 		return 0;
4394 	goto again;
4395 }
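
/*
 * A standalone model of the split point selection above, assuming every
 * item costs the same ITEM_COST bytes (the real code uses
 * leaf_space_used()) and skipping the push_for_double avoidance step.
 * Return values mirror 'split': 0 = new item goes into an empty leaf,
 * 1 = ordinary split at mid, 2 = double split.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdio.h>

#define LEAF_SIZE	4096
#define ITEM_COST	200

static int toy_split_kind(int nritems, int slot, int data_size, int extend)
{
	int mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    (nritems - mid) * ITEM_COST + data_size > LEAF_SIZE) {
			if (slot >= nritems)
				return 0;
			mid = slot;
			if (mid != nritems &&
			    (nritems - mid) * ITEM_COST + data_size >
			    LEAF_SIZE)
				return 2;
		}
	} else {
		if (mid * ITEM_COST + data_size > LEAF_SIZE) {
			if (!extend && data_size && slot == 0)
				return 0;
			if ((extend || !data_size) && slot == 0)
				mid = 1;
			else
				mid = slot;
			if (mid != nritems &&
			    (nritems - mid) * ITEM_COST + data_size >
			    LEAF_SIZE)
				return 2;
		}
	}
	return 1;
}

int main(void)
{
	/* a huge item aimed at the middle of a full leaf: double split */
	printf("%d\n", toy_split_kind(20, 10, 3900, 0));
	return 0;
}
#endif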
4396 
4397 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4398 					 struct btrfs_root *root,
4399 					 struct btrfs_path *path, int ins_len)
4400 {
4401 	struct btrfs_key key;
4402 	struct extent_buffer *leaf;
4403 	struct btrfs_file_extent_item *fi;
4404 	u64 extent_len = 0;
4405 	u32 item_size;
4406 	int ret;
4407 
4408 	leaf = path->nodes[0];
4409 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4410 
4411 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4412 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4413 
4414 	if (btrfs_leaf_free_space(leaf) >= ins_len)
4415 		return 0;
4416 
4417 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4418 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4419 		fi = btrfs_item_ptr(leaf, path->slots[0],
4420 				    struct btrfs_file_extent_item);
4421 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4422 	}
4423 	btrfs_release_path(path);
4424 
4425 	path->keep_locks = 1;
4426 	path->search_for_split = 1;
4427 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4428 	path->search_for_split = 0;
4429 	if (ret > 0)
4430 		ret = -EAGAIN;
4431 	if (ret < 0)
4432 		goto err;
4433 
4434 	ret = -EAGAIN;
4435 	leaf = path->nodes[0];
4436 	/* if our item isn't there, return now */
4437 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4438 		goto err;
4439 
4440 	/* the leaf has changed, it now has room.  return now */
4441 	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4442 		goto err;
4443 
4444 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4445 		fi = btrfs_item_ptr(leaf, path->slots[0],
4446 				    struct btrfs_file_extent_item);
4447 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4448 			goto err;
4449 	}
4450 
4451 	btrfs_set_path_blocking(path);
4452 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4453 	if (ret)
4454 		goto err;
4455 
4456 	path->keep_locks = 0;
4457 	btrfs_unlock_up_safe(path, 1);
4458 	return 0;
4459 err:
4460 	path->keep_locks = 0;
4461 	return ret;
4462 }
4463 
4464 static noinline int split_item(struct btrfs_path *path,
4465 			       const struct btrfs_key *new_key,
4466 			       unsigned long split_offset)
4467 {
4468 	struct extent_buffer *leaf;
4469 	struct btrfs_item *item;
4470 	struct btrfs_item *new_item;
4471 	int slot;
4472 	char *buf;
4473 	u32 nritems;
4474 	u32 item_size;
4475 	u32 orig_offset;
4476 	struct btrfs_disk_key disk_key;
4477 
4478 	leaf = path->nodes[0];
4479 	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4480 
4481 	btrfs_set_path_blocking(path);
4482 
4483 	item = btrfs_item_nr(path->slots[0]);
4484 	orig_offset = btrfs_item_offset(leaf, item);
4485 	item_size = btrfs_item_size(leaf, item);
4486 
4487 	buf = kmalloc(item_size, GFP_NOFS);
4488 	if (!buf)
4489 		return -ENOMEM;
4490 
4491 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4492 			    path->slots[0]), item_size);
4493 
4494 	slot = path->slots[0] + 1;
4495 	nritems = btrfs_header_nritems(leaf);
4496 	if (slot != nritems) {
4497 		/* shift the items */
4498 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4499 				btrfs_item_nr_offset(slot),
4500 				(nritems - slot) * sizeof(struct btrfs_item));
4501 	}
4502 
4503 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4504 	btrfs_set_item_key(leaf, &disk_key, slot);
4505 
4506 	new_item = btrfs_item_nr(slot);
4507 
4508 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4509 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4510 
4511 	btrfs_set_item_offset(leaf, item,
4512 			      orig_offset + item_size - split_offset);
4513 	btrfs_set_item_size(leaf, item, split_offset);
4514 
4515 	btrfs_set_header_nritems(leaf, nritems + 1);
4516 
4517 	/* write the data for the start of the original item */
4518 	write_extent_buffer(leaf, buf,
4519 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4520 			    split_offset);
4521 
4522 	/* write the data for the new item */
4523 	write_extent_buffer(leaf, buf + split_offset,
4524 			    btrfs_item_ptr_offset(leaf, slot),
4525 			    item_size - split_offset);
4526 	btrfs_mark_buffer_dirty(leaf);
4527 
4528 	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4529 	kfree(buf);
4530 	return 0;
4531 }
4532 
4533 /*
4534  * This function splits a single item into two items,
4535  * giving 'new_key' to the new item and splitting the
4536  * old one at split_offset (from the start of the item).
4537  *
4538  * The path may be released by this operation.  After
4539  * the split, the path is pointing to the old item.  The
4540  * new item is going to be in the same node as the old one.
4541  *
4542  * Note, the item being split must be small enough to live alone on
4543  * a tree block with room for one extra struct btrfs_item
4544  *
4545  * This allows us to split the item in place, keeping a lock on the
4546  * leaf the entire time.
4547  */
4548 int btrfs_split_item(struct btrfs_trans_handle *trans,
4549 		     struct btrfs_root *root,
4550 		     struct btrfs_path *path,
4551 		     const struct btrfs_key *new_key,
4552 		     unsigned long split_offset)
4553 {
4554 	int ret;
4555 	ret = setup_leaf_for_split(trans, root, path,
4556 				   sizeof(struct btrfs_item));
4557 	if (ret)
4558 		return ret;
4559 
4560 	ret = split_item(path, new_key, split_offset);
4561 	return ret;
4562 }
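
/*
 * The offset arithmetic of the in-place split as a userspace toy (all
 * numbers hypothetical): the second item keeps the tail of the data at
 * the original data offset, while the first keeps the head and moves
 * its offset up by the tail's length, matching split_item() above.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdio.h>

struct toy_item { unsigned offset; unsigned size; };

int main(void)
{
	struct toy_item orig = { 3000, 100 };	/* 100 bytes at 3000 */
	unsigned split_offset = 30;
	struct toy_item first, second;

	second.offset = orig.offset;			/* tail stays put */
	second.size = orig.size - split_offset;		/* 70 bytes */
	first.offset = orig.offset + orig.size - split_offset; /* 3070 */
	first.size = split_offset;			/* 30 bytes */

	/* prints "first: 3070+30, second: 3000+70" */
	printf("first: %u+%u, second: %u+%u\n",
	       first.offset, first.size, second.offset, second.size);
	return 0;
}
#endif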
4563 
4564 /*
4565  * This function duplicates an item, giving 'new_key' to the new item.
4566  * It guarantees both items live in the same tree leaf and the new item
4567  * is contiguous with the original item.
4568  *
4569  * This allows us to split a file extent in place, keeping a lock on the
4570  * leaf the entire time.
4571  */
4572 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4573 			 struct btrfs_root *root,
4574 			 struct btrfs_path *path,
4575 			 const struct btrfs_key *new_key)
4576 {
4577 	struct extent_buffer *leaf;
4578 	int ret;
4579 	u32 item_size;
4580 
4581 	leaf = path->nodes[0];
4582 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4583 	ret = setup_leaf_for_split(trans, root, path,
4584 				   item_size + sizeof(struct btrfs_item));
4585 	if (ret)
4586 		return ret;
4587 
4588 	path->slots[0]++;
4589 	setup_items_for_insert(root, path, new_key, &item_size, 1);
4590 	leaf = path->nodes[0];
4591 	memcpy_extent_buffer(leaf,
4592 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4593 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4594 			     item_size);
4595 	return 0;
4596 }
4597 
4598 /*
4599  * make the item pointed to by the path smaller.  new_size indicates
4600  * how small to make it, and from_end tells us if we just chop bytes
4601  * off the end of the item or if we shift the item to chop bytes off
4602  * the front.
4603  */
4604 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4605 {
4606 	int slot;
4607 	struct extent_buffer *leaf;
4608 	struct btrfs_item *item;
4609 	u32 nritems;
4610 	unsigned int data_end;
4611 	unsigned int old_data_start;
4612 	unsigned int old_size;
4613 	unsigned int size_diff;
4614 	int i;
4615 	struct btrfs_map_token token;
4616 
4617 	leaf = path->nodes[0];
4618 	slot = path->slots[0];
4619 
4620 	old_size = btrfs_item_size_nr(leaf, slot);
4621 	if (old_size == new_size)
4622 		return;
4623 
4624 	nritems = btrfs_header_nritems(leaf);
4625 	data_end = leaf_data_end(leaf);
4626 
4627 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4628 
4629 	size_diff = old_size - new_size;
4630 
4631 	BUG_ON(slot < 0);
4632 	BUG_ON(slot >= nritems);
4633 
4634 	/*
4635 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4636 	 */
4637 	/* first correct the data pointers */
4638 	btrfs_init_map_token(&token, leaf);
4639 	for (i = slot; i < nritems; i++) {
4640 		u32 ioff;
4641 		item = btrfs_item_nr(i);
4642 
4643 		ioff = btrfs_token_item_offset(&token, item);
4644 		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
4645 	}
4646 
4647 	/* shift the data */
4648 	if (from_end) {
4649 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4650 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4651 			      data_end, old_data_start + new_size - data_end);
4652 	} else {
4653 		struct btrfs_disk_key disk_key;
4654 		u64 offset;
4655 
4656 		btrfs_item_key(leaf, &disk_key, slot);
4657 
4658 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4659 			unsigned long ptr;
4660 			struct btrfs_file_extent_item *fi;
4661 
4662 			fi = btrfs_item_ptr(leaf, slot,
4663 					    struct btrfs_file_extent_item);
4664 			fi = (struct btrfs_file_extent_item *)(
4665 			     (unsigned long)fi - size_diff);
4666 
4667 			if (btrfs_file_extent_type(leaf, fi) ==
4668 			    BTRFS_FILE_EXTENT_INLINE) {
4669 				ptr = btrfs_item_ptr_offset(leaf, slot);
4670 				memmove_extent_buffer(leaf, ptr,
4671 				      (unsigned long)fi,
4672 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4673 			}
4674 		}
4675 
4676 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4677 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4678 			      data_end, old_data_start - data_end);
4679 
4680 		offset = btrfs_disk_key_offset(&disk_key);
4681 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4682 		btrfs_set_item_key(leaf, &disk_key, slot);
4683 		if (slot == 0)
4684 			fixup_low_keys(path, &disk_key, 1);
4685 	}
4686 
4687 	item = btrfs_item_nr(slot);
4688 	btrfs_set_item_size(leaf, item, new_size);
4689 	btrfs_mark_buffer_dirty(leaf);
4690 
4691 	if (btrfs_leaf_free_space(leaf) < 0) {
4692 		btrfs_print_leaf(leaf);
4693 		BUG();
4694 	}
4695 }
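
/*
 * The truncation offset math in isolation (hypothetical numbers): since
 * item data grows down from the end of the leaf, shrinking by size_diff
 * always moves the data offset up by size_diff; only the from_end == 0
 * case also advances the key offset so the key still names the first
 * byte that was kept.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdio.h>

int main(void)
{
	unsigned old_offset = 3000, old_size = 100, new_size = 60;
	unsigned size_diff = old_size - new_size;	/* 40 */
	unsigned long long key_offset = 8192;		/* hypothetical */

	/* both cases: the data slides toward the end of the leaf */
	printf("new data offset: %u\n", old_offset + size_diff); /* 3040 */
	/* from_end == 0 only: chop the front, advance the key */
	printf("new key offset: %llu\n", key_offset + size_diff); /* 8232 */
	return 0;
}
#endif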
4696 
4697 /*
4698  * make the item pointed to by the path bigger, data_size is the added size.
4699  */
4700 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4701 {
4702 	int slot;
4703 	struct extent_buffer *leaf;
4704 	struct btrfs_item *item;
4705 	u32 nritems;
4706 	unsigned int data_end;
4707 	unsigned int old_data;
4708 	unsigned int old_size;
4709 	int i;
4710 	struct btrfs_map_token token;
4711 
4712 	leaf = path->nodes[0];
4713 
4714 	nritems = btrfs_header_nritems(leaf);
4715 	data_end = leaf_data_end(leaf);
4716 
4717 	if (btrfs_leaf_free_space(leaf) < data_size) {
4718 		btrfs_print_leaf(leaf);
4719 		BUG();
4720 	}
4721 	slot = path->slots[0];
4722 	old_data = btrfs_item_end_nr(leaf, slot);
4723 
4724 	BUG_ON(slot < 0);
4725 	if (slot >= nritems) {
4726 		btrfs_print_leaf(leaf);
4727 		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4728 			   slot, nritems);
4729 		BUG();
4730 	}
4731 
4732 	/*
4733 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4734 	 */
4735 	/* first correct the data pointers */
4736 	btrfs_init_map_token(&token, leaf);
4737 	for (i = slot; i < nritems; i++) {
4738 		u32 ioff;
4739 		item = btrfs_item_nr(i);
4740 
4741 		ioff = btrfs_token_item_offset(&token, item);
4742 		btrfs_set_token_item_offset(&token, item, ioff - data_size);
4743 	}
4744 
4745 	/* shift the data */
4746 	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4747 		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4748 		      data_end, old_data - data_end);
4749 
4750 	data_end = old_data;
4751 	old_size = btrfs_item_size_nr(leaf, slot);
4752 	item = btrfs_item_nr(slot);
4753 	btrfs_set_item_size(leaf, item, old_size + data_size);
4754 	btrfs_mark_buffer_dirty(leaf);
4755 
4756 	if (btrfs_leaf_free_space(leaf) < 0) {
4757 		btrfs_print_leaf(leaf);
4758 		BUG();
4759 	}
4760 }
4761 
4762 /**
4763  * setup_items_for_insert - Helper called before inserting one or more items
4764  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
4765  * in a function that doesn't call btrfs_search_slot
4766  *
4767  * @root:	root we are inserting items to
4768  * @path:	points to the leaf/slot where we are going to insert new items
4769  * @cpu_key:	array of keys for items to be inserted
4770  * @data_size:	size of the body of each item we are going to insert
4771  * @nr:		size of @cpu_key/@data_size arrays
4772  */
4773 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4774 			    const struct btrfs_key *cpu_key, u32 *data_size,
4775 			    int nr)
4776 {
4777 	struct btrfs_fs_info *fs_info = root->fs_info;
4778 	struct btrfs_item *item;
4779 	int i;
4780 	u32 nritems;
4781 	unsigned int data_end;
4782 	struct btrfs_disk_key disk_key;
4783 	struct extent_buffer *leaf;
4784 	int slot;
4785 	struct btrfs_map_token token;
4786 	u32 total_size;
4787 	u32 total_data = 0;
4788 
4789 	for (i = 0; i < nr; i++)
4790 		total_data += data_size[i];
4791 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4792 
4793 	if (path->slots[0] == 0) {
4794 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4795 		fixup_low_keys(path, &disk_key, 1);
4796 	}
4797 	btrfs_unlock_up_safe(path, 1);
4798 
4799 	leaf = path->nodes[0];
4800 	slot = path->slots[0];
4801 
4802 	nritems = btrfs_header_nritems(leaf);
4803 	data_end = leaf_data_end(leaf);
4804 
4805 	if (btrfs_leaf_free_space(leaf) < total_size) {
4806 		btrfs_print_leaf(leaf);
4807 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
4808 			   total_size, btrfs_leaf_free_space(leaf));
4809 		BUG();
4810 	}
4811 
4812 	btrfs_init_map_token(&token, leaf);
4813 	if (slot != nritems) {
4814 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4815 
4816 		if (old_data < data_end) {
4817 			btrfs_print_leaf(leaf);
4818 			btrfs_crit(fs_info,
4819 		"item at slot %d with data offset %u beyond data end of leaf %u",
4820 				   slot, old_data, data_end);
4821 			BUG();
4822 		}
4823 		/*
4824 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4825 		 */
4826 		/* first correct the data pointers */
4827 		for (i = slot; i < nritems; i++) {
4828 			u32 ioff;
4829 
4830 			item = btrfs_item_nr(i);
4831 			ioff = btrfs_token_item_offset(&token, item);
4832 			btrfs_set_token_item_offset(&token, item,
4833 						    ioff - total_data);
4834 		}
4835 		/* shift the items */
4836 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4837 			      btrfs_item_nr_offset(slot),
4838 			      (nritems - slot) * sizeof(struct btrfs_item));
4839 
4840 		/* shift the data */
4841 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4842 			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4843 			      data_end, old_data - data_end);
4844 		data_end = old_data;
4845 	}
4846 
4847 	/* setup the item for the new data */
4848 	for (i = 0; i < nr; i++) {
4849 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4850 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4851 		item = btrfs_item_nr(slot + i);
4852 		data_end -= data_size[i];
4853 		btrfs_set_token_item_offset(&token, item, data_end);
4854 		btrfs_set_token_item_size(&token, item, data_size[i]);
4855 	}
4856 
4857 	btrfs_set_header_nritems(leaf, nritems + nr);
4858 	btrfs_mark_buffer_dirty(leaf);
4859 
4860 	if (btrfs_leaf_free_space(leaf) < 0) {
4861 		btrfs_print_leaf(leaf);
4862 		BUG();
4863 	}
4864 }
4865 
4866 /*
4867  * Given a key and some data, insert items into the tree.
4868  * This does all the path init required, making room in the tree if needed.
4869  */
4870 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4871 			    struct btrfs_root *root,
4872 			    struct btrfs_path *path,
4873 			    const struct btrfs_key *cpu_key, u32 *data_size,
4874 			    int nr)
4875 {
4876 	int ret = 0;
4877 	int slot;
4878 	int i;
4879 	u32 total_size = 0;
4880 	u32 total_data = 0;
4881 
4882 	for (i = 0; i < nr; i++)
4883 		total_data += data_size[i];
4884 
4885 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4886 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4887 	if (ret == 0)
4888 		return -EEXIST;
4889 	if (ret < 0)
4890 		return ret;
4891 
4892 	slot = path->slots[0];
4893 	BUG_ON(slot < 0);
4894 
4895 	setup_items_for_insert(root, path, cpu_key, data_size, nr);
4896 	return 0;
4897 }
4898 
4899 /*
4900  * Given a key and some data, insert an item into the tree.
4901  * This does all the path init required, making room in the tree if needed.
4902  */
4903 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4904 		      const struct btrfs_key *cpu_key, void *data,
4905 		      u32 data_size)
4906 {
4907 	int ret = 0;
4908 	struct btrfs_path *path;
4909 	struct extent_buffer *leaf;
4910 	unsigned long ptr;
4911 
4912 	path = btrfs_alloc_path();
4913 	if (!path)
4914 		return -ENOMEM;
4915 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4916 	if (!ret) {
4917 		leaf = path->nodes[0];
4918 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4919 		write_extent_buffer(leaf, data, ptr, data_size);
4920 		btrfs_mark_buffer_dirty(leaf);
4921 	}
4922 	btrfs_free_path(path);
4923 	return ret;
4924 }
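
/*
 * A sketch of a caller; the objectid, key type and payload are
 * hypothetical, chosen only to show the shape of the call.
 */
#if 0	/* standalone sketch, not built with the kernel */
static int insert_example(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_key key;
	u64 payload = 0;			/* hypothetical item body */

	key.objectid = 256;			/* hypothetical */
	key.type = BTRFS_TEMPORARY_ITEM_KEY;	/* hypothetical */
	key.offset = 0;
	return btrfs_insert_item(trans, root, &key, &payload,
				 sizeof(payload));
}
#endif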
4925 
4926 /*
4927  * delete the pointer from a given node.
4928  *
4929  * the tree should have been previously balanced so the deletion does not
4930  * empty a node.
4931  */
4932 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4933 		    int level, int slot)
4934 {
4935 	struct extent_buffer *parent = path->nodes[level];
4936 	u32 nritems;
4937 	int ret;
4938 
4939 	nritems = btrfs_header_nritems(parent);
4940 	if (slot != nritems - 1) {
4941 		if (level) {
4942 			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4943 					nritems - slot - 1);
4944 			BUG_ON(ret < 0);
4945 		}
4946 		memmove_extent_buffer(parent,
4947 			      btrfs_node_key_ptr_offset(slot),
4948 			      btrfs_node_key_ptr_offset(slot + 1),
4949 			      sizeof(struct btrfs_key_ptr) *
4950 			      (nritems - slot - 1));
4951 	} else if (level) {
4952 		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4953 				GFP_NOFS);
4954 		BUG_ON(ret < 0);
4955 	}
4956 
4957 	nritems--;
4958 	btrfs_set_header_nritems(parent, nritems);
4959 	if (nritems == 0 && parent == root->node) {
4960 		BUG_ON(btrfs_header_level(root->node) != 1);
4961 		/* just turn the root into a leaf and break */
4962 		btrfs_set_header_level(root->node, 0);
4963 	} else if (slot == 0) {
4964 		struct btrfs_disk_key disk_key;
4965 
4966 		btrfs_node_key(parent, &disk_key, 0);
4967 		fixup_low_keys(path, &disk_key, level + 1);
4968 	}
4969 	btrfs_mark_buffer_dirty(parent);
4970 }
4971 
4972 /*
4973  * a helper function to delete the leaf pointed to by path->slots[1] and
4974  * path->nodes[1].
4975  *
4976  * This deletes the pointer in path->nodes[1] and frees the leaf
4977  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4978  *
4979  * The path must have already been setup for deleting the leaf, including
4980  * all the proper balancing.  path->nodes[1] must be locked.
4981  */
4982 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4983 				    struct btrfs_root *root,
4984 				    struct btrfs_path *path,
4985 				    struct extent_buffer *leaf)
4986 {
4987 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4988 	del_ptr(root, path, 1, path->slots[1]);
4989 
4990 	/*
4991 	 * btrfs_free_extent is expensive, we want to make sure we
4992 	 * aren't holding any locks when we call it
4993 	 */
4994 	btrfs_unlock_up_safe(path, 0);
4995 
4996 	root_sub_used(root, leaf->len);
4997 
4998 	atomic_inc(&leaf->refs);
4999 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
5000 	free_extent_buffer_stale(leaf);
5001 }
5002 /*
5003  * delete the item at the leaf level in path.  If that empties
5004  * the leaf, remove it from the tree
5005  */
5006 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5007 		    struct btrfs_path *path, int slot, int nr)
5008 {
5009 	struct btrfs_fs_info *fs_info = root->fs_info;
5010 	struct extent_buffer *leaf;
5011 	struct btrfs_item *item;
5012 	u32 last_off;
5013 	u32 dsize = 0;
5014 	int ret = 0;
5015 	int wret;
5016 	int i;
5017 	u32 nritems;
5018 
5019 	leaf = path->nodes[0];
5020 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5021 
5022 	for (i = 0; i < nr; i++)
5023 		dsize += btrfs_item_size_nr(leaf, slot + i);
5024 
5025 	nritems = btrfs_header_nritems(leaf);
5026 
5027 	if (slot + nr != nritems) {
5028 		int data_end = leaf_data_end(leaf);
5029 		struct btrfs_map_token token;
5030 
5031 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5032 			      data_end + dsize,
5033 			      BTRFS_LEAF_DATA_OFFSET + data_end,
5034 			      last_off - data_end);
5035 
5036 		btrfs_init_map_token(&token, leaf);
5037 		for (i = slot + nr; i < nritems; i++) {
5038 			u32 ioff;
5039 
5040 			item = btrfs_item_nr(i);
5041 			ioff = btrfs_token_item_offset(&token, item);
5042 			btrfs_set_token_item_offset(&token, item, ioff + dsize);
5043 		}
5044 
5045 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5046 			      btrfs_item_nr_offset(slot + nr),
5047 			      sizeof(struct btrfs_item) *
5048 			      (nritems - slot - nr));
5049 	}
5050 	btrfs_set_header_nritems(leaf, nritems - nr);
5051 	nritems -= nr;
5052 
5053 	/* delete the leaf if we've emptied it */
5054 	if (nritems == 0) {
5055 		if (leaf == root->node) {
5056 			btrfs_set_header_level(leaf, 0);
5057 		} else {
5058 			btrfs_set_path_blocking(path);
5059 			btrfs_clean_tree_block(leaf);
5060 			btrfs_del_leaf(trans, root, path, leaf);
5061 		}
5062 	} else {
5063 		int used = leaf_space_used(leaf, 0, nritems);
5064 		if (slot == 0) {
5065 			struct btrfs_disk_key disk_key;
5066 
5067 			btrfs_item_key(leaf, &disk_key, 0);
5068 			fixup_low_keys(path, &disk_key, 1);
5069 		}
5070 
5071 		/* delete the leaf if it is mostly empty */
5072 		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5073 			/* push_leaf_left fixes the path.
5074 			 * make sure the path still points to our leaf
5075 			 * for possible call to del_ptr below
5076 			 */
5077 			slot = path->slots[1];
5078 			atomic_inc(&leaf->refs);
5079 
5080 			btrfs_set_path_blocking(path);
5081 			wret = push_leaf_left(trans, root, path, 1, 1,
5082 					      1, (u32)-1);
5083 			if (wret < 0 && wret != -ENOSPC)
5084 				ret = wret;
5085 
5086 			if (path->nodes[0] == leaf &&
5087 			    btrfs_header_nritems(leaf)) {
5088 				wret = push_leaf_right(trans, root, path, 1,
5089 						       1, 1, 0);
5090 				if (wret < 0 && wret != -ENOSPC)
5091 					ret = wret;
5092 			}
5093 
5094 			if (btrfs_header_nritems(leaf) == 0) {
5095 				path->slots[1] = slot;
5096 				btrfs_del_leaf(trans, root, path, leaf);
5097 				free_extent_buffer(leaf);
5098 				ret = 0;
5099 			} else {
5100 				/* if we're still in the path, make sure
5101 				 * we're dirty.  Otherwise, one of the
5102 				 * push_leaf functions must have already
5103 				 * dirtied this buffer
5104 				 */
5105 				if (path->nodes[0] == leaf)
5106 					btrfs_mark_buffer_dirty(leaf);
5107 				free_extent_buffer(leaf);
5108 			}
5109 		} else {
5110 			btrfs_mark_buffer_dirty(leaf);
5111 		}
5112 	}
5113 	return ret;
5114 }
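
/*
 * The "mostly empty" heuristic above in isolation: a leaf using less
 * than a third of its data area becomes a merge candidate.  The numbers
 * assume a 16K node with a 101-byte header.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdio.h>

int main(void)
{
	unsigned leaf_data_size = 16384 - 101;	/* nodesize - header */
	unsigned used = 4000;

	/* 4000 < 5427, so this leaf would be pushed into a neighbor */
	printf("merge: %s\n", used < leaf_data_size / 3 ? "yes" : "no");
	return 0;
}
#endif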
5115 
5116 /*
5117  * search the tree again to find a leaf with lesser keys
5118  * returns 0 if it found something or 1 if there are no lesser leaves.
5119  * returns < 0 on io errors.
5120  *
5121  * This may release the path, and so you may lose any locks held at the
5122  * time you call it.
5123  */
5124 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5125 {
5126 	struct btrfs_key key;
5127 	struct btrfs_disk_key found_key;
5128 	int ret;
5129 
5130 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5131 
5132 	if (key.offset > 0) {
5133 		key.offset--;
5134 	} else if (key.type > 0) {
5135 		key.type--;
5136 		key.offset = (u64)-1;
5137 	} else if (key.objectid > 0) {
5138 		key.objectid--;
5139 		key.type = (u8)-1;
5140 		key.offset = (u64)-1;
5141 	} else {
5142 		return 1;
5143 	}
5144 
5145 	btrfs_release_path(path);
5146 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5147 	if (ret < 0)
5148 		return ret;
5149 	btrfs_item_key(path->nodes[0], &found_key, 0);
5150 	ret = comp_keys(&found_key, &key);
5151 	/*
5152 	 * We might have had an item with the previous key in the tree right
5153 	 * before we released our path. And after we released our path, that
5154 	 * item might have been pushed to the first slot (0) of the leaf we
5155 	 * were holding due to a tree balance. Alternatively, an item with the
5156 	 * previous key can exist as the only element of a leaf (big fat item).
5157 	 * Therefore account for these 2 cases, so that our callers (like
5158 	 * btrfs_previous_item) don't miss an existing item with a key matching
5159 	 * the previous key we computed above.
5160 	 */
5161 	if (ret <= 0)
5162 		return 0;
5163 	return 1;
5164 }
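
/*
 * The previous-key computation above as a standalone helper: decrement
 * offset first, borrowing from type and then objectid, like a three
 * digit mixed-radix counter.  The types are simplified stand-ins for
 * the btrfs_key fields.
 */
#if 0	/* standalone sketch, not built with the kernel */
#include <stdio.h>

struct toy_key {
	unsigned long long objectid;
	unsigned char type;
	unsigned long long offset;
};

static int toy_prev_key(struct toy_key *key)
{
	if (key->offset > 0) {
		key->offset--;
	} else if (key->type > 0) {
		key->type--;
		key->offset = (unsigned long long)-1;
	} else if (key->objectid > 0) {
		key->objectid--;
		key->type = (unsigned char)-1;
		key->offset = (unsigned long long)-1;
	} else {
		return 1;	/* (0, 0, 0) is the smallest possible key */
	}
	return 0;
}

int main(void)
{
	struct toy_key key = { 257, 1, 0 };

	/* (257, 1, 0) -> (257, 0, MAX): borrow from the type field */
	toy_prev_key(&key);
	printf("%llu %u %llu\n", key.objectid, key.type, key.offset);
	return 0;
}
#endif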
5165 
5166 /*
5167  * A helper function to walk down the tree starting at min_key, and looking
5168  * for nodes or leaves that have a minimum transaction id.
5169  * This is used by the btree defrag code, and tree logging
5170  *
5171  * This does not cow, but it does stuff the starting key it finds back
5172  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5173  * key and get a writable path.
5174  *
5175  * This honors path->lowest_level to prevent descent past a given level
5176  * of the tree.
5177  *
5178  * min_trans indicates the oldest transaction that you are interested
5179  * in walking through.  Any nodes or leaves older than min_trans are
5180  * skipped over (without reading them).
5181  *
5182  * returns zero if something useful was found, < 0 on error and 1 if there
5183  * was nothing in the tree that matched the search criteria.
5184  */
5185 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5186 			 struct btrfs_path *path,
5187 			 u64 min_trans)
5188 {
5189 	struct extent_buffer *cur;
5190 	struct btrfs_key found_key;
5191 	int slot;
5192 	int sret;
5193 	u32 nritems;
5194 	int level;
5195 	int ret = 1;
5196 	int keep_locks = path->keep_locks;
5197 
5198 	path->keep_locks = 1;
5199 again:
5200 	cur = btrfs_read_lock_root_node(root);
5201 	level = btrfs_header_level(cur);
5202 	WARN_ON(path->nodes[level]);
5203 	path->nodes[level] = cur;
5204 	path->locks[level] = BTRFS_READ_LOCK;
5205 
5206 	if (btrfs_header_generation(cur) < min_trans) {
5207 		ret = 1;
5208 		goto out;
5209 	}
5210 	while (1) {
5211 		nritems = btrfs_header_nritems(cur);
5212 		level = btrfs_header_level(cur);
5213 		sret = btrfs_bin_search(cur, min_key, &slot);
5214 		if (sret < 0) {
5215 			ret = sret;
5216 			goto out;
5217 		}
5218 
5219 		/* at the lowest level, we're done, setup the path and exit */
5220 		if (level == path->lowest_level) {
5221 			if (slot >= nritems)
5222 				goto find_next_key;
5223 			ret = 0;
5224 			path->slots[level] = slot;
5225 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5226 			goto out;
5227 		}
5228 		if (sret && slot > 0)
5229 			slot--;
5230 		/*
5231 		 * check this node pointer against the min_trans parameters.
5232 		 * If it is too old, skip to the next one.
5233 		 */
5234 		while (slot < nritems) {
5235 			u64 gen;
5236 
5237 			gen = btrfs_node_ptr_generation(cur, slot);
5238 			if (gen < min_trans) {
5239 				slot++;
5240 				continue;
5241 			}
5242 			break;
5243 		}
5244 find_next_key:
5245 		/*
5246 		 * we didn't find a candidate key in this node, walk forward
5247 		 * and find another one
5248 		 */
5249 		if (slot >= nritems) {
5250 			path->slots[level] = slot;
5251 			btrfs_set_path_blocking(path);
5252 			sret = btrfs_find_next_key(root, path, min_key, level,
5253 						  min_trans);
5254 			if (sret == 0) {
5255 				btrfs_release_path(path);
5256 				goto again;
5257 			} else {
5258 				goto out;
5259 			}
5260 		}
5261 		/* save our key for returning back */
5262 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5263 		path->slots[level] = slot;
5264 		if (level == path->lowest_level) {
5265 			ret = 0;
5266 			goto out;
5267 		}
5268 		btrfs_set_path_blocking(path);
5269 		cur = btrfs_read_node_slot(cur, slot);
5270 		if (IS_ERR(cur)) {
5271 			ret = PTR_ERR(cur);
5272 			goto out;
5273 		}
5274 
5275 		btrfs_tree_read_lock(cur);
5276 
5277 		path->locks[level - 1] = BTRFS_READ_LOCK;
5278 		path->nodes[level - 1] = cur;
5279 		unlock_up(path, level, 1, 0, NULL);
5280 	}
5281 out:
5282 	path->keep_locks = keep_locks;
5283 	if (ret == 0) {
5284 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5285 		btrfs_set_path_blocking(path);
5286 		memcpy(min_key, &found_key, sizeof(found_key));
5287 	}
5288 	return ret;
5289 }
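
/*
 * A sketch of the usual caller loop, patterned after the tree logging
 * and defrag users; walk_newer_than() and the processing step are
 * hypothetical.  On a hit, min_key holds the found key, so the caller
 * bumps it by one before resuming the walk.
 */
#if 0	/* standalone sketch, not built with the kernel */
static int walk_newer_than(struct btrfs_root *root,
			   struct btrfs_path *path, u64 min_trans)
{
	struct btrfs_key min_key = { 0 };	/* start before everything */
	int ret;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, path, min_trans);
		if (ret)		/* 1: nothing newer, < 0: error */
			return ret < 0 ? ret : 0;

		/* act on the item at path->slots[0] here */

		if (min_key.offset < (u64)-1) {
			min_key.offset++;
		} else if (min_key.type < (u8)-1) {
			min_key.type++;
			min_key.offset = 0;
		} else if (min_key.objectid < (u64)-1) {
			min_key.objectid++;
			min_key.type = 0;
			min_key.offset = 0;
		} else {
			return 0;	/* walked off the end of the tree */
		}
		btrfs_release_path(path);
	}
}
#endif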
5290 
5291 /*
5292  * this is similar to btrfs_next_leaf, but does not try to preserve
5293  * and fixup the path.  It looks for and returns the next key in the
5294  * tree based on the current path and the min_trans parameters.
5295  *
5296  * 0 is returned if another key is found, < 0 if there are any errors
5297  * and 1 is returned if there are no higher keys in the tree
5298  *
5299  * path->keep_locks should be set to 1 on the search made before
5300  * calling this function.
5301  */
5302 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5303 			struct btrfs_key *key, int level, u64 min_trans)
5304 {
5305 	int slot;
5306 	struct extent_buffer *c;
5307 
5308 	WARN_ON(!path->keep_locks && !path->skip_locking);
5309 	while (level < BTRFS_MAX_LEVEL) {
5310 		if (!path->nodes[level])
5311 			return 1;
5312 
5313 		slot = path->slots[level] + 1;
5314 		c = path->nodes[level];
5315 next:
5316 		if (slot >= btrfs_header_nritems(c)) {
5317 			int ret;
5318 			int orig_lowest;
5319 			struct btrfs_key cur_key;
5320 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5321 			    !path->nodes[level + 1])
5322 				return 1;
5323 
5324 			if (path->locks[level + 1] || path->skip_locking) {
5325 				level++;
5326 				continue;
5327 			}
5328 
5329 			slot = btrfs_header_nritems(c) - 1;
5330 			if (level == 0)
5331 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5332 			else
5333 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5334 
5335 			orig_lowest = path->lowest_level;
5336 			btrfs_release_path(path);
5337 			path->lowest_level = level;
5338 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5339 						0, 0);
5340 			path->lowest_level = orig_lowest;
5341 			if (ret < 0)
5342 				return ret;
5343 
5344 			c = path->nodes[level];
5345 			slot = path->slots[level];
5346 			if (ret == 0)
5347 				slot++;
5348 			goto next;
5349 		}
5350 
5351 		if (level == 0)
5352 			btrfs_item_key_to_cpu(c, key, slot);
5353 		else {
5354 			u64 gen = btrfs_node_ptr_generation(c, slot);
5355 
5356 			if (gen < min_trans) {
5357 				slot++;
5358 				goto next;
5359 			}
5360 			btrfs_node_key_to_cpu(c, key, slot);
5361 		}
5362 		return 0;
5363 	}
5364 	return 1;
5365 }
5366 
5367 /*
5368  * search the tree again to find a leaf with greater keys
5369  * returns 0 if it found something or 1 if there are no greater leaves.
5370  * returns < 0 on io errors.
5371  */
5372 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5373 {
5374 	return btrfs_next_old_leaf(root, path, 0);
5375 }
5376 
5377 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5378 			u64 time_seq)
5379 {
5380 	int slot;
5381 	int level;
5382 	struct extent_buffer *c;
5383 	struct extent_buffer *next;
5384 	struct btrfs_key key;
5385 	u32 nritems;
5386 	int ret;
5387 	int old_spinning = path->leave_spinning;
5388 	int next_rw_lock = 0;
5389 
5390 	nritems = btrfs_header_nritems(path->nodes[0]);
5391 	if (nritems == 0)
5392 		return 1;
5393 
5394 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5395 again:
5396 	level = 1;
5397 	next = NULL;
5398 	next_rw_lock = 0;
5399 	btrfs_release_path(path);
5400 
5401 	path->keep_locks = 1;
5402 	path->leave_spinning = 1;
5403 
5404 	if (time_seq)
5405 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5406 	else
5407 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5408 	path->keep_locks = 0;
5409 
5410 	if (ret < 0)
5411 		return ret;
5412 
5413 	nritems = btrfs_header_nritems(path->nodes[0]);
5414 	/*
5415 	 * by releasing the path above we dropped all our locks.  A balance
5416 	 * could have added more items next to the key that used to be
5417 	 * at the very end of the block.  So, check again here and
5418 	 * advance the path if there are now more items available.
5419 	 */
5420 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5421 		if (ret == 0)
5422 			path->slots[0]++;
5423 		ret = 0;
5424 		goto done;
5425 	}
5426 	/*
5427 	 * So the above check misses one case:
5428 	 * - after releasing the path above, someone has removed the item that
5429 	 *   used to be at the very end of the block, and balance between leafs
5430 	 *   gets another one with bigger key.offset to replace it.
5431 	 *
5432 	 * This one should be returned as well, or we can get leaf corruption
5433 	 * later (esp. in __btrfs_drop_extents()).
5434 	 *
5435 	 * And a bit more explanation about this check,
5436 	 * with ret > 0, the key isn't found, the path points to the slot
5437 	 * where it should be inserted, so the path->slots[0] item must be the
5438 	 * bigger one.
5439 	 */
5440 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5441 		ret = 0;
5442 		goto done;
5443 	}
5444 
5445 	while (level < BTRFS_MAX_LEVEL) {
5446 		if (!path->nodes[level]) {
5447 			ret = 1;
5448 			goto done;
5449 		}
5450 
5451 		slot = path->slots[level] + 1;
5452 		c = path->nodes[level];
5453 		if (slot >= btrfs_header_nritems(c)) {
5454 			level++;
5455 			if (level == BTRFS_MAX_LEVEL) {
5456 				ret = 1;
5457 				goto done;
5458 			}
5459 			continue;
5460 		}
5461 
5462 		if (next) {
5463 			btrfs_tree_unlock_rw(next, next_rw_lock);
5464 			free_extent_buffer(next);
5465 		}
5466 
5467 		next = c;
5468 		next_rw_lock = path->locks[level];
5469 		ret = read_block_for_search(root, path, &next, level,
5470 					    slot, &key);
5471 		if (ret == -EAGAIN)
5472 			goto again;
5473 
5474 		if (ret < 0) {
5475 			btrfs_release_path(path);
5476 			goto done;
5477 		}
5478 
5479 		if (!path->skip_locking) {
5480 			ret = btrfs_try_tree_read_lock(next);
5481 			if (!ret && time_seq) {
5482 				/*
5483 				 * If we don't get the lock, we may be racing
5484 				 * with push_leaf_left, which holds that lock
5485 				 * while itself waiting for the leaf we have
5486 				 * currently locked. To break the deadlock, we
5487 				 * give up our lock and cycle.
5488 				 */
5489 				free_extent_buffer(next);
5490 				btrfs_release_path(path);
5491 				cond_resched();
5492 				goto again;
5493 			}
5494 			if (!ret) {
5495 				btrfs_set_path_blocking(path);
5496 				__btrfs_tree_read_lock(next,
5497 						       BTRFS_NESTING_RIGHT,
5498 						       path->recurse);
5499 			}
5500 			next_rw_lock = BTRFS_READ_LOCK;
5501 		}
5502 		break;
5503 	}
5504 	path->slots[level] = slot;
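	/*
	 * Now walk back down to level 0, dropping the old node at each level
	 * and installing the newly read right sibling (and then its leftmost
	 * descendants) into the path.
	 */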
5505 	while (1) {
5506 		level--;
5507 		c = path->nodes[level];
5508 		if (path->locks[level])
5509 			btrfs_tree_unlock_rw(c, path->locks[level]);
5510 
5511 		free_extent_buffer(c);
5512 		path->nodes[level] = next;
5513 		path->slots[level] = 0;
5514 		if (!path->skip_locking)
5515 			path->locks[level] = next_rw_lock;
5516 		if (!level)
5517 			break;
5518 
5519 		ret = read_block_for_search(root, path, &next, level,
5520 					    0, &key);
5521 		if (ret == -EAGAIN)
5522 			goto again;
5523 
5524 		if (ret < 0) {
5525 			btrfs_release_path(path);
5526 			goto done;
5527 		}
5528 
5529 		if (!path->skip_locking) {
5530 			ret = btrfs_try_tree_read_lock(next);
5531 			if (!ret) {
5532 				btrfs_set_path_blocking(path);
5533 				__btrfs_tree_read_lock(next,
5534 						       BTRFS_NESTING_RIGHT,
5535 						       path->recurse);
5536 			}
5537 			next_rw_lock = BTRFS_READ_LOCK;
5538 		}
5539 	}
5540 	ret = 0;
5541 done:
5542 	unlock_up(path, 0, 1, 0, NULL);
5543 	path->leave_spinning = old_spinning;
5544 	if (!old_spinning)
5545 		btrfs_set_path_blocking(path);
5546 
5547 	return ret;
5548 }
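
/*
 * A hedged sketch of driving btrfs_next_old_leaf() with a pinned tree mod
 * log sequence, in the style of the backref walking code; the helpers
 * btrfs_get_tree_mod_seq()/btrfs_put_tree_mod_seq() and SEQ_LIST_INIT()
 * are assumed from ctree.h, and the wrapper itself is illustrative only.
 */
static int example_next_old_leaf(struct btrfs_root *root,
				 struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct seq_list elem = SEQ_LIST_INIT(elem);
	int ret;

	/* Pin the tree mod log so the old version of the tree stays replayable. */
	btrfs_get_tree_mod_seq(fs_info, &elem);

	ret = btrfs_next_old_leaf(root, path, elem.seq);

	btrfs_put_tree_mod_seq(fs_info, &elem);
	return ret;
}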
5549 
5550 /*
5551  * This uses btrfs_prev_leaf() to walk backwards in the tree, and keeps
5552  * searching until it gets past min_objectid or finds an item of 'type'.
5553  *
5554  * Returns 0 if something is found, 1 if nothing was found, and < 0 on error.
5555  */
5556 int btrfs_previous_item(struct btrfs_root *root,
5557 			struct btrfs_path *path, u64 min_objectid,
5558 			int type)
5559 {
5560 	struct btrfs_key found_key;
5561 	struct extent_buffer *leaf;
5562 	u32 nritems;
5563 	int ret;
5564 
5565 	while (1) {
5566 		if (path->slots[0] == 0) {
5567 			btrfs_set_path_blocking(path);
5568 			ret = btrfs_prev_leaf(root, path);
5569 			if (ret != 0)
5570 				return ret;
5571 		} else {
5572 			path->slots[0]--;
5573 		}
5574 		leaf = path->nodes[0];
5575 		nritems = btrfs_header_nritems(leaf);
5576 		if (nritems == 0)
5577 			return 1;
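		/* clamp the slot in case it points one past the last item */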
5578 		if (path->slots[0] == nritems)
5579 			path->slots[0]--;
5580 
5581 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5582 		if (found_key.objectid < min_objectid)
5583 			break;
5584 		if (found_key.type == type)
5585 			return 0;
5586 		if (found_key.objectid == min_objectid &&
5587 		    found_key.type < type)
5588 			break;
5589 	}
5590 	return 1;
5591 }
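
/*
 * A minimal sketch of btrfs_previous_item(): position a path at or past
 * the region of interest, then scan backwards for the nearest item of the
 * wanted type.  The key setup and the name example_prev_inode_item() are
 * illustrative only.
 */
static int example_prev_inode_item(struct btrfs_root *root, u64 ino)
{
	struct btrfs_path *path;
	struct btrfs_key key = { .objectid = ino, .type = (u8)-1,
				 .offset = (u64)-1 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Land just past the last possible key for this objectid. */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* Walk backwards to the inode item itself. */
	ret = btrfs_previous_item(root, path, ino, BTRFS_INODE_ITEM_KEY);
out:
	btrfs_free_path(path);
	return ret;
}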
5592 
5593 /*
5594  * Search the extent tree to find a previous metadata or data extent item
5595  * with a minimum objectid.
5596  *
5597  * Returns 0 if something is found, 1 if nothing was found, and < 0 on error.
5598  */
5599 int btrfs_previous_extent_item(struct btrfs_root *root,
5600 			struct btrfs_path *path, u64 min_objectid)
5601 {
5602 	struct btrfs_key found_key;
5603 	struct extent_buffer *leaf;
5604 	u32 nritems;
5605 	int ret;
5606 
5607 	while (1) {
5608 		if (path->slots[0] == 0) {
5609 			btrfs_set_path_blocking(path);
5610 			ret = btrfs_prev_leaf(root, path);
5611 			if (ret != 0)
5612 				return ret;
5613 		} else {
5614 			path->slots[0]--;
5615 		}
5616 		leaf = path->nodes[0];
5617 		nritems = btrfs_header_nritems(leaf);
5618 		if (nritems == 0)
5619 			return 1;
5620 		if (path->slots[0] == nritems)
5621 			path->slots[0]--;
5622 
5623 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5624 		if (found_key.objectid < min_objectid)
5625 			break;
5626 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5627 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5628 			return 0;
5629 		if (found_key.objectid == min_objectid &&
5630 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5631 			break;
5632 	}
5633 	return 1;
5634 }
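
/*
 * Sketch of btrfs_previous_extent_item() usage: after a search positions
 * the path past a region of interest in the extent tree, step back to the
 * closest preceding extent item whose objectid does not drop below the
 * given minimum.  The name example_prev_extent() and the extent_root
 * parameter are illustrative only.
 */
static int example_prev_extent(struct btrfs_root *extent_root,
			       struct btrfs_path *path, u64 min_bytenr)
{
	struct btrfs_key key;
	int ret;

	ret = btrfs_previous_extent_item(extent_root, path, min_bytenr);
	if (ret != 0)
		return ret;	/* 1: nothing at or above min_bytenr, < 0: error */

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	/* key.objectid is the start bytenr of the found extent */
	return 0;
}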
5635