xref: /openbmc/linux/fs/btrfs/ctree.c (revision 76426e23)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17 
18 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19 		      *root, struct btrfs_path *path, int level);
20 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21 		      const struct btrfs_key *ins_key, struct btrfs_path *path,
22 		      int data_size, int extend);
23 static int push_node_left(struct btrfs_trans_handle *trans,
24 			  struct extent_buffer *dst,
25 			  struct extent_buffer *src, int empty);
26 static int balance_node_right(struct btrfs_trans_handle *trans,
27 			      struct extent_buffer *dst_buf,
28 			      struct extent_buffer *src_buf);
29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30 		    int level, int slot);
31 
32 static const struct btrfs_csums {
33 	u16		size;
34 	const char	name[10];
35 	const char	driver[12];
36 } btrfs_csums[] = {
37 	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
38 	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
39 	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
40 	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
41 				     .driver = "blake2b-256" },
42 };
43 
44 int btrfs_super_csum_size(const struct btrfs_super_block *s)
45 {
46 	u16 t = btrfs_super_csum_type(s);
47 	/*
48 	 * csum type is validated at mount time
49 	 */
50 	return btrfs_csums[t].size;
51 }
52 
53 const char *btrfs_super_csum_name(u16 csum_type)
54 {
55 	/* csum type is validated at mount time */
56 	return btrfs_csums[csum_type].name;
57 }
58 
59 /*
60  * Return driver name if defined, otherwise the name that's also a valid driver
61  * name
62  */
63 const char *btrfs_super_csum_driver(u16 csum_type)
64 {
65 	/* csum type is validated at mount time */
66 	return btrfs_csums[csum_type].driver[0] ?
67 		btrfs_csums[csum_type].driver :
68 		btrfs_csums[csum_type].name;
69 }
70 
71 size_t __const btrfs_get_num_csums(void)
72 {
73 	return ARRAY_SIZE(btrfs_csums);
74 }
75 
76 struct btrfs_path *btrfs_alloc_path(void)
77 {
78 	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
79 }
80 
81 /* this also releases the path */
82 void btrfs_free_path(struct btrfs_path *p)
83 {
84 	if (!p)
85 		return;
86 	btrfs_release_path(p);
87 	kmem_cache_free(btrfs_path_cachep, p);
88 }
89 
90 /*
91  * path release drops references on the extent buffers in the path
92  * and it drops any locks held by this path
93  *
94  * It is safe to call this on paths that no locks or extent buffers held.
95  */
96 noinline void btrfs_release_path(struct btrfs_path *p)
97 {
98 	int i;
99 
100 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
101 		p->slots[i] = 0;
102 		if (!p->nodes[i])
103 			continue;
104 		if (p->locks[i]) {
105 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
106 			p->locks[i] = 0;
107 		}
108 		free_extent_buffer(p->nodes[i]);
109 		p->nodes[i] = NULL;
110 	}
111 }
112 
113 /*
114  * safely gets a reference on the root node of a tree.  A lock
115  * is not taken, so a concurrent writer may put a different node
116  * at the root of the tree.  See btrfs_lock_root_node for the
117  * looping required.
118  *
119  * The extent buffer returned by this has a reference taken, so
120  * it won't disappear.  It may stop being the root of the tree
121  * at any time because there are no locks held.
122  */
123 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
124 {
125 	struct extent_buffer *eb;
126 
127 	while (1) {
128 		rcu_read_lock();
129 		eb = rcu_dereference(root->node);
130 
131 		/*
132 		 * RCU really hurts here, we could free up the root node because
133 		 * it was COWed but we may not get the new root node yet so do
134 		 * the inc_not_zero dance and if it doesn't work then
135 		 * synchronize_rcu and try again.
136 		 */
137 		if (atomic_inc_not_zero(&eb->refs)) {
138 			rcu_read_unlock();
139 			break;
140 		}
141 		rcu_read_unlock();
142 		synchronize_rcu();
143 	}
144 	return eb;
145 }
146 
147 /* cowonly root (everything not a reference counted cow subvolume), just get
148  * put onto a simple dirty list.  transaction.c walks this to make sure they
149  * get properly updated on disk.
150  */
151 static void add_root_to_dirty_list(struct btrfs_root *root)
152 {
153 	struct btrfs_fs_info *fs_info = root->fs_info;
154 
155 	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
156 	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
157 		return;
158 
159 	spin_lock(&fs_info->trans_lock);
160 	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
161 		/* Want the extent tree to be the last on the list */
162 		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
163 			list_move_tail(&root->dirty_list,
164 				       &fs_info->dirty_cowonly_roots);
165 		else
166 			list_move(&root->dirty_list,
167 				  &fs_info->dirty_cowonly_roots);
168 	}
169 	spin_unlock(&fs_info->trans_lock);
170 }
171 
172 /*
173  * used by snapshot creation to make a copy of a root for a tree with
174  * a given objectid.  The buffer with the new root node is returned in
175  * cow_ret, and this func returns zero on success or a negative error code.
176  */
177 int btrfs_copy_root(struct btrfs_trans_handle *trans,
178 		      struct btrfs_root *root,
179 		      struct extent_buffer *buf,
180 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
181 {
182 	struct btrfs_fs_info *fs_info = root->fs_info;
183 	struct extent_buffer *cow;
184 	int ret = 0;
185 	int level;
186 	struct btrfs_disk_key disk_key;
187 
188 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
189 		trans->transid != fs_info->running_transaction->transid);
190 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
191 		trans->transid != root->last_trans);
192 
193 	level = btrfs_header_level(buf);
194 	if (level == 0)
195 		btrfs_item_key(buf, &disk_key, 0);
196 	else
197 		btrfs_node_key(buf, &disk_key, 0);
198 
199 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
200 			&disk_key, level, buf->start, 0);
201 	if (IS_ERR(cow))
202 		return PTR_ERR(cow);
203 
204 	copy_extent_buffer_full(cow, buf);
205 	btrfs_set_header_bytenr(cow, cow->start);
206 	btrfs_set_header_generation(cow, trans->transid);
207 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
208 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
209 				     BTRFS_HEADER_FLAG_RELOC);
210 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
211 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
212 	else
213 		btrfs_set_header_owner(cow, new_root_objectid);
214 
215 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
216 
217 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
218 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
219 		ret = btrfs_inc_ref(trans, root, cow, 1);
220 	else
221 		ret = btrfs_inc_ref(trans, root, cow, 0);
222 
223 	if (ret)
224 		return ret;
225 
226 	btrfs_mark_buffer_dirty(cow);
227 	*cow_ret = cow;
228 	return 0;
229 }
230 
231 enum mod_log_op {
232 	MOD_LOG_KEY_REPLACE,
233 	MOD_LOG_KEY_ADD,
234 	MOD_LOG_KEY_REMOVE,
235 	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
236 	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
237 	MOD_LOG_MOVE_KEYS,
238 	MOD_LOG_ROOT_REPLACE,
239 };
240 
241 struct tree_mod_root {
242 	u64 logical;
243 	u8 level;
244 };
245 
246 struct tree_mod_elem {
247 	struct rb_node node;
248 	u64 logical;
249 	u64 seq;
250 	enum mod_log_op op;
251 
252 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
253 	int slot;
254 
255 	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
256 	u64 generation;
257 
258 	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
259 	struct btrfs_disk_key key;
260 	u64 blockptr;
261 
262 	/* this is used for op == MOD_LOG_MOVE_KEYS */
263 	struct {
264 		int dst_slot;
265 		int nr_items;
266 	} move;
267 
268 	/* this is used for op == MOD_LOG_ROOT_REPLACE */
269 	struct tree_mod_root old_root;
270 };
271 
272 /*
273  * Pull a new tree mod seq number for our operation.
274  */
275 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
276 {
277 	return atomic64_inc_return(&fs_info->tree_mod_seq);
278 }
279 
280 /*
281  * This adds a new blocker to the tree mod log's blocker list if the @elem
282  * passed does not already have a sequence number set. So when a caller expects
283  * to record tree modifications, it should ensure to set elem->seq to zero
284  * before calling btrfs_get_tree_mod_seq.
285  * Returns a fresh, unused tree log modification sequence number, even if no new
286  * blocker was added.
287  */
288 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
289 			   struct seq_list *elem)
290 {
291 	write_lock(&fs_info->tree_mod_log_lock);
292 	if (!elem->seq) {
293 		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
294 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
295 	}
296 	write_unlock(&fs_info->tree_mod_log_lock);
297 
298 	return elem->seq;
299 }
300 
301 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
302 			    struct seq_list *elem)
303 {
304 	struct rb_root *tm_root;
305 	struct rb_node *node;
306 	struct rb_node *next;
307 	struct tree_mod_elem *tm;
308 	u64 min_seq = (u64)-1;
309 	u64 seq_putting = elem->seq;
310 
311 	if (!seq_putting)
312 		return;
313 
314 	write_lock(&fs_info->tree_mod_log_lock);
315 	list_del(&elem->list);
316 	elem->seq = 0;
317 
318 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
319 		struct seq_list *first;
320 
321 		first = list_first_entry(&fs_info->tree_mod_seq_list,
322 					 struct seq_list, list);
323 		if (seq_putting > first->seq) {
324 			/*
325 			 * Blocker with lower sequence number exists, we
326 			 * cannot remove anything from the log.
327 			 */
328 			write_unlock(&fs_info->tree_mod_log_lock);
329 			return;
330 		}
331 		min_seq = first->seq;
332 	}
333 
334 	/*
335 	 * anything that's lower than the lowest existing (read: blocked)
336 	 * sequence number can be removed from the tree.
337 	 */
338 	tm_root = &fs_info->tree_mod_log;
339 	for (node = rb_first(tm_root); node; node = next) {
340 		next = rb_next(node);
341 		tm = rb_entry(node, struct tree_mod_elem, node);
342 		if (tm->seq >= min_seq)
343 			continue;
344 		rb_erase(node, tm_root);
345 		kfree(tm);
346 	}
347 	write_unlock(&fs_info->tree_mod_log_lock);
348 }
349 
350 /*
351  * key order of the log:
352  *       node/leaf start address -> sequence
353  *
354  * The 'start address' is the logical address of the *new* root node
355  * for root replace operations, or the logical address of the affected
356  * block for all other operations.
357  */
358 static noinline int
359 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
360 {
361 	struct rb_root *tm_root;
362 	struct rb_node **new;
363 	struct rb_node *parent = NULL;
364 	struct tree_mod_elem *cur;
365 
366 	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
367 
368 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
369 
370 	tm_root = &fs_info->tree_mod_log;
371 	new = &tm_root->rb_node;
372 	while (*new) {
373 		cur = rb_entry(*new, struct tree_mod_elem, node);
374 		parent = *new;
375 		if (cur->logical < tm->logical)
376 			new = &((*new)->rb_left);
377 		else if (cur->logical > tm->logical)
378 			new = &((*new)->rb_right);
379 		else if (cur->seq < tm->seq)
380 			new = &((*new)->rb_left);
381 		else if (cur->seq > tm->seq)
382 			new = &((*new)->rb_right);
383 		else
384 			return -EEXIST;
385 	}
386 
387 	rb_link_node(&tm->node, parent, new);
388 	rb_insert_color(&tm->node, tm_root);
389 	return 0;
390 }
391 
392 /*
393  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
394  * returns zero with the tree_mod_log_lock acquired. The caller must hold
395  * this until all tree mod log insertions are recorded in the rb tree and then
396  * write unlock fs_info::tree_mod_log_lock.
397  */
398 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
399 				    struct extent_buffer *eb) {
400 	smp_mb();
401 	if (list_empty(&(fs_info)->tree_mod_seq_list))
402 		return 1;
403 	if (eb && btrfs_header_level(eb) == 0)
404 		return 1;
405 
406 	write_lock(&fs_info->tree_mod_log_lock);
407 	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
408 		write_unlock(&fs_info->tree_mod_log_lock);
409 		return 1;
410 	}
411 
412 	return 0;
413 }
414 
415 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
416 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
417 				    struct extent_buffer *eb)
418 {
419 	smp_mb();
420 	if (list_empty(&(fs_info)->tree_mod_seq_list))
421 		return 0;
422 	if (eb && btrfs_header_level(eb) == 0)
423 		return 0;
424 
425 	return 1;
426 }
427 
428 static struct tree_mod_elem *
429 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
430 		    enum mod_log_op op, gfp_t flags)
431 {
432 	struct tree_mod_elem *tm;
433 
434 	tm = kzalloc(sizeof(*tm), flags);
435 	if (!tm)
436 		return NULL;
437 
438 	tm->logical = eb->start;
439 	if (op != MOD_LOG_KEY_ADD) {
440 		btrfs_node_key(eb, &tm->key, slot);
441 		tm->blockptr = btrfs_node_blockptr(eb, slot);
442 	}
443 	tm->op = op;
444 	tm->slot = slot;
445 	tm->generation = btrfs_node_ptr_generation(eb, slot);
446 	RB_CLEAR_NODE(&tm->node);
447 
448 	return tm;
449 }
450 
451 static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
452 		enum mod_log_op op, gfp_t flags)
453 {
454 	struct tree_mod_elem *tm;
455 	int ret;
456 
457 	if (!tree_mod_need_log(eb->fs_info, eb))
458 		return 0;
459 
460 	tm = alloc_tree_mod_elem(eb, slot, op, flags);
461 	if (!tm)
462 		return -ENOMEM;
463 
464 	if (tree_mod_dont_log(eb->fs_info, eb)) {
465 		kfree(tm);
466 		return 0;
467 	}
468 
469 	ret = __tree_mod_log_insert(eb->fs_info, tm);
470 	write_unlock(&eb->fs_info->tree_mod_log_lock);
471 	if (ret)
472 		kfree(tm);
473 
474 	return ret;
475 }
476 
477 static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
478 		int dst_slot, int src_slot, int nr_items)
479 {
480 	struct tree_mod_elem *tm = NULL;
481 	struct tree_mod_elem **tm_list = NULL;
482 	int ret = 0;
483 	int i;
484 	int locked = 0;
485 
486 	if (!tree_mod_need_log(eb->fs_info, eb))
487 		return 0;
488 
489 	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
490 	if (!tm_list)
491 		return -ENOMEM;
492 
493 	tm = kzalloc(sizeof(*tm), GFP_NOFS);
494 	if (!tm) {
495 		ret = -ENOMEM;
496 		goto free_tms;
497 	}
498 
499 	tm->logical = eb->start;
500 	tm->slot = src_slot;
501 	tm->move.dst_slot = dst_slot;
502 	tm->move.nr_items = nr_items;
503 	tm->op = MOD_LOG_MOVE_KEYS;
504 
505 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
506 		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
507 		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
508 		if (!tm_list[i]) {
509 			ret = -ENOMEM;
510 			goto free_tms;
511 		}
512 	}
513 
514 	if (tree_mod_dont_log(eb->fs_info, eb))
515 		goto free_tms;
516 	locked = 1;
517 
518 	/*
519 	 * When we override something during the move, we log these removals.
520 	 * This can only happen when we move towards the beginning of the
521 	 * buffer, i.e. dst_slot < src_slot.
522 	 */
523 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
524 		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
525 		if (ret)
526 			goto free_tms;
527 	}
528 
529 	ret = __tree_mod_log_insert(eb->fs_info, tm);
530 	if (ret)
531 		goto free_tms;
532 	write_unlock(&eb->fs_info->tree_mod_log_lock);
533 	kfree(tm_list);
534 
535 	return 0;
536 free_tms:
537 	for (i = 0; i < nr_items; i++) {
538 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
539 			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
540 		kfree(tm_list[i]);
541 	}
542 	if (locked)
543 		write_unlock(&eb->fs_info->tree_mod_log_lock);
544 	kfree(tm_list);
545 	kfree(tm);
546 
547 	return ret;
548 }
549 
550 static inline int
551 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
552 		       struct tree_mod_elem **tm_list,
553 		       int nritems)
554 {
555 	int i, j;
556 	int ret;
557 
558 	for (i = nritems - 1; i >= 0; i--) {
559 		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
560 		if (ret) {
561 			for (j = nritems - 1; j > i; j--)
562 				rb_erase(&tm_list[j]->node,
563 					 &fs_info->tree_mod_log);
564 			return ret;
565 		}
566 	}
567 
568 	return 0;
569 }
570 
571 static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
572 			 struct extent_buffer *new_root, int log_removal)
573 {
574 	struct btrfs_fs_info *fs_info = old_root->fs_info;
575 	struct tree_mod_elem *tm = NULL;
576 	struct tree_mod_elem **tm_list = NULL;
577 	int nritems = 0;
578 	int ret = 0;
579 	int i;
580 
581 	if (!tree_mod_need_log(fs_info, NULL))
582 		return 0;
583 
584 	if (log_removal && btrfs_header_level(old_root) > 0) {
585 		nritems = btrfs_header_nritems(old_root);
586 		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
587 				  GFP_NOFS);
588 		if (!tm_list) {
589 			ret = -ENOMEM;
590 			goto free_tms;
591 		}
592 		for (i = 0; i < nritems; i++) {
593 			tm_list[i] = alloc_tree_mod_elem(old_root, i,
594 			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
595 			if (!tm_list[i]) {
596 				ret = -ENOMEM;
597 				goto free_tms;
598 			}
599 		}
600 	}
601 
602 	tm = kzalloc(sizeof(*tm), GFP_NOFS);
603 	if (!tm) {
604 		ret = -ENOMEM;
605 		goto free_tms;
606 	}
607 
608 	tm->logical = new_root->start;
609 	tm->old_root.logical = old_root->start;
610 	tm->old_root.level = btrfs_header_level(old_root);
611 	tm->generation = btrfs_header_generation(old_root);
612 	tm->op = MOD_LOG_ROOT_REPLACE;
613 
614 	if (tree_mod_dont_log(fs_info, NULL))
615 		goto free_tms;
616 
617 	if (tm_list)
618 		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
619 	if (!ret)
620 		ret = __tree_mod_log_insert(fs_info, tm);
621 
622 	write_unlock(&fs_info->tree_mod_log_lock);
623 	if (ret)
624 		goto free_tms;
625 	kfree(tm_list);
626 
627 	return ret;
628 
629 free_tms:
630 	if (tm_list) {
631 		for (i = 0; i < nritems; i++)
632 			kfree(tm_list[i]);
633 		kfree(tm_list);
634 	}
635 	kfree(tm);
636 
637 	return ret;
638 }
639 
640 static struct tree_mod_elem *
641 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
642 		      int smallest)
643 {
644 	struct rb_root *tm_root;
645 	struct rb_node *node;
646 	struct tree_mod_elem *cur = NULL;
647 	struct tree_mod_elem *found = NULL;
648 
649 	read_lock(&fs_info->tree_mod_log_lock);
650 	tm_root = &fs_info->tree_mod_log;
651 	node = tm_root->rb_node;
652 	while (node) {
653 		cur = rb_entry(node, struct tree_mod_elem, node);
654 		if (cur->logical < start) {
655 			node = node->rb_left;
656 		} else if (cur->logical > start) {
657 			node = node->rb_right;
658 		} else if (cur->seq < min_seq) {
659 			node = node->rb_left;
660 		} else if (!smallest) {
661 			/* we want the node with the highest seq */
662 			if (found)
663 				BUG_ON(found->seq > cur->seq);
664 			found = cur;
665 			node = node->rb_left;
666 		} else if (cur->seq > min_seq) {
667 			/* we want the node with the smallest seq */
668 			if (found)
669 				BUG_ON(found->seq < cur->seq);
670 			found = cur;
671 			node = node->rb_right;
672 		} else {
673 			found = cur;
674 			break;
675 		}
676 	}
677 	read_unlock(&fs_info->tree_mod_log_lock);
678 
679 	return found;
680 }
681 
682 /*
683  * this returns the element from the log with the smallest time sequence
684  * value that's in the log (the oldest log item). any element with a time
685  * sequence lower than min_seq will be ignored.
686  */
687 static struct tree_mod_elem *
688 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
689 			   u64 min_seq)
690 {
691 	return __tree_mod_log_search(fs_info, start, min_seq, 1);
692 }
693 
694 /*
695  * this returns the element from the log with the largest time sequence
696  * value that's in the log (the most recent log item). any element with
697  * a time sequence lower than min_seq will be ignored.
698  */
699 static struct tree_mod_elem *
700 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
701 {
702 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
703 }
704 
705 static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
706 		     struct extent_buffer *src, unsigned long dst_offset,
707 		     unsigned long src_offset, int nr_items)
708 {
709 	struct btrfs_fs_info *fs_info = dst->fs_info;
710 	int ret = 0;
711 	struct tree_mod_elem **tm_list = NULL;
712 	struct tree_mod_elem **tm_list_add, **tm_list_rem;
713 	int i;
714 	int locked = 0;
715 
716 	if (!tree_mod_need_log(fs_info, NULL))
717 		return 0;
718 
719 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
720 		return 0;
721 
722 	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
723 			  GFP_NOFS);
724 	if (!tm_list)
725 		return -ENOMEM;
726 
727 	tm_list_add = tm_list;
728 	tm_list_rem = tm_list + nr_items;
729 	for (i = 0; i < nr_items; i++) {
730 		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
731 		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
732 		if (!tm_list_rem[i]) {
733 			ret = -ENOMEM;
734 			goto free_tms;
735 		}
736 
737 		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
738 		    MOD_LOG_KEY_ADD, GFP_NOFS);
739 		if (!tm_list_add[i]) {
740 			ret = -ENOMEM;
741 			goto free_tms;
742 		}
743 	}
744 
745 	if (tree_mod_dont_log(fs_info, NULL))
746 		goto free_tms;
747 	locked = 1;
748 
749 	for (i = 0; i < nr_items; i++) {
750 		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
751 		if (ret)
752 			goto free_tms;
753 		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
754 		if (ret)
755 			goto free_tms;
756 	}
757 
758 	write_unlock(&fs_info->tree_mod_log_lock);
759 	kfree(tm_list);
760 
761 	return 0;
762 
763 free_tms:
764 	for (i = 0; i < nr_items * 2; i++) {
765 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
766 			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
767 		kfree(tm_list[i]);
768 	}
769 	if (locked)
770 		write_unlock(&fs_info->tree_mod_log_lock);
771 	kfree(tm_list);
772 
773 	return ret;
774 }
775 
776 static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
777 {
778 	struct tree_mod_elem **tm_list = NULL;
779 	int nritems = 0;
780 	int i;
781 	int ret = 0;
782 
783 	if (btrfs_header_level(eb) == 0)
784 		return 0;
785 
786 	if (!tree_mod_need_log(eb->fs_info, NULL))
787 		return 0;
788 
789 	nritems = btrfs_header_nritems(eb);
790 	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
791 	if (!tm_list)
792 		return -ENOMEM;
793 
794 	for (i = 0; i < nritems; i++) {
795 		tm_list[i] = alloc_tree_mod_elem(eb, i,
796 		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
797 		if (!tm_list[i]) {
798 			ret = -ENOMEM;
799 			goto free_tms;
800 		}
801 	}
802 
803 	if (tree_mod_dont_log(eb->fs_info, eb))
804 		goto free_tms;
805 
806 	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
807 	write_unlock(&eb->fs_info->tree_mod_log_lock);
808 	if (ret)
809 		goto free_tms;
810 	kfree(tm_list);
811 
812 	return 0;
813 
814 free_tms:
815 	for (i = 0; i < nritems; i++)
816 		kfree(tm_list[i]);
817 	kfree(tm_list);
818 
819 	return ret;
820 }
821 
822 /*
823  * check if the tree block can be shared by multiple trees
824  */
825 int btrfs_block_can_be_shared(struct btrfs_root *root,
826 			      struct extent_buffer *buf)
827 {
828 	/*
829 	 * Tree blocks not in reference counted trees and tree roots
830 	 * are never shared. If a block was allocated after the last
831 	 * snapshot and the block was not allocated by tree relocation,
832 	 * we know the block is not shared.
833 	 */
834 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
835 	    buf != root->node && buf != root->commit_root &&
836 	    (btrfs_header_generation(buf) <=
837 	     btrfs_root_last_snapshot(&root->root_item) ||
838 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
839 		return 1;
840 
841 	return 0;
842 }
843 
844 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
845 				       struct btrfs_root *root,
846 				       struct extent_buffer *buf,
847 				       struct extent_buffer *cow,
848 				       int *last_ref)
849 {
850 	struct btrfs_fs_info *fs_info = root->fs_info;
851 	u64 refs;
852 	u64 owner;
853 	u64 flags;
854 	u64 new_flags = 0;
855 	int ret;
856 
857 	/*
858 	 * Backrefs update rules:
859 	 *
860 	 * Always use full backrefs for extent pointers in tree block
861 	 * allocated by tree relocation.
862 	 *
863 	 * If a shared tree block is no longer referenced by its owner
864 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
865 	 * use full backrefs for extent pointers in tree block.
866 	 *
867 	 * If a tree block is been relocating
868 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
869 	 * use full backrefs for extent pointers in tree block.
870 	 * The reason for this is some operations (such as drop tree)
871 	 * are only allowed for blocks use full backrefs.
872 	 */
873 
874 	if (btrfs_block_can_be_shared(root, buf)) {
875 		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
876 					       btrfs_header_level(buf), 1,
877 					       &refs, &flags);
878 		if (ret)
879 			return ret;
880 		if (refs == 0) {
881 			ret = -EROFS;
882 			btrfs_handle_fs_error(fs_info, ret, NULL);
883 			return ret;
884 		}
885 	} else {
886 		refs = 1;
887 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
888 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
889 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
890 		else
891 			flags = 0;
892 	}
893 
894 	owner = btrfs_header_owner(buf);
895 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
896 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
897 
898 	if (refs > 1) {
899 		if ((owner == root->root_key.objectid ||
900 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
901 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
902 			ret = btrfs_inc_ref(trans, root, buf, 1);
903 			if (ret)
904 				return ret;
905 
906 			if (root->root_key.objectid ==
907 			    BTRFS_TREE_RELOC_OBJECTID) {
908 				ret = btrfs_dec_ref(trans, root, buf, 0);
909 				if (ret)
910 					return ret;
911 				ret = btrfs_inc_ref(trans, root, cow, 1);
912 				if (ret)
913 					return ret;
914 			}
915 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
916 		} else {
917 
918 			if (root->root_key.objectid ==
919 			    BTRFS_TREE_RELOC_OBJECTID)
920 				ret = btrfs_inc_ref(trans, root, cow, 1);
921 			else
922 				ret = btrfs_inc_ref(trans, root, cow, 0);
923 			if (ret)
924 				return ret;
925 		}
926 		if (new_flags != 0) {
927 			int level = btrfs_header_level(buf);
928 
929 			ret = btrfs_set_disk_extent_flags(trans, buf,
930 							  new_flags, level, 0);
931 			if (ret)
932 				return ret;
933 		}
934 	} else {
935 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
936 			if (root->root_key.objectid ==
937 			    BTRFS_TREE_RELOC_OBJECTID)
938 				ret = btrfs_inc_ref(trans, root, cow, 1);
939 			else
940 				ret = btrfs_inc_ref(trans, root, cow, 0);
941 			if (ret)
942 				return ret;
943 			ret = btrfs_dec_ref(trans, root, buf, 1);
944 			if (ret)
945 				return ret;
946 		}
947 		btrfs_clean_tree_block(buf);
948 		*last_ref = 1;
949 	}
950 	return 0;
951 }
952 
953 static struct extent_buffer *alloc_tree_block_no_bg_flush(
954 					  struct btrfs_trans_handle *trans,
955 					  struct btrfs_root *root,
956 					  u64 parent_start,
957 					  const struct btrfs_disk_key *disk_key,
958 					  int level,
959 					  u64 hint,
960 					  u64 empty_size)
961 {
962 	struct btrfs_fs_info *fs_info = root->fs_info;
963 	struct extent_buffer *ret;
964 
965 	/*
966 	 * If we are COWing a node/leaf from the extent, chunk, device or free
967 	 * space trees, make sure that we do not finish block group creation of
968 	 * pending block groups. We do this to avoid a deadlock.
969 	 * COWing can result in allocation of a new chunk, and flushing pending
970 	 * block groups (btrfs_create_pending_block_groups()) can be triggered
971 	 * when finishing allocation of a new chunk. Creation of a pending block
972 	 * group modifies the extent, chunk, device and free space trees,
973 	 * therefore we could deadlock with ourselves since we are holding a
974 	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
975 	 * try to COW later.
976 	 * For similar reasons, we also need to delay flushing pending block
977 	 * groups when splitting a leaf or node, from one of those trees, since
978 	 * we are holding a write lock on it and its parent or when inserting a
979 	 * new root node for one of those trees.
980 	 */
981 	if (root == fs_info->extent_root ||
982 	    root == fs_info->chunk_root ||
983 	    root == fs_info->dev_root ||
984 	    root == fs_info->free_space_root)
985 		trans->can_flush_pending_bgs = false;
986 
987 	ret = btrfs_alloc_tree_block(trans, root, parent_start,
988 				     root->root_key.objectid, disk_key, level,
989 				     hint, empty_size);
990 	trans->can_flush_pending_bgs = true;
991 
992 	return ret;
993 }
994 
995 /*
996  * does the dirty work in cow of a single block.  The parent block (if
997  * supplied) is updated to point to the new cow copy.  The new buffer is marked
998  * dirty and returned locked.  If you modify the block it needs to be marked
999  * dirty again.
1000  *
1001  * search_start -- an allocation hint for the new block
1002  *
1003  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1004  * bytes the allocator should try to find free next to the block it returns.
1005  * This is just a hint and may be ignored by the allocator.
1006  */
1007 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1008 			     struct btrfs_root *root,
1009 			     struct extent_buffer *buf,
1010 			     struct extent_buffer *parent, int parent_slot,
1011 			     struct extent_buffer **cow_ret,
1012 			     u64 search_start, u64 empty_size)
1013 {
1014 	struct btrfs_fs_info *fs_info = root->fs_info;
1015 	struct btrfs_disk_key disk_key;
1016 	struct extent_buffer *cow;
1017 	int level, ret;
1018 	int last_ref = 0;
1019 	int unlock_orig = 0;
1020 	u64 parent_start = 0;
1021 
1022 	if (*cow_ret == buf)
1023 		unlock_orig = 1;
1024 
1025 	btrfs_assert_tree_locked(buf);
1026 
1027 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1028 		trans->transid != fs_info->running_transaction->transid);
1029 	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1030 		trans->transid != root->last_trans);
1031 
1032 	level = btrfs_header_level(buf);
1033 
1034 	if (level == 0)
1035 		btrfs_item_key(buf, &disk_key, 0);
1036 	else
1037 		btrfs_node_key(buf, &disk_key, 0);
1038 
1039 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1040 		parent_start = parent->start;
1041 
1042 	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
1043 					   level, search_start, empty_size);
1044 	if (IS_ERR(cow))
1045 		return PTR_ERR(cow);
1046 
1047 	/* cow is set to blocking by btrfs_init_new_buffer */
1048 
1049 	copy_extent_buffer_full(cow, buf);
1050 	btrfs_set_header_bytenr(cow, cow->start);
1051 	btrfs_set_header_generation(cow, trans->transid);
1052 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1053 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1054 				     BTRFS_HEADER_FLAG_RELOC);
1055 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1056 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1057 	else
1058 		btrfs_set_header_owner(cow, root->root_key.objectid);
1059 
1060 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
1061 
1062 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1063 	if (ret) {
1064 		btrfs_abort_transaction(trans, ret);
1065 		return ret;
1066 	}
1067 
1068 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1069 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1070 		if (ret) {
1071 			btrfs_abort_transaction(trans, ret);
1072 			return ret;
1073 		}
1074 	}
1075 
1076 	if (buf == root->node) {
1077 		WARN_ON(parent && parent != buf);
1078 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1079 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1080 			parent_start = buf->start;
1081 
1082 		atomic_inc(&cow->refs);
1083 		ret = tree_mod_log_insert_root(root->node, cow, 1);
1084 		BUG_ON(ret < 0);
1085 		rcu_assign_pointer(root->node, cow);
1086 
1087 		btrfs_free_tree_block(trans, root, buf, parent_start,
1088 				      last_ref);
1089 		free_extent_buffer(buf);
1090 		add_root_to_dirty_list(root);
1091 	} else {
1092 		WARN_ON(trans->transid != btrfs_header_generation(parent));
1093 		tree_mod_log_insert_key(parent, parent_slot,
1094 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1095 		btrfs_set_node_blockptr(parent, parent_slot,
1096 					cow->start);
1097 		btrfs_set_node_ptr_generation(parent, parent_slot,
1098 					      trans->transid);
1099 		btrfs_mark_buffer_dirty(parent);
1100 		if (last_ref) {
1101 			ret = tree_mod_log_free_eb(buf);
1102 			if (ret) {
1103 				btrfs_abort_transaction(trans, ret);
1104 				return ret;
1105 			}
1106 		}
1107 		btrfs_free_tree_block(trans, root, buf, parent_start,
1108 				      last_ref);
1109 	}
1110 	if (unlock_orig)
1111 		btrfs_tree_unlock(buf);
1112 	free_extent_buffer_stale(buf);
1113 	btrfs_mark_buffer_dirty(cow);
1114 	*cow_ret = cow;
1115 	return 0;
1116 }
1117 
1118 /*
1119  * returns the logical address of the oldest predecessor of the given root.
1120  * entries older than time_seq are ignored.
1121  */
1122 static struct tree_mod_elem *__tree_mod_log_oldest_root(
1123 		struct extent_buffer *eb_root, u64 time_seq)
1124 {
1125 	struct tree_mod_elem *tm;
1126 	struct tree_mod_elem *found = NULL;
1127 	u64 root_logical = eb_root->start;
1128 	int looped = 0;
1129 
1130 	if (!time_seq)
1131 		return NULL;
1132 
1133 	/*
1134 	 * the very last operation that's logged for a root is the
1135 	 * replacement operation (if it is replaced at all). this has
1136 	 * the logical address of the *new* root, making it the very
1137 	 * first operation that's logged for this root.
1138 	 */
1139 	while (1) {
1140 		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1141 						time_seq);
1142 		if (!looped && !tm)
1143 			return NULL;
1144 		/*
1145 		 * if there are no tree operation for the oldest root, we simply
1146 		 * return it. this should only happen if that (old) root is at
1147 		 * level 0.
1148 		 */
1149 		if (!tm)
1150 			break;
1151 
1152 		/*
1153 		 * if there's an operation that's not a root replacement, we
1154 		 * found the oldest version of our root. normally, we'll find a
1155 		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1156 		 */
1157 		if (tm->op != MOD_LOG_ROOT_REPLACE)
1158 			break;
1159 
1160 		found = tm;
1161 		root_logical = tm->old_root.logical;
1162 		looped = 1;
1163 	}
1164 
1165 	/* if there's no old root to return, return what we found instead */
1166 	if (!found)
1167 		found = tm;
1168 
1169 	return found;
1170 }
1171 
1172 /*
1173  * tm is a pointer to the first operation to rewind within eb. then, all
1174  * previous operations will be rewound (until we reach something older than
1175  * time_seq).
1176  */
1177 static void
1178 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1179 		      u64 time_seq, struct tree_mod_elem *first_tm)
1180 {
1181 	u32 n;
1182 	struct rb_node *next;
1183 	struct tree_mod_elem *tm = first_tm;
1184 	unsigned long o_dst;
1185 	unsigned long o_src;
1186 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
1187 
1188 	n = btrfs_header_nritems(eb);
1189 	read_lock(&fs_info->tree_mod_log_lock);
1190 	while (tm && tm->seq >= time_seq) {
1191 		/*
1192 		 * all the operations are recorded with the operator used for
1193 		 * the modification. as we're going backwards, we do the
1194 		 * opposite of each operation here.
1195 		 */
1196 		switch (tm->op) {
1197 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1198 			BUG_ON(tm->slot < n);
1199 			/* Fallthrough */
1200 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1201 		case MOD_LOG_KEY_REMOVE:
1202 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1203 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1204 			btrfs_set_node_ptr_generation(eb, tm->slot,
1205 						      tm->generation);
1206 			n++;
1207 			break;
1208 		case MOD_LOG_KEY_REPLACE:
1209 			BUG_ON(tm->slot >= n);
1210 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1211 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1212 			btrfs_set_node_ptr_generation(eb, tm->slot,
1213 						      tm->generation);
1214 			break;
1215 		case MOD_LOG_KEY_ADD:
1216 			/* if a move operation is needed it's in the log */
1217 			n--;
1218 			break;
1219 		case MOD_LOG_MOVE_KEYS:
1220 			o_dst = btrfs_node_key_ptr_offset(tm->slot);
1221 			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1222 			memmove_extent_buffer(eb, o_dst, o_src,
1223 					      tm->move.nr_items * p_size);
1224 			break;
1225 		case MOD_LOG_ROOT_REPLACE:
1226 			/*
1227 			 * this operation is special. for roots, this must be
1228 			 * handled explicitly before rewinding.
1229 			 * for non-roots, this operation may exist if the node
1230 			 * was a root: root A -> child B; then A gets empty and
1231 			 * B is promoted to the new root. in the mod log, we'll
1232 			 * have a root-replace operation for B, a tree block
1233 			 * that is no root. we simply ignore that operation.
1234 			 */
1235 			break;
1236 		}
1237 		next = rb_next(&tm->node);
1238 		if (!next)
1239 			break;
1240 		tm = rb_entry(next, struct tree_mod_elem, node);
1241 		if (tm->logical != first_tm->logical)
1242 			break;
1243 	}
1244 	read_unlock(&fs_info->tree_mod_log_lock);
1245 	btrfs_set_header_nritems(eb, n);
1246 }
1247 
1248 /*
1249  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1250  * is returned. If rewind operations happen, a fresh buffer is returned. The
1251  * returned buffer is always read-locked. If the returned buffer is not the
1252  * input buffer, the lock on the input buffer is released and the input buffer
1253  * is freed (its refcount is decremented).
1254  */
1255 static struct extent_buffer *
1256 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1257 		    struct extent_buffer *eb, u64 time_seq)
1258 {
1259 	struct extent_buffer *eb_rewin;
1260 	struct tree_mod_elem *tm;
1261 
1262 	if (!time_seq)
1263 		return eb;
1264 
1265 	if (btrfs_header_level(eb) == 0)
1266 		return eb;
1267 
1268 	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1269 	if (!tm)
1270 		return eb;
1271 
1272 	btrfs_set_path_blocking(path);
1273 	btrfs_set_lock_blocking_read(eb);
1274 
1275 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1276 		BUG_ON(tm->slot != 0);
1277 		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1278 		if (!eb_rewin) {
1279 			btrfs_tree_read_unlock_blocking(eb);
1280 			free_extent_buffer(eb);
1281 			return NULL;
1282 		}
1283 		btrfs_set_header_bytenr(eb_rewin, eb->start);
1284 		btrfs_set_header_backref_rev(eb_rewin,
1285 					     btrfs_header_backref_rev(eb));
1286 		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1287 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1288 	} else {
1289 		eb_rewin = btrfs_clone_extent_buffer(eb);
1290 		if (!eb_rewin) {
1291 			btrfs_tree_read_unlock_blocking(eb);
1292 			free_extent_buffer(eb);
1293 			return NULL;
1294 		}
1295 	}
1296 
1297 	btrfs_tree_read_unlock_blocking(eb);
1298 	free_extent_buffer(eb);
1299 
1300 	btrfs_tree_read_lock(eb_rewin);
1301 	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1302 	WARN_ON(btrfs_header_nritems(eb_rewin) >
1303 		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1304 
1305 	return eb_rewin;
1306 }
1307 
1308 /*
1309  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1310  * value. If there are no changes, the current root->root_node is returned. If
1311  * anything changed in between, there's a fresh buffer allocated on which the
1312  * rewind operations are done. In any case, the returned buffer is read locked.
1313  * Returns NULL on error (with no locks held).
1314  */
1315 static inline struct extent_buffer *
1316 get_old_root(struct btrfs_root *root, u64 time_seq)
1317 {
1318 	struct btrfs_fs_info *fs_info = root->fs_info;
1319 	struct tree_mod_elem *tm;
1320 	struct extent_buffer *eb = NULL;
1321 	struct extent_buffer *eb_root;
1322 	u64 eb_root_owner = 0;
1323 	struct extent_buffer *old;
1324 	struct tree_mod_root *old_root = NULL;
1325 	u64 old_generation = 0;
1326 	u64 logical;
1327 	int level;
1328 
1329 	eb_root = btrfs_read_lock_root_node(root);
1330 	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1331 	if (!tm)
1332 		return eb_root;
1333 
1334 	if (tm->op == MOD_LOG_ROOT_REPLACE) {
1335 		old_root = &tm->old_root;
1336 		old_generation = tm->generation;
1337 		logical = old_root->logical;
1338 		level = old_root->level;
1339 	} else {
1340 		logical = eb_root->start;
1341 		level = btrfs_header_level(eb_root);
1342 	}
1343 
1344 	tm = tree_mod_log_search(fs_info, logical, time_seq);
1345 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1346 		btrfs_tree_read_unlock(eb_root);
1347 		free_extent_buffer(eb_root);
1348 		old = read_tree_block(fs_info, logical, 0, level, NULL);
1349 		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1350 			if (!IS_ERR(old))
1351 				free_extent_buffer(old);
1352 			btrfs_warn(fs_info,
1353 				   "failed to read tree block %llu from get_old_root",
1354 				   logical);
1355 		} else {
1356 			eb = btrfs_clone_extent_buffer(old);
1357 			free_extent_buffer(old);
1358 		}
1359 	} else if (old_root) {
1360 		eb_root_owner = btrfs_header_owner(eb_root);
1361 		btrfs_tree_read_unlock(eb_root);
1362 		free_extent_buffer(eb_root);
1363 		eb = alloc_dummy_extent_buffer(fs_info, logical);
1364 	} else {
1365 		btrfs_set_lock_blocking_read(eb_root);
1366 		eb = btrfs_clone_extent_buffer(eb_root);
1367 		btrfs_tree_read_unlock_blocking(eb_root);
1368 		free_extent_buffer(eb_root);
1369 	}
1370 
1371 	if (!eb)
1372 		return NULL;
1373 	btrfs_tree_read_lock(eb);
1374 	if (old_root) {
1375 		btrfs_set_header_bytenr(eb, eb->start);
1376 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1377 		btrfs_set_header_owner(eb, eb_root_owner);
1378 		btrfs_set_header_level(eb, old_root->level);
1379 		btrfs_set_header_generation(eb, old_generation);
1380 	}
1381 	if (tm)
1382 		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1383 	else
1384 		WARN_ON(btrfs_header_level(eb) != 0);
1385 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1386 
1387 	return eb;
1388 }
1389 
1390 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1391 {
1392 	struct tree_mod_elem *tm;
1393 	int level;
1394 	struct extent_buffer *eb_root = btrfs_root_node(root);
1395 
1396 	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1397 	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1398 		level = tm->old_root.level;
1399 	} else {
1400 		level = btrfs_header_level(eb_root);
1401 	}
1402 	free_extent_buffer(eb_root);
1403 
1404 	return level;
1405 }
1406 
1407 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1408 				   struct btrfs_root *root,
1409 				   struct extent_buffer *buf)
1410 {
1411 	if (btrfs_is_testing(root->fs_info))
1412 		return 0;
1413 
1414 	/* Ensure we can see the FORCE_COW bit */
1415 	smp_mb__before_atomic();
1416 
1417 	/*
1418 	 * We do not need to cow a block if
1419 	 * 1) this block is not created or changed in this transaction;
1420 	 * 2) this block does not belong to TREE_RELOC tree;
1421 	 * 3) the root is not forced COW.
1422 	 *
1423 	 * What is forced COW:
1424 	 *    when we create snapshot during committing the transaction,
1425 	 *    after we've finished copying src root, we must COW the shared
1426 	 *    block to ensure the metadata consistency.
1427 	 */
1428 	if (btrfs_header_generation(buf) == trans->transid &&
1429 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1430 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1431 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1432 	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1433 		return 0;
1434 	return 1;
1435 }
1436 
1437 /*
1438  * cows a single block, see __btrfs_cow_block for the real work.
1439  * This version of it has extra checks so that a block isn't COWed more than
1440  * once per transaction, as long as it hasn't been written yet
1441  */
1442 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1443 		    struct btrfs_root *root, struct extent_buffer *buf,
1444 		    struct extent_buffer *parent, int parent_slot,
1445 		    struct extent_buffer **cow_ret)
1446 {
1447 	struct btrfs_fs_info *fs_info = root->fs_info;
1448 	u64 search_start;
1449 	int ret;
1450 
1451 	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1452 		btrfs_err(fs_info,
1453 			"COW'ing blocks on a fs root that's being dropped");
1454 
1455 	if (trans->transaction != fs_info->running_transaction)
1456 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1457 		       trans->transid,
1458 		       fs_info->running_transaction->transid);
1459 
1460 	if (trans->transid != fs_info->generation)
1461 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1462 		       trans->transid, fs_info->generation);
1463 
1464 	if (!should_cow_block(trans, root, buf)) {
1465 		trans->dirty = true;
1466 		*cow_ret = buf;
1467 		return 0;
1468 	}
1469 
1470 	search_start = buf->start & ~((u64)SZ_1G - 1);
1471 
1472 	if (parent)
1473 		btrfs_set_lock_blocking_write(parent);
1474 	btrfs_set_lock_blocking_write(buf);
1475 
1476 	/*
1477 	 * Before CoWing this block for later modification, check if it's
1478 	 * the subtree root and do the delayed subtree trace if needed.
1479 	 *
1480 	 * Also We don't care about the error, as it's handled internally.
1481 	 */
1482 	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1483 	ret = __btrfs_cow_block(trans, root, buf, parent,
1484 				 parent_slot, cow_ret, search_start, 0);
1485 
1486 	trace_btrfs_cow_block(root, buf, *cow_ret);
1487 
1488 	return ret;
1489 }
1490 
1491 /*
1492  * helper function for defrag to decide if two blocks pointed to by a
1493  * node are actually close by
1494  */
1495 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1496 {
1497 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1498 		return 1;
1499 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1500 		return 1;
1501 	return 0;
1502 }
1503 
1504 /*
1505  * compare two keys in a memcmp fashion
1506  */
1507 static int comp_keys(const struct btrfs_disk_key *disk,
1508 		     const struct btrfs_key *k2)
1509 {
1510 	struct btrfs_key k1;
1511 
1512 	btrfs_disk_key_to_cpu(&k1, disk);
1513 
1514 	return btrfs_comp_cpu_keys(&k1, k2);
1515 }
1516 
1517 /*
1518  * same as comp_keys only with two btrfs_key's
1519  */
1520 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1521 {
1522 	if (k1->objectid > k2->objectid)
1523 		return 1;
1524 	if (k1->objectid < k2->objectid)
1525 		return -1;
1526 	if (k1->type > k2->type)
1527 		return 1;
1528 	if (k1->type < k2->type)
1529 		return -1;
1530 	if (k1->offset > k2->offset)
1531 		return 1;
1532 	if (k1->offset < k2->offset)
1533 		return -1;
1534 	return 0;
1535 }
1536 
1537 /*
1538  * this is used by the defrag code to go through all the
1539  * leaves pointed to by a node and reallocate them so that
1540  * disk order is close to key order
1541  */
1542 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1543 		       struct btrfs_root *root, struct extent_buffer *parent,
1544 		       int start_slot, u64 *last_ret,
1545 		       struct btrfs_key *progress)
1546 {
1547 	struct btrfs_fs_info *fs_info = root->fs_info;
1548 	struct extent_buffer *cur;
1549 	u64 blocknr;
1550 	u64 gen;
1551 	u64 search_start = *last_ret;
1552 	u64 last_block = 0;
1553 	u64 other;
1554 	u32 parent_nritems;
1555 	int end_slot;
1556 	int i;
1557 	int err = 0;
1558 	int parent_level;
1559 	int uptodate;
1560 	u32 blocksize;
1561 	int progress_passed = 0;
1562 	struct btrfs_disk_key disk_key;
1563 
1564 	parent_level = btrfs_header_level(parent);
1565 
1566 	WARN_ON(trans->transaction != fs_info->running_transaction);
1567 	WARN_ON(trans->transid != fs_info->generation);
1568 
1569 	parent_nritems = btrfs_header_nritems(parent);
1570 	blocksize = fs_info->nodesize;
1571 	end_slot = parent_nritems - 1;
1572 
1573 	if (parent_nritems <= 1)
1574 		return 0;
1575 
1576 	btrfs_set_lock_blocking_write(parent);
1577 
1578 	for (i = start_slot; i <= end_slot; i++) {
1579 		struct btrfs_key first_key;
1580 		int close = 1;
1581 
1582 		btrfs_node_key(parent, &disk_key, i);
1583 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1584 			continue;
1585 
1586 		progress_passed = 1;
1587 		blocknr = btrfs_node_blockptr(parent, i);
1588 		gen = btrfs_node_ptr_generation(parent, i);
1589 		btrfs_node_key_to_cpu(parent, &first_key, i);
1590 		if (last_block == 0)
1591 			last_block = blocknr;
1592 
1593 		if (i > 0) {
1594 			other = btrfs_node_blockptr(parent, i - 1);
1595 			close = close_blocks(blocknr, other, blocksize);
1596 		}
1597 		if (!close && i < end_slot) {
1598 			other = btrfs_node_blockptr(parent, i + 1);
1599 			close = close_blocks(blocknr, other, blocksize);
1600 		}
1601 		if (close) {
1602 			last_block = blocknr;
1603 			continue;
1604 		}
1605 
1606 		cur = find_extent_buffer(fs_info, blocknr);
1607 		if (cur)
1608 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1609 		else
1610 			uptodate = 0;
1611 		if (!cur || !uptodate) {
1612 			if (!cur) {
1613 				cur = read_tree_block(fs_info, blocknr, gen,
1614 						      parent_level - 1,
1615 						      &first_key);
1616 				if (IS_ERR(cur)) {
1617 					return PTR_ERR(cur);
1618 				} else if (!extent_buffer_uptodate(cur)) {
1619 					free_extent_buffer(cur);
1620 					return -EIO;
1621 				}
1622 			} else if (!uptodate) {
1623 				err = btrfs_read_buffer(cur, gen,
1624 						parent_level - 1,&first_key);
1625 				if (err) {
1626 					free_extent_buffer(cur);
1627 					return err;
1628 				}
1629 			}
1630 		}
1631 		if (search_start == 0)
1632 			search_start = last_block;
1633 
1634 		btrfs_tree_lock(cur);
1635 		btrfs_set_lock_blocking_write(cur);
1636 		err = __btrfs_cow_block(trans, root, cur, parent, i,
1637 					&cur, search_start,
1638 					min(16 * blocksize,
1639 					    (end_slot - i) * blocksize));
1640 		if (err) {
1641 			btrfs_tree_unlock(cur);
1642 			free_extent_buffer(cur);
1643 			break;
1644 		}
1645 		search_start = cur->start;
1646 		last_block = cur->start;
1647 		*last_ret = search_start;
1648 		btrfs_tree_unlock(cur);
1649 		free_extent_buffer(cur);
1650 	}
1651 	return err;
1652 }
1653 
1654 /*
1655  * search for key in the extent_buffer.  The items start at offset p,
1656  * and they are item_size apart.  There are 'max' items in p.
1657  *
1658  * the slot in the array is returned via slot, and it points to
1659  * the place where you would insert key if it is not found in
1660  * the array.
1661  *
1662  * slot may point to max if the key is bigger than all of the keys
1663  */
1664 static noinline int generic_bin_search(struct extent_buffer *eb,
1665 				       unsigned long p, int item_size,
1666 				       const struct btrfs_key *key,
1667 				       int max, int *slot)
1668 {
1669 	int low = 0;
1670 	int high = max;
1671 	int mid;
1672 	int ret;
1673 	struct btrfs_disk_key *tmp = NULL;
1674 	struct btrfs_disk_key unaligned;
1675 	unsigned long offset;
1676 	char *kaddr = NULL;
1677 	unsigned long map_start = 0;
1678 	unsigned long map_len = 0;
1679 	int err;
1680 
1681 	if (low > high) {
1682 		btrfs_err(eb->fs_info,
1683 		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1684 			  __func__, low, high, eb->start,
1685 			  btrfs_header_owner(eb), btrfs_header_level(eb));
1686 		return -EINVAL;
1687 	}
1688 
1689 	while (low < high) {
1690 		mid = (low + high) / 2;
1691 		offset = p + mid * item_size;
1692 
1693 		if (!kaddr || offset < map_start ||
1694 		    (offset + sizeof(struct btrfs_disk_key)) >
1695 		    map_start + map_len) {
1696 
1697 			err = map_private_extent_buffer(eb, offset,
1698 						sizeof(struct btrfs_disk_key),
1699 						&kaddr, &map_start, &map_len);
1700 
1701 			if (!err) {
1702 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
1703 							map_start);
1704 			} else if (err == 1) {
1705 				read_extent_buffer(eb, &unaligned,
1706 						   offset, sizeof(unaligned));
1707 				tmp = &unaligned;
1708 			} else {
1709 				return err;
1710 			}
1711 
1712 		} else {
1713 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
1714 							map_start);
1715 		}
1716 		ret = comp_keys(tmp, key);
1717 
1718 		if (ret < 0)
1719 			low = mid + 1;
1720 		else if (ret > 0)
1721 			high = mid;
1722 		else {
1723 			*slot = mid;
1724 			return 0;
1725 		}
1726 	}
1727 	*slot = low;
1728 	return 1;
1729 }
1730 
1731 /*
1732  * simple bin_search frontend that does the right thing for
1733  * leaves vs nodes
1734  */
1735 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1736 		     int level, int *slot)
1737 {
1738 	if (level == 0)
1739 		return generic_bin_search(eb,
1740 					  offsetof(struct btrfs_leaf, items),
1741 					  sizeof(struct btrfs_item),
1742 					  key, btrfs_header_nritems(eb),
1743 					  slot);
1744 	else
1745 		return generic_bin_search(eb,
1746 					  offsetof(struct btrfs_node, ptrs),
1747 					  sizeof(struct btrfs_key_ptr),
1748 					  key, btrfs_header_nritems(eb),
1749 					  slot);
1750 }
1751 
1752 static void root_add_used(struct btrfs_root *root, u32 size)
1753 {
1754 	spin_lock(&root->accounting_lock);
1755 	btrfs_set_root_used(&root->root_item,
1756 			    btrfs_root_used(&root->root_item) + size);
1757 	spin_unlock(&root->accounting_lock);
1758 }
1759 
1760 static void root_sub_used(struct btrfs_root *root, u32 size)
1761 {
1762 	spin_lock(&root->accounting_lock);
1763 	btrfs_set_root_used(&root->root_item,
1764 			    btrfs_root_used(&root->root_item) - size);
1765 	spin_unlock(&root->accounting_lock);
1766 }
1767 
1768 /* given a node and slot number, this reads the blocks it points to.  The
1769  * extent buffer is returned with a reference taken (but unlocked).
1770  */
1771 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1772 					   int slot)
1773 {
1774 	int level = btrfs_header_level(parent);
1775 	struct extent_buffer *eb;
1776 	struct btrfs_key first_key;
1777 
1778 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
1779 		return ERR_PTR(-ENOENT);
1780 
1781 	BUG_ON(level == 0);
1782 
1783 	btrfs_node_key_to_cpu(parent, &first_key, slot);
1784 	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1785 			     btrfs_node_ptr_generation(parent, slot),
1786 			     level - 1, &first_key);
1787 	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1788 		free_extent_buffer(eb);
1789 		eb = ERR_PTR(-EIO);
1790 	}
1791 
1792 	return eb;
1793 }
1794 
1795 /*
1796  * node level balancing, used to make sure nodes are in proper order for
1797  * item deletion.  We balance from the top down, so we have to make sure
1798  * that a deletion won't leave an node completely empty later on.
1799  */
1800 static noinline int balance_level(struct btrfs_trans_handle *trans,
1801 			 struct btrfs_root *root,
1802 			 struct btrfs_path *path, int level)
1803 {
1804 	struct btrfs_fs_info *fs_info = root->fs_info;
1805 	struct extent_buffer *right = NULL;
1806 	struct extent_buffer *mid;
1807 	struct extent_buffer *left = NULL;
1808 	struct extent_buffer *parent = NULL;
1809 	int ret = 0;
1810 	int wret;
1811 	int pslot;
1812 	int orig_slot = path->slots[level];
1813 	u64 orig_ptr;
1814 
1815 	ASSERT(level > 0);
1816 
1817 	mid = path->nodes[level];
1818 
1819 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1820 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1821 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1822 
1823 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1824 
1825 	if (level < BTRFS_MAX_LEVEL - 1) {
1826 		parent = path->nodes[level + 1];
1827 		pslot = path->slots[level + 1];
1828 	}
1829 
1830 	/*
1831 	 * deal with the case where there is only one pointer in the root
1832 	 * by promoting the node below to a root
1833 	 */
1834 	if (!parent) {
1835 		struct extent_buffer *child;
1836 
1837 		if (btrfs_header_nritems(mid) != 1)
1838 			return 0;
1839 
1840 		/* promote the child to a root */
1841 		child = btrfs_read_node_slot(mid, 0);
1842 		if (IS_ERR(child)) {
1843 			ret = PTR_ERR(child);
1844 			btrfs_handle_fs_error(fs_info, ret, NULL);
1845 			goto enospc;
1846 		}
1847 
1848 		btrfs_tree_lock(child);
1849 		btrfs_set_lock_blocking_write(child);
1850 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1851 		if (ret) {
1852 			btrfs_tree_unlock(child);
1853 			free_extent_buffer(child);
1854 			goto enospc;
1855 		}
1856 
1857 		ret = tree_mod_log_insert_root(root->node, child, 1);
1858 		BUG_ON(ret < 0);
1859 		rcu_assign_pointer(root->node, child);
1860 
1861 		add_root_to_dirty_list(root);
1862 		btrfs_tree_unlock(child);
1863 
1864 		path->locks[level] = 0;
1865 		path->nodes[level] = NULL;
1866 		btrfs_clean_tree_block(mid);
1867 		btrfs_tree_unlock(mid);
1868 		/* once for the path */
1869 		free_extent_buffer(mid);
1870 
1871 		root_sub_used(root, mid->len);
1872 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1873 		/* once for the root ptr */
1874 		free_extent_buffer_stale(mid);
1875 		return 0;
1876 	}
1877 	if (btrfs_header_nritems(mid) >
1878 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1879 		return 0;
1880 
1881 	left = btrfs_read_node_slot(parent, pslot - 1);
1882 	if (IS_ERR(left))
1883 		left = NULL;
1884 
1885 	if (left) {
1886 		btrfs_tree_lock(left);
1887 		btrfs_set_lock_blocking_write(left);
1888 		wret = btrfs_cow_block(trans, root, left,
1889 				       parent, pslot - 1, &left);
1890 		if (wret) {
1891 			ret = wret;
1892 			goto enospc;
1893 		}
1894 	}
1895 
1896 	right = btrfs_read_node_slot(parent, pslot + 1);
1897 	if (IS_ERR(right))
1898 		right = NULL;
1899 
1900 	if (right) {
1901 		btrfs_tree_lock(right);
1902 		btrfs_set_lock_blocking_write(right);
1903 		wret = btrfs_cow_block(trans, root, right,
1904 				       parent, pslot + 1, &right);
1905 		if (wret) {
1906 			ret = wret;
1907 			goto enospc;
1908 		}
1909 	}
1910 
1911 	/* first, try to make some room in the middle buffer */
1912 	if (left) {
1913 		orig_slot += btrfs_header_nritems(left);
1914 		wret = push_node_left(trans, left, mid, 1);
1915 		if (wret < 0)
1916 			ret = wret;
1917 	}
1918 
1919 	/*
1920 	 * then try to empty the right most buffer into the middle
1921 	 */
1922 	if (right) {
1923 		wret = push_node_left(trans, mid, right, 1);
1924 		if (wret < 0 && wret != -ENOSPC)
1925 			ret = wret;
1926 		if (btrfs_header_nritems(right) == 0) {
1927 			btrfs_clean_tree_block(right);
1928 			btrfs_tree_unlock(right);
1929 			del_ptr(root, path, level + 1, pslot + 1);
1930 			root_sub_used(root, right->len);
1931 			btrfs_free_tree_block(trans, root, right, 0, 1);
1932 			free_extent_buffer_stale(right);
1933 			right = NULL;
1934 		} else {
1935 			struct btrfs_disk_key right_key;
1936 			btrfs_node_key(right, &right_key, 0);
1937 			ret = tree_mod_log_insert_key(parent, pslot + 1,
1938 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1939 			BUG_ON(ret < 0);
1940 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1941 			btrfs_mark_buffer_dirty(parent);
1942 		}
1943 	}
1944 	if (btrfs_header_nritems(mid) == 1) {
1945 		/*
1946 		 * we're not allowed to leave a node with one item in the
1947 		 * tree during a delete.  A deletion from lower in the tree
1948 		 * could try to delete the only pointer in this node.
1949 		 * So, pull some keys from the left.
1950 		 * There has to be a left pointer at this point because
1951 		 * otherwise we would have pulled some pointers from the
1952 		 * right
1953 		 */
1954 		if (!left) {
1955 			ret = -EROFS;
1956 			btrfs_handle_fs_error(fs_info, ret, NULL);
1957 			goto enospc;
1958 		}
1959 		wret = balance_node_right(trans, mid, left);
1960 		if (wret < 0) {
1961 			ret = wret;
1962 			goto enospc;
1963 		}
1964 		if (wret == 1) {
1965 			wret = push_node_left(trans, left, mid, 1);
1966 			if (wret < 0)
1967 				ret = wret;
1968 		}
1969 		BUG_ON(wret == 1);
1970 	}
1971 	if (btrfs_header_nritems(mid) == 0) {
1972 		btrfs_clean_tree_block(mid);
1973 		btrfs_tree_unlock(mid);
1974 		del_ptr(root, path, level + 1, pslot);
1975 		root_sub_used(root, mid->len);
1976 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1977 		free_extent_buffer_stale(mid);
1978 		mid = NULL;
1979 	} else {
1980 		/* update the parent key to reflect our changes */
1981 		struct btrfs_disk_key mid_key;
1982 		btrfs_node_key(mid, &mid_key, 0);
1983 		ret = tree_mod_log_insert_key(parent, pslot,
1984 				MOD_LOG_KEY_REPLACE, GFP_NOFS);
1985 		BUG_ON(ret < 0);
1986 		btrfs_set_node_key(parent, &mid_key, pslot);
1987 		btrfs_mark_buffer_dirty(parent);
1988 	}
1989 
1990 	/* update the path */
1991 	if (left) {
1992 		if (btrfs_header_nritems(left) > orig_slot) {
1993 			atomic_inc(&left->refs);
1994 			/* left was locked after cow */
1995 			path->nodes[level] = left;
1996 			path->slots[level + 1] -= 1;
1997 			path->slots[level] = orig_slot;
1998 			if (mid) {
1999 				btrfs_tree_unlock(mid);
2000 				free_extent_buffer(mid);
2001 			}
2002 		} else {
2003 			orig_slot -= btrfs_header_nritems(left);
2004 			path->slots[level] = orig_slot;
2005 		}
2006 	}
2007 	/* double check we haven't messed things up */
2008 	if (orig_ptr !=
2009 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2010 		BUG();
2011 enospc:
2012 	if (right) {
2013 		btrfs_tree_unlock(right);
2014 		free_extent_buffer(right);
2015 	}
2016 	if (left) {
2017 		if (path->nodes[level] != left)
2018 			btrfs_tree_unlock(left);
2019 		free_extent_buffer(left);
2020 	}
2021 	return ret;
2022 }
2023 
2024 /* Node balancing for insertion.  Here we only split or push nodes around
2025  * when they are completely full.  This is also done top down, so we
2026  * have to be pessimistic.
2027  */
2028 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2029 					  struct btrfs_root *root,
2030 					  struct btrfs_path *path, int level)
2031 {
2032 	struct btrfs_fs_info *fs_info = root->fs_info;
2033 	struct extent_buffer *right = NULL;
2034 	struct extent_buffer *mid;
2035 	struct extent_buffer *left = NULL;
2036 	struct extent_buffer *parent = NULL;
2037 	int ret = 0;
2038 	int wret;
2039 	int pslot;
2040 	int orig_slot = path->slots[level];
2041 
2042 	if (level == 0)
2043 		return 1;
2044 
2045 	mid = path->nodes[level];
2046 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2047 
2048 	if (level < BTRFS_MAX_LEVEL - 1) {
2049 		parent = path->nodes[level + 1];
2050 		pslot = path->slots[level + 1];
2051 	}
2052 
2053 	if (!parent)
2054 		return 1;
2055 
2056 	left = btrfs_read_node_slot(parent, pslot - 1);
2057 	if (IS_ERR(left))
2058 		left = NULL;
2059 
2060 	/* first, try to make some room in the middle buffer */
2061 	if (left) {
2062 		u32 left_nr;
2063 
2064 		btrfs_tree_lock(left);
2065 		btrfs_set_lock_blocking_write(left);
2066 
2067 		left_nr = btrfs_header_nritems(left);
2068 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2069 			wret = 1;
2070 		} else {
2071 			ret = btrfs_cow_block(trans, root, left, parent,
2072 					      pslot - 1, &left);
2073 			if (ret)
2074 				wret = 1;
2075 			else {
2076 				wret = push_node_left(trans, left, mid, 0);
2077 			}
2078 		}
2079 		if (wret < 0)
2080 			ret = wret;
2081 		if (wret == 0) {
2082 			struct btrfs_disk_key disk_key;
2083 			orig_slot += left_nr;
2084 			btrfs_node_key(mid, &disk_key, 0);
2085 			ret = tree_mod_log_insert_key(parent, pslot,
2086 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2087 			BUG_ON(ret < 0);
2088 			btrfs_set_node_key(parent, &disk_key, pslot);
2089 			btrfs_mark_buffer_dirty(parent);
2090 			if (btrfs_header_nritems(left) > orig_slot) {
2091 				path->nodes[level] = left;
2092 				path->slots[level + 1] -= 1;
2093 				path->slots[level] = orig_slot;
2094 				btrfs_tree_unlock(mid);
2095 				free_extent_buffer(mid);
2096 			} else {
2097 				orig_slot -=
2098 					btrfs_header_nritems(left);
2099 				path->slots[level] = orig_slot;
2100 				btrfs_tree_unlock(left);
2101 				free_extent_buffer(left);
2102 			}
2103 			return 0;
2104 		}
2105 		btrfs_tree_unlock(left);
2106 		free_extent_buffer(left);
2107 	}
2108 	right = btrfs_read_node_slot(parent, pslot + 1);
2109 	if (IS_ERR(right))
2110 		right = NULL;
2111 
2112 	/*
2113 	 * then try to empty the right most buffer into the middle
2114 	 */
2115 	if (right) {
2116 		u32 right_nr;
2117 
2118 		btrfs_tree_lock(right);
2119 		btrfs_set_lock_blocking_write(right);
2120 
2121 		right_nr = btrfs_header_nritems(right);
2122 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2123 			wret = 1;
2124 		} else {
2125 			ret = btrfs_cow_block(trans, root, right,
2126 					      parent, pslot + 1,
2127 					      &right);
2128 			if (ret)
2129 				wret = 1;
2130 			else {
2131 				wret = balance_node_right(trans, right, mid);
2132 			}
2133 		}
2134 		if (wret < 0)
2135 			ret = wret;
2136 		if (wret == 0) {
2137 			struct btrfs_disk_key disk_key;
2138 
2139 			btrfs_node_key(right, &disk_key, 0);
2140 			ret = tree_mod_log_insert_key(parent, pslot + 1,
2141 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2142 			BUG_ON(ret < 0);
2143 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2144 			btrfs_mark_buffer_dirty(parent);
2145 
2146 			if (btrfs_header_nritems(mid) <= orig_slot) {
2147 				path->nodes[level] = right;
2148 				path->slots[level + 1] += 1;
2149 				path->slots[level] = orig_slot -
2150 					btrfs_header_nritems(mid);
2151 				btrfs_tree_unlock(mid);
2152 				free_extent_buffer(mid);
2153 			} else {
2154 				btrfs_tree_unlock(right);
2155 				free_extent_buffer(right);
2156 			}
2157 			return 0;
2158 		}
2159 		btrfs_tree_unlock(right);
2160 		free_extent_buffer(right);
2161 	}
2162 	return 1;
2163 }
2164 
2165 /*
2166  * readahead one full node of leaves, finding things that are close
2167  * to the block in 'slot', and triggering ra on them.
2168  */
2169 static void reada_for_search(struct btrfs_fs_info *fs_info,
2170 			     struct btrfs_path *path,
2171 			     int level, int slot, u64 objectid)
2172 {
2173 	struct extent_buffer *node;
2174 	struct btrfs_disk_key disk_key;
2175 	u32 nritems;
2176 	u64 search;
2177 	u64 target;
2178 	u64 nread = 0;
2179 	struct extent_buffer *eb;
2180 	u32 nr;
2181 	u32 blocksize;
2182 	u32 nscan = 0;
2183 
2184 	if (level != 1)
2185 		return;
2186 
2187 	if (!path->nodes[level])
2188 		return;
2189 
2190 	node = path->nodes[level];
2191 
2192 	search = btrfs_node_blockptr(node, slot);
2193 	blocksize = fs_info->nodesize;
2194 	eb = find_extent_buffer(fs_info, search);
2195 	if (eb) {
2196 		free_extent_buffer(eb);
2197 		return;
2198 	}
2199 
2200 	target = search;
2201 
2202 	nritems = btrfs_header_nritems(node);
2203 	nr = slot;
2204 
2205 	while (1) {
2206 		if (path->reada == READA_BACK) {
2207 			if (nr == 0)
2208 				break;
2209 			nr--;
2210 		} else if (path->reada == READA_FORWARD) {
2211 			nr++;
2212 			if (nr >= nritems)
2213 				break;
2214 		}
2215 		if (path->reada == READA_BACK && objectid) {
2216 			btrfs_node_key(node, &disk_key, nr);
2217 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2218 				break;
2219 		}
2220 		search = btrfs_node_blockptr(node, nr);
2221 		if ((search <= target && target - search <= 65536) ||
2222 		    (search > target && search - target <= 65536)) {
2223 			readahead_tree_block(fs_info, search);
2224 			nread += blocksize;
2225 		}
2226 		nscan++;
2227 		if ((nread > 65536 || nscan > 32))
2228 			break;
2229 	}
2230 }
2231 
2232 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2233 				       struct btrfs_path *path, int level)
2234 {
2235 	int slot;
2236 	int nritems;
2237 	struct extent_buffer *parent;
2238 	struct extent_buffer *eb;
2239 	u64 gen;
2240 	u64 block1 = 0;
2241 	u64 block2 = 0;
2242 
2243 	parent = path->nodes[level + 1];
2244 	if (!parent)
2245 		return;
2246 
2247 	nritems = btrfs_header_nritems(parent);
2248 	slot = path->slots[level + 1];
2249 
2250 	if (slot > 0) {
2251 		block1 = btrfs_node_blockptr(parent, slot - 1);
2252 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2253 		eb = find_extent_buffer(fs_info, block1);
2254 		/*
2255 		 * if we get -eagain from btrfs_buffer_uptodate, we
2256 		 * don't want to return eagain here.  That will loop
2257 		 * forever
2258 		 */
2259 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2260 			block1 = 0;
2261 		free_extent_buffer(eb);
2262 	}
2263 	if (slot + 1 < nritems) {
2264 		block2 = btrfs_node_blockptr(parent, slot + 1);
2265 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2266 		eb = find_extent_buffer(fs_info, block2);
2267 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2268 			block2 = 0;
2269 		free_extent_buffer(eb);
2270 	}
2271 
2272 	if (block1)
2273 		readahead_tree_block(fs_info, block1);
2274 	if (block2)
2275 		readahead_tree_block(fs_info, block2);
2276 }
2277 
2278 
2279 /*
2280  * when we walk down the tree, it is usually safe to unlock the higher layers
2281  * in the tree.  The exceptions are when our path goes through slot 0, because
2282  * operations on the tree might require changing key pointers higher up in the
2283  * tree.
2284  *
2285  * callers might also have set path->keep_locks, which tells this code to keep
2286  * the lock if the path points to the last slot in the block.  This is part of
2287  * walking through the tree, and selecting the next slot in the higher block.
2288  *
2289  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2290  * if lowest_unlock is 1, level 0 won't be unlocked
2291  */
2292 static noinline void unlock_up(struct btrfs_path *path, int level,
2293 			       int lowest_unlock, int min_write_lock_level,
2294 			       int *write_lock_level)
2295 {
2296 	int i;
2297 	int skip_level = level;
2298 	int no_skips = 0;
2299 	struct extent_buffer *t;
2300 
2301 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2302 		if (!path->nodes[i])
2303 			break;
2304 		if (!path->locks[i])
2305 			break;
2306 		if (!no_skips && path->slots[i] == 0) {
2307 			skip_level = i + 1;
2308 			continue;
2309 		}
2310 		if (!no_skips && path->keep_locks) {
2311 			u32 nritems;
2312 			t = path->nodes[i];
2313 			nritems = btrfs_header_nritems(t);
2314 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2315 				skip_level = i + 1;
2316 				continue;
2317 			}
2318 		}
2319 		if (skip_level < i && i >= lowest_unlock)
2320 			no_skips = 1;
2321 
2322 		t = path->nodes[i];
2323 		if (i >= lowest_unlock && i > skip_level) {
2324 			btrfs_tree_unlock_rw(t, path->locks[i]);
2325 			path->locks[i] = 0;
2326 			if (write_lock_level &&
2327 			    i > min_write_lock_level &&
2328 			    i <= *write_lock_level) {
2329 				*write_lock_level = i - 1;
2330 			}
2331 		}
2332 	}
2333 }
2334 
2335 /*
2336  * helper function for btrfs_search_slot.  The goal is to find a block
2337  * in cache without setting the path to blocking.  If we find the block
2338  * we return zero and the path is unchanged.
2339  *
2340  * If we can't find the block, we set the path blocking and do some
2341  * reada.  -EAGAIN is returned and the search must be repeated.
2342  */
2343 static int
2344 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2345 		      struct extent_buffer **eb_ret, int level, int slot,
2346 		      const struct btrfs_key *key)
2347 {
2348 	struct btrfs_fs_info *fs_info = root->fs_info;
2349 	u64 blocknr;
2350 	u64 gen;
2351 	struct extent_buffer *b = *eb_ret;
2352 	struct extent_buffer *tmp;
2353 	struct btrfs_key first_key;
2354 	int ret;
2355 	int parent_level;
2356 
2357 	blocknr = btrfs_node_blockptr(b, slot);
2358 	gen = btrfs_node_ptr_generation(b, slot);
2359 	parent_level = btrfs_header_level(b);
2360 	btrfs_node_key_to_cpu(b, &first_key, slot);
2361 
2362 	tmp = find_extent_buffer(fs_info, blocknr);
2363 	if (tmp) {
2364 		/* first we do an atomic uptodate check */
2365 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2366 			/*
2367 			 * Do extra check for first_key, eb can be stale due to
2368 			 * being cached, read from scrub, or have multiple
2369 			 * parents (shared tree blocks).
2370 			 */
2371 			if (btrfs_verify_level_key(tmp,
2372 					parent_level - 1, &first_key, gen)) {
2373 				free_extent_buffer(tmp);
2374 				return -EUCLEAN;
2375 			}
2376 			*eb_ret = tmp;
2377 			return 0;
2378 		}
2379 
2380 		/* the pages were up to date, but we failed
2381 		 * the generation number check.  Do a full
2382 		 * read for the generation number that is correct.
2383 		 * We must do this without dropping locks so
2384 		 * we can trust our generation number
2385 		 */
2386 		btrfs_set_path_blocking(p);
2387 
2388 		/* now we're allowed to do a blocking uptodate check */
2389 		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2390 		if (!ret) {
2391 			*eb_ret = tmp;
2392 			return 0;
2393 		}
2394 		free_extent_buffer(tmp);
2395 		btrfs_release_path(p);
2396 		return -EIO;
2397 	}
2398 
2399 	/*
2400 	 * reduce lock contention at high levels
2401 	 * of the btree by dropping locks before
2402 	 * we read.  Don't release the lock on the current
2403 	 * level because we need to walk this node to figure
2404 	 * out which blocks to read.
2405 	 */
2406 	btrfs_unlock_up_safe(p, level + 1);
2407 	btrfs_set_path_blocking(p);
2408 
2409 	if (p->reada != READA_NONE)
2410 		reada_for_search(fs_info, p, level, slot, key->objectid);
2411 
2412 	ret = -EAGAIN;
2413 	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2414 			      &first_key);
2415 	if (!IS_ERR(tmp)) {
2416 		/*
2417 		 * If the read above didn't mark this buffer up to date,
2418 		 * it will never end up being up to date.  Set ret to EIO now
2419 		 * and give up so that our caller doesn't loop forever
2420 		 * on our EAGAINs.
2421 		 */
2422 		if (!extent_buffer_uptodate(tmp))
2423 			ret = -EIO;
2424 		free_extent_buffer(tmp);
2425 	} else {
2426 		ret = PTR_ERR(tmp);
2427 	}
2428 
2429 	btrfs_release_path(p);
2430 	return ret;
2431 }
2432 
2433 /*
2434  * helper function for btrfs_search_slot.  This does all of the checks
2435  * for node-level blocks and does any balancing required based on
2436  * the ins_len.
2437  *
2438  * If no extra work was required, zero is returned.  If we had to
2439  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2440  * start over
2441  */
2442 static int
2443 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2444 		       struct btrfs_root *root, struct btrfs_path *p,
2445 		       struct extent_buffer *b, int level, int ins_len,
2446 		       int *write_lock_level)
2447 {
2448 	struct btrfs_fs_info *fs_info = root->fs_info;
2449 	int ret;
2450 
2451 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2452 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2453 		int sret;
2454 
2455 		if (*write_lock_level < level + 1) {
2456 			*write_lock_level = level + 1;
2457 			btrfs_release_path(p);
2458 			goto again;
2459 		}
2460 
2461 		btrfs_set_path_blocking(p);
2462 		reada_for_balance(fs_info, p, level);
2463 		sret = split_node(trans, root, p, level);
2464 
2465 		BUG_ON(sret > 0);
2466 		if (sret) {
2467 			ret = sret;
2468 			goto done;
2469 		}
2470 		b = p->nodes[level];
2471 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2472 		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2473 		int sret;
2474 
2475 		if (*write_lock_level < level + 1) {
2476 			*write_lock_level = level + 1;
2477 			btrfs_release_path(p);
2478 			goto again;
2479 		}
2480 
2481 		btrfs_set_path_blocking(p);
2482 		reada_for_balance(fs_info, p, level);
2483 		sret = balance_level(trans, root, p, level);
2484 
2485 		if (sret) {
2486 			ret = sret;
2487 			goto done;
2488 		}
2489 		b = p->nodes[level];
2490 		if (!b) {
2491 			btrfs_release_path(p);
2492 			goto again;
2493 		}
2494 		BUG_ON(btrfs_header_nritems(b) == 1);
2495 	}
2496 	return 0;
2497 
2498 again:
2499 	ret = -EAGAIN;
2500 done:
2501 	return ret;
2502 }
2503 
2504 static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2505 		      int level, int *prev_cmp, int *slot)
2506 {
2507 	if (*prev_cmp != 0) {
2508 		*prev_cmp = btrfs_bin_search(b, key, level, slot);
2509 		return *prev_cmp;
2510 	}
2511 
2512 	*slot = 0;
2513 
2514 	return 0;
2515 }
2516 
2517 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2518 		u64 iobjectid, u64 ioff, u8 key_type,
2519 		struct btrfs_key *found_key)
2520 {
2521 	int ret;
2522 	struct btrfs_key key;
2523 	struct extent_buffer *eb;
2524 
2525 	ASSERT(path);
2526 	ASSERT(found_key);
2527 
2528 	key.type = key_type;
2529 	key.objectid = iobjectid;
2530 	key.offset = ioff;
2531 
2532 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2533 	if (ret < 0)
2534 		return ret;
2535 
2536 	eb = path->nodes[0];
2537 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2538 		ret = btrfs_next_leaf(fs_root, path);
2539 		if (ret)
2540 			return ret;
2541 		eb = path->nodes[0];
2542 	}
2543 
2544 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2545 	if (found_key->type != key.type ||
2546 			found_key->objectid != key.objectid)
2547 		return 1;
2548 
2549 	return 0;
2550 }
2551 
2552 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2553 							struct btrfs_path *p,
2554 							int write_lock_level)
2555 {
2556 	struct btrfs_fs_info *fs_info = root->fs_info;
2557 	struct extent_buffer *b;
2558 	int root_lock;
2559 	int level = 0;
2560 
2561 	/* We try very hard to do read locks on the root */
2562 	root_lock = BTRFS_READ_LOCK;
2563 
2564 	if (p->search_commit_root) {
2565 		/*
2566 		 * The commit roots are read only so we always do read locks,
2567 		 * and we always must hold the commit_root_sem when doing
2568 		 * searches on them, the only exception is send where we don't
2569 		 * want to block transaction commits for a long time, so
2570 		 * we need to clone the commit root in order to avoid races
2571 		 * with transaction commits that create a snapshot of one of
2572 		 * the roots used by a send operation.
2573 		 */
2574 		if (p->need_commit_sem) {
2575 			down_read(&fs_info->commit_root_sem);
2576 			b = btrfs_clone_extent_buffer(root->commit_root);
2577 			up_read(&fs_info->commit_root_sem);
2578 			if (!b)
2579 				return ERR_PTR(-ENOMEM);
2580 
2581 		} else {
2582 			b = root->commit_root;
2583 			atomic_inc(&b->refs);
2584 		}
2585 		level = btrfs_header_level(b);
2586 		/*
2587 		 * Ensure that all callers have set skip_locking when
2588 		 * p->search_commit_root = 1.
2589 		 */
2590 		ASSERT(p->skip_locking == 1);
2591 
2592 		goto out;
2593 	}
2594 
2595 	if (p->skip_locking) {
2596 		b = btrfs_root_node(root);
2597 		level = btrfs_header_level(b);
2598 		goto out;
2599 	}
2600 
2601 	/*
2602 	 * If the level is set to maximum, we can skip trying to get the read
2603 	 * lock.
2604 	 */
2605 	if (write_lock_level < BTRFS_MAX_LEVEL) {
2606 		/*
2607 		 * We don't know the level of the root node until we actually
2608 		 * have it read locked
2609 		 */
2610 		b = btrfs_read_lock_root_node(root);
2611 		level = btrfs_header_level(b);
2612 		if (level > write_lock_level)
2613 			goto out;
2614 
2615 		/* Whoops, must trade for write lock */
2616 		btrfs_tree_read_unlock(b);
2617 		free_extent_buffer(b);
2618 	}
2619 
2620 	b = btrfs_lock_root_node(root);
2621 	root_lock = BTRFS_WRITE_LOCK;
2622 
2623 	/* The level might have changed, check again */
2624 	level = btrfs_header_level(b);
2625 
2626 out:
2627 	p->nodes[level] = b;
2628 	if (!p->skip_locking)
2629 		p->locks[level] = root_lock;
2630 	/*
2631 	 * Callers are responsible for dropping b's references.
2632 	 */
2633 	return b;
2634 }
2635 
2636 
2637 /*
2638  * btrfs_search_slot - look for a key in a tree and perform necessary
2639  * modifications to preserve tree invariants.
2640  *
2641  * @trans:	Handle of transaction, used when modifying the tree
2642  * @p:		Holds all btree nodes along the search path
2643  * @root:	The root node of the tree
2644  * @key:	The key we are looking for
2645  * @ins_len:	Indicates purpose of search, for inserts it is 1, for
2646  *		deletions it's -1. 0 for plain searches
2647  * @cow:	boolean should CoW operations be performed. Must always be 1
2648  *		when modifying the tree.
2649  *
2650  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2651  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2652  *
2653  * If @key is found, 0 is returned and you can find the item in the leaf level
2654  * of the path (level 0)
2655  *
2656  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2657  * points to the slot where it should be inserted
2658  *
2659  * If an error is encountered while searching the tree a negative error number
2660  * is returned
2661  */
2662 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2663 		      const struct btrfs_key *key, struct btrfs_path *p,
2664 		      int ins_len, int cow)
2665 {
2666 	struct extent_buffer *b;
2667 	int slot;
2668 	int ret;
2669 	int err;
2670 	int level;
2671 	int lowest_unlock = 1;
2672 	/* everything at write_lock_level or lower must be write locked */
2673 	int write_lock_level = 0;
2674 	u8 lowest_level = 0;
2675 	int min_write_lock_level;
2676 	int prev_cmp;
2677 
2678 	lowest_level = p->lowest_level;
2679 	WARN_ON(lowest_level && ins_len > 0);
2680 	WARN_ON(p->nodes[0] != NULL);
2681 	BUG_ON(!cow && ins_len);
2682 
2683 	if (ins_len < 0) {
2684 		lowest_unlock = 2;
2685 
2686 		/* when we are removing items, we might have to go up to level
2687 		 * two as we update tree pointers  Make sure we keep write
2688 		 * for those levels as well
2689 		 */
2690 		write_lock_level = 2;
2691 	} else if (ins_len > 0) {
2692 		/*
2693 		 * for inserting items, make sure we have a write lock on
2694 		 * level 1 so we can update keys
2695 		 */
2696 		write_lock_level = 1;
2697 	}
2698 
2699 	if (!cow)
2700 		write_lock_level = -1;
2701 
2702 	if (cow && (p->keep_locks || p->lowest_level))
2703 		write_lock_level = BTRFS_MAX_LEVEL;
2704 
2705 	min_write_lock_level = write_lock_level;
2706 
2707 again:
2708 	prev_cmp = -1;
2709 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
2710 	if (IS_ERR(b)) {
2711 		ret = PTR_ERR(b);
2712 		goto done;
2713 	}
2714 
2715 	while (b) {
2716 		int dec = 0;
2717 
2718 		level = btrfs_header_level(b);
2719 
2720 		if (cow) {
2721 			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2722 
2723 			/*
2724 			 * if we don't really need to cow this block
2725 			 * then we don't want to set the path blocking,
2726 			 * so we test it here
2727 			 */
2728 			if (!should_cow_block(trans, root, b)) {
2729 				trans->dirty = true;
2730 				goto cow_done;
2731 			}
2732 
2733 			/*
2734 			 * must have write locks on this node and the
2735 			 * parent
2736 			 */
2737 			if (level > write_lock_level ||
2738 			    (level + 1 > write_lock_level &&
2739 			    level + 1 < BTRFS_MAX_LEVEL &&
2740 			    p->nodes[level + 1])) {
2741 				write_lock_level = level + 1;
2742 				btrfs_release_path(p);
2743 				goto again;
2744 			}
2745 
2746 			btrfs_set_path_blocking(p);
2747 			if (last_level)
2748 				err = btrfs_cow_block(trans, root, b, NULL, 0,
2749 						      &b);
2750 			else
2751 				err = btrfs_cow_block(trans, root, b,
2752 						      p->nodes[level + 1],
2753 						      p->slots[level + 1], &b);
2754 			if (err) {
2755 				ret = err;
2756 				goto done;
2757 			}
2758 		}
2759 cow_done:
2760 		p->nodes[level] = b;
2761 		/*
2762 		 * Leave path with blocking locks to avoid massive
2763 		 * lock context switch, this is made on purpose.
2764 		 */
2765 
2766 		/*
2767 		 * we have a lock on b and as long as we aren't changing
2768 		 * the tree, there is no way to for the items in b to change.
2769 		 * It is safe to drop the lock on our parent before we
2770 		 * go through the expensive btree search on b.
2771 		 *
2772 		 * If we're inserting or deleting (ins_len != 0), then we might
2773 		 * be changing slot zero, which may require changing the parent.
2774 		 * So, we can't drop the lock until after we know which slot
2775 		 * we're operating on.
2776 		 */
2777 		if (!ins_len && !p->keep_locks) {
2778 			int u = level + 1;
2779 
2780 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2781 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2782 				p->locks[u] = 0;
2783 			}
2784 		}
2785 
2786 		ret = key_search(b, key, level, &prev_cmp, &slot);
2787 		if (ret < 0)
2788 			goto done;
2789 
2790 		if (level == 0) {
2791 			p->slots[level] = slot;
2792 			if (ins_len > 0 &&
2793 			    btrfs_leaf_free_space(b) < ins_len) {
2794 				if (write_lock_level < 1) {
2795 					write_lock_level = 1;
2796 					btrfs_release_path(p);
2797 					goto again;
2798 				}
2799 
2800 				btrfs_set_path_blocking(p);
2801 				err = split_leaf(trans, root, key,
2802 						 p, ins_len, ret == 0);
2803 
2804 				BUG_ON(err > 0);
2805 				if (err) {
2806 					ret = err;
2807 					goto done;
2808 				}
2809 			}
2810 			if (!p->search_for_split)
2811 				unlock_up(p, level, lowest_unlock,
2812 					  min_write_lock_level, NULL);
2813 			goto done;
2814 		}
2815 		if (ret && slot > 0) {
2816 			dec = 1;
2817 			slot--;
2818 		}
2819 		p->slots[level] = slot;
2820 		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2821 					     &write_lock_level);
2822 		if (err == -EAGAIN)
2823 			goto again;
2824 		if (err) {
2825 			ret = err;
2826 			goto done;
2827 		}
2828 		b = p->nodes[level];
2829 		slot = p->slots[level];
2830 
2831 		/*
2832 		 * Slot 0 is special, if we change the key we have to update
2833 		 * the parent pointer which means we must have a write lock on
2834 		 * the parent
2835 		 */
2836 		if (slot == 0 && ins_len && write_lock_level < level + 1) {
2837 			write_lock_level = level + 1;
2838 			btrfs_release_path(p);
2839 			goto again;
2840 		}
2841 
2842 		unlock_up(p, level, lowest_unlock, min_write_lock_level,
2843 			  &write_lock_level);
2844 
2845 		if (level == lowest_level) {
2846 			if (dec)
2847 				p->slots[level]++;
2848 			goto done;
2849 		}
2850 
2851 		err = read_block_for_search(root, p, &b, level, slot, key);
2852 		if (err == -EAGAIN)
2853 			goto again;
2854 		if (err) {
2855 			ret = err;
2856 			goto done;
2857 		}
2858 
2859 		if (!p->skip_locking) {
2860 			level = btrfs_header_level(b);
2861 			if (level <= write_lock_level) {
2862 				if (!btrfs_try_tree_write_lock(b)) {
2863 					btrfs_set_path_blocking(p);
2864 					btrfs_tree_lock(b);
2865 				}
2866 				p->locks[level] = BTRFS_WRITE_LOCK;
2867 			} else {
2868 				if (!btrfs_tree_read_lock_atomic(b)) {
2869 					btrfs_set_path_blocking(p);
2870 					btrfs_tree_read_lock(b);
2871 				}
2872 				p->locks[level] = BTRFS_READ_LOCK;
2873 			}
2874 			p->nodes[level] = b;
2875 		}
2876 	}
2877 	ret = 1;
2878 done:
2879 	/*
2880 	 * we don't really know what they plan on doing with the path
2881 	 * from here on, so for now just mark it as blocking
2882 	 */
2883 	if (!p->leave_spinning)
2884 		btrfs_set_path_blocking(p);
2885 	if (ret < 0 && !p->skip_release_on_error)
2886 		btrfs_release_path(p);
2887 	return ret;
2888 }
2889 
2890 /*
2891  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2892  * current state of the tree together with the operations recorded in the tree
2893  * modification log to search for the key in a previous version of this tree, as
2894  * denoted by the time_seq parameter.
2895  *
2896  * Naturally, there is no support for insert, delete or cow operations.
2897  *
2898  * The resulting path and return value will be set up as if we called
2899  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2900  */
2901 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2902 			  struct btrfs_path *p, u64 time_seq)
2903 {
2904 	struct btrfs_fs_info *fs_info = root->fs_info;
2905 	struct extent_buffer *b;
2906 	int slot;
2907 	int ret;
2908 	int err;
2909 	int level;
2910 	int lowest_unlock = 1;
2911 	u8 lowest_level = 0;
2912 	int prev_cmp = -1;
2913 
2914 	lowest_level = p->lowest_level;
2915 	WARN_ON(p->nodes[0] != NULL);
2916 
2917 	if (p->search_commit_root) {
2918 		BUG_ON(time_seq);
2919 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2920 	}
2921 
2922 again:
2923 	b = get_old_root(root, time_seq);
2924 	if (!b) {
2925 		ret = -EIO;
2926 		goto done;
2927 	}
2928 	level = btrfs_header_level(b);
2929 	p->locks[level] = BTRFS_READ_LOCK;
2930 
2931 	while (b) {
2932 		int dec = 0;
2933 
2934 		level = btrfs_header_level(b);
2935 		p->nodes[level] = b;
2936 
2937 		/*
2938 		 * we have a lock on b and as long as we aren't changing
2939 		 * the tree, there is no way to for the items in b to change.
2940 		 * It is safe to drop the lock on our parent before we
2941 		 * go through the expensive btree search on b.
2942 		 */
2943 		btrfs_unlock_up_safe(p, level + 1);
2944 
2945 		/*
2946 		 * Since we can unwind ebs we want to do a real search every
2947 		 * time.
2948 		 */
2949 		prev_cmp = -1;
2950 		ret = key_search(b, key, level, &prev_cmp, &slot);
2951 		if (ret < 0)
2952 			goto done;
2953 
2954 		if (level == 0) {
2955 			p->slots[level] = slot;
2956 			unlock_up(p, level, lowest_unlock, 0, NULL);
2957 			goto done;
2958 		}
2959 
2960 		if (ret && slot > 0) {
2961 			dec = 1;
2962 			slot--;
2963 		}
2964 		p->slots[level] = slot;
2965 		unlock_up(p, level, lowest_unlock, 0, NULL);
2966 
2967 		if (level == lowest_level) {
2968 			if (dec)
2969 				p->slots[level]++;
2970 			goto done;
2971 		}
2972 
2973 		err = read_block_for_search(root, p, &b, level, slot, key);
2974 		if (err == -EAGAIN)
2975 			goto again;
2976 		if (err) {
2977 			ret = err;
2978 			goto done;
2979 		}
2980 
2981 		level = btrfs_header_level(b);
2982 		if (!btrfs_tree_read_lock_atomic(b)) {
2983 			btrfs_set_path_blocking(p);
2984 			btrfs_tree_read_lock(b);
2985 		}
2986 		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2987 		if (!b) {
2988 			ret = -ENOMEM;
2989 			goto done;
2990 		}
2991 		p->locks[level] = BTRFS_READ_LOCK;
2992 		p->nodes[level] = b;
2993 	}
2994 	ret = 1;
2995 done:
2996 	if (!p->leave_spinning)
2997 		btrfs_set_path_blocking(p);
2998 	if (ret < 0)
2999 		btrfs_release_path(p);
3000 
3001 	return ret;
3002 }
3003 
3004 /*
3005  * helper to use instead of search slot if no exact match is needed but
3006  * instead the next or previous item should be returned.
3007  * When find_higher is true, the next higher item is returned, the next lower
3008  * otherwise.
3009  * When return_any and find_higher are both true, and no higher item is found,
3010  * return the next lower instead.
3011  * When return_any is true and find_higher is false, and no lower item is found,
3012  * return the next higher instead.
3013  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3014  * < 0 on error
3015  */
3016 int btrfs_search_slot_for_read(struct btrfs_root *root,
3017 			       const struct btrfs_key *key,
3018 			       struct btrfs_path *p, int find_higher,
3019 			       int return_any)
3020 {
3021 	int ret;
3022 	struct extent_buffer *leaf;
3023 
3024 again:
3025 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3026 	if (ret <= 0)
3027 		return ret;
3028 	/*
3029 	 * a return value of 1 means the path is at the position where the
3030 	 * item should be inserted. Normally this is the next bigger item,
3031 	 * but in case the previous item is the last in a leaf, path points
3032 	 * to the first free slot in the previous leaf, i.e. at an invalid
3033 	 * item.
3034 	 */
3035 	leaf = p->nodes[0];
3036 
3037 	if (find_higher) {
3038 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3039 			ret = btrfs_next_leaf(root, p);
3040 			if (ret <= 0)
3041 				return ret;
3042 			if (!return_any)
3043 				return 1;
3044 			/*
3045 			 * no higher item found, return the next
3046 			 * lower instead
3047 			 */
3048 			return_any = 0;
3049 			find_higher = 0;
3050 			btrfs_release_path(p);
3051 			goto again;
3052 		}
3053 	} else {
3054 		if (p->slots[0] == 0) {
3055 			ret = btrfs_prev_leaf(root, p);
3056 			if (ret < 0)
3057 				return ret;
3058 			if (!ret) {
3059 				leaf = p->nodes[0];
3060 				if (p->slots[0] == btrfs_header_nritems(leaf))
3061 					p->slots[0]--;
3062 				return 0;
3063 			}
3064 			if (!return_any)
3065 				return 1;
3066 			/*
3067 			 * no lower item found, return the next
3068 			 * higher instead
3069 			 */
3070 			return_any = 0;
3071 			find_higher = 1;
3072 			btrfs_release_path(p);
3073 			goto again;
3074 		} else {
3075 			--p->slots[0];
3076 		}
3077 	}
3078 	return 0;
3079 }
3080 
3081 /*
3082  * adjust the pointers going up the tree, starting at level
3083  * making sure the right key of each node is points to 'key'.
3084  * This is used after shifting pointers to the left, so it stops
3085  * fixing up pointers when a given leaf/node is not in slot 0 of the
3086  * higher levels
3087  *
3088  */
3089 static void fixup_low_keys(struct btrfs_path *path,
3090 			   struct btrfs_disk_key *key, int level)
3091 {
3092 	int i;
3093 	struct extent_buffer *t;
3094 	int ret;
3095 
3096 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3097 		int tslot = path->slots[i];
3098 
3099 		if (!path->nodes[i])
3100 			break;
3101 		t = path->nodes[i];
3102 		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3103 				GFP_ATOMIC);
3104 		BUG_ON(ret < 0);
3105 		btrfs_set_node_key(t, key, tslot);
3106 		btrfs_mark_buffer_dirty(path->nodes[i]);
3107 		if (tslot != 0)
3108 			break;
3109 	}
3110 }
3111 
3112 /*
3113  * update item key.
3114  *
3115  * This function isn't completely safe. It's the caller's responsibility
3116  * that the new key won't break the order
3117  */
3118 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3119 			     struct btrfs_path *path,
3120 			     const struct btrfs_key *new_key)
3121 {
3122 	struct btrfs_disk_key disk_key;
3123 	struct extent_buffer *eb;
3124 	int slot;
3125 
3126 	eb = path->nodes[0];
3127 	slot = path->slots[0];
3128 	if (slot > 0) {
3129 		btrfs_item_key(eb, &disk_key, slot - 1);
3130 		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3131 			btrfs_crit(fs_info,
3132 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3133 				   slot, btrfs_disk_key_objectid(&disk_key),
3134 				   btrfs_disk_key_type(&disk_key),
3135 				   btrfs_disk_key_offset(&disk_key),
3136 				   new_key->objectid, new_key->type,
3137 				   new_key->offset);
3138 			btrfs_print_leaf(eb);
3139 			BUG();
3140 		}
3141 	}
3142 	if (slot < btrfs_header_nritems(eb) - 1) {
3143 		btrfs_item_key(eb, &disk_key, slot + 1);
3144 		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3145 			btrfs_crit(fs_info,
3146 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3147 				   slot, btrfs_disk_key_objectid(&disk_key),
3148 				   btrfs_disk_key_type(&disk_key),
3149 				   btrfs_disk_key_offset(&disk_key),
3150 				   new_key->objectid, new_key->type,
3151 				   new_key->offset);
3152 			btrfs_print_leaf(eb);
3153 			BUG();
3154 		}
3155 	}
3156 
3157 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3158 	btrfs_set_item_key(eb, &disk_key, slot);
3159 	btrfs_mark_buffer_dirty(eb);
3160 	if (slot == 0)
3161 		fixup_low_keys(path, &disk_key, 1);
3162 }
3163 
3164 /*
3165  * try to push data from one node into the next node left in the
3166  * tree.
3167  *
3168  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3169  * error, and > 0 if there was no room in the left hand block.
3170  */
3171 static int push_node_left(struct btrfs_trans_handle *trans,
3172 			  struct extent_buffer *dst,
3173 			  struct extent_buffer *src, int empty)
3174 {
3175 	struct btrfs_fs_info *fs_info = trans->fs_info;
3176 	int push_items = 0;
3177 	int src_nritems;
3178 	int dst_nritems;
3179 	int ret = 0;
3180 
3181 	src_nritems = btrfs_header_nritems(src);
3182 	dst_nritems = btrfs_header_nritems(dst);
3183 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3184 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3185 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3186 
3187 	if (!empty && src_nritems <= 8)
3188 		return 1;
3189 
3190 	if (push_items <= 0)
3191 		return 1;
3192 
3193 	if (empty) {
3194 		push_items = min(src_nritems, push_items);
3195 		if (push_items < src_nritems) {
3196 			/* leave at least 8 pointers in the node if
3197 			 * we aren't going to empty it
3198 			 */
3199 			if (src_nritems - push_items < 8) {
3200 				if (push_items <= 8)
3201 					return 1;
3202 				push_items -= 8;
3203 			}
3204 		}
3205 	} else
3206 		push_items = min(src_nritems - 8, push_items);
3207 
3208 	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3209 	if (ret) {
3210 		btrfs_abort_transaction(trans, ret);
3211 		return ret;
3212 	}
3213 	copy_extent_buffer(dst, src,
3214 			   btrfs_node_key_ptr_offset(dst_nritems),
3215 			   btrfs_node_key_ptr_offset(0),
3216 			   push_items * sizeof(struct btrfs_key_ptr));
3217 
3218 	if (push_items < src_nritems) {
3219 		/*
3220 		 * Don't call tree_mod_log_insert_move here, key removal was
3221 		 * already fully logged by tree_mod_log_eb_copy above.
3222 		 */
3223 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3224 				      btrfs_node_key_ptr_offset(push_items),
3225 				      (src_nritems - push_items) *
3226 				      sizeof(struct btrfs_key_ptr));
3227 	}
3228 	btrfs_set_header_nritems(src, src_nritems - push_items);
3229 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3230 	btrfs_mark_buffer_dirty(src);
3231 	btrfs_mark_buffer_dirty(dst);
3232 
3233 	return ret;
3234 }
3235 
3236 /*
3237  * try to push data from one node into the next node right in the
3238  * tree.
3239  *
3240  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3241  * error, and > 0 if there was no room in the right hand block.
3242  *
3243  * this will  only push up to 1/2 the contents of the left node over
3244  */
3245 static int balance_node_right(struct btrfs_trans_handle *trans,
3246 			      struct extent_buffer *dst,
3247 			      struct extent_buffer *src)
3248 {
3249 	struct btrfs_fs_info *fs_info = trans->fs_info;
3250 	int push_items = 0;
3251 	int max_push;
3252 	int src_nritems;
3253 	int dst_nritems;
3254 	int ret = 0;
3255 
3256 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3257 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3258 
3259 	src_nritems = btrfs_header_nritems(src);
3260 	dst_nritems = btrfs_header_nritems(dst);
3261 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3262 	if (push_items <= 0)
3263 		return 1;
3264 
3265 	if (src_nritems < 4)
3266 		return 1;
3267 
3268 	max_push = src_nritems / 2 + 1;
3269 	/* don't try to empty the node */
3270 	if (max_push >= src_nritems)
3271 		return 1;
3272 
3273 	if (max_push < push_items)
3274 		push_items = max_push;
3275 
3276 	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3277 	BUG_ON(ret < 0);
3278 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3279 				      btrfs_node_key_ptr_offset(0),
3280 				      (dst_nritems) *
3281 				      sizeof(struct btrfs_key_ptr));
3282 
3283 	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3284 				   push_items);
3285 	if (ret) {
3286 		btrfs_abort_transaction(trans, ret);
3287 		return ret;
3288 	}
3289 	copy_extent_buffer(dst, src,
3290 			   btrfs_node_key_ptr_offset(0),
3291 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3292 			   push_items * sizeof(struct btrfs_key_ptr));
3293 
3294 	btrfs_set_header_nritems(src, src_nritems - push_items);
3295 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3296 
3297 	btrfs_mark_buffer_dirty(src);
3298 	btrfs_mark_buffer_dirty(dst);
3299 
3300 	return ret;
3301 }
3302 
3303 /*
3304  * helper function to insert a new root level in the tree.
3305  * A new node is allocated, and a single item is inserted to
3306  * point to the existing root
3307  *
3308  * returns zero on success or < 0 on failure.
3309  */
3310 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3311 			   struct btrfs_root *root,
3312 			   struct btrfs_path *path, int level)
3313 {
3314 	struct btrfs_fs_info *fs_info = root->fs_info;
3315 	u64 lower_gen;
3316 	struct extent_buffer *lower;
3317 	struct extent_buffer *c;
3318 	struct extent_buffer *old;
3319 	struct btrfs_disk_key lower_key;
3320 	int ret;
3321 
3322 	BUG_ON(path->nodes[level]);
3323 	BUG_ON(path->nodes[level-1] != root->node);
3324 
3325 	lower = path->nodes[level-1];
3326 	if (level == 1)
3327 		btrfs_item_key(lower, &lower_key, 0);
3328 	else
3329 		btrfs_node_key(lower, &lower_key, 0);
3330 
3331 	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3332 					 root->node->start, 0);
3333 	if (IS_ERR(c))
3334 		return PTR_ERR(c);
3335 
3336 	root_add_used(root, fs_info->nodesize);
3337 
3338 	btrfs_set_header_nritems(c, 1);
3339 	btrfs_set_node_key(c, &lower_key, 0);
3340 	btrfs_set_node_blockptr(c, 0, lower->start);
3341 	lower_gen = btrfs_header_generation(lower);
3342 	WARN_ON(lower_gen != trans->transid);
3343 
3344 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3345 
3346 	btrfs_mark_buffer_dirty(c);
3347 
3348 	old = root->node;
3349 	ret = tree_mod_log_insert_root(root->node, c, 0);
3350 	BUG_ON(ret < 0);
3351 	rcu_assign_pointer(root->node, c);
3352 
3353 	/* the super has an extra ref to root->node */
3354 	free_extent_buffer(old);
3355 
3356 	add_root_to_dirty_list(root);
3357 	atomic_inc(&c->refs);
3358 	path->nodes[level] = c;
3359 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3360 	path->slots[level] = 0;
3361 	return 0;
3362 }
3363 
3364 /*
3365  * worker function to insert a single pointer in a node.
3366  * the node should have enough room for the pointer already
3367  *
3368  * slot and level indicate where you want the key to go, and
3369  * blocknr is the block the key points to.
3370  */
3371 static void insert_ptr(struct btrfs_trans_handle *trans,
3372 		       struct btrfs_path *path,
3373 		       struct btrfs_disk_key *key, u64 bytenr,
3374 		       int slot, int level)
3375 {
3376 	struct extent_buffer *lower;
3377 	int nritems;
3378 	int ret;
3379 
3380 	BUG_ON(!path->nodes[level]);
3381 	btrfs_assert_tree_locked(path->nodes[level]);
3382 	lower = path->nodes[level];
3383 	nritems = btrfs_header_nritems(lower);
3384 	BUG_ON(slot > nritems);
3385 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3386 	if (slot != nritems) {
3387 		if (level) {
3388 			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3389 					nritems - slot);
3390 			BUG_ON(ret < 0);
3391 		}
3392 		memmove_extent_buffer(lower,
3393 			      btrfs_node_key_ptr_offset(slot + 1),
3394 			      btrfs_node_key_ptr_offset(slot),
3395 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3396 	}
3397 	if (level) {
3398 		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3399 				GFP_NOFS);
3400 		BUG_ON(ret < 0);
3401 	}
3402 	btrfs_set_node_key(lower, key, slot);
3403 	btrfs_set_node_blockptr(lower, slot, bytenr);
3404 	WARN_ON(trans->transid == 0);
3405 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3406 	btrfs_set_header_nritems(lower, nritems + 1);
3407 	btrfs_mark_buffer_dirty(lower);
3408 }
3409 
3410 /*
3411  * split the node at the specified level in path in two.
3412  * The path is corrected to point to the appropriate node after the split
3413  *
3414  * Before splitting this tries to make some room in the node by pushing
3415  * left and right, if either one works, it returns right away.
3416  *
3417  * returns 0 on success and < 0 on failure
3418  */
3419 static noinline int split_node(struct btrfs_trans_handle *trans,
3420 			       struct btrfs_root *root,
3421 			       struct btrfs_path *path, int level)
3422 {
3423 	struct btrfs_fs_info *fs_info = root->fs_info;
3424 	struct extent_buffer *c;
3425 	struct extent_buffer *split;
3426 	struct btrfs_disk_key disk_key;
3427 	int mid;
3428 	int ret;
3429 	u32 c_nritems;
3430 
3431 	c = path->nodes[level];
3432 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3433 	if (c == root->node) {
3434 		/*
3435 		 * trying to split the root, lets make a new one
3436 		 *
3437 		 * tree mod log: We don't log_removal old root in
3438 		 * insert_new_root, because that root buffer will be kept as a
3439 		 * normal node. We are going to log removal of half of the
3440 		 * elements below with tree_mod_log_eb_copy. We're holding a
3441 		 * tree lock on the buffer, which is why we cannot race with
3442 		 * other tree_mod_log users.
3443 		 */
3444 		ret = insert_new_root(trans, root, path, level + 1);
3445 		if (ret)
3446 			return ret;
3447 	} else {
3448 		ret = push_nodes_for_insert(trans, root, path, level);
3449 		c = path->nodes[level];
3450 		if (!ret && btrfs_header_nritems(c) <
3451 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3452 			return 0;
3453 		if (ret < 0)
3454 			return ret;
3455 	}
3456 
3457 	c_nritems = btrfs_header_nritems(c);
3458 	mid = (c_nritems + 1) / 2;
3459 	btrfs_node_key(c, &disk_key, mid);
3460 
3461 	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3462 					     c->start, 0);
3463 	if (IS_ERR(split))
3464 		return PTR_ERR(split);
3465 
3466 	root_add_used(root, fs_info->nodesize);
3467 	ASSERT(btrfs_header_level(c) == level);
3468 
3469 	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3470 	if (ret) {
3471 		btrfs_abort_transaction(trans, ret);
3472 		return ret;
3473 	}
3474 	copy_extent_buffer(split, c,
3475 			   btrfs_node_key_ptr_offset(0),
3476 			   btrfs_node_key_ptr_offset(mid),
3477 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3478 	btrfs_set_header_nritems(split, c_nritems - mid);
3479 	btrfs_set_header_nritems(c, mid);
3480 	ret = 0;
3481 
3482 	btrfs_mark_buffer_dirty(c);
3483 	btrfs_mark_buffer_dirty(split);
3484 
3485 	insert_ptr(trans, path, &disk_key, split->start,
3486 		   path->slots[level + 1] + 1, level + 1);
3487 
3488 	if (path->slots[level] >= mid) {
3489 		path->slots[level] -= mid;
3490 		btrfs_tree_unlock(c);
3491 		free_extent_buffer(c);
3492 		path->nodes[level] = split;
3493 		path->slots[level + 1] += 1;
3494 	} else {
3495 		btrfs_tree_unlock(split);
3496 		free_extent_buffer(split);
3497 	}
3498 	return ret;
3499 }
3500 
3501 /*
3502  * how many bytes are required to store the items in a leaf.  start
3503  * and nr indicate which items in the leaf to check.  This totals up the
3504  * space used both by the item structs and the item data
3505  */
3506 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3507 {
3508 	struct btrfs_item *start_item;
3509 	struct btrfs_item *end_item;
3510 	struct btrfs_map_token token;
3511 	int data_len;
3512 	int nritems = btrfs_header_nritems(l);
3513 	int end = min(nritems, start + nr) - 1;
3514 
3515 	if (!nr)
3516 		return 0;
3517 	btrfs_init_map_token(&token, l);
3518 	start_item = btrfs_item_nr(start);
3519 	end_item = btrfs_item_nr(end);
3520 	data_len = btrfs_token_item_offset(l, start_item, &token) +
3521 		btrfs_token_item_size(l, start_item, &token);
3522 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3523 	data_len += sizeof(struct btrfs_item) * nr;
3524 	WARN_ON(data_len < 0);
3525 	return data_len;
3526 }
3527 
3528 /*
3529  * The space between the end of the leaf items and
3530  * the start of the leaf data.  IOW, how much room
3531  * the leaf has left for both items and data
3532  */
3533 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3534 {
3535 	struct btrfs_fs_info *fs_info = leaf->fs_info;
3536 	int nritems = btrfs_header_nritems(leaf);
3537 	int ret;
3538 
3539 	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3540 	if (ret < 0) {
3541 		btrfs_crit(fs_info,
3542 			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3543 			   ret,
3544 			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3545 			   leaf_space_used(leaf, 0, nritems), nritems);
3546 	}
3547 	return ret;
3548 }
3549 
3550 /*
3551  * min slot controls the lowest index we're willing to push to the
3552  * right.  We'll push up to and including min_slot, but no lower
3553  */
3554 static noinline int __push_leaf_right(struct btrfs_path *path,
3555 				      int data_size, int empty,
3556 				      struct extent_buffer *right,
3557 				      int free_space, u32 left_nritems,
3558 				      u32 min_slot)
3559 {
3560 	struct btrfs_fs_info *fs_info = right->fs_info;
3561 	struct extent_buffer *left = path->nodes[0];
3562 	struct extent_buffer *upper = path->nodes[1];
3563 	struct btrfs_map_token token;
3564 	struct btrfs_disk_key disk_key;
3565 	int slot;
3566 	u32 i;
3567 	int push_space = 0;
3568 	int push_items = 0;
3569 	struct btrfs_item *item;
3570 	u32 nr;
3571 	u32 right_nritems;
3572 	u32 data_end;
3573 	u32 this_item_size;
3574 
3575 	if (empty)
3576 		nr = 0;
3577 	else
3578 		nr = max_t(u32, 1, min_slot);
3579 
3580 	if (path->slots[0] >= left_nritems)
3581 		push_space += data_size;
3582 
3583 	slot = path->slots[1];
3584 	i = left_nritems - 1;
3585 	while (i >= nr) {
3586 		item = btrfs_item_nr(i);
3587 
3588 		if (!empty && push_items > 0) {
3589 			if (path->slots[0] > i)
3590 				break;
3591 			if (path->slots[0] == i) {
3592 				int space = btrfs_leaf_free_space(left);
3593 
3594 				if (space + push_space * 2 > free_space)
3595 					break;
3596 			}
3597 		}
3598 
3599 		if (path->slots[0] == i)
3600 			push_space += data_size;
3601 
3602 		this_item_size = btrfs_item_size(left, item);
3603 		if (this_item_size + sizeof(*item) + push_space > free_space)
3604 			break;
3605 
3606 		push_items++;
3607 		push_space += this_item_size + sizeof(*item);
3608 		if (i == 0)
3609 			break;
3610 		i--;
3611 	}
3612 
3613 	if (push_items == 0)
3614 		goto out_unlock;
3615 
3616 	WARN_ON(!empty && push_items == left_nritems);
3617 
3618 	/* push left to right */
3619 	right_nritems = btrfs_header_nritems(right);
3620 
3621 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3622 	push_space -= leaf_data_end(left);
3623 
3624 	/* make room in the right data area */
3625 	data_end = leaf_data_end(right);
3626 	memmove_extent_buffer(right,
3627 			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3628 			      BTRFS_LEAF_DATA_OFFSET + data_end,
3629 			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3630 
3631 	/* copy from the left data area */
3632 	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3633 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3634 		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3635 		     push_space);
3636 
3637 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3638 			      btrfs_item_nr_offset(0),
3639 			      right_nritems * sizeof(struct btrfs_item));
3640 
3641 	/* copy the items from left to right */
3642 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3643 		   btrfs_item_nr_offset(left_nritems - push_items),
3644 		   push_items * sizeof(struct btrfs_item));
3645 
3646 	/* update the item pointers */
3647 	btrfs_init_map_token(&token, right);
3648 	right_nritems += push_items;
3649 	btrfs_set_header_nritems(right, right_nritems);
3650 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3651 	for (i = 0; i < right_nritems; i++) {
3652 		item = btrfs_item_nr(i);
3653 		push_space -= btrfs_token_item_size(right, item, &token);
3654 		btrfs_set_token_item_offset(right, item, push_space, &token);
3655 	}
3656 
3657 	left_nritems -= push_items;
3658 	btrfs_set_header_nritems(left, left_nritems);
3659 
3660 	if (left_nritems)
3661 		btrfs_mark_buffer_dirty(left);
3662 	else
3663 		btrfs_clean_tree_block(left);
3664 
3665 	btrfs_mark_buffer_dirty(right);
3666 
3667 	btrfs_item_key(right, &disk_key, 0);
3668 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3669 	btrfs_mark_buffer_dirty(upper);
3670 
3671 	/* then fixup the leaf pointer in the path */
3672 	if (path->slots[0] >= left_nritems) {
3673 		path->slots[0] -= left_nritems;
3674 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3675 			btrfs_clean_tree_block(path->nodes[0]);
3676 		btrfs_tree_unlock(path->nodes[0]);
3677 		free_extent_buffer(path->nodes[0]);
3678 		path->nodes[0] = right;
3679 		path->slots[1] += 1;
3680 	} else {
3681 		btrfs_tree_unlock(right);
3682 		free_extent_buffer(right);
3683 	}
3684 	return 0;
3685 
3686 out_unlock:
3687 	btrfs_tree_unlock(right);
3688 	free_extent_buffer(right);
3689 	return 1;
3690 }
3691 
3692 /*
3693  * push some data in the path leaf to the right, trying to free up at
3694  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3695  *
3696  * returns 1 if the push failed because the other node didn't have enough
3697  * room, 0 if everything worked out and < 0 if there were major errors.
3698  *
3699  * this will push starting from min_slot to the end of the leaf.  It won't
3700  * push any slot lower than min_slot
3701  */
3702 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3703 			   *root, struct btrfs_path *path,
3704 			   int min_data_size, int data_size,
3705 			   int empty, u32 min_slot)
3706 {
3707 	struct extent_buffer *left = path->nodes[0];
3708 	struct extent_buffer *right;
3709 	struct extent_buffer *upper;
3710 	int slot;
3711 	int free_space;
3712 	u32 left_nritems;
3713 	int ret;
3714 
3715 	if (!path->nodes[1])
3716 		return 1;
3717 
3718 	slot = path->slots[1];
3719 	upper = path->nodes[1];
3720 	if (slot >= btrfs_header_nritems(upper) - 1)
3721 		return 1;
3722 
3723 	btrfs_assert_tree_locked(path->nodes[1]);
3724 
3725 	right = btrfs_read_node_slot(upper, slot + 1);
3726 	/*
3727 	 * slot + 1 is not valid or we fail to read the right node,
3728 	 * no big deal, just return.
3729 	 */
3730 	if (IS_ERR(right))
3731 		return 1;
3732 
3733 	btrfs_tree_lock(right);
3734 	btrfs_set_lock_blocking_write(right);
3735 
3736 	free_space = btrfs_leaf_free_space(right);
3737 	if (free_space < data_size)
3738 		goto out_unlock;
3739 
3740 	/* cow and double check */
3741 	ret = btrfs_cow_block(trans, root, right, upper,
3742 			      slot + 1, &right);
3743 	if (ret)
3744 		goto out_unlock;
3745 
3746 	free_space = btrfs_leaf_free_space(right);
3747 	if (free_space < data_size)
3748 		goto out_unlock;
3749 
3750 	left_nritems = btrfs_header_nritems(left);
3751 	if (left_nritems == 0)
3752 		goto out_unlock;
3753 
3754 	if (path->slots[0] == left_nritems && !empty) {
3755 		/* Key greater than all keys in the leaf, right neighbor has
3756 		 * enough room for it and we're not emptying our leaf to delete
3757 		 * it, therefore use right neighbor to insert the new item and
3758 		 * no need to touch/dirty our left leaf. */
3759 		btrfs_tree_unlock(left);
3760 		free_extent_buffer(left);
3761 		path->nodes[0] = right;
3762 		path->slots[0] = 0;
3763 		path->slots[1]++;
3764 		return 0;
3765 	}
3766 
3767 	return __push_leaf_right(path, min_data_size, empty,
3768 				right, free_space, left_nritems, min_slot);
3769 out_unlock:
3770 	btrfs_tree_unlock(right);
3771 	free_extent_buffer(right);
3772 	return 1;
3773 }
3774 
3775 /*
3776  * push some data in the path leaf to the left, trying to free up at
3777  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3778  *
3779  * max_slot can put a limit on how far into the leaf we'll push items.  The
3780  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3781  * items
3782  */
3783 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3784 				     int empty, struct extent_buffer *left,
3785 				     int free_space, u32 right_nritems,
3786 				     u32 max_slot)
3787 {
3788 	struct btrfs_fs_info *fs_info = left->fs_info;
3789 	struct btrfs_disk_key disk_key;
3790 	struct extent_buffer *right = path->nodes[0];
3791 	int i;
3792 	int push_space = 0;
3793 	int push_items = 0;
3794 	struct btrfs_item *item;
3795 	u32 old_left_nritems;
3796 	u32 nr;
3797 	int ret = 0;
3798 	u32 this_item_size;
3799 	u32 old_left_item_size;
3800 	struct btrfs_map_token token;
3801 
3802 	if (empty)
3803 		nr = min(right_nritems, max_slot);
3804 	else
3805 		nr = min(right_nritems - 1, max_slot);
3806 
3807 	for (i = 0; i < nr; i++) {
3808 		item = btrfs_item_nr(i);
3809 
3810 		if (!empty && push_items > 0) {
3811 			if (path->slots[0] < i)
3812 				break;
3813 			if (path->slots[0] == i) {
3814 				int space = btrfs_leaf_free_space(right);
3815 
3816 				if (space + push_space * 2 > free_space)
3817 					break;
3818 			}
3819 		}
3820 
3821 		if (path->slots[0] == i)
3822 			push_space += data_size;
3823 
3824 		this_item_size = btrfs_item_size(right, item);
3825 		if (this_item_size + sizeof(*item) + push_space > free_space)
3826 			break;
3827 
3828 		push_items++;
3829 		push_space += this_item_size + sizeof(*item);
3830 	}
3831 
3832 	if (push_items == 0) {
3833 		ret = 1;
3834 		goto out;
3835 	}
3836 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3837 
3838 	/* push data from right to left */
3839 	copy_extent_buffer(left, right,
3840 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3841 			   btrfs_item_nr_offset(0),
3842 			   push_items * sizeof(struct btrfs_item));
3843 
3844 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3845 		     btrfs_item_offset_nr(right, push_items - 1);
3846 
3847 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3848 		     leaf_data_end(left) - push_space,
3849 		     BTRFS_LEAF_DATA_OFFSET +
3850 		     btrfs_item_offset_nr(right, push_items - 1),
3851 		     push_space);
3852 	old_left_nritems = btrfs_header_nritems(left);
3853 	BUG_ON(old_left_nritems <= 0);
3854 
3855 	btrfs_init_map_token(&token, left);
3856 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3857 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3858 		u32 ioff;
3859 
3860 		item = btrfs_item_nr(i);
3861 
3862 		ioff = btrfs_token_item_offset(left, item, &token);
3863 		btrfs_set_token_item_offset(left, item,
3864 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3865 		      &token);
3866 	}
3867 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3868 
3869 	/* fixup right node */
3870 	if (push_items > right_nritems)
3871 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3872 		       right_nritems);
3873 
3874 	if (push_items < right_nritems) {
3875 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3876 						  leaf_data_end(right);
3877 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3878 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3879 				      BTRFS_LEAF_DATA_OFFSET +
3880 				      leaf_data_end(right), push_space);
3881 
3882 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3883 			      btrfs_item_nr_offset(push_items),
3884 			     (btrfs_header_nritems(right) - push_items) *
3885 			     sizeof(struct btrfs_item));
3886 	}
3887 
3888 	btrfs_init_map_token(&token, right);
3889 	right_nritems -= push_items;
3890 	btrfs_set_header_nritems(right, right_nritems);
3891 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3892 	for (i = 0; i < right_nritems; i++) {
3893 		item = btrfs_item_nr(i);
3894 
3895 		push_space = push_space - btrfs_token_item_size(right,
3896 								item, &token);
3897 		btrfs_set_token_item_offset(right, item, push_space, &token);
3898 	}
3899 
3900 	btrfs_mark_buffer_dirty(left);
3901 	if (right_nritems)
3902 		btrfs_mark_buffer_dirty(right);
3903 	else
3904 		btrfs_clean_tree_block(right);
3905 
3906 	btrfs_item_key(right, &disk_key, 0);
3907 	fixup_low_keys(path, &disk_key, 1);
3908 
3909 	/* then fixup the leaf pointer in the path */
3910 	if (path->slots[0] < push_items) {
3911 		path->slots[0] += old_left_nritems;
3912 		btrfs_tree_unlock(path->nodes[0]);
3913 		free_extent_buffer(path->nodes[0]);
3914 		path->nodes[0] = left;
3915 		path->slots[1] -= 1;
3916 	} else {
3917 		btrfs_tree_unlock(left);
3918 		free_extent_buffer(left);
3919 		path->slots[0] -= push_items;
3920 	}
3921 	BUG_ON(path->slots[0] < 0);
3922 	return ret;
3923 out:
3924 	btrfs_tree_unlock(left);
3925 	free_extent_buffer(left);
3926 	return ret;
3927 }
3928 
3929 /*
3930  * push some data in the path leaf to the left, trying to free up at
3931  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3932  *
3933  * max_slot can put a limit on how far into the leaf we'll push items.  The
3934  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3935  * items
3936  */
3937 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3938 			  *root, struct btrfs_path *path, int min_data_size,
3939 			  int data_size, int empty, u32 max_slot)
3940 {
3941 	struct extent_buffer *right = path->nodes[0];
3942 	struct extent_buffer *left;
3943 	int slot;
3944 	int free_space;
3945 	u32 right_nritems;
3946 	int ret = 0;
3947 
3948 	slot = path->slots[1];
3949 	if (slot == 0)
3950 		return 1;
3951 	if (!path->nodes[1])
3952 		return 1;
3953 
3954 	right_nritems = btrfs_header_nritems(right);
3955 	if (right_nritems == 0)
3956 		return 1;
3957 
3958 	btrfs_assert_tree_locked(path->nodes[1]);
3959 
3960 	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3961 	/*
3962 	 * slot - 1 is not valid or we fail to read the left node,
3963 	 * no big deal, just return.
3964 	 */
3965 	if (IS_ERR(left))
3966 		return 1;
3967 
3968 	btrfs_tree_lock(left);
3969 	btrfs_set_lock_blocking_write(left);
3970 
3971 	free_space = btrfs_leaf_free_space(left);
3972 	if (free_space < data_size) {
3973 		ret = 1;
3974 		goto out;
3975 	}
3976 
3977 	/* cow and double check */
3978 	ret = btrfs_cow_block(trans, root, left,
3979 			      path->nodes[1], slot - 1, &left);
3980 	if (ret) {
3981 		/* we hit -ENOSPC, but it isn't fatal here */
3982 		if (ret == -ENOSPC)
3983 			ret = 1;
3984 		goto out;
3985 	}
3986 
3987 	free_space = btrfs_leaf_free_space(left);
3988 	if (free_space < data_size) {
3989 		ret = 1;
3990 		goto out;
3991 	}
3992 
3993 	return __push_leaf_left(path, min_data_size,
3994 			       empty, left, free_space, right_nritems,
3995 			       max_slot);
3996 out:
3997 	btrfs_tree_unlock(left);
3998 	free_extent_buffer(left);
3999 	return ret;
4000 }
4001 
4002 /*
4003  * split the path's leaf in two, making sure there is at least data_size
4004  * available for the resulting leaf level of the path.
4005  */
4006 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4007 				    struct btrfs_path *path,
4008 				    struct extent_buffer *l,
4009 				    struct extent_buffer *right,
4010 				    int slot, int mid, int nritems)
4011 {
4012 	struct btrfs_fs_info *fs_info = trans->fs_info;
4013 	int data_copy_size;
4014 	int rt_data_off;
4015 	int i;
4016 	struct btrfs_disk_key disk_key;
4017 	struct btrfs_map_token token;
4018 
4019 	nritems = nritems - mid;
4020 	btrfs_set_header_nritems(right, nritems);
4021 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
4022 
4023 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4024 			   btrfs_item_nr_offset(mid),
4025 			   nritems * sizeof(struct btrfs_item));
4026 
4027 	copy_extent_buffer(right, l,
4028 		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4029 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4030 		     leaf_data_end(l), data_copy_size);
4031 
4032 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4033 
4034 	btrfs_init_map_token(&token, right);
4035 	for (i = 0; i < nritems; i++) {
4036 		struct btrfs_item *item = btrfs_item_nr(i);
4037 		u32 ioff;
4038 
4039 		ioff = btrfs_token_item_offset(right, item, &token);
4040 		btrfs_set_token_item_offset(right, item,
4041 					    ioff + rt_data_off, &token);
4042 	}
4043 
4044 	btrfs_set_header_nritems(l, mid);
4045 	btrfs_item_key(right, &disk_key, 0);
4046 	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4047 
4048 	btrfs_mark_buffer_dirty(right);
4049 	btrfs_mark_buffer_dirty(l);
4050 	BUG_ON(path->slots[0] != slot);
4051 
4052 	if (mid <= slot) {
4053 		btrfs_tree_unlock(path->nodes[0]);
4054 		free_extent_buffer(path->nodes[0]);
4055 		path->nodes[0] = right;
4056 		path->slots[0] -= mid;
4057 		path->slots[1] += 1;
4058 	} else {
4059 		btrfs_tree_unlock(right);
4060 		free_extent_buffer(right);
4061 	}
4062 
4063 	BUG_ON(path->slots[0] < 0);
4064 }
4065 
4066 /*
4067  * double splits happen when we need to insert a big item in the middle
4068  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4069  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4070  *          A                 B                 C
4071  *
4072  * We avoid this by trying to push the items on either side of our target
4073  * into the adjacent leaves.  If all goes well we can avoid the double split
4074  * completely.
4075  */
4076 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4077 					  struct btrfs_root *root,
4078 					  struct btrfs_path *path,
4079 					  int data_size)
4080 {
4081 	int ret;
4082 	int progress = 0;
4083 	int slot;
4084 	u32 nritems;
4085 	int space_needed = data_size;
4086 
4087 	slot = path->slots[0];
4088 	if (slot < btrfs_header_nritems(path->nodes[0]))
4089 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4090 
4091 	/*
4092 	 * try to push all the items after our slot into the
4093 	 * right leaf
4094 	 */
4095 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4096 	if (ret < 0)
4097 		return ret;
4098 
4099 	if (ret == 0)
4100 		progress++;
4101 
4102 	nritems = btrfs_header_nritems(path->nodes[0]);
4103 	/*
4104 	 * our goal is to get our slot at the start or end of a leaf.  If
4105 	 * we've done so we're done
4106 	 */
4107 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4108 		return 0;
4109 
4110 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4111 		return 0;
4112 
4113 	/* try to push all the items before our slot into the next leaf */
4114 	slot = path->slots[0];
4115 	space_needed = data_size;
4116 	if (slot > 0)
4117 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4118 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4119 	if (ret < 0)
4120 		return ret;
4121 
4122 	if (ret == 0)
4123 		progress++;
4124 
4125 	if (progress)
4126 		return 0;
4127 	return 1;
4128 }
4129 
4130 /*
4131  * split the path's leaf in two, making sure there is at least data_size
4132  * available for the resulting leaf level of the path.
4133  *
4134  * returns 0 if all went well and < 0 on failure.
4135  */
4136 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4137 			       struct btrfs_root *root,
4138 			       const struct btrfs_key *ins_key,
4139 			       struct btrfs_path *path, int data_size,
4140 			       int extend)
4141 {
4142 	struct btrfs_disk_key disk_key;
4143 	struct extent_buffer *l;
4144 	u32 nritems;
4145 	int mid;
4146 	int slot;
4147 	struct extent_buffer *right;
4148 	struct btrfs_fs_info *fs_info = root->fs_info;
4149 	int ret = 0;
4150 	int wret;
4151 	int split;
4152 	int num_doubles = 0;
4153 	int tried_avoid_double = 0;
4154 
4155 	l = path->nodes[0];
4156 	slot = path->slots[0];
4157 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4158 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4159 		return -EOVERFLOW;
4160 
4161 	/* first try to make some room by pushing left and right */
4162 	if (data_size && path->nodes[1]) {
4163 		int space_needed = data_size;
4164 
4165 		if (slot < btrfs_header_nritems(l))
4166 			space_needed -= btrfs_leaf_free_space(l);
4167 
4168 		wret = push_leaf_right(trans, root, path, space_needed,
4169 				       space_needed, 0, 0);
4170 		if (wret < 0)
4171 			return wret;
4172 		if (wret) {
4173 			space_needed = data_size;
4174 			if (slot > 0)
4175 				space_needed -= btrfs_leaf_free_space(l);
4176 			wret = push_leaf_left(trans, root, path, space_needed,
4177 					      space_needed, 0, (u32)-1);
4178 			if (wret < 0)
4179 				return wret;
4180 		}
4181 		l = path->nodes[0];
4182 
4183 		/* did the pushes work? */
4184 		if (btrfs_leaf_free_space(l) >= data_size)
4185 			return 0;
4186 	}
4187 
4188 	if (!path->nodes[1]) {
4189 		ret = insert_new_root(trans, root, path, 1);
4190 		if (ret)
4191 			return ret;
4192 	}
4193 again:
4194 	split = 1;
4195 	l = path->nodes[0];
4196 	slot = path->slots[0];
4197 	nritems = btrfs_header_nritems(l);
4198 	mid = (nritems + 1) / 2;
4199 
4200 	if (mid <= slot) {
4201 		if (nritems == 1 ||
4202 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4203 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4204 			if (slot >= nritems) {
4205 				split = 0;
4206 			} else {
4207 				mid = slot;
4208 				if (mid != nritems &&
4209 				    leaf_space_used(l, mid, nritems - mid) +
4210 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4211 					if (data_size && !tried_avoid_double)
4212 						goto push_for_double;
4213 					split = 2;
4214 				}
4215 			}
4216 		}
4217 	} else {
4218 		if (leaf_space_used(l, 0, mid) + data_size >
4219 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4220 			if (!extend && data_size && slot == 0) {
4221 				split = 0;
4222 			} else if ((extend || !data_size) && slot == 0) {
4223 				mid = 1;
4224 			} else {
4225 				mid = slot;
4226 				if (mid != nritems &&
4227 				    leaf_space_used(l, mid, nritems - mid) +
4228 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4229 					if (data_size && !tried_avoid_double)
4230 						goto push_for_double;
4231 					split = 2;
4232 				}
4233 			}
4234 		}
4235 	}
4236 
4237 	if (split == 0)
4238 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4239 	else
4240 		btrfs_item_key(l, &disk_key, mid);
4241 
4242 	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4243 					     l->start, 0);
4244 	if (IS_ERR(right))
4245 		return PTR_ERR(right);
4246 
4247 	root_add_used(root, fs_info->nodesize);
4248 
4249 	if (split == 0) {
4250 		if (mid <= slot) {
4251 			btrfs_set_header_nritems(right, 0);
4252 			insert_ptr(trans, path, &disk_key,
4253 				   right->start, path->slots[1] + 1, 1);
4254 			btrfs_tree_unlock(path->nodes[0]);
4255 			free_extent_buffer(path->nodes[0]);
4256 			path->nodes[0] = right;
4257 			path->slots[0] = 0;
4258 			path->slots[1] += 1;
4259 		} else {
4260 			btrfs_set_header_nritems(right, 0);
4261 			insert_ptr(trans, path, &disk_key,
4262 				   right->start, path->slots[1], 1);
4263 			btrfs_tree_unlock(path->nodes[0]);
4264 			free_extent_buffer(path->nodes[0]);
4265 			path->nodes[0] = right;
4266 			path->slots[0] = 0;
4267 			if (path->slots[1] == 0)
4268 				fixup_low_keys(path, &disk_key, 1);
4269 		}
4270 		/*
4271 		 * We create a new leaf 'right' for the required ins_len and
4272 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4273 		 * the content of ins_len to 'right'.
4274 		 */
4275 		return ret;
4276 	}
4277 
4278 	copy_for_split(trans, path, l, right, slot, mid, nritems);
4279 
4280 	if (split == 2) {
4281 		BUG_ON(num_doubles != 0);
4282 		num_doubles++;
4283 		goto again;
4284 	}
4285 
4286 	return 0;
4287 
4288 push_for_double:
4289 	push_for_double_split(trans, root, path, data_size);
4290 	tried_avoid_double = 1;
4291 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4292 		return 0;
4293 	goto again;
4294 }
4295 
4296 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4297 					 struct btrfs_root *root,
4298 					 struct btrfs_path *path, int ins_len)
4299 {
4300 	struct btrfs_key key;
4301 	struct extent_buffer *leaf;
4302 	struct btrfs_file_extent_item *fi;
4303 	u64 extent_len = 0;
4304 	u32 item_size;
4305 	int ret;
4306 
4307 	leaf = path->nodes[0];
4308 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4309 
4310 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4311 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4312 
4313 	if (btrfs_leaf_free_space(leaf) >= ins_len)
4314 		return 0;
4315 
4316 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4317 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4318 		fi = btrfs_item_ptr(leaf, path->slots[0],
4319 				    struct btrfs_file_extent_item);
4320 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4321 	}
4322 	btrfs_release_path(path);
4323 
4324 	path->keep_locks = 1;
4325 	path->search_for_split = 1;
4326 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4327 	path->search_for_split = 0;
4328 	if (ret > 0)
4329 		ret = -EAGAIN;
4330 	if (ret < 0)
4331 		goto err;
4332 
4333 	ret = -EAGAIN;
4334 	leaf = path->nodes[0];
4335 	/* if our item isn't there, return now */
4336 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4337 		goto err;
4338 
4339 	/* the leaf has  changed, it now has room.  return now */
4340 	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4341 		goto err;
4342 
4343 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4344 		fi = btrfs_item_ptr(leaf, path->slots[0],
4345 				    struct btrfs_file_extent_item);
4346 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4347 			goto err;
4348 	}
4349 
4350 	btrfs_set_path_blocking(path);
4351 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4352 	if (ret)
4353 		goto err;
4354 
4355 	path->keep_locks = 0;
4356 	btrfs_unlock_up_safe(path, 1);
4357 	return 0;
4358 err:
4359 	path->keep_locks = 0;
4360 	return ret;
4361 }
4362 
4363 static noinline int split_item(struct btrfs_path *path,
4364 			       const struct btrfs_key *new_key,
4365 			       unsigned long split_offset)
4366 {
4367 	struct extent_buffer *leaf;
4368 	struct btrfs_item *item;
4369 	struct btrfs_item *new_item;
4370 	int slot;
4371 	char *buf;
4372 	u32 nritems;
4373 	u32 item_size;
4374 	u32 orig_offset;
4375 	struct btrfs_disk_key disk_key;
4376 
4377 	leaf = path->nodes[0];
4378 	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4379 
4380 	btrfs_set_path_blocking(path);
4381 
4382 	item = btrfs_item_nr(path->slots[0]);
4383 	orig_offset = btrfs_item_offset(leaf, item);
4384 	item_size = btrfs_item_size(leaf, item);
4385 
4386 	buf = kmalloc(item_size, GFP_NOFS);
4387 	if (!buf)
4388 		return -ENOMEM;
4389 
4390 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4391 			    path->slots[0]), item_size);
4392 
4393 	slot = path->slots[0] + 1;
4394 	nritems = btrfs_header_nritems(leaf);
4395 	if (slot != nritems) {
4396 		/* shift the items */
4397 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4398 				btrfs_item_nr_offset(slot),
4399 				(nritems - slot) * sizeof(struct btrfs_item));
4400 	}
4401 
4402 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4403 	btrfs_set_item_key(leaf, &disk_key, slot);
4404 
4405 	new_item = btrfs_item_nr(slot);
4406 
4407 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4408 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4409 
4410 	btrfs_set_item_offset(leaf, item,
4411 			      orig_offset + item_size - split_offset);
4412 	btrfs_set_item_size(leaf, item, split_offset);
4413 
4414 	btrfs_set_header_nritems(leaf, nritems + 1);
4415 
4416 	/* write the data for the start of the original item */
4417 	write_extent_buffer(leaf, buf,
4418 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4419 			    split_offset);
4420 
4421 	/* write the data for the new item */
4422 	write_extent_buffer(leaf, buf + split_offset,
4423 			    btrfs_item_ptr_offset(leaf, slot),
4424 			    item_size - split_offset);
4425 	btrfs_mark_buffer_dirty(leaf);
4426 
4427 	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4428 	kfree(buf);
4429 	return 0;
4430 }
4431 
4432 /*
4433  * This function splits a single item into two items,
4434  * giving 'new_key' to the new item and splitting the
4435  * old one at split_offset (from the start of the item).
4436  *
4437  * The path may be released by this operation.  After
4438  * the split, the path is pointing to the old item.  The
4439  * new item is going to be in the same node as the old one.
4440  *
4441  * Note, the item being split must be smaller enough to live alone on
4442  * a tree block with room for one extra struct btrfs_item
4443  *
4444  * This allows us to split the item in place, keeping a lock on the
4445  * leaf the entire time.
4446  */
4447 int btrfs_split_item(struct btrfs_trans_handle *trans,
4448 		     struct btrfs_root *root,
4449 		     struct btrfs_path *path,
4450 		     const struct btrfs_key *new_key,
4451 		     unsigned long split_offset)
4452 {
4453 	int ret;
4454 	ret = setup_leaf_for_split(trans, root, path,
4455 				   sizeof(struct btrfs_item));
4456 	if (ret)
4457 		return ret;
4458 
4459 	ret = split_item(path, new_key, split_offset);
4460 	return ret;
4461 }
4462 
4463 /*
4464  * This function duplicate a item, giving 'new_key' to the new item.
4465  * It guarantees both items live in the same tree leaf and the new item
4466  * is contiguous with the original item.
4467  *
4468  * This allows us to split file extent in place, keeping a lock on the
4469  * leaf the entire time.
4470  */
4471 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4472 			 struct btrfs_root *root,
4473 			 struct btrfs_path *path,
4474 			 const struct btrfs_key *new_key)
4475 {
4476 	struct extent_buffer *leaf;
4477 	int ret;
4478 	u32 item_size;
4479 
4480 	leaf = path->nodes[0];
4481 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4482 	ret = setup_leaf_for_split(trans, root, path,
4483 				   item_size + sizeof(struct btrfs_item));
4484 	if (ret)
4485 		return ret;
4486 
4487 	path->slots[0]++;
4488 	setup_items_for_insert(root, path, new_key, &item_size,
4489 			       item_size, item_size +
4490 			       sizeof(struct btrfs_item), 1);
4491 	leaf = path->nodes[0];
4492 	memcpy_extent_buffer(leaf,
4493 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4494 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4495 			     item_size);
4496 	return 0;
4497 }
4498 
4499 /*
4500  * make the item pointed to by the path smaller.  new_size indicates
4501  * how small to make it, and from_end tells us if we just chop bytes
4502  * off the end of the item or if we shift the item to chop bytes off
4503  * the front.
4504  */
4505 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4506 {
4507 	int slot;
4508 	struct extent_buffer *leaf;
4509 	struct btrfs_item *item;
4510 	u32 nritems;
4511 	unsigned int data_end;
4512 	unsigned int old_data_start;
4513 	unsigned int old_size;
4514 	unsigned int size_diff;
4515 	int i;
4516 	struct btrfs_map_token token;
4517 
4518 	leaf = path->nodes[0];
4519 	slot = path->slots[0];
4520 
4521 	old_size = btrfs_item_size_nr(leaf, slot);
4522 	if (old_size == new_size)
4523 		return;
4524 
4525 	nritems = btrfs_header_nritems(leaf);
4526 	data_end = leaf_data_end(leaf);
4527 
4528 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4529 
4530 	size_diff = old_size - new_size;
4531 
4532 	BUG_ON(slot < 0);
4533 	BUG_ON(slot >= nritems);
4534 
4535 	/*
4536 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4537 	 */
4538 	/* first correct the data pointers */
4539 	btrfs_init_map_token(&token, leaf);
4540 	for (i = slot; i < nritems; i++) {
4541 		u32 ioff;
4542 		item = btrfs_item_nr(i);
4543 
4544 		ioff = btrfs_token_item_offset(leaf, item, &token);
4545 		btrfs_set_token_item_offset(leaf, item,
4546 					    ioff + size_diff, &token);
4547 	}
4548 
4549 	/* shift the data */
4550 	if (from_end) {
4551 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4552 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4553 			      data_end, old_data_start + new_size - data_end);
4554 	} else {
4555 		struct btrfs_disk_key disk_key;
4556 		u64 offset;
4557 
4558 		btrfs_item_key(leaf, &disk_key, slot);
4559 
4560 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4561 			unsigned long ptr;
4562 			struct btrfs_file_extent_item *fi;
4563 
4564 			fi = btrfs_item_ptr(leaf, slot,
4565 					    struct btrfs_file_extent_item);
4566 			fi = (struct btrfs_file_extent_item *)(
4567 			     (unsigned long)fi - size_diff);
4568 
4569 			if (btrfs_file_extent_type(leaf, fi) ==
4570 			    BTRFS_FILE_EXTENT_INLINE) {
4571 				ptr = btrfs_item_ptr_offset(leaf, slot);
4572 				memmove_extent_buffer(leaf, ptr,
4573 				      (unsigned long)fi,
4574 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4575 			}
4576 		}
4577 
4578 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4579 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4580 			      data_end, old_data_start - data_end);
4581 
4582 		offset = btrfs_disk_key_offset(&disk_key);
4583 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4584 		btrfs_set_item_key(leaf, &disk_key, slot);
4585 		if (slot == 0)
4586 			fixup_low_keys(path, &disk_key, 1);
4587 	}
4588 
4589 	item = btrfs_item_nr(slot);
4590 	btrfs_set_item_size(leaf, item, new_size);
4591 	btrfs_mark_buffer_dirty(leaf);
4592 
4593 	if (btrfs_leaf_free_space(leaf) < 0) {
4594 		btrfs_print_leaf(leaf);
4595 		BUG();
4596 	}
4597 }
4598 
4599 /*
4600  * make the item pointed to by the path bigger, data_size is the added size.
4601  */
4602 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4603 {
4604 	int slot;
4605 	struct extent_buffer *leaf;
4606 	struct btrfs_item *item;
4607 	u32 nritems;
4608 	unsigned int data_end;
4609 	unsigned int old_data;
4610 	unsigned int old_size;
4611 	int i;
4612 	struct btrfs_map_token token;
4613 
4614 	leaf = path->nodes[0];
4615 
4616 	nritems = btrfs_header_nritems(leaf);
4617 	data_end = leaf_data_end(leaf);
4618 
4619 	if (btrfs_leaf_free_space(leaf) < data_size) {
4620 		btrfs_print_leaf(leaf);
4621 		BUG();
4622 	}
4623 	slot = path->slots[0];
4624 	old_data = btrfs_item_end_nr(leaf, slot);
4625 
4626 	BUG_ON(slot < 0);
4627 	if (slot >= nritems) {
4628 		btrfs_print_leaf(leaf);
4629 		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4630 			   slot, nritems);
4631 		BUG();
4632 	}
4633 
4634 	/*
4635 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4636 	 */
4637 	/* first correct the data pointers */
4638 	btrfs_init_map_token(&token, leaf);
4639 	for (i = slot; i < nritems; i++) {
4640 		u32 ioff;
4641 		item = btrfs_item_nr(i);
4642 
4643 		ioff = btrfs_token_item_offset(leaf, item, &token);
4644 		btrfs_set_token_item_offset(leaf, item,
4645 					    ioff - data_size, &token);
4646 	}
4647 
4648 	/* shift the data */
4649 	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4650 		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4651 		      data_end, old_data - data_end);
4652 
4653 	data_end = old_data;
4654 	old_size = btrfs_item_size_nr(leaf, slot);
4655 	item = btrfs_item_nr(slot);
4656 	btrfs_set_item_size(leaf, item, old_size + data_size);
4657 	btrfs_mark_buffer_dirty(leaf);
4658 
4659 	if (btrfs_leaf_free_space(leaf) < 0) {
4660 		btrfs_print_leaf(leaf);
4661 		BUG();
4662 	}
4663 }
4664 
4665 /*
4666  * this is a helper for btrfs_insert_empty_items, the main goal here is
4667  * to save stack depth by doing the bulk of the work in a function
4668  * that doesn't call btrfs_search_slot
4669  */
4670 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4671 			    const struct btrfs_key *cpu_key, u32 *data_size,
4672 			    u32 total_data, u32 total_size, int nr)
4673 {
4674 	struct btrfs_fs_info *fs_info = root->fs_info;
4675 	struct btrfs_item *item;
4676 	int i;
4677 	u32 nritems;
4678 	unsigned int data_end;
4679 	struct btrfs_disk_key disk_key;
4680 	struct extent_buffer *leaf;
4681 	int slot;
4682 	struct btrfs_map_token token;
4683 
4684 	if (path->slots[0] == 0) {
4685 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4686 		fixup_low_keys(path, &disk_key, 1);
4687 	}
4688 	btrfs_unlock_up_safe(path, 1);
4689 
4690 	leaf = path->nodes[0];
4691 	slot = path->slots[0];
4692 
4693 	nritems = btrfs_header_nritems(leaf);
4694 	data_end = leaf_data_end(leaf);
4695 
4696 	if (btrfs_leaf_free_space(leaf) < total_size) {
4697 		btrfs_print_leaf(leaf);
4698 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
4699 			   total_size, btrfs_leaf_free_space(leaf));
4700 		BUG();
4701 	}
4702 
4703 	btrfs_init_map_token(&token, leaf);
4704 	if (slot != nritems) {
4705 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4706 
4707 		if (old_data < data_end) {
4708 			btrfs_print_leaf(leaf);
4709 			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4710 				   slot, old_data, data_end);
4711 			BUG();
4712 		}
4713 		/*
4714 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4715 		 */
4716 		/* first correct the data pointers */
4717 		for (i = slot; i < nritems; i++) {
4718 			u32 ioff;
4719 
4720 			item = btrfs_item_nr(i);
4721 			ioff = btrfs_token_item_offset(leaf, item, &token);
4722 			btrfs_set_token_item_offset(leaf, item,
4723 						    ioff - total_data, &token);
4724 		}
4725 		/* shift the items */
4726 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4727 			      btrfs_item_nr_offset(slot),
4728 			      (nritems - slot) * sizeof(struct btrfs_item));
4729 
4730 		/* shift the data */
4731 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4732 			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4733 			      data_end, old_data - data_end);
4734 		data_end = old_data;
4735 	}
4736 
4737 	/* setup the item for the new data */
4738 	for (i = 0; i < nr; i++) {
4739 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4740 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4741 		item = btrfs_item_nr(slot + i);
4742 		btrfs_set_token_item_offset(leaf, item,
4743 					    data_end - data_size[i], &token);
4744 		data_end -= data_size[i];
4745 		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4746 	}
4747 
4748 	btrfs_set_header_nritems(leaf, nritems + nr);
4749 	btrfs_mark_buffer_dirty(leaf);
4750 
4751 	if (btrfs_leaf_free_space(leaf) < 0) {
4752 		btrfs_print_leaf(leaf);
4753 		BUG();
4754 	}
4755 }
4756 
4757 /*
4758  * Given a key and some data, insert items into the tree.
4759  * This does all the path init required, making room in the tree if needed.
4760  */
4761 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4762 			    struct btrfs_root *root,
4763 			    struct btrfs_path *path,
4764 			    const struct btrfs_key *cpu_key, u32 *data_size,
4765 			    int nr)
4766 {
4767 	int ret = 0;
4768 	int slot;
4769 	int i;
4770 	u32 total_size = 0;
4771 	u32 total_data = 0;
4772 
4773 	for (i = 0; i < nr; i++)
4774 		total_data += data_size[i];
4775 
4776 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4777 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4778 	if (ret == 0)
4779 		return -EEXIST;
4780 	if (ret < 0)
4781 		return ret;
4782 
4783 	slot = path->slots[0];
4784 	BUG_ON(slot < 0);
4785 
4786 	setup_items_for_insert(root, path, cpu_key, data_size,
4787 			       total_data, total_size, nr);
4788 	return 0;
4789 }
4790 
4791 /*
4792  * Given a key and some data, insert an item into the tree.
4793  * This does all the path init required, making room in the tree if needed.
4794  */
4795 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4796 		      const struct btrfs_key *cpu_key, void *data,
4797 		      u32 data_size)
4798 {
4799 	int ret = 0;
4800 	struct btrfs_path *path;
4801 	struct extent_buffer *leaf;
4802 	unsigned long ptr;
4803 
4804 	path = btrfs_alloc_path();
4805 	if (!path)
4806 		return -ENOMEM;
4807 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4808 	if (!ret) {
4809 		leaf = path->nodes[0];
4810 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4811 		write_extent_buffer(leaf, data, ptr, data_size);
4812 		btrfs_mark_buffer_dirty(leaf);
4813 	}
4814 	btrfs_free_path(path);
4815 	return ret;
4816 }
4817 
4818 /*
4819  * delete the pointer from a given node.
4820  *
4821  * the tree should have been previously balanced so the deletion does not
4822  * empty a node.
4823  */
4824 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4825 		    int level, int slot)
4826 {
4827 	struct extent_buffer *parent = path->nodes[level];
4828 	u32 nritems;
4829 	int ret;
4830 
4831 	nritems = btrfs_header_nritems(parent);
4832 	if (slot != nritems - 1) {
4833 		if (level) {
4834 			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4835 					nritems - slot - 1);
4836 			BUG_ON(ret < 0);
4837 		}
4838 		memmove_extent_buffer(parent,
4839 			      btrfs_node_key_ptr_offset(slot),
4840 			      btrfs_node_key_ptr_offset(slot + 1),
4841 			      sizeof(struct btrfs_key_ptr) *
4842 			      (nritems - slot - 1));
4843 	} else if (level) {
4844 		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4845 				GFP_NOFS);
4846 		BUG_ON(ret < 0);
4847 	}
4848 
4849 	nritems--;
4850 	btrfs_set_header_nritems(parent, nritems);
4851 	if (nritems == 0 && parent == root->node) {
4852 		BUG_ON(btrfs_header_level(root->node) != 1);
4853 		/* just turn the root into a leaf and break */
4854 		btrfs_set_header_level(root->node, 0);
4855 	} else if (slot == 0) {
4856 		struct btrfs_disk_key disk_key;
4857 
4858 		btrfs_node_key(parent, &disk_key, 0);
4859 		fixup_low_keys(path, &disk_key, level + 1);
4860 	}
4861 	btrfs_mark_buffer_dirty(parent);
4862 }
4863 
4864 /*
4865  * a helper function to delete the leaf pointed to by path->slots[1] and
4866  * path->nodes[1].
4867  *
4868  * This deletes the pointer in path->nodes[1] and frees the leaf
4869  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4870  *
4871  * The path must have already been setup for deleting the leaf, including
4872  * all the proper balancing.  path->nodes[1] must be locked.
4873  */
4874 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4875 				    struct btrfs_root *root,
4876 				    struct btrfs_path *path,
4877 				    struct extent_buffer *leaf)
4878 {
4879 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4880 	del_ptr(root, path, 1, path->slots[1]);
4881 
4882 	/*
4883 	 * btrfs_free_extent is expensive, we want to make sure we
4884 	 * aren't holding any locks when we call it
4885 	 */
4886 	btrfs_unlock_up_safe(path, 0);
4887 
4888 	root_sub_used(root, leaf->len);
4889 
4890 	atomic_inc(&leaf->refs);
4891 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4892 	free_extent_buffer_stale(leaf);
4893 }
4894 /*
4895  * delete the item at the leaf level in path.  If that empties
4896  * the leaf, remove it from the tree
4897  */
4898 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4899 		    struct btrfs_path *path, int slot, int nr)
4900 {
4901 	struct btrfs_fs_info *fs_info = root->fs_info;
4902 	struct extent_buffer *leaf;
4903 	struct btrfs_item *item;
4904 	u32 last_off;
4905 	u32 dsize = 0;
4906 	int ret = 0;
4907 	int wret;
4908 	int i;
4909 	u32 nritems;
4910 
4911 	leaf = path->nodes[0];
4912 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4913 
4914 	for (i = 0; i < nr; i++)
4915 		dsize += btrfs_item_size_nr(leaf, slot + i);
4916 
4917 	nritems = btrfs_header_nritems(leaf);
4918 
4919 	if (slot + nr != nritems) {
4920 		int data_end = leaf_data_end(leaf);
4921 		struct btrfs_map_token token;
4922 
4923 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4924 			      data_end + dsize,
4925 			      BTRFS_LEAF_DATA_OFFSET + data_end,
4926 			      last_off - data_end);
4927 
4928 		btrfs_init_map_token(&token, leaf);
4929 		for (i = slot + nr; i < nritems; i++) {
4930 			u32 ioff;
4931 
4932 			item = btrfs_item_nr(i);
4933 			ioff = btrfs_token_item_offset(leaf, item, &token);
4934 			btrfs_set_token_item_offset(leaf, item,
4935 						    ioff + dsize, &token);
4936 		}
4937 
4938 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4939 			      btrfs_item_nr_offset(slot + nr),
4940 			      sizeof(struct btrfs_item) *
4941 			      (nritems - slot - nr));
4942 	}
4943 	btrfs_set_header_nritems(leaf, nritems - nr);
4944 	nritems -= nr;
4945 
4946 	/* delete the leaf if we've emptied it */
4947 	if (nritems == 0) {
4948 		if (leaf == root->node) {
4949 			btrfs_set_header_level(leaf, 0);
4950 		} else {
4951 			btrfs_set_path_blocking(path);
4952 			btrfs_clean_tree_block(leaf);
4953 			btrfs_del_leaf(trans, root, path, leaf);
4954 		}
4955 	} else {
4956 		int used = leaf_space_used(leaf, 0, nritems);
4957 		if (slot == 0) {
4958 			struct btrfs_disk_key disk_key;
4959 
4960 			btrfs_item_key(leaf, &disk_key, 0);
4961 			fixup_low_keys(path, &disk_key, 1);
4962 		}
4963 
4964 		/* delete the leaf if it is mostly empty */
4965 		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4966 			/* push_leaf_left fixes the path.
4967 			 * make sure the path still points to our leaf
4968 			 * for possible call to del_ptr below
4969 			 */
4970 			slot = path->slots[1];
4971 			atomic_inc(&leaf->refs);
4972 
4973 			btrfs_set_path_blocking(path);
4974 			wret = push_leaf_left(trans, root, path, 1, 1,
4975 					      1, (u32)-1);
4976 			if (wret < 0 && wret != -ENOSPC)
4977 				ret = wret;
4978 
4979 			if (path->nodes[0] == leaf &&
4980 			    btrfs_header_nritems(leaf)) {
4981 				wret = push_leaf_right(trans, root, path, 1,
4982 						       1, 1, 0);
4983 				if (wret < 0 && wret != -ENOSPC)
4984 					ret = wret;
4985 			}
4986 
4987 			if (btrfs_header_nritems(leaf) == 0) {
4988 				path->slots[1] = slot;
4989 				btrfs_del_leaf(trans, root, path, leaf);
4990 				free_extent_buffer(leaf);
4991 				ret = 0;
4992 			} else {
4993 				/* if we're still in the path, make sure
4994 				 * we're dirty.  Otherwise, one of the
4995 				 * push_leaf functions must have already
4996 				 * dirtied this buffer
4997 				 */
4998 				if (path->nodes[0] == leaf)
4999 					btrfs_mark_buffer_dirty(leaf);
5000 				free_extent_buffer(leaf);
5001 			}
5002 		} else {
5003 			btrfs_mark_buffer_dirty(leaf);
5004 		}
5005 	}
5006 	return ret;
5007 }
5008 
5009 /*
5010  * search the tree again to find a leaf with lesser keys
5011  * returns 0 if it found something or 1 if there are no lesser leaves.
5012  * returns < 0 on io errors.
5013  *
5014  * This may release the path, and so you may lose any locks held at the
5015  * time you call it.
5016  */
5017 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5018 {
5019 	struct btrfs_key key;
5020 	struct btrfs_disk_key found_key;
5021 	int ret;
5022 
5023 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5024 
5025 	if (key.offset > 0) {
5026 		key.offset--;
5027 	} else if (key.type > 0) {
5028 		key.type--;
5029 		key.offset = (u64)-1;
5030 	} else if (key.objectid > 0) {
5031 		key.objectid--;
5032 		key.type = (u8)-1;
5033 		key.offset = (u64)-1;
5034 	} else {
5035 		return 1;
5036 	}
5037 
5038 	btrfs_release_path(path);
5039 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5040 	if (ret < 0)
5041 		return ret;
5042 	btrfs_item_key(path->nodes[0], &found_key, 0);
5043 	ret = comp_keys(&found_key, &key);
5044 	/*
5045 	 * We might have had an item with the previous key in the tree right
5046 	 * before we released our path. And after we released our path, that
5047 	 * item might have been pushed to the first slot (0) of the leaf we
5048 	 * were holding due to a tree balance. Alternatively, an item with the
5049 	 * previous key can exist as the only element of a leaf (big fat item).
5050 	 * Therefore account for these 2 cases, so that our callers (like
5051 	 * btrfs_previous_item) don't miss an existing item with a key matching
5052 	 * the previous key we computed above.
5053 	 */
5054 	if (ret <= 0)
5055 		return 0;
5056 	return 1;
5057 }
5058 
5059 /*
5060  * A helper function to walk down the tree starting at min_key, and looking
5061  * for nodes or leaves that are have a minimum transaction id.
5062  * This is used by the btree defrag code, and tree logging
5063  *
5064  * This does not cow, but it does stuff the starting key it finds back
5065  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5066  * key and get a writable path.
5067  *
5068  * This honors path->lowest_level to prevent descent past a given level
5069  * of the tree.
5070  *
5071  * min_trans indicates the oldest transaction that you are interested
5072  * in walking through.  Any nodes or leaves older than min_trans are
5073  * skipped over (without reading them).
5074  *
5075  * returns zero if something useful was found, < 0 on error and 1 if there
5076  * was nothing in the tree that matched the search criteria.
5077  */
5078 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5079 			 struct btrfs_path *path,
5080 			 u64 min_trans)
5081 {
5082 	struct extent_buffer *cur;
5083 	struct btrfs_key found_key;
5084 	int slot;
5085 	int sret;
5086 	u32 nritems;
5087 	int level;
5088 	int ret = 1;
5089 	int keep_locks = path->keep_locks;
5090 
5091 	path->keep_locks = 1;
5092 again:
5093 	cur = btrfs_read_lock_root_node(root);
5094 	level = btrfs_header_level(cur);
5095 	WARN_ON(path->nodes[level]);
5096 	path->nodes[level] = cur;
5097 	path->locks[level] = BTRFS_READ_LOCK;
5098 
5099 	if (btrfs_header_generation(cur) < min_trans) {
5100 		ret = 1;
5101 		goto out;
5102 	}
5103 	while (1) {
5104 		nritems = btrfs_header_nritems(cur);
5105 		level = btrfs_header_level(cur);
5106 		sret = btrfs_bin_search(cur, min_key, level, &slot);
5107 		if (sret < 0) {
5108 			ret = sret;
5109 			goto out;
5110 		}
5111 
5112 		/* at the lowest level, we're done, setup the path and exit */
5113 		if (level == path->lowest_level) {
5114 			if (slot >= nritems)
5115 				goto find_next_key;
5116 			ret = 0;
5117 			path->slots[level] = slot;
5118 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5119 			goto out;
5120 		}
5121 		if (sret && slot > 0)
5122 			slot--;
5123 		/*
5124 		 * check this node pointer against the min_trans parameters.
5125 		 * If it is too old, old, skip to the next one.
5126 		 */
5127 		while (slot < nritems) {
5128 			u64 gen;
5129 
5130 			gen = btrfs_node_ptr_generation(cur, slot);
5131 			if (gen < min_trans) {
5132 				slot++;
5133 				continue;
5134 			}
5135 			break;
5136 		}
5137 find_next_key:
5138 		/*
5139 		 * we didn't find a candidate key in this node, walk forward
5140 		 * and find another one
5141 		 */
5142 		if (slot >= nritems) {
5143 			path->slots[level] = slot;
5144 			btrfs_set_path_blocking(path);
5145 			sret = btrfs_find_next_key(root, path, min_key, level,
5146 						  min_trans);
5147 			if (sret == 0) {
5148 				btrfs_release_path(path);
5149 				goto again;
5150 			} else {
5151 				goto out;
5152 			}
5153 		}
5154 		/* save our key for returning back */
5155 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5156 		path->slots[level] = slot;
5157 		if (level == path->lowest_level) {
5158 			ret = 0;
5159 			goto out;
5160 		}
5161 		btrfs_set_path_blocking(path);
5162 		cur = btrfs_read_node_slot(cur, slot);
5163 		if (IS_ERR(cur)) {
5164 			ret = PTR_ERR(cur);
5165 			goto out;
5166 		}
5167 
5168 		btrfs_tree_read_lock(cur);
5169 
5170 		path->locks[level - 1] = BTRFS_READ_LOCK;
5171 		path->nodes[level - 1] = cur;
5172 		unlock_up(path, level, 1, 0, NULL);
5173 	}
5174 out:
5175 	path->keep_locks = keep_locks;
5176 	if (ret == 0) {
5177 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5178 		btrfs_set_path_blocking(path);
5179 		memcpy(min_key, &found_key, sizeof(found_key));
5180 	}
5181 	return ret;
5182 }
5183 
5184 /*
5185  * this is similar to btrfs_next_leaf, but does not try to preserve
5186  * and fixup the path.  It looks for and returns the next key in the
5187  * tree based on the current path and the min_trans parameters.
5188  *
5189  * 0 is returned if another key is found, < 0 if there are any errors
5190  * and 1 is returned if there are no higher keys in the tree
5191  *
5192  * path->keep_locks should be set to 1 on the search made before
5193  * calling this function.
5194  */
5195 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5196 			struct btrfs_key *key, int level, u64 min_trans)
5197 {
5198 	int slot;
5199 	struct extent_buffer *c;
5200 
5201 	WARN_ON(!path->keep_locks && !path->skip_locking);
5202 	while (level < BTRFS_MAX_LEVEL) {
5203 		if (!path->nodes[level])
5204 			return 1;
5205 
5206 		slot = path->slots[level] + 1;
5207 		c = path->nodes[level];
5208 next:
5209 		if (slot >= btrfs_header_nritems(c)) {
5210 			int ret;
5211 			int orig_lowest;
5212 			struct btrfs_key cur_key;
5213 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5214 			    !path->nodes[level + 1])
5215 				return 1;
5216 
5217 			if (path->locks[level + 1] || path->skip_locking) {
5218 				level++;
5219 				continue;
5220 			}
5221 
5222 			slot = btrfs_header_nritems(c) - 1;
5223 			if (level == 0)
5224 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5225 			else
5226 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5227 
5228 			orig_lowest = path->lowest_level;
5229 			btrfs_release_path(path);
5230 			path->lowest_level = level;
5231 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5232 						0, 0);
5233 			path->lowest_level = orig_lowest;
5234 			if (ret < 0)
5235 				return ret;
5236 
5237 			c = path->nodes[level];
5238 			slot = path->slots[level];
5239 			if (ret == 0)
5240 				slot++;
5241 			goto next;
5242 		}
5243 
5244 		if (level == 0)
5245 			btrfs_item_key_to_cpu(c, key, slot);
5246 		else {
5247 			u64 gen = btrfs_node_ptr_generation(c, slot);
5248 
5249 			if (gen < min_trans) {
5250 				slot++;
5251 				goto next;
5252 			}
5253 			btrfs_node_key_to_cpu(c, key, slot);
5254 		}
5255 		return 0;
5256 	}
5257 	return 1;
5258 }
5259 
5260 /*
5261  * search the tree again to find a leaf with greater keys
5262  * returns 0 if it found something or 1 if there are no greater leaves.
5263  * returns < 0 on io errors.
5264  */
5265 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5266 {
5267 	return btrfs_next_old_leaf(root, path, 0);
5268 }
5269 
5270 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5271 			u64 time_seq)
5272 {
5273 	int slot;
5274 	int level;
5275 	struct extent_buffer *c;
5276 	struct extent_buffer *next;
5277 	struct btrfs_key key;
5278 	u32 nritems;
5279 	int ret;
5280 	int old_spinning = path->leave_spinning;
5281 	int next_rw_lock = 0;
5282 
5283 	nritems = btrfs_header_nritems(path->nodes[0]);
5284 	if (nritems == 0)
5285 		return 1;
5286 
5287 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5288 again:
5289 	level = 1;
5290 	next = NULL;
5291 	next_rw_lock = 0;
5292 	btrfs_release_path(path);
5293 
5294 	path->keep_locks = 1;
5295 	path->leave_spinning = 1;
5296 
5297 	if (time_seq)
5298 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5299 	else
5300 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5301 	path->keep_locks = 0;
5302 
5303 	if (ret < 0)
5304 		return ret;
5305 
5306 	nritems = btrfs_header_nritems(path->nodes[0]);
5307 	/*
5308 	 * by releasing the path above we dropped all our locks.  A balance
5309 	 * could have added more items next to the key that used to be
5310 	 * at the very end of the block.  So, check again here and
5311 	 * advance the path if there are now more items available.
5312 	 */
5313 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5314 		if (ret == 0)
5315 			path->slots[0]++;
5316 		ret = 0;
5317 		goto done;
5318 	}
5319 	/*
5320 	 * So the above check misses one case:
5321 	 * - after releasing the path above, someone has removed the item that
5322 	 *   used to be at the very end of the block, and balance between leafs
5323 	 *   gets another one with bigger key.offset to replace it.
5324 	 *
5325 	 * This one should be returned as well, or we can get leaf corruption
5326 	 * later(esp. in __btrfs_drop_extents()).
5327 	 *
5328 	 * And a bit more explanation about this check,
5329 	 * with ret > 0, the key isn't found, the path points to the slot
5330 	 * where it should be inserted, so the path->slots[0] item must be the
5331 	 * bigger one.
5332 	 */
5333 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5334 		ret = 0;
5335 		goto done;
5336 	}
5337 
5338 	while (level < BTRFS_MAX_LEVEL) {
5339 		if (!path->nodes[level]) {
5340 			ret = 1;
5341 			goto done;
5342 		}
5343 
5344 		slot = path->slots[level] + 1;
5345 		c = path->nodes[level];
5346 		if (slot >= btrfs_header_nritems(c)) {
5347 			level++;
5348 			if (level == BTRFS_MAX_LEVEL) {
5349 				ret = 1;
5350 				goto done;
5351 			}
5352 			continue;
5353 		}
5354 
5355 		if (next) {
5356 			btrfs_tree_unlock_rw(next, next_rw_lock);
5357 			free_extent_buffer(next);
5358 		}
5359 
5360 		next = c;
5361 		next_rw_lock = path->locks[level];
5362 		ret = read_block_for_search(root, path, &next, level,
5363 					    slot, &key);
5364 		if (ret == -EAGAIN)
5365 			goto again;
5366 
5367 		if (ret < 0) {
5368 			btrfs_release_path(path);
5369 			goto done;
5370 		}
5371 
5372 		if (!path->skip_locking) {
5373 			ret = btrfs_try_tree_read_lock(next);
5374 			if (!ret && time_seq) {
5375 				/*
5376 				 * If we don't get the lock, we may be racing
5377 				 * with push_leaf_left, holding that lock while
5378 				 * itself waiting for the leaf we've currently
5379 				 * locked. To solve this situation, we give up
5380 				 * on our lock and cycle.
5381 				 */
5382 				free_extent_buffer(next);
5383 				btrfs_release_path(path);
5384 				cond_resched();
5385 				goto again;
5386 			}
5387 			if (!ret) {
5388 				btrfs_set_path_blocking(path);
5389 				btrfs_tree_read_lock(next);
5390 			}
5391 			next_rw_lock = BTRFS_READ_LOCK;
5392 		}
5393 		break;
5394 	}
5395 	path->slots[level] = slot;
5396 	while (1) {
5397 		level--;
5398 		c = path->nodes[level];
5399 		if (path->locks[level])
5400 			btrfs_tree_unlock_rw(c, path->locks[level]);
5401 
5402 		free_extent_buffer(c);
5403 		path->nodes[level] = next;
5404 		path->slots[level] = 0;
5405 		if (!path->skip_locking)
5406 			path->locks[level] = next_rw_lock;
5407 		if (!level)
5408 			break;
5409 
5410 		ret = read_block_for_search(root, path, &next, level,
5411 					    0, &key);
5412 		if (ret == -EAGAIN)
5413 			goto again;
5414 
5415 		if (ret < 0) {
5416 			btrfs_release_path(path);
5417 			goto done;
5418 		}
5419 
5420 		if (!path->skip_locking) {
5421 			ret = btrfs_try_tree_read_lock(next);
5422 			if (!ret) {
5423 				btrfs_set_path_blocking(path);
5424 				btrfs_tree_read_lock(next);
5425 			}
5426 			next_rw_lock = BTRFS_READ_LOCK;
5427 		}
5428 	}
5429 	ret = 0;
5430 done:
5431 	unlock_up(path, 0, 1, 0, NULL);
5432 	path->leave_spinning = old_spinning;
5433 	if (!old_spinning)
5434 		btrfs_set_path_blocking(path);
5435 
5436 	return ret;
5437 }
5438 
5439 /*
5440  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5441  * searching until it gets past min_objectid or finds an item of 'type'
5442  *
5443  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5444  */
5445 int btrfs_previous_item(struct btrfs_root *root,
5446 			struct btrfs_path *path, u64 min_objectid,
5447 			int type)
5448 {
5449 	struct btrfs_key found_key;
5450 	struct extent_buffer *leaf;
5451 	u32 nritems;
5452 	int ret;
5453 
5454 	while (1) {
5455 		if (path->slots[0] == 0) {
5456 			btrfs_set_path_blocking(path);
5457 			ret = btrfs_prev_leaf(root, path);
5458 			if (ret != 0)
5459 				return ret;
5460 		} else {
5461 			path->slots[0]--;
5462 		}
5463 		leaf = path->nodes[0];
5464 		nritems = btrfs_header_nritems(leaf);
5465 		if (nritems == 0)
5466 			return 1;
5467 		if (path->slots[0] == nritems)
5468 			path->slots[0]--;
5469 
5470 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5471 		if (found_key.objectid < min_objectid)
5472 			break;
5473 		if (found_key.type == type)
5474 			return 0;
5475 		if (found_key.objectid == min_objectid &&
5476 		    found_key.type < type)
5477 			break;
5478 	}
5479 	return 1;
5480 }
5481 
5482 /*
5483  * search in extent tree to find a previous Metadata/Data extent item with
5484  * min objecitd.
5485  *
5486  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5487  */
5488 int btrfs_previous_extent_item(struct btrfs_root *root,
5489 			struct btrfs_path *path, u64 min_objectid)
5490 {
5491 	struct btrfs_key found_key;
5492 	struct extent_buffer *leaf;
5493 	u32 nritems;
5494 	int ret;
5495 
5496 	while (1) {
5497 		if (path->slots[0] == 0) {
5498 			btrfs_set_path_blocking(path);
5499 			ret = btrfs_prev_leaf(root, path);
5500 			if (ret != 0)
5501 				return ret;
5502 		} else {
5503 			path->slots[0]--;
5504 		}
5505 		leaf = path->nodes[0];
5506 		nritems = btrfs_header_nritems(leaf);
5507 		if (nritems == 0)
5508 			return 1;
5509 		if (path->slots[0] == nritems)
5510 			path->slots[0]--;
5511 
5512 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5513 		if (found_key.objectid < min_objectid)
5514 			break;
5515 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5516 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5517 			return 0;
5518 		if (found_key.objectid == min_objectid &&
5519 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5520 			break;
5521 	}
5522 	return 1;
5523 }
5524