xref: /openbmc/linux/fs/btrfs/ctree.c (revision f8523d0e83613ab8d082cd504dc53a09fbba4889)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17 
18 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19 		      *root, struct btrfs_path *path, int level);
20 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21 		      const struct btrfs_key *ins_key, struct btrfs_path *path,
22 		      int data_size, int extend);
23 static int push_node_left(struct btrfs_trans_handle *trans,
24 			  struct extent_buffer *dst,
25 			  struct extent_buffer *src, int empty);
26 static int balance_node_right(struct btrfs_trans_handle *trans,
27 			      struct extent_buffer *dst_buf,
28 			      struct extent_buffer *src_buf);
29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30 		    int level, int slot);
31 
32 static const struct btrfs_csums {
33 	u16		size;
34 	const char	name[10];
35 	const char	driver[12];
36 } btrfs_csums[] = {
37 	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
38 	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
39 	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
40 	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
41 				     .driver = "blake2b-256" },
42 };
43 
44 int btrfs_super_csum_size(const struct btrfs_super_block *s)
45 {
46 	u16 t = btrfs_super_csum_type(s);
47 	/*
48 	 * csum type is validated at mount time
49 	 */
50 	return btrfs_csums[t].size;
51 }
52 
53 const char *btrfs_super_csum_name(u16 csum_type)
54 {
55 	/* csum type is validated at mount time */
56 	return btrfs_csums[csum_type].name;
57 }
58 
59 /*
60  * Return driver name if defined, otherwise the name that's also a valid driver
61  * name
62  */
63 const char *btrfs_super_csum_driver(u16 csum_type)
64 {
65 	/* csum type is validated at mount time */
66 	return btrfs_csums[csum_type].driver[0] ?
67 		btrfs_csums[csum_type].driver :
68 		btrfs_csums[csum_type].name;
69 }
70 
71 size_t __const btrfs_get_num_csums(void)
72 {
73 	return ARRAY_SIZE(btrfs_csums);
74 }
75 
76 struct btrfs_path *btrfs_alloc_path(void)
77 {
78 	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
79 }
80 
81 /* this also releases the path */
82 void btrfs_free_path(struct btrfs_path *p)
83 {
84 	if (!p)
85 		return;
86 	btrfs_release_path(p);
87 	kmem_cache_free(btrfs_path_cachep, p);
88 }
89 
90 /*
91  * path release drops references on the extent buffers in the path
92  * and it drops any locks held by this path
93  *
94  * It is safe to call this on paths that no locks or extent buffers held.
95  */
96 noinline void btrfs_release_path(struct btrfs_path *p)
97 {
98 	int i;
99 
100 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
101 		p->slots[i] = 0;
102 		if (!p->nodes[i])
103 			continue;
104 		if (p->locks[i]) {
105 			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
106 			p->locks[i] = 0;
107 		}
108 		free_extent_buffer(p->nodes[i]);
109 		p->nodes[i] = NULL;
110 	}
111 }
112 
113 /*
114  * safely gets a reference on the root node of a tree.  A lock
115  * is not taken, so a concurrent writer may put a different node
116  * at the root of the tree.  See btrfs_lock_root_node for the
117  * looping required.
118  *
119  * The extent buffer returned by this has a reference taken, so
120  * it won't disappear.  It may stop being the root of the tree
121  * at any time because there are no locks held.
122  */
123 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
124 {
125 	struct extent_buffer *eb;
126 
127 	while (1) {
128 		rcu_read_lock();
129 		eb = rcu_dereference(root->node);
130 
131 		/*
132 		 * RCU really hurts here, we could free up the root node because
133 		 * it was COWed but we may not get the new root node yet so do
134 		 * the inc_not_zero dance and if it doesn't work then
135 		 * synchronize_rcu and try again.
136 		 */
137 		if (atomic_inc_not_zero(&eb->refs)) {
138 			rcu_read_unlock();
139 			break;
140 		}
141 		rcu_read_unlock();
142 		synchronize_rcu();
143 	}
144 	return eb;
145 }
146 
147 /*
148  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
149  * just get put onto a simple dirty list.  Transaction walks this list to make
150  * sure they get properly updated on disk.
151  */
152 static void add_root_to_dirty_list(struct btrfs_root *root)
153 {
154 	struct btrfs_fs_info *fs_info = root->fs_info;
155 
156 	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
157 	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
158 		return;
159 
160 	spin_lock(&fs_info->trans_lock);
161 	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
162 		/* Want the extent tree to be the last on the list */
163 		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
164 			list_move_tail(&root->dirty_list,
165 				       &fs_info->dirty_cowonly_roots);
166 		else
167 			list_move(&root->dirty_list,
168 				  &fs_info->dirty_cowonly_roots);
169 	}
170 	spin_unlock(&fs_info->trans_lock);
171 }
172 
173 /*
174  * used by snapshot creation to make a copy of a root for a tree with
175  * a given objectid.  The buffer with the new root node is returned in
176  * cow_ret, and this func returns zero on success or a negative error code.
177  */
178 int btrfs_copy_root(struct btrfs_trans_handle *trans,
179 		      struct btrfs_root *root,
180 		      struct extent_buffer *buf,
181 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
182 {
183 	struct btrfs_fs_info *fs_info = root->fs_info;
184 	struct extent_buffer *cow;
185 	int ret = 0;
186 	int level;
187 	struct btrfs_disk_key disk_key;
188 
189 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
190 		trans->transid != fs_info->running_transaction->transid);
191 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
192 		trans->transid != root->last_trans);
193 
194 	level = btrfs_header_level(buf);
195 	if (level == 0)
196 		btrfs_item_key(buf, &disk_key, 0);
197 	else
198 		btrfs_node_key(buf, &disk_key, 0);
199 
200 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
201 			&disk_key, level, buf->start, 0);
202 	if (IS_ERR(cow))
203 		return PTR_ERR(cow);
204 
205 	copy_extent_buffer_full(cow, buf);
206 	btrfs_set_header_bytenr(cow, cow->start);
207 	btrfs_set_header_generation(cow, trans->transid);
208 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
209 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
210 				     BTRFS_HEADER_FLAG_RELOC);
211 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
212 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
213 	else
214 		btrfs_set_header_owner(cow, new_root_objectid);
215 
216 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
217 
218 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
219 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
220 		ret = btrfs_inc_ref(trans, root, cow, 1);
221 	else
222 		ret = btrfs_inc_ref(trans, root, cow, 0);
223 
224 	if (ret)
225 		return ret;
226 
227 	btrfs_mark_buffer_dirty(cow);
228 	*cow_ret = cow;
229 	return 0;
230 }
231 
232 enum mod_log_op {
233 	MOD_LOG_KEY_REPLACE,
234 	MOD_LOG_KEY_ADD,
235 	MOD_LOG_KEY_REMOVE,
236 	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
237 	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
238 	MOD_LOG_MOVE_KEYS,
239 	MOD_LOG_ROOT_REPLACE,
240 };
241 
242 struct tree_mod_root {
243 	u64 logical;
244 	u8 level;
245 };
246 
247 struct tree_mod_elem {
248 	struct rb_node node;
249 	u64 logical;
250 	u64 seq;
251 	enum mod_log_op op;
252 
253 	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
254 	int slot;
255 
256 	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
257 	u64 generation;
258 
259 	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
260 	struct btrfs_disk_key key;
261 	u64 blockptr;
262 
263 	/* this is used for op == MOD_LOG_MOVE_KEYS */
264 	struct {
265 		int dst_slot;
266 		int nr_items;
267 	} move;
268 
269 	/* this is used for op == MOD_LOG_ROOT_REPLACE */
270 	struct tree_mod_root old_root;
271 };
272 
273 /*
274  * Pull a new tree mod seq number for our operation.
275  */
276 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
277 {
278 	return atomic64_inc_return(&fs_info->tree_mod_seq);
279 }
280 
281 /*
282  * This adds a new blocker to the tree mod log's blocker list if the @elem
283  * passed does not already have a sequence number set. So when a caller expects
284  * to record tree modifications, it should ensure to set elem->seq to zero
285  * before calling btrfs_get_tree_mod_seq.
286  * Returns a fresh, unused tree log modification sequence number, even if no new
287  * blocker was added.
288  */
289 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
290 			   struct seq_list *elem)
291 {
292 	write_lock(&fs_info->tree_mod_log_lock);
293 	if (!elem->seq) {
294 		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
295 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
296 	}
297 	write_unlock(&fs_info->tree_mod_log_lock);
298 
299 	return elem->seq;
300 }
301 
302 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
303 			    struct seq_list *elem)
304 {
305 	struct rb_root *tm_root;
306 	struct rb_node *node;
307 	struct rb_node *next;
308 	struct tree_mod_elem *tm;
309 	u64 min_seq = (u64)-1;
310 	u64 seq_putting = elem->seq;
311 
312 	if (!seq_putting)
313 		return;
314 
315 	write_lock(&fs_info->tree_mod_log_lock);
316 	list_del(&elem->list);
317 	elem->seq = 0;
318 
319 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
320 		struct seq_list *first;
321 
322 		first = list_first_entry(&fs_info->tree_mod_seq_list,
323 					 struct seq_list, list);
324 		if (seq_putting > first->seq) {
325 			/*
326 			 * Blocker with lower sequence number exists, we
327 			 * cannot remove anything from the log.
328 			 */
329 			write_unlock(&fs_info->tree_mod_log_lock);
330 			return;
331 		}
332 		min_seq = first->seq;
333 	}
334 
335 	/*
336 	 * anything that's lower than the lowest existing (read: blocked)
337 	 * sequence number can be removed from the tree.
338 	 */
339 	tm_root = &fs_info->tree_mod_log;
340 	for (node = rb_first(tm_root); node; node = next) {
341 		next = rb_next(node);
342 		tm = rb_entry(node, struct tree_mod_elem, node);
343 		if (tm->seq >= min_seq)
344 			continue;
345 		rb_erase(node, tm_root);
346 		kfree(tm);
347 	}
348 	write_unlock(&fs_info->tree_mod_log_lock);
349 }
350 
351 /*
352  * key order of the log:
353  *       node/leaf start address -> sequence
354  *
355  * The 'start address' is the logical address of the *new* root node
356  * for root replace operations, or the logical address of the affected
357  * block for all other operations.
358  */
359 static noinline int
360 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
361 {
362 	struct rb_root *tm_root;
363 	struct rb_node **new;
364 	struct rb_node *parent = NULL;
365 	struct tree_mod_elem *cur;
366 
367 	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
368 
369 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
370 
371 	tm_root = &fs_info->tree_mod_log;
372 	new = &tm_root->rb_node;
373 	while (*new) {
374 		cur = rb_entry(*new, struct tree_mod_elem, node);
375 		parent = *new;
376 		if (cur->logical < tm->logical)
377 			new = &((*new)->rb_left);
378 		else if (cur->logical > tm->logical)
379 			new = &((*new)->rb_right);
380 		else if (cur->seq < tm->seq)
381 			new = &((*new)->rb_left);
382 		else if (cur->seq > tm->seq)
383 			new = &((*new)->rb_right);
384 		else
385 			return -EEXIST;
386 	}
387 
388 	rb_link_node(&tm->node, parent, new);
389 	rb_insert_color(&tm->node, tm_root);
390 	return 0;
391 }
392 
393 /*
394  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
395  * returns zero with the tree_mod_log_lock acquired. The caller must hold
396  * this until all tree mod log insertions are recorded in the rb tree and then
397  * write unlock fs_info::tree_mod_log_lock.
398  */
399 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
400 				    struct extent_buffer *eb) {
401 	smp_mb();
402 	if (list_empty(&(fs_info)->tree_mod_seq_list))
403 		return 1;
404 	if (eb && btrfs_header_level(eb) == 0)
405 		return 1;
406 
407 	write_lock(&fs_info->tree_mod_log_lock);
408 	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
409 		write_unlock(&fs_info->tree_mod_log_lock);
410 		return 1;
411 	}
412 
413 	return 0;
414 }
415 
416 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
417 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
418 				    struct extent_buffer *eb)
419 {
420 	smp_mb();
421 	if (list_empty(&(fs_info)->tree_mod_seq_list))
422 		return 0;
423 	if (eb && btrfs_header_level(eb) == 0)
424 		return 0;
425 
426 	return 1;
427 }
428 
429 static struct tree_mod_elem *
430 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
431 		    enum mod_log_op op, gfp_t flags)
432 {
433 	struct tree_mod_elem *tm;
434 
435 	tm = kzalloc(sizeof(*tm), flags);
436 	if (!tm)
437 		return NULL;
438 
439 	tm->logical = eb->start;
440 	if (op != MOD_LOG_KEY_ADD) {
441 		btrfs_node_key(eb, &tm->key, slot);
442 		tm->blockptr = btrfs_node_blockptr(eb, slot);
443 	}
444 	tm->op = op;
445 	tm->slot = slot;
446 	tm->generation = btrfs_node_ptr_generation(eb, slot);
447 	RB_CLEAR_NODE(&tm->node);
448 
449 	return tm;
450 }
451 
452 static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
453 		enum mod_log_op op, gfp_t flags)
454 {
455 	struct tree_mod_elem *tm;
456 	int ret;
457 
458 	if (!tree_mod_need_log(eb->fs_info, eb))
459 		return 0;
460 
461 	tm = alloc_tree_mod_elem(eb, slot, op, flags);
462 	if (!tm)
463 		return -ENOMEM;
464 
465 	if (tree_mod_dont_log(eb->fs_info, eb)) {
466 		kfree(tm);
467 		return 0;
468 	}
469 
470 	ret = __tree_mod_log_insert(eb->fs_info, tm);
471 	write_unlock(&eb->fs_info->tree_mod_log_lock);
472 	if (ret)
473 		kfree(tm);
474 
475 	return ret;
476 }
477 
478 static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
479 		int dst_slot, int src_slot, int nr_items)
480 {
481 	struct tree_mod_elem *tm = NULL;
482 	struct tree_mod_elem **tm_list = NULL;
483 	int ret = 0;
484 	int i;
485 	int locked = 0;
486 
487 	if (!tree_mod_need_log(eb->fs_info, eb))
488 		return 0;
489 
490 	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
491 	if (!tm_list)
492 		return -ENOMEM;
493 
494 	tm = kzalloc(sizeof(*tm), GFP_NOFS);
495 	if (!tm) {
496 		ret = -ENOMEM;
497 		goto free_tms;
498 	}
499 
500 	tm->logical = eb->start;
501 	tm->slot = src_slot;
502 	tm->move.dst_slot = dst_slot;
503 	tm->move.nr_items = nr_items;
504 	tm->op = MOD_LOG_MOVE_KEYS;
505 
506 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
507 		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
508 		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
509 		if (!tm_list[i]) {
510 			ret = -ENOMEM;
511 			goto free_tms;
512 		}
513 	}
514 
515 	if (tree_mod_dont_log(eb->fs_info, eb))
516 		goto free_tms;
517 	locked = 1;
518 
519 	/*
520 	 * When we override something during the move, we log these removals.
521 	 * This can only happen when we move towards the beginning of the
522 	 * buffer, i.e. dst_slot < src_slot.
523 	 */
524 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
525 		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
526 		if (ret)
527 			goto free_tms;
528 	}
529 
530 	ret = __tree_mod_log_insert(eb->fs_info, tm);
531 	if (ret)
532 		goto free_tms;
533 	write_unlock(&eb->fs_info->tree_mod_log_lock);
534 	kfree(tm_list);
535 
536 	return 0;
537 free_tms:
538 	for (i = 0; i < nr_items; i++) {
539 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
540 			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
541 		kfree(tm_list[i]);
542 	}
543 	if (locked)
544 		write_unlock(&eb->fs_info->tree_mod_log_lock);
545 	kfree(tm_list);
546 	kfree(tm);
547 
548 	return ret;
549 }
550 
551 static inline int
552 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
553 		       struct tree_mod_elem **tm_list,
554 		       int nritems)
555 {
556 	int i, j;
557 	int ret;
558 
559 	for (i = nritems - 1; i >= 0; i--) {
560 		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
561 		if (ret) {
562 			for (j = nritems - 1; j > i; j--)
563 				rb_erase(&tm_list[j]->node,
564 					 &fs_info->tree_mod_log);
565 			return ret;
566 		}
567 	}
568 
569 	return 0;
570 }
571 
572 static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
573 			 struct extent_buffer *new_root, int log_removal)
574 {
575 	struct btrfs_fs_info *fs_info = old_root->fs_info;
576 	struct tree_mod_elem *tm = NULL;
577 	struct tree_mod_elem **tm_list = NULL;
578 	int nritems = 0;
579 	int ret = 0;
580 	int i;
581 
582 	if (!tree_mod_need_log(fs_info, NULL))
583 		return 0;
584 
585 	if (log_removal && btrfs_header_level(old_root) > 0) {
586 		nritems = btrfs_header_nritems(old_root);
587 		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
588 				  GFP_NOFS);
589 		if (!tm_list) {
590 			ret = -ENOMEM;
591 			goto free_tms;
592 		}
593 		for (i = 0; i < nritems; i++) {
594 			tm_list[i] = alloc_tree_mod_elem(old_root, i,
595 			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
596 			if (!tm_list[i]) {
597 				ret = -ENOMEM;
598 				goto free_tms;
599 			}
600 		}
601 	}
602 
603 	tm = kzalloc(sizeof(*tm), GFP_NOFS);
604 	if (!tm) {
605 		ret = -ENOMEM;
606 		goto free_tms;
607 	}
608 
609 	tm->logical = new_root->start;
610 	tm->old_root.logical = old_root->start;
611 	tm->old_root.level = btrfs_header_level(old_root);
612 	tm->generation = btrfs_header_generation(old_root);
613 	tm->op = MOD_LOG_ROOT_REPLACE;
614 
615 	if (tree_mod_dont_log(fs_info, NULL))
616 		goto free_tms;
617 
618 	if (tm_list)
619 		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
620 	if (!ret)
621 		ret = __tree_mod_log_insert(fs_info, tm);
622 
623 	write_unlock(&fs_info->tree_mod_log_lock);
624 	if (ret)
625 		goto free_tms;
626 	kfree(tm_list);
627 
628 	return ret;
629 
630 free_tms:
631 	if (tm_list) {
632 		for (i = 0; i < nritems; i++)
633 			kfree(tm_list[i]);
634 		kfree(tm_list);
635 	}
636 	kfree(tm);
637 
638 	return ret;
639 }
640 
641 static struct tree_mod_elem *
642 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
643 		      int smallest)
644 {
645 	struct rb_root *tm_root;
646 	struct rb_node *node;
647 	struct tree_mod_elem *cur = NULL;
648 	struct tree_mod_elem *found = NULL;
649 
650 	read_lock(&fs_info->tree_mod_log_lock);
651 	tm_root = &fs_info->tree_mod_log;
652 	node = tm_root->rb_node;
653 	while (node) {
654 		cur = rb_entry(node, struct tree_mod_elem, node);
655 		if (cur->logical < start) {
656 			node = node->rb_left;
657 		} else if (cur->logical > start) {
658 			node = node->rb_right;
659 		} else if (cur->seq < min_seq) {
660 			node = node->rb_left;
661 		} else if (!smallest) {
662 			/* we want the node with the highest seq */
663 			if (found)
664 				BUG_ON(found->seq > cur->seq);
665 			found = cur;
666 			node = node->rb_left;
667 		} else if (cur->seq > min_seq) {
668 			/* we want the node with the smallest seq */
669 			if (found)
670 				BUG_ON(found->seq < cur->seq);
671 			found = cur;
672 			node = node->rb_right;
673 		} else {
674 			found = cur;
675 			break;
676 		}
677 	}
678 	read_unlock(&fs_info->tree_mod_log_lock);
679 
680 	return found;
681 }
682 
683 /*
684  * this returns the element from the log with the smallest time sequence
685  * value that's in the log (the oldest log item). any element with a time
686  * sequence lower than min_seq will be ignored.
687  */
688 static struct tree_mod_elem *
689 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
690 			   u64 min_seq)
691 {
692 	return __tree_mod_log_search(fs_info, start, min_seq, 1);
693 }
694 
695 /*
696  * this returns the element from the log with the largest time sequence
697  * value that's in the log (the most recent log item). any element with
698  * a time sequence lower than min_seq will be ignored.
699  */
700 static struct tree_mod_elem *
701 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
702 {
703 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
704 }
705 
706 static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
707 		     struct extent_buffer *src, unsigned long dst_offset,
708 		     unsigned long src_offset, int nr_items)
709 {
710 	struct btrfs_fs_info *fs_info = dst->fs_info;
711 	int ret = 0;
712 	struct tree_mod_elem **tm_list = NULL;
713 	struct tree_mod_elem **tm_list_add, **tm_list_rem;
714 	int i;
715 	int locked = 0;
716 
717 	if (!tree_mod_need_log(fs_info, NULL))
718 		return 0;
719 
720 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
721 		return 0;
722 
723 	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
724 			  GFP_NOFS);
725 	if (!tm_list)
726 		return -ENOMEM;
727 
728 	tm_list_add = tm_list;
729 	tm_list_rem = tm_list + nr_items;
730 	for (i = 0; i < nr_items; i++) {
731 		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
732 		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
733 		if (!tm_list_rem[i]) {
734 			ret = -ENOMEM;
735 			goto free_tms;
736 		}
737 
738 		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
739 		    MOD_LOG_KEY_ADD, GFP_NOFS);
740 		if (!tm_list_add[i]) {
741 			ret = -ENOMEM;
742 			goto free_tms;
743 		}
744 	}
745 
746 	if (tree_mod_dont_log(fs_info, NULL))
747 		goto free_tms;
748 	locked = 1;
749 
750 	for (i = 0; i < nr_items; i++) {
751 		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
752 		if (ret)
753 			goto free_tms;
754 		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
755 		if (ret)
756 			goto free_tms;
757 	}
758 
759 	write_unlock(&fs_info->tree_mod_log_lock);
760 	kfree(tm_list);
761 
762 	return 0;
763 
764 free_tms:
765 	for (i = 0; i < nr_items * 2; i++) {
766 		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
767 			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
768 		kfree(tm_list[i]);
769 	}
770 	if (locked)
771 		write_unlock(&fs_info->tree_mod_log_lock);
772 	kfree(tm_list);
773 
774 	return ret;
775 }
776 
777 static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
778 {
779 	struct tree_mod_elem **tm_list = NULL;
780 	int nritems = 0;
781 	int i;
782 	int ret = 0;
783 
784 	if (btrfs_header_level(eb) == 0)
785 		return 0;
786 
787 	if (!tree_mod_need_log(eb->fs_info, NULL))
788 		return 0;
789 
790 	nritems = btrfs_header_nritems(eb);
791 	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
792 	if (!tm_list)
793 		return -ENOMEM;
794 
795 	for (i = 0; i < nritems; i++) {
796 		tm_list[i] = alloc_tree_mod_elem(eb, i,
797 		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
798 		if (!tm_list[i]) {
799 			ret = -ENOMEM;
800 			goto free_tms;
801 		}
802 	}
803 
804 	if (tree_mod_dont_log(eb->fs_info, eb))
805 		goto free_tms;
806 
807 	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
808 	write_unlock(&eb->fs_info->tree_mod_log_lock);
809 	if (ret)
810 		goto free_tms;
811 	kfree(tm_list);
812 
813 	return 0;
814 
815 free_tms:
816 	for (i = 0; i < nritems; i++)
817 		kfree(tm_list[i]);
818 	kfree(tm_list);
819 
820 	return ret;
821 }
822 
823 /*
824  * check if the tree block can be shared by multiple trees
825  */
826 int btrfs_block_can_be_shared(struct btrfs_root *root,
827 			      struct extent_buffer *buf)
828 {
829 	/*
830 	 * Tree blocks not in shareable trees and tree roots are never shared.
831 	 * If a block was allocated after the last snapshot and the block was
832 	 * not allocated by tree relocation, we know the block is not shared.
833 	 */
834 	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
835 	    buf != root->node && buf != root->commit_root &&
836 	    (btrfs_header_generation(buf) <=
837 	     btrfs_root_last_snapshot(&root->root_item) ||
838 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
839 		return 1;
840 
841 	return 0;
842 }
843 
844 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
845 				       struct btrfs_root *root,
846 				       struct extent_buffer *buf,
847 				       struct extent_buffer *cow,
848 				       int *last_ref)
849 {
850 	struct btrfs_fs_info *fs_info = root->fs_info;
851 	u64 refs;
852 	u64 owner;
853 	u64 flags;
854 	u64 new_flags = 0;
855 	int ret;
856 
857 	/*
858 	 * Backrefs update rules:
859 	 *
860 	 * Always use full backrefs for extent pointers in tree block
861 	 * allocated by tree relocation.
862 	 *
863 	 * If a shared tree block is no longer referenced by its owner
864 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
865 	 * use full backrefs for extent pointers in tree block.
866 	 *
867 	 * If a tree block is been relocating
868 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
869 	 * use full backrefs for extent pointers in tree block.
870 	 * The reason for this is some operations (such as drop tree)
871 	 * are only allowed for blocks use full backrefs.
872 	 */
873 
874 	if (btrfs_block_can_be_shared(root, buf)) {
875 		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
876 					       btrfs_header_level(buf), 1,
877 					       &refs, &flags);
878 		if (ret)
879 			return ret;
880 		if (refs == 0) {
881 			ret = -EROFS;
882 			btrfs_handle_fs_error(fs_info, ret, NULL);
883 			return ret;
884 		}
885 	} else {
886 		refs = 1;
887 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
888 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
889 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
890 		else
891 			flags = 0;
892 	}
893 
894 	owner = btrfs_header_owner(buf);
895 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
896 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
897 
898 	if (refs > 1) {
899 		if ((owner == root->root_key.objectid ||
900 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
901 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
902 			ret = btrfs_inc_ref(trans, root, buf, 1);
903 			if (ret)
904 				return ret;
905 
906 			if (root->root_key.objectid ==
907 			    BTRFS_TREE_RELOC_OBJECTID) {
908 				ret = btrfs_dec_ref(trans, root, buf, 0);
909 				if (ret)
910 					return ret;
911 				ret = btrfs_inc_ref(trans, root, cow, 1);
912 				if (ret)
913 					return ret;
914 			}
915 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
916 		} else {
917 
918 			if (root->root_key.objectid ==
919 			    BTRFS_TREE_RELOC_OBJECTID)
920 				ret = btrfs_inc_ref(trans, root, cow, 1);
921 			else
922 				ret = btrfs_inc_ref(trans, root, cow, 0);
923 			if (ret)
924 				return ret;
925 		}
926 		if (new_flags != 0) {
927 			int level = btrfs_header_level(buf);
928 
929 			ret = btrfs_set_disk_extent_flags(trans, buf,
930 							  new_flags, level, 0);
931 			if (ret)
932 				return ret;
933 		}
934 	} else {
935 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
936 			if (root->root_key.objectid ==
937 			    BTRFS_TREE_RELOC_OBJECTID)
938 				ret = btrfs_inc_ref(trans, root, cow, 1);
939 			else
940 				ret = btrfs_inc_ref(trans, root, cow, 0);
941 			if (ret)
942 				return ret;
943 			ret = btrfs_dec_ref(trans, root, buf, 1);
944 			if (ret)
945 				return ret;
946 		}
947 		btrfs_clean_tree_block(buf);
948 		*last_ref = 1;
949 	}
950 	return 0;
951 }
952 
953 static struct extent_buffer *alloc_tree_block_no_bg_flush(
954 					  struct btrfs_trans_handle *trans,
955 					  struct btrfs_root *root,
956 					  u64 parent_start,
957 					  const struct btrfs_disk_key *disk_key,
958 					  int level,
959 					  u64 hint,
960 					  u64 empty_size)
961 {
962 	struct btrfs_fs_info *fs_info = root->fs_info;
963 	struct extent_buffer *ret;
964 
965 	/*
966 	 * If we are COWing a node/leaf from the extent, chunk, device or free
967 	 * space trees, make sure that we do not finish block group creation of
968 	 * pending block groups. We do this to avoid a deadlock.
969 	 * COWing can result in allocation of a new chunk, and flushing pending
970 	 * block groups (btrfs_create_pending_block_groups()) can be triggered
971 	 * when finishing allocation of a new chunk. Creation of a pending block
972 	 * group modifies the extent, chunk, device and free space trees,
973 	 * therefore we could deadlock with ourselves since we are holding a
974 	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
975 	 * try to COW later.
976 	 * For similar reasons, we also need to delay flushing pending block
977 	 * groups when splitting a leaf or node, from one of those trees, since
978 	 * we are holding a write lock on it and its parent or when inserting a
979 	 * new root node for one of those trees.
980 	 */
981 	if (root == fs_info->extent_root ||
982 	    root == fs_info->chunk_root ||
983 	    root == fs_info->dev_root ||
984 	    root == fs_info->free_space_root)
985 		trans->can_flush_pending_bgs = false;
986 
987 	ret = btrfs_alloc_tree_block(trans, root, parent_start,
988 				     root->root_key.objectid, disk_key, level,
989 				     hint, empty_size);
990 	trans->can_flush_pending_bgs = true;
991 
992 	return ret;
993 }
994 
995 /*
996  * does the dirty work in cow of a single block.  The parent block (if
997  * supplied) is updated to point to the new cow copy.  The new buffer is marked
998  * dirty and returned locked.  If you modify the block it needs to be marked
999  * dirty again.
1000  *
1001  * search_start -- an allocation hint for the new block
1002  *
1003  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1004  * bytes the allocator should try to find free next to the block it returns.
1005  * This is just a hint and may be ignored by the allocator.
1006  */
1007 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1008 			     struct btrfs_root *root,
1009 			     struct extent_buffer *buf,
1010 			     struct extent_buffer *parent, int parent_slot,
1011 			     struct extent_buffer **cow_ret,
1012 			     u64 search_start, u64 empty_size)
1013 {
1014 	struct btrfs_fs_info *fs_info = root->fs_info;
1015 	struct btrfs_disk_key disk_key;
1016 	struct extent_buffer *cow;
1017 	int level, ret;
1018 	int last_ref = 0;
1019 	int unlock_orig = 0;
1020 	u64 parent_start = 0;
1021 
1022 	if (*cow_ret == buf)
1023 		unlock_orig = 1;
1024 
1025 	btrfs_assert_tree_locked(buf);
1026 
1027 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1028 		trans->transid != fs_info->running_transaction->transid);
1029 	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1030 		trans->transid != root->last_trans);
1031 
1032 	level = btrfs_header_level(buf);
1033 
1034 	if (level == 0)
1035 		btrfs_item_key(buf, &disk_key, 0);
1036 	else
1037 		btrfs_node_key(buf, &disk_key, 0);
1038 
1039 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1040 		parent_start = parent->start;
1041 
1042 	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
1043 					   level, search_start, empty_size);
1044 	if (IS_ERR(cow))
1045 		return PTR_ERR(cow);
1046 
1047 	/* cow is set to blocking by btrfs_init_new_buffer */
1048 
1049 	copy_extent_buffer_full(cow, buf);
1050 	btrfs_set_header_bytenr(cow, cow->start);
1051 	btrfs_set_header_generation(cow, trans->transid);
1052 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1053 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1054 				     BTRFS_HEADER_FLAG_RELOC);
1055 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1056 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1057 	else
1058 		btrfs_set_header_owner(cow, root->root_key.objectid);
1059 
1060 	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
1061 
1062 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1063 	if (ret) {
1064 		btrfs_abort_transaction(trans, ret);
1065 		return ret;
1066 	}
1067 
1068 	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
1069 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1070 		if (ret) {
1071 			btrfs_abort_transaction(trans, ret);
1072 			return ret;
1073 		}
1074 	}
1075 
1076 	if (buf == root->node) {
1077 		WARN_ON(parent && parent != buf);
1078 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1079 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1080 			parent_start = buf->start;
1081 
1082 		atomic_inc(&cow->refs);
1083 		ret = tree_mod_log_insert_root(root->node, cow, 1);
1084 		BUG_ON(ret < 0);
1085 		rcu_assign_pointer(root->node, cow);
1086 
1087 		btrfs_free_tree_block(trans, root, buf, parent_start,
1088 				      last_ref);
1089 		free_extent_buffer(buf);
1090 		add_root_to_dirty_list(root);
1091 	} else {
1092 		WARN_ON(trans->transid != btrfs_header_generation(parent));
1093 		tree_mod_log_insert_key(parent, parent_slot,
1094 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1095 		btrfs_set_node_blockptr(parent, parent_slot,
1096 					cow->start);
1097 		btrfs_set_node_ptr_generation(parent, parent_slot,
1098 					      trans->transid);
1099 		btrfs_mark_buffer_dirty(parent);
1100 		if (last_ref) {
1101 			ret = tree_mod_log_free_eb(buf);
1102 			if (ret) {
1103 				btrfs_abort_transaction(trans, ret);
1104 				return ret;
1105 			}
1106 		}
1107 		btrfs_free_tree_block(trans, root, buf, parent_start,
1108 				      last_ref);
1109 	}
1110 	if (unlock_orig)
1111 		btrfs_tree_unlock(buf);
1112 	free_extent_buffer_stale(buf);
1113 	btrfs_mark_buffer_dirty(cow);
1114 	*cow_ret = cow;
1115 	return 0;
1116 }
1117 
1118 /*
1119  * returns the logical address of the oldest predecessor of the given root.
1120  * entries older than time_seq are ignored.
1121  */
1122 static struct tree_mod_elem *__tree_mod_log_oldest_root(
1123 		struct extent_buffer *eb_root, u64 time_seq)
1124 {
1125 	struct tree_mod_elem *tm;
1126 	struct tree_mod_elem *found = NULL;
1127 	u64 root_logical = eb_root->start;
1128 	int looped = 0;
1129 
1130 	if (!time_seq)
1131 		return NULL;
1132 
1133 	/*
1134 	 * the very last operation that's logged for a root is the
1135 	 * replacement operation (if it is replaced at all). this has
1136 	 * the logical address of the *new* root, making it the very
1137 	 * first operation that's logged for this root.
1138 	 */
1139 	while (1) {
1140 		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1141 						time_seq);
1142 		if (!looped && !tm)
1143 			return NULL;
1144 		/*
1145 		 * if there are no tree operation for the oldest root, we simply
1146 		 * return it. this should only happen if that (old) root is at
1147 		 * level 0.
1148 		 */
1149 		if (!tm)
1150 			break;
1151 
1152 		/*
1153 		 * if there's an operation that's not a root replacement, we
1154 		 * found the oldest version of our root. normally, we'll find a
1155 		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1156 		 */
1157 		if (tm->op != MOD_LOG_ROOT_REPLACE)
1158 			break;
1159 
1160 		found = tm;
1161 		root_logical = tm->old_root.logical;
1162 		looped = 1;
1163 	}
1164 
1165 	/* if there's no old root to return, return what we found instead */
1166 	if (!found)
1167 		found = tm;
1168 
1169 	return found;
1170 }
1171 
1172 /*
1173  * tm is a pointer to the first operation to rewind within eb. then, all
1174  * previous operations will be rewound (until we reach something older than
1175  * time_seq).
1176  */
1177 static void
1178 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1179 		      u64 time_seq, struct tree_mod_elem *first_tm)
1180 {
1181 	u32 n;
1182 	struct rb_node *next;
1183 	struct tree_mod_elem *tm = first_tm;
1184 	unsigned long o_dst;
1185 	unsigned long o_src;
1186 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
1187 
1188 	n = btrfs_header_nritems(eb);
1189 	read_lock(&fs_info->tree_mod_log_lock);
1190 	while (tm && tm->seq >= time_seq) {
1191 		/*
1192 		 * all the operations are recorded with the operator used for
1193 		 * the modification. as we're going backwards, we do the
1194 		 * opposite of each operation here.
1195 		 */
1196 		switch (tm->op) {
1197 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1198 			BUG_ON(tm->slot < n);
1199 			/* Fallthrough */
1200 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1201 		case MOD_LOG_KEY_REMOVE:
1202 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1203 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1204 			btrfs_set_node_ptr_generation(eb, tm->slot,
1205 						      tm->generation);
1206 			n++;
1207 			break;
1208 		case MOD_LOG_KEY_REPLACE:
1209 			BUG_ON(tm->slot >= n);
1210 			btrfs_set_node_key(eb, &tm->key, tm->slot);
1211 			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1212 			btrfs_set_node_ptr_generation(eb, tm->slot,
1213 						      tm->generation);
1214 			break;
1215 		case MOD_LOG_KEY_ADD:
1216 			/* if a move operation is needed it's in the log */
1217 			n--;
1218 			break;
1219 		case MOD_LOG_MOVE_KEYS:
1220 			o_dst = btrfs_node_key_ptr_offset(tm->slot);
1221 			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1222 			memmove_extent_buffer(eb, o_dst, o_src,
1223 					      tm->move.nr_items * p_size);
1224 			break;
1225 		case MOD_LOG_ROOT_REPLACE:
1226 			/*
1227 			 * this operation is special. for roots, this must be
1228 			 * handled explicitly before rewinding.
1229 			 * for non-roots, this operation may exist if the node
1230 			 * was a root: root A -> child B; then A gets empty and
1231 			 * B is promoted to the new root. in the mod log, we'll
1232 			 * have a root-replace operation for B, a tree block
1233 			 * that is no root. we simply ignore that operation.
1234 			 */
1235 			break;
1236 		}
1237 		next = rb_next(&tm->node);
1238 		if (!next)
1239 			break;
1240 		tm = rb_entry(next, struct tree_mod_elem, node);
1241 		if (tm->logical != first_tm->logical)
1242 			break;
1243 	}
1244 	read_unlock(&fs_info->tree_mod_log_lock);
1245 	btrfs_set_header_nritems(eb, n);
1246 }
1247 
1248 /*
1249  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1250  * is returned. If rewind operations happen, a fresh buffer is returned. The
1251  * returned buffer is always read-locked. If the returned buffer is not the
1252  * input buffer, the lock on the input buffer is released and the input buffer
1253  * is freed (its refcount is decremented).
1254  */
1255 static struct extent_buffer *
1256 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1257 		    struct extent_buffer *eb, u64 time_seq)
1258 {
1259 	struct extent_buffer *eb_rewin;
1260 	struct tree_mod_elem *tm;
1261 
1262 	if (!time_seq)
1263 		return eb;
1264 
1265 	if (btrfs_header_level(eb) == 0)
1266 		return eb;
1267 
1268 	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1269 	if (!tm)
1270 		return eb;
1271 
1272 	btrfs_set_path_blocking(path);
1273 	btrfs_set_lock_blocking_read(eb);
1274 
1275 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1276 		BUG_ON(tm->slot != 0);
1277 		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1278 		if (!eb_rewin) {
1279 			btrfs_tree_read_unlock_blocking(eb);
1280 			free_extent_buffer(eb);
1281 			return NULL;
1282 		}
1283 		btrfs_set_header_bytenr(eb_rewin, eb->start);
1284 		btrfs_set_header_backref_rev(eb_rewin,
1285 					     btrfs_header_backref_rev(eb));
1286 		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1287 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1288 	} else {
1289 		eb_rewin = btrfs_clone_extent_buffer(eb);
1290 		if (!eb_rewin) {
1291 			btrfs_tree_read_unlock_blocking(eb);
1292 			free_extent_buffer(eb);
1293 			return NULL;
1294 		}
1295 	}
1296 
1297 	btrfs_tree_read_unlock_blocking(eb);
1298 	free_extent_buffer(eb);
1299 
1300 	btrfs_tree_read_lock(eb_rewin);
1301 	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1302 	WARN_ON(btrfs_header_nritems(eb_rewin) >
1303 		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1304 
1305 	return eb_rewin;
1306 }
1307 
1308 /*
1309  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1310  * value. If there are no changes, the current root->root_node is returned. If
1311  * anything changed in between, there's a fresh buffer allocated on which the
1312  * rewind operations are done. In any case, the returned buffer is read locked.
1313  * Returns NULL on error (with no locks held).
1314  */
1315 static inline struct extent_buffer *
1316 get_old_root(struct btrfs_root *root, u64 time_seq)
1317 {
1318 	struct btrfs_fs_info *fs_info = root->fs_info;
1319 	struct tree_mod_elem *tm;
1320 	struct extent_buffer *eb = NULL;
1321 	struct extent_buffer *eb_root;
1322 	u64 eb_root_owner = 0;
1323 	struct extent_buffer *old;
1324 	struct tree_mod_root *old_root = NULL;
1325 	u64 old_generation = 0;
1326 	u64 logical;
1327 	int level;
1328 
1329 	eb_root = btrfs_read_lock_root_node(root);
1330 	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1331 	if (!tm)
1332 		return eb_root;
1333 
1334 	if (tm->op == MOD_LOG_ROOT_REPLACE) {
1335 		old_root = &tm->old_root;
1336 		old_generation = tm->generation;
1337 		logical = old_root->logical;
1338 		level = old_root->level;
1339 	} else {
1340 		logical = eb_root->start;
1341 		level = btrfs_header_level(eb_root);
1342 	}
1343 
1344 	tm = tree_mod_log_search(fs_info, logical, time_seq);
1345 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1346 		btrfs_tree_read_unlock(eb_root);
1347 		free_extent_buffer(eb_root);
1348 		old = read_tree_block(fs_info, logical, 0, level, NULL);
1349 		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1350 			if (!IS_ERR(old))
1351 				free_extent_buffer(old);
1352 			btrfs_warn(fs_info,
1353 				   "failed to read tree block %llu from get_old_root",
1354 				   logical);
1355 		} else {
1356 			eb = btrfs_clone_extent_buffer(old);
1357 			free_extent_buffer(old);
1358 		}
1359 	} else if (old_root) {
1360 		eb_root_owner = btrfs_header_owner(eb_root);
1361 		btrfs_tree_read_unlock(eb_root);
1362 		free_extent_buffer(eb_root);
1363 		eb = alloc_dummy_extent_buffer(fs_info, logical);
1364 	} else {
1365 		btrfs_set_lock_blocking_read(eb_root);
1366 		eb = btrfs_clone_extent_buffer(eb_root);
1367 		btrfs_tree_read_unlock_blocking(eb_root);
1368 		free_extent_buffer(eb_root);
1369 	}
1370 
1371 	if (!eb)
1372 		return NULL;
1373 	btrfs_tree_read_lock(eb);
1374 	if (old_root) {
1375 		btrfs_set_header_bytenr(eb, eb->start);
1376 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1377 		btrfs_set_header_owner(eb, eb_root_owner);
1378 		btrfs_set_header_level(eb, old_root->level);
1379 		btrfs_set_header_generation(eb, old_generation);
1380 	}
1381 	if (tm)
1382 		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1383 	else
1384 		WARN_ON(btrfs_header_level(eb) != 0);
1385 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1386 
1387 	return eb;
1388 }
1389 
1390 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1391 {
1392 	struct tree_mod_elem *tm;
1393 	int level;
1394 	struct extent_buffer *eb_root = btrfs_root_node(root);
1395 
1396 	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1397 	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1398 		level = tm->old_root.level;
1399 	} else {
1400 		level = btrfs_header_level(eb_root);
1401 	}
1402 	free_extent_buffer(eb_root);
1403 
1404 	return level;
1405 }
1406 
1407 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1408 				   struct btrfs_root *root,
1409 				   struct extent_buffer *buf)
1410 {
1411 	if (btrfs_is_testing(root->fs_info))
1412 		return 0;
1413 
1414 	/* Ensure we can see the FORCE_COW bit */
1415 	smp_mb__before_atomic();
1416 
1417 	/*
1418 	 * We do not need to cow a block if
1419 	 * 1) this block is not created or changed in this transaction;
1420 	 * 2) this block does not belong to TREE_RELOC tree;
1421 	 * 3) the root is not forced COW.
1422 	 *
1423 	 * What is forced COW:
1424 	 *    when we create snapshot during committing the transaction,
1425 	 *    after we've finished copying src root, we must COW the shared
1426 	 *    block to ensure the metadata consistency.
1427 	 */
1428 	if (btrfs_header_generation(buf) == trans->transid &&
1429 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1430 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1431 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1432 	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1433 		return 0;
1434 	return 1;
1435 }
1436 
1437 /*
1438  * cows a single block, see __btrfs_cow_block for the real work.
1439  * This version of it has extra checks so that a block isn't COWed more than
1440  * once per transaction, as long as it hasn't been written yet
1441  */
1442 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1443 		    struct btrfs_root *root, struct extent_buffer *buf,
1444 		    struct extent_buffer *parent, int parent_slot,
1445 		    struct extent_buffer **cow_ret)
1446 {
1447 	struct btrfs_fs_info *fs_info = root->fs_info;
1448 	u64 search_start;
1449 	int ret;
1450 
1451 	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1452 		btrfs_err(fs_info,
1453 			"COW'ing blocks on a fs root that's being dropped");
1454 
1455 	if (trans->transaction != fs_info->running_transaction)
1456 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1457 		       trans->transid,
1458 		       fs_info->running_transaction->transid);
1459 
1460 	if (trans->transid != fs_info->generation)
1461 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1462 		       trans->transid, fs_info->generation);
1463 
1464 	if (!should_cow_block(trans, root, buf)) {
1465 		trans->dirty = true;
1466 		*cow_ret = buf;
1467 		return 0;
1468 	}
1469 
1470 	search_start = buf->start & ~((u64)SZ_1G - 1);
1471 
1472 	if (parent)
1473 		btrfs_set_lock_blocking_write(parent);
1474 	btrfs_set_lock_blocking_write(buf);
1475 
1476 	/*
1477 	 * Before CoWing this block for later modification, check if it's
1478 	 * the subtree root and do the delayed subtree trace if needed.
1479 	 *
1480 	 * Also We don't care about the error, as it's handled internally.
1481 	 */
1482 	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1483 	ret = __btrfs_cow_block(trans, root, buf, parent,
1484 				 parent_slot, cow_ret, search_start, 0);
1485 
1486 	trace_btrfs_cow_block(root, buf, *cow_ret);
1487 
1488 	return ret;
1489 }
1490 
1491 /*
1492  * helper function for defrag to decide if two blocks pointed to by a
1493  * node are actually close by
1494  */
1495 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1496 {
1497 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1498 		return 1;
1499 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1500 		return 1;
1501 	return 0;
1502 }
1503 
1504 /*
1505  * compare two keys in a memcmp fashion
1506  */
1507 static int comp_keys(const struct btrfs_disk_key *disk,
1508 		     const struct btrfs_key *k2)
1509 {
1510 	struct btrfs_key k1;
1511 
1512 	btrfs_disk_key_to_cpu(&k1, disk);
1513 
1514 	return btrfs_comp_cpu_keys(&k1, k2);
1515 }
1516 
1517 /*
1518  * same as comp_keys only with two btrfs_key's
1519  */
1520 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1521 {
1522 	if (k1->objectid > k2->objectid)
1523 		return 1;
1524 	if (k1->objectid < k2->objectid)
1525 		return -1;
1526 	if (k1->type > k2->type)
1527 		return 1;
1528 	if (k1->type < k2->type)
1529 		return -1;
1530 	if (k1->offset > k2->offset)
1531 		return 1;
1532 	if (k1->offset < k2->offset)
1533 		return -1;
1534 	return 0;
1535 }
1536 
1537 /*
1538  * this is used by the defrag code to go through all the
1539  * leaves pointed to by a node and reallocate them so that
1540  * disk order is close to key order
1541  */
1542 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1543 		       struct btrfs_root *root, struct extent_buffer *parent,
1544 		       int start_slot, u64 *last_ret,
1545 		       struct btrfs_key *progress)
1546 {
1547 	struct btrfs_fs_info *fs_info = root->fs_info;
1548 	struct extent_buffer *cur;
1549 	u64 blocknr;
1550 	u64 gen;
1551 	u64 search_start = *last_ret;
1552 	u64 last_block = 0;
1553 	u64 other;
1554 	u32 parent_nritems;
1555 	int end_slot;
1556 	int i;
1557 	int err = 0;
1558 	int parent_level;
1559 	int uptodate;
1560 	u32 blocksize;
1561 	int progress_passed = 0;
1562 	struct btrfs_disk_key disk_key;
1563 
1564 	parent_level = btrfs_header_level(parent);
1565 
1566 	WARN_ON(trans->transaction != fs_info->running_transaction);
1567 	WARN_ON(trans->transid != fs_info->generation);
1568 
1569 	parent_nritems = btrfs_header_nritems(parent);
1570 	blocksize = fs_info->nodesize;
1571 	end_slot = parent_nritems - 1;
1572 
1573 	if (parent_nritems <= 1)
1574 		return 0;
1575 
1576 	btrfs_set_lock_blocking_write(parent);
1577 
1578 	for (i = start_slot; i <= end_slot; i++) {
1579 		struct btrfs_key first_key;
1580 		int close = 1;
1581 
1582 		btrfs_node_key(parent, &disk_key, i);
1583 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1584 			continue;
1585 
1586 		progress_passed = 1;
1587 		blocknr = btrfs_node_blockptr(parent, i);
1588 		gen = btrfs_node_ptr_generation(parent, i);
1589 		btrfs_node_key_to_cpu(parent, &first_key, i);
1590 		if (last_block == 0)
1591 			last_block = blocknr;
1592 
1593 		if (i > 0) {
1594 			other = btrfs_node_blockptr(parent, i - 1);
1595 			close = close_blocks(blocknr, other, blocksize);
1596 		}
1597 		if (!close && i < end_slot) {
1598 			other = btrfs_node_blockptr(parent, i + 1);
1599 			close = close_blocks(blocknr, other, blocksize);
1600 		}
1601 		if (close) {
1602 			last_block = blocknr;
1603 			continue;
1604 		}
1605 
1606 		cur = find_extent_buffer(fs_info, blocknr);
1607 		if (cur)
1608 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1609 		else
1610 			uptodate = 0;
1611 		if (!cur || !uptodate) {
1612 			if (!cur) {
1613 				cur = read_tree_block(fs_info, blocknr, gen,
1614 						      parent_level - 1,
1615 						      &first_key);
1616 				if (IS_ERR(cur)) {
1617 					return PTR_ERR(cur);
1618 				} else if (!extent_buffer_uptodate(cur)) {
1619 					free_extent_buffer(cur);
1620 					return -EIO;
1621 				}
1622 			} else if (!uptodate) {
1623 				err = btrfs_read_buffer(cur, gen,
1624 						parent_level - 1,&first_key);
1625 				if (err) {
1626 					free_extent_buffer(cur);
1627 					return err;
1628 				}
1629 			}
1630 		}
1631 		if (search_start == 0)
1632 			search_start = last_block;
1633 
1634 		btrfs_tree_lock(cur);
1635 		btrfs_set_lock_blocking_write(cur);
1636 		err = __btrfs_cow_block(trans, root, cur, parent, i,
1637 					&cur, search_start,
1638 					min(16 * blocksize,
1639 					    (end_slot - i) * blocksize));
1640 		if (err) {
1641 			btrfs_tree_unlock(cur);
1642 			free_extent_buffer(cur);
1643 			break;
1644 		}
1645 		search_start = cur->start;
1646 		last_block = cur->start;
1647 		*last_ret = search_start;
1648 		btrfs_tree_unlock(cur);
1649 		free_extent_buffer(cur);
1650 	}
1651 	return err;
1652 }
1653 
1654 /*
1655  * search for key in the extent_buffer.  The items start at offset p,
1656  * and they are item_size apart.  There are 'max' items in p.
1657  *
1658  * the slot in the array is returned via slot, and it points to
1659  * the place where you would insert key if it is not found in
1660  * the array.
1661  *
1662  * slot may point to max if the key is bigger than all of the keys
1663  */
1664 static noinline int generic_bin_search(struct extent_buffer *eb,
1665 				       unsigned long p, int item_size,
1666 				       const struct btrfs_key *key,
1667 				       int max, int *slot)
1668 {
1669 	int low = 0;
1670 	int high = max;
1671 	int ret;
1672 	const int key_size = sizeof(struct btrfs_disk_key);
1673 
1674 	if (low > high) {
1675 		btrfs_err(eb->fs_info,
1676 		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1677 			  __func__, low, high, eb->start,
1678 			  btrfs_header_owner(eb), btrfs_header_level(eb));
1679 		return -EINVAL;
1680 	}
1681 
1682 	while (low < high) {
1683 		unsigned long oip;
1684 		unsigned long offset;
1685 		struct btrfs_disk_key *tmp;
1686 		struct btrfs_disk_key unaligned;
1687 		int mid;
1688 
1689 		mid = (low + high) / 2;
1690 		offset = p + mid * item_size;
1691 		oip = offset_in_page(offset);
1692 
1693 		if (oip + key_size <= PAGE_SIZE) {
1694 			const unsigned long idx = offset >> PAGE_SHIFT;
1695 			char *kaddr = page_address(eb->pages[idx]);
1696 
1697 			tmp = (struct btrfs_disk_key *)(kaddr + oip);
1698 		} else {
1699 			read_extent_buffer(eb, &unaligned, offset, key_size);
1700 			tmp = &unaligned;
1701 		}
1702 
1703 		ret = comp_keys(tmp, key);
1704 
1705 		if (ret < 0)
1706 			low = mid + 1;
1707 		else if (ret > 0)
1708 			high = mid;
1709 		else {
1710 			*slot = mid;
1711 			return 0;
1712 		}
1713 	}
1714 	*slot = low;
1715 	return 1;
1716 }
1717 
1718 /*
1719  * simple bin_search frontend that does the right thing for
1720  * leaves vs nodes
1721  */
1722 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1723 		     int *slot)
1724 {
1725 	if (btrfs_header_level(eb) == 0)
1726 		return generic_bin_search(eb,
1727 					  offsetof(struct btrfs_leaf, items),
1728 					  sizeof(struct btrfs_item),
1729 					  key, btrfs_header_nritems(eb),
1730 					  slot);
1731 	else
1732 		return generic_bin_search(eb,
1733 					  offsetof(struct btrfs_node, ptrs),
1734 					  sizeof(struct btrfs_key_ptr),
1735 					  key, btrfs_header_nritems(eb),
1736 					  slot);
1737 }
1738 
1739 static void root_add_used(struct btrfs_root *root, u32 size)
1740 {
1741 	spin_lock(&root->accounting_lock);
1742 	btrfs_set_root_used(&root->root_item,
1743 			    btrfs_root_used(&root->root_item) + size);
1744 	spin_unlock(&root->accounting_lock);
1745 }
1746 
1747 static void root_sub_used(struct btrfs_root *root, u32 size)
1748 {
1749 	spin_lock(&root->accounting_lock);
1750 	btrfs_set_root_used(&root->root_item,
1751 			    btrfs_root_used(&root->root_item) - size);
1752 	spin_unlock(&root->accounting_lock);
1753 }
1754 
1755 /* given a node and slot number, this reads the blocks it points to.  The
1756  * extent buffer is returned with a reference taken (but unlocked).
1757  */
1758 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1759 					   int slot)
1760 {
1761 	int level = btrfs_header_level(parent);
1762 	struct extent_buffer *eb;
1763 	struct btrfs_key first_key;
1764 
1765 	if (slot < 0 || slot >= btrfs_header_nritems(parent))
1766 		return ERR_PTR(-ENOENT);
1767 
1768 	BUG_ON(level == 0);
1769 
1770 	btrfs_node_key_to_cpu(parent, &first_key, slot);
1771 	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1772 			     btrfs_node_ptr_generation(parent, slot),
1773 			     level - 1, &first_key);
1774 	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1775 		free_extent_buffer(eb);
1776 		eb = ERR_PTR(-EIO);
1777 	}
1778 
1779 	return eb;
1780 }
1781 
1782 /*
1783  * node level balancing, used to make sure nodes are in proper order for
1784  * item deletion.  We balance from the top down, so we have to make sure
1785  * that a deletion won't leave an node completely empty later on.
1786  */
1787 static noinline int balance_level(struct btrfs_trans_handle *trans,
1788 			 struct btrfs_root *root,
1789 			 struct btrfs_path *path, int level)
1790 {
1791 	struct btrfs_fs_info *fs_info = root->fs_info;
1792 	struct extent_buffer *right = NULL;
1793 	struct extent_buffer *mid;
1794 	struct extent_buffer *left = NULL;
1795 	struct extent_buffer *parent = NULL;
1796 	int ret = 0;
1797 	int wret;
1798 	int pslot;
1799 	int orig_slot = path->slots[level];
1800 	u64 orig_ptr;
1801 
1802 	ASSERT(level > 0);
1803 
1804 	mid = path->nodes[level];
1805 
1806 	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1807 		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1808 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1809 
1810 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1811 
1812 	if (level < BTRFS_MAX_LEVEL - 1) {
1813 		parent = path->nodes[level + 1];
1814 		pslot = path->slots[level + 1];
1815 	}
1816 
1817 	/*
1818 	 * deal with the case where there is only one pointer in the root
1819 	 * by promoting the node below to a root
1820 	 */
1821 	if (!parent) {
1822 		struct extent_buffer *child;
1823 
1824 		if (btrfs_header_nritems(mid) != 1)
1825 			return 0;
1826 
1827 		/* promote the child to a root */
1828 		child = btrfs_read_node_slot(mid, 0);
1829 		if (IS_ERR(child)) {
1830 			ret = PTR_ERR(child);
1831 			btrfs_handle_fs_error(fs_info, ret, NULL);
1832 			goto enospc;
1833 		}
1834 
1835 		btrfs_tree_lock(child);
1836 		btrfs_set_lock_blocking_write(child);
1837 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1838 		if (ret) {
1839 			btrfs_tree_unlock(child);
1840 			free_extent_buffer(child);
1841 			goto enospc;
1842 		}
1843 
1844 		ret = tree_mod_log_insert_root(root->node, child, 1);
1845 		BUG_ON(ret < 0);
1846 		rcu_assign_pointer(root->node, child);
1847 
1848 		add_root_to_dirty_list(root);
1849 		btrfs_tree_unlock(child);
1850 
1851 		path->locks[level] = 0;
1852 		path->nodes[level] = NULL;
1853 		btrfs_clean_tree_block(mid);
1854 		btrfs_tree_unlock(mid);
1855 		/* once for the path */
1856 		free_extent_buffer(mid);
1857 
1858 		root_sub_used(root, mid->len);
1859 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1860 		/* once for the root ptr */
1861 		free_extent_buffer_stale(mid);
1862 		return 0;
1863 	}
1864 	if (btrfs_header_nritems(mid) >
1865 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1866 		return 0;
1867 
1868 	left = btrfs_read_node_slot(parent, pslot - 1);
1869 	if (IS_ERR(left))
1870 		left = NULL;
1871 
1872 	if (left) {
1873 		btrfs_tree_lock(left);
1874 		btrfs_set_lock_blocking_write(left);
1875 		wret = btrfs_cow_block(trans, root, left,
1876 				       parent, pslot - 1, &left);
1877 		if (wret) {
1878 			ret = wret;
1879 			goto enospc;
1880 		}
1881 	}
1882 
1883 	right = btrfs_read_node_slot(parent, pslot + 1);
1884 	if (IS_ERR(right))
1885 		right = NULL;
1886 
1887 	if (right) {
1888 		btrfs_tree_lock(right);
1889 		btrfs_set_lock_blocking_write(right);
1890 		wret = btrfs_cow_block(trans, root, right,
1891 				       parent, pslot + 1, &right);
1892 		if (wret) {
1893 			ret = wret;
1894 			goto enospc;
1895 		}
1896 	}
1897 
1898 	/* first, try to make some room in the middle buffer */
1899 	if (left) {
1900 		orig_slot += btrfs_header_nritems(left);
1901 		wret = push_node_left(trans, left, mid, 1);
1902 		if (wret < 0)
1903 			ret = wret;
1904 	}
1905 
1906 	/*
1907 	 * then try to empty the right most buffer into the middle
1908 	 */
1909 	if (right) {
1910 		wret = push_node_left(trans, mid, right, 1);
1911 		if (wret < 0 && wret != -ENOSPC)
1912 			ret = wret;
1913 		if (btrfs_header_nritems(right) == 0) {
1914 			btrfs_clean_tree_block(right);
1915 			btrfs_tree_unlock(right);
1916 			del_ptr(root, path, level + 1, pslot + 1);
1917 			root_sub_used(root, right->len);
1918 			btrfs_free_tree_block(trans, root, right, 0, 1);
1919 			free_extent_buffer_stale(right);
1920 			right = NULL;
1921 		} else {
1922 			struct btrfs_disk_key right_key;
1923 			btrfs_node_key(right, &right_key, 0);
1924 			ret = tree_mod_log_insert_key(parent, pslot + 1,
1925 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1926 			BUG_ON(ret < 0);
1927 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1928 			btrfs_mark_buffer_dirty(parent);
1929 		}
1930 	}
1931 	if (btrfs_header_nritems(mid) == 1) {
1932 		/*
1933 		 * we're not allowed to leave a node with one item in the
1934 		 * tree during a delete.  A deletion from lower in the tree
1935 		 * could try to delete the only pointer in this node.
1936 		 * So, pull some keys from the left.
1937 		 * There has to be a left pointer at this point because
1938 		 * otherwise we would have pulled some pointers from the
1939 		 * right
1940 		 */
1941 		if (!left) {
1942 			ret = -EROFS;
1943 			btrfs_handle_fs_error(fs_info, ret, NULL);
1944 			goto enospc;
1945 		}
1946 		wret = balance_node_right(trans, mid, left);
1947 		if (wret < 0) {
1948 			ret = wret;
1949 			goto enospc;
1950 		}
1951 		if (wret == 1) {
1952 			wret = push_node_left(trans, left, mid, 1);
1953 			if (wret < 0)
1954 				ret = wret;
1955 		}
1956 		BUG_ON(wret == 1);
1957 	}
1958 	if (btrfs_header_nritems(mid) == 0) {
1959 		btrfs_clean_tree_block(mid);
1960 		btrfs_tree_unlock(mid);
1961 		del_ptr(root, path, level + 1, pslot);
1962 		root_sub_used(root, mid->len);
1963 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1964 		free_extent_buffer_stale(mid);
1965 		mid = NULL;
1966 	} else {
1967 		/* update the parent key to reflect our changes */
1968 		struct btrfs_disk_key mid_key;
1969 		btrfs_node_key(mid, &mid_key, 0);
1970 		ret = tree_mod_log_insert_key(parent, pslot,
1971 				MOD_LOG_KEY_REPLACE, GFP_NOFS);
1972 		BUG_ON(ret < 0);
1973 		btrfs_set_node_key(parent, &mid_key, pslot);
1974 		btrfs_mark_buffer_dirty(parent);
1975 	}
1976 
1977 	/* update the path */
1978 	if (left) {
1979 		if (btrfs_header_nritems(left) > orig_slot) {
1980 			atomic_inc(&left->refs);
1981 			/* left was locked after cow */
1982 			path->nodes[level] = left;
1983 			path->slots[level + 1] -= 1;
1984 			path->slots[level] = orig_slot;
1985 			if (mid) {
1986 				btrfs_tree_unlock(mid);
1987 				free_extent_buffer(mid);
1988 			}
1989 		} else {
1990 			orig_slot -= btrfs_header_nritems(left);
1991 			path->slots[level] = orig_slot;
1992 		}
1993 	}
1994 	/* double check we haven't messed things up */
1995 	if (orig_ptr !=
1996 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1997 		BUG();
1998 enospc:
1999 	if (right) {
2000 		btrfs_tree_unlock(right);
2001 		free_extent_buffer(right);
2002 	}
2003 	if (left) {
2004 		if (path->nodes[level] != left)
2005 			btrfs_tree_unlock(left);
2006 		free_extent_buffer(left);
2007 	}
2008 	return ret;
2009 }
2010 
2011 /* Node balancing for insertion.  Here we only split or push nodes around
2012  * when they are completely full.  This is also done top down, so we
2013  * have to be pessimistic.
2014  */
2015 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2016 					  struct btrfs_root *root,
2017 					  struct btrfs_path *path, int level)
2018 {
2019 	struct btrfs_fs_info *fs_info = root->fs_info;
2020 	struct extent_buffer *right = NULL;
2021 	struct extent_buffer *mid;
2022 	struct extent_buffer *left = NULL;
2023 	struct extent_buffer *parent = NULL;
2024 	int ret = 0;
2025 	int wret;
2026 	int pslot;
2027 	int orig_slot = path->slots[level];
2028 
2029 	if (level == 0)
2030 		return 1;
2031 
2032 	mid = path->nodes[level];
2033 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
2034 
2035 	if (level < BTRFS_MAX_LEVEL - 1) {
2036 		parent = path->nodes[level + 1];
2037 		pslot = path->slots[level + 1];
2038 	}
2039 
2040 	if (!parent)
2041 		return 1;
2042 
2043 	left = btrfs_read_node_slot(parent, pslot - 1);
2044 	if (IS_ERR(left))
2045 		left = NULL;
2046 
2047 	/* first, try to make some room in the middle buffer */
2048 	if (left) {
2049 		u32 left_nr;
2050 
2051 		btrfs_tree_lock(left);
2052 		btrfs_set_lock_blocking_write(left);
2053 
2054 		left_nr = btrfs_header_nritems(left);
2055 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2056 			wret = 1;
2057 		} else {
2058 			ret = btrfs_cow_block(trans, root, left, parent,
2059 					      pslot - 1, &left);
2060 			if (ret)
2061 				wret = 1;
2062 			else {
2063 				wret = push_node_left(trans, left, mid, 0);
2064 			}
2065 		}
2066 		if (wret < 0)
2067 			ret = wret;
2068 		if (wret == 0) {
2069 			struct btrfs_disk_key disk_key;
2070 			orig_slot += left_nr;
2071 			btrfs_node_key(mid, &disk_key, 0);
2072 			ret = tree_mod_log_insert_key(parent, pslot,
2073 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2074 			BUG_ON(ret < 0);
2075 			btrfs_set_node_key(parent, &disk_key, pslot);
2076 			btrfs_mark_buffer_dirty(parent);
2077 			if (btrfs_header_nritems(left) > orig_slot) {
2078 				path->nodes[level] = left;
2079 				path->slots[level + 1] -= 1;
2080 				path->slots[level] = orig_slot;
2081 				btrfs_tree_unlock(mid);
2082 				free_extent_buffer(mid);
2083 			} else {
2084 				orig_slot -=
2085 					btrfs_header_nritems(left);
2086 				path->slots[level] = orig_slot;
2087 				btrfs_tree_unlock(left);
2088 				free_extent_buffer(left);
2089 			}
2090 			return 0;
2091 		}
2092 		btrfs_tree_unlock(left);
2093 		free_extent_buffer(left);
2094 	}
2095 	right = btrfs_read_node_slot(parent, pslot + 1);
2096 	if (IS_ERR(right))
2097 		right = NULL;
2098 
2099 	/*
2100 	 * then try to empty the right most buffer into the middle
2101 	 */
2102 	if (right) {
2103 		u32 right_nr;
2104 
2105 		btrfs_tree_lock(right);
2106 		btrfs_set_lock_blocking_write(right);
2107 
2108 		right_nr = btrfs_header_nritems(right);
2109 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2110 			wret = 1;
2111 		} else {
2112 			ret = btrfs_cow_block(trans, root, right,
2113 					      parent, pslot + 1,
2114 					      &right);
2115 			if (ret)
2116 				wret = 1;
2117 			else {
2118 				wret = balance_node_right(trans, right, mid);
2119 			}
2120 		}
2121 		if (wret < 0)
2122 			ret = wret;
2123 		if (wret == 0) {
2124 			struct btrfs_disk_key disk_key;
2125 
2126 			btrfs_node_key(right, &disk_key, 0);
2127 			ret = tree_mod_log_insert_key(parent, pslot + 1,
2128 					MOD_LOG_KEY_REPLACE, GFP_NOFS);
2129 			BUG_ON(ret < 0);
2130 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
2131 			btrfs_mark_buffer_dirty(parent);
2132 
2133 			if (btrfs_header_nritems(mid) <= orig_slot) {
2134 				path->nodes[level] = right;
2135 				path->slots[level + 1] += 1;
2136 				path->slots[level] = orig_slot -
2137 					btrfs_header_nritems(mid);
2138 				btrfs_tree_unlock(mid);
2139 				free_extent_buffer(mid);
2140 			} else {
2141 				btrfs_tree_unlock(right);
2142 				free_extent_buffer(right);
2143 			}
2144 			return 0;
2145 		}
2146 		btrfs_tree_unlock(right);
2147 		free_extent_buffer(right);
2148 	}
2149 	return 1;
2150 }
2151 
2152 /*
2153  * readahead one full node of leaves, finding things that are close
2154  * to the block in 'slot', and triggering ra on them.
2155  */
2156 static void reada_for_search(struct btrfs_fs_info *fs_info,
2157 			     struct btrfs_path *path,
2158 			     int level, int slot, u64 objectid)
2159 {
2160 	struct extent_buffer *node;
2161 	struct btrfs_disk_key disk_key;
2162 	u32 nritems;
2163 	u64 search;
2164 	u64 target;
2165 	u64 nread = 0;
2166 	struct extent_buffer *eb;
2167 	u32 nr;
2168 	u32 blocksize;
2169 	u32 nscan = 0;
2170 
2171 	if (level != 1)
2172 		return;
2173 
2174 	if (!path->nodes[level])
2175 		return;
2176 
2177 	node = path->nodes[level];
2178 
2179 	search = btrfs_node_blockptr(node, slot);
2180 	blocksize = fs_info->nodesize;
2181 	eb = find_extent_buffer(fs_info, search);
2182 	if (eb) {
2183 		free_extent_buffer(eb);
2184 		return;
2185 	}
2186 
2187 	target = search;
2188 
2189 	nritems = btrfs_header_nritems(node);
2190 	nr = slot;
2191 
2192 	while (1) {
2193 		if (path->reada == READA_BACK) {
2194 			if (nr == 0)
2195 				break;
2196 			nr--;
2197 		} else if (path->reada == READA_FORWARD) {
2198 			nr++;
2199 			if (nr >= nritems)
2200 				break;
2201 		}
2202 		if (path->reada == READA_BACK && objectid) {
2203 			btrfs_node_key(node, &disk_key, nr);
2204 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
2205 				break;
2206 		}
2207 		search = btrfs_node_blockptr(node, nr);
2208 		if ((search <= target && target - search <= 65536) ||
2209 		    (search > target && search - target <= 65536)) {
2210 			readahead_tree_block(fs_info, search);
2211 			nread += blocksize;
2212 		}
2213 		nscan++;
2214 		if ((nread > 65536 || nscan > 32))
2215 			break;
2216 	}
2217 }
2218 
2219 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2220 				       struct btrfs_path *path, int level)
2221 {
2222 	int slot;
2223 	int nritems;
2224 	struct extent_buffer *parent;
2225 	struct extent_buffer *eb;
2226 	u64 gen;
2227 	u64 block1 = 0;
2228 	u64 block2 = 0;
2229 
2230 	parent = path->nodes[level + 1];
2231 	if (!parent)
2232 		return;
2233 
2234 	nritems = btrfs_header_nritems(parent);
2235 	slot = path->slots[level + 1];
2236 
2237 	if (slot > 0) {
2238 		block1 = btrfs_node_blockptr(parent, slot - 1);
2239 		gen = btrfs_node_ptr_generation(parent, slot - 1);
2240 		eb = find_extent_buffer(fs_info, block1);
2241 		/*
2242 		 * if we get -eagain from btrfs_buffer_uptodate, we
2243 		 * don't want to return eagain here.  That will loop
2244 		 * forever
2245 		 */
2246 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2247 			block1 = 0;
2248 		free_extent_buffer(eb);
2249 	}
2250 	if (slot + 1 < nritems) {
2251 		block2 = btrfs_node_blockptr(parent, slot + 1);
2252 		gen = btrfs_node_ptr_generation(parent, slot + 1);
2253 		eb = find_extent_buffer(fs_info, block2);
2254 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2255 			block2 = 0;
2256 		free_extent_buffer(eb);
2257 	}
2258 
2259 	if (block1)
2260 		readahead_tree_block(fs_info, block1);
2261 	if (block2)
2262 		readahead_tree_block(fs_info, block2);
2263 }
2264 
2265 
2266 /*
2267  * when we walk down the tree, it is usually safe to unlock the higher layers
2268  * in the tree.  The exceptions are when our path goes through slot 0, because
2269  * operations on the tree might require changing key pointers higher up in the
2270  * tree.
2271  *
2272  * callers might also have set path->keep_locks, which tells this code to keep
2273  * the lock if the path points to the last slot in the block.  This is part of
2274  * walking through the tree, and selecting the next slot in the higher block.
2275  *
2276  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2277  * if lowest_unlock is 1, level 0 won't be unlocked
2278  */
2279 static noinline void unlock_up(struct btrfs_path *path, int level,
2280 			       int lowest_unlock, int min_write_lock_level,
2281 			       int *write_lock_level)
2282 {
2283 	int i;
2284 	int skip_level = level;
2285 	int no_skips = 0;
2286 	struct extent_buffer *t;
2287 
2288 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2289 		if (!path->nodes[i])
2290 			break;
2291 		if (!path->locks[i])
2292 			break;
2293 		if (!no_skips && path->slots[i] == 0) {
2294 			skip_level = i + 1;
2295 			continue;
2296 		}
2297 		if (!no_skips && path->keep_locks) {
2298 			u32 nritems;
2299 			t = path->nodes[i];
2300 			nritems = btrfs_header_nritems(t);
2301 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
2302 				skip_level = i + 1;
2303 				continue;
2304 			}
2305 		}
2306 		if (skip_level < i && i >= lowest_unlock)
2307 			no_skips = 1;
2308 
2309 		t = path->nodes[i];
2310 		if (i >= lowest_unlock && i > skip_level) {
2311 			btrfs_tree_unlock_rw(t, path->locks[i]);
2312 			path->locks[i] = 0;
2313 			if (write_lock_level &&
2314 			    i > min_write_lock_level &&
2315 			    i <= *write_lock_level) {
2316 				*write_lock_level = i - 1;
2317 			}
2318 		}
2319 	}
2320 }
2321 
2322 /*
2323  * helper function for btrfs_search_slot.  The goal is to find a block
2324  * in cache without setting the path to blocking.  If we find the block
2325  * we return zero and the path is unchanged.
2326  *
2327  * If we can't find the block, we set the path blocking and do some
2328  * reada.  -EAGAIN is returned and the search must be repeated.
2329  */
2330 static int
2331 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2332 		      struct extent_buffer **eb_ret, int level, int slot,
2333 		      const struct btrfs_key *key)
2334 {
2335 	struct btrfs_fs_info *fs_info = root->fs_info;
2336 	u64 blocknr;
2337 	u64 gen;
2338 	struct extent_buffer *tmp;
2339 	struct btrfs_key first_key;
2340 	int ret;
2341 	int parent_level;
2342 
2343 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
2344 	gen = btrfs_node_ptr_generation(*eb_ret, slot);
2345 	parent_level = btrfs_header_level(*eb_ret);
2346 	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2347 
2348 	tmp = find_extent_buffer(fs_info, blocknr);
2349 	if (tmp) {
2350 		/* first we do an atomic uptodate check */
2351 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2352 			/*
2353 			 * Do extra check for first_key, eb can be stale due to
2354 			 * being cached, read from scrub, or have multiple
2355 			 * parents (shared tree blocks).
2356 			 */
2357 			if (btrfs_verify_level_key(tmp,
2358 					parent_level - 1, &first_key, gen)) {
2359 				free_extent_buffer(tmp);
2360 				return -EUCLEAN;
2361 			}
2362 			*eb_ret = tmp;
2363 			return 0;
2364 		}
2365 
2366 		/* the pages were up to date, but we failed
2367 		 * the generation number check.  Do a full
2368 		 * read for the generation number that is correct.
2369 		 * We must do this without dropping locks so
2370 		 * we can trust our generation number
2371 		 */
2372 		btrfs_set_path_blocking(p);
2373 
2374 		/* now we're allowed to do a blocking uptodate check */
2375 		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2376 		if (!ret) {
2377 			*eb_ret = tmp;
2378 			return 0;
2379 		}
2380 		free_extent_buffer(tmp);
2381 		btrfs_release_path(p);
2382 		return -EIO;
2383 	}
2384 
2385 	/*
2386 	 * reduce lock contention at high levels
2387 	 * of the btree by dropping locks before
2388 	 * we read.  Don't release the lock on the current
2389 	 * level because we need to walk this node to figure
2390 	 * out which blocks to read.
2391 	 */
2392 	btrfs_unlock_up_safe(p, level + 1);
2393 	btrfs_set_path_blocking(p);
2394 
2395 	if (p->reada != READA_NONE)
2396 		reada_for_search(fs_info, p, level, slot, key->objectid);
2397 
2398 	ret = -EAGAIN;
2399 	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2400 			      &first_key);
2401 	if (!IS_ERR(tmp)) {
2402 		/*
2403 		 * If the read above didn't mark this buffer up to date,
2404 		 * it will never end up being up to date.  Set ret to EIO now
2405 		 * and give up so that our caller doesn't loop forever
2406 		 * on our EAGAINs.
2407 		 */
2408 		if (!extent_buffer_uptodate(tmp))
2409 			ret = -EIO;
2410 		free_extent_buffer(tmp);
2411 	} else {
2412 		ret = PTR_ERR(tmp);
2413 	}
2414 
2415 	btrfs_release_path(p);
2416 	return ret;
2417 }
2418 
2419 /*
2420  * helper function for btrfs_search_slot.  This does all of the checks
2421  * for node-level blocks and does any balancing required based on
2422  * the ins_len.
2423  *
2424  * If no extra work was required, zero is returned.  If we had to
2425  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2426  * start over
2427  */
2428 static int
2429 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2430 		       struct btrfs_root *root, struct btrfs_path *p,
2431 		       struct extent_buffer *b, int level, int ins_len,
2432 		       int *write_lock_level)
2433 {
2434 	struct btrfs_fs_info *fs_info = root->fs_info;
2435 	int ret;
2436 
2437 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2438 	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2439 		int sret;
2440 
2441 		if (*write_lock_level < level + 1) {
2442 			*write_lock_level = level + 1;
2443 			btrfs_release_path(p);
2444 			goto again;
2445 		}
2446 
2447 		btrfs_set_path_blocking(p);
2448 		reada_for_balance(fs_info, p, level);
2449 		sret = split_node(trans, root, p, level);
2450 
2451 		BUG_ON(sret > 0);
2452 		if (sret) {
2453 			ret = sret;
2454 			goto done;
2455 		}
2456 		b = p->nodes[level];
2457 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
2458 		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2459 		int sret;
2460 
2461 		if (*write_lock_level < level + 1) {
2462 			*write_lock_level = level + 1;
2463 			btrfs_release_path(p);
2464 			goto again;
2465 		}
2466 
2467 		btrfs_set_path_blocking(p);
2468 		reada_for_balance(fs_info, p, level);
2469 		sret = balance_level(trans, root, p, level);
2470 
2471 		if (sret) {
2472 			ret = sret;
2473 			goto done;
2474 		}
2475 		b = p->nodes[level];
2476 		if (!b) {
2477 			btrfs_release_path(p);
2478 			goto again;
2479 		}
2480 		BUG_ON(btrfs_header_nritems(b) == 1);
2481 	}
2482 	return 0;
2483 
2484 again:
2485 	ret = -EAGAIN;
2486 done:
2487 	return ret;
2488 }
2489 
2490 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2491 		u64 iobjectid, u64 ioff, u8 key_type,
2492 		struct btrfs_key *found_key)
2493 {
2494 	int ret;
2495 	struct btrfs_key key;
2496 	struct extent_buffer *eb;
2497 
2498 	ASSERT(path);
2499 	ASSERT(found_key);
2500 
2501 	key.type = key_type;
2502 	key.objectid = iobjectid;
2503 	key.offset = ioff;
2504 
2505 	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2506 	if (ret < 0)
2507 		return ret;
2508 
2509 	eb = path->nodes[0];
2510 	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2511 		ret = btrfs_next_leaf(fs_root, path);
2512 		if (ret)
2513 			return ret;
2514 		eb = path->nodes[0];
2515 	}
2516 
2517 	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2518 	if (found_key->type != key.type ||
2519 			found_key->objectid != key.objectid)
2520 		return 1;
2521 
2522 	return 0;
2523 }
2524 
2525 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2526 							struct btrfs_path *p,
2527 							int write_lock_level)
2528 {
2529 	struct btrfs_fs_info *fs_info = root->fs_info;
2530 	struct extent_buffer *b;
2531 	int root_lock;
2532 	int level = 0;
2533 
2534 	/* We try very hard to do read locks on the root */
2535 	root_lock = BTRFS_READ_LOCK;
2536 
2537 	if (p->search_commit_root) {
2538 		/*
2539 		 * The commit roots are read only so we always do read locks,
2540 		 * and we always must hold the commit_root_sem when doing
2541 		 * searches on them, the only exception is send where we don't
2542 		 * want to block transaction commits for a long time, so
2543 		 * we need to clone the commit root in order to avoid races
2544 		 * with transaction commits that create a snapshot of one of
2545 		 * the roots used by a send operation.
2546 		 */
2547 		if (p->need_commit_sem) {
2548 			down_read(&fs_info->commit_root_sem);
2549 			b = btrfs_clone_extent_buffer(root->commit_root);
2550 			up_read(&fs_info->commit_root_sem);
2551 			if (!b)
2552 				return ERR_PTR(-ENOMEM);
2553 
2554 		} else {
2555 			b = root->commit_root;
2556 			atomic_inc(&b->refs);
2557 		}
2558 		level = btrfs_header_level(b);
2559 		/*
2560 		 * Ensure that all callers have set skip_locking when
2561 		 * p->search_commit_root = 1.
2562 		 */
2563 		ASSERT(p->skip_locking == 1);
2564 
2565 		goto out;
2566 	}
2567 
2568 	if (p->skip_locking) {
2569 		b = btrfs_root_node(root);
2570 		level = btrfs_header_level(b);
2571 		goto out;
2572 	}
2573 
2574 	/*
2575 	 * If the level is set to maximum, we can skip trying to get the read
2576 	 * lock.
2577 	 */
2578 	if (write_lock_level < BTRFS_MAX_LEVEL) {
2579 		/*
2580 		 * We don't know the level of the root node until we actually
2581 		 * have it read locked
2582 		 */
2583 		b = btrfs_read_lock_root_node(root);
2584 		level = btrfs_header_level(b);
2585 		if (level > write_lock_level)
2586 			goto out;
2587 
2588 		/* Whoops, must trade for write lock */
2589 		btrfs_tree_read_unlock(b);
2590 		free_extent_buffer(b);
2591 	}
2592 
2593 	b = btrfs_lock_root_node(root);
2594 	root_lock = BTRFS_WRITE_LOCK;
2595 
2596 	/* The level might have changed, check again */
2597 	level = btrfs_header_level(b);
2598 
2599 out:
2600 	p->nodes[level] = b;
2601 	if (!p->skip_locking)
2602 		p->locks[level] = root_lock;
2603 	/*
2604 	 * Callers are responsible for dropping b's references.
2605 	 */
2606 	return b;
2607 }
2608 
2609 
2610 /*
2611  * btrfs_search_slot - look for a key in a tree and perform necessary
2612  * modifications to preserve tree invariants.
2613  *
2614  * @trans:	Handle of transaction, used when modifying the tree
2615  * @p:		Holds all btree nodes along the search path
2616  * @root:	The root node of the tree
2617  * @key:	The key we are looking for
2618  * @ins_len:	Indicates purpose of search, for inserts it is 1, for
2619  *		deletions it's -1. 0 for plain searches
2620  * @cow:	boolean should CoW operations be performed. Must always be 1
2621  *		when modifying the tree.
2622  *
2623  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2624  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2625  *
2626  * If @key is found, 0 is returned and you can find the item in the leaf level
2627  * of the path (level 0)
2628  *
2629  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2630  * points to the slot where it should be inserted
2631  *
2632  * If an error is encountered while searching the tree a negative error number
2633  * is returned
2634  */
2635 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2636 		      const struct btrfs_key *key, struct btrfs_path *p,
2637 		      int ins_len, int cow)
2638 {
2639 	struct extent_buffer *b;
2640 	int slot;
2641 	int ret;
2642 	int err;
2643 	int level;
2644 	int lowest_unlock = 1;
2645 	/* everything at write_lock_level or lower must be write locked */
2646 	int write_lock_level = 0;
2647 	u8 lowest_level = 0;
2648 	int min_write_lock_level;
2649 	int prev_cmp;
2650 
2651 	lowest_level = p->lowest_level;
2652 	WARN_ON(lowest_level && ins_len > 0);
2653 	WARN_ON(p->nodes[0] != NULL);
2654 	BUG_ON(!cow && ins_len);
2655 
2656 	if (ins_len < 0) {
2657 		lowest_unlock = 2;
2658 
2659 		/* when we are removing items, we might have to go up to level
2660 		 * two as we update tree pointers  Make sure we keep write
2661 		 * for those levels as well
2662 		 */
2663 		write_lock_level = 2;
2664 	} else if (ins_len > 0) {
2665 		/*
2666 		 * for inserting items, make sure we have a write lock on
2667 		 * level 1 so we can update keys
2668 		 */
2669 		write_lock_level = 1;
2670 	}
2671 
2672 	if (!cow)
2673 		write_lock_level = -1;
2674 
2675 	if (cow && (p->keep_locks || p->lowest_level))
2676 		write_lock_level = BTRFS_MAX_LEVEL;
2677 
2678 	min_write_lock_level = write_lock_level;
2679 
2680 again:
2681 	prev_cmp = -1;
2682 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
2683 	if (IS_ERR(b)) {
2684 		ret = PTR_ERR(b);
2685 		goto done;
2686 	}
2687 
2688 	while (b) {
2689 		int dec = 0;
2690 
2691 		level = btrfs_header_level(b);
2692 
2693 		if (cow) {
2694 			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2695 
2696 			/*
2697 			 * if we don't really need to cow this block
2698 			 * then we don't want to set the path blocking,
2699 			 * so we test it here
2700 			 */
2701 			if (!should_cow_block(trans, root, b)) {
2702 				trans->dirty = true;
2703 				goto cow_done;
2704 			}
2705 
2706 			/*
2707 			 * must have write locks on this node and the
2708 			 * parent
2709 			 */
2710 			if (level > write_lock_level ||
2711 			    (level + 1 > write_lock_level &&
2712 			    level + 1 < BTRFS_MAX_LEVEL &&
2713 			    p->nodes[level + 1])) {
2714 				write_lock_level = level + 1;
2715 				btrfs_release_path(p);
2716 				goto again;
2717 			}
2718 
2719 			btrfs_set_path_blocking(p);
2720 			if (last_level)
2721 				err = btrfs_cow_block(trans, root, b, NULL, 0,
2722 						      &b);
2723 			else
2724 				err = btrfs_cow_block(trans, root, b,
2725 						      p->nodes[level + 1],
2726 						      p->slots[level + 1], &b);
2727 			if (err) {
2728 				ret = err;
2729 				goto done;
2730 			}
2731 		}
2732 cow_done:
2733 		p->nodes[level] = b;
2734 		/*
2735 		 * Leave path with blocking locks to avoid massive
2736 		 * lock context switch, this is made on purpose.
2737 		 */
2738 
2739 		/*
2740 		 * we have a lock on b and as long as we aren't changing
2741 		 * the tree, there is no way to for the items in b to change.
2742 		 * It is safe to drop the lock on our parent before we
2743 		 * go through the expensive btree search on b.
2744 		 *
2745 		 * If we're inserting or deleting (ins_len != 0), then we might
2746 		 * be changing slot zero, which may require changing the parent.
2747 		 * So, we can't drop the lock until after we know which slot
2748 		 * we're operating on.
2749 		 */
2750 		if (!ins_len && !p->keep_locks) {
2751 			int u = level + 1;
2752 
2753 			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2754 				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2755 				p->locks[u] = 0;
2756 			}
2757 		}
2758 
2759 		/*
2760 		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2761 		 * we can safely assume the target key will always be in slot 0
2762 		 * on lower levels due to the invariants BTRFS' btree provides,
2763 		 * namely that a btrfs_key_ptr entry always points to the
2764 		 * lowest key in the child node, thus we can skip searching
2765 		 * lower levels
2766 		 */
2767 		if (prev_cmp == 0) {
2768 			slot = 0;
2769 			ret = 0;
2770 		} else {
2771 			ret = btrfs_bin_search(b, key, &slot);
2772 			prev_cmp = ret;
2773 			if (ret < 0)
2774 				goto done;
2775 		}
2776 
2777 		if (level == 0) {
2778 			p->slots[level] = slot;
2779 			if (ins_len > 0 &&
2780 			    btrfs_leaf_free_space(b) < ins_len) {
2781 				if (write_lock_level < 1) {
2782 					write_lock_level = 1;
2783 					btrfs_release_path(p);
2784 					goto again;
2785 				}
2786 
2787 				btrfs_set_path_blocking(p);
2788 				err = split_leaf(trans, root, key,
2789 						 p, ins_len, ret == 0);
2790 
2791 				BUG_ON(err > 0);
2792 				if (err) {
2793 					ret = err;
2794 					goto done;
2795 				}
2796 			}
2797 			if (!p->search_for_split)
2798 				unlock_up(p, level, lowest_unlock,
2799 					  min_write_lock_level, NULL);
2800 			goto done;
2801 		}
2802 		if (ret && slot > 0) {
2803 			dec = 1;
2804 			slot--;
2805 		}
2806 		p->slots[level] = slot;
2807 		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2808 					     &write_lock_level);
2809 		if (err == -EAGAIN)
2810 			goto again;
2811 		if (err) {
2812 			ret = err;
2813 			goto done;
2814 		}
2815 		b = p->nodes[level];
2816 		slot = p->slots[level];
2817 
2818 		/*
2819 		 * Slot 0 is special, if we change the key we have to update
2820 		 * the parent pointer which means we must have a write lock on
2821 		 * the parent
2822 		 */
2823 		if (slot == 0 && ins_len && write_lock_level < level + 1) {
2824 			write_lock_level = level + 1;
2825 			btrfs_release_path(p);
2826 			goto again;
2827 		}
2828 
2829 		unlock_up(p, level, lowest_unlock, min_write_lock_level,
2830 			  &write_lock_level);
2831 
2832 		if (level == lowest_level) {
2833 			if (dec)
2834 				p->slots[level]++;
2835 			goto done;
2836 		}
2837 
2838 		err = read_block_for_search(root, p, &b, level, slot, key);
2839 		if (err == -EAGAIN)
2840 			goto again;
2841 		if (err) {
2842 			ret = err;
2843 			goto done;
2844 		}
2845 
2846 		if (!p->skip_locking) {
2847 			level = btrfs_header_level(b);
2848 			if (level <= write_lock_level) {
2849 				if (!btrfs_try_tree_write_lock(b)) {
2850 					btrfs_set_path_blocking(p);
2851 					btrfs_tree_lock(b);
2852 				}
2853 				p->locks[level] = BTRFS_WRITE_LOCK;
2854 			} else {
2855 				if (!btrfs_tree_read_lock_atomic(b)) {
2856 					btrfs_set_path_blocking(p);
2857 					btrfs_tree_read_lock(b);
2858 				}
2859 				p->locks[level] = BTRFS_READ_LOCK;
2860 			}
2861 			p->nodes[level] = b;
2862 		}
2863 	}
2864 	ret = 1;
2865 done:
2866 	/*
2867 	 * we don't really know what they plan on doing with the path
2868 	 * from here on, so for now just mark it as blocking
2869 	 */
2870 	if (!p->leave_spinning)
2871 		btrfs_set_path_blocking(p);
2872 	if (ret < 0 && !p->skip_release_on_error)
2873 		btrfs_release_path(p);
2874 	return ret;
2875 }
2876 
2877 /*
2878  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2879  * current state of the tree together with the operations recorded in the tree
2880  * modification log to search for the key in a previous version of this tree, as
2881  * denoted by the time_seq parameter.
2882  *
2883  * Naturally, there is no support for insert, delete or cow operations.
2884  *
2885  * The resulting path and return value will be set up as if we called
2886  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2887  */
2888 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2889 			  struct btrfs_path *p, u64 time_seq)
2890 {
2891 	struct btrfs_fs_info *fs_info = root->fs_info;
2892 	struct extent_buffer *b;
2893 	int slot;
2894 	int ret;
2895 	int err;
2896 	int level;
2897 	int lowest_unlock = 1;
2898 	u8 lowest_level = 0;
2899 
2900 	lowest_level = p->lowest_level;
2901 	WARN_ON(p->nodes[0] != NULL);
2902 
2903 	if (p->search_commit_root) {
2904 		BUG_ON(time_seq);
2905 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2906 	}
2907 
2908 again:
2909 	b = get_old_root(root, time_seq);
2910 	if (!b) {
2911 		ret = -EIO;
2912 		goto done;
2913 	}
2914 	level = btrfs_header_level(b);
2915 	p->locks[level] = BTRFS_READ_LOCK;
2916 
2917 	while (b) {
2918 		int dec = 0;
2919 
2920 		level = btrfs_header_level(b);
2921 		p->nodes[level] = b;
2922 
2923 		/*
2924 		 * we have a lock on b and as long as we aren't changing
2925 		 * the tree, there is no way to for the items in b to change.
2926 		 * It is safe to drop the lock on our parent before we
2927 		 * go through the expensive btree search on b.
2928 		 */
2929 		btrfs_unlock_up_safe(p, level + 1);
2930 
2931 		ret = btrfs_bin_search(b, key, &slot);
2932 		if (ret < 0)
2933 			goto done;
2934 
2935 		if (level == 0) {
2936 			p->slots[level] = slot;
2937 			unlock_up(p, level, lowest_unlock, 0, NULL);
2938 			goto done;
2939 		}
2940 
2941 		if (ret && slot > 0) {
2942 			dec = 1;
2943 			slot--;
2944 		}
2945 		p->slots[level] = slot;
2946 		unlock_up(p, level, lowest_unlock, 0, NULL);
2947 
2948 		if (level == lowest_level) {
2949 			if (dec)
2950 				p->slots[level]++;
2951 			goto done;
2952 		}
2953 
2954 		err = read_block_for_search(root, p, &b, level, slot, key);
2955 		if (err == -EAGAIN)
2956 			goto again;
2957 		if (err) {
2958 			ret = err;
2959 			goto done;
2960 		}
2961 
2962 		level = btrfs_header_level(b);
2963 		if (!btrfs_tree_read_lock_atomic(b)) {
2964 			btrfs_set_path_blocking(p);
2965 			btrfs_tree_read_lock(b);
2966 		}
2967 		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2968 		if (!b) {
2969 			ret = -ENOMEM;
2970 			goto done;
2971 		}
2972 		p->locks[level] = BTRFS_READ_LOCK;
2973 		p->nodes[level] = b;
2974 	}
2975 	ret = 1;
2976 done:
2977 	if (!p->leave_spinning)
2978 		btrfs_set_path_blocking(p);
2979 	if (ret < 0)
2980 		btrfs_release_path(p);
2981 
2982 	return ret;
2983 }
2984 
2985 /*
2986  * helper to use instead of search slot if no exact match is needed but
2987  * instead the next or previous item should be returned.
2988  * When find_higher is true, the next higher item is returned, the next lower
2989  * otherwise.
2990  * When return_any and find_higher are both true, and no higher item is found,
2991  * return the next lower instead.
2992  * When return_any is true and find_higher is false, and no lower item is found,
2993  * return the next higher instead.
2994  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2995  * < 0 on error
2996  */
2997 int btrfs_search_slot_for_read(struct btrfs_root *root,
2998 			       const struct btrfs_key *key,
2999 			       struct btrfs_path *p, int find_higher,
3000 			       int return_any)
3001 {
3002 	int ret;
3003 	struct extent_buffer *leaf;
3004 
3005 again:
3006 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3007 	if (ret <= 0)
3008 		return ret;
3009 	/*
3010 	 * a return value of 1 means the path is at the position where the
3011 	 * item should be inserted. Normally this is the next bigger item,
3012 	 * but in case the previous item is the last in a leaf, path points
3013 	 * to the first free slot in the previous leaf, i.e. at an invalid
3014 	 * item.
3015 	 */
3016 	leaf = p->nodes[0];
3017 
3018 	if (find_higher) {
3019 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3020 			ret = btrfs_next_leaf(root, p);
3021 			if (ret <= 0)
3022 				return ret;
3023 			if (!return_any)
3024 				return 1;
3025 			/*
3026 			 * no higher item found, return the next
3027 			 * lower instead
3028 			 */
3029 			return_any = 0;
3030 			find_higher = 0;
3031 			btrfs_release_path(p);
3032 			goto again;
3033 		}
3034 	} else {
3035 		if (p->slots[0] == 0) {
3036 			ret = btrfs_prev_leaf(root, p);
3037 			if (ret < 0)
3038 				return ret;
3039 			if (!ret) {
3040 				leaf = p->nodes[0];
3041 				if (p->slots[0] == btrfs_header_nritems(leaf))
3042 					p->slots[0]--;
3043 				return 0;
3044 			}
3045 			if (!return_any)
3046 				return 1;
3047 			/*
3048 			 * no lower item found, return the next
3049 			 * higher instead
3050 			 */
3051 			return_any = 0;
3052 			find_higher = 1;
3053 			btrfs_release_path(p);
3054 			goto again;
3055 		} else {
3056 			--p->slots[0];
3057 		}
3058 	}
3059 	return 0;
3060 }
3061 
3062 /*
3063  * adjust the pointers going up the tree, starting at level
3064  * making sure the right key of each node is points to 'key'.
3065  * This is used after shifting pointers to the left, so it stops
3066  * fixing up pointers when a given leaf/node is not in slot 0 of the
3067  * higher levels
3068  *
3069  */
3070 static void fixup_low_keys(struct btrfs_path *path,
3071 			   struct btrfs_disk_key *key, int level)
3072 {
3073 	int i;
3074 	struct extent_buffer *t;
3075 	int ret;
3076 
3077 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3078 		int tslot = path->slots[i];
3079 
3080 		if (!path->nodes[i])
3081 			break;
3082 		t = path->nodes[i];
3083 		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3084 				GFP_ATOMIC);
3085 		BUG_ON(ret < 0);
3086 		btrfs_set_node_key(t, key, tslot);
3087 		btrfs_mark_buffer_dirty(path->nodes[i]);
3088 		if (tslot != 0)
3089 			break;
3090 	}
3091 }
3092 
3093 /*
3094  * update item key.
3095  *
3096  * This function isn't completely safe. It's the caller's responsibility
3097  * that the new key won't break the order
3098  */
3099 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3100 			     struct btrfs_path *path,
3101 			     const struct btrfs_key *new_key)
3102 {
3103 	struct btrfs_disk_key disk_key;
3104 	struct extent_buffer *eb;
3105 	int slot;
3106 
3107 	eb = path->nodes[0];
3108 	slot = path->slots[0];
3109 	if (slot > 0) {
3110 		btrfs_item_key(eb, &disk_key, slot - 1);
3111 		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3112 			btrfs_crit(fs_info,
3113 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3114 				   slot, btrfs_disk_key_objectid(&disk_key),
3115 				   btrfs_disk_key_type(&disk_key),
3116 				   btrfs_disk_key_offset(&disk_key),
3117 				   new_key->objectid, new_key->type,
3118 				   new_key->offset);
3119 			btrfs_print_leaf(eb);
3120 			BUG();
3121 		}
3122 	}
3123 	if (slot < btrfs_header_nritems(eb) - 1) {
3124 		btrfs_item_key(eb, &disk_key, slot + 1);
3125 		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3126 			btrfs_crit(fs_info,
3127 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3128 				   slot, btrfs_disk_key_objectid(&disk_key),
3129 				   btrfs_disk_key_type(&disk_key),
3130 				   btrfs_disk_key_offset(&disk_key),
3131 				   new_key->objectid, new_key->type,
3132 				   new_key->offset);
3133 			btrfs_print_leaf(eb);
3134 			BUG();
3135 		}
3136 	}
3137 
3138 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3139 	btrfs_set_item_key(eb, &disk_key, slot);
3140 	btrfs_mark_buffer_dirty(eb);
3141 	if (slot == 0)
3142 		fixup_low_keys(path, &disk_key, 1);
3143 }
3144 
3145 /*
3146  * try to push data from one node into the next node left in the
3147  * tree.
3148  *
3149  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3150  * error, and > 0 if there was no room in the left hand block.
3151  */
3152 static int push_node_left(struct btrfs_trans_handle *trans,
3153 			  struct extent_buffer *dst,
3154 			  struct extent_buffer *src, int empty)
3155 {
3156 	struct btrfs_fs_info *fs_info = trans->fs_info;
3157 	int push_items = 0;
3158 	int src_nritems;
3159 	int dst_nritems;
3160 	int ret = 0;
3161 
3162 	src_nritems = btrfs_header_nritems(src);
3163 	dst_nritems = btrfs_header_nritems(dst);
3164 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3165 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3166 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3167 
3168 	if (!empty && src_nritems <= 8)
3169 		return 1;
3170 
3171 	if (push_items <= 0)
3172 		return 1;
3173 
3174 	if (empty) {
3175 		push_items = min(src_nritems, push_items);
3176 		if (push_items < src_nritems) {
3177 			/* leave at least 8 pointers in the node if
3178 			 * we aren't going to empty it
3179 			 */
3180 			if (src_nritems - push_items < 8) {
3181 				if (push_items <= 8)
3182 					return 1;
3183 				push_items -= 8;
3184 			}
3185 		}
3186 	} else
3187 		push_items = min(src_nritems - 8, push_items);
3188 
3189 	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3190 	if (ret) {
3191 		btrfs_abort_transaction(trans, ret);
3192 		return ret;
3193 	}
3194 	copy_extent_buffer(dst, src,
3195 			   btrfs_node_key_ptr_offset(dst_nritems),
3196 			   btrfs_node_key_ptr_offset(0),
3197 			   push_items * sizeof(struct btrfs_key_ptr));
3198 
3199 	if (push_items < src_nritems) {
3200 		/*
3201 		 * Don't call tree_mod_log_insert_move here, key removal was
3202 		 * already fully logged by tree_mod_log_eb_copy above.
3203 		 */
3204 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3205 				      btrfs_node_key_ptr_offset(push_items),
3206 				      (src_nritems - push_items) *
3207 				      sizeof(struct btrfs_key_ptr));
3208 	}
3209 	btrfs_set_header_nritems(src, src_nritems - push_items);
3210 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3211 	btrfs_mark_buffer_dirty(src);
3212 	btrfs_mark_buffer_dirty(dst);
3213 
3214 	return ret;
3215 }
3216 
3217 /*
3218  * try to push data from one node into the next node right in the
3219  * tree.
3220  *
3221  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3222  * error, and > 0 if there was no room in the right hand block.
3223  *
3224  * this will  only push up to 1/2 the contents of the left node over
3225  */
3226 static int balance_node_right(struct btrfs_trans_handle *trans,
3227 			      struct extent_buffer *dst,
3228 			      struct extent_buffer *src)
3229 {
3230 	struct btrfs_fs_info *fs_info = trans->fs_info;
3231 	int push_items = 0;
3232 	int max_push;
3233 	int src_nritems;
3234 	int dst_nritems;
3235 	int ret = 0;
3236 
3237 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3238 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3239 
3240 	src_nritems = btrfs_header_nritems(src);
3241 	dst_nritems = btrfs_header_nritems(dst);
3242 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3243 	if (push_items <= 0)
3244 		return 1;
3245 
3246 	if (src_nritems < 4)
3247 		return 1;
3248 
3249 	max_push = src_nritems / 2 + 1;
3250 	/* don't try to empty the node */
3251 	if (max_push >= src_nritems)
3252 		return 1;
3253 
3254 	if (max_push < push_items)
3255 		push_items = max_push;
3256 
3257 	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3258 	BUG_ON(ret < 0);
3259 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3260 				      btrfs_node_key_ptr_offset(0),
3261 				      (dst_nritems) *
3262 				      sizeof(struct btrfs_key_ptr));
3263 
3264 	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3265 				   push_items);
3266 	if (ret) {
3267 		btrfs_abort_transaction(trans, ret);
3268 		return ret;
3269 	}
3270 	copy_extent_buffer(dst, src,
3271 			   btrfs_node_key_ptr_offset(0),
3272 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
3273 			   push_items * sizeof(struct btrfs_key_ptr));
3274 
3275 	btrfs_set_header_nritems(src, src_nritems - push_items);
3276 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3277 
3278 	btrfs_mark_buffer_dirty(src);
3279 	btrfs_mark_buffer_dirty(dst);
3280 
3281 	return ret;
3282 }
3283 
3284 /*
3285  * helper function to insert a new root level in the tree.
3286  * A new node is allocated, and a single item is inserted to
3287  * point to the existing root
3288  *
3289  * returns zero on success or < 0 on failure.
3290  */
3291 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3292 			   struct btrfs_root *root,
3293 			   struct btrfs_path *path, int level)
3294 {
3295 	struct btrfs_fs_info *fs_info = root->fs_info;
3296 	u64 lower_gen;
3297 	struct extent_buffer *lower;
3298 	struct extent_buffer *c;
3299 	struct extent_buffer *old;
3300 	struct btrfs_disk_key lower_key;
3301 	int ret;
3302 
3303 	BUG_ON(path->nodes[level]);
3304 	BUG_ON(path->nodes[level-1] != root->node);
3305 
3306 	lower = path->nodes[level-1];
3307 	if (level == 1)
3308 		btrfs_item_key(lower, &lower_key, 0);
3309 	else
3310 		btrfs_node_key(lower, &lower_key, 0);
3311 
3312 	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3313 					 root->node->start, 0);
3314 	if (IS_ERR(c))
3315 		return PTR_ERR(c);
3316 
3317 	root_add_used(root, fs_info->nodesize);
3318 
3319 	btrfs_set_header_nritems(c, 1);
3320 	btrfs_set_node_key(c, &lower_key, 0);
3321 	btrfs_set_node_blockptr(c, 0, lower->start);
3322 	lower_gen = btrfs_header_generation(lower);
3323 	WARN_ON(lower_gen != trans->transid);
3324 
3325 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3326 
3327 	btrfs_mark_buffer_dirty(c);
3328 
3329 	old = root->node;
3330 	ret = tree_mod_log_insert_root(root->node, c, 0);
3331 	BUG_ON(ret < 0);
3332 	rcu_assign_pointer(root->node, c);
3333 
3334 	/* the super has an extra ref to root->node */
3335 	free_extent_buffer(old);
3336 
3337 	add_root_to_dirty_list(root);
3338 	atomic_inc(&c->refs);
3339 	path->nodes[level] = c;
3340 	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3341 	path->slots[level] = 0;
3342 	return 0;
3343 }
3344 
3345 /*
3346  * worker function to insert a single pointer in a node.
3347  * the node should have enough room for the pointer already
3348  *
3349  * slot and level indicate where you want the key to go, and
3350  * blocknr is the block the key points to.
3351  */
3352 static void insert_ptr(struct btrfs_trans_handle *trans,
3353 		       struct btrfs_path *path,
3354 		       struct btrfs_disk_key *key, u64 bytenr,
3355 		       int slot, int level)
3356 {
3357 	struct extent_buffer *lower;
3358 	int nritems;
3359 	int ret;
3360 
3361 	BUG_ON(!path->nodes[level]);
3362 	btrfs_assert_tree_locked(path->nodes[level]);
3363 	lower = path->nodes[level];
3364 	nritems = btrfs_header_nritems(lower);
3365 	BUG_ON(slot > nritems);
3366 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3367 	if (slot != nritems) {
3368 		if (level) {
3369 			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3370 					nritems - slot);
3371 			BUG_ON(ret < 0);
3372 		}
3373 		memmove_extent_buffer(lower,
3374 			      btrfs_node_key_ptr_offset(slot + 1),
3375 			      btrfs_node_key_ptr_offset(slot),
3376 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
3377 	}
3378 	if (level) {
3379 		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3380 				GFP_NOFS);
3381 		BUG_ON(ret < 0);
3382 	}
3383 	btrfs_set_node_key(lower, key, slot);
3384 	btrfs_set_node_blockptr(lower, slot, bytenr);
3385 	WARN_ON(trans->transid == 0);
3386 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3387 	btrfs_set_header_nritems(lower, nritems + 1);
3388 	btrfs_mark_buffer_dirty(lower);
3389 }
3390 
3391 /*
3392  * split the node at the specified level in path in two.
3393  * The path is corrected to point to the appropriate node after the split
3394  *
3395  * Before splitting this tries to make some room in the node by pushing
3396  * left and right, if either one works, it returns right away.
3397  *
3398  * returns 0 on success and < 0 on failure
3399  */
3400 static noinline int split_node(struct btrfs_trans_handle *trans,
3401 			       struct btrfs_root *root,
3402 			       struct btrfs_path *path, int level)
3403 {
3404 	struct btrfs_fs_info *fs_info = root->fs_info;
3405 	struct extent_buffer *c;
3406 	struct extent_buffer *split;
3407 	struct btrfs_disk_key disk_key;
3408 	int mid;
3409 	int ret;
3410 	u32 c_nritems;
3411 
3412 	c = path->nodes[level];
3413 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3414 	if (c == root->node) {
3415 		/*
3416 		 * trying to split the root, lets make a new one
3417 		 *
3418 		 * tree mod log: We don't log_removal old root in
3419 		 * insert_new_root, because that root buffer will be kept as a
3420 		 * normal node. We are going to log removal of half of the
3421 		 * elements below with tree_mod_log_eb_copy. We're holding a
3422 		 * tree lock on the buffer, which is why we cannot race with
3423 		 * other tree_mod_log users.
3424 		 */
3425 		ret = insert_new_root(trans, root, path, level + 1);
3426 		if (ret)
3427 			return ret;
3428 	} else {
3429 		ret = push_nodes_for_insert(trans, root, path, level);
3430 		c = path->nodes[level];
3431 		if (!ret && btrfs_header_nritems(c) <
3432 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3433 			return 0;
3434 		if (ret < 0)
3435 			return ret;
3436 	}
3437 
3438 	c_nritems = btrfs_header_nritems(c);
3439 	mid = (c_nritems + 1) / 2;
3440 	btrfs_node_key(c, &disk_key, mid);
3441 
3442 	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3443 					     c->start, 0);
3444 	if (IS_ERR(split))
3445 		return PTR_ERR(split);
3446 
3447 	root_add_used(root, fs_info->nodesize);
3448 	ASSERT(btrfs_header_level(c) == level);
3449 
3450 	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3451 	if (ret) {
3452 		btrfs_abort_transaction(trans, ret);
3453 		return ret;
3454 	}
3455 	copy_extent_buffer(split, c,
3456 			   btrfs_node_key_ptr_offset(0),
3457 			   btrfs_node_key_ptr_offset(mid),
3458 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3459 	btrfs_set_header_nritems(split, c_nritems - mid);
3460 	btrfs_set_header_nritems(c, mid);
3461 	ret = 0;
3462 
3463 	btrfs_mark_buffer_dirty(c);
3464 	btrfs_mark_buffer_dirty(split);
3465 
3466 	insert_ptr(trans, path, &disk_key, split->start,
3467 		   path->slots[level + 1] + 1, level + 1);
3468 
3469 	if (path->slots[level] >= mid) {
3470 		path->slots[level] -= mid;
3471 		btrfs_tree_unlock(c);
3472 		free_extent_buffer(c);
3473 		path->nodes[level] = split;
3474 		path->slots[level + 1] += 1;
3475 	} else {
3476 		btrfs_tree_unlock(split);
3477 		free_extent_buffer(split);
3478 	}
3479 	return ret;
3480 }
3481 
3482 /*
3483  * how many bytes are required to store the items in a leaf.  start
3484  * and nr indicate which items in the leaf to check.  This totals up the
3485  * space used both by the item structs and the item data
3486  */
3487 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3488 {
3489 	struct btrfs_item *start_item;
3490 	struct btrfs_item *end_item;
3491 	int data_len;
3492 	int nritems = btrfs_header_nritems(l);
3493 	int end = min(nritems, start + nr) - 1;
3494 
3495 	if (!nr)
3496 		return 0;
3497 	start_item = btrfs_item_nr(start);
3498 	end_item = btrfs_item_nr(end);
3499 	data_len = btrfs_item_offset(l, start_item) +
3500 		   btrfs_item_size(l, start_item);
3501 	data_len = data_len - btrfs_item_offset(l, end_item);
3502 	data_len += sizeof(struct btrfs_item) * nr;
3503 	WARN_ON(data_len < 0);
3504 	return data_len;
3505 }
3506 
3507 /*
3508  * The space between the end of the leaf items and
3509  * the start of the leaf data.  IOW, how much room
3510  * the leaf has left for both items and data
3511  */
3512 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3513 {
3514 	struct btrfs_fs_info *fs_info = leaf->fs_info;
3515 	int nritems = btrfs_header_nritems(leaf);
3516 	int ret;
3517 
3518 	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3519 	if (ret < 0) {
3520 		btrfs_crit(fs_info,
3521 			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3522 			   ret,
3523 			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3524 			   leaf_space_used(leaf, 0, nritems), nritems);
3525 	}
3526 	return ret;
3527 }
3528 
3529 /*
3530  * min slot controls the lowest index we're willing to push to the
3531  * right.  We'll push up to and including min_slot, but no lower
3532  */
3533 static noinline int __push_leaf_right(struct btrfs_path *path,
3534 				      int data_size, int empty,
3535 				      struct extent_buffer *right,
3536 				      int free_space, u32 left_nritems,
3537 				      u32 min_slot)
3538 {
3539 	struct btrfs_fs_info *fs_info = right->fs_info;
3540 	struct extent_buffer *left = path->nodes[0];
3541 	struct extent_buffer *upper = path->nodes[1];
3542 	struct btrfs_map_token token;
3543 	struct btrfs_disk_key disk_key;
3544 	int slot;
3545 	u32 i;
3546 	int push_space = 0;
3547 	int push_items = 0;
3548 	struct btrfs_item *item;
3549 	u32 nr;
3550 	u32 right_nritems;
3551 	u32 data_end;
3552 	u32 this_item_size;
3553 
3554 	if (empty)
3555 		nr = 0;
3556 	else
3557 		nr = max_t(u32, 1, min_slot);
3558 
3559 	if (path->slots[0] >= left_nritems)
3560 		push_space += data_size;
3561 
3562 	slot = path->slots[1];
3563 	i = left_nritems - 1;
3564 	while (i >= nr) {
3565 		item = btrfs_item_nr(i);
3566 
3567 		if (!empty && push_items > 0) {
3568 			if (path->slots[0] > i)
3569 				break;
3570 			if (path->slots[0] == i) {
3571 				int space = btrfs_leaf_free_space(left);
3572 
3573 				if (space + push_space * 2 > free_space)
3574 					break;
3575 			}
3576 		}
3577 
3578 		if (path->slots[0] == i)
3579 			push_space += data_size;
3580 
3581 		this_item_size = btrfs_item_size(left, item);
3582 		if (this_item_size + sizeof(*item) + push_space > free_space)
3583 			break;
3584 
3585 		push_items++;
3586 		push_space += this_item_size + sizeof(*item);
3587 		if (i == 0)
3588 			break;
3589 		i--;
3590 	}
3591 
3592 	if (push_items == 0)
3593 		goto out_unlock;
3594 
3595 	WARN_ON(!empty && push_items == left_nritems);
3596 
3597 	/* push left to right */
3598 	right_nritems = btrfs_header_nritems(right);
3599 
3600 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3601 	push_space -= leaf_data_end(left);
3602 
3603 	/* make room in the right data area */
3604 	data_end = leaf_data_end(right);
3605 	memmove_extent_buffer(right,
3606 			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3607 			      BTRFS_LEAF_DATA_OFFSET + data_end,
3608 			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3609 
3610 	/* copy from the left data area */
3611 	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3612 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3613 		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3614 		     push_space);
3615 
3616 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3617 			      btrfs_item_nr_offset(0),
3618 			      right_nritems * sizeof(struct btrfs_item));
3619 
3620 	/* copy the items from left to right */
3621 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3622 		   btrfs_item_nr_offset(left_nritems - push_items),
3623 		   push_items * sizeof(struct btrfs_item));
3624 
3625 	/* update the item pointers */
3626 	btrfs_init_map_token(&token, right);
3627 	right_nritems += push_items;
3628 	btrfs_set_header_nritems(right, right_nritems);
3629 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3630 	for (i = 0; i < right_nritems; i++) {
3631 		item = btrfs_item_nr(i);
3632 		push_space -= btrfs_token_item_size(&token, item);
3633 		btrfs_set_token_item_offset(&token, item, push_space);
3634 	}
3635 
3636 	left_nritems -= push_items;
3637 	btrfs_set_header_nritems(left, left_nritems);
3638 
3639 	if (left_nritems)
3640 		btrfs_mark_buffer_dirty(left);
3641 	else
3642 		btrfs_clean_tree_block(left);
3643 
3644 	btrfs_mark_buffer_dirty(right);
3645 
3646 	btrfs_item_key(right, &disk_key, 0);
3647 	btrfs_set_node_key(upper, &disk_key, slot + 1);
3648 	btrfs_mark_buffer_dirty(upper);
3649 
3650 	/* then fixup the leaf pointer in the path */
3651 	if (path->slots[0] >= left_nritems) {
3652 		path->slots[0] -= left_nritems;
3653 		if (btrfs_header_nritems(path->nodes[0]) == 0)
3654 			btrfs_clean_tree_block(path->nodes[0]);
3655 		btrfs_tree_unlock(path->nodes[0]);
3656 		free_extent_buffer(path->nodes[0]);
3657 		path->nodes[0] = right;
3658 		path->slots[1] += 1;
3659 	} else {
3660 		btrfs_tree_unlock(right);
3661 		free_extent_buffer(right);
3662 	}
3663 	return 0;
3664 
3665 out_unlock:
3666 	btrfs_tree_unlock(right);
3667 	free_extent_buffer(right);
3668 	return 1;
3669 }
3670 
3671 /*
3672  * push some data in the path leaf to the right, trying to free up at
3673  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3674  *
3675  * returns 1 if the push failed because the other node didn't have enough
3676  * room, 0 if everything worked out and < 0 if there were major errors.
3677  *
3678  * this will push starting from min_slot to the end of the leaf.  It won't
3679  * push any slot lower than min_slot
3680  */
3681 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3682 			   *root, struct btrfs_path *path,
3683 			   int min_data_size, int data_size,
3684 			   int empty, u32 min_slot)
3685 {
3686 	struct extent_buffer *left = path->nodes[0];
3687 	struct extent_buffer *right;
3688 	struct extent_buffer *upper;
3689 	int slot;
3690 	int free_space;
3691 	u32 left_nritems;
3692 	int ret;
3693 
3694 	if (!path->nodes[1])
3695 		return 1;
3696 
3697 	slot = path->slots[1];
3698 	upper = path->nodes[1];
3699 	if (slot >= btrfs_header_nritems(upper) - 1)
3700 		return 1;
3701 
3702 	btrfs_assert_tree_locked(path->nodes[1]);
3703 
3704 	right = btrfs_read_node_slot(upper, slot + 1);
3705 	/*
3706 	 * slot + 1 is not valid or we fail to read the right node,
3707 	 * no big deal, just return.
3708 	 */
3709 	if (IS_ERR(right))
3710 		return 1;
3711 
3712 	btrfs_tree_lock(right);
3713 	btrfs_set_lock_blocking_write(right);
3714 
3715 	free_space = btrfs_leaf_free_space(right);
3716 	if (free_space < data_size)
3717 		goto out_unlock;
3718 
3719 	/* cow and double check */
3720 	ret = btrfs_cow_block(trans, root, right, upper,
3721 			      slot + 1, &right);
3722 	if (ret)
3723 		goto out_unlock;
3724 
3725 	free_space = btrfs_leaf_free_space(right);
3726 	if (free_space < data_size)
3727 		goto out_unlock;
3728 
3729 	left_nritems = btrfs_header_nritems(left);
3730 	if (left_nritems == 0)
3731 		goto out_unlock;
3732 
3733 	if (path->slots[0] == left_nritems && !empty) {
3734 		/* Key greater than all keys in the leaf, right neighbor has
3735 		 * enough room for it and we're not emptying our leaf to delete
3736 		 * it, therefore use right neighbor to insert the new item and
3737 		 * no need to touch/dirty our left leaf. */
3738 		btrfs_tree_unlock(left);
3739 		free_extent_buffer(left);
3740 		path->nodes[0] = right;
3741 		path->slots[0] = 0;
3742 		path->slots[1]++;
3743 		return 0;
3744 	}
3745 
3746 	return __push_leaf_right(path, min_data_size, empty,
3747 				right, free_space, left_nritems, min_slot);
3748 out_unlock:
3749 	btrfs_tree_unlock(right);
3750 	free_extent_buffer(right);
3751 	return 1;
3752 }
3753 
3754 /*
3755  * push some data in the path leaf to the left, trying to free up at
3756  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3757  *
3758  * max_slot can put a limit on how far into the leaf we'll push items.  The
3759  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3760  * items
3761  */
3762 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3763 				     int empty, struct extent_buffer *left,
3764 				     int free_space, u32 right_nritems,
3765 				     u32 max_slot)
3766 {
3767 	struct btrfs_fs_info *fs_info = left->fs_info;
3768 	struct btrfs_disk_key disk_key;
3769 	struct extent_buffer *right = path->nodes[0];
3770 	int i;
3771 	int push_space = 0;
3772 	int push_items = 0;
3773 	struct btrfs_item *item;
3774 	u32 old_left_nritems;
3775 	u32 nr;
3776 	int ret = 0;
3777 	u32 this_item_size;
3778 	u32 old_left_item_size;
3779 	struct btrfs_map_token token;
3780 
3781 	if (empty)
3782 		nr = min(right_nritems, max_slot);
3783 	else
3784 		nr = min(right_nritems - 1, max_slot);
3785 
3786 	for (i = 0; i < nr; i++) {
3787 		item = btrfs_item_nr(i);
3788 
3789 		if (!empty && push_items > 0) {
3790 			if (path->slots[0] < i)
3791 				break;
3792 			if (path->slots[0] == i) {
3793 				int space = btrfs_leaf_free_space(right);
3794 
3795 				if (space + push_space * 2 > free_space)
3796 					break;
3797 			}
3798 		}
3799 
3800 		if (path->slots[0] == i)
3801 			push_space += data_size;
3802 
3803 		this_item_size = btrfs_item_size(right, item);
3804 		if (this_item_size + sizeof(*item) + push_space > free_space)
3805 			break;
3806 
3807 		push_items++;
3808 		push_space += this_item_size + sizeof(*item);
3809 	}
3810 
3811 	if (push_items == 0) {
3812 		ret = 1;
3813 		goto out;
3814 	}
3815 	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3816 
3817 	/* push data from right to left */
3818 	copy_extent_buffer(left, right,
3819 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
3820 			   btrfs_item_nr_offset(0),
3821 			   push_items * sizeof(struct btrfs_item));
3822 
3823 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3824 		     btrfs_item_offset_nr(right, push_items - 1);
3825 
3826 	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3827 		     leaf_data_end(left) - push_space,
3828 		     BTRFS_LEAF_DATA_OFFSET +
3829 		     btrfs_item_offset_nr(right, push_items - 1),
3830 		     push_space);
3831 	old_left_nritems = btrfs_header_nritems(left);
3832 	BUG_ON(old_left_nritems <= 0);
3833 
3834 	btrfs_init_map_token(&token, left);
3835 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3836 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3837 		u32 ioff;
3838 
3839 		item = btrfs_item_nr(i);
3840 
3841 		ioff = btrfs_token_item_offset(&token, item);
3842 		btrfs_set_token_item_offset(&token, item,
3843 		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3844 	}
3845 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3846 
3847 	/* fixup right node */
3848 	if (push_items > right_nritems)
3849 		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3850 		       right_nritems);
3851 
3852 	if (push_items < right_nritems) {
3853 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3854 						  leaf_data_end(right);
3855 		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3856 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3857 				      BTRFS_LEAF_DATA_OFFSET +
3858 				      leaf_data_end(right), push_space);
3859 
3860 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3861 			      btrfs_item_nr_offset(push_items),
3862 			     (btrfs_header_nritems(right) - push_items) *
3863 			     sizeof(struct btrfs_item));
3864 	}
3865 
3866 	btrfs_init_map_token(&token, right);
3867 	right_nritems -= push_items;
3868 	btrfs_set_header_nritems(right, right_nritems);
3869 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3870 	for (i = 0; i < right_nritems; i++) {
3871 		item = btrfs_item_nr(i);
3872 
3873 		push_space = push_space - btrfs_token_item_size(&token, item);
3874 		btrfs_set_token_item_offset(&token, item, push_space);
3875 	}
3876 
3877 	btrfs_mark_buffer_dirty(left);
3878 	if (right_nritems)
3879 		btrfs_mark_buffer_dirty(right);
3880 	else
3881 		btrfs_clean_tree_block(right);
3882 
3883 	btrfs_item_key(right, &disk_key, 0);
3884 	fixup_low_keys(path, &disk_key, 1);
3885 
3886 	/* then fixup the leaf pointer in the path */
3887 	if (path->slots[0] < push_items) {
3888 		path->slots[0] += old_left_nritems;
3889 		btrfs_tree_unlock(path->nodes[0]);
3890 		free_extent_buffer(path->nodes[0]);
3891 		path->nodes[0] = left;
3892 		path->slots[1] -= 1;
3893 	} else {
3894 		btrfs_tree_unlock(left);
3895 		free_extent_buffer(left);
3896 		path->slots[0] -= push_items;
3897 	}
3898 	BUG_ON(path->slots[0] < 0);
3899 	return ret;
3900 out:
3901 	btrfs_tree_unlock(left);
3902 	free_extent_buffer(left);
3903 	return ret;
3904 }
3905 
3906 /*
3907  * push some data in the path leaf to the left, trying to free up at
3908  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3909  *
3910  * max_slot can put a limit on how far into the leaf we'll push items.  The
3911  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3912  * items
3913  */
3914 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3915 			  *root, struct btrfs_path *path, int min_data_size,
3916 			  int data_size, int empty, u32 max_slot)
3917 {
3918 	struct extent_buffer *right = path->nodes[0];
3919 	struct extent_buffer *left;
3920 	int slot;
3921 	int free_space;
3922 	u32 right_nritems;
3923 	int ret = 0;
3924 
3925 	slot = path->slots[1];
3926 	if (slot == 0)
3927 		return 1;
3928 	if (!path->nodes[1])
3929 		return 1;
3930 
3931 	right_nritems = btrfs_header_nritems(right);
3932 	if (right_nritems == 0)
3933 		return 1;
3934 
3935 	btrfs_assert_tree_locked(path->nodes[1]);
3936 
3937 	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3938 	/*
3939 	 * slot - 1 is not valid or we fail to read the left node,
3940 	 * no big deal, just return.
3941 	 */
3942 	if (IS_ERR(left))
3943 		return 1;
3944 
3945 	btrfs_tree_lock(left);
3946 	btrfs_set_lock_blocking_write(left);
3947 
3948 	free_space = btrfs_leaf_free_space(left);
3949 	if (free_space < data_size) {
3950 		ret = 1;
3951 		goto out;
3952 	}
3953 
3954 	/* cow and double check */
3955 	ret = btrfs_cow_block(trans, root, left,
3956 			      path->nodes[1], slot - 1, &left);
3957 	if (ret) {
3958 		/* we hit -ENOSPC, but it isn't fatal here */
3959 		if (ret == -ENOSPC)
3960 			ret = 1;
3961 		goto out;
3962 	}
3963 
3964 	free_space = btrfs_leaf_free_space(left);
3965 	if (free_space < data_size) {
3966 		ret = 1;
3967 		goto out;
3968 	}
3969 
3970 	return __push_leaf_left(path, min_data_size,
3971 			       empty, left, free_space, right_nritems,
3972 			       max_slot);
3973 out:
3974 	btrfs_tree_unlock(left);
3975 	free_extent_buffer(left);
3976 	return ret;
3977 }
3978 
3979 /*
3980  * split the path's leaf in two, making sure there is at least data_size
3981  * available for the resulting leaf level of the path.
3982  */
3983 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3984 				    struct btrfs_path *path,
3985 				    struct extent_buffer *l,
3986 				    struct extent_buffer *right,
3987 				    int slot, int mid, int nritems)
3988 {
3989 	struct btrfs_fs_info *fs_info = trans->fs_info;
3990 	int data_copy_size;
3991 	int rt_data_off;
3992 	int i;
3993 	struct btrfs_disk_key disk_key;
3994 	struct btrfs_map_token token;
3995 
3996 	nritems = nritems - mid;
3997 	btrfs_set_header_nritems(right, nritems);
3998 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
3999 
4000 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4001 			   btrfs_item_nr_offset(mid),
4002 			   nritems * sizeof(struct btrfs_item));
4003 
4004 	copy_extent_buffer(right, l,
4005 		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4006 		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4007 		     leaf_data_end(l), data_copy_size);
4008 
4009 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4010 
4011 	btrfs_init_map_token(&token, right);
4012 	for (i = 0; i < nritems; i++) {
4013 		struct btrfs_item *item = btrfs_item_nr(i);
4014 		u32 ioff;
4015 
4016 		ioff = btrfs_token_item_offset(&token, item);
4017 		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
4018 	}
4019 
4020 	btrfs_set_header_nritems(l, mid);
4021 	btrfs_item_key(right, &disk_key, 0);
4022 	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4023 
4024 	btrfs_mark_buffer_dirty(right);
4025 	btrfs_mark_buffer_dirty(l);
4026 	BUG_ON(path->slots[0] != slot);
4027 
4028 	if (mid <= slot) {
4029 		btrfs_tree_unlock(path->nodes[0]);
4030 		free_extent_buffer(path->nodes[0]);
4031 		path->nodes[0] = right;
4032 		path->slots[0] -= mid;
4033 		path->slots[1] += 1;
4034 	} else {
4035 		btrfs_tree_unlock(right);
4036 		free_extent_buffer(right);
4037 	}
4038 
4039 	BUG_ON(path->slots[0] < 0);
4040 }
4041 
4042 /*
4043  * double splits happen when we need to insert a big item in the middle
4044  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4045  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4046  *          A                 B                 C
4047  *
4048  * We avoid this by trying to push the items on either side of our target
4049  * into the adjacent leaves.  If all goes well we can avoid the double split
4050  * completely.
4051  */
4052 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4053 					  struct btrfs_root *root,
4054 					  struct btrfs_path *path,
4055 					  int data_size)
4056 {
4057 	int ret;
4058 	int progress = 0;
4059 	int slot;
4060 	u32 nritems;
4061 	int space_needed = data_size;
4062 
4063 	slot = path->slots[0];
4064 	if (slot < btrfs_header_nritems(path->nodes[0]))
4065 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4066 
4067 	/*
4068 	 * try to push all the items after our slot into the
4069 	 * right leaf
4070 	 */
4071 	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4072 	if (ret < 0)
4073 		return ret;
4074 
4075 	if (ret == 0)
4076 		progress++;
4077 
4078 	nritems = btrfs_header_nritems(path->nodes[0]);
4079 	/*
4080 	 * our goal is to get our slot at the start or end of a leaf.  If
4081 	 * we've done so we're done
4082 	 */
4083 	if (path->slots[0] == 0 || path->slots[0] == nritems)
4084 		return 0;
4085 
4086 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4087 		return 0;
4088 
4089 	/* try to push all the items before our slot into the next leaf */
4090 	slot = path->slots[0];
4091 	space_needed = data_size;
4092 	if (slot > 0)
4093 		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4094 	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4095 	if (ret < 0)
4096 		return ret;
4097 
4098 	if (ret == 0)
4099 		progress++;
4100 
4101 	if (progress)
4102 		return 0;
4103 	return 1;
4104 }
4105 
4106 /*
4107  * split the path's leaf in two, making sure there is at least data_size
4108  * available for the resulting leaf level of the path.
4109  *
4110  * returns 0 if all went well and < 0 on failure.
4111  */
4112 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4113 			       struct btrfs_root *root,
4114 			       const struct btrfs_key *ins_key,
4115 			       struct btrfs_path *path, int data_size,
4116 			       int extend)
4117 {
4118 	struct btrfs_disk_key disk_key;
4119 	struct extent_buffer *l;
4120 	u32 nritems;
4121 	int mid;
4122 	int slot;
4123 	struct extent_buffer *right;
4124 	struct btrfs_fs_info *fs_info = root->fs_info;
4125 	int ret = 0;
4126 	int wret;
4127 	int split;
4128 	int num_doubles = 0;
4129 	int tried_avoid_double = 0;
4130 
4131 	l = path->nodes[0];
4132 	slot = path->slots[0];
4133 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
4134 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4135 		return -EOVERFLOW;
4136 
4137 	/* first try to make some room by pushing left and right */
4138 	if (data_size && path->nodes[1]) {
4139 		int space_needed = data_size;
4140 
4141 		if (slot < btrfs_header_nritems(l))
4142 			space_needed -= btrfs_leaf_free_space(l);
4143 
4144 		wret = push_leaf_right(trans, root, path, space_needed,
4145 				       space_needed, 0, 0);
4146 		if (wret < 0)
4147 			return wret;
4148 		if (wret) {
4149 			space_needed = data_size;
4150 			if (slot > 0)
4151 				space_needed -= btrfs_leaf_free_space(l);
4152 			wret = push_leaf_left(trans, root, path, space_needed,
4153 					      space_needed, 0, (u32)-1);
4154 			if (wret < 0)
4155 				return wret;
4156 		}
4157 		l = path->nodes[0];
4158 
4159 		/* did the pushes work? */
4160 		if (btrfs_leaf_free_space(l) >= data_size)
4161 			return 0;
4162 	}
4163 
4164 	if (!path->nodes[1]) {
4165 		ret = insert_new_root(trans, root, path, 1);
4166 		if (ret)
4167 			return ret;
4168 	}
4169 again:
4170 	split = 1;
4171 	l = path->nodes[0];
4172 	slot = path->slots[0];
4173 	nritems = btrfs_header_nritems(l);
4174 	mid = (nritems + 1) / 2;
4175 
4176 	if (mid <= slot) {
4177 		if (nritems == 1 ||
4178 		    leaf_space_used(l, mid, nritems - mid) + data_size >
4179 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4180 			if (slot >= nritems) {
4181 				split = 0;
4182 			} else {
4183 				mid = slot;
4184 				if (mid != nritems &&
4185 				    leaf_space_used(l, mid, nritems - mid) +
4186 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4187 					if (data_size && !tried_avoid_double)
4188 						goto push_for_double;
4189 					split = 2;
4190 				}
4191 			}
4192 		}
4193 	} else {
4194 		if (leaf_space_used(l, 0, mid) + data_size >
4195 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
4196 			if (!extend && data_size && slot == 0) {
4197 				split = 0;
4198 			} else if ((extend || !data_size) && slot == 0) {
4199 				mid = 1;
4200 			} else {
4201 				mid = slot;
4202 				if (mid != nritems &&
4203 				    leaf_space_used(l, mid, nritems - mid) +
4204 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4205 					if (data_size && !tried_avoid_double)
4206 						goto push_for_double;
4207 					split = 2;
4208 				}
4209 			}
4210 		}
4211 	}
4212 
4213 	if (split == 0)
4214 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
4215 	else
4216 		btrfs_item_key(l, &disk_key, mid);
4217 
4218 	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4219 					     l->start, 0);
4220 	if (IS_ERR(right))
4221 		return PTR_ERR(right);
4222 
4223 	root_add_used(root, fs_info->nodesize);
4224 
4225 	if (split == 0) {
4226 		if (mid <= slot) {
4227 			btrfs_set_header_nritems(right, 0);
4228 			insert_ptr(trans, path, &disk_key,
4229 				   right->start, path->slots[1] + 1, 1);
4230 			btrfs_tree_unlock(path->nodes[0]);
4231 			free_extent_buffer(path->nodes[0]);
4232 			path->nodes[0] = right;
4233 			path->slots[0] = 0;
4234 			path->slots[1] += 1;
4235 		} else {
4236 			btrfs_set_header_nritems(right, 0);
4237 			insert_ptr(trans, path, &disk_key,
4238 				   right->start, path->slots[1], 1);
4239 			btrfs_tree_unlock(path->nodes[0]);
4240 			free_extent_buffer(path->nodes[0]);
4241 			path->nodes[0] = right;
4242 			path->slots[0] = 0;
4243 			if (path->slots[1] == 0)
4244 				fixup_low_keys(path, &disk_key, 1);
4245 		}
4246 		/*
4247 		 * We create a new leaf 'right' for the required ins_len and
4248 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4249 		 * the content of ins_len to 'right'.
4250 		 */
4251 		return ret;
4252 	}
4253 
4254 	copy_for_split(trans, path, l, right, slot, mid, nritems);
4255 
4256 	if (split == 2) {
4257 		BUG_ON(num_doubles != 0);
4258 		num_doubles++;
4259 		goto again;
4260 	}
4261 
4262 	return 0;
4263 
4264 push_for_double:
4265 	push_for_double_split(trans, root, path, data_size);
4266 	tried_avoid_double = 1;
4267 	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4268 		return 0;
4269 	goto again;
4270 }
4271 
4272 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4273 					 struct btrfs_root *root,
4274 					 struct btrfs_path *path, int ins_len)
4275 {
4276 	struct btrfs_key key;
4277 	struct extent_buffer *leaf;
4278 	struct btrfs_file_extent_item *fi;
4279 	u64 extent_len = 0;
4280 	u32 item_size;
4281 	int ret;
4282 
4283 	leaf = path->nodes[0];
4284 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4285 
4286 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4287 	       key.type != BTRFS_EXTENT_CSUM_KEY);
4288 
4289 	if (btrfs_leaf_free_space(leaf) >= ins_len)
4290 		return 0;
4291 
4292 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4293 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4294 		fi = btrfs_item_ptr(leaf, path->slots[0],
4295 				    struct btrfs_file_extent_item);
4296 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4297 	}
4298 	btrfs_release_path(path);
4299 
4300 	path->keep_locks = 1;
4301 	path->search_for_split = 1;
4302 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4303 	path->search_for_split = 0;
4304 	if (ret > 0)
4305 		ret = -EAGAIN;
4306 	if (ret < 0)
4307 		goto err;
4308 
4309 	ret = -EAGAIN;
4310 	leaf = path->nodes[0];
4311 	/* if our item isn't there, return now */
4312 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4313 		goto err;
4314 
4315 	/* the leaf has  changed, it now has room.  return now */
4316 	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4317 		goto err;
4318 
4319 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4320 		fi = btrfs_item_ptr(leaf, path->slots[0],
4321 				    struct btrfs_file_extent_item);
4322 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4323 			goto err;
4324 	}
4325 
4326 	btrfs_set_path_blocking(path);
4327 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4328 	if (ret)
4329 		goto err;
4330 
4331 	path->keep_locks = 0;
4332 	btrfs_unlock_up_safe(path, 1);
4333 	return 0;
4334 err:
4335 	path->keep_locks = 0;
4336 	return ret;
4337 }
4338 
4339 static noinline int split_item(struct btrfs_path *path,
4340 			       const struct btrfs_key *new_key,
4341 			       unsigned long split_offset)
4342 {
4343 	struct extent_buffer *leaf;
4344 	struct btrfs_item *item;
4345 	struct btrfs_item *new_item;
4346 	int slot;
4347 	char *buf;
4348 	u32 nritems;
4349 	u32 item_size;
4350 	u32 orig_offset;
4351 	struct btrfs_disk_key disk_key;
4352 
4353 	leaf = path->nodes[0];
4354 	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4355 
4356 	btrfs_set_path_blocking(path);
4357 
4358 	item = btrfs_item_nr(path->slots[0]);
4359 	orig_offset = btrfs_item_offset(leaf, item);
4360 	item_size = btrfs_item_size(leaf, item);
4361 
4362 	buf = kmalloc(item_size, GFP_NOFS);
4363 	if (!buf)
4364 		return -ENOMEM;
4365 
4366 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4367 			    path->slots[0]), item_size);
4368 
4369 	slot = path->slots[0] + 1;
4370 	nritems = btrfs_header_nritems(leaf);
4371 	if (slot != nritems) {
4372 		/* shift the items */
4373 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4374 				btrfs_item_nr_offset(slot),
4375 				(nritems - slot) * sizeof(struct btrfs_item));
4376 	}
4377 
4378 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4379 	btrfs_set_item_key(leaf, &disk_key, slot);
4380 
4381 	new_item = btrfs_item_nr(slot);
4382 
4383 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4384 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4385 
4386 	btrfs_set_item_offset(leaf, item,
4387 			      orig_offset + item_size - split_offset);
4388 	btrfs_set_item_size(leaf, item, split_offset);
4389 
4390 	btrfs_set_header_nritems(leaf, nritems + 1);
4391 
4392 	/* write the data for the start of the original item */
4393 	write_extent_buffer(leaf, buf,
4394 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4395 			    split_offset);
4396 
4397 	/* write the data for the new item */
4398 	write_extent_buffer(leaf, buf + split_offset,
4399 			    btrfs_item_ptr_offset(leaf, slot),
4400 			    item_size - split_offset);
4401 	btrfs_mark_buffer_dirty(leaf);
4402 
4403 	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4404 	kfree(buf);
4405 	return 0;
4406 }
4407 
4408 /*
4409  * This function splits a single item into two items,
4410  * giving 'new_key' to the new item and splitting the
4411  * old one at split_offset (from the start of the item).
4412  *
4413  * The path may be released by this operation.  After
4414  * the split, the path is pointing to the old item.  The
4415  * new item is going to be in the same node as the old one.
4416  *
4417  * Note, the item being split must be smaller enough to live alone on
4418  * a tree block with room for one extra struct btrfs_item
4419  *
4420  * This allows us to split the item in place, keeping a lock on the
4421  * leaf the entire time.
4422  */
4423 int btrfs_split_item(struct btrfs_trans_handle *trans,
4424 		     struct btrfs_root *root,
4425 		     struct btrfs_path *path,
4426 		     const struct btrfs_key *new_key,
4427 		     unsigned long split_offset)
4428 {
4429 	int ret;
4430 	ret = setup_leaf_for_split(trans, root, path,
4431 				   sizeof(struct btrfs_item));
4432 	if (ret)
4433 		return ret;
4434 
4435 	ret = split_item(path, new_key, split_offset);
4436 	return ret;
4437 }
4438 
4439 /*
4440  * This function duplicate a item, giving 'new_key' to the new item.
4441  * It guarantees both items live in the same tree leaf and the new item
4442  * is contiguous with the original item.
4443  *
4444  * This allows us to split file extent in place, keeping a lock on the
4445  * leaf the entire time.
4446  */
4447 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4448 			 struct btrfs_root *root,
4449 			 struct btrfs_path *path,
4450 			 const struct btrfs_key *new_key)
4451 {
4452 	struct extent_buffer *leaf;
4453 	int ret;
4454 	u32 item_size;
4455 
4456 	leaf = path->nodes[0];
4457 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4458 	ret = setup_leaf_for_split(trans, root, path,
4459 				   item_size + sizeof(struct btrfs_item));
4460 	if (ret)
4461 		return ret;
4462 
4463 	path->slots[0]++;
4464 	setup_items_for_insert(root, path, new_key, &item_size,
4465 			       item_size, item_size +
4466 			       sizeof(struct btrfs_item), 1);
4467 	leaf = path->nodes[0];
4468 	memcpy_extent_buffer(leaf,
4469 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4470 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4471 			     item_size);
4472 	return 0;
4473 }
4474 
4475 /*
4476  * make the item pointed to by the path smaller.  new_size indicates
4477  * how small to make it, and from_end tells us if we just chop bytes
4478  * off the end of the item or if we shift the item to chop bytes off
4479  * the front.
4480  */
4481 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4482 {
4483 	int slot;
4484 	struct extent_buffer *leaf;
4485 	struct btrfs_item *item;
4486 	u32 nritems;
4487 	unsigned int data_end;
4488 	unsigned int old_data_start;
4489 	unsigned int old_size;
4490 	unsigned int size_diff;
4491 	int i;
4492 	struct btrfs_map_token token;
4493 
4494 	leaf = path->nodes[0];
4495 	slot = path->slots[0];
4496 
4497 	old_size = btrfs_item_size_nr(leaf, slot);
4498 	if (old_size == new_size)
4499 		return;
4500 
4501 	nritems = btrfs_header_nritems(leaf);
4502 	data_end = leaf_data_end(leaf);
4503 
4504 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4505 
4506 	size_diff = old_size - new_size;
4507 
4508 	BUG_ON(slot < 0);
4509 	BUG_ON(slot >= nritems);
4510 
4511 	/*
4512 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4513 	 */
4514 	/* first correct the data pointers */
4515 	btrfs_init_map_token(&token, leaf);
4516 	for (i = slot; i < nritems; i++) {
4517 		u32 ioff;
4518 		item = btrfs_item_nr(i);
4519 
4520 		ioff = btrfs_token_item_offset(&token, item);
4521 		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
4522 	}
4523 
4524 	/* shift the data */
4525 	if (from_end) {
4526 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4527 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4528 			      data_end, old_data_start + new_size - data_end);
4529 	} else {
4530 		struct btrfs_disk_key disk_key;
4531 		u64 offset;
4532 
4533 		btrfs_item_key(leaf, &disk_key, slot);
4534 
4535 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4536 			unsigned long ptr;
4537 			struct btrfs_file_extent_item *fi;
4538 
4539 			fi = btrfs_item_ptr(leaf, slot,
4540 					    struct btrfs_file_extent_item);
4541 			fi = (struct btrfs_file_extent_item *)(
4542 			     (unsigned long)fi - size_diff);
4543 
4544 			if (btrfs_file_extent_type(leaf, fi) ==
4545 			    BTRFS_FILE_EXTENT_INLINE) {
4546 				ptr = btrfs_item_ptr_offset(leaf, slot);
4547 				memmove_extent_buffer(leaf, ptr,
4548 				      (unsigned long)fi,
4549 				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4550 			}
4551 		}
4552 
4553 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4554 			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4555 			      data_end, old_data_start - data_end);
4556 
4557 		offset = btrfs_disk_key_offset(&disk_key);
4558 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4559 		btrfs_set_item_key(leaf, &disk_key, slot);
4560 		if (slot == 0)
4561 			fixup_low_keys(path, &disk_key, 1);
4562 	}
4563 
4564 	item = btrfs_item_nr(slot);
4565 	btrfs_set_item_size(leaf, item, new_size);
4566 	btrfs_mark_buffer_dirty(leaf);
4567 
4568 	if (btrfs_leaf_free_space(leaf) < 0) {
4569 		btrfs_print_leaf(leaf);
4570 		BUG();
4571 	}
4572 }
4573 
4574 /*
4575  * make the item pointed to by the path bigger, data_size is the added size.
4576  */
4577 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4578 {
4579 	int slot;
4580 	struct extent_buffer *leaf;
4581 	struct btrfs_item *item;
4582 	u32 nritems;
4583 	unsigned int data_end;
4584 	unsigned int old_data;
4585 	unsigned int old_size;
4586 	int i;
4587 	struct btrfs_map_token token;
4588 
4589 	leaf = path->nodes[0];
4590 
4591 	nritems = btrfs_header_nritems(leaf);
4592 	data_end = leaf_data_end(leaf);
4593 
4594 	if (btrfs_leaf_free_space(leaf) < data_size) {
4595 		btrfs_print_leaf(leaf);
4596 		BUG();
4597 	}
4598 	slot = path->slots[0];
4599 	old_data = btrfs_item_end_nr(leaf, slot);
4600 
4601 	BUG_ON(slot < 0);
4602 	if (slot >= nritems) {
4603 		btrfs_print_leaf(leaf);
4604 		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4605 			   slot, nritems);
4606 		BUG();
4607 	}
4608 
4609 	/*
4610 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4611 	 */
4612 	/* first correct the data pointers */
4613 	btrfs_init_map_token(&token, leaf);
4614 	for (i = slot; i < nritems; i++) {
4615 		u32 ioff;
4616 		item = btrfs_item_nr(i);
4617 
4618 		ioff = btrfs_token_item_offset(&token, item);
4619 		btrfs_set_token_item_offset(&token, item, ioff - data_size);
4620 	}
4621 
4622 	/* shift the data */
4623 	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4624 		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4625 		      data_end, old_data - data_end);
4626 
4627 	data_end = old_data;
4628 	old_size = btrfs_item_size_nr(leaf, slot);
4629 	item = btrfs_item_nr(slot);
4630 	btrfs_set_item_size(leaf, item, old_size + data_size);
4631 	btrfs_mark_buffer_dirty(leaf);
4632 
4633 	if (btrfs_leaf_free_space(leaf) < 0) {
4634 		btrfs_print_leaf(leaf);
4635 		BUG();
4636 	}
4637 }
4638 
4639 /*
4640  * this is a helper for btrfs_insert_empty_items, the main goal here is
4641  * to save stack depth by doing the bulk of the work in a function
4642  * that doesn't call btrfs_search_slot
4643  */
4644 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4645 			    const struct btrfs_key *cpu_key, u32 *data_size,
4646 			    u32 total_data, u32 total_size, int nr)
4647 {
4648 	struct btrfs_fs_info *fs_info = root->fs_info;
4649 	struct btrfs_item *item;
4650 	int i;
4651 	u32 nritems;
4652 	unsigned int data_end;
4653 	struct btrfs_disk_key disk_key;
4654 	struct extent_buffer *leaf;
4655 	int slot;
4656 	struct btrfs_map_token token;
4657 
4658 	if (path->slots[0] == 0) {
4659 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4660 		fixup_low_keys(path, &disk_key, 1);
4661 	}
4662 	btrfs_unlock_up_safe(path, 1);
4663 
4664 	leaf = path->nodes[0];
4665 	slot = path->slots[0];
4666 
4667 	nritems = btrfs_header_nritems(leaf);
4668 	data_end = leaf_data_end(leaf);
4669 
4670 	if (btrfs_leaf_free_space(leaf) < total_size) {
4671 		btrfs_print_leaf(leaf);
4672 		btrfs_crit(fs_info, "not enough freespace need %u have %d",
4673 			   total_size, btrfs_leaf_free_space(leaf));
4674 		BUG();
4675 	}
4676 
4677 	btrfs_init_map_token(&token, leaf);
4678 	if (slot != nritems) {
4679 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4680 
4681 		if (old_data < data_end) {
4682 			btrfs_print_leaf(leaf);
4683 			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4684 				   slot, old_data, data_end);
4685 			BUG();
4686 		}
4687 		/*
4688 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4689 		 */
4690 		/* first correct the data pointers */
4691 		for (i = slot; i < nritems; i++) {
4692 			u32 ioff;
4693 
4694 			item = btrfs_item_nr(i);
4695 			ioff = btrfs_token_item_offset(&token, item);
4696 			btrfs_set_token_item_offset(&token, item,
4697 						    ioff - total_data);
4698 		}
4699 		/* shift the items */
4700 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4701 			      btrfs_item_nr_offset(slot),
4702 			      (nritems - slot) * sizeof(struct btrfs_item));
4703 
4704 		/* shift the data */
4705 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4706 			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4707 			      data_end, old_data - data_end);
4708 		data_end = old_data;
4709 	}
4710 
4711 	/* setup the item for the new data */
4712 	for (i = 0; i < nr; i++) {
4713 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4714 		btrfs_set_item_key(leaf, &disk_key, slot + i);
4715 		item = btrfs_item_nr(slot + i);
4716 		btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
4717 		data_end -= data_size[i];
4718 		btrfs_set_token_item_size(&token, item, data_size[i]);
4719 	}
4720 
4721 	btrfs_set_header_nritems(leaf, nritems + nr);
4722 	btrfs_mark_buffer_dirty(leaf);
4723 
4724 	if (btrfs_leaf_free_space(leaf) < 0) {
4725 		btrfs_print_leaf(leaf);
4726 		BUG();
4727 	}
4728 }
4729 
4730 /*
4731  * Given a key and some data, insert items into the tree.
4732  * This does all the path init required, making room in the tree if needed.
4733  */
4734 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4735 			    struct btrfs_root *root,
4736 			    struct btrfs_path *path,
4737 			    const struct btrfs_key *cpu_key, u32 *data_size,
4738 			    int nr)
4739 {
4740 	int ret = 0;
4741 	int slot;
4742 	int i;
4743 	u32 total_size = 0;
4744 	u32 total_data = 0;
4745 
4746 	for (i = 0; i < nr; i++)
4747 		total_data += data_size[i];
4748 
4749 	total_size = total_data + (nr * sizeof(struct btrfs_item));
4750 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4751 	if (ret == 0)
4752 		return -EEXIST;
4753 	if (ret < 0)
4754 		return ret;
4755 
4756 	slot = path->slots[0];
4757 	BUG_ON(slot < 0);
4758 
4759 	setup_items_for_insert(root, path, cpu_key, data_size,
4760 			       total_data, total_size, nr);
4761 	return 0;
4762 }
4763 
4764 /*
4765  * Given a key and some data, insert an item into the tree.
4766  * This does all the path init required, making room in the tree if needed.
4767  */
4768 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4769 		      const struct btrfs_key *cpu_key, void *data,
4770 		      u32 data_size)
4771 {
4772 	int ret = 0;
4773 	struct btrfs_path *path;
4774 	struct extent_buffer *leaf;
4775 	unsigned long ptr;
4776 
4777 	path = btrfs_alloc_path();
4778 	if (!path)
4779 		return -ENOMEM;
4780 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4781 	if (!ret) {
4782 		leaf = path->nodes[0];
4783 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4784 		write_extent_buffer(leaf, data, ptr, data_size);
4785 		btrfs_mark_buffer_dirty(leaf);
4786 	}
4787 	btrfs_free_path(path);
4788 	return ret;
4789 }
4790 
4791 /*
4792  * delete the pointer from a given node.
4793  *
4794  * the tree should have been previously balanced so the deletion does not
4795  * empty a node.
4796  */
4797 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4798 		    int level, int slot)
4799 {
4800 	struct extent_buffer *parent = path->nodes[level];
4801 	u32 nritems;
4802 	int ret;
4803 
4804 	nritems = btrfs_header_nritems(parent);
4805 	if (slot != nritems - 1) {
4806 		if (level) {
4807 			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4808 					nritems - slot - 1);
4809 			BUG_ON(ret < 0);
4810 		}
4811 		memmove_extent_buffer(parent,
4812 			      btrfs_node_key_ptr_offset(slot),
4813 			      btrfs_node_key_ptr_offset(slot + 1),
4814 			      sizeof(struct btrfs_key_ptr) *
4815 			      (nritems - slot - 1));
4816 	} else if (level) {
4817 		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4818 				GFP_NOFS);
4819 		BUG_ON(ret < 0);
4820 	}
4821 
4822 	nritems--;
4823 	btrfs_set_header_nritems(parent, nritems);
4824 	if (nritems == 0 && parent == root->node) {
4825 		BUG_ON(btrfs_header_level(root->node) != 1);
4826 		/* just turn the root into a leaf and break */
4827 		btrfs_set_header_level(root->node, 0);
4828 	} else if (slot == 0) {
4829 		struct btrfs_disk_key disk_key;
4830 
4831 		btrfs_node_key(parent, &disk_key, 0);
4832 		fixup_low_keys(path, &disk_key, level + 1);
4833 	}
4834 	btrfs_mark_buffer_dirty(parent);
4835 }
4836 
4837 /*
4838  * a helper function to delete the leaf pointed to by path->slots[1] and
4839  * path->nodes[1].
4840  *
4841  * This deletes the pointer in path->nodes[1] and frees the leaf
4842  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4843  *
4844  * The path must have already been setup for deleting the leaf, including
4845  * all the proper balancing.  path->nodes[1] must be locked.
4846  */
4847 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4848 				    struct btrfs_root *root,
4849 				    struct btrfs_path *path,
4850 				    struct extent_buffer *leaf)
4851 {
4852 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4853 	del_ptr(root, path, 1, path->slots[1]);
4854 
4855 	/*
4856 	 * btrfs_free_extent is expensive, we want to make sure we
4857 	 * aren't holding any locks when we call it
4858 	 */
4859 	btrfs_unlock_up_safe(path, 0);
4860 
4861 	root_sub_used(root, leaf->len);
4862 
4863 	atomic_inc(&leaf->refs);
4864 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
4865 	free_extent_buffer_stale(leaf);
4866 }
4867 /*
4868  * delete the item at the leaf level in path.  If that empties
4869  * the leaf, remove it from the tree
4870  */
4871 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4872 		    struct btrfs_path *path, int slot, int nr)
4873 {
4874 	struct btrfs_fs_info *fs_info = root->fs_info;
4875 	struct extent_buffer *leaf;
4876 	struct btrfs_item *item;
4877 	u32 last_off;
4878 	u32 dsize = 0;
4879 	int ret = 0;
4880 	int wret;
4881 	int i;
4882 	u32 nritems;
4883 
4884 	leaf = path->nodes[0];
4885 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4886 
4887 	for (i = 0; i < nr; i++)
4888 		dsize += btrfs_item_size_nr(leaf, slot + i);
4889 
4890 	nritems = btrfs_header_nritems(leaf);
4891 
4892 	if (slot + nr != nritems) {
4893 		int data_end = leaf_data_end(leaf);
4894 		struct btrfs_map_token token;
4895 
4896 		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4897 			      data_end + dsize,
4898 			      BTRFS_LEAF_DATA_OFFSET + data_end,
4899 			      last_off - data_end);
4900 
4901 		btrfs_init_map_token(&token, leaf);
4902 		for (i = slot + nr; i < nritems; i++) {
4903 			u32 ioff;
4904 
4905 			item = btrfs_item_nr(i);
4906 			ioff = btrfs_token_item_offset(&token, item);
4907 			btrfs_set_token_item_offset(&token, item, ioff + dsize);
4908 		}
4909 
4910 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4911 			      btrfs_item_nr_offset(slot + nr),
4912 			      sizeof(struct btrfs_item) *
4913 			      (nritems - slot - nr));
4914 	}
4915 	btrfs_set_header_nritems(leaf, nritems - nr);
4916 	nritems -= nr;
4917 
4918 	/* delete the leaf if we've emptied it */
4919 	if (nritems == 0) {
4920 		if (leaf == root->node) {
4921 			btrfs_set_header_level(leaf, 0);
4922 		} else {
4923 			btrfs_set_path_blocking(path);
4924 			btrfs_clean_tree_block(leaf);
4925 			btrfs_del_leaf(trans, root, path, leaf);
4926 		}
4927 	} else {
4928 		int used = leaf_space_used(leaf, 0, nritems);
4929 		if (slot == 0) {
4930 			struct btrfs_disk_key disk_key;
4931 
4932 			btrfs_item_key(leaf, &disk_key, 0);
4933 			fixup_low_keys(path, &disk_key, 1);
4934 		}
4935 
4936 		/* delete the leaf if it is mostly empty */
4937 		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4938 			/* push_leaf_left fixes the path.
4939 			 * make sure the path still points to our leaf
4940 			 * for possible call to del_ptr below
4941 			 */
4942 			slot = path->slots[1];
4943 			atomic_inc(&leaf->refs);
4944 
4945 			btrfs_set_path_blocking(path);
4946 			wret = push_leaf_left(trans, root, path, 1, 1,
4947 					      1, (u32)-1);
4948 			if (wret < 0 && wret != -ENOSPC)
4949 				ret = wret;
4950 
4951 			if (path->nodes[0] == leaf &&
4952 			    btrfs_header_nritems(leaf)) {
4953 				wret = push_leaf_right(trans, root, path, 1,
4954 						       1, 1, 0);
4955 				if (wret < 0 && wret != -ENOSPC)
4956 					ret = wret;
4957 			}
4958 
4959 			if (btrfs_header_nritems(leaf) == 0) {
4960 				path->slots[1] = slot;
4961 				btrfs_del_leaf(trans, root, path, leaf);
4962 				free_extent_buffer(leaf);
4963 				ret = 0;
4964 			} else {
4965 				/* if we're still in the path, make sure
4966 				 * we're dirty.  Otherwise, one of the
4967 				 * push_leaf functions must have already
4968 				 * dirtied this buffer
4969 				 */
4970 				if (path->nodes[0] == leaf)
4971 					btrfs_mark_buffer_dirty(leaf);
4972 				free_extent_buffer(leaf);
4973 			}
4974 		} else {
4975 			btrfs_mark_buffer_dirty(leaf);
4976 		}
4977 	}
4978 	return ret;
4979 }
4980 
4981 /*
4982  * search the tree again to find a leaf with lesser keys
4983  * returns 0 if it found something or 1 if there are no lesser leaves.
4984  * returns < 0 on io errors.
4985  *
4986  * This may release the path, and so you may lose any locks held at the
4987  * time you call it.
4988  */
4989 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4990 {
4991 	struct btrfs_key key;
4992 	struct btrfs_disk_key found_key;
4993 	int ret;
4994 
4995 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4996 
4997 	if (key.offset > 0) {
4998 		key.offset--;
4999 	} else if (key.type > 0) {
5000 		key.type--;
5001 		key.offset = (u64)-1;
5002 	} else if (key.objectid > 0) {
5003 		key.objectid--;
5004 		key.type = (u8)-1;
5005 		key.offset = (u64)-1;
5006 	} else {
5007 		return 1;
5008 	}
5009 
5010 	btrfs_release_path(path);
5011 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5012 	if (ret < 0)
5013 		return ret;
5014 	btrfs_item_key(path->nodes[0], &found_key, 0);
5015 	ret = comp_keys(&found_key, &key);
5016 	/*
5017 	 * We might have had an item with the previous key in the tree right
5018 	 * before we released our path. And after we released our path, that
5019 	 * item might have been pushed to the first slot (0) of the leaf we
5020 	 * were holding due to a tree balance. Alternatively, an item with the
5021 	 * previous key can exist as the only element of a leaf (big fat item).
5022 	 * Therefore account for these 2 cases, so that our callers (like
5023 	 * btrfs_previous_item) don't miss an existing item with a key matching
5024 	 * the previous key we computed above.
5025 	 */
5026 	if (ret <= 0)
5027 		return 0;
5028 	return 1;
5029 }
5030 
5031 /*
5032  * A helper function to walk down the tree starting at min_key, and looking
5033  * for nodes or leaves that are have a minimum transaction id.
5034  * This is used by the btree defrag code, and tree logging
5035  *
5036  * This does not cow, but it does stuff the starting key it finds back
5037  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5038  * key and get a writable path.
5039  *
5040  * This honors path->lowest_level to prevent descent past a given level
5041  * of the tree.
5042  *
5043  * min_trans indicates the oldest transaction that you are interested
5044  * in walking through.  Any nodes or leaves older than min_trans are
5045  * skipped over (without reading them).
5046  *
5047  * returns zero if something useful was found, < 0 on error and 1 if there
5048  * was nothing in the tree that matched the search criteria.
5049  */
5050 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5051 			 struct btrfs_path *path,
5052 			 u64 min_trans)
5053 {
5054 	struct extent_buffer *cur;
5055 	struct btrfs_key found_key;
5056 	int slot;
5057 	int sret;
5058 	u32 nritems;
5059 	int level;
5060 	int ret = 1;
5061 	int keep_locks = path->keep_locks;
5062 
5063 	path->keep_locks = 1;
5064 again:
5065 	cur = btrfs_read_lock_root_node(root);
5066 	level = btrfs_header_level(cur);
5067 	WARN_ON(path->nodes[level]);
5068 	path->nodes[level] = cur;
5069 	path->locks[level] = BTRFS_READ_LOCK;
5070 
5071 	if (btrfs_header_generation(cur) < min_trans) {
5072 		ret = 1;
5073 		goto out;
5074 	}
5075 	while (1) {
5076 		nritems = btrfs_header_nritems(cur);
5077 		level = btrfs_header_level(cur);
5078 		sret = btrfs_bin_search(cur, min_key, &slot);
5079 		if (sret < 0) {
5080 			ret = sret;
5081 			goto out;
5082 		}
5083 
5084 		/* at the lowest level, we're done, setup the path and exit */
5085 		if (level == path->lowest_level) {
5086 			if (slot >= nritems)
5087 				goto find_next_key;
5088 			ret = 0;
5089 			path->slots[level] = slot;
5090 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5091 			goto out;
5092 		}
5093 		if (sret && slot > 0)
5094 			slot--;
5095 		/*
5096 		 * check this node pointer against the min_trans parameters.
5097 		 * If it is too old, old, skip to the next one.
5098 		 */
5099 		while (slot < nritems) {
5100 			u64 gen;
5101 
5102 			gen = btrfs_node_ptr_generation(cur, slot);
5103 			if (gen < min_trans) {
5104 				slot++;
5105 				continue;
5106 			}
5107 			break;
5108 		}
5109 find_next_key:
5110 		/*
5111 		 * we didn't find a candidate key in this node, walk forward
5112 		 * and find another one
5113 		 */
5114 		if (slot >= nritems) {
5115 			path->slots[level] = slot;
5116 			btrfs_set_path_blocking(path);
5117 			sret = btrfs_find_next_key(root, path, min_key, level,
5118 						  min_trans);
5119 			if (sret == 0) {
5120 				btrfs_release_path(path);
5121 				goto again;
5122 			} else {
5123 				goto out;
5124 			}
5125 		}
5126 		/* save our key for returning back */
5127 		btrfs_node_key_to_cpu(cur, &found_key, slot);
5128 		path->slots[level] = slot;
5129 		if (level == path->lowest_level) {
5130 			ret = 0;
5131 			goto out;
5132 		}
5133 		btrfs_set_path_blocking(path);
5134 		cur = btrfs_read_node_slot(cur, slot);
5135 		if (IS_ERR(cur)) {
5136 			ret = PTR_ERR(cur);
5137 			goto out;
5138 		}
5139 
5140 		btrfs_tree_read_lock(cur);
5141 
5142 		path->locks[level - 1] = BTRFS_READ_LOCK;
5143 		path->nodes[level - 1] = cur;
5144 		unlock_up(path, level, 1, 0, NULL);
5145 	}
5146 out:
5147 	path->keep_locks = keep_locks;
5148 	if (ret == 0) {
5149 		btrfs_unlock_up_safe(path, path->lowest_level + 1);
5150 		btrfs_set_path_blocking(path);
5151 		memcpy(min_key, &found_key, sizeof(found_key));
5152 	}
5153 	return ret;
5154 }
5155 
5156 /*
5157  * this is similar to btrfs_next_leaf, but does not try to preserve
5158  * and fixup the path.  It looks for and returns the next key in the
5159  * tree based on the current path and the min_trans parameters.
5160  *
5161  * 0 is returned if another key is found, < 0 if there are any errors
5162  * and 1 is returned if there are no higher keys in the tree
5163  *
5164  * path->keep_locks should be set to 1 on the search made before
5165  * calling this function.
5166  */
5167 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5168 			struct btrfs_key *key, int level, u64 min_trans)
5169 {
5170 	int slot;
5171 	struct extent_buffer *c;
5172 
5173 	WARN_ON(!path->keep_locks && !path->skip_locking);
5174 	while (level < BTRFS_MAX_LEVEL) {
5175 		if (!path->nodes[level])
5176 			return 1;
5177 
5178 		slot = path->slots[level] + 1;
5179 		c = path->nodes[level];
5180 next:
5181 		if (slot >= btrfs_header_nritems(c)) {
5182 			int ret;
5183 			int orig_lowest;
5184 			struct btrfs_key cur_key;
5185 			if (level + 1 >= BTRFS_MAX_LEVEL ||
5186 			    !path->nodes[level + 1])
5187 				return 1;
5188 
5189 			if (path->locks[level + 1] || path->skip_locking) {
5190 				level++;
5191 				continue;
5192 			}
5193 
5194 			slot = btrfs_header_nritems(c) - 1;
5195 			if (level == 0)
5196 				btrfs_item_key_to_cpu(c, &cur_key, slot);
5197 			else
5198 				btrfs_node_key_to_cpu(c, &cur_key, slot);
5199 
5200 			orig_lowest = path->lowest_level;
5201 			btrfs_release_path(path);
5202 			path->lowest_level = level;
5203 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
5204 						0, 0);
5205 			path->lowest_level = orig_lowest;
5206 			if (ret < 0)
5207 				return ret;
5208 
5209 			c = path->nodes[level];
5210 			slot = path->slots[level];
5211 			if (ret == 0)
5212 				slot++;
5213 			goto next;
5214 		}
5215 
5216 		if (level == 0)
5217 			btrfs_item_key_to_cpu(c, key, slot);
5218 		else {
5219 			u64 gen = btrfs_node_ptr_generation(c, slot);
5220 
5221 			if (gen < min_trans) {
5222 				slot++;
5223 				goto next;
5224 			}
5225 			btrfs_node_key_to_cpu(c, key, slot);
5226 		}
5227 		return 0;
5228 	}
5229 	return 1;
5230 }
5231 
5232 /*
5233  * search the tree again to find a leaf with greater keys
5234  * returns 0 if it found something or 1 if there are no greater leaves.
5235  * returns < 0 on io errors.
5236  */
5237 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5238 {
5239 	return btrfs_next_old_leaf(root, path, 0);
5240 }
5241 
5242 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5243 			u64 time_seq)
5244 {
5245 	int slot;
5246 	int level;
5247 	struct extent_buffer *c;
5248 	struct extent_buffer *next;
5249 	struct btrfs_key key;
5250 	u32 nritems;
5251 	int ret;
5252 	int old_spinning = path->leave_spinning;
5253 	int next_rw_lock = 0;
5254 
5255 	nritems = btrfs_header_nritems(path->nodes[0]);
5256 	if (nritems == 0)
5257 		return 1;
5258 
5259 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5260 again:
5261 	level = 1;
5262 	next = NULL;
5263 	next_rw_lock = 0;
5264 	btrfs_release_path(path);
5265 
5266 	path->keep_locks = 1;
5267 	path->leave_spinning = 1;
5268 
5269 	if (time_seq)
5270 		ret = btrfs_search_old_slot(root, &key, path, time_seq);
5271 	else
5272 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5273 	path->keep_locks = 0;
5274 
5275 	if (ret < 0)
5276 		return ret;
5277 
5278 	nritems = btrfs_header_nritems(path->nodes[0]);
5279 	/*
5280 	 * by releasing the path above we dropped all our locks.  A balance
5281 	 * could have added more items next to the key that used to be
5282 	 * at the very end of the block.  So, check again here and
5283 	 * advance the path if there are now more items available.
5284 	 */
5285 	if (nritems > 0 && path->slots[0] < nritems - 1) {
5286 		if (ret == 0)
5287 			path->slots[0]++;
5288 		ret = 0;
5289 		goto done;
5290 	}
5291 	/*
5292 	 * So the above check misses one case:
5293 	 * - after releasing the path above, someone has removed the item that
5294 	 *   used to be at the very end of the block, and balance between leafs
5295 	 *   gets another one with bigger key.offset to replace it.
5296 	 *
5297 	 * This one should be returned as well, or we can get leaf corruption
5298 	 * later(esp. in __btrfs_drop_extents()).
5299 	 *
5300 	 * And a bit more explanation about this check,
5301 	 * with ret > 0, the key isn't found, the path points to the slot
5302 	 * where it should be inserted, so the path->slots[0] item must be the
5303 	 * bigger one.
5304 	 */
5305 	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5306 		ret = 0;
5307 		goto done;
5308 	}
5309 
5310 	while (level < BTRFS_MAX_LEVEL) {
5311 		if (!path->nodes[level]) {
5312 			ret = 1;
5313 			goto done;
5314 		}
5315 
5316 		slot = path->slots[level] + 1;
5317 		c = path->nodes[level];
5318 		if (slot >= btrfs_header_nritems(c)) {
5319 			level++;
5320 			if (level == BTRFS_MAX_LEVEL) {
5321 				ret = 1;
5322 				goto done;
5323 			}
5324 			continue;
5325 		}
5326 
5327 		if (next) {
5328 			btrfs_tree_unlock_rw(next, next_rw_lock);
5329 			free_extent_buffer(next);
5330 		}
5331 
5332 		next = c;
5333 		next_rw_lock = path->locks[level];
5334 		ret = read_block_for_search(root, path, &next, level,
5335 					    slot, &key);
5336 		if (ret == -EAGAIN)
5337 			goto again;
5338 
5339 		if (ret < 0) {
5340 			btrfs_release_path(path);
5341 			goto done;
5342 		}
5343 
5344 		if (!path->skip_locking) {
5345 			ret = btrfs_try_tree_read_lock(next);
5346 			if (!ret && time_seq) {
5347 				/*
5348 				 * If we don't get the lock, we may be racing
5349 				 * with push_leaf_left, holding that lock while
5350 				 * itself waiting for the leaf we've currently
5351 				 * locked. To solve this situation, we give up
5352 				 * on our lock and cycle.
5353 				 */
5354 				free_extent_buffer(next);
5355 				btrfs_release_path(path);
5356 				cond_resched();
5357 				goto again;
5358 			}
5359 			if (!ret) {
5360 				btrfs_set_path_blocking(path);
5361 				btrfs_tree_read_lock(next);
5362 			}
5363 			next_rw_lock = BTRFS_READ_LOCK;
5364 		}
5365 		break;
5366 	}
5367 	path->slots[level] = slot;
5368 	while (1) {
5369 		level--;
5370 		c = path->nodes[level];
5371 		if (path->locks[level])
5372 			btrfs_tree_unlock_rw(c, path->locks[level]);
5373 
5374 		free_extent_buffer(c);
5375 		path->nodes[level] = next;
5376 		path->slots[level] = 0;
5377 		if (!path->skip_locking)
5378 			path->locks[level] = next_rw_lock;
5379 		if (!level)
5380 			break;
5381 
5382 		ret = read_block_for_search(root, path, &next, level,
5383 					    0, &key);
5384 		if (ret == -EAGAIN)
5385 			goto again;
5386 
5387 		if (ret < 0) {
5388 			btrfs_release_path(path);
5389 			goto done;
5390 		}
5391 
5392 		if (!path->skip_locking) {
5393 			ret = btrfs_try_tree_read_lock(next);
5394 			if (!ret) {
5395 				btrfs_set_path_blocking(path);
5396 				btrfs_tree_read_lock(next);
5397 			}
5398 			next_rw_lock = BTRFS_READ_LOCK;
5399 		}
5400 	}
5401 	ret = 0;
5402 done:
5403 	unlock_up(path, 0, 1, 0, NULL);
5404 	path->leave_spinning = old_spinning;
5405 	if (!old_spinning)
5406 		btrfs_set_path_blocking(path);
5407 
5408 	return ret;
5409 }
5410 
5411 /*
5412  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5413  * searching until it gets past min_objectid or finds an item of 'type'
5414  *
5415  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5416  */
5417 int btrfs_previous_item(struct btrfs_root *root,
5418 			struct btrfs_path *path, u64 min_objectid,
5419 			int type)
5420 {
5421 	struct btrfs_key found_key;
5422 	struct extent_buffer *leaf;
5423 	u32 nritems;
5424 	int ret;
5425 
5426 	while (1) {
5427 		if (path->slots[0] == 0) {
5428 			btrfs_set_path_blocking(path);
5429 			ret = btrfs_prev_leaf(root, path);
5430 			if (ret != 0)
5431 				return ret;
5432 		} else {
5433 			path->slots[0]--;
5434 		}
5435 		leaf = path->nodes[0];
5436 		nritems = btrfs_header_nritems(leaf);
5437 		if (nritems == 0)
5438 			return 1;
5439 		if (path->slots[0] == nritems)
5440 			path->slots[0]--;
5441 
5442 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5443 		if (found_key.objectid < min_objectid)
5444 			break;
5445 		if (found_key.type == type)
5446 			return 0;
5447 		if (found_key.objectid == min_objectid &&
5448 		    found_key.type < type)
5449 			break;
5450 	}
5451 	return 1;
5452 }
5453 
5454 /*
5455  * search in extent tree to find a previous Metadata/Data extent item with
5456  * min objecitd.
5457  *
5458  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5459  */
5460 int btrfs_previous_extent_item(struct btrfs_root *root,
5461 			struct btrfs_path *path, u64 min_objectid)
5462 {
5463 	struct btrfs_key found_key;
5464 	struct extent_buffer *leaf;
5465 	u32 nritems;
5466 	int ret;
5467 
5468 	while (1) {
5469 		if (path->slots[0] == 0) {
5470 			btrfs_set_path_blocking(path);
5471 			ret = btrfs_prev_leaf(root, path);
5472 			if (ret != 0)
5473 				return ret;
5474 		} else {
5475 			path->slots[0]--;
5476 		}
5477 		leaf = path->nodes[0];
5478 		nritems = btrfs_header_nritems(leaf);
5479 		if (nritems == 0)
5480 			return 1;
5481 		if (path->slots[0] == nritems)
5482 			path->slots[0]--;
5483 
5484 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5485 		if (found_key.objectid < min_objectid)
5486 			break;
5487 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5488 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5489 			return 0;
5490 		if (found_key.objectid == min_objectid &&
5491 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5492 			break;
5493 	}
5494 	return 1;
5495 }
5496