1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include "ctree.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "print-tree.h"
24 #include "locking.h"
25 
26 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 		      *root, struct btrfs_path *path, int level);
28 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 		      *root, struct btrfs_key *ins_key,
30 		      struct btrfs_path *path, int data_size, int extend);
31 static int push_node_left(struct btrfs_trans_handle *trans,
32 			  struct btrfs_root *root, struct extent_buffer *dst,
33 			  struct extent_buffer *src, int empty);
34 static int balance_node_right(struct btrfs_trans_handle *trans,
35 			      struct btrfs_root *root,
36 			      struct extent_buffer *dst_buf,
37 			      struct extent_buffer *src_buf);
38 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 		   struct btrfs_path *path, int level, int slot);
40 
41 struct btrfs_path *btrfs_alloc_path(void)
42 {
43 	struct btrfs_path *path;
44 	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
45 	if (path)
46 		path->reada = 1;
47 	return path;
48 }
49 
50 /*
51  * set all locked nodes in the path to blocking locks.  This should
52  * be done before scheduling
53  */
54 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
55 {
56 	int i;
57 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
58 		if (p->nodes[i] && p->locks[i])
59 			btrfs_set_lock_blocking(p->nodes[i]);
60 	}
61 }
62 
63 /*
64  * reset all the locked nodes in the path to spinning locks.
65  *
66  * held is used to keep lockdep happy: when lockdep is enabled,
67  * we set held to a blocking lock before we go around and
68  * retake all the spinlocks in the path.  You can safely pass NULL
69  * for held.
70  */
71 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
72 					struct extent_buffer *held)
73 {
74 	int i;
75 
76 #ifdef CONFIG_DEBUG_LOCK_ALLOC
77 	/* lockdep really cares that we take all of these spinlocks
78 	 * in the right order.  If any of the locks in the path are not
79 	 * currently blocking, it is going to complain.  So, make really
80 	 * really sure by forcing the path to blocking before we clear
81 	 * the path blocking.
82 	 */
83 	if (held)
84 		btrfs_set_lock_blocking(held);
85 	btrfs_set_path_blocking(p);
86 #endif
87 
88 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89 		if (p->nodes[i] && p->locks[i])
90 			btrfs_clear_lock_blocking(p->nodes[i]);
91 	}
92 
93 #ifdef CONFIG_DEBUG_LOCK_ALLOC
94 	if (held)
95 		btrfs_clear_lock_blocking(held);
96 #endif
97 }
98 
99 /* this also releases the path */
100 void btrfs_free_path(struct btrfs_path *p)
101 {
102 	btrfs_release_path(NULL, p);
103 	kmem_cache_free(btrfs_path_cachep, p);
104 }
105 
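/*
 * Example of the typical path lifecycle for a hypothetical read-only
 * caller (error handling abbreviated):
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0) {
 *		... the item is at path->slots[0] in path->nodes[0] ...
 *	}
 *	btrfs_free_path(path);
 *
 * btrfs_free_path drops the locks and extent buffer references for us.
 */
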
106 /*
107  * path release drops references on the extent buffers in the path
108  * and it drops any locks held by this path
109  *
110  * It is safe to call this on paths where no locks or extent buffers are held.
111  */
112 noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
113 {
114 	int i;
115 
116 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
117 		p->slots[i] = 0;
118 		if (!p->nodes[i])
119 			continue;
120 		if (p->locks[i]) {
121 			btrfs_tree_unlock(p->nodes[i]);
122 			p->locks[i] = 0;
123 		}
124 		free_extent_buffer(p->nodes[i]);
125 		p->nodes[i] = NULL;
126 	}
127 }
128 
129 /*
130  * safely gets a reference on the root node of a tree.  A lock
131  * is not taken, so a concurrent writer may put a different node
132  * at the root of the tree.  See btrfs_lock_root_node for the
133  * looping required.
134  *
135  * The extent buffer returned by this has a reference taken, so
136  * it won't disappear.  It may stop being the root of the tree
137  * at any time because there are no locks held.
138  */
139 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
140 {
141 	struct extent_buffer *eb;
142 	spin_lock(&root->node_lock);
143 	eb = root->node;
144 	extent_buffer_get(eb);
145 	spin_unlock(&root->node_lock);
146 	return eb;
147 }
148 
149 /* loop around taking references on and locking the root node of the
150  * tree until you end up with a lock on the root.  A locked buffer
151  * is returned, with a reference held.
152  */
153 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
154 {
155 	struct extent_buffer *eb;
156 
157 	while (1) {
158 		eb = btrfs_root_node(root);
159 		btrfs_tree_lock(eb);
160 
161 		spin_lock(&root->node_lock);
162 		if (eb == root->node) {
163 			spin_unlock(&root->node_lock);
164 			break;
165 		}
166 		spin_unlock(&root->node_lock);
167 
168 		btrfs_tree_unlock(eb);
169 		free_extent_buffer(eb);
170 	}
171 	return eb;
172 }
173 
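/*
 * A sketch of how a hypothetical caller would use the locked root:
 *
 *	struct extent_buffer *eb;
 *
 *	eb = btrfs_lock_root_node(root);
 *	... eb is guaranteed to still be the root while we hold the lock ...
 *	btrfs_tree_unlock(eb);
 *	free_extent_buffer(eb);
 *
 * The unlock and the reference drop are both the caller's job.
 */
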
174 /* cowonly roots (everything not a reference counted cow subvolume) just get
175  * put onto a simple dirty list.  transaction.c walks this to make sure they
176  * get properly updated on disk.
177  */
178 static void add_root_to_dirty_list(struct btrfs_root *root)
179 {
180 	if (root->track_dirty && list_empty(&root->dirty_list)) {
181 		list_add(&root->dirty_list,
182 			 &root->fs_info->dirty_cowonly_roots);
183 	}
184 }
185 
186 /*
187  * used by snapshot creation to make a copy of a root for a tree with
188  * a given objectid.  The buffer with the new root node is returned in
189  * cow_ret, and this func returns zero on success or a negative error code.
190  */
191 int btrfs_copy_root(struct btrfs_trans_handle *trans,
192 		      struct btrfs_root *root,
193 		      struct extent_buffer *buf,
194 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
195 {
196 	struct extent_buffer *cow;
197 	u32 nritems;
198 	int ret = 0;
199 	int level;
200 	struct btrfs_root *new_root;
201 
202 	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
203 	if (!new_root)
204 		return -ENOMEM;
205 
206 	memcpy(new_root, root, sizeof(*new_root));
207 	new_root->root_key.objectid = new_root_objectid;
208 
209 	WARN_ON(root->ref_cows && trans->transid !=
210 		root->fs_info->running_transaction->transid);
211 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
212 
213 	level = btrfs_header_level(buf);
214 	nritems = btrfs_header_nritems(buf);
215 
216 	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
217 				     new_root_objectid, trans->transid,
218 				     level, buf->start, 0);
219 	if (IS_ERR(cow)) {
220 		kfree(new_root);
221 		return PTR_ERR(cow);
222 	}
223 
224 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
225 	btrfs_set_header_bytenr(cow, cow->start);
226 	btrfs_set_header_generation(cow, trans->transid);
227 	btrfs_set_header_owner(cow, new_root_objectid);
228 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
229 
230 	write_extent_buffer(cow, root->fs_info->fsid,
231 			    (unsigned long)btrfs_header_fsid(cow),
232 			    BTRFS_FSID_SIZE);
233 
234 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
235 	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
236 	kfree(new_root);
237 
238 	if (ret)
239 		return ret;
240 
241 	btrfs_mark_buffer_dirty(cow);
242 	*cow_ret = cow;
243 	return 0;
244 }
245 
246 /*
247  * does the dirty work in cow of a single block.  The parent block (if
248  * supplied) is updated to point to the new cow copy.  The new buffer is marked
249  * dirty and returned locked.  If you modify the block it needs to be marked
250  * dirty again.
251  *
252  * search_start -- an allocation hint for the new block
253  *
254  * empty_size -- a hint that you plan on doing more cow.  This is the size in
255  * bytes the allocator should try to find free next to the block it returns.
256  * This is just a hint and may be ignored by the allocator.
257  */
258 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
259 			     struct btrfs_root *root,
260 			     struct extent_buffer *buf,
261 			     struct extent_buffer *parent, int parent_slot,
262 			     struct extent_buffer **cow_ret,
263 			     u64 search_start, u64 empty_size)
264 {
265 	u64 parent_start;
266 	struct extent_buffer *cow;
267 	u32 nritems;
268 	int ret = 0;
269 	int level;
270 	int unlock_orig = 0;
271 
272 	if (*cow_ret == buf)
273 		unlock_orig = 1;
274 
275 	btrfs_assert_tree_locked(buf);
276 
277 	if (parent)
278 		parent_start = parent->start;
279 	else
280 		parent_start = 0;
281 
282 	WARN_ON(root->ref_cows && trans->transid !=
283 		root->fs_info->running_transaction->transid);
284 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
285 
286 	level = btrfs_header_level(buf);
287 	nritems = btrfs_header_nritems(buf);
288 
289 	cow = btrfs_alloc_free_block(trans, root, buf->len,
290 				     parent_start, root->root_key.objectid,
291 				     trans->transid, level,
292 				     search_start, empty_size);
293 	if (IS_ERR(cow))
294 		return PTR_ERR(cow);
295 
296 	/* cow is set to blocking by btrfs_init_new_buffer */
297 
298 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
299 	btrfs_set_header_bytenr(cow, cow->start);
300 	btrfs_set_header_generation(cow, trans->transid);
301 	btrfs_set_header_owner(cow, root->root_key.objectid);
302 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
303 
304 	write_extent_buffer(cow, root->fs_info->fsid,
305 			    (unsigned long)btrfs_header_fsid(cow),
306 			    BTRFS_FSID_SIZE);
307 
308 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
309 	if (btrfs_header_generation(buf) != trans->transid) {
310 		u32 nr_extents;
311 		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
312 		if (ret)
313 			return ret;
314 
315 		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
316 		WARN_ON(ret);
317 	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
318 		/*
319 		 * There are only two places that can drop a reference to
320 		 * tree blocks owned by living reloc trees, one is here,
321 		 * the other place is btrfs_drop_subtree. In both places,
322 		 * we check reference count while tree block is locked.
323 		 * Furthermore, if reference count is one, it won't get
324 		 * increased by someone else.
325 		 */
326 		u32 refs;
327 		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
328 					      buf->len, &refs);
329 		BUG_ON(ret);
330 		if (refs == 1) {
331 			ret = btrfs_update_ref(trans, root, buf, cow,
332 					       0, nritems);
333 			clean_tree_block(trans, root, buf);
334 		} else {
335 			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
336 		}
337 		BUG_ON(ret);
338 	} else {
339 		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
340 		if (ret)
341 			return ret;
342 		clean_tree_block(trans, root, buf);
343 	}
344 
345 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
346 		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
347 		WARN_ON(ret);
348 	}
349 
350 	if (buf == root->node) {
351 		WARN_ON(parent && parent != buf);
352 
353 		spin_lock(&root->node_lock);
354 		root->node = cow;
355 		extent_buffer_get(cow);
356 		spin_unlock(&root->node_lock);
357 
358 		if (buf != root->commit_root) {
359 			btrfs_free_extent(trans, root, buf->start,
360 					  buf->len, buf->start,
361 					  root->root_key.objectid,
362 					  btrfs_header_generation(buf),
363 					  level, 1);
364 		}
365 		free_extent_buffer(buf);
366 		add_root_to_dirty_list(root);
367 	} else {
368 		btrfs_set_node_blockptr(parent, parent_slot,
369 					cow->start);
370 		WARN_ON(trans->transid == 0);
371 		btrfs_set_node_ptr_generation(parent, parent_slot,
372 					      trans->transid);
373 		btrfs_mark_buffer_dirty(parent);
374 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
375 		btrfs_free_extent(trans, root, buf->start, buf->len,
376 				  parent_start, btrfs_header_owner(parent),
377 				  btrfs_header_generation(parent), level, 1);
378 	}
379 	if (unlock_orig)
380 		btrfs_tree_unlock(buf);
381 	free_extent_buffer(buf);
382 	btrfs_mark_buffer_dirty(cow);
383 	*cow_ret = cow;
384 	return 0;
385 }
386 
387 /*
388  * cows a single block, see __btrfs_cow_block for the real work.
389  * This version of it has extra checks so that a block isn't cow'd more than
390  * once per transaction, as long as it hasn't been written yet
391  */
392 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
393 		    struct btrfs_root *root, struct extent_buffer *buf,
394 		    struct extent_buffer *parent, int parent_slot,
395 		    struct extent_buffer **cow_ret)
396 {
397 	u64 search_start;
398 	int ret;
399 
400 	if (trans->transaction != root->fs_info->running_transaction) {
401 		printk(KERN_CRIT "trans %llu running %llu\n",
402 		       (unsigned long long)trans->transid,
403 		       (unsigned long long)
404 		       root->fs_info->running_transaction->transid);
405 		WARN_ON(1);
406 	}
407 	if (trans->transid != root->fs_info->generation) {
408 		printk(KERN_CRIT "trans %llu running %llu\n",
409 		       (unsigned long long)trans->transid,
410 		       (unsigned long long)root->fs_info->generation);
411 		WARN_ON(1);
412 	}
413 
414 	if (btrfs_header_generation(buf) == trans->transid &&
415 	    btrfs_header_owner(buf) == root->root_key.objectid &&
416 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
417 		*cow_ret = buf;
418 		return 0;
419 	}
420 
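	/*
	 * hint for the allocator: try to keep the new copy in the same
	 * 1GB-aligned region as the original block
	 */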
421 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
422 
423 	if (parent)
424 		btrfs_set_lock_blocking(parent);
425 	btrfs_set_lock_blocking(buf);
426 
427 	ret = __btrfs_cow_block(trans, root, buf, parent,
428 				 parent_slot, cow_ret, search_start, 0);
429 	return ret;
430 }
431 
432 /*
433  * helper function for defrag to decide if two blocks pointed to by a
434  * node are actually close by
435  */
436 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
437 {
438 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
439 		return 1;
440 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
441 		return 1;
442 	return 0;
443 }
444 
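/*
 * A worked example, assuming a 4K blocksize: blocknr = 0 and
 * other = 20480 gives other - (blocknr + blocksize) = 16384, which is
 * under the 32768 cutoff, so the blocks are close.  With
 * other = 65536 the gap is 61440 and they are not.
 */
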
445 /*
446  * compare two keys in a memcmp fashion
447  */
448 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
449 {
450 	struct btrfs_key k1;
451 
452 	btrfs_disk_key_to_cpu(&k1, disk);
453 
454 	if (k1.objectid > k2->objectid)
455 		return 1;
456 	if (k1.objectid < k2->objectid)
457 		return -1;
458 	if (k1.type > k2->type)
459 		return 1;
460 	if (k1.type < k2->type)
461 		return -1;
462 	if (k1.offset > k2->offset)
463 		return 1;
464 	if (k1.offset < k2->offset)
465 		return -1;
466 	return 0;
467 }
468 
469 /*
470  * same as comp_keys only with two btrfs_key's
471  */
472 static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
473 {
474 	if (k1->objectid > k2->objectid)
475 		return 1;
476 	if (k1->objectid < k2->objectid)
477 		return -1;
478 	if (k1->type > k2->type)
479 		return 1;
480 	if (k1->type < k2->type)
481 		return -1;
482 	if (k1->offset > k2->offset)
483 		return 1;
484 	if (k1->offset < k2->offset)
485 		return -1;
486 	return 0;
487 }
488 
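/*
 * Keys sort first by objectid, then by type, then by offset.  For
 * example (256, BTRFS_INODE_ITEM_KEY, 0) sorts before
 * (256, BTRFS_INODE_REF_KEY, 0) because the type is smaller, and both
 * sort before (257, BTRFS_INODE_ITEM_KEY, 0) regardless of offset.
 */
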
489 /*
490  * this is used by the defrag code to go through all the
491  * leaves pointed to by a node and reallocate them so that
492  * disk order is close to key order
493  */
494 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
495 		       struct btrfs_root *root, struct extent_buffer *parent,
496 		       int start_slot, int cache_only, u64 *last_ret,
497 		       struct btrfs_key *progress)
498 {
499 	struct extent_buffer *cur;
500 	u64 blocknr;
501 	u64 gen;
502 	u64 search_start = *last_ret;
503 	u64 last_block = 0;
504 	u64 other;
505 	u32 parent_nritems;
506 	int end_slot;
507 	int i;
508 	int err = 0;
509 	int parent_level;
510 	int uptodate;
511 	u32 blocksize;
512 	int progress_passed = 0;
513 	struct btrfs_disk_key disk_key;
514 
515 	parent_level = btrfs_header_level(parent);
516 	if (cache_only && parent_level != 1)
517 		return 0;
518 
519 	if (trans->transaction != root->fs_info->running_transaction)
520 		WARN_ON(1);
521 	if (trans->transid != root->fs_info->generation)
522 		WARN_ON(1);
523 
524 	parent_nritems = btrfs_header_nritems(parent);
525 	blocksize = btrfs_level_size(root, parent_level - 1);
526 	end_slot = parent_nritems;
527 
528 	if (parent_nritems == 1)
529 		return 0;
530 
531 	btrfs_set_lock_blocking(parent);
532 
533 	for (i = start_slot; i < end_slot; i++) {
534 		int close = 1;
535 
536 		if (!parent->map_token) {
537 			map_extent_buffer(parent,
538 					btrfs_node_key_ptr_offset(i),
539 					sizeof(struct btrfs_key_ptr),
540 					&parent->map_token, &parent->kaddr,
541 					&parent->map_start, &parent->map_len,
542 					KM_USER1);
543 		}
544 		btrfs_node_key(parent, &disk_key, i);
545 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
546 			continue;
547 
548 		progress_passed = 1;
549 		blocknr = btrfs_node_blockptr(parent, i);
550 		gen = btrfs_node_ptr_generation(parent, i);
551 		if (last_block == 0)
552 			last_block = blocknr;
553 
554 		if (i > 0) {
555 			other = btrfs_node_blockptr(parent, i - 1);
556 			close = close_blocks(blocknr, other, blocksize);
557 		}
558 		if (!close && i < end_slot - 2) {
559 			other = btrfs_node_blockptr(parent, i + 1);
560 			close = close_blocks(blocknr, other, blocksize);
561 		}
562 		if (close) {
563 			last_block = blocknr;
564 			continue;
565 		}
566 		if (parent->map_token) {
567 			unmap_extent_buffer(parent, parent->map_token,
568 					    KM_USER1);
569 			parent->map_token = NULL;
570 		}
571 
572 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
573 		if (cur)
574 			uptodate = btrfs_buffer_uptodate(cur, gen);
575 		else
576 			uptodate = 0;
577 		if (!cur || !uptodate) {
578 			if (cache_only) {
579 				free_extent_buffer(cur);
580 				continue;
581 			}
582 			if (!cur) {
583 				cur = read_tree_block(root, blocknr,
584 							 blocksize, gen);
585 			} else if (!uptodate) {
586 				btrfs_read_buffer(cur, gen);
587 			}
588 		}
589 		if (search_start == 0)
590 			search_start = last_block;
591 
592 		btrfs_tree_lock(cur);
593 		btrfs_set_lock_blocking(cur);
594 		err = __btrfs_cow_block(trans, root, cur, parent, i,
595 					&cur, search_start,
596 					min(16 * blocksize,
597 					    (end_slot - i) * blocksize));
598 		if (err) {
599 			btrfs_tree_unlock(cur);
600 			free_extent_buffer(cur);
601 			break;
602 		}
603 		search_start = cur->start;
604 		last_block = cur->start;
605 		*last_ret = search_start;
606 		btrfs_tree_unlock(cur);
607 		free_extent_buffer(cur);
608 	}
609 	if (parent->map_token) {
610 		unmap_extent_buffer(parent, parent->map_token,
611 				    KM_USER1);
612 		parent->map_token = NULL;
613 	}
614 	return err;
615 }
616 
617 /*
618  * The leaf data grows from end-to-front in the node.
619  * This returns the address of the start of the last item,
620  * which is the stop of the leaf data stack.
621  */
622 static inline unsigned int leaf_data_end(struct btrfs_root *root,
623 					 struct extent_buffer *leaf)
624 {
625 	u32 nr = btrfs_header_nritems(leaf);
626 	if (nr == 0)
627 		return BTRFS_LEAF_DATA_SIZE(root);
628 	return btrfs_item_offset_nr(leaf, nr - 1);
629 }
630 
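/*
 * Rough sketch of the leaf layout this relies on (not to scale):
 *
 *	[header][item 0][item 1]..[free space]..[data 1][data 0]
 *
 * The fixed size struct btrfs_item array grows from the front while
 * the variable sized item data grows back from the end, and
 * leaf_data_end() is where the data stack currently stops.
 */
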
631 /*
632  * extra debugging checks to make sure all the items in a node are
633  * well formed and in the proper order
634  */
635 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
636 		      int level)
637 {
638 	struct extent_buffer *parent = NULL;
639 	struct extent_buffer *node = path->nodes[level];
640 	struct btrfs_disk_key parent_key;
641 	struct btrfs_disk_key node_key;
642 	int parent_slot;
643 	int slot;
644 	struct btrfs_key cpukey;
645 	u32 nritems = btrfs_header_nritems(node);
646 
647 	if (path->nodes[level + 1])
648 		parent = path->nodes[level + 1];
649 
650 	slot = path->slots[level];
651 	BUG_ON(nritems == 0);
652 	if (parent) {
653 		parent_slot = path->slots[level + 1];
654 		btrfs_node_key(parent, &parent_key, parent_slot);
655 		btrfs_node_key(node, &node_key, 0);
656 		BUG_ON(memcmp(&parent_key, &node_key,
657 			      sizeof(struct btrfs_disk_key)));
658 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
659 		       btrfs_header_bytenr(node));
660 	}
661 	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
662 	if (slot != 0) {
663 		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
664 		btrfs_node_key(node, &node_key, slot);
665 		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
666 	}
667 	if (slot < nritems - 1) {
668 		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
669 		btrfs_node_key(node, &node_key, slot);
670 		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
671 	}
672 	return 0;
673 }
674 
675 /*
676  * extra checking to make sure all the items in a leaf are
677  * well formed and in the proper order
678  */
679 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
680 		      int level)
681 {
682 	struct extent_buffer *leaf = path->nodes[level];
683 	struct extent_buffer *parent = NULL;
684 	int parent_slot;
685 	struct btrfs_key cpukey;
686 	struct btrfs_disk_key parent_key;
687 	struct btrfs_disk_key leaf_key;
688 	int slot = path->slots[0];
689 
690 	u32 nritems = btrfs_header_nritems(leaf);
691 
692 	if (path->nodes[level + 1])
693 		parent = path->nodes[level + 1];
694 
695 	if (nritems == 0)
696 		return 0;
697 
698 	if (parent) {
699 		parent_slot = path->slots[level + 1];
700 		btrfs_node_key(parent, &parent_key, parent_slot);
701 		btrfs_item_key(leaf, &leaf_key, 0);
702 
703 		BUG_ON(memcmp(&parent_key, &leaf_key,
704 		       sizeof(struct btrfs_disk_key)));
705 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
706 		       btrfs_header_bytenr(leaf));
707 	}
708 	if (slot != 0 && slot < nritems - 1) {
709 		btrfs_item_key(leaf, &leaf_key, slot);
710 		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
711 		if (comp_keys(&leaf_key, &cpukey) <= 0) {
712 			btrfs_print_leaf(root, leaf);
713 			printk(KERN_CRIT "slot %d offset bad key\n", slot);
714 			BUG_ON(1);
715 		}
716 		if (btrfs_item_offset_nr(leaf, slot - 1) !=
717 		       btrfs_item_end_nr(leaf, slot)) {
718 			btrfs_print_leaf(root, leaf);
719 			printk(KERN_CRIT "slot %d offset bad\n", slot);
720 			BUG_ON(1);
721 		}
722 	}
723 	if (slot < nritems - 1) {
724 		btrfs_item_key(leaf, &leaf_key, slot);
725 		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
726 		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
727 		if (btrfs_item_offset_nr(leaf, slot) !=
728 			btrfs_item_end_nr(leaf, slot + 1)) {
729 			btrfs_print_leaf(root, leaf);
730 			printk(KERN_CRIT "slot %d offset bad\n", slot);
731 			BUG_ON(1);
732 		}
733 	}
734 	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
735 	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
736 	return 0;
737 }
738 
739 static noinline int check_block(struct btrfs_root *root,
740 				struct btrfs_path *path, int level)
741 {
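	/* the expensive leaf/node sanity checks below are currently disabled */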
742 	return 0;
743 	if (level == 0)
744 		return check_leaf(root, path, level);
745 	return check_node(root, path, level);
746 }
747 
748 /*
749  * search for key in the extent_buffer.  The items start at offset p,
750  * and they are item_size apart.  There are 'max' items in p.
751  *
752  * the slot in the array is returned via slot, and it points to
753  * the place where you would insert key if it is not found in
754  * the array.
755  *
756  * slot may point to max if the key is bigger than all of the keys
757  */
758 static noinline int generic_bin_search(struct extent_buffer *eb,
759 				       unsigned long p,
760 				       int item_size, struct btrfs_key *key,
761 				       int max, int *slot)
762 {
763 	int low = 0;
764 	int high = max;
765 	int mid;
766 	int ret;
767 	struct btrfs_disk_key *tmp = NULL;
768 	struct btrfs_disk_key unaligned;
769 	unsigned long offset;
770 	char *map_token = NULL;
771 	char *kaddr = NULL;
772 	unsigned long map_start = 0;
773 	unsigned long map_len = 0;
774 	int err;
775 
776 	while (low < high) {
777 		mid = (low + high) / 2;
778 		offset = p + mid * item_size;
779 
780 		if (!map_token || offset < map_start ||
781 		    (offset + sizeof(struct btrfs_disk_key)) >
782 		    map_start + map_len) {
783 			if (map_token) {
784 				unmap_extent_buffer(eb, map_token, KM_USER0);
785 				map_token = NULL;
786 			}
787 
788 			err = map_private_extent_buffer(eb, offset,
789 						sizeof(struct btrfs_disk_key),
790 						&map_token, &kaddr,
791 						&map_start, &map_len, KM_USER0);
792 
793 			if (!err) {
794 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
795 							map_start);
796 			} else {
797 				read_extent_buffer(eb, &unaligned,
798 						   offset, sizeof(unaligned));
799 				tmp = &unaligned;
800 			}
801 
802 		} else {
803 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
804 							map_start);
805 		}
806 		ret = comp_keys(tmp, key);
807 
808 		if (ret < 0)
809 			low = mid + 1;
810 		else if (ret > 0)
811 			high = mid;
812 		else {
813 			*slot = mid;
814 			if (map_token)
815 				unmap_extent_buffer(eb, map_token, KM_USER0);
816 			return 0;
817 		}
818 	}
819 	*slot = low;
820 	if (map_token)
821 		unmap_extent_buffer(eb, map_token, KM_USER0);
822 	return 1;
823 }
824 
825 /*
826  * simple bin_search frontend that does the right thing for
827  * leaves vs nodes
828  */
829 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
830 		      int level, int *slot)
831 {
832 	if (level == 0) {
833 		return generic_bin_search(eb,
834 					  offsetof(struct btrfs_leaf, items),
835 					  sizeof(struct btrfs_item),
836 					  key, btrfs_header_nritems(eb),
837 					  slot);
838 	} else {
839 		return generic_bin_search(eb,
840 					  offsetof(struct btrfs_node, ptrs),
841 					  sizeof(struct btrfs_key_ptr),
842 					  key, btrfs_header_nritems(eb),
843 					  slot);
844 	}
845 	return -1;
846 }
847 
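/*
 * Interpreting the result: in a node with keys (1, t, 0), (3, t, 0)
 * and (5, t, 0), searching for (3, t, 0) returns 0 with *slot == 1,
 * while searching for (4, t, 0) returns 1 with *slot == 2, the slot
 * where that key would be inserted.
 */
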
848 /* given a node and slot number, this reads the block it points to.  The
849  * extent buffer is returned with a reference taken (but unlocked).
850  * NULL is returned on error.
851  */
852 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
853 				   struct extent_buffer *parent, int slot)
854 {
855 	int level = btrfs_header_level(parent);
856 	if (slot < 0)
857 		return NULL;
858 	if (slot >= btrfs_header_nritems(parent))
859 		return NULL;
860 
861 	BUG_ON(level == 0);
862 
863 	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
864 		       btrfs_level_size(root, level - 1),
865 		       btrfs_node_ptr_generation(parent, slot));
866 }
867 
868 /*
869  * node level balancing, used to make sure nodes are in proper order for
870  * item deletion.  We balance from the top down, so we have to make sure
871  * that a deletion won't leave a node completely empty later on.
872  */
873 static noinline int balance_level(struct btrfs_trans_handle *trans,
874 			 struct btrfs_root *root,
875 			 struct btrfs_path *path, int level)
876 {
877 	struct extent_buffer *right = NULL;
878 	struct extent_buffer *mid;
879 	struct extent_buffer *left = NULL;
880 	struct extent_buffer *parent = NULL;
881 	int ret = 0;
882 	int wret;
883 	int pslot;
884 	int orig_slot = path->slots[level];
885 	int err_on_enospc = 0;
886 	u64 orig_ptr;
887 
888 	if (level == 0)
889 		return 0;
890 
891 	mid = path->nodes[level];
892 
893 	WARN_ON(!path->locks[level]);
894 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
895 
896 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
897 
898 	if (level < BTRFS_MAX_LEVEL - 1)
899 		parent = path->nodes[level + 1];
900 	pslot = path->slots[level + 1];
901 
902 	/*
903 	 * deal with the case where there is only one pointer in the root
904 	 * by promoting the node below to a root
905 	 */
906 	if (!parent) {
907 		struct extent_buffer *child;
908 
909 		if (btrfs_header_nritems(mid) != 1)
910 			return 0;
911 
912 		/* promote the child to a root */
913 		child = read_node_slot(root, mid, 0);
914 		BUG_ON(!child);
915 		btrfs_tree_lock(child);
916 		btrfs_set_lock_blocking(child);
917 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
918 		BUG_ON(ret);
919 
920 		spin_lock(&root->node_lock);
921 		root->node = child;
922 		spin_unlock(&root->node_lock);
923 
924 		ret = btrfs_update_extent_ref(trans, root, child->start,
925 					      child->len,
926 					      mid->start, child->start,
927 					      root->root_key.objectid,
928 					      trans->transid, level - 1);
929 		BUG_ON(ret);
930 
931 		add_root_to_dirty_list(root);
932 		btrfs_tree_unlock(child);
933 
934 		path->locks[level] = 0;
935 		path->nodes[level] = NULL;
936 		clean_tree_block(trans, root, mid);
937 		btrfs_tree_unlock(mid);
938 		/* once for the path */
939 		free_extent_buffer(mid);
940 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
941 					mid->start, root->root_key.objectid,
942 					btrfs_header_generation(mid),
943 					level, 1);
944 		/* once for the root ptr */
945 		free_extent_buffer(mid);
946 		return ret;
947 	}
948 	if (btrfs_header_nritems(mid) >
949 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
950 		return 0;
951 
952 	if (trans->transaction->delayed_refs.flushing &&
953 	    btrfs_header_nritems(mid) > 2)
954 		return 0;
955 
956 	if (btrfs_header_nritems(mid) < 2)
957 		err_on_enospc = 1;
958 
959 	left = read_node_slot(root, parent, pslot - 1);
960 	if (left) {
961 		btrfs_tree_lock(left);
962 		btrfs_set_lock_blocking(left);
963 		wret = btrfs_cow_block(trans, root, left,
964 				       parent, pslot - 1, &left);
965 		if (wret) {
966 			ret = wret;
967 			goto enospc;
968 		}
969 	}
970 	right = read_node_slot(root, parent, pslot + 1);
971 	if (right) {
972 		btrfs_tree_lock(right);
973 		btrfs_set_lock_blocking(right);
974 		wret = btrfs_cow_block(trans, root, right,
975 				       parent, pslot + 1, &right);
976 		if (wret) {
977 			ret = wret;
978 			goto enospc;
979 		}
980 	}
981 
982 	/* first, try to make some room in the middle buffer */
983 	if (left) {
984 		orig_slot += btrfs_header_nritems(left);
985 		wret = push_node_left(trans, root, left, mid, 1);
986 		if (wret < 0)
987 			ret = wret;
988 		if (btrfs_header_nritems(mid) < 2)
989 			err_on_enospc = 1;
990 	}
991 
992 	/*
993 	 * then try to empty the right most buffer into the middle
994 	 */
995 	if (right) {
996 		wret = push_node_left(trans, root, mid, right, 1);
997 		if (wret < 0 && wret != -ENOSPC)
998 			ret = wret;
999 		if (btrfs_header_nritems(right) == 0) {
1000 			u64 bytenr = right->start;
1001 			u64 generation = btrfs_header_generation(parent);
1002 			u32 blocksize = right->len;
1003 
1004 			clean_tree_block(trans, root, right);
1005 			btrfs_tree_unlock(right);
1006 			free_extent_buffer(right);
1007 			right = NULL;
1008 			wret = del_ptr(trans, root, path, level + 1, pslot +
1009 				       1);
1010 			if (wret)
1011 				ret = wret;
1012 			wret = btrfs_free_extent(trans, root, bytenr,
1013 						 blocksize, parent->start,
1014 						 btrfs_header_owner(parent),
1015 						 generation, level, 1);
1016 			if (wret)
1017 				ret = wret;
1018 		} else {
1019 			struct btrfs_disk_key right_key;
1020 			btrfs_node_key(right, &right_key, 0);
1021 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1022 			btrfs_mark_buffer_dirty(parent);
1023 		}
1024 	}
1025 	if (btrfs_header_nritems(mid) == 1) {
1026 		/*
1027 		 * we're not allowed to leave a node with one item in the
1028 		 * tree during a delete.  A deletion from lower in the tree
1029 		 * could try to delete the only pointer in this node.
1030 		 * So, pull some keys from the left.
1031 		 * There has to be a left pointer at this point because
1032 		 * otherwise we would have pulled some pointers from the
1033 		 * right
1034 		 */
1035 		BUG_ON(!left);
1036 		wret = balance_node_right(trans, root, mid, left);
1037 		if (wret < 0) {
1038 			ret = wret;
1039 			goto enospc;
1040 		}
1041 		if (wret == 1) {
1042 			wret = push_node_left(trans, root, left, mid, 1);
1043 			if (wret < 0)
1044 				ret = wret;
1045 		}
1046 		BUG_ON(wret == 1);
1047 	}
1048 	if (btrfs_header_nritems(mid) == 0) {
1049 		/* we've managed to empty the middle node, drop it */
1050 		u64 root_gen = btrfs_header_generation(parent);
1051 		u64 bytenr = mid->start;
1052 		u32 blocksize = mid->len;
1053 
1054 		clean_tree_block(trans, root, mid);
1055 		btrfs_tree_unlock(mid);
1056 		free_extent_buffer(mid);
1057 		mid = NULL;
1058 		wret = del_ptr(trans, root, path, level + 1, pslot);
1059 		if (wret)
1060 			ret = wret;
1061 		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1062 					 parent->start,
1063 					 btrfs_header_owner(parent),
1064 					 root_gen, level, 1);
1065 		if (wret)
1066 			ret = wret;
1067 	} else {
1068 		/* update the parent key to reflect our changes */
1069 		struct btrfs_disk_key mid_key;
1070 		btrfs_node_key(mid, &mid_key, 0);
1071 		btrfs_set_node_key(parent, &mid_key, pslot);
1072 		btrfs_mark_buffer_dirty(parent);
1073 	}
1074 
1075 	/* update the path */
1076 	if (left) {
1077 		if (btrfs_header_nritems(left) > orig_slot) {
1078 			extent_buffer_get(left);
1079 			/* left was locked after cow */
1080 			path->nodes[level] = left;
1081 			path->slots[level + 1] -= 1;
1082 			path->slots[level] = orig_slot;
1083 			if (mid) {
1084 				btrfs_tree_unlock(mid);
1085 				free_extent_buffer(mid);
1086 			}
1087 		} else {
1088 			orig_slot -= btrfs_header_nritems(left);
1089 			path->slots[level] = orig_slot;
1090 		}
1091 	}
1092 	/* double check we haven't messed things up */
1093 	check_block(root, path, level);
1094 	if (orig_ptr !=
1095 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1096 		BUG();
1097 enospc:
1098 	if (right) {
1099 		btrfs_tree_unlock(right);
1100 		free_extent_buffer(right);
1101 	}
1102 	if (left) {
1103 		if (path->nodes[level] != left)
1104 			btrfs_tree_unlock(left);
1105 		free_extent_buffer(left);
1106 	}
1107 	return ret;
1108 }
1109 
1110 /* Node balancing for insertion.  Here we only split or push nodes around
1111  * when they are completely full.  This is also done top down, so we
1112  * have to be pessimistic.
1113  */
1114 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1115 					  struct btrfs_root *root,
1116 					  struct btrfs_path *path, int level)
1117 {
1118 	struct extent_buffer *right = NULL;
1119 	struct extent_buffer *mid;
1120 	struct extent_buffer *left = NULL;
1121 	struct extent_buffer *parent = NULL;
1122 	int ret = 0;
1123 	int wret;
1124 	int pslot;
1125 	int orig_slot = path->slots[level];
1126 	u64 orig_ptr;
1127 
1128 	if (level == 0)
1129 		return 1;
1130 
1131 	mid = path->nodes[level];
1132 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1133 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1134 
1135 	if (level < BTRFS_MAX_LEVEL - 1)
1136 		parent = path->nodes[level + 1];
1137 	pslot = path->slots[level + 1];
1138 
1139 	if (!parent)
1140 		return 1;
1141 
1142 	left = read_node_slot(root, parent, pslot - 1);
1143 
1144 	/* first, try to make some room in the middle buffer */
1145 	if (left) {
1146 		u32 left_nr;
1147 
1148 		btrfs_tree_lock(left);
1149 		btrfs_set_lock_blocking(left);
1150 
1151 		left_nr = btrfs_header_nritems(left);
1152 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1153 			wret = 1;
1154 		} else {
1155 			ret = btrfs_cow_block(trans, root, left, parent,
1156 					      pslot - 1, &left);
1157 			if (ret)
1158 				wret = 1;
1159 			else {
1160 				wret = push_node_left(trans, root,
1161 						      left, mid, 0);
1162 			}
1163 		}
1164 		if (wret < 0)
1165 			ret = wret;
1166 		if (wret == 0) {
1167 			struct btrfs_disk_key disk_key;
1168 			orig_slot += left_nr;
1169 			btrfs_node_key(mid, &disk_key, 0);
1170 			btrfs_set_node_key(parent, &disk_key, pslot);
1171 			btrfs_mark_buffer_dirty(parent);
1172 			if (btrfs_header_nritems(left) > orig_slot) {
1173 				path->nodes[level] = left;
1174 				path->slots[level + 1] -= 1;
1175 				path->slots[level] = orig_slot;
1176 				btrfs_tree_unlock(mid);
1177 				free_extent_buffer(mid);
1178 			} else {
1179 				orig_slot -=
1180 					btrfs_header_nritems(left);
1181 				path->slots[level] = orig_slot;
1182 				btrfs_tree_unlock(left);
1183 				free_extent_buffer(left);
1184 			}
1185 			return 0;
1186 		}
1187 		btrfs_tree_unlock(left);
1188 		free_extent_buffer(left);
1189 	}
1190 	right = read_node_slot(root, parent, pslot + 1);
1191 
1192 	/*
1193 	 * then try to empty the right most buffer into the middle
1194 	 */
1195 	if (right) {
1196 		u32 right_nr;
1197 
1198 		btrfs_tree_lock(right);
1199 		btrfs_set_lock_blocking(right);
1200 
1201 		right_nr = btrfs_header_nritems(right);
1202 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1203 			wret = 1;
1204 		} else {
1205 			ret = btrfs_cow_block(trans, root, right,
1206 					      parent, pslot + 1,
1207 					      &right);
1208 			if (ret)
1209 				wret = 1;
1210 			else {
1211 				wret = balance_node_right(trans, root,
1212 							  right, mid);
1213 			}
1214 		}
1215 		if (wret < 0)
1216 			ret = wret;
1217 		if (wret == 0) {
1218 			struct btrfs_disk_key disk_key;
1219 
1220 			btrfs_node_key(right, &disk_key, 0);
1221 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
1222 			btrfs_mark_buffer_dirty(parent);
1223 
1224 			if (btrfs_header_nritems(mid) <= orig_slot) {
1225 				path->nodes[level] = right;
1226 				path->slots[level + 1] += 1;
1227 				path->slots[level] = orig_slot -
1228 					btrfs_header_nritems(mid);
1229 				btrfs_tree_unlock(mid);
1230 				free_extent_buffer(mid);
1231 			} else {
1232 				btrfs_tree_unlock(right);
1233 				free_extent_buffer(right);
1234 			}
1235 			return 0;
1236 		}
1237 		btrfs_tree_unlock(right);
1238 		free_extent_buffer(right);
1239 	}
1240 	return 1;
1241 }
1242 
1243 /*
1244  * readahead one full node of leaves, finding things that are close
1245  * to the block in 'slot', and triggering readahead on them.
1246  */
1247 static void reada_for_search(struct btrfs_root *root,
1248 			     struct btrfs_path *path,
1249 			     int level, int slot, u64 objectid)
1250 {
1251 	struct extent_buffer *node;
1252 	struct btrfs_disk_key disk_key;
1253 	u32 nritems;
1254 	u64 search;
1255 	u64 target;
1256 	u64 nread = 0;
1257 	int direction = path->reada;
1258 	struct extent_buffer *eb;
1259 	u32 nr;
1260 	u32 blocksize;
1261 	u32 nscan = 0;
1262 
1263 	if (level != 1)
1264 		return;
1265 
1266 	if (!path->nodes[level])
1267 		return;
1268 
1269 	node = path->nodes[level];
1270 
1271 	search = btrfs_node_blockptr(node, slot);
1272 	blocksize = btrfs_level_size(root, level - 1);
1273 	eb = btrfs_find_tree_block(root, search, blocksize);
1274 	if (eb) {
1275 		free_extent_buffer(eb);
1276 		return;
1277 	}
1278 
1279 	target = search;
1280 
1281 	nritems = btrfs_header_nritems(node);
1282 	nr = slot;
1283 	while (1) {
1284 		if (direction < 0) {
1285 			if (nr == 0)
1286 				break;
1287 			nr--;
1288 		} else if (direction > 0) {
1289 			nr++;
1290 			if (nr >= nritems)
1291 				break;
1292 		}
1293 		if (path->reada < 0 && objectid) {
1294 			btrfs_node_key(node, &disk_key, nr);
1295 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
1296 				break;
1297 		}
1298 		search = btrfs_node_blockptr(node, nr);
1299 		if ((search <= target && target - search <= 65536) ||
1300 		    (search > target && search - target <= 65536)) {
1301 			readahead_tree_block(root, search, blocksize,
1302 				     btrfs_node_ptr_generation(node, nr));
1303 			nread += blocksize;
1304 		}
1305 		nscan++;
1306 		if ((nread > 65536 || nscan > 32))
1307 			break;
1308 	}
1309 }
1310 
1311 /*
1312  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1313  * cache
1314  */
1315 static noinline int reada_for_balance(struct btrfs_root *root,
1316 				      struct btrfs_path *path, int level)
1317 {
1318 	int slot;
1319 	int nritems;
1320 	struct extent_buffer *parent;
1321 	struct extent_buffer *eb;
1322 	u64 gen;
1323 	u64 block1 = 0;
1324 	u64 block2 = 0;
1325 	int ret = 0;
1326 	int blocksize;
1327 
1328 	parent = path->nodes[level - 1];
1329 	if (!parent)
1330 		return 0;
1331 
1332 	nritems = btrfs_header_nritems(parent);
1333 	slot = path->slots[level];
1334 	blocksize = btrfs_level_size(root, level);
1335 
1336 	if (slot > 0) {
1337 		block1 = btrfs_node_blockptr(parent, slot - 1);
1338 		gen = btrfs_node_ptr_generation(parent, slot - 1);
1339 		eb = btrfs_find_tree_block(root, block1, blocksize);
1340 		if (eb && btrfs_buffer_uptodate(eb, gen))
1341 			block1 = 0;
1342 		free_extent_buffer(eb);
1343 	}
1344 	if (slot < nritems) {
1345 		block2 = btrfs_node_blockptr(parent, slot + 1);
1346 		gen = btrfs_node_ptr_generation(parent, slot + 1);
1347 		eb = btrfs_find_tree_block(root, block2, blocksize);
1348 		if (eb && btrfs_buffer_uptodate(eb, gen))
1349 			block2 = 0;
1350 		free_extent_buffer(eb);
1351 	}
1352 	if (block1 || block2) {
1353 		ret = -EAGAIN;
1354 		btrfs_release_path(root, path);
1355 		if (block1)
1356 			readahead_tree_block(root, block1, blocksize, 0);
1357 		if (block2)
1358 			readahead_tree_block(root, block2, blocksize, 0);
1359 
1360 		if (block1) {
1361 			eb = read_tree_block(root, block1, blocksize, 0);
1362 			free_extent_buffer(eb);
1363 		}
1364 		if (block2) {
1365 			eb = read_tree_block(root, block2, blocksize, 0);
1366 			free_extent_buffer(eb);
1367 		}
1368 	}
1369 	return ret;
1370 }
1371 
1372 
1373 /*
1374  * when we walk down the tree, it is usually safe to unlock the higher layers
1375  * in the tree.  The exceptions are when our path goes through slot 0, because
1376  * operations on the tree might require changing key pointers higher up in the
1377  * tree.
1378  *
1379  * callers might also have set path->keep_locks, which tells this code to keep
1380  * the lock if the path points to the last slot in the block.  This is part of
1381  * walking through the tree, and selecting the next slot in the higher block.
1382  *
1383  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1384  * if lowest_unlock is 1, level 0 won't be unlocked
1385  */
1386 static noinline void unlock_up(struct btrfs_path *path, int level,
1387 			       int lowest_unlock)
1388 {
1389 	int i;
1390 	int skip_level = level;
1391 	int no_skips = 0;
1392 	struct extent_buffer *t;
1393 
1394 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1395 		if (!path->nodes[i])
1396 			break;
1397 		if (!path->locks[i])
1398 			break;
1399 		if (!no_skips && path->slots[i] == 0) {
1400 			skip_level = i + 1;
1401 			continue;
1402 		}
1403 		if (!no_skips && path->keep_locks) {
1404 			u32 nritems;
1405 			t = path->nodes[i];
1406 			nritems = btrfs_header_nritems(t);
1407 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1408 				skip_level = i + 1;
1409 				continue;
1410 			}
1411 		}
1412 		if (skip_level < i && i >= lowest_unlock)
1413 			no_skips = 1;
1414 
1415 		t = path->nodes[i];
1416 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1417 			btrfs_tree_unlock(t);
1418 			path->locks[i] = 0;
1419 		}
1420 	}
1421 }
1422 
1423 /*
1424  * This releases any locks held in the path starting at level and
1425  * going all the way up to the root.
1426  *
1427  * btrfs_search_slot will keep the lock held on higher nodes in a few
1428  * corner cases, such as COW of the block at slot zero in the node.  This
1429  * ignores those rules, and it should only be called when there are no
1430  * more updates to be done higher up in the tree.
1431  */
1432 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1433 {
1434 	int i;
1435 
1436 	if (path->keep_locks || path->lowest_level)
1437 		return;
1438 
1439 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1440 		if (!path->nodes[i])
1441 			continue;
1442 		if (!path->locks[i])
1443 			continue;
1444 		btrfs_tree_unlock(path->nodes[i]);
1445 		path->locks[i] = 0;
1446 	}
1447 }
1448 
1449 /*
1450  * helper function for btrfs_search_slot.  The goal is to find a block
1451  * in cache without setting the path to blocking.  If we find the block
1452  * we return zero and the path is unchanged.
1453  *
1454  * If we can't find the block, we set the path blocking and do some
1455  * reada.  -EAGAIN is returned and the search must be repeated.
1456  */
1457 static int
1458 read_block_for_search(struct btrfs_trans_handle *trans,
1459 		       struct btrfs_root *root, struct btrfs_path *p,
1460 		       struct extent_buffer **eb_ret, int level, int slot,
1461 		       struct btrfs_key *key)
1462 {
1463 	u64 blocknr;
1464 	u64 gen;
1465 	u32 blocksize;
1466 	struct extent_buffer *b = *eb_ret;
1467 	struct extent_buffer *tmp;
1468 
1469 	blocknr = btrfs_node_blockptr(b, slot);
1470 	gen = btrfs_node_ptr_generation(b, slot);
1471 	blocksize = btrfs_level_size(root, level - 1);
1472 
1473 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1474 	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1475 		*eb_ret = tmp;
1476 		return 0;
1477 	}
1478 
1479 	/*
1480 	 * reduce lock contention at high levels
1481 	 * of the btree by dropping locks before
1482 	 * we read.
1483 	 */
1484 	btrfs_release_path(NULL, p);
1485 	if (tmp)
1486 		free_extent_buffer(tmp);
1487 	if (p->reada)
1488 		reada_for_search(root, p, level, slot, key->objectid);
1489 
1490 	tmp = read_tree_block(root, blocknr, blocksize, gen);
1491 	if (tmp)
1492 		free_extent_buffer(tmp);
1493 	return -EAGAIN;
1494 }
1495 
1496 /*
1497  * helper function for btrfs_search_slot.  This does all of the checks
1498  * for node-level blocks and does any balancing required based on
1499  * the ins_len.
1500  *
1501  * If no extra work was required, zero is returned.  If we had to
1502  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1503  * start over
1504  */
1505 static int
1506 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1507 		       struct btrfs_root *root, struct btrfs_path *p,
1508 		       struct extent_buffer *b, int level, int ins_len)
1509 {
1510 	int ret;
1511 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1512 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1513 		int sret;
1514 
1515 		sret = reada_for_balance(root, p, level);
1516 		if (sret)
1517 			goto again;
1518 
1519 		btrfs_set_path_blocking(p);
1520 		sret = split_node(trans, root, p, level);
1521 		btrfs_clear_path_blocking(p, NULL);
1522 
1523 		BUG_ON(sret > 0);
1524 		if (sret) {
1525 			ret = sret;
1526 			goto done;
1527 		}
1528 		b = p->nodes[level];
1529 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
1530 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1531 		int sret;
1532 
1533 		sret = reada_for_balance(root, p, level);
1534 		if (sret)
1535 			goto again;
1536 
1537 		btrfs_set_path_blocking(p);
1538 		sret = balance_level(trans, root, p, level);
1539 		btrfs_clear_path_blocking(p, NULL);
1540 
1541 		if (sret) {
1542 			ret = sret;
1543 			goto done;
1544 		}
1545 		b = p->nodes[level];
1546 		if (!b) {
1547 			btrfs_release_path(NULL, p);
1548 			goto again;
1549 		}
1550 		BUG_ON(btrfs_header_nritems(b) == 1);
1551 	}
1552 	return 0;
1553 
1554 again:
1555 	ret = -EAGAIN;
1556 done:
1557 	return ret;
1558 }
1559 
1560 /*
1561  * look for key in the tree.  path is filled in with nodes along the way.
1562  * if the key is found, we return zero and you can find the item in the
1563  * leaf level of the path (level 0)
1564  *
1565  * If the key isn't found, the path points to the slot where it should
1566  * be inserted, and 1 is returned.  If there are other errors during the
1567  * search a negative error number is returned.
1568  *
1569  * if ins_len > 0, nodes and leaves will be split as we walk down the
1570  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
1571  * possible)
1572  */
1573 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1574 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
1575 		      ins_len, int cow)
1576 {
1577 	struct extent_buffer *b;
1578 	int slot;
1579 	int ret;
1580 	int level;
1581 	int lowest_unlock = 1;
1582 	u8 lowest_level = 0;
1583 
1584 	lowest_level = p->lowest_level;
1585 	WARN_ON(lowest_level && ins_len > 0);
1586 	WARN_ON(p->nodes[0] != NULL);
1587 
1588 	if (ins_len < 0)
1589 		lowest_unlock = 2;
1590 
1591 again:
1592 	if (p->skip_locking)
1593 		b = btrfs_root_node(root);
1594 	else
1595 		b = btrfs_lock_root_node(root);
1596 
1597 	while (b) {
1598 		level = btrfs_header_level(b);
1599 
1600 		/*
1601 		 * setup the path here so we can release it under lock
1602 		 * contention with the cow code
1603 		 */
1604 		p->nodes[level] = b;
1605 		if (!p->skip_locking)
1606 			p->locks[level] = 1;
1607 
1608 		if (cow) {
1609 			int wret;
1610 
1611 			/*
1612 			 * if we don't really need to cow this block
1613 			 * then we don't want to set the path blocking,
1614 			 * so we test it here
1615 			 */
1616 			if (btrfs_header_generation(b) == trans->transid &&
1617 			    btrfs_header_owner(b) == root->root_key.objectid &&
1618 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1619 				goto cow_done;
1620 			}
1621 			btrfs_set_path_blocking(p);
1622 
1623 			wret = btrfs_cow_block(trans, root, b,
1624 					       p->nodes[level + 1],
1625 					       p->slots[level + 1], &b);
1626 			if (wret) {
1627 				free_extent_buffer(b);
1628 				ret = wret;
1629 				goto done;
1630 			}
1631 		}
1632 cow_done:
1633 		BUG_ON(!cow && ins_len);
1634 		if (level != btrfs_header_level(b))
1635 			WARN_ON(1);
1636 		level = btrfs_header_level(b);
1637 
1638 		p->nodes[level] = b;
1639 		if (!p->skip_locking)
1640 			p->locks[level] = 1;
1641 
1642 		btrfs_clear_path_blocking(p, NULL);
1643 
1644 		/*
1645 		 * we have a lock on b and as long as we aren't changing
1646 		 * the tree, there is no way for the items in b to change.
1647 		 * It is safe to drop the lock on our parent before we
1648 		 * go through the expensive btree search on b.
1649 		 *
1650 		 * If cow is true, then we might be changing slot zero,
1651 		 * which may require changing the parent.  So, we can't
1652 		 * drop the lock until after we know which slot we're
1653 		 * operating on.
1654 		 */
1655 		if (!cow)
1656 			btrfs_unlock_up_safe(p, level + 1);
1657 
1658 		ret = check_block(root, p, level);
1659 		if (ret) {
1660 			ret = -1;
1661 			goto done;
1662 		}
1663 
1664 		ret = bin_search(b, key, level, &slot);
1665 
1666 		if (level != 0) {
1667 			if (ret && slot > 0)
1668 				slot -= 1;
1669 			p->slots[level] = slot;
1670 			ret = setup_nodes_for_search(trans, root, p, b, level,
1671 						     ins_len);
1672 			if (ret == -EAGAIN)
1673 				goto again;
1674 			else if (ret)
1675 				goto done;
1676 			b = p->nodes[level];
1677 			slot = p->slots[level];
1678 
1679 			unlock_up(p, level, lowest_unlock);
1680 
1681 			/* this is only true while dropping a snapshot */
1682 			if (level == lowest_level) {
1683 				ret = 0;
1684 				goto done;
1685 			}
1686 
1687 			ret = read_block_for_search(trans, root, p,
1688 						    &b, level, slot, key);
1689 			if (ret == -EAGAIN)
1690 				goto again;
1691 
1692 			if (!p->skip_locking) {
1693 				int lret;
1694 
1695 				btrfs_clear_path_blocking(p, NULL);
1696 				lret = btrfs_try_spin_lock(b);
1697 
1698 				if (!lret) {
1699 					btrfs_set_path_blocking(p);
1700 					btrfs_tree_lock(b);
1701 					btrfs_clear_path_blocking(p, b);
1702 				}
1703 			}
1704 		} else {
1705 			p->slots[level] = slot;
1706 			if (ins_len > 0 &&
1707 			    btrfs_leaf_free_space(root, b) < ins_len) {
1708 				int sret;
1709 
1710 				btrfs_set_path_blocking(p);
1711 				sret = split_leaf(trans, root, key,
1712 						      p, ins_len, ret == 0);
1713 				btrfs_clear_path_blocking(p, NULL);
1714 
1715 				BUG_ON(sret > 0);
1716 				if (sret) {
1717 					ret = sret;
1718 					goto done;
1719 				}
1720 			}
1721 			if (!p->search_for_split)
1722 				unlock_up(p, level, lowest_unlock);
1723 			goto done;
1724 		}
1725 	}
1726 	ret = 1;
1727 done:
1728 	/*
1729 	 * we don't really know what they plan on doing with the path
1730 	 * from here on, so for now just mark it as blocking
1731 	 */
1732 	if (!p->leave_spinning)
1733 		btrfs_set_path_blocking(p);
1734 	return ret;
1735 }
1736 
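/*
 * Examples of how a hypothetical caller drives btrfs_search_slot
 * (error handling elided):
 *
 *	read-only lookup, no COW or balancing:
 *		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *
 *	prepare to insert an item of data_size bytes, splitting full
 *	blocks on the way down:
 *		ret = btrfs_search_slot(trans, root, &key, path,
 *				data_size + sizeof(struct btrfs_item), 1);
 *
 *	prepare to delete, merging mostly-empty blocks on the way down:
 *		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 */
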
1737 int btrfs_merge_path(struct btrfs_trans_handle *trans,
1738 		     struct btrfs_root *root,
1739 		     struct btrfs_key *node_keys,
1740 		     u64 *nodes, int lowest_level)
1741 {
1742 	struct extent_buffer *eb;
1743 	struct extent_buffer *parent;
1744 	struct btrfs_key key;
1745 	u64 bytenr;
1746 	u64 generation;
1747 	u32 blocksize;
1748 	int level;
1749 	int slot;
1750 	int key_match;
1751 	int ret;
1752 
1753 	eb = btrfs_lock_root_node(root);
1754 	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
1755 	BUG_ON(ret);
1756 
1757 	btrfs_set_lock_blocking(eb);
1758 
1759 	parent = eb;
1760 	while (1) {
1761 		level = btrfs_header_level(parent);
1762 		if (level == 0 || level <= lowest_level)
1763 			break;
1764 
1765 		ret = bin_search(parent, &node_keys[lowest_level], level,
1766 				 &slot);
1767 		if (ret && slot > 0)
1768 			slot--;
1769 
1770 		bytenr = btrfs_node_blockptr(parent, slot);
1771 		if (nodes[level - 1] == bytenr)
1772 			break;
1773 
1774 		blocksize = btrfs_level_size(root, level - 1);
1775 		generation = btrfs_node_ptr_generation(parent, slot);
1776 		btrfs_node_key_to_cpu(eb, &key, slot);
1777 		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1778 
1779 		if (generation == trans->transid) {
1780 			eb = read_tree_block(root, bytenr, blocksize,
1781 					     generation);
1782 			btrfs_tree_lock(eb);
1783 			btrfs_set_lock_blocking(eb);
1784 		}
1785 
1786 		/*
1787 		 * if node keys match and node pointer hasn't been modified
1788 		 * in the running transaction, we can merge the path.  For
1789 		 * blocks owned by reloc trees, the node pointer check is
1790 		 * skipped, this is because these blocks are fully controlled
1791 		 * by the space balance code, no one else can modify them.
1792 		 */
1793 		if (!nodes[level - 1] || !key_match ||
1794 		    (generation == trans->transid &&
1795 		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1796 			if (level == 1 || level == lowest_level + 1) {
1797 				if (generation == trans->transid) {
1798 					btrfs_tree_unlock(eb);
1799 					free_extent_buffer(eb);
1800 				}
1801 				break;
1802 			}
1803 
1804 			if (generation != trans->transid) {
1805 				eb = read_tree_block(root, bytenr, blocksize,
1806 						generation);
1807 				btrfs_tree_lock(eb);
1808 				btrfs_set_lock_blocking(eb);
1809 			}
1810 
1811 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
1812 					      &eb);
1813 			BUG_ON(ret);
1814 
1815 			if (root->root_key.objectid ==
1816 			    BTRFS_TREE_RELOC_OBJECTID) {
1817 				if (!nodes[level - 1]) {
1818 					nodes[level - 1] = eb->start;
1819 					memcpy(&node_keys[level - 1], &key,
1820 					       sizeof(node_keys[0]));
1821 				} else {
1822 					WARN_ON(1);
1823 				}
1824 			}
1825 
1826 			btrfs_tree_unlock(parent);
1827 			free_extent_buffer(parent);
1828 			parent = eb;
1829 			continue;
1830 		}
1831 
1832 		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1833 		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1834 		btrfs_mark_buffer_dirty(parent);
1835 
1836 		ret = btrfs_inc_extent_ref(trans, root,
1837 					nodes[level - 1],
1838 					blocksize, parent->start,
1839 					btrfs_header_owner(parent),
1840 					btrfs_header_generation(parent),
1841 					level - 1);
1842 		BUG_ON(ret);
1843 
1844 		/*
1845 		 * If the block was created in the running transaction,
1846 		 * it's possible this is the last reference to it, so we
1847 		 * should drop the subtree.
1848 		 */
1849 		if (generation == trans->transid) {
1850 			ret = btrfs_drop_subtree(trans, root, eb, parent);
1851 			BUG_ON(ret);
1852 			btrfs_tree_unlock(eb);
1853 			free_extent_buffer(eb);
1854 		} else {
1855 			ret = btrfs_free_extent(trans, root, bytenr,
1856 					blocksize, parent->start,
1857 					btrfs_header_owner(parent),
1858 					btrfs_header_generation(parent),
1859 					level - 1, 1);
1860 			BUG_ON(ret);
1861 		}
1862 		break;
1863 	}
1864 	btrfs_tree_unlock(parent);
1865 	free_extent_buffer(parent);
1866 	return 0;
1867 }
1868 
1869 /*
1870  * adjust the pointers going up the tree, starting at level
1871  * making sure the right key of each node points to 'key'.
1872  * This is used after shifting pointers to the left, so it stops
1873  * fixing up pointers when a given leaf/node is not in slot 0 of the
1874  * higher levels.
1875  *
1876  * If this fails to write a tree block, it returns -1, but continues
1877  * fixing up the blocks in ram so the tree is consistent.
1878  */
1879 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1880 			  struct btrfs_root *root, struct btrfs_path *path,
1881 			  struct btrfs_disk_key *key, int level)
1882 {
1883 	int i;
1884 	int ret = 0;
1885 	struct extent_buffer *t;
1886 
1887 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1888 		int tslot = path->slots[i];
1889 		if (!path->nodes[i])
1890 			break;
1891 		t = path->nodes[i];
1892 		btrfs_set_node_key(t, key, tslot);
1893 		btrfs_mark_buffer_dirty(path->nodes[i]);
1894 		if (tslot != 0)
1895 			break;
1896 	}
1897 	return ret;
1898 }
1899 
1900 /*
1901  * update item key.
1902  *
1903  * This function isn't completely safe. It's the caller's responsibility
1904  * that the new key won't break the order
1905  */
1906 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1907 			    struct btrfs_root *root, struct btrfs_path *path,
1908 			    struct btrfs_key *new_key)
1909 {
1910 	struct btrfs_disk_key disk_key;
1911 	struct extent_buffer *eb;
1912 	int slot;
1913 
1914 	eb = path->nodes[0];
1915 	slot = path->slots[0];
1916 	if (slot > 0) {
1917 		btrfs_item_key(eb, &disk_key, slot - 1);
1918 		if (comp_keys(&disk_key, new_key) >= 0)
1919 			return -1;
1920 	}
1921 	if (slot < btrfs_header_nritems(eb) - 1) {
1922 		btrfs_item_key(eb, &disk_key, slot + 1);
1923 		if (comp_keys(&disk_key, new_key) <= 0)
1924 			return -1;
1925 	}
1926 
1927 	btrfs_cpu_key_to_disk(&disk_key, new_key);
1928 	btrfs_set_item_key(eb, &disk_key, slot);
1929 	btrfs_mark_buffer_dirty(eb);
1930 	if (slot == 0)
1931 		fixup_low_keys(trans, root, path, &disk_key, 1);
1932 	return 0;
1933 }
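
/*
 * a minimal usage sketch for btrfs_set_item_key_safe (hypothetical
 * caller, illustrative names): cow our way down to the item first,
 * then rewrite the key in place as long as leaf ordering is preserved.
 *
 *	struct btrfs_key new_key = old_key;
 *	new_key.offset = new_offset;
 *	ret = btrfs_search_slot(trans, root, &old_key, path, 0, 1);
 *	if (ret == 0)
 *		ret = btrfs_set_item_key_safe(trans, root, path, &new_key);
 */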
1934 
1935 /*
1936  * try to push data from one node into the next node left in the
1937  * tree.
1938  *
1939  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1940  * error, and > 0 if there was no room in the left hand block.
1941  */
1942 static int push_node_left(struct btrfs_trans_handle *trans,
1943 			  struct btrfs_root *root, struct extent_buffer *dst,
1944 			  struct extent_buffer *src, int empty)
1945 {
1946 	int push_items = 0;
1947 	int src_nritems;
1948 	int dst_nritems;
1949 	int ret = 0;
1950 
1951 	src_nritems = btrfs_header_nritems(src);
1952 	dst_nritems = btrfs_header_nritems(dst);
1953 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1954 	WARN_ON(btrfs_header_generation(src) != trans->transid);
1955 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1956 
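	/*
	 * unless the caller asked for the source to be emptied, keep at
	 * least 8 pointers in it; pushing a nearly empty node around just
	 * shuffles pointers back and forth between neighbours
	 */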
1957 	if (!empty && src_nritems <= 8)
1958 		return 1;
1959 
1960 	if (push_items <= 0)
1961 		return 1;
1962 
1963 	if (empty) {
1964 		push_items = min(src_nritems, push_items);
1965 		if (push_items < src_nritems) {
1966 			/* leave at least 8 pointers in the node if
1967 			 * we aren't going to empty it
1968 			 */
1969 			if (src_nritems - push_items < 8) {
1970 				if (push_items <= 8)
1971 					return 1;
1972 				push_items -= 8;
1973 			}
1974 		}
1975 	} else
1976 		push_items = min(src_nritems - 8, push_items);
1977 
1978 	copy_extent_buffer(dst, src,
1979 			   btrfs_node_key_ptr_offset(dst_nritems),
1980 			   btrfs_node_key_ptr_offset(0),
1981 			   push_items * sizeof(struct btrfs_key_ptr));
1982 
1983 	if (push_items < src_nritems) {
1984 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1985 				      btrfs_node_key_ptr_offset(push_items),
1986 				      (src_nritems - push_items) *
1987 				      sizeof(struct btrfs_key_ptr));
1988 	}
1989 	btrfs_set_header_nritems(src, src_nritems - push_items);
1990 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
1991 	btrfs_mark_buffer_dirty(src);
1992 	btrfs_mark_buffer_dirty(dst);
1993 
1994 	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1995 	BUG_ON(ret);
1996 
1997 	return ret;
1998 }
1999 
2000 /*
2001  * try to push data from one node into the next node right in the
2002  * tree.
2003  *
2004  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2005  * error, and > 0 if there was no room in the right hand block.
2006  *
2007  * this will only push up to 1/2 the contents of the left node over
2008  */
2009 static int balance_node_right(struct btrfs_trans_handle *trans,
2010 			      struct btrfs_root *root,
2011 			      struct extent_buffer *dst,
2012 			      struct extent_buffer *src)
2013 {
2014 	int push_items = 0;
2015 	int max_push;
2016 	int src_nritems;
2017 	int dst_nritems;
2018 	int ret = 0;
2019 
2020 	WARN_ON(btrfs_header_generation(src) != trans->transid);
2021 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
2022 
2023 	src_nritems = btrfs_header_nritems(src);
2024 	dst_nritems = btrfs_header_nritems(dst);
2025 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2026 	if (push_items <= 0)
2027 		return 1;
2028 
2029 	if (src_nritems < 4)
2030 		return 1;
2031 
2032 	max_push = src_nritems / 2 + 1;
2033 	/* don't try to empty the node */
2034 	if (max_push >= src_nritems)
2035 		return 1;
2036 
2037 	if (max_push < push_items)
2038 		push_items = max_push;
2039 
2040 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2041 				      btrfs_node_key_ptr_offset(0),
2042 				      (dst_nritems) *
2043 				      sizeof(struct btrfs_key_ptr));
2044 
2045 	copy_extent_buffer(dst, src,
2046 			   btrfs_node_key_ptr_offset(0),
2047 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
2048 			   push_items * sizeof(struct btrfs_key_ptr));
2049 
2050 	btrfs_set_header_nritems(src, src_nritems - push_items);
2051 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2052 
2053 	btrfs_mark_buffer_dirty(src);
2054 	btrfs_mark_buffer_dirty(dst);
2055 
2056 	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
2057 	BUG_ON(ret);
2058 
2059 	return ret;
2060 }
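
/*
 * worked example for balance_node_right (hypothetical counts): with
 * src_nritems = 10 and room for 20 more pointers in dst, max_push is
 * 10 / 2 + 1 = 6, so exactly 6 pointers move right and 4 stay behind
 */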
2061 
2062 /*
2063  * helper function to insert a new root level in the tree.
2064  * A new node is allocated, and a single item is inserted to
2065  * point to the existing root
2066  *
2067  * returns zero on success or < 0 on failure.
2068  */
2069 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2070 			   struct btrfs_root *root,
2071 			   struct btrfs_path *path, int level)
2072 {
2073 	u64 lower_gen;
2074 	struct extent_buffer *lower;
2075 	struct extent_buffer *c;
2076 	struct extent_buffer *old;
2077 	struct btrfs_disk_key lower_key;
2078 	int ret;
2079 
2080 	BUG_ON(path->nodes[level]);
2081 	BUG_ON(path->nodes[level-1] != root->node);
2082 
2083 	lower = path->nodes[level-1];
2084 	if (level == 1)
2085 		btrfs_item_key(lower, &lower_key, 0);
2086 	else
2087 		btrfs_node_key(lower, &lower_key, 0);
2088 
2089 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2090 				   root->root_key.objectid, trans->transid,
2091 				   level, root->node->start, 0);
2092 	if (IS_ERR(c))
2093 		return PTR_ERR(c);
2094 
2095 	memset_extent_buffer(c, 0, 0, root->nodesize);
2096 	btrfs_set_header_nritems(c, 1);
2097 	btrfs_set_header_level(c, level);
2098 	btrfs_set_header_bytenr(c, c->start);
2099 	btrfs_set_header_generation(c, trans->transid);
2100 	btrfs_set_header_owner(c, root->root_key.objectid);
2101 
2102 	write_extent_buffer(c, root->fs_info->fsid,
2103 			    (unsigned long)btrfs_header_fsid(c),
2104 			    BTRFS_FSID_SIZE);
2105 
2106 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
2107 			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
2108 			    BTRFS_UUID_SIZE);
2109 
2110 	btrfs_set_node_key(c, &lower_key, 0);
2111 	btrfs_set_node_blockptr(c, 0, lower->start);
2112 	lower_gen = btrfs_header_generation(lower);
2113 	WARN_ON(lower_gen != trans->transid);
2114 
2115 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
2116 
2117 	btrfs_mark_buffer_dirty(c);
2118 
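	/*
	 * swap in the new root under node_lock so anyone following
	 * root->node sees either the old root or the new one, never a
	 * torn pointer
	 */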
2119 	spin_lock(&root->node_lock);
2120 	old = root->node;
2121 	root->node = c;
2122 	spin_unlock(&root->node_lock);
2123 
2124 	ret = btrfs_update_extent_ref(trans, root, lower->start,
2125 				      lower->len, lower->start, c->start,
2126 				      root->root_key.objectid,
2127 				      trans->transid, level - 1);
2128 	BUG_ON(ret);
2129 
2130 	/* the super has an extra ref to root->node */
2131 	free_extent_buffer(old);
2132 
2133 	add_root_to_dirty_list(root);
2134 	extent_buffer_get(c);
2135 	path->nodes[level] = c;
2136 	path->locks[level] = 1;
2137 	path->slots[level] = 0;
2138 	return 0;
2139 }
2140 
2141 /*
2142  * worker function to insert a single pointer in a node.
2143  * the node should have enough room for the pointer already
2144  *
2145  * slot and level indicate where you want the key to go, and
2146  * blocknr is the block the key points to.
2147  *
2148  * returns zero on success and < 0 on any error
2149  */
2150 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2151 		      *root, struct btrfs_path *path, struct btrfs_disk_key
2152 		      *key, u64 bytenr, int slot, int level)
2153 {
2154 	struct extent_buffer *lower;
2155 	int nritems;
2156 
2157 	BUG_ON(!path->nodes[level]);
2158 	lower = path->nodes[level];
2159 	nritems = btrfs_header_nritems(lower);
2160 	BUG_ON(slot > nritems);
2161 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2162 		BUG();
2163 	if (slot != nritems) {
2164 		memmove_extent_buffer(lower,
2165 			      btrfs_node_key_ptr_offset(slot + 1),
2166 			      btrfs_node_key_ptr_offset(slot),
2167 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
2168 	}
2169 	btrfs_set_node_key(lower, key, slot);
2170 	btrfs_set_node_blockptr(lower, slot, bytenr);
2171 	WARN_ON(trans->transid == 0);
2172 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2173 	btrfs_set_header_nritems(lower, nritems + 1);
2174 	btrfs_mark_buffer_dirty(lower);
2175 	return 0;
2176 }
2177 
2178 /*
2179  * split the node at the specified level in path in two.
2180  * The path is corrected to point to the appropriate node after the split
2181  *
2182  * Before splitting this tries to make some room in the node by pushing
2183  * left and right, if either one works, it returns right away.
2184  *
2185  * returns 0 on success and < 0 on failure
2186  */
2187 static noinline int split_node(struct btrfs_trans_handle *trans,
2188 			       struct btrfs_root *root,
2189 			       struct btrfs_path *path, int level)
2190 {
2191 	struct extent_buffer *c;
2192 	struct extent_buffer *split;
2193 	struct btrfs_disk_key disk_key;
2194 	int mid;
2195 	int ret;
2196 	int wret;
2197 	u32 c_nritems;
2198 
2199 	c = path->nodes[level];
2200 	WARN_ON(btrfs_header_generation(c) != trans->transid);
2201 	if (c == root->node) {
2202 		/* trying to split the root, let's make a new one */
2203 		ret = insert_new_root(trans, root, path, level + 1);
2204 		if (ret)
2205 			return ret;
2206 	} else if (!trans->transaction->delayed_refs.flushing) {
2207 		ret = push_nodes_for_insert(trans, root, path, level);
2208 		c = path->nodes[level];
2209 		if (!ret && btrfs_header_nritems(c) <
2210 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2211 			return 0;
2212 		if (ret < 0)
2213 			return ret;
2214 	}
2215 
2216 	c_nritems = btrfs_header_nritems(c);
2217 
2218 	split = btrfs_alloc_free_block(trans, root, root->nodesize,
2219 					path->nodes[level + 1]->start,
2220 					root->root_key.objectid,
2221 					trans->transid, level, c->start, 0);
2222 	if (IS_ERR(split))
2223 		return PTR_ERR(split);
2224 
2225 	btrfs_set_header_flags(split, btrfs_header_flags(c));
2226 	btrfs_set_header_level(split, btrfs_header_level(c));
2227 	btrfs_set_header_bytenr(split, split->start);
2228 	btrfs_set_header_generation(split, trans->transid);
2229 	btrfs_set_header_owner(split, root->root_key.objectid);
2230 	btrfs_set_header_flags(split, 0);
2231 	write_extent_buffer(split, root->fs_info->fsid,
2232 			    (unsigned long)btrfs_header_fsid(split),
2233 			    BTRFS_FSID_SIZE);
2234 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2235 			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
2236 			    BTRFS_UUID_SIZE);
2237 
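	/* the lower (c_nritems + 1) / 2 pointers stay in c, the rest move
	 * into the new block
	 */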
2238 	mid = (c_nritems + 1) / 2;
2239 
2240 	copy_extent_buffer(split, c,
2241 			   btrfs_node_key_ptr_offset(0),
2242 			   btrfs_node_key_ptr_offset(mid),
2243 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2244 	btrfs_set_header_nritems(split, c_nritems - mid);
2245 	btrfs_set_header_nritems(c, mid);
2246 	ret = 0;
2247 
2248 	btrfs_mark_buffer_dirty(c);
2249 	btrfs_mark_buffer_dirty(split);
2250 
2251 	btrfs_node_key(split, &disk_key, 0);
2252 	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2253 			  path->slots[level + 1] + 1,
2254 			  level + 1);
2255 	if (wret)
2256 		ret = wret;
2257 
2258 	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2259 	BUG_ON(ret);
2260 
2261 	if (path->slots[level] >= mid) {
2262 		path->slots[level] -= mid;
2263 		btrfs_tree_unlock(c);
2264 		free_extent_buffer(c);
2265 		path->nodes[level] = split;
2266 		path->slots[level + 1] += 1;
2267 	} else {
2268 		btrfs_tree_unlock(split);
2269 		free_extent_buffer(split);
2270 	}
2271 	return ret;
2272 }
2273 
2274 /*
2275  * how many bytes are required to store the items in a leaf.  start
2276  * and nr indicate which items in the leaf to check.  This totals up the
2277  * space used both by the item structs and the item data
2278  */
2279 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2280 {
2281 	int data_len;
2282 	int nritems = btrfs_header_nritems(l);
2283 	int end = min(nritems, start + nr) - 1;
2284 
2285 	if (!nr)
2286 		return 0;
2287 	data_len = btrfs_item_end_nr(l, start);
2288 	data_len = data_len - btrfs_item_offset_nr(l, end);
2289 	data_len += sizeof(struct btrfs_item) * nr;
2290 	WARN_ON(data_len < 0);
2291 	return data_len;
2292 }
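
/*
 * worked example for leaf_space_used (hypothetical sizes): with items
 * 0-2 holding 100, 200 and 50 bytes of data packed against the end of
 * the block, btrfs_item_end_nr(l, 0) - btrfs_item_offset_nr(l, 2) is
 * 350, and adding 3 * sizeof(struct btrfs_item) covers the headers
 */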
2293 
2294 /*
2295  * The space between the end of the leaf items and
2296  * the start of the leaf data.  IOW, how much room
2297  * the leaf has left for both items and data
2298  */
2299 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2300 				   struct extent_buffer *leaf)
2301 {
2302 	int nritems = btrfs_header_nritems(leaf);
2303 	int ret;
2304 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2305 	if (ret < 0) {
2306 		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2307 		       "used %d nritems %d\n",
2308 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2309 		       leaf_space_used(leaf, 0, nritems), nritems);
2310 	}
2311 	return ret;
2312 }
2313 
2314 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2315 				      struct btrfs_root *root,
2316 				      struct btrfs_path *path,
2317 				      int data_size, int empty,
2318 				      struct extent_buffer *right,
2319 				      int free_space, u32 left_nritems)
2320 {
2321 	struct extent_buffer *left = path->nodes[0];
2322 	struct extent_buffer *upper = path->nodes[1];
2323 	struct btrfs_disk_key disk_key;
2324 	int slot;
2325 	u32 i;
2326 	int push_space = 0;
2327 	int push_items = 0;
2328 	struct btrfs_item *item;
2329 	u32 nr;
2330 	u32 right_nritems;
2331 	u32 data_end;
2332 	u32 this_item_size;
2333 	int ret;
2334 
2335 	if (empty)
2336 		nr = 0;
2337 	else
2338 		nr = 1;
2339 
2340 	if (path->slots[0] >= left_nritems)
2341 		push_space += data_size;
2342 
2343 	slot = path->slots[1];
2344 	i = left_nritems - 1;
2345 	while (i >= nr) {
2346 		item = btrfs_item_nr(left, i);
2347 
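		/*
		 * once something has been pushed, stop before crossing the
		 * slot we are inserting into; when we are exactly at it, a
		 * heuristic decides whether moving it right really leaves
		 * more usable room than keeping it here
		 */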
2348 		if (!empty && push_items > 0) {
2349 			if (path->slots[0] > i)
2350 				break;
2351 			if (path->slots[0] == i) {
2352 				int space = btrfs_leaf_free_space(root, left);
2353 				if (space + push_space * 2 > free_space)
2354 					break;
2355 			}
2356 		}
2357 
2358 		if (path->slots[0] == i)
2359 			push_space += data_size;
2360 
2361 		if (!left->map_token) {
2362 			map_extent_buffer(left, (unsigned long)item,
2363 					sizeof(struct btrfs_item),
2364 					&left->map_token, &left->kaddr,
2365 					&left->map_start, &left->map_len,
2366 					KM_USER1);
2367 		}
2368 
2369 		this_item_size = btrfs_item_size(left, item);
2370 		if (this_item_size + sizeof(*item) + push_space > free_space)
2371 			break;
2372 
2373 		push_items++;
2374 		push_space += this_item_size + sizeof(*item);
2375 		if (i == 0)
2376 			break;
2377 		i--;
2378 	}
2379 	if (left->map_token) {
2380 		unmap_extent_buffer(left, left->map_token, KM_USER1);
2381 		left->map_token = NULL;
2382 	}
2383 
2384 	if (push_items == 0)
2385 		goto out_unlock;
2386 
2387 	if (!empty && push_items == left_nritems)
2388 		WARN_ON(1);
2389 
2390 	/* push left to right */
2391 	right_nritems = btrfs_header_nritems(right);
2392 
2393 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2394 	push_space -= leaf_data_end(root, left);
2395 
2396 	/* make room in the right data area */
2397 	data_end = leaf_data_end(root, right);
2398 	memmove_extent_buffer(right,
2399 			      btrfs_leaf_data(right) + data_end - push_space,
2400 			      btrfs_leaf_data(right) + data_end,
2401 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
2402 
2403 	/* copy from the left data area */
2404 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2405 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
2406 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
2407 		     push_space);
2408 
2409 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2410 			      btrfs_item_nr_offset(0),
2411 			      right_nritems * sizeof(struct btrfs_item));
2412 
2413 	/* copy the items from left to right */
2414 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2415 		   btrfs_item_nr_offset(left_nritems - push_items),
2416 		   push_items * sizeof(struct btrfs_item));
2417 
2418 	/* update the item pointers */
2419 	right_nritems += push_items;
2420 	btrfs_set_header_nritems(right, right_nritems);
2421 	push_space = BTRFS_LEAF_DATA_SIZE(root);
2422 	for (i = 0; i < right_nritems; i++) {
2423 		item = btrfs_item_nr(right, i);
2424 		if (!right->map_token) {
2425 			map_extent_buffer(right, (unsigned long)item,
2426 					sizeof(struct btrfs_item),
2427 					&right->map_token, &right->kaddr,
2428 					&right->map_start, &right->map_len,
2429 					KM_USER1);
2430 		}
2431 		push_space -= btrfs_item_size(right, item);
2432 		btrfs_set_item_offset(right, item, push_space);
2433 	}
2434 
2435 	if (right->map_token) {
2436 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2437 		right->map_token = NULL;
2438 	}
2439 	left_nritems -= push_items;
2440 	btrfs_set_header_nritems(left, left_nritems);
2441 
2442 	if (left_nritems)
2443 		btrfs_mark_buffer_dirty(left);
2444 	btrfs_mark_buffer_dirty(right);
2445 
2446 	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2447 	BUG_ON(ret);
2448 
2449 	btrfs_item_key(right, &disk_key, 0);
2450 	btrfs_set_node_key(upper, &disk_key, slot + 1);
2451 	btrfs_mark_buffer_dirty(upper);
2452 
2453 	/* then fixup the leaf pointer in the path */
2454 	if (path->slots[0] >= left_nritems) {
2455 		path->slots[0] -= left_nritems;
2456 		if (btrfs_header_nritems(path->nodes[0]) == 0)
2457 			clean_tree_block(trans, root, path->nodes[0]);
2458 		btrfs_tree_unlock(path->nodes[0]);
2459 		free_extent_buffer(path->nodes[0]);
2460 		path->nodes[0] = right;
2461 		path->slots[1] += 1;
2462 	} else {
2463 		btrfs_tree_unlock(right);
2464 		free_extent_buffer(right);
2465 	}
2466 	return 0;
2467 
2468 out_unlock:
2469 	btrfs_tree_unlock(right);
2470 	free_extent_buffer(right);
2471 	return 1;
2472 }
2473 
2474 /*
2475  * push some data in the path leaf to the right, trying to free up at
2476  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2477  *
2478  * returns 1 if the push failed because the other node didn't have enough
2479  * room, 0 if everything worked out and < 0 if there were major errors.
2480  */
2481 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2482 			   *root, struct btrfs_path *path, int data_size,
2483 			   int empty)
2484 {
2485 	struct extent_buffer *left = path->nodes[0];
2486 	struct extent_buffer *right;
2487 	struct extent_buffer *upper;
2488 	int slot;
2489 	int free_space;
2490 	u32 left_nritems;
2491 	int ret;
2492 
2493 	if (!path->nodes[1])
2494 		return 1;
2495 
2496 	slot = path->slots[1];
2497 	upper = path->nodes[1];
2498 	if (slot >= btrfs_header_nritems(upper) - 1)
2499 		return 1;
2500 
2501 	btrfs_assert_tree_locked(path->nodes[1]);
2502 
2503 	right = read_node_slot(root, upper, slot + 1);
2504 	btrfs_tree_lock(right);
2505 	btrfs_set_lock_blocking(right);
2506 
2507 	free_space = btrfs_leaf_free_space(root, right);
2508 	if (free_space < data_size)
2509 		goto out_unlock;
2510 
2511 	/* cow and double check */
2512 	ret = btrfs_cow_block(trans, root, right, upper,
2513 			      slot + 1, &right);
2514 	if (ret)
2515 		goto out_unlock;
2516 
2517 	free_space = btrfs_leaf_free_space(root, right);
2518 	if (free_space < data_size)
2519 		goto out_unlock;
2520 
2521 	left_nritems = btrfs_header_nritems(left);
2522 	if (left_nritems == 0)
2523 		goto out_unlock;
2524 
2525 	return __push_leaf_right(trans, root, path, data_size, empty,
2526 				right, free_space, left_nritems);
2527 out_unlock:
2528 	btrfs_tree_unlock(right);
2529 	free_extent_buffer(right);
2530 	return 1;
2531 }
2532 
2533 /*
2534  * push some data in the path leaf to the left, trying to free up at
2535  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2536  */
2537 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2538 				     struct btrfs_root *root,
2539 				     struct btrfs_path *path, int data_size,
2540 				     int empty, struct extent_buffer *left,
2541 				     int free_space, int right_nritems)
2542 {
2543 	struct btrfs_disk_key disk_key;
2544 	struct extent_buffer *right = path->nodes[0];
2545 	int slot;
2546 	int i;
2547 	int push_space = 0;
2548 	int push_items = 0;
2549 	struct btrfs_item *item;
2550 	u32 old_left_nritems;
2551 	u32 nr;
2552 	int ret = 0;
2553 	int wret;
2554 	u32 this_item_size;
2555 	u32 old_left_item_size;
2556 
2557 	slot = path->slots[1];
2558 
2559 	if (empty)
2560 		nr = right_nritems;
2561 	else
2562 		nr = right_nritems - 1;
2563 
2564 	for (i = 0; i < nr; i++) {
2565 		item = btrfs_item_nr(right, i);
2566 		if (!right->map_token) {
2567 			map_extent_buffer(right, (unsigned long)item,
2568 					sizeof(struct btrfs_item),
2569 					&right->map_token, &right->kaddr,
2570 					&right->map_start, &right->map_len,
2571 					KM_USER1);
2572 		}
2573 
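		/*
		 * mirror of the right-push heuristic: stop before crossing
		 * the insertion slot, and only move the slot itself left
		 * when the left leaf clearly has the room to spare
		 */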
2574 		if (!empty && push_items > 0) {
2575 			if (path->slots[0] < i)
2576 				break;
2577 			if (path->slots[0] == i) {
2578 				int space = btrfs_leaf_free_space(root, right);
2579 				if (space + push_space * 2 > free_space)
2580 					break;
2581 			}
2582 		}
2583 
2584 		if (path->slots[0] == i)
2585 			push_space += data_size;
2586 
2587 		this_item_size = btrfs_item_size(right, item);
2588 		if (this_item_size + sizeof(*item) + push_space > free_space)
2589 			break;
2590 
2591 		push_items++;
2592 		push_space += this_item_size + sizeof(*item);
2593 	}
2594 
2595 	if (right->map_token) {
2596 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2597 		right->map_token = NULL;
2598 	}
2599 
2600 	if (push_items == 0) {
2601 		ret = 1;
2602 		goto out;
2603 	}
2604 	if (!empty && push_items == btrfs_header_nritems(right))
2605 		WARN_ON(1);
2606 
2607 	/* push data from right to left */
2608 	copy_extent_buffer(left, right,
2609 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
2610 			   btrfs_item_nr_offset(0),
2611 			   push_items * sizeof(struct btrfs_item));
2612 
2613 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
2614 		     btrfs_item_offset_nr(right, push_items - 1);
2615 
2616 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2617 		     leaf_data_end(root, left) - push_space,
2618 		     btrfs_leaf_data(right) +
2619 		     btrfs_item_offset_nr(right, push_items - 1),
2620 		     push_space);
2621 	old_left_nritems = btrfs_header_nritems(left);
2622 	BUG_ON(old_left_nritems <= 0);
2623 
2624 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2625 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2626 		u32 ioff;
2627 
2628 		item = btrfs_item_nr(left, i);
2629 		if (!left->map_token) {
2630 			map_extent_buffer(left, (unsigned long)item,
2631 					sizeof(struct btrfs_item),
2632 					&left->map_token, &left->kaddr,
2633 					&left->map_start, &left->map_len,
2634 					KM_USER1);
2635 		}
2636 
2637 		ioff = btrfs_item_offset(left, item);
2638 		btrfs_set_item_offset(left, item,
2639 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2640 	}
2641 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2642 	if (left->map_token) {
2643 		unmap_extent_buffer(left, left->map_token, KM_USER1);
2644 		left->map_token = NULL;
2645 	}
2646 
2647 	/* fixup right node */
2648 	if (push_items > right_nritems) {
2649 		printk(KERN_CRIT "push items %d nr %u\n", push_items,
2650 		       right_nritems);
2651 		WARN_ON(1);
2652 	}
2653 
2654 	if (push_items < right_nritems) {
2655 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
2656 						  leaf_data_end(root, right);
2657 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
2658 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2659 				      btrfs_leaf_data(right) +
2660 				      leaf_data_end(root, right), push_space);
2661 
2662 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2663 			      btrfs_item_nr_offset(push_items),
2664 			     (btrfs_header_nritems(right) - push_items) *
2665 			     sizeof(struct btrfs_item));
2666 	}
2667 	right_nritems -= push_items;
2668 	btrfs_set_header_nritems(right, right_nritems);
2669 	push_space = BTRFS_LEAF_DATA_SIZE(root);
2670 	for (i = 0; i < right_nritems; i++) {
2671 		item = btrfs_item_nr(right, i);
2672 
2673 		if (!right->map_token) {
2674 			map_extent_buffer(right, (unsigned long)item,
2675 					sizeof(struct btrfs_item),
2676 					&right->map_token, &right->kaddr,
2677 					&right->map_start, &right->map_len,
2678 					KM_USER1);
2679 		}
2680 
2681 		push_space = push_space - btrfs_item_size(right, item);
2682 		btrfs_set_item_offset(right, item, push_space);
2683 	}
2684 	if (right->map_token) {
2685 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2686 		right->map_token = NULL;
2687 	}
2688 
2689 	btrfs_mark_buffer_dirty(left);
2690 	if (right_nritems)
2691 		btrfs_mark_buffer_dirty(right);
2692 
2693 	ret = btrfs_update_ref(trans, root, right, left,
2694 			       old_left_nritems, push_items);
2695 	BUG_ON(ret);
2696 
2697 	btrfs_item_key(right, &disk_key, 0);
2698 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2699 	if (wret)
2700 		ret = wret;
2701 
2702 	/* then fixup the leaf pointer in the path */
2703 	if (path->slots[0] < push_items) {
2704 		path->slots[0] += old_left_nritems;
2705 		if (btrfs_header_nritems(path->nodes[0]) == 0)
2706 			clean_tree_block(trans, root, path->nodes[0]);
2707 		btrfs_tree_unlock(path->nodes[0]);
2708 		free_extent_buffer(path->nodes[0]);
2709 		path->nodes[0] = left;
2710 		path->slots[1] -= 1;
2711 	} else {
2712 		btrfs_tree_unlock(left);
2713 		free_extent_buffer(left);
2714 		path->slots[0] -= push_items;
2715 	}
2716 	BUG_ON(path->slots[0] < 0);
2717 	return ret;
2718 out:
2719 	btrfs_tree_unlock(left);
2720 	free_extent_buffer(left);
2721 	return ret;
2722 }
2723 
2724 /*
2725  * push some data in the path leaf to the left, trying to free up at
2726  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2727  */
2728 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2729 			  *root, struct btrfs_path *path, int data_size,
2730 			  int empty)
2731 {
2732 	struct extent_buffer *right = path->nodes[0];
2733 	struct extent_buffer *left;
2734 	int slot;
2735 	int free_space;
2736 	u32 right_nritems;
2737 	int ret = 0;
2738 
2739 	slot = path->slots[1];
2740 	if (slot == 0)
2741 		return 1;
2742 	if (!path->nodes[1])
2743 		return 1;
2744 
2745 	right_nritems = btrfs_header_nritems(right);
2746 	if (right_nritems == 0)
2747 		return 1;
2748 
2749 	btrfs_assert_tree_locked(path->nodes[1]);
2750 
2751 	left = read_node_slot(root, path->nodes[1], slot - 1);
2752 	btrfs_tree_lock(left);
2753 	btrfs_set_lock_blocking(left);
2754 
2755 	free_space = btrfs_leaf_free_space(root, left);
2756 	if (free_space < data_size) {
2757 		ret = 1;
2758 		goto out;
2759 	}
2760 
2761 	/* cow and double check */
2762 	ret = btrfs_cow_block(trans, root, left,
2763 			      path->nodes[1], slot - 1, &left);
2764 	if (ret) {
2765 		/* we hit -ENOSPC, but it isn't fatal here */
2766 		ret = 1;
2767 		goto out;
2768 	}
2769 
2770 	free_space = btrfs_leaf_free_space(root, left);
2771 	if (free_space < data_size) {
2772 		ret = 1;
2773 		goto out;
2774 	}
2775 
2776 	return __push_leaf_left(trans, root, path, data_size,
2777 			       empty, left, free_space, right_nritems);
2778 out:
2779 	btrfs_tree_unlock(left);
2780 	free_extent_buffer(left);
2781 	return ret;
2782 }
2783 
2784 /*
2785  * helper for split_leaf: move items [mid, nritems) from leaf 'l' into
2786  * the new leaf 'right', add 'right' to the parent and fix up the path.
2787  *
2788  * returns 0 if all went well and < 0 on failure.
2789  */
2790 static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2791 			       struct btrfs_root *root,
2792 			       struct btrfs_path *path,
2793 			       struct extent_buffer *l,
2794 			       struct extent_buffer *right,
2795 			       int slot, int mid, int nritems)
2796 {
2797 	int data_copy_size;
2798 	int rt_data_off;
2799 	int i;
2800 	int ret = 0;
2801 	int wret;
2802 	struct btrfs_disk_key disk_key;
2803 
2804 	nritems = nritems - mid;
2805 	btrfs_set_header_nritems(right, nritems);
2806 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2807 
2808 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2809 			   btrfs_item_nr_offset(mid),
2810 			   nritems * sizeof(struct btrfs_item));
2811 
2812 	copy_extent_buffer(right, l,
2813 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2814 		     data_copy_size, btrfs_leaf_data(l) +
2815 		     leaf_data_end(root, l), data_copy_size);
2816 
2817 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2818 		      btrfs_item_end_nr(l, mid);
2819 
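	/*
	 * the copied items still carry offsets from the old leaf; their
	 * data is now packed against the end of the new leaf, so shift
	 * every offset up by the amount of data that stayed behind in l
	 */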
2820 	for (i = 0; i < nritems; i++) {
2821 		struct btrfs_item *item = btrfs_item_nr(right, i);
2822 		u32 ioff;
2823 
2824 		if (!right->map_token) {
2825 			map_extent_buffer(right, (unsigned long)item,
2826 					sizeof(struct btrfs_item),
2827 					&right->map_token, &right->kaddr,
2828 					&right->map_start, &right->map_len,
2829 					KM_USER1);
2830 		}
2831 
2832 		ioff = btrfs_item_offset(right, item);
2833 		btrfs_set_item_offset(right, item, ioff + rt_data_off);
2834 	}
2835 
2836 	if (right->map_token) {
2837 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2838 		right->map_token = NULL;
2839 	}
2840 
2841 	btrfs_set_header_nritems(l, mid);
2842 	ret = 0;
2843 	btrfs_item_key(right, &disk_key, 0);
2844 	wret = insert_ptr(trans, root, path, &disk_key, right->start,
2845 			  path->slots[1] + 1, 1);
2846 	if (wret)
2847 		ret = wret;
2848 
2849 	btrfs_mark_buffer_dirty(right);
2850 	btrfs_mark_buffer_dirty(l);
2851 	BUG_ON(path->slots[0] != slot);
2852 
2853 	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2854 	BUG_ON(ret);
2855 
2856 	if (mid <= slot) {
2857 		btrfs_tree_unlock(path->nodes[0]);
2858 		free_extent_buffer(path->nodes[0]);
2859 		path->nodes[0] = right;
2860 		path->slots[0] -= mid;
2861 		path->slots[1] += 1;
2862 	} else {
2863 		btrfs_tree_unlock(right);
2864 		free_extent_buffer(right);
2865 	}
2866 
2867 	BUG_ON(path->slots[0] < 0);
2868 
2869 	return ret;
2870 }
2871 
2872 /*
2873  * split the path's leaf in two, making sure there is at least data_size
2874  * available for the resulting leaf level of the path.
2875  *
2876  * returns 0 if all went well and < 0 on failure.
2877  */
2878 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2879 			       struct btrfs_root *root,
2880 			       struct btrfs_key *ins_key,
2881 			       struct btrfs_path *path, int data_size,
2882 			       int extend)
2883 {
2884 	struct extent_buffer *l;
2885 	u32 nritems;
2886 	int mid;
2887 	int slot;
2888 	struct extent_buffer *right;
2889 	int ret = 0;
2890 	int wret;
2891 	int double_split;
2892 	int num_doubles = 0;
2893 
2894 	/* first try to make some room by pushing left and right */
2895 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2896 	    !trans->transaction->delayed_refs.flushing) {
2897 		wret = push_leaf_right(trans, root, path, data_size, 0);
2898 		if (wret < 0)
2899 			return wret;
2900 		if (wret) {
2901 			wret = push_leaf_left(trans, root, path, data_size, 0);
2902 			if (wret < 0)
2903 				return wret;
2904 		}
2905 		l = path->nodes[0];
2906 
2907 		/* did the pushes work? */
2908 		if (btrfs_leaf_free_space(root, l) >= data_size)
2909 			return 0;
2910 	}
2911 
2912 	if (!path->nodes[1]) {
2913 		ret = insert_new_root(trans, root, path, 1);
2914 		if (ret)
2915 			return ret;
2916 	}
2917 again:
2918 	double_split = 0;
2919 	l = path->nodes[0];
2920 	slot = path->slots[0];
2921 	nritems = btrfs_header_nritems(l);
2922 	mid = (nritems + 1) / 2;
2923 
2924 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
2925 					path->nodes[1]->start,
2926 					root->root_key.objectid,
2927 					trans->transid, 0, l->start, 0);
2928 	if (IS_ERR(right)) {
2929 		BUG_ON(1);
2930 		return PTR_ERR(right);
2931 	}
2932 
2933 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2934 	btrfs_set_header_bytenr(right, right->start);
2935 	btrfs_set_header_generation(right, trans->transid);
2936 	btrfs_set_header_owner(right, root->root_key.objectid);
2937 	btrfs_set_header_level(right, 0);
2938 	write_extent_buffer(right, root->fs_info->fsid,
2939 			    (unsigned long)btrfs_header_fsid(right),
2940 			    BTRFS_FSID_SIZE);
2941 
2942 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2943 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
2944 			    BTRFS_UUID_SIZE);
2945 
2946 	if (mid <= slot) {
2947 		if (nritems == 1 ||
2948 		    leaf_space_used(l, mid, nritems - mid) + data_size >
2949 			BTRFS_LEAF_DATA_SIZE(root)) {
2950 			if (slot >= nritems) {
2951 				struct btrfs_disk_key disk_key;
2952 
2953 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2954 				btrfs_set_header_nritems(right, 0);
2955 				wret = insert_ptr(trans, root, path,
2956 						  &disk_key, right->start,
2957 						  path->slots[1] + 1, 1);
2958 				if (wret)
2959 					ret = wret;
2960 
2961 				btrfs_tree_unlock(path->nodes[0]);
2962 				free_extent_buffer(path->nodes[0]);
2963 				path->nodes[0] = right;
2964 				path->slots[0] = 0;
2965 				path->slots[1] += 1;
2966 				btrfs_mark_buffer_dirty(right);
2967 				return ret;
2968 			}
2969 			mid = slot;
2970 			if (mid != nritems &&
2971 			    leaf_space_used(l, mid, nritems - mid) +
2972 			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2973 				double_split = 1;
2974 			}
2975 		}
2976 	} else {
2977 		if (leaf_space_used(l, 0, mid) + data_size >
2978 			BTRFS_LEAF_DATA_SIZE(root)) {
2979 			if (!extend && data_size && slot == 0) {
2980 				struct btrfs_disk_key disk_key;
2981 
2982 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2983 				btrfs_set_header_nritems(right, 0);
2984 				wret = insert_ptr(trans, root, path,
2985 						  &disk_key,
2986 						  right->start,
2987 						  path->slots[1], 1);
2988 				if (wret)
2989 					ret = wret;
2990 				btrfs_tree_unlock(path->nodes[0]);
2991 				free_extent_buffer(path->nodes[0]);
2992 				path->nodes[0] = right;
2993 				path->slots[0] = 0;
2994 				if (path->slots[1] == 0) {
2995 					wret = fixup_low_keys(trans, root,
2996 						      path, &disk_key, 1);
2997 					if (wret)
2998 						ret = wret;
2999 				}
3000 				btrfs_mark_buffer_dirty(right);
3001 				return ret;
3002 			} else if ((extend || !data_size) && slot == 0) {
3003 				mid = 1;
3004 			} else {
3005 				mid = slot;
3006 				if (mid != nritems &&
3007 				    leaf_space_used(l, mid, nritems - mid) +
3008 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3009 					double_split = 1;
3010 				}
3011 			}
3012 		}
3013 	}
3014 
3015 	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3016 	BUG_ON(ret);
3017 
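	/*
	 * if the side of the split that receives the new item still lacks
	 * room for data_size, split that leaf once more; num_doubles
	 * guarantees we only ever do this a single time
	 */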
3018 	if (double_split) {
3019 		BUG_ON(num_doubles != 0);
3020 		num_doubles++;
3021 		goto again;
3022 	}
3023 
3024 	return ret;
3025 }
3026 
3027 /*
3028  * This function splits a single item into two items,
3029  * giving 'new_key' to the new item and splitting the
3030  * old one at split_offset (from the start of the item).
3031  *
3032  * The path may be released by this operation.  After
3033  * the split, the path is pointing to the old item.  The
3034  * new item is going to be in the same node as the old one.
3035  *
3036  * Note, the item being split must be small enough to live alone on
3037  * a tree block with room for one extra struct btrfs_item
3038  *
3039  * This allows us to split the item in place, keeping a lock on the
3040  * leaf the entire time.
3041  */
3042 int btrfs_split_item(struct btrfs_trans_handle *trans,
3043 		     struct btrfs_root *root,
3044 		     struct btrfs_path *path,
3045 		     struct btrfs_key *new_key,
3046 		     unsigned long split_offset)
3047 {
3048 	u32 item_size;
3049 	struct extent_buffer *leaf;
3050 	struct btrfs_key orig_key;
3051 	struct btrfs_item *item;
3052 	struct btrfs_item *new_item;
3053 	int ret = 0;
3054 	int slot;
3055 	u32 nritems;
3056 	u32 orig_offset;
3057 	struct btrfs_disk_key disk_key;
3058 	char *buf;
3059 
3060 	leaf = path->nodes[0];
3061 	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
3062 	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
3063 		goto split;
3064 
3065 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3066 	btrfs_release_path(root, path);
3067 
3068 	path->search_for_split = 1;
3069 	path->keep_locks = 1;
3070 
3071 	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
3072 	path->search_for_split = 0;
3073 
3074 	/* if our item isn't there or got smaller, return now */
3075 	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
3076 							path->slots[0])) {
3077 		path->keep_locks = 0;
3078 		return -EAGAIN;
3079 	}
3080 
3081 	btrfs_set_path_blocking(path);
3082 	ret = split_leaf(trans, root, &orig_key, path,
3083 			 sizeof(struct btrfs_item), 1);
3084 	path->keep_locks = 0;
3085 	BUG_ON(ret);
3086 
3087 	btrfs_unlock_up_safe(path, 1);
3088 	leaf = path->nodes[0];
3089 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3090 
3091 split:
3092 	/*
3093 	 * make sure any changes to the path from split_leaf leave it
3094 	 * in a blocking state
3095 	 */
3096 	btrfs_set_path_blocking(path);
3097 
3098 	item = btrfs_item_nr(leaf, path->slots[0]);
3099 	orig_offset = btrfs_item_offset(leaf, item);
3100 	item_size = btrfs_item_size(leaf, item);
3101 
3102 	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;
3103 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3104 			    path->slots[0]), item_size);
3105 	slot = path->slots[0] + 1;
3106 	leaf = path->nodes[0];
3107 
3108 	nritems = btrfs_header_nritems(leaf);
3109 
3110 	if (slot != nritems) {
3111 		/* shift the items */
3112 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3113 			      btrfs_item_nr_offset(slot),
3114 			      (nritems - slot) * sizeof(struct btrfs_item));
3115 
3116 	}
3117 
3118 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3119 	btrfs_set_item_key(leaf, &disk_key, slot);
3120 
3121 	new_item = btrfs_item_nr(leaf, slot);
3122 
3123 	btrfs_set_item_offset(leaf, new_item, orig_offset);
3124 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3125 
3126 	btrfs_set_item_offset(leaf, item,
3127 			      orig_offset + item_size - split_offset);
3128 	btrfs_set_item_size(leaf, item, split_offset);
3129 
3130 	btrfs_set_header_nritems(leaf, nritems + 1);
3131 
3132 	/* write the data for the start of the original item */
3133 	write_extent_buffer(leaf, buf,
3134 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
3135 			    split_offset);
3136 
3137 	/* write the data for the new item */
3138 	write_extent_buffer(leaf, buf + split_offset,
3139 			    btrfs_item_ptr_offset(leaf, slot),
3140 			    item_size - split_offset);
3141 	btrfs_mark_buffer_dirty(leaf);
3142 
3143 	ret = 0;
3144 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3145 		btrfs_print_leaf(root, leaf);
3146 		BUG();
3147 	}
3148 	kfree(buf);
3149 	return ret;
3150 }
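
/*
 * a minimal usage sketch for btrfs_split_item (hypothetical values):
 * keep bytes [0, 16) of an item under its old key and give the rest
 * to a new key.  the offset arithmetic assumes a key type whose
 * 'offset' tracks byte position and is illustrative only.
 *
 *	struct btrfs_key new_key = old_key;
 *	new_key.offset += 16;
 *	ret = btrfs_split_item(trans, root, path, &new_key, 16);
 */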
3151 
3152 /*
3153  * make the item pointed to by the path smaller.  new_size indicates
3154  * how small to make it, and from_end tells us if we just chop bytes
3155  * off the end of the item or if we shift the item to chop bytes off
3156  * the front.
3157  */
3158 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3159 			struct btrfs_root *root,
3160 			struct btrfs_path *path,
3161 			u32 new_size, int from_end)
3162 {
3163 	int ret = 0;
3164 	int slot;
3165 	int slot_orig;
3166 	struct extent_buffer *leaf;
3167 	struct btrfs_item *item;
3168 	u32 nritems;
3169 	unsigned int data_end;
3170 	unsigned int old_data_start;
3171 	unsigned int old_size;
3172 	unsigned int size_diff;
3173 	int i;
3174 
3175 	slot_orig = path->slots[0];
3176 	leaf = path->nodes[0];
3177 	slot = path->slots[0];
3178 
3179 	old_size = btrfs_item_size_nr(leaf, slot);
3180 	if (old_size == new_size)
3181 		return 0;
3182 
3183 	nritems = btrfs_header_nritems(leaf);
3184 	data_end = leaf_data_end(root, leaf);
3185 
3186 	old_data_start = btrfs_item_offset_nr(leaf, slot);
3187 
3188 	size_diff = old_size - new_size;
3189 
3190 	BUG_ON(slot < 0);
3191 	BUG_ON(slot >= nritems);
3192 
3193 	/*
3194 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3195 	 */
3196 	/* first correct the data pointers */
3197 	for (i = slot; i < nritems; i++) {
3198 		u32 ioff;
3199 		item = btrfs_item_nr(leaf, i);
3200 
3201 		if (!leaf->map_token) {
3202 			map_extent_buffer(leaf, (unsigned long)item,
3203 					sizeof(struct btrfs_item),
3204 					&leaf->map_token, &leaf->kaddr,
3205 					&leaf->map_start, &leaf->map_len,
3206 					KM_USER1);
3207 		}
3208 
3209 		ioff = btrfs_item_offset(leaf, item);
3210 		btrfs_set_item_offset(leaf, item, ioff + size_diff);
3211 	}
3212 
3213 	if (leaf->map_token) {
3214 		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3215 		leaf->map_token = NULL;
3216 	}
3217 
3218 	/* shift the data */
3219 	if (from_end) {
3220 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3221 			      data_end + size_diff, btrfs_leaf_data(leaf) +
3222 			      data_end, old_data_start + new_size - data_end);
3223 	} else {
3224 		struct btrfs_disk_key disk_key;
3225 		u64 offset;
3226 
3227 		btrfs_item_key(leaf, &disk_key, slot);
3228 
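		/*
		 * an inline file extent keeps its struct fields up to
		 * disk_bytenr followed by the inline data; when bytes are
		 * chopped off the front, those header fields have to move
		 * along to the new start of the item
		 */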
3229 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3230 			unsigned long ptr;
3231 			struct btrfs_file_extent_item *fi;
3232 
3233 			fi = btrfs_item_ptr(leaf, slot,
3234 					    struct btrfs_file_extent_item);
3235 			fi = (struct btrfs_file_extent_item *)(
3236 			     (unsigned long)fi - size_diff);
3237 
3238 			if (btrfs_file_extent_type(leaf, fi) ==
3239 			    BTRFS_FILE_EXTENT_INLINE) {
3240 				ptr = btrfs_item_ptr_offset(leaf, slot);
3241 				memmove_extent_buffer(leaf, ptr,
3242 				      (unsigned long)fi,
3243 				      offsetof(struct btrfs_file_extent_item,
3244 						 disk_bytenr));
3245 			}
3246 		}
3247 
3248 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3249 			      data_end + size_diff, btrfs_leaf_data(leaf) +
3250 			      data_end, old_data_start - data_end);
3251 
3252 		offset = btrfs_disk_key_offset(&disk_key);
3253 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3254 		btrfs_set_item_key(leaf, &disk_key, slot);
3255 		if (slot == 0)
3256 			fixup_low_keys(trans, root, path, &disk_key, 1);
3257 	}
3258 
3259 	item = btrfs_item_nr(leaf, slot);
3260 	btrfs_set_item_size(leaf, item, new_size);
3261 	btrfs_mark_buffer_dirty(leaf);
3262 
3263 	ret = 0;
3264 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3265 		btrfs_print_leaf(root, leaf);
3266 		BUG();
3267 	}
3268 	return ret;
3269 }
3270 
3271 /*
3272  * make the item pointed to by the path bigger, data_size is the new size.
3273  */
3274 int btrfs_extend_item(struct btrfs_trans_handle *trans,
3275 		      struct btrfs_root *root, struct btrfs_path *path,
3276 		      u32 data_size)
3277 {
3278 	int ret = 0;
3279 	int slot;
3280 	int slot_orig;
3281 	struct extent_buffer *leaf;
3282 	struct btrfs_item *item;
3283 	u32 nritems;
3284 	unsigned int data_end;
3285 	unsigned int old_data;
3286 	unsigned int old_size;
3287 	int i;
3288 
3289 	slot_orig = path->slots[0];
3290 	leaf = path->nodes[0];
3291 
3292 	nritems = btrfs_header_nritems(leaf);
3293 	data_end = leaf_data_end(root, leaf);
3294 
3295 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
3296 		btrfs_print_leaf(root, leaf);
3297 		BUG();
3298 	}
3299 	slot = path->slots[0];
3300 	old_data = btrfs_item_end_nr(leaf, slot);
3301 
3302 	BUG_ON(slot < 0);
3303 	if (slot >= nritems) {
3304 		btrfs_print_leaf(root, leaf);
3305 		printk(KERN_CRIT "slot %d too large, nritems %d\n",
3306 		       slot, nritems);
3307 		BUG_ON(1);
3308 	}
3309 
3310 	/*
3311 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3312 	 */
3313 	/* first correct the data pointers */
3314 	for (i = slot; i < nritems; i++) {
3315 		u32 ioff;
3316 		item = btrfs_item_nr(leaf, i);
3317 
3318 		if (!leaf->map_token) {
3319 			map_extent_buffer(leaf, (unsigned long)item,
3320 					sizeof(struct btrfs_item),
3321 					&leaf->map_token, &leaf->kaddr,
3322 					&leaf->map_start, &leaf->map_len,
3323 					KM_USER1);
3324 		}
3325 		ioff = btrfs_item_offset(leaf, item);
3326 		btrfs_set_item_offset(leaf, item, ioff - data_size);
3327 	}
3328 
3329 	if (leaf->map_token) {
3330 		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3331 		leaf->map_token = NULL;
3332 	}
3333 
3334 	/* shift the data */
3335 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3336 		      data_end - data_size, btrfs_leaf_data(leaf) +
3337 		      data_end, old_data - data_end);
3338 
3339 	data_end = old_data;
3340 	old_size = btrfs_item_size_nr(leaf, slot);
3341 	item = btrfs_item_nr(leaf, slot);
3342 	btrfs_set_item_size(leaf, item, old_size + data_size);
3343 	btrfs_mark_buffer_dirty(leaf);
3344 
3345 	ret = 0;
3346 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3347 		btrfs_print_leaf(root, leaf);
3348 		BUG();
3349 	}
3350 	return ret;
3351 }
3352 
3353 /*
3354  * Given a key and some data, insert items into the tree.
3355  * This does all the path init required, making room in the tree if needed.
3356  * Returns the number of keys that were inserted.
3357  */
3358 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3359 			    struct btrfs_root *root,
3360 			    struct btrfs_path *path,
3361 			    struct btrfs_key *cpu_key, u32 *data_size,
3362 			    int nr)
3363 {
3364 	struct extent_buffer *leaf;
3365 	struct btrfs_item *item;
3366 	int ret = 0;
3367 	int slot;
3368 	int i;
3369 	u32 nritems;
3370 	u32 total_data = 0;
3371 	u32 total_size = 0;
3372 	unsigned int data_end;
3373 	struct btrfs_disk_key disk_key;
3374 	struct btrfs_key found_key;
3375 
3376 	for (i = 0; i < nr; i++) {
3377 		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3378 		    BTRFS_LEAF_DATA_SIZE(root)) {
3379 			nr = i;
3380 			break;
3381 		}
3382 		total_data += data_size[i];
3383 		total_size += data_size[i] + sizeof(struct btrfs_item);
3384 	}
3385 	BUG_ON(nr == 0);
3386 
3387 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3388 	if (ret == 0)
3389 		return -EEXIST;
3390 	if (ret < 0)
3391 		goto out;
3392 
3393 	leaf = path->nodes[0];
3394 
3395 	nritems = btrfs_header_nritems(leaf);
3396 	data_end = leaf_data_end(root, leaf);
3397 
3398 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3399 		for (i = nr - 1; i >= 0; i--) {
3400 			total_data -= data_size[i];
3401 			total_size -= data_size[i] + sizeof(struct btrfs_item);
3402 			if (total_size < btrfs_leaf_free_space(root, leaf))
3403 				break;
3404 		}
3405 		nr = i;
3406 	}
3407 
3408 	slot = path->slots[0];
3409 	BUG_ON(slot < 0);
3410 
3411 	if (slot != nritems) {
3412 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3413 
3414 		item = btrfs_item_nr(leaf, slot);
3415 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3416 
3417 		/* figure out how many keys we can insert in here */
3418 		total_data = data_size[0];
3419 		for (i = 1; i < nr; i++) {
3420 			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3421 				break;
3422 			total_data += data_size[i];
3423 		}
3424 		nr = i;
3425 
3426 		if (old_data < data_end) {
3427 			btrfs_print_leaf(root, leaf);
3428 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3429 			       slot, old_data, data_end);
3430 			BUG_ON(1);
3431 		}
3432 		/*
3433 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3434 		 */
3435 		/* first correct the data pointers */
3436 		WARN_ON(leaf->map_token);
3437 		for (i = slot; i < nritems; i++) {
3438 			u32 ioff;
3439 
3440 			item = btrfs_item_nr(leaf, i);
3441 			if (!leaf->map_token) {
3442 				map_extent_buffer(leaf, (unsigned long)item,
3443 					sizeof(struct btrfs_item),
3444 					&leaf->map_token, &leaf->kaddr,
3445 					&leaf->map_start, &leaf->map_len,
3446 					KM_USER1);
3447 			}
3448 
3449 			ioff = btrfs_item_offset(leaf, item);
3450 			btrfs_set_item_offset(leaf, item, ioff - total_data);
3451 		}
3452 		if (leaf->map_token) {
3453 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3454 			leaf->map_token = NULL;
3455 		}
3456 
3457 		/* shift the items */
3458 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3459 			      btrfs_item_nr_offset(slot),
3460 			      (nritems - slot) * sizeof(struct btrfs_item));
3461 
3462 		/* shift the data */
3463 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3464 			      data_end - total_data, btrfs_leaf_data(leaf) +
3465 			      data_end, old_data - data_end);
3466 		data_end = old_data;
3467 	} else {
3468 		/*
3469 		 * this sucks but it has to be done: if we are inserting at
3470 		 * the end of the leaf, only insert 1 of the items, since we
3471 		 * have no way of knowing what's on the next leaf and we'd
3472 		 * have to drop our current locks to figure it out
3473 		 */
3474 		nr = 1;
3475 	}
3476 
3477 	/* setup the item for the new data */
3478 	for (i = 0; i < nr; i++) {
3479 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3480 		btrfs_set_item_key(leaf, &disk_key, slot + i);
3481 		item = btrfs_item_nr(leaf, slot + i);
3482 		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3483 		data_end -= data_size[i];
3484 		btrfs_set_item_size(leaf, item, data_size[i]);
3485 	}
3486 	btrfs_set_header_nritems(leaf, nritems + nr);
3487 	btrfs_mark_buffer_dirty(leaf);
3488 
3489 	ret = 0;
3490 	if (slot == 0) {
3491 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3492 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3493 	}
3494 
3495 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3496 		btrfs_print_leaf(root, leaf);
3497 		BUG();
3498 	}
3499 out:
3500 	if (!ret)
3501 		ret = nr;
3502 	return ret;
3503 }
3504 
3505 /*
3506  * this is a helper for btrfs_insert_empty_items, the main goal here is
3507  * to save stack depth by doing the bulk of the work in a function
3508  * that doesn't call btrfs_search_slot
3509  */
3510 static noinline_for_stack int
3511 setup_items_for_insert(struct btrfs_trans_handle *trans,
3512 		      struct btrfs_root *root, struct btrfs_path *path,
3513 		      struct btrfs_key *cpu_key, u32 *data_size,
3514 		      u32 total_data, u32 total_size, int nr)
3515 {
3516 	struct btrfs_item *item;
3517 	int i;
3518 	u32 nritems;
3519 	unsigned int data_end;
3520 	struct btrfs_disk_key disk_key;
3521 	int ret;
3522 	struct extent_buffer *leaf;
3523 	int slot;
3524 
3525 	leaf = path->nodes[0];
3526 	slot = path->slots[0];
3527 
3528 	nritems = btrfs_header_nritems(leaf);
3529 	data_end = leaf_data_end(root, leaf);
3530 
3531 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3532 		btrfs_print_leaf(root, leaf);
3533 		printk(KERN_CRIT "not enough freespace need %u have %d\n",
3534 		       total_size, btrfs_leaf_free_space(root, leaf));
3535 		BUG();
3536 	}
3537 
3538 	if (slot != nritems) {
3539 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3540 
3541 		if (old_data < data_end) {
3542 			btrfs_print_leaf(root, leaf);
3543 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3544 			       slot, old_data, data_end);
3545 			BUG_ON(1);
3546 		}
3547 		/*
3548 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3549 		 */
3550 		/* first correct the data pointers */
3551 		WARN_ON(leaf->map_token);
3552 		for (i = slot; i < nritems; i++) {
3553 			u32 ioff;
3554 
3555 			item = btrfs_item_nr(leaf, i);
3556 			if (!leaf->map_token) {
3557 				map_extent_buffer(leaf, (unsigned long)item,
3558 					sizeof(struct btrfs_item),
3559 					&leaf->map_token, &leaf->kaddr,
3560 					&leaf->map_start, &leaf->map_len,
3561 					KM_USER1);
3562 			}
3563 
3564 			ioff = btrfs_item_offset(leaf, item);
3565 			btrfs_set_item_offset(leaf, item, ioff - total_data);
3566 		}
3567 		if (leaf->map_token) {
3568 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3569 			leaf->map_token = NULL;
3570 		}
3571 
3572 		/* shift the items */
3573 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3574 			      btrfs_item_nr_offset(slot),
3575 			      (nritems - slot) * sizeof(struct btrfs_item));
3576 
3577 		/* shift the data */
3578 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3579 			      data_end - total_data, btrfs_leaf_data(leaf) +
3580 			      data_end, old_data - data_end);
3581 		data_end = old_data;
3582 	}
3583 
3584 	/* setup the item for the new data */
3585 	for (i = 0; i < nr; i++) {
3586 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3587 		btrfs_set_item_key(leaf, &disk_key, slot + i);
3588 		item = btrfs_item_nr(leaf, slot + i);
3589 		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3590 		data_end -= data_size[i];
3591 		btrfs_set_item_size(leaf, item, data_size[i]);
3592 	}
3593 
3594 	btrfs_set_header_nritems(leaf, nritems + nr);
3595 
3596 	ret = 0;
3597 	if (slot == 0) {
3598 		struct btrfs_disk_key disk_key;
3599 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3600 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3601 	}
3602 	btrfs_unlock_up_safe(path, 1);
3603 	btrfs_mark_buffer_dirty(leaf);
3604 
3605 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3606 		btrfs_print_leaf(root, leaf);
3607 		BUG();
3608 	}
3609 	return ret;
3610 }
3611 
3612 /*
3613  * Given a key and some data, insert items into the tree.
3614  * This does all the path init required, making room in the tree if needed.
3615  */
3616 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3617 			    struct btrfs_root *root,
3618 			    struct btrfs_path *path,
3619 			    struct btrfs_key *cpu_key, u32 *data_size,
3620 			    int nr)
3621 {
3622 	struct extent_buffer *leaf;
3623 	int ret = 0;
3624 	int slot;
3625 	int i;
3626 	u32 total_size = 0;
3627 	u32 total_data = 0;
3628 
3629 	for (i = 0; i < nr; i++)
3630 		total_data += data_size[i];
3631 
3632 	total_size = total_data + (nr * sizeof(struct btrfs_item));
3633 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3634 	if (ret == 0)
3635 		return -EEXIST;
3636 	if (ret < 0)
3637 		goto out;
3638 
3639 	leaf = path->nodes[0];
3640 	slot = path->slots[0];
3641 	BUG_ON(slot < 0);
3642 
3643 	ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3644 			       total_data, total_size, nr);
3645 
3646 out:
3647 	return ret;
3648 }
3649 
3650 /*
3651  * Given a key and some data, insert an item into the tree.
3652  * This does all the path init required, making room in the tree if needed.
3653  */
3654 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3655 		      *root, struct btrfs_key *cpu_key, void *data, u32
3656 		      data_size)
3657 {
3658 	int ret = 0;
3659 	struct btrfs_path *path;
3660 	struct extent_buffer *leaf;
3661 	unsigned long ptr;
3662 
3663 	path = btrfs_alloc_path();
3664 	BUG_ON(!path);
3665 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3666 	if (!ret) {
3667 		leaf = path->nodes[0];
3668 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3669 		write_extent_buffer(leaf, data, ptr, data_size);
3670 		btrfs_mark_buffer_dirty(leaf);
3671 	}
3672 	btrfs_free_path(path);
3673 	return ret;
3674 }
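
/*
 * Sketch of a typical btrfs_insert_item call (hypothetical values; the
 * xattr key fields are made up for illustration):
 *
 *	struct btrfs_key key;
 *
 *	key.objectid = objectid;
 *	key.type = BTRFS_XATTR_ITEM_KEY;
 *	key.offset = name_hash;
 *	ret = btrfs_insert_item(trans, root, &key, buf, buf_len);
 *
 * On success the data has been copied into the leaf and the temporary
 * path has already been released.
 */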
3675 
3676 /*
3677  * delete the pointer from a given node.
3678  *
3679  * the tree should have been previously balanced so the deletion does not
3680  * empty a node.
3681  */
3682 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3683 		   struct btrfs_path *path, int level, int slot)
3684 {
3685 	struct extent_buffer *parent = path->nodes[level];
3686 	u32 nritems;
3687 	int ret = 0;
3688 	int wret;
3689 
3690 	nritems = btrfs_header_nritems(parent);
3691 	if (slot != nritems - 1) {
3692 		memmove_extent_buffer(parent,
3693 			      btrfs_node_key_ptr_offset(slot),
3694 			      btrfs_node_key_ptr_offset(slot + 1),
3695 			      sizeof(struct btrfs_key_ptr) *
3696 			      (nritems - slot - 1));
3697 	}
3698 	nritems--;
3699 	btrfs_set_header_nritems(parent, nritems);
3700 	if (nritems == 0 && parent == root->node) {
3701 		BUG_ON(btrfs_header_level(root->node) != 1);
3702 		/* just turn the root into a leaf and break */
3703 		/* just turn the root into a leaf */
3704 	} else if (slot == 0) {
3705 		struct btrfs_disk_key disk_key;
3706 
3707 		btrfs_node_key(parent, &disk_key, 0);
3708 		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3709 		if (wret)
3710 			ret = wret;
3711 	}
3712 	btrfs_mark_buffer_dirty(parent);
3713 	return ret;
3714 }
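
/*
 * To picture the shift in del_ptr, suppose (made-up numbers) nritems is
 * 5 and slot is 2: key_ptrs 3 and 4 are copied down into positions 2
 * and 3, and the header count drops to 4.
 */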
3715 
3716 /*
3717  * a helper function to delete the leaf pointed to by path->slots[1] and
3718  * path->nodes[1].  bytenr is the leaf block pointer, but since the callers
3719  * already know it, it is faster to have them pass it down than to
3720  * read it out of the node again.
3721  *
3722  * This deletes the pointer in path->nodes[1] and frees the leaf
3723  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3724  *
3725  * The path must have already been set up for deleting the leaf, including
3726  * all the proper balancing.  path->nodes[1] must be locked.
3727  */
3728 noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3729 			    struct btrfs_root *root,
3730 			    struct btrfs_path *path, u64 bytenr)
3731 {
3732 	int ret;
3733 	u64 root_gen = btrfs_header_generation(path->nodes[1]);
3734 	u64 parent_start = path->nodes[1]->start;
3735 	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3736 
3737 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
3738 	if (ret)
3739 		return ret;
3740 
3741 	/*
3742 	 * btrfs_free_extent is expensive, so we want to make sure we
3743 	 * aren't holding any locks when we call it
3744 	 */
3745 	btrfs_unlock_up_safe(path, 0);
3746 
3747 	ret = btrfs_free_extent(trans, root, bytenr,
3748 				btrfs_level_size(root, 0),
3749 				parent_start, parent_owner,
3750 				root_gen, 0, 1);
3751 	return ret;
3752 }

3753 /*
3754  * delete the item at the leaf level in path.  If that empties
3755  * the leaf, remove it from the tree
3756  */
3757 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3758 		    struct btrfs_path *path, int slot, int nr)
3759 {
3760 	struct extent_buffer *leaf;
3761 	struct btrfs_item *item;
3762 	int last_off;
3763 	int dsize = 0;
3764 	int ret = 0;
3765 	int wret;
3766 	int i;
3767 	u32 nritems;
3768 
3769 	leaf = path->nodes[0];
3770 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3771 
3772 	for (i = 0; i < nr; i++)
3773 		dsize += btrfs_item_size_nr(leaf, slot + i);
3774 
3775 	nritems = btrfs_header_nritems(leaf);
3776 
3777 	if (slot + nr != nritems) {
3778 		int data_end = leaf_data_end(root, leaf);
3779 
3780 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3781 			      data_end + dsize,
3782 			      btrfs_leaf_data(leaf) + data_end,
3783 			      last_off - data_end);
3784 
3785 		for (i = slot + nr; i < nritems; i++) {
3786 			u32 ioff;
3787 
3788 			item = btrfs_item_nr(leaf, i);
3789 			if (!leaf->map_token) {
3790 				map_extent_buffer(leaf, (unsigned long)item,
3791 					sizeof(struct btrfs_item),
3792 					&leaf->map_token, &leaf->kaddr,
3793 					&leaf->map_start, &leaf->map_len,
3794 					KM_USER1);
3795 			}
3796 			ioff = btrfs_item_offset(leaf, item);
3797 			btrfs_set_item_offset(leaf, item, ioff + dsize);
3798 		}
3799 
3800 		if (leaf->map_token) {
3801 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3802 			leaf->map_token = NULL;
3803 		}
3804 
3805 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3806 			      btrfs_item_nr_offset(slot + nr),
3807 			      sizeof(struct btrfs_item) *
3808 			      (nritems - slot - nr));
3809 	}
3810 	btrfs_set_header_nritems(leaf, nritems - nr);
3811 	nritems -= nr;
3812 
3813 	/* delete the leaf if we've emptied it */
3814 	if (nritems == 0) {
3815 		if (leaf == root->node) {
3816 			btrfs_set_header_level(leaf, 0);
3817 		} else {
3818 			ret = btrfs_del_leaf(trans, root, path, leaf->start);
3819 			BUG_ON(ret);
3820 		}
3821 	} else {
3822 		int used = leaf_space_used(leaf, 0, nritems);
3823 		if (slot == 0) {
3824 			struct btrfs_disk_key disk_key;
3825 
3826 			btrfs_item_key(leaf, &disk_key, 0);
3827 			wret = fixup_low_keys(trans, root, path,
3828 					      &disk_key, 1);
3829 			if (wret)
3830 				ret = wret;
3831 		}
3832 
3833 		/* delete the leaf if it is mostly empty */
3834 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3835 		    !trans->transaction->delayed_refs.flushing) {
3836 			/* push_leaf_left fixes the path.
3837 			 * make sure the path still points to our leaf
3838 			 * for the possible call to btrfs_del_leaf below
3839 			 */
3840 			slot = path->slots[1];
3841 			extent_buffer_get(leaf);
3842 
3843 			btrfs_set_path_blocking(path);
3844 			wret = push_leaf_left(trans, root, path, 1, 1);
3845 			if (wret < 0 && wret != -ENOSPC)
3846 				ret = wret;
3847 
3848 			if (path->nodes[0] == leaf &&
3849 			    btrfs_header_nritems(leaf)) {
3850 				wret = push_leaf_right(trans, root, path, 1, 1);
3851 				if (wret < 0 && wret != -ENOSPC)
3852 					ret = wret;
3853 			}
3854 
3855 			if (btrfs_header_nritems(leaf) == 0) {
3856 				path->slots[1] = slot;
3857 				ret = btrfs_del_leaf(trans, root, path,
3858 						     leaf->start);
3859 				BUG_ON(ret);
3860 				free_extent_buffer(leaf);
3861 			} else {
3862 				/* if we're still in the path, make sure
3863 				 * we're dirty.  Otherwise, one of the
3864 				 * push_leaf functions must have already
3865 				 * dirtied this buffer
3866 				 */
3867 				if (path->nodes[0] == leaf)
3868 					btrfs_mark_buffer_dirty(leaf);
3869 				free_extent_buffer(leaf);
3870 			}
3871 		} else {
3872 			btrfs_mark_buffer_dirty(leaf);
3873 		}
3874 	}
3875 	return ret;
3876 }
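
/*
 * A minimal deletion sketch (hypothetical caller, error handling
 * trimmed).  The slot normally comes from a prior btrfs_search_slot
 * done with ins_len == -1 and cow == 1:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret == 0)
 *		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 */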
3877 
3878 /*
3879  * search the tree again to find a leaf with lesser keys
3880  * returns 0 if it found something or 1 if there are no lesser leaves.
3881  * returns < 0 on io errors.
3882  *
3883  * This may release the path, and so you may lose any locks held at the
3884  * time you call it.
3885  */
3886 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3887 {
3888 	struct btrfs_key key;
3889 	struct btrfs_disk_key found_key;
3890 	int ret;
3891 
3892 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3893 
3894 	if (key.offset > 0)
3895 		key.offset--;
3896 	else if (key.type > 0)
3897 		key.type--;
3898 	else if (key.objectid > 0)
3899 		key.objectid--;
3900 	else
3901 		return 1;
3902 
3903 	btrfs_release_path(root, path);
3904 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3905 	if (ret < 0)
3906 		return ret;
3907 	btrfs_item_key(path->nodes[0], &found_key, 0);
3908 	ret = comp_keys(&found_key, &key);
3909 	if (ret < 0)
3910 		return 0;
3911 	return 1;
3912 }
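
/*
 * Backwards iteration is built on btrfs_prev_leaf the same way
 * btrfs_previous_item below does it (sketch, error handling trimmed):
 *
 *	while (1) {
 *		if (path->slots[0] == 0) {
 *			ret = btrfs_prev_leaf(root, path);
 *			if (ret)
 *				break;
 *		} else {
 *			path->slots[0]--;
 *		}
 *		leaf = path->nodes[0];
 *		if (path->slots[0] == btrfs_header_nritems(leaf))
 *			path->slots[0]--;
 *		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 *		(process the item here)
 *	}
 */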
3913 
3914 /*
3915  * A helper function to walk down the tree starting at min_key, looking
3916  * for nodes or leaves that are either in cache or have a minimum
3917  * transaction id.  This is used by the btree defrag code and tree logging.
3918  *
3919  * This does not cow, but it does stuff the starting key it finds back
3920  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3921  * key and get a writable path.
3922  *
3923  * This does lock as it descends, and path->keep_locks should be set
3924  * to 1 by the caller.
3925  *
3926  * This honors path->lowest_level to prevent descent past a given level
3927  * of the tree.
3928  *
3929  * min_trans indicates the oldest transaction that you are interested
3930  * in walking through.  Any nodes or leaves older than min_trans are
3931  * skipped over (without reading them).
3932  *
3933  * returns zero if something useful was found, < 0 on error and 1 if there
3934  * was nothing in the tree that matched the search criteria.
3935  */
3936 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3937 			 struct btrfs_key *max_key,
3938 			 struct btrfs_path *path, int cache_only,
3939 			 u64 min_trans)
3940 {
3941 	struct extent_buffer *cur;
3942 	struct btrfs_key found_key;
3943 	int slot;
3944 	int sret;
3945 	u32 nritems;
3946 	int level;
3947 	int ret = 1;
3948 
3949 	WARN_ON(!path->keep_locks);
3950 again:
3951 	cur = btrfs_lock_root_node(root);
3952 	level = btrfs_header_level(cur);
3953 	WARN_ON(path->nodes[level]);
3954 	path->nodes[level] = cur;
3955 	path->locks[level] = 1;
3956 
3957 	if (btrfs_header_generation(cur) < min_trans) {
3958 		ret = 1;
3959 		goto out;
3960 	}
3961 	while (1) {
3962 		nritems = btrfs_header_nritems(cur);
3963 		level = btrfs_header_level(cur);
3964 		sret = bin_search(cur, min_key, level, &slot);
3965 
3966 		/* at the lowest level, we're done; set up the path and exit */
3967 		if (level == path->lowest_level) {
3968 			if (slot >= nritems)
3969 				goto find_next_key;
3970 			ret = 0;
3971 			path->slots[level] = slot;
3972 			btrfs_item_key_to_cpu(cur, &found_key, slot);
3973 			goto out;
3974 		}
3975 		if (sret && slot > 0)
3976 			slot--;
3977 		/*
3978 		 * check this node pointer against the cache_only and
3979 		 * min_trans parameters.  If it isn't in cache or is too
3980 		 * old, skip to the next one.
3981 		 */
3982 		while (slot < nritems) {
3983 			u64 blockptr;
3984 			u64 gen;
3985 			struct extent_buffer *tmp;
3986 			struct btrfs_disk_key disk_key;
3987 
3988 			blockptr = btrfs_node_blockptr(cur, slot);
3989 			gen = btrfs_node_ptr_generation(cur, slot);
3990 			if (gen < min_trans) {
3991 				slot++;
3992 				continue;
3993 			}
3994 			if (!cache_only)
3995 				break;
3996 
3997 			if (max_key) {
3998 				btrfs_node_key(cur, &disk_key, slot);
3999 				if (comp_keys(&disk_key, max_key) >= 0) {
4000 					ret = 1;
4001 					goto out;
4002 				}
4003 			}
4004 
4005 			tmp = btrfs_find_tree_block(root, blockptr,
4006 					    btrfs_level_size(root, level - 1));
4007 
4008 			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
4009 				free_extent_buffer(tmp);
4010 				break;
4011 			}
4012 			if (tmp)
4013 				free_extent_buffer(tmp);
4014 			slot++;
4015 		}
4016 find_next_key:
4017 		/*
4018 		 * we didn't find a candidate key in this node, walk forward
4019 		 * and find another one
4020 		 */
4021 		if (slot >= nritems) {
4022 			path->slots[level] = slot;
4023 			btrfs_set_path_blocking(path);
4024 			sret = btrfs_find_next_key(root, path, min_key, level,
4025 						  cache_only, min_trans);
4026 			if (sret == 0) {
4027 				btrfs_release_path(root, path);
4028 				goto again;
4029 			} else {
4030 				goto out;
4031 			}
4032 		}
4033 		/* save the key we found so it can be handed back to the caller */
4034 		btrfs_node_key_to_cpu(cur, &found_key, slot);
4035 		path->slots[level] = slot;
4036 		if (level == path->lowest_level) {
4037 			ret = 0;
4038 			unlock_up(path, level, 1);
4039 			goto out;
4040 		}
4041 		btrfs_set_path_blocking(path);
4042 		cur = read_node_slot(root, cur, slot);
4043 
4044 		btrfs_tree_lock(cur);
4045 
4046 		path->locks[level - 1] = 1;
4047 		path->nodes[level - 1] = cur;
4048 		unlock_up(path, level, 1);
4049 		btrfs_clear_path_blocking(path, NULL);
4050 	}
4051 out:
4052 	if (ret == 0)
4053 		memcpy(min_key, &found_key, sizeof(found_key));
4054 	btrfs_set_path_blocking(path);
4055 	return ret;
4056 }
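
/*
 * Sketch of how the defrag code drives this (compare
 * btrfs_defrag_leaves; details trimmed and hypothetical):
 *
 *	memset(&min_key, 0, sizeof(min_key));
 *	path->keep_locks = 1;
 *	path->lowest_level = 1;
 *	ret = btrfs_search_forward(root, &min_key, NULL, path,
 *				   cache_only, min_trans);
 *	if (ret == 0) {
 *		min_key now holds the first key at or after the starting
 *		point that met the cache_only / min_trans filters
 *	}
 */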
4057 
4058 /*
4059  * this is similar to btrfs_next_leaf, but does not try to preserve
4060  * and fix up the path.  It looks for and returns the next key in the
4061  * tree based on the current path and the cache_only and min_trans
4062  * parameters.
4063  *
4064  * 0 is returned if another key is found, < 0 if there are any errors
4065  * and 1 is returned if there are no higher keys in the tree
4066  *
4067  * path->keep_locks should be set to 1 on the search made before
4068  * calling this function.
4069  */
4070 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4071 			struct btrfs_key *key, int lowest_level,
4072 			int cache_only, u64 min_trans)
4073 {
4074 	int level = lowest_level;
4075 	int slot;
4076 	struct extent_buffer *c;
4077 
4078 	WARN_ON(!path->keep_locks);
4079 	while (level < BTRFS_MAX_LEVEL) {
4080 		if (!path->nodes[level])
4081 			return 1;
4082 
4083 		slot = path->slots[level] + 1;
4084 		c = path->nodes[level];
4085 next:
4086 		if (slot >= btrfs_header_nritems(c)) {
4087 			level++;
4088 			if (level == BTRFS_MAX_LEVEL)
4089 				return 1;
4090 			continue;
4091 		}
4092 		if (level == 0)
4093 			btrfs_item_key_to_cpu(c, key, slot);
4094 		else {
4095 			u64 blockptr = btrfs_node_blockptr(c, slot);
4096 			u64 gen = btrfs_node_ptr_generation(c, slot);
4097 
4098 			if (cache_only) {
4099 				struct extent_buffer *cur;
4100 				cur = btrfs_find_tree_block(root, blockptr,
4101 					    btrfs_level_size(root, level - 1));
4102 				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
4103 					slot++;
4104 					if (cur)
4105 						free_extent_buffer(cur);
4106 					goto next;
4107 				}
4108 				free_extent_buffer(cur);
4109 			}
4110 			if (gen < min_trans) {
4111 				slot++;
4112 				goto next;
4113 			}
4114 			btrfs_node_key_to_cpu(c, key, slot);
4115 		}
4116 		return 0;
4117 	}
4118 	return 1;
4119 }
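
/*
 * Sketch (hypothetical): after a search made with path->keep_locks set
 * to 1, peek at the key that follows the current position without
 * disturbing the path:
 *
 *	ret = btrfs_find_next_key(root, path, &key, 0, cache_only,
 *				  min_trans);
 *	if (ret == 0)
 *		key now holds the next key at the leaf level
 */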
4120 
4121 /*
4122  * search the tree again to find a leaf with greater keys
4123  * returns 0 if it found something or 1 if there are no greater leaves.
4124  * returns < 0 on io errors.
4125  */
4126 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4127 {
4128 	int slot;
4129 	int level;
4130 	struct extent_buffer *c;
4131 	struct extent_buffer *next;
4132 	struct btrfs_key key;
4133 	u32 nritems;
4134 	int ret;
4135 	int old_spinning = path->leave_spinning;
4136 	int force_blocking = 0;
4137 
4138 	nritems = btrfs_header_nritems(path->nodes[0]);
4139 	if (nritems == 0)
4140 		return 1;
4141 
4142 	/*
4143 	 * we take the blocks in an order that upsets lockdep.  Using
4144 	 * blocking mode is the only way around it.
4145 	 */
4146 #ifdef CONFIG_DEBUG_LOCK_ALLOC
4147 	force_blocking = 1;
4148 #endif
4149 
4150 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4151 again:
4152 	level = 1;
4153 	next = NULL;
4154 	btrfs_release_path(root, path);
4155 
4156 	path->keep_locks = 1;
4157 
4158 	if (!force_blocking)
4159 		path->leave_spinning = 1;
4160 
4161 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4162 	path->keep_locks = 0;
4163 
4164 	if (ret < 0)
4165 		return ret;
4166 
4167 	nritems = btrfs_header_nritems(path->nodes[0]);
4168 	/*
4169 	 * by releasing the path above we dropped all our locks.  A balance
4170 	 * could have added more items next to the key that used to be
4171 	 * at the very end of the block.  So, check again here and
4172 	 * advance the path if there are now more items available.
4173 	 */
4174 	if (nritems > 0 && path->slots[0] < nritems - 1) {
4175 		path->slots[0]++;
4176 		ret = 0;
4177 		goto done;
4178 	}
4179 
4180 	while (level < BTRFS_MAX_LEVEL) {
4181 		if (!path->nodes[level]) {
4182 			ret = 1;
4183 			goto done;
4184 		}
4185 
4186 		slot = path->slots[level] + 1;
4187 		c = path->nodes[level];
4188 		if (slot >= btrfs_header_nritems(c)) {
4189 			level++;
4190 			if (level == BTRFS_MAX_LEVEL) {
4191 				ret = 1;
4192 				goto done;
4193 			}
4194 			continue;
4195 		}
4196 
4197 		if (next) {
4198 			btrfs_tree_unlock(next);
4199 			free_extent_buffer(next);
4200 		}
4201 
4202 		next = c;
4203 		ret = read_block_for_search(NULL, root, path, &next, level,
4204 					    slot, &key);
4205 		if (ret == -EAGAIN)
4206 			goto again;
4207 
4208 		if (!path->skip_locking) {
4209 			ret = btrfs_try_spin_lock(next);
4210 			if (!ret) {
4211 				btrfs_set_path_blocking(path);
4212 				btrfs_tree_lock(next);
4213 				if (!force_blocking)
4214 					btrfs_clear_path_blocking(path, next);
4215 			}
4216 			if (force_blocking)
4217 				btrfs_set_lock_blocking(next);
4218 		}
4219 		break;
4220 	}
4221 	path->slots[level] = slot;
4222 	while (1) {
4223 		level--;
4224 		c = path->nodes[level];
4225 		if (path->locks[level])
4226 			btrfs_tree_unlock(c);
4227 
4228 		free_extent_buffer(c);
4229 		path->nodes[level] = next;
4230 		path->slots[level] = 0;
4231 		if (!path->skip_locking)
4232 			path->locks[level] = 1;
4233 
4234 		if (!level)
4235 			break;
4236 
4237 		ret = read_block_for_search(NULL, root, path, &next, level,
4238 					    0, &key);
4239 		if (ret == -EAGAIN)
4240 			goto again;
4241 
4242 		if (!path->skip_locking) {
4243 			btrfs_assert_tree_locked(path->nodes[level]);
4244 			ret = btrfs_try_spin_lock(next);
4245 			if (!ret) {
4246 				btrfs_set_path_blocking(path);
4247 				btrfs_tree_lock(next);
4248 				if (!force_blocking)
4249 					btrfs_clear_path_blocking(path, next);
4250 			}
4251 			if (force_blocking)
4252 				btrfs_set_lock_blocking(next);
4253 		}
4254 	}
4255 	ret = 0;
4256 done:
4257 	unlock_up(path, 0, 1);
4258 	path->leave_spinning = old_spinning;
4259 	if (!old_spinning)
4260 		btrfs_set_path_blocking(path);
4261 
4262 	return ret;
4263 }
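
/*
 * The usual forward-scan loop built on top of btrfs_next_leaf
 * (hypothetical caller, error handling trimmed):
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	while (ret >= 0) {
 *		leaf = path->nodes[0];
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)
 *				break;
 *			continue;
 *		}
 *		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 *		(process the item, then)
 *		path->slots[0]++;
 *	}
 */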
4264 
4265 /*
4266  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4267  * searching until it gets past min_objectid or finds an item of 'type'
4268  *
4269  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4270  */
4271 int btrfs_previous_item(struct btrfs_root *root,
4272 			struct btrfs_path *path, u64 min_objectid,
4273 			int type)
4274 {
4275 	struct btrfs_key found_key;
4276 	struct extent_buffer *leaf;
4277 	u32 nritems;
4278 	int ret;
4279 
4280 	while (1) {
4281 		if (path->slots[0] == 0) {
4282 			btrfs_set_path_blocking(path);
4283 			ret = btrfs_prev_leaf(root, path);
4284 			if (ret != 0)
4285 				return ret;
4286 		} else {
4287 			path->slots[0]--;
4288 		}
4289 		leaf = path->nodes[0];
4290 		nritems = btrfs_header_nritems(leaf);
4291 		if (nritems == 0)
4292 			return 1;
4293 		if (path->slots[0] == nritems)
4294 			path->slots[0]--;
4295 
4296 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4297 		if (found_key.type == type)
4298 			return 0;
4299 		if (found_key.objectid < min_objectid)
4300 			break;
4301 		if (found_key.objectid == min_objectid &&
4302 		    found_key.type < type)
4303 			break;
4304 	}
4305 	return 1;
4306 }
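
/*
 * Typical use (hypothetical): position the path past the region of
 * interest with a max key, then step back to the newest matching item:
 *
 *	key.objectid = objectid;
 *	key.type = (u8)-1;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_item(root, path, objectid,
 *					  BTRFS_EXTENT_ITEM_KEY);
 *	if (ret == 0)
 *		path now points at the previous item of that type
 */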
4307