/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	if (path)
		path->reada = 1;
	return path;
}
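
/*
 * Illustrative sketch (not part of the original file): the typical
 * lifecycle of a btrfs_path.  The objectid value and the read-only
 * btrfs_search_slot() arguments below are placeholders for a plausible
 * caller, not code taken from this file.
 */
#if 0
static int example_lookup(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 256;		/* hypothetical inode number */
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	/* no transaction, ins_len == 0, cow == 0: read-only lookup */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

	/* on ret == 0 the item is at path->nodes[0], path->slots[0] */

	btrfs_free_path(path);		/* drops locks and refs, then frees */
	return ret;
}
#endif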

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (p->nodes[i] && p->locks[i])
			btrfs_set_lock_blocking(p->nodes[i]);
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held.
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held)
		btrfs_set_lock_blocking(held);
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i])
			btrfs_clear_lock_blocking(p->nodes[i]);
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking(held);
#endif
}
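
/*
 * Illustrative sketch of how the two helpers above are meant to be paired.
 * Spinning tree locks must not be held across anything that may schedule,
 * so callers flip the whole path to blocking first and back afterwards.
 * example_sleeping_operation() is a placeholder.
 */
#if 0
	btrfs_set_path_blocking(p);		/* safe to sleep after this */
	example_sleeping_operation();		/* e.g. a disk read */
	btrfs_clear_path_blocking(p, NULL);	/* back to spinning locks */
#endif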

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cowonly roots (everything that is not a reference-counted cow subvolume)
 * just get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
	struct btrfs_root *new_root;

	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
	if (IS_ERR(cow)) {
		kfree(new_root);
		return PTR_ERR(cow);
	}

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
	kfree(new_root);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	u64 parent_start;
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
	int unlock_orig = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	if (parent)
		parent_start = parent->start;
	else
		parent_start = 0;

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);

	cow = btrfs_alloc_free_block(trans, root, buf->len,
				     parent_start, root->root_key.objectid,
				     trans->transid, level,
				     search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, root->root_key.objectid);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (btrfs_header_generation(buf) != trans->transid) {
		u32 nr_extents;
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
		if (ret)
			return ret;

		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
		WARN_ON(ret);
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/*
		 * There are only two places that can drop reference to
		 * tree blocks owned by living reloc trees, one is here,
		 * the other place is btrfs_drop_subtree. In both places,
		 * we check reference count while tree block is locked.
		 * Furthermore, if reference count is one, it won't get
		 * increased by someone else.
		 */
		u32 refs;
		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
					      buf->len, &refs);
		BUG_ON(ret);
		if (refs == 1) {
			ret = btrfs_update_ref(trans, root, buf, cow,
					       0, nritems);
			clean_tree_block(trans, root, buf);
		} else {
			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
		}
		BUG_ON(ret);
	} else {
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		if (ret)
			return ret;
		clean_tree_block(trans, root, buf);
	}

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
		WARN_ON(ret);
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);

		spin_lock(&root->node_lock);
		root->node = cow;
		extent_buffer_get(cow);
		spin_unlock(&root->node_lock);

		if (buf != root->commit_root) {
			btrfs_free_extent(trans, root, buf->start,
					  buf->len, buf->start,
					  root->root_key.objectid,
					  btrfs_header_generation(buf),
					  level, 1);
		}
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		WARN_ON(trans->transid == 0);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		WARN_ON(btrfs_header_generation(parent) != trans->transid);
		btrfs_free_extent(trans, root, buf->start, buf->len,
				  parent_start, btrfs_header_owner(parent),
				  btrfs_header_generation(parent), level, 1);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
		WARN_ON(1);
	}

	if (btrfs_header_generation(buf) == trans->transid &&
	    btrfs_header_owner(buf) == root->root_key.objectid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);
	return ret;
}
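
/*
 * Worked example for the search_start computation above (illustrative):
 * the mask clears the low 30 bits, rounding buf->start down to the
 * containing 1GB boundary.  For buf->start == 0x4c000000 (1.1875GB):
 *
 *   0x4c000000 & ~((u64)(1024 * 1024 * 1024) - 1)
 * = 0x4c000000 & ~0x3fffffff
 * = 0x40000000 (exactly 1GB)
 *
 * so the allocator is hinted to place the cow copy near the start of the
 * same 1GB region as the original block.
 */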

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
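
/*
 * Worked example for close_blocks() (illustrative): with a 4K blocksize,
 * blocknr == 0 and other == 20480 gives a gap of
 * 20480 - (0 + 4096) = 16384, which is below the 32768 threshold, so the
 * blocks count as close; a gap of 32K or more does not.
 */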

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	if (k1.objectid > k2->objectid)
		return 1;
	if (k1.objectid < k2->objectid)
		return -1;
	if (k1.type > k2->type)
		return 1;
	if (k1.type < k2->type)
		return -1;
	if (k1.offset > k2->offset)
		return 1;
	if (k1.offset < k2->offset)
		return -1;
	return 0;
}

/*
 * same as comp_keys only with two btrfs_key's
 */
static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
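
/*
 * Illustrative sketch (not from the original file): keys sort by objectid
 * first, then type, then offset, so the two comparators above agree with
 * the ordering shown here.  The key values are placeholders.
 */
#if 0
	struct btrfs_key a = { .objectid = 5, .type = 1, .offset = 100 };
	struct btrfs_key b = { .objectid = 5, .type = 2, .offset = 0 };

	/* same objectid, a.type < b.type: a sorts before b */
	WARN_ON(comp_cpu_keys(&a, &b) != -1);
	WARN_ON(comp_cpu_keys(&b, &a) != 1);
#endif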

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN_ON(1);
	if (trans->transid != root->fs_info->generation)
		WARN_ON(1);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (cache_only) {
				free_extent_buffer(cur);
				continue;
			}
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
			} else if (!uptodate) {
				btrfs_read_buffer(cur, gen);
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * This returns the address of the start of the last item,
 * which is where the leaf data stack stops.
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}
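
/*
 * Illustrative layout note for leaf_data_end(): the item headers grow from
 * the front of the leaf while their data grows back from the end.  In a
 * leaf with two items whose data sizes are 100 and 50 bytes:
 *
 *   item 0 data offset = BTRFS_LEAF_DATA_SIZE(root) - 100
 *   item 1 data offset = BTRFS_LEAF_DATA_SIZE(root) - 150  <- leaf_data_end()
 *
 * so the free space sits between the item header array and the value
 * returned here.
 */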

/*
 * extra debugging checks to make sure all the items in a node are
 * well formed and in the proper order
 */
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
{
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
	int parent_slot;
	int slot;
	struct btrfs_key cpukey;
	u32 nritems = btrfs_header_nritems(node);

	if (path->nodes[level + 1])
		parent = path->nodes[level + 1];

	slot = path->slots[level];
	BUG_ON(nritems == 0);
	if (parent) {
		parent_slot = path->slots[level + 1];
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
			      sizeof(struct btrfs_disk_key)));
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
		       btrfs_header_bytenr(node));
	}
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
	if (slot != 0) {
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
	}
	if (slot < nritems - 1) {
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
	}
	return 0;
}

/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
{
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
	int parent_slot;
	struct btrfs_key cpukey;
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];

	u32 nritems = btrfs_header_nritems(leaf);

	if (path->nodes[level + 1])
		parent = path->nodes[level + 1];

	if (nritems == 0)
		return 0;

	if (parent) {
		parent_slot = path->slots[level + 1];
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);

		BUG_ON(memcmp(&parent_key, &leaf_key,
		       sizeof(struct btrfs_disk_key)));
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
		       btrfs_header_bytenr(leaf));
	}
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
			printk(KERN_CRIT "slot %d offset bad key\n", slot);
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
			printk(KERN_CRIT "slot %d offset bad\n", slot);
			BUG_ON(1);
		}
	}
	if (slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
			printk(KERN_CRIT "slot %d offset bad\n", slot);
			BUG_ON(1);
		}
	}
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
	return 0;
}
static noinline int check_block(struct btrfs_root *root,
				struct btrfs_path *path, int level)
{
	/* the sanity checks below are currently disabled */
	return 0;
	if (level == 0)
		return check_leaf(root, path, level);
	return check_node(root, path, level);
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
			if (map_token) {
				unmap_extent_buffer(eb, map_token, KM_USER0);
				map_token = NULL;
			}

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
			return 0;
		}
	}
	*slot = low;
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
	return 1;
}
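
/*
 * Illustrative sketch (not from the original file) of the return convention
 * used by generic_bin_search() above, shown on a plain u64 array instead of
 * an extent_buffer: return 0 with *slot set on an exact match, otherwise
 * return 1 with *slot at the insertion point, which may equal max.
 */
#if 0
static int example_bin_search(u64 *keys, int max, u64 key, int *slot)
{
	int low = 0;
	int high = max;

	while (low < high) {
		int mid = (low + high) / 2;

		if (keys[mid] < key)
			low = mid + 1;		/* key is in upper half */
		else if (keys[mid] > key)
			high = mid;		/* key is in lower half */
		else {
			*slot = mid;		/* exact match */
			return 0;
		}
	}
	*slot = low;				/* insertion point */
	return 1;
}
#endif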

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	} else {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
	}
	return -1;
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
				   struct extent_buffer *parent, int slot)
{
	int level = btrfs_header_level(parent);
	if (slot < 0)
		return NULL;
	if (slot >= btrfs_header_nritems(parent))
		return NULL;

	BUG_ON(level == 0);

	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	int err_on_enospc = 0;
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(!path->locks[level]);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
		parent = path->nodes[level + 1];
	pslot = path->slots[level + 1];

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		BUG_ON(!child);
		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		BUG_ON(ret);

		spin_lock(&root->node_lock);
		root->node = child;
		spin_unlock(&root->node_lock);

		ret = btrfs_update_extent_ref(trans, root, child->start,
					      child->len,
					      mid->start, child->start,
					      root->root_key.objectid,
					      trans->transid, level - 1);
		BUG_ON(ret);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);
		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
					mid->start, root->root_key.objectid,
					btrfs_header_generation(mid),
					level, 1);
		/* once for the root ptr */
		free_extent_buffer(mid);
		return ret;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	if (trans->transaction->delayed_refs.flushing &&
	    btrfs_header_nritems(mid) > 2)
		return 0;

	if (btrfs_header_nritems(mid) < 2)
		err_on_enospc = 1;

	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
		if (btrfs_header_nritems(mid) < 2)
			err_on_enospc = 1;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			u64 bytenr = right->start;
			u64 generation = btrfs_header_generation(parent);
			u32 blocksize = right->len;

			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			free_extent_buffer(right);
			right = NULL;
			wret = del_ptr(trans, root, path, level + 1, pslot +
				       1);
			if (wret)
				ret = wret;
			wret = btrfs_free_extent(trans, root, bytenr,
						 blocksize, parent->start,
						 btrfs_header_owner(parent),
						 generation, level, 1);
			if (wret)
				ret = wret;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		BUG_ON(!left);
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* we've managed to empty the middle node, drop it */
		u64 root_gen = btrfs_header_generation(parent);
		u64 bytenr = mid->start;
		u32 blocksize = mid->len;

		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		free_extent_buffer(mid);
		mid = NULL;
		wret = del_ptr(trans, root, path, level + 1, pslot);
		if (wret)
			ret = wret;
		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
					 parent->start,
					 btrfs_header_owner(parent),
					 root_gen, level, 1);
		if (wret)
			ret = wret;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	check_block(root, path, level);
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
		parent = path->nodes[level + 1];
	pslot = path->slots[level + 1];

	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_root *root,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	int direction = path->reada;
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	search = btrfs_node_blockptr(node, slot);
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
	if (eb) {
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;
	while (1) {
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			readahead_tree_block(root, search, blocksize,
				     btrfs_node_ptr_generation(node, nr));
			nread += blocksize;
		}
		nscan++;
		if ((nread > 65536 || nscan > 32))
			break;
	}
}

/*
 * returns -EAGAIN if it had to drop the path, or zero if everything was in
 * cache
 */
static noinline int reada_for_balance(struct btrfs_root *root,
				      struct btrfs_path *path, int level)
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;
	int ret = 0;
	int blocksize;

	parent = path->nodes[level + 1];
	if (!parent)
		return 0;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];
	blocksize = btrfs_level_size(root, level);

	if (slot > 0) {
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
		eb = btrfs_find_tree_block(root, block1, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block1 = 0;
		free_extent_buffer(eb);
	}
	if (slot + 1 < nritems) {
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
		eb = btrfs_find_tree_block(root, block2, blocksize);
		if (eb && btrfs_buffer_uptodate(eb, gen))
			block2 = 0;
		free_extent_buffer(eb);
	}
	if (block1 || block2) {
		ret = -EAGAIN;

		/* release the whole path */
		btrfs_release_path(root, path);

		/* read the blocks */
		if (block1)
			readahead_tree_block(root, block1, blocksize, 0);
		if (block2)
			readahead_tree_block(root, block2, blocksize, 0);

		if (block1) {
			eb = read_tree_block(root, block1, blocksize, 0);
			free_extent_buffer(eb);
		}
		if (block2) {
			eb = read_tree_block(root, block2, blocksize, 0);
			free_extent_buffer(eb);
		}
	}
	return ret;
}


/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
 * if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
{
	int i;
	int skip_level = level;
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}
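
/*
 * Illustrative example for unlock_up(): if the path runs through slot 0 of
 * a level-1 node, a key change in the leaf below may have to be propagated
 * into that node (see fixup_low_keys later in this file), so its lock is
 * kept; a path through a middle slot can be unlocked right away.
 */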

/*
 * This releases any locks held in the path starting at level and
 * going all the way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few
 * corner cases, such as COW of the block at slot zero in the node.  This
 * ignores those rules, and it should only be called when there are no
 * more updates to be done higher up in the tree.
 */
noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks || path->lowest_level)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock(path->nodes[i]);
		path->locks[i] = 0;
	}
}

/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key)
{
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
		/*
		 * we found an up to date block without sleeping, return
		 * right away
		 */
		*eb_ret = tmp;
		return 0;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (tmp)
		free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	btrfs_release_path(NULL, p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, gen);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len)
{
	int ret;
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
		int sret;

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(NULL, p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
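
/*
 * Illustrative sketch (not from the original file): how a caller is
 * expected to react to the helper above, mirroring the loop in
 * btrfs_search_slot() below.  The labels are placeholders.
 */
#if 0
	ret = setup_nodes_for_search(trans, root, p, b, level, ins_len);
	if (ret == -EAGAIN)
		goto again;	/* path was dropped, restart from the root */
	else if (ret)
		goto done;	/* hard error */
	b = p->nodes[level];	/* the node may have been replaced */
#endif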

/*
 * look for key in the tree.  path is filled in with nodes along the way.
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	if (ins_len < 0)
		lowest_unlock = 2;

again:
	if (p->skip_locking)
		b = btrfs_root_node(root);
	else
		b = btrfs_lock_root_node(root);

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		if (cow) {
			int wret;

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (btrfs_header_generation(b) == trans->transid &&
			    btrfs_header_owner(b) == root->root_key.objectid &&
			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
				goto cow_done;
			}
			btrfs_set_path_blocking(p);

			wret = btrfs_cow_block(trans, root, b,
					       p->nodes[level + 1],
					       p->slots[level + 1], &b);
			if (wret) {
				free_extent_buffer(b);
				ret = wret;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);
		if (level != btrfs_header_level(b))
			WARN_ON(1);
		level = btrfs_header_level(b);

		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

		btrfs_clear_path_blocking(p, NULL);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		ret = check_block(root, p, level);
		if (ret) {
			ret = -1;
			goto done;
		}

		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			if (ret && slot > 0)
				slot -= 1;
			p->slots[level] = slot;
			ret = setup_nodes_for_search(trans, root, p, b, level,
						     ins_len);
			if (ret == -EAGAIN)
				goto again;
			else if (ret)
				goto done;
			b = p->nodes[level];
			slot = p->slots[level];

			unlock_up(p, level, lowest_unlock);

			/* this is only true while dropping a snapshot */
			if (level == lowest_level) {
				ret = 0;
				goto done;
			}

			ret = read_block_for_search(trans, root, p,
						    &b, level, slot, key);
			if (ret == -EAGAIN)
				goto again;

			if (ret == -EIO)
				goto done;

			if (!p->skip_locking) {
				int lret;

				btrfs_clear_path_blocking(p, NULL);
				lret = btrfs_try_spin_lock(b);

				if (!lret) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
					btrfs_clear_path_blocking(p, b);
				}
			}
		} else {
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				int sret;

				btrfs_set_path_blocking(p);
				sret = split_leaf(trans, root, key,
						      p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL);

				BUG_ON(sret > 0);
				if (sret) {
					ret = sret;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(root, p);
	return ret;
}
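
/*
 * Illustrative sketch (not from the original file): interpreting
 * btrfs_search_slot() results.  A return of 1 leaves path->slots[0] at the
 * slot where the key would be inserted, which is how callers implement
 * find-or-insert logic.  handle_existing_item() and insert_new_item() are
 * placeholders.
 */
#if 0
	ret = btrfs_search_slot(trans, root, &key, path, ins_len, 1);
	if (ret < 0)
		goto error;			/* hard search failure */
	if (ret == 0)
		handle_existing_item(path);	/* exact match at slots[0] */
	else
		insert_new_item(path);		/* key absent; insert here */
#endif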

int btrfs_merge_path(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_key *node_keys,
		     u64 *nodes, int lowest_level)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_key key;
	u64 bytenr;
	u64 generation;
	u32 blocksize;
	int level;
	int slot;
	int key_match;
	int ret;

	eb = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
	BUG_ON(ret);

	btrfs_set_lock_blocking(eb);

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		if (level == 0 || level <= lowest_level)
			break;

		ret = bin_search(parent, &node_keys[lowest_level], level,
				 &slot);
		if (ret && slot > 0)
			slot--;

		bytenr = btrfs_node_blockptr(parent, slot);
		if (nodes[level - 1] == bytenr)
			break;

		blocksize = btrfs_level_size(root, level - 1);
		generation = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(eb, &key, slot);
		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));

		if (generation == trans->transid) {
			eb = read_tree_block(root, bytenr, blocksize,
					     generation);
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
		}

		/*
		 * if node keys match and the node pointer hasn't been
		 * modified in the running transaction, we can merge the
		 * path.  for blocks owned by reloc trees, the node pointer
		 * check is skipped; this is because these blocks are fully
		 * controlled by the space balance code, no one else can
		 * modify them.
		 */
		if (!nodes[level - 1] || !key_match ||
		    (generation == trans->transid &&
		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
			if (level == 1 || level == lowest_level + 1) {
				if (generation == trans->transid) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
				}
				break;
			}

			if (generation != trans->transid) {
				eb = read_tree_block(root, bytenr, blocksize,
						generation);
				btrfs_tree_lock(eb);
				btrfs_set_lock_blocking(eb);
			}

			ret = btrfs_cow_block(trans, root, eb, parent, slot,
					      &eb);
			BUG_ON(ret);

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				if (!nodes[level - 1]) {
					nodes[level - 1] = eb->start;
					memcpy(&node_keys[level - 1], &key,
					       sizeof(node_keys[0]));
				} else {
					WARN_ON(1);
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			parent = eb;
			continue;
		}

		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
		btrfs_mark_buffer_dirty(parent);

		ret = btrfs_inc_extent_ref(trans, root,
					nodes[level - 1],
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1);
		BUG_ON(ret);

		/*
		 * If the block was created in the running transaction,
		 * it's possible this is the last reference to it, so we
		 * should drop the subtree.
		 */
		if (generation == trans->transid) {
			ret = btrfs_drop_subtree(trans, root, eb, parent);
			BUG_ON(ret);
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
		} else {
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1, 1);
			BUG_ON(ret);
		}
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return 0;
}
1898 
1899 /*
1900  * adjust the pointers going up the tree, starting at level
1901  * making sure the right key of each node points to 'key'.
1902  * This is used after shifting pointers to the left, so it stops
1903  * fixing up pointers when a given leaf/node is not in slot 0 of the
1904  * higher levels
1905  *
1906  * If this fails to write a tree block, it returns -1, but continues
1907  * fixing up the blocks in ram so the tree is consistent.
1908  */
1909 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1910 			  struct btrfs_root *root, struct btrfs_path *path,
1911 			  struct btrfs_disk_key *key, int level)
1912 {
1913 	int i;
1914 	int ret = 0;
1915 	struct extent_buffer *t;
1916 
1917 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1918 		int tslot = path->slots[i];
1919 		if (!path->nodes[i])
1920 			break;
1921 		t = path->nodes[i];
1922 		btrfs_set_node_key(t, key, tslot);
1923 		btrfs_mark_buffer_dirty(path->nodes[i]);
1924 		if (tslot != 0)
1925 			break;
1926 	}
1927 	return ret;
1928 }
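
/*
 * Illustrative sketch (not btrfs API): the walk above reduces to
 * "rewrite the separator key at each level, and keep climbing only
 * while we sit in slot 0".  Modelled on plain int arrays:
 */
static void fixup_low_keys_model(int *keys[], int slots[], int nlevels,
				 int level, int new_key)
{
	int i;

	for (i = level; i < nlevels; i++) {
		if (!keys[i])
			break;
		keys[i][slots[i]] = new_key;
		/* only slot 0 is visible in the parent's separator */
		if (slots[i] != 0)
			break;
	}
}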
1929 
1930 /*
1931  * update item key.
1932  *
1933  * This function isn't completely safe. It's the caller's responsibility
1934  * that the new key won't break the order
1935  */
1936 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1937 			    struct btrfs_root *root, struct btrfs_path *path,
1938 			    struct btrfs_key *new_key)
1939 {
1940 	struct btrfs_disk_key disk_key;
1941 	struct extent_buffer *eb;
1942 	int slot;
1943 
1944 	eb = path->nodes[0];
1945 	slot = path->slots[0];
1946 	if (slot > 0) {
1947 		btrfs_item_key(eb, &disk_key, slot - 1);
1948 		if (comp_keys(&disk_key, new_key) >= 0)
1949 			return -1;
1950 	}
1951 	if (slot < btrfs_header_nritems(eb) - 1) {
1952 		btrfs_item_key(eb, &disk_key, slot + 1);
1953 		if (comp_keys(&disk_key, new_key) <= 0)
1954 			return -1;
1955 	}
1956 
1957 	btrfs_cpu_key_to_disk(&disk_key, new_key);
1958 	btrfs_set_item_key(eb, &disk_key, slot);
1959 	btrfs_mark_buffer_dirty(eb);
1960 	if (slot == 0)
1961 		fixup_low_keys(trans, root, path, &disk_key, 1);
1962 	return 0;
1963 }
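
/*
 * The "safe" test above only compares against the two in-leaf
 * neighbours.  A user-space sketch of the same ordering check, with
 * plain ints standing in for btrfs keys (illustrative only):
 */
static int key_update_is_safe(const int *keys, int nritems, int slot,
			      int new_key)
{
	if (slot > 0 && keys[slot - 1] >= new_key)
		return 0;	/* would not sort after the left neighbour */
	if (slot < nritems - 1 && keys[slot + 1] <= new_key)
		return 0;	/* would not sort before the right neighbour */
	return 1;
}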
1964 
1965 /*
1966  * try to push data from one node into the next node left in the
1967  * tree.
1968  *
1969  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1970  * error, and > 0 if there was no room in the left hand block.
1971  */
1972 static int push_node_left(struct btrfs_trans_handle *trans,
1973 			  struct btrfs_root *root, struct extent_buffer *dst,
1974 			  struct extent_buffer *src, int empty)
1975 {
1976 	int push_items = 0;
1977 	int src_nritems;
1978 	int dst_nritems;
1979 	int ret = 0;
1980 
1981 	src_nritems = btrfs_header_nritems(src);
1982 	dst_nritems = btrfs_header_nritems(dst);
1983 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1984 	WARN_ON(btrfs_header_generation(src) != trans->transid);
1985 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1986 
1987 	if (!empty && src_nritems <= 8)
1988 		return 1;
1989 
1990 	if (push_items <= 0)
1991 		return 1;
1992 
1993 	if (empty) {
1994 		push_items = min(src_nritems, push_items);
1995 		if (push_items < src_nritems) {
1996 			/* leave at least 8 pointers in the node if
1997 			 * we aren't going to empty it
1998 			 */
1999 			if (src_nritems - push_items < 8) {
2000 				if (push_items <= 8)
2001 					return 1;
2002 				push_items -= 8;
2003 			}
2004 		}
2005 	} else
2006 		push_items = min(src_nritems - 8, push_items);
2007 
2008 	copy_extent_buffer(dst, src,
2009 			   btrfs_node_key_ptr_offset(dst_nritems),
2010 			   btrfs_node_key_ptr_offset(0),
2011 			   push_items * sizeof(struct btrfs_key_ptr));
2012 
2013 	if (push_items < src_nritems) {
2014 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2015 				      btrfs_node_key_ptr_offset(push_items),
2016 				      (src_nritems - push_items) *
2017 				      sizeof(struct btrfs_key_ptr));
2018 	}
2019 	btrfs_set_header_nritems(src, src_nritems - push_items);
2020 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2021 	btrfs_mark_buffer_dirty(src);
2022 	btrfs_mark_buffer_dirty(dst);
2023 
2024 	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
2025 	BUG_ON(ret);
2026 
2027 	return ret;
2028 }
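
/*
 * The clamping above, pulled out as a stand-alone sketch.  dst_free is
 * the number of empty pointer slots in the destination; a return of 0
 * here corresponds to the "no room / not worth it" (return 1) cases
 * above.  Function name and shape are illustrative, not btrfs API:
 */
static int compute_push_left(int src_nritems, int dst_free, int empty)
{
	int push_items = dst_free;

	if (push_items <= 0 || (!empty && src_nritems <= 8))
		return 0;

	if (empty) {
		if (push_items > src_nritems)
			push_items = src_nritems;
		if (push_items < src_nritems &&
		    src_nritems - push_items < 8) {
			/* keep at least 8 pointers in src if not emptying it */
			if (push_items <= 8)
				return 0;
			push_items -= 8;
		}
	} else if (push_items > src_nritems - 8) {
		push_items = src_nritems - 8;
	}
	return push_items;
}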
2029 
2030 /*
2031  * try to push data from one node into the next node right in the
2032  * tree.
2033  *
2034  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2035  * error, and > 0 if there was no room in the right hand block.
2036  *
2037  * this will only push up to 1/2 the contents of the left node over
2038  */
2039 static int balance_node_right(struct btrfs_trans_handle *trans,
2040 			      struct btrfs_root *root,
2041 			      struct extent_buffer *dst,
2042 			      struct extent_buffer *src)
2043 {
2044 	int push_items = 0;
2045 	int max_push;
2046 	int src_nritems;
2047 	int dst_nritems;
2048 	int ret = 0;
2049 
2050 	WARN_ON(btrfs_header_generation(src) != trans->transid);
2051 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
2052 
2053 	src_nritems = btrfs_header_nritems(src);
2054 	dst_nritems = btrfs_header_nritems(dst);
2055 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2056 	if (push_items <= 0)
2057 		return 1;
2058 
2059 	if (src_nritems < 4)
2060 		return 1;
2061 
2062 	max_push = src_nritems / 2 + 1;
2063 	/* don't try to empty the node */
2064 	if (max_push >= src_nritems)
2065 		return 1;
2066 
2067 	if (max_push < push_items)
2068 		push_items = max_push;
2069 
2070 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2071 				      btrfs_node_key_ptr_offset(0),
2072 				      (dst_nritems) *
2073 				      sizeof(struct btrfs_key_ptr));
2074 
2075 	copy_extent_buffer(dst, src,
2076 			   btrfs_node_key_ptr_offset(0),
2077 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
2078 			   push_items * sizeof(struct btrfs_key_ptr));
2079 
2080 	btrfs_set_header_nritems(src, src_nritems - push_items);
2081 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2082 
2083 	btrfs_mark_buffer_dirty(src);
2084 	btrfs_mark_buffer_dirty(dst);
2085 
2086 	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
2087 	BUG_ON(ret);
2088 
2089 	return ret;
2090 }
2091 
2092 /*
2093  * helper function to insert a new root level in the tree.
2094  * A new node is allocated, and a single item is inserted to
2095  * point to the existing root
2096  *
2097  * returns zero on success or < 0 on failure.
2098  */
2099 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2100 			   struct btrfs_root *root,
2101 			   struct btrfs_path *path, int level)
2102 {
2103 	u64 lower_gen;
2104 	struct extent_buffer *lower;
2105 	struct extent_buffer *c;
2106 	struct extent_buffer *old;
2107 	struct btrfs_disk_key lower_key;
2108 	int ret;
2109 
2110 	BUG_ON(path->nodes[level]);
2111 	BUG_ON(path->nodes[level-1] != root->node);
2112 
2113 	lower = path->nodes[level-1];
2114 	if (level == 1)
2115 		btrfs_item_key(lower, &lower_key, 0);
2116 	else
2117 		btrfs_node_key(lower, &lower_key, 0);
2118 
2119 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2120 				   root->root_key.objectid, trans->transid,
2121 				   level, root->node->start, 0);
2122 	if (IS_ERR(c))
2123 		return PTR_ERR(c);
2124 
2125 	memset_extent_buffer(c, 0, 0, root->nodesize);
2126 	btrfs_set_header_nritems(c, 1);
2127 	btrfs_set_header_level(c, level);
2128 	btrfs_set_header_bytenr(c, c->start);
2129 	btrfs_set_header_generation(c, trans->transid);
2130 	btrfs_set_header_owner(c, root->root_key.objectid);
2131 
2132 	write_extent_buffer(c, root->fs_info->fsid,
2133 			    (unsigned long)btrfs_header_fsid(c),
2134 			    BTRFS_FSID_SIZE);
2135 
2136 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
2137 			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
2138 			    BTRFS_UUID_SIZE);
2139 
2140 	btrfs_set_node_key(c, &lower_key, 0);
2141 	btrfs_set_node_blockptr(c, 0, lower->start);
2142 	lower_gen = btrfs_header_generation(lower);
2143 	WARN_ON(lower_gen != trans->transid);
2144 
2145 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
2146 
2147 	btrfs_mark_buffer_dirty(c);
2148 
2149 	spin_lock(&root->node_lock);
2150 	old = root->node;
2151 	root->node = c;
2152 	spin_unlock(&root->node_lock);
2153 
2154 	ret = btrfs_update_extent_ref(trans, root, lower->start,
2155 				      lower->len, lower->start, c->start,
2156 				      root->root_key.objectid,
2157 				      trans->transid, level - 1);
2158 	BUG_ON(ret);
2159 
2160 	/* the super has an extra ref to root->node */
2161 	free_extent_buffer(old);
2162 
2163 	add_root_to_dirty_list(root);
2164 	extent_buffer_get(c);
2165 	path->nodes[level] = c;
2166 	path->locks[level] = 1;
2167 	path->slots[level] = 0;
2168 	return 0;
2169 }
2170 
2171 /*
2172  * worker function to insert a single pointer in a node.
2173  * the node should have enough room for the pointer already
2174  *
2175  * slot and level indicate where you want the key to go, and
2176  * bytenr is the block the key points to.
2177  *
2178  * returns zero on success and < 0 on any error
2179  */
2180 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2181 		      *root, struct btrfs_path *path, struct btrfs_disk_key
2182 		      *key, u64 bytenr, int slot, int level)
2183 {
2184 	struct extent_buffer *lower;
2185 	int nritems;
2186 
2187 	BUG_ON(!path->nodes[level]);
2188 	lower = path->nodes[level];
2189 	nritems = btrfs_header_nritems(lower);
2190 	BUG_ON(slot > nritems);
2191 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2192 		BUG();
2193 	if (slot != nritems) {
2194 		memmove_extent_buffer(lower,
2195 			      btrfs_node_key_ptr_offset(slot + 1),
2196 			      btrfs_node_key_ptr_offset(slot),
2197 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
2198 	}
2199 	btrfs_set_node_key(lower, key, slot);
2200 	btrfs_set_node_blockptr(lower, slot, bytenr);
2201 	WARN_ON(trans->transid == 0);
2202 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2203 	btrfs_set_header_nritems(lower, nritems + 1);
2204 	btrfs_mark_buffer_dirty(lower);
2205 	return 0;
2206 }
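
/*
 * insert_ptr() is the classic shift-and-store into a sorted array.
 * The same move in user-space C on a plain array, with memmove() as in
 * <string.h> (illustrative only):
 */
static void array_insert(int *arr, int *nritems, int slot, int value)
{
	if (slot != *nritems)
		memmove(&arr[slot + 1], &arr[slot],
			(*nritems - slot) * sizeof(arr[0]));
	arr[slot] = value;
	(*nritems)++;
}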
2207 
2208 /*
2209  * split the node at the specified level in path in two.
2210  * The path is corrected to point to the appropriate node after the split
2211  *
2212  * Before splitting this tries to make some room in the node by pushing
2213  * left and right, if either one works, it returns right away.
2214  *
2215  * returns 0 on success and < 0 on failure
2216  */
2217 static noinline int split_node(struct btrfs_trans_handle *trans,
2218 			       struct btrfs_root *root,
2219 			       struct btrfs_path *path, int level)
2220 {
2221 	struct extent_buffer *c;
2222 	struct extent_buffer *split;
2223 	struct btrfs_disk_key disk_key;
2224 	int mid;
2225 	int ret;
2226 	int wret;
2227 	u32 c_nritems;
2228 
2229 	c = path->nodes[level];
2230 	WARN_ON(btrfs_header_generation(c) != trans->transid);
2231 	if (c == root->node) {
2232 		/* trying to split the root, let's make a new one */
2233 		ret = insert_new_root(trans, root, path, level + 1);
2234 		if (ret)
2235 			return ret;
2236 	} else if (!trans->transaction->delayed_refs.flushing) {
2237 		ret = push_nodes_for_insert(trans, root, path, level);
2238 		c = path->nodes[level];
2239 		if (!ret && btrfs_header_nritems(c) <
2240 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2241 			return 0;
2242 		if (ret < 0)
2243 			return ret;
2244 	}
2245 
2246 	c_nritems = btrfs_header_nritems(c);
2247 
2248 	split = btrfs_alloc_free_block(trans, root, root->nodesize,
2249 					path->nodes[level + 1]->start,
2250 					root->root_key.objectid,
2251 					trans->transid, level, c->start, 0);
2252 	if (IS_ERR(split))
2253 		return PTR_ERR(split);
2254 
2255 	btrfs_set_header_flags(split, btrfs_header_flags(c));
2256 	btrfs_set_header_level(split, btrfs_header_level(c));
2257 	btrfs_set_header_bytenr(split, split->start);
2258 	btrfs_set_header_generation(split, trans->transid);
2259 	btrfs_set_header_owner(split, root->root_key.objectid);
2260 	btrfs_set_header_flags(split, 0);
2261 	write_extent_buffer(split, root->fs_info->fsid,
2262 			    (unsigned long)btrfs_header_fsid(split),
2263 			    BTRFS_FSID_SIZE);
2264 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2265 			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
2266 			    BTRFS_UUID_SIZE);
2267 
2268 	mid = (c_nritems + 1) / 2;
2269 
2270 	copy_extent_buffer(split, c,
2271 			   btrfs_node_key_ptr_offset(0),
2272 			   btrfs_node_key_ptr_offset(mid),
2273 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2274 	btrfs_set_header_nritems(split, c_nritems - mid);
2275 	btrfs_set_header_nritems(c, mid);
2276 	ret = 0;
2277 
2278 	btrfs_mark_buffer_dirty(c);
2279 	btrfs_mark_buffer_dirty(split);
2280 
2281 	btrfs_node_key(split, &disk_key, 0);
2282 	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2283 			  path->slots[level + 1] + 1,
2284 			  level + 1);
2285 	if (wret)
2286 		ret = wret;
2287 
2288 	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2289 	BUG_ON(ret);
2290 
2291 	if (path->slots[level] >= mid) {
2292 		path->slots[level] -= mid;
2293 		btrfs_tree_unlock(c);
2294 		free_extent_buffer(c);
2295 		path->nodes[level] = split;
2296 		path->slots[level + 1] += 1;
2297 	} else {
2298 		btrfs_tree_unlock(split);
2299 		free_extent_buffer(split);
2300 	}
2301 	return ret;
2302 }
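
/*
 * The split point above is mid = (c_nritems + 1) / 2, so the original
 * node keeps the larger half: n = 7 gives mid = 4, slots 0-3 stay and
 * the 3 pointers in slots 4-6 move to the new node.  A toy version
 * with memcpy() from <string.h> (illustrative only):
 */
static int array_split(const int *src, int src_nritems, int *dst)
{
	int mid = (src_nritems + 1) / 2;

	memcpy(dst, &src[mid], (src_nritems - mid) * sizeof(src[0]));
	return mid;	/* src keeps mid entries, dst gets the rest */
}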
2303 
2304 /*
2305  * how many bytes are required to store the items in a leaf.  start
2306  * and nr indicate which items in the leaf to check.  This totals up the
2307  * space used both by the item structs and the item data
2308  */
2309 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2310 {
2311 	int data_len;
2312 	int nritems = btrfs_header_nritems(l);
2313 	int end = min(nritems, start + nr) - 1;
2314 
2315 	if (!nr)
2316 		return 0;
2317 	data_len = btrfs_item_end_nr(l, start);
2318 	data_len = data_len - btrfs_item_offset_nr(l, end);
2319 	data_len += sizeof(struct btrfs_item) * nr;
2320 	WARN_ON(data_len < 0);
2321 	return data_len;
2322 }
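
/*
 * Worked example for leaf_space_used(): leaf data is laid out back to
 * front, so for nr = 2 items with btrfs_item_end_nr(l, start) = 3000
 * and btrfs_item_offset_nr(l, end) = 2500, the data bytes used are
 * 3000 - 2500 = 500, plus 2 * sizeof(struct btrfs_item) for the item
 * headers at the front of the leaf.
 */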
2323 
2324 /*
2325  * The space between the end of the leaf items and
2326  * the start of the leaf data.  IOW, how much room
2327  * the leaf has left for both items and data
2328  */
2329 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2330 				   struct extent_buffer *leaf)
2331 {
2332 	int nritems = btrfs_header_nritems(leaf);
2333 	int ret;
2334 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2335 	if (ret < 0) {
2336 		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2337 		       "used %d nritems %d\n",
2338 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2339 		       leaf_space_used(leaf, 0, nritems), nritems);
2340 	}
2341 	return ret;
2342 }
2343 
2344 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2345 				      struct btrfs_root *root,
2346 				      struct btrfs_path *path,
2347 				      int data_size, int empty,
2348 				      struct extent_buffer *right,
2349 				      int free_space, u32 left_nritems)
2350 {
2351 	struct extent_buffer *left = path->nodes[0];
2352 	struct extent_buffer *upper = path->nodes[1];
2353 	struct btrfs_disk_key disk_key;
2354 	int slot;
2355 	u32 i;
2356 	int push_space = 0;
2357 	int push_items = 0;
2358 	struct btrfs_item *item;
2359 	u32 nr;
2360 	u32 right_nritems;
2361 	u32 data_end;
2362 	u32 this_item_size;
2363 	int ret;
2364 
2365 	if (empty)
2366 		nr = 0;
2367 	else
2368 		nr = 1;
2369 
2370 	if (path->slots[0] >= left_nritems)
2371 		push_space += data_size;
2372 
2373 	slot = path->slots[1];
2374 	i = left_nritems - 1;
2375 	while (i >= nr) {
2376 		item = btrfs_item_nr(left, i);
2377 
2378 		if (!empty && push_items > 0) {
2379 			if (path->slots[0] > i)
2380 				break;
2381 			if (path->slots[0] == i) {
2382 				int space = btrfs_leaf_free_space(root, left);
2383 				if (space + push_space * 2 > free_space)
2384 					break;
2385 			}
2386 		}
2387 
2388 		if (path->slots[0] == i)
2389 			push_space += data_size;
2390 
2391 		if (!left->map_token) {
2392 			map_extent_buffer(left, (unsigned long)item,
2393 					sizeof(struct btrfs_item),
2394 					&left->map_token, &left->kaddr,
2395 					&left->map_start, &left->map_len,
2396 					KM_USER1);
2397 		}
2398 
2399 		this_item_size = btrfs_item_size(left, item);
2400 		if (this_item_size + sizeof(*item) + push_space > free_space)
2401 			break;
2402 
2403 		push_items++;
2404 		push_space += this_item_size + sizeof(*item);
2405 		if (i == 0)
2406 			break;
2407 		i--;
2408 	}
2409 	if (left->map_token) {
2410 		unmap_extent_buffer(left, left->map_token, KM_USER1);
2411 		left->map_token = NULL;
2412 	}
2413 
2414 	if (push_items == 0)
2415 		goto out_unlock;
2416 
2417 	if (!empty && push_items == left_nritems)
2418 		WARN_ON(1);
2419 
2420 	/* push left to right */
2421 	right_nritems = btrfs_header_nritems(right);
2422 
2423 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2424 	push_space -= leaf_data_end(root, left);
2425 
2426 	/* make room in the right data area */
2427 	data_end = leaf_data_end(root, right);
2428 	memmove_extent_buffer(right,
2429 			      btrfs_leaf_data(right) + data_end - push_space,
2430 			      btrfs_leaf_data(right) + data_end,
2431 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
2432 
2433 	/* copy from the left data area */
2434 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2435 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
2436 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
2437 		     push_space);
2438 
2439 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2440 			      btrfs_item_nr_offset(0),
2441 			      right_nritems * sizeof(struct btrfs_item));
2442 
2443 	/* copy the items from left to right */
2444 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2445 		   btrfs_item_nr_offset(left_nritems - push_items),
2446 		   push_items * sizeof(struct btrfs_item));
2447 
2448 	/* update the item pointers */
2449 	right_nritems += push_items;
2450 	btrfs_set_header_nritems(right, right_nritems);
2451 	push_space = BTRFS_LEAF_DATA_SIZE(root);
2452 	for (i = 0; i < right_nritems; i++) {
2453 		item = btrfs_item_nr(right, i);
2454 		if (!right->map_token) {
2455 			map_extent_buffer(right, (unsigned long)item,
2456 					sizeof(struct btrfs_item),
2457 					&right->map_token, &right->kaddr,
2458 					&right->map_start, &right->map_len,
2459 					KM_USER1);
2460 		}
2461 		push_space -= btrfs_item_size(right, item);
2462 		btrfs_set_item_offset(right, item, push_space);
2463 	}
2464 
2465 	if (right->map_token) {
2466 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2467 		right->map_token = NULL;
2468 	}
2469 	left_nritems -= push_items;
2470 	btrfs_set_header_nritems(left, left_nritems);
2471 
2472 	if (left_nritems)
2473 		btrfs_mark_buffer_dirty(left);
2474 	btrfs_mark_buffer_dirty(right);
2475 
2476 	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2477 	BUG_ON(ret);
2478 
2479 	btrfs_item_key(right, &disk_key, 0);
2480 	btrfs_set_node_key(upper, &disk_key, slot + 1);
2481 	btrfs_mark_buffer_dirty(upper);
2482 
2483 	/* then fixup the leaf pointer in the path */
2484 	if (path->slots[0] >= left_nritems) {
2485 		path->slots[0] -= left_nritems;
2486 		if (btrfs_header_nritems(path->nodes[0]) == 0)
2487 			clean_tree_block(trans, root, path->nodes[0]);
2488 		btrfs_tree_unlock(path->nodes[0]);
2489 		free_extent_buffer(path->nodes[0]);
2490 		path->nodes[0] = right;
2491 		path->slots[1] += 1;
2492 	} else {
2493 		btrfs_tree_unlock(right);
2494 		free_extent_buffer(right);
2495 	}
2496 	return 0;
2497 
2498 out_unlock:
2499 	btrfs_tree_unlock(right);
2500 	free_extent_buffer(right);
2501 	return 1;
2502 }
2503 
2504 /*
2505  * push some data in the path leaf to the right, trying to free up at
2506  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2507  *
2508  * returns 1 if the push failed because the other node didn't have enough
2509  * room, 0 if everything worked out and < 0 if there were major errors.
2510  */
2511 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2512 			   *root, struct btrfs_path *path, int data_size,
2513 			   int empty)
2514 {
2515 	struct extent_buffer *left = path->nodes[0];
2516 	struct extent_buffer *right;
2517 	struct extent_buffer *upper;
2518 	int slot;
2519 	int free_space;
2520 	u32 left_nritems;
2521 	int ret;
2522 
2523 	if (!path->nodes[1])
2524 		return 1;
2525 
2526 	slot = path->slots[1];
2527 	upper = path->nodes[1];
2528 	if (slot >= btrfs_header_nritems(upper) - 1)
2529 		return 1;
2530 
2531 	btrfs_assert_tree_locked(path->nodes[1]);
2532 
2533 	right = read_node_slot(root, upper, slot + 1);
2534 	btrfs_tree_lock(right);
2535 	btrfs_set_lock_blocking(right);
2536 
2537 	free_space = btrfs_leaf_free_space(root, right);
2538 	if (free_space < data_size)
2539 		goto out_unlock;
2540 
2541 	/* cow and double check */
2542 	ret = btrfs_cow_block(trans, root, right, upper,
2543 			      slot + 1, &right);
2544 	if (ret)
2545 		goto out_unlock;
2546 
2547 	free_space = btrfs_leaf_free_space(root, right);
2548 	if (free_space < data_size)
2549 		goto out_unlock;
2550 
2551 	left_nritems = btrfs_header_nritems(left);
2552 	if (left_nritems == 0)
2553 		goto out_unlock;
2554 
2555 	return __push_leaf_right(trans, root, path, data_size, empty,
2556 				right, free_space, left_nritems);
2557 out_unlock:
2558 	btrfs_tree_unlock(right);
2559 	free_extent_buffer(right);
2560 	return 1;
2561 }
2562 
2563 /*
2564  * push some data in the path leaf to the left, trying to free up at
2565  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2566  */
2567 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2568 				     struct btrfs_root *root,
2569 				     struct btrfs_path *path, int data_size,
2570 				     int empty, struct extent_buffer *left,
2571 				     int free_space, int right_nritems)
2572 {
2573 	struct btrfs_disk_key disk_key;
2574 	struct extent_buffer *right = path->nodes[0];
2575 	int slot;
2576 	int i;
2577 	int push_space = 0;
2578 	int push_items = 0;
2579 	struct btrfs_item *item;
2580 	u32 old_left_nritems;
2581 	u32 nr;
2582 	int ret = 0;
2583 	int wret;
2584 	u32 this_item_size;
2585 	u32 old_left_item_size;
2586 
2587 	slot = path->slots[1];
2588 
2589 	if (empty)
2590 		nr = right_nritems;
2591 	else
2592 		nr = right_nritems - 1;
2593 
2594 	for (i = 0; i < nr; i++) {
2595 		item = btrfs_item_nr(right, i);
2596 		if (!right->map_token) {
2597 			map_extent_buffer(right, (unsigned long)item,
2598 					sizeof(struct btrfs_item),
2599 					&right->map_token, &right->kaddr,
2600 					&right->map_start, &right->map_len,
2601 					KM_USER1);
2602 		}
2603 
2604 		if (!empty && push_items > 0) {
2605 			if (path->slots[0] < i)
2606 				break;
2607 			if (path->slots[0] == i) {
2608 				int space = btrfs_leaf_free_space(root, right);
2609 				if (space + push_space * 2 > free_space)
2610 					break;
2611 			}
2612 		}
2613 
2614 		if (path->slots[0] == i)
2615 			push_space += data_size;
2616 
2617 		this_item_size = btrfs_item_size(right, item);
2618 		if (this_item_size + sizeof(*item) + push_space > free_space)
2619 			break;
2620 
2621 		push_items++;
2622 		push_space += this_item_size + sizeof(*item);
2623 	}
2624 
2625 	if (right->map_token) {
2626 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2627 		right->map_token = NULL;
2628 	}
2629 
2630 	if (push_items == 0) {
2631 		ret = 1;
2632 		goto out;
2633 	}
2634 	if (!empty && push_items == btrfs_header_nritems(right))
2635 		WARN_ON(1);
2636 
2637 	/* push data from right to left */
2638 	copy_extent_buffer(left, right,
2639 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
2640 			   btrfs_item_nr_offset(0),
2641 			   push_items * sizeof(struct btrfs_item));
2642 
2643 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
2644 		     btrfs_item_offset_nr(right, push_items - 1);
2645 
2646 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2647 		     leaf_data_end(root, left) - push_space,
2648 		     btrfs_leaf_data(right) +
2649 		     btrfs_item_offset_nr(right, push_items - 1),
2650 		     push_space);
2651 	old_left_nritems = btrfs_header_nritems(left);
2652 	BUG_ON(old_left_nritems <= 0);
2653 
2654 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2655 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2656 		u32 ioff;
2657 
2658 		item = btrfs_item_nr(left, i);
2659 		if (!left->map_token) {
2660 			map_extent_buffer(left, (unsigned long)item,
2661 					sizeof(struct btrfs_item),
2662 					&left->map_token, &left->kaddr,
2663 					&left->map_start, &left->map_len,
2664 					KM_USER1);
2665 		}
2666 
2667 		ioff = btrfs_item_offset(left, item);
2668 		btrfs_set_item_offset(left, item,
2669 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2670 	}
2671 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2672 	if (left->map_token) {
2673 		unmap_extent_buffer(left, left->map_token, KM_USER1);
2674 		left->map_token = NULL;
2675 	}
2676 
2677 	/* fixup right node */
2678 	if (push_items > right_nritems) {
2679 		printk(KERN_CRIT "push items %d nr %u\n", push_items,
2680 		       right_nritems);
2681 		WARN_ON(1);
2682 	}
2683 
2684 	if (push_items < right_nritems) {
2685 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
2686 						  leaf_data_end(root, right);
2687 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
2688 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2689 				      btrfs_leaf_data(right) +
2690 				      leaf_data_end(root, right), push_space);
2691 
2692 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2693 			      btrfs_item_nr_offset(push_items),
2694 			     (btrfs_header_nritems(right) - push_items) *
2695 			     sizeof(struct btrfs_item));
2696 	}
2697 	right_nritems -= push_items;
2698 	btrfs_set_header_nritems(right, right_nritems);
2699 	push_space = BTRFS_LEAF_DATA_SIZE(root);
2700 	for (i = 0; i < right_nritems; i++) {
2701 		item = btrfs_item_nr(right, i);
2702 
2703 		if (!right->map_token) {
2704 			map_extent_buffer(right, (unsigned long)item,
2705 					sizeof(struct btrfs_item),
2706 					&right->map_token, &right->kaddr,
2707 					&right->map_start, &right->map_len,
2708 					KM_USER1);
2709 		}
2710 
2711 		push_space = push_space - btrfs_item_size(right, item);
2712 		btrfs_set_item_offset(right, item, push_space);
2713 	}
2714 	if (right->map_token) {
2715 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2716 		right->map_token = NULL;
2717 	}
2718 
2719 	btrfs_mark_buffer_dirty(left);
2720 	if (right_nritems)
2721 		btrfs_mark_buffer_dirty(right);
2722 
2723 	ret = btrfs_update_ref(trans, root, right, left,
2724 			       old_left_nritems, push_items);
2725 	BUG_ON(ret);
2726 
2727 	btrfs_item_key(right, &disk_key, 0);
2728 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2729 	if (wret)
2730 		ret = wret;
2731 
2732 	/* then fixup the leaf pointer in the path */
2733 	if (path->slots[0] < push_items) {
2734 		path->slots[0] += old_left_nritems;
2735 		if (btrfs_header_nritems(path->nodes[0]) == 0)
2736 			clean_tree_block(trans, root, path->nodes[0]);
2737 		btrfs_tree_unlock(path->nodes[0]);
2738 		free_extent_buffer(path->nodes[0]);
2739 		path->nodes[0] = left;
2740 		path->slots[1] -= 1;
2741 	} else {
2742 		btrfs_tree_unlock(left);
2743 		free_extent_buffer(left);
2744 		path->slots[0] -= push_items;
2745 	}
2746 	BUG_ON(path->slots[0] < 0);
2747 	return ret;
2748 out:
2749 	btrfs_tree_unlock(left);
2750 	free_extent_buffer(left);
2751 	return ret;
2752 }
2753 
2754 /*
2755  * push some data in the path leaf to the left, trying to free up at
2756  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2757  */
2758 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2759 			  *root, struct btrfs_path *path, int data_size,
2760 			  int empty)
2761 {
2762 	struct extent_buffer *right = path->nodes[0];
2763 	struct extent_buffer *left;
2764 	int slot;
2765 	int free_space;
2766 	u32 right_nritems;
2767 	int ret = 0;
2768 
2769 	slot = path->slots[1];
2770 	if (slot == 0)
2771 		return 1;
2772 	if (!path->nodes[1])
2773 		return 1;
2774 
2775 	right_nritems = btrfs_header_nritems(right);
2776 	if (right_nritems == 0)
2777 		return 1;
2778 
2779 	btrfs_assert_tree_locked(path->nodes[1]);
2780 
2781 	left = read_node_slot(root, path->nodes[1], slot - 1);
2782 	btrfs_tree_lock(left);
2783 	btrfs_set_lock_blocking(left);
2784 
2785 	free_space = btrfs_leaf_free_space(root, left);
2786 	if (free_space < data_size) {
2787 		ret = 1;
2788 		goto out;
2789 	}
2790 
2791 	/* cow and double check */
2792 	ret = btrfs_cow_block(trans, root, left,
2793 			      path->nodes[1], slot - 1, &left);
2794 	if (ret) {
2795 		/* we hit -ENOSPC, but it isn't fatal here */
2796 		ret = 1;
2797 		goto out;
2798 	}
2799 
2800 	free_space = btrfs_leaf_free_space(root, left);
2801 	if (free_space < data_size) {
2802 		ret = 1;
2803 		goto out;
2804 	}
2805 
2806 	return __push_leaf_left(trans, root, path, data_size,
2807 			       empty, left, free_space, right_nritems);
2808 out:
2809 	btrfs_tree_unlock(left);
2810 	free_extent_buffer(left);
2811 	return ret;
2812 }
2813 
2814 /*
2815  * helper for split_leaf: copy the items from 'mid' onward out of leaf
2816  * 'l' into the new leaf 'right', and fix up the path after the split.
2817  *
2818  * returns 0 if all went well and < 0 on failure.
2819  */
2820 static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2821 			       struct btrfs_root *root,
2822 			       struct btrfs_path *path,
2823 			       struct extent_buffer *l,
2824 			       struct extent_buffer *right,
2825 			       int slot, int mid, int nritems)
2826 {
2827 	int data_copy_size;
2828 	int rt_data_off;
2829 	int i;
2830 	int ret = 0;
2831 	int wret;
2832 	struct btrfs_disk_key disk_key;
2833 
2834 	nritems = nritems - mid;
2835 	btrfs_set_header_nritems(right, nritems);
2836 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2837 
2838 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2839 			   btrfs_item_nr_offset(mid),
2840 			   nritems * sizeof(struct btrfs_item));
2841 
2842 	copy_extent_buffer(right, l,
2843 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2844 		     data_copy_size, btrfs_leaf_data(l) +
2845 		     leaf_data_end(root, l), data_copy_size);
2846 
2847 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2848 		      btrfs_item_end_nr(l, mid);
2849 
2850 	for (i = 0; i < nritems; i++) {
2851 		struct btrfs_item *item = btrfs_item_nr(right, i);
2852 		u32 ioff;
2853 
2854 		if (!right->map_token) {
2855 			map_extent_buffer(right, (unsigned long)item,
2856 					sizeof(struct btrfs_item),
2857 					&right->map_token, &right->kaddr,
2858 					&right->map_start, &right->map_len,
2859 					KM_USER1);
2860 		}
2861 
2862 		ioff = btrfs_item_offset(right, item);
2863 		btrfs_set_item_offset(right, item, ioff + rt_data_off);
2864 	}
2865 
2866 	if (right->map_token) {
2867 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2868 		right->map_token = NULL;
2869 	}
2870 
2871 	btrfs_set_header_nritems(l, mid);
2872 	ret = 0;
2873 	btrfs_item_key(right, &disk_key, 0);
2874 	wret = insert_ptr(trans, root, path, &disk_key, right->start,
2875 			  path->slots[1] + 1, 1);
2876 	if (wret)
2877 		ret = wret;
2878 
2879 	btrfs_mark_buffer_dirty(right);
2880 	btrfs_mark_buffer_dirty(l);
2881 	BUG_ON(path->slots[0] != slot);
2882 
2883 	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2884 	BUG_ON(ret);
2885 
2886 	if (mid <= slot) {
2887 		btrfs_tree_unlock(path->nodes[0]);
2888 		free_extent_buffer(path->nodes[0]);
2889 		path->nodes[0] = right;
2890 		path->slots[0] -= mid;
2891 		path->slots[1] += 1;
2892 	} else {
2893 		btrfs_tree_unlock(right);
2894 		free_extent_buffer(right);
2895 	}
2896 
2897 	BUG_ON(path->slots[0] < 0);
2898 
2899 	return ret;
2900 }
2901 
2902 /*
2903  * split the path's leaf in two, making sure there is at least data_size
2904  * available for the resulting leaf level of the path.
2905  *
2906  * returns 0 if all went well and < 0 on failure.
2907  */
2908 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2909 			       struct btrfs_root *root,
2910 			       struct btrfs_key *ins_key,
2911 			       struct btrfs_path *path, int data_size,
2912 			       int extend)
2913 {
2914 	struct extent_buffer *l;
2915 	u32 nritems;
2916 	int mid;
2917 	int slot;
2918 	struct extent_buffer *right;
2919 	int ret = 0;
2920 	int wret;
2921 	int double_split;
2922 	int num_doubles = 0;
2923 
2924 	/* first try to make some room by pushing left and right */
2925 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
2926 	    !trans->transaction->delayed_refs.flushing) {
2927 		wret = push_leaf_right(trans, root, path, data_size, 0);
2928 		if (wret < 0)
2929 			return wret;
2930 		if (wret) {
2931 			wret = push_leaf_left(trans, root, path, data_size, 0);
2932 			if (wret < 0)
2933 				return wret;
2934 		}
2935 		l = path->nodes[0];
2936 
2937 		/* did the pushes work? */
2938 		if (btrfs_leaf_free_space(root, l) >= data_size)
2939 			return 0;
2940 	}
2941 
2942 	if (!path->nodes[1]) {
2943 		ret = insert_new_root(trans, root, path, 1);
2944 		if (ret)
2945 			return ret;
2946 	}
2947 again:
2948 	double_split = 0;
2949 	l = path->nodes[0];
2950 	slot = path->slots[0];
2951 	nritems = btrfs_header_nritems(l);
2952 	mid = (nritems + 1) / 2;
2953 
2954 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
2955 					path->nodes[1]->start,
2956 					root->root_key.objectid,
2957 					trans->transid, 0, l->start, 0);
2958 	if (IS_ERR(right)) {
2959 		BUG_ON(1);
2960 		return PTR_ERR(right);
2961 	}
2962 
2963 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2964 	btrfs_set_header_bytenr(right, right->start);
2965 	btrfs_set_header_generation(right, trans->transid);
2966 	btrfs_set_header_owner(right, root->root_key.objectid);
2967 	btrfs_set_header_level(right, 0);
2968 	write_extent_buffer(right, root->fs_info->fsid,
2969 			    (unsigned long)btrfs_header_fsid(right),
2970 			    BTRFS_FSID_SIZE);
2971 
2972 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2973 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
2974 			    BTRFS_UUID_SIZE);
2975 
2976 	if (mid <= slot) {
2977 		if (nritems == 1 ||
2978 		    leaf_space_used(l, mid, nritems - mid) + data_size >
2979 			BTRFS_LEAF_DATA_SIZE(root)) {
2980 			if (slot >= nritems) {
2981 				struct btrfs_disk_key disk_key;
2982 
2983 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2984 				btrfs_set_header_nritems(right, 0);
2985 				wret = insert_ptr(trans, root, path,
2986 						  &disk_key, right->start,
2987 						  path->slots[1] + 1, 1);
2988 				if (wret)
2989 					ret = wret;
2990 
2991 				btrfs_tree_unlock(path->nodes[0]);
2992 				free_extent_buffer(path->nodes[0]);
2993 				path->nodes[0] = right;
2994 				path->slots[0] = 0;
2995 				path->slots[1] += 1;
2996 				btrfs_mark_buffer_dirty(right);
2997 				return ret;
2998 			}
2999 			mid = slot;
3000 			if (mid != nritems &&
3001 			    leaf_space_used(l, mid, nritems - mid) +
3002 			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3003 				double_split = 1;
3004 			}
3005 		}
3006 	} else {
3007 		if (leaf_space_used(l, 0, mid) + data_size >
3008 			BTRFS_LEAF_DATA_SIZE(root)) {
3009 			if (!extend && data_size && slot == 0) {
3010 				struct btrfs_disk_key disk_key;
3011 
3012 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
3013 				btrfs_set_header_nritems(right, 0);
3014 				wret = insert_ptr(trans, root, path,
3015 						  &disk_key,
3016 						  right->start,
3017 						  path->slots[1], 1);
3018 				if (wret)
3019 					ret = wret;
3020 				btrfs_tree_unlock(path->nodes[0]);
3021 				free_extent_buffer(path->nodes[0]);
3022 				path->nodes[0] = right;
3023 				path->slots[0] = 0;
3024 				if (path->slots[1] == 0) {
3025 					wret = fixup_low_keys(trans, root,
3026 						      path, &disk_key, 1);
3027 					if (wret)
3028 						ret = wret;
3029 				}
3030 				btrfs_mark_buffer_dirty(right);
3031 				return ret;
3032 			} else if ((extend || !data_size) && slot == 0) {
3033 				mid = 1;
3034 			} else {
3035 				mid = slot;
3036 				if (mid != nritems &&
3037 				    leaf_space_used(l, mid, nritems - mid) +
3038 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3039 					double_split = 1;
3040 				}
3041 			}
3042 		}
3043 	}
3044 
3045 	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3046 	BUG_ON(ret);
3047 
3048 	if (double_split) {
3049 		BUG_ON(num_doubles != 0);
3050 		num_doubles++;
3051 		goto again;
3052 	}
3053 
3054 	return ret;
3055 }
3056 
3057 /*
3058  * This function splits a single item into two items,
3059  * giving 'new_key' to the new item and splitting the
3060  * old one at split_offset (from the start of the item).
3061  *
3062  * The path may be released by this operation.  After
3063  * the split, the path is pointing to the old item.  The
3064  * new item is going to be in the same node as the old one.
3065  *
3066  * Note, the item being split must be small enough to live alone on
3067  * a tree block with room for one extra struct btrfs_item
3068  *
3069  * This allows us to split the item in place, keeping a lock on the
3070  * leaf the entire time.
3071  */
3072 int btrfs_split_item(struct btrfs_trans_handle *trans,
3073 		     struct btrfs_root *root,
3074 		     struct btrfs_path *path,
3075 		     struct btrfs_key *new_key,
3076 		     unsigned long split_offset)
3077 {
3078 	u32 item_size;
3079 	struct extent_buffer *leaf;
3080 	struct btrfs_key orig_key;
3081 	struct btrfs_item *item;
3082 	struct btrfs_item *new_item;
3083 	int ret = 0;
3084 	int slot;
3085 	u32 nritems;
3086 	u32 orig_offset;
3087 	struct btrfs_disk_key disk_key;
3088 	char *buf;
3089 
3090 	leaf = path->nodes[0];
3091 	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
3092 	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
3093 		goto split;
3094 
3095 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3096 	btrfs_release_path(root, path);
3097 
3098 	path->search_for_split = 1;
3099 	path->keep_locks = 1;
3100 
3101 	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
3102 	path->search_for_split = 0;
3103 
3104 	/* if our item isn't there or got smaller, return now */
3105 	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
3106 							path->slots[0])) {
3107 		path->keep_locks = 0;
3108 		return -EAGAIN;
3109 	}
3110 
3111 	btrfs_set_path_blocking(path);
3112 	ret = split_leaf(trans, root, &orig_key, path,
3113 			 sizeof(struct btrfs_item), 1);
3114 	path->keep_locks = 0;
3115 	BUG_ON(ret);
3116 
3117 	btrfs_unlock_up_safe(path, 1);
3118 	leaf = path->nodes[0];
3119 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3120 
3121 split:
3122 	/*
3123 	 * make sure any changes to the path from split_leaf leave it
3124 	 * in a blocking state
3125 	 */
3126 	btrfs_set_path_blocking(path);
3127 
3128 	item = btrfs_item_nr(leaf, path->slots[0]);
3129 	orig_offset = btrfs_item_offset(leaf, item);
3130 	item_size = btrfs_item_size(leaf, item);
3131 
3132 	buf = kmalloc(item_size, GFP_NOFS);
3133 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3134 			    path->slots[0]), item_size);
3135 	slot = path->slots[0] + 1;
3136 	leaf = path->nodes[0];
3137 
3138 	nritems = btrfs_header_nritems(leaf);
3139 
3140 	if (slot != nritems) {
3141 		/* shift the items */
3142 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3143 			      btrfs_item_nr_offset(slot),
3144 			      (nritems - slot) * sizeof(struct btrfs_item));
3145 
3146 	}
3147 
3148 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3149 	btrfs_set_item_key(leaf, &disk_key, slot);
3150 
3151 	new_item = btrfs_item_nr(leaf, slot);
3152 
3153 	btrfs_set_item_offset(leaf, new_item, orig_offset);
3154 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3155 
3156 	btrfs_set_item_offset(leaf, item,
3157 			      orig_offset + item_size - split_offset);
3158 	btrfs_set_item_size(leaf, item, split_offset);
3159 
3160 	btrfs_set_header_nritems(leaf, nritems + 1);
3161 
3162 	/* write the data for the start of the original item */
3163 	write_extent_buffer(leaf, buf,
3164 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
3165 			    split_offset);
3166 
3167 	/* write the data for the new item */
3168 	write_extent_buffer(leaf, buf + split_offset,
3169 			    btrfs_item_ptr_offset(leaf, slot),
3170 			    item_size - split_offset);
3171 	btrfs_mark_buffer_dirty(leaf);
3172 
3173 	ret = 0;
3174 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3175 		btrfs_print_leaf(root, leaf);
3176 		BUG();
3177 	}
3178 	kfree(buf);
3179 	return ret;
3180 }
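
/*
 * The byte bookkeeping of btrfs_split_item() on a single item, modelled
 * on plain buffers with memcpy() (names here are illustrative; 'first'
 * keeps the original key, 'second' receives new_key):
 */
static void split_item_model(const char *item, unsigned long item_size,
			     unsigned long split_offset,
			     char *first, unsigned long *first_size,
			     char *second, unsigned long *second_size)
{
	/* bytes [0, split_offset) stay with the original key */
	memcpy(first, item, split_offset);
	*first_size = split_offset;

	/* bytes [split_offset, item_size) move under the new key */
	memcpy(second, item + split_offset, item_size - split_offset);
	*second_size = item_size - split_offset;
}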
3181 
3182 /*
3183  * make the item pointed to by the path smaller.  new_size indicates
3184  * how small to make it, and from_end tells us if we just chop bytes
3185  * off the end of the item or if we shift the item to chop bytes off
3186  * the front.
3187  */
3188 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3189 			struct btrfs_root *root,
3190 			struct btrfs_path *path,
3191 			u32 new_size, int from_end)
3192 {
3193 	int ret = 0;
3194 	int slot;
3195 	int slot_orig;
3196 	struct extent_buffer *leaf;
3197 	struct btrfs_item *item;
3198 	u32 nritems;
3199 	unsigned int data_end;
3200 	unsigned int old_data_start;
3201 	unsigned int old_size;
3202 	unsigned int size_diff;
3203 	int i;
3204 
3205 	slot_orig = path->slots[0];
3206 	leaf = path->nodes[0];
3207 	slot = path->slots[0];
3208 
3209 	old_size = btrfs_item_size_nr(leaf, slot);
3210 	if (old_size == new_size)
3211 		return 0;
3212 
3213 	nritems = btrfs_header_nritems(leaf);
3214 	data_end = leaf_data_end(root, leaf);
3215 
3216 	old_data_start = btrfs_item_offset_nr(leaf, slot);
3217 
3218 	size_diff = old_size - new_size;
3219 
3220 	BUG_ON(slot < 0);
3221 	BUG_ON(slot >= nritems);
3222 
3223 	/*
3224 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3225 	 */
3226 	/* first correct the data pointers */
3227 	for (i = slot; i < nritems; i++) {
3228 		u32 ioff;
3229 		item = btrfs_item_nr(leaf, i);
3230 
3231 		if (!leaf->map_token) {
3232 			map_extent_buffer(leaf, (unsigned long)item,
3233 					sizeof(struct btrfs_item),
3234 					&leaf->map_token, &leaf->kaddr,
3235 					&leaf->map_start, &leaf->map_len,
3236 					KM_USER1);
3237 		}
3238 
3239 		ioff = btrfs_item_offset(leaf, item);
3240 		btrfs_set_item_offset(leaf, item, ioff + size_diff);
3241 	}
3242 
3243 	if (leaf->map_token) {
3244 		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3245 		leaf->map_token = NULL;
3246 	}
3247 
3248 	/* shift the data */
3249 	if (from_end) {
3250 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3251 			      data_end + size_diff, btrfs_leaf_data(leaf) +
3252 			      data_end, old_data_start + new_size - data_end);
3253 	} else {
3254 		struct btrfs_disk_key disk_key;
3255 		u64 offset;
3256 
3257 		btrfs_item_key(leaf, &disk_key, slot);
3258 
3259 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3260 			unsigned long ptr;
3261 			struct btrfs_file_extent_item *fi;
3262 
3263 			fi = btrfs_item_ptr(leaf, slot,
3264 					    struct btrfs_file_extent_item);
3265 			fi = (struct btrfs_file_extent_item *)(
3266 			     (unsigned long)fi - size_diff);
3267 
3268 			if (btrfs_file_extent_type(leaf, fi) ==
3269 			    BTRFS_FILE_EXTENT_INLINE) {
3270 				ptr = btrfs_item_ptr_offset(leaf, slot);
3271 				memmove_extent_buffer(leaf, ptr,
3272 				      (unsigned long)fi,
3273 				      offsetof(struct btrfs_file_extent_item,
3274 						 disk_bytenr));
3275 			}
3276 		}
3277 
3278 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3279 			      data_end + size_diff, btrfs_leaf_data(leaf) +
3280 			      data_end, old_data_start - data_end);
3281 
3282 		offset = btrfs_disk_key_offset(&disk_key);
3283 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3284 		btrfs_set_item_key(leaf, &disk_key, slot);
3285 		if (slot == 0)
3286 			fixup_low_keys(trans, root, path, &disk_key, 1);
3287 	}
3288 
3289 	item = btrfs_item_nr(leaf, slot);
3290 	btrfs_set_item_size(leaf, item, new_size);
3291 	btrfs_mark_buffer_dirty(leaf);
3292 
3293 	ret = 0;
3294 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3295 		btrfs_print_leaf(root, leaf);
3296 		BUG();
3297 	}
3298 	return ret;
3299 }
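
/*
 * Worked example for btrfs_truncate_item(): leaf data grows from the
 * end of the block toward the front, so shrinking an item by size_diff
 * slides the data of every item from 'slot' onward toward the end, and
 * each of their offsets rises by size_diff.  With old_size = 100 and
 * new_size = 60, size_diff = 40 and an item whose data started at
 * offset 2400 now starts at 2440.
 */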
3300 
3301 /*
3302  * make the item pointed to by the path bigger, data_size is the new size.
3303  */
3304 int btrfs_extend_item(struct btrfs_trans_handle *trans,
3305 		      struct btrfs_root *root, struct btrfs_path *path,
3306 		      u32 data_size)
3307 {
3308 	int ret = 0;
3309 	int slot;
3310 	int slot_orig;
3311 	struct extent_buffer *leaf;
3312 	struct btrfs_item *item;
3313 	u32 nritems;
3314 	unsigned int data_end;
3315 	unsigned int old_data;
3316 	unsigned int old_size;
3317 	int i;
3318 
3319 	slot_orig = path->slots[0];
3320 	leaf = path->nodes[0];
3321 
3322 	nritems = btrfs_header_nritems(leaf);
3323 	data_end = leaf_data_end(root, leaf);
3324 
3325 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
3326 		btrfs_print_leaf(root, leaf);
3327 		BUG();
3328 	}
3329 	slot = path->slots[0];
3330 	old_data = btrfs_item_end_nr(leaf, slot);
3331 
3332 	BUG_ON(slot < 0);
3333 	if (slot >= nritems) {
3334 		btrfs_print_leaf(root, leaf);
3335 		printk(KERN_CRIT "slot %d too large, nritems %d\n",
3336 		       slot, nritems);
3337 		BUG_ON(1);
3338 	}
3339 
3340 	/*
3341 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3342 	 */
3343 	/* first correct the data pointers */
3344 	for (i = slot; i < nritems; i++) {
3345 		u32 ioff;
3346 		item = btrfs_item_nr(leaf, i);
3347 
3348 		if (!leaf->map_token) {
3349 			map_extent_buffer(leaf, (unsigned long)item,
3350 					sizeof(struct btrfs_item),
3351 					&leaf->map_token, &leaf->kaddr,
3352 					&leaf->map_start, &leaf->map_len,
3353 					KM_USER1);
3354 		}
3355 		ioff = btrfs_item_offset(leaf, item);
3356 		btrfs_set_item_offset(leaf, item, ioff - data_size);
3357 	}
3358 
3359 	if (leaf->map_token) {
3360 		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3361 		leaf->map_token = NULL;
3362 	}
3363 
3364 	/* shift the data */
3365 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3366 		      data_end - data_size, btrfs_leaf_data(leaf) +
3367 		      data_end, old_data - data_end);
3368 
3369 	data_end = old_data;
3370 	old_size = btrfs_item_size_nr(leaf, slot);
3371 	item = btrfs_item_nr(leaf, slot);
3372 	btrfs_set_item_size(leaf, item, old_size + data_size);
3373 	btrfs_mark_buffer_dirty(leaf);
3374 
3375 	ret = 0;
3376 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3377 		btrfs_print_leaf(root, leaf);
3378 		BUG();
3379 	}
3380 	return ret;
3381 }
3382 
3383 /*
3384  * Given a key and some data, insert items into the tree.
3385  * This does all the path init required, making room in the tree if needed.
3386  * Returns the number of keys that were inserted.
3387  */
3388 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3389 			    struct btrfs_root *root,
3390 			    struct btrfs_path *path,
3391 			    struct btrfs_key *cpu_key, u32 *data_size,
3392 			    int nr)
3393 {
3394 	struct extent_buffer *leaf;
3395 	struct btrfs_item *item;
3396 	int ret = 0;
3397 	int slot;
3398 	int i;
3399 	u32 nritems;
3400 	u32 total_data = 0;
3401 	u32 total_size = 0;
3402 	unsigned int data_end;
3403 	struct btrfs_disk_key disk_key;
3404 	struct btrfs_key found_key;
3405 
3406 	for (i = 0; i < nr; i++) {
3407 		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3408 		    BTRFS_LEAF_DATA_SIZE(root)) {
3409 			nr = i;
3410 			break;
3411 		}
3412 		total_data += data_size[i];
3413 		total_size += data_size[i] + sizeof(struct btrfs_item);
3414 	}
3415 	BUG_ON(nr == 0);
3416 
3417 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3418 	if (ret == 0)
3419 		return -EEXIST;
3420 	if (ret < 0)
3421 		goto out;
3422 
3423 	leaf = path->nodes[0];
3424 
3425 	nritems = btrfs_header_nritems(leaf);
3426 	data_end = leaf_data_end(root, leaf);
3427 
3428 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3429 		for (i = nr - 1; i >= 0; i--) {
3430 			total_data -= data_size[i];
3431 			total_size -= data_size[i] + sizeof(struct btrfs_item);
3432 			if (total_size < btrfs_leaf_free_space(root, leaf))
3433 				break;
3434 		}
3435 		nr = i;
3436 	}
3437 
3438 	slot = path->slots[0];
3439 	BUG_ON(slot < 0);
3440 
3441 	if (slot != nritems) {
3442 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3443 
3444 		item = btrfs_item_nr(leaf, slot);
3445 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3446 
3447 		/* figure out how many keys we can insert in here */
3448 		total_data = data_size[0];
3449 		for (i = 1; i < nr; i++) {
3450 			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3451 				break;
3452 			total_data += data_size[i];
3453 		}
3454 		nr = i;
3455 
3456 		if (old_data < data_end) {
3457 			btrfs_print_leaf(root, leaf);
3458 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3459 			       slot, old_data, data_end);
3460 			BUG_ON(1);
3461 		}
3462 		/*
3463 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3464 		 */
3465 		/* first correct the data pointers */
3466 		WARN_ON(leaf->map_token);
3467 		for (i = slot; i < nritems; i++) {
3468 			u32 ioff;
3469 
3470 			item = btrfs_item_nr(leaf, i);
3471 			if (!leaf->map_token) {
3472 				map_extent_buffer(leaf, (unsigned long)item,
3473 					sizeof(struct btrfs_item),
3474 					&leaf->map_token, &leaf->kaddr,
3475 					&leaf->map_start, &leaf->map_len,
3476 					KM_USER1);
3477 			}
3478 
3479 			ioff = btrfs_item_offset(leaf, item);
3480 			btrfs_set_item_offset(leaf, item, ioff - total_data);
3481 		}
3482 		if (leaf->map_token) {
3483 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3484 			leaf->map_token = NULL;
3485 		}
3486 
3487 		/* shift the items */
3488 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3489 			      btrfs_item_nr_offset(slot),
3490 			      (nritems - slot) * sizeof(struct btrfs_item));
3491 
3492 		/* shift the data */
3493 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3494 			      data_end - total_data, btrfs_leaf_data(leaf) +
3495 			      data_end, old_data - data_end);
3496 		data_end = old_data;
3497 	} else {
3498 		/*
3499 		 * this sucks but it has to be done: if we are inserting at
3500 		 * the end of the leaf, only insert 1 of the items, since we
3501 		 * have no way of knowing what's on the next leaf and we'd
3502 		 * have to drop our current locks to figure it out
3503 		 */
3504 		nr = 1;
3505 	}
3506 
3507 	/* setup the item for the new data */
3508 	for (i = 0; i < nr; i++) {
3509 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3510 		btrfs_set_item_key(leaf, &disk_key, slot + i);
3511 		item = btrfs_item_nr(leaf, slot + i);
3512 		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3513 		data_end -= data_size[i];
3514 		btrfs_set_item_size(leaf, item, data_size[i]);
3515 	}
3516 	btrfs_set_header_nritems(leaf, nritems + nr);
3517 	btrfs_mark_buffer_dirty(leaf);
3518 
3519 	ret = 0;
3520 	if (slot == 0) {
3521 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3522 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3523 	}
3524 
3525 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3526 		btrfs_print_leaf(root, leaf);
3527 		BUG();
3528 	}
3529 out:
3530 	if (!ret)
3531 		ret = nr;
3532 	return ret;
3533 }
3534 
3535 /*
3536  * this is a helper for btrfs_insert_empty_items, the main goal here is
3537  * to save stack depth by doing the bulk of the work in a function
3538  * that doesn't call btrfs_search_slot
3539  */
3540 static noinline_for_stack int
3541 setup_items_for_insert(struct btrfs_trans_handle *trans,
3542 		      struct btrfs_root *root, struct btrfs_path *path,
3543 		      struct btrfs_key *cpu_key, u32 *data_size,
3544 		      u32 total_data, u32 total_size, int nr)
3545 {
3546 	struct btrfs_item *item;
3547 	int i;
3548 	u32 nritems;
3549 	unsigned int data_end;
3550 	struct btrfs_disk_key disk_key;
3551 	int ret;
3552 	struct extent_buffer *leaf;
3553 	int slot;
3554 
3555 	leaf = path->nodes[0];
3556 	slot = path->slots[0];
3557 
3558 	nritems = btrfs_header_nritems(leaf);
3559 	data_end = leaf_data_end(root, leaf);
3560 
3561 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3562 		btrfs_print_leaf(root, leaf);
3563 		printk(KERN_CRIT "not enough freespace need %u have %d\n",
3564 		       total_size, btrfs_leaf_free_space(root, leaf));
3565 		BUG();
3566 	}
3567 
3568 	if (slot != nritems) {
3569 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3570 
3571 		if (old_data < data_end) {
3572 			btrfs_print_leaf(root, leaf);
3573 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3574 			       slot, old_data, data_end);
3575 			BUG_ON(1);
3576 		}
3577 		/*
3578 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3579 		 */
3580 		/* first correct the data pointers */
3581 		WARN_ON(leaf->map_token);
3582 		for (i = slot; i < nritems; i++) {
3583 			u32 ioff;
3584 
3585 			item = btrfs_item_nr(leaf, i);
3586 			if (!leaf->map_token) {
3587 				map_extent_buffer(leaf, (unsigned long)item,
3588 					sizeof(struct btrfs_item),
3589 					&leaf->map_token, &leaf->kaddr,
3590 					&leaf->map_start, &leaf->map_len,
3591 					KM_USER1);
3592 			}
3593 
3594 			ioff = btrfs_item_offset(leaf, item);
3595 			btrfs_set_item_offset(leaf, item, ioff - total_data);
3596 		}
3597 		if (leaf->map_token) {
3598 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3599 			leaf->map_token = NULL;
3600 		}
3601 
3602 		/* shift the items */
3603 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3604 			      btrfs_item_nr_offset(slot),
3605 			      (nritems - slot) * sizeof(struct btrfs_item));
3606 
3607 		/* shift the data */
3608 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3609 			      data_end - total_data, btrfs_leaf_data(leaf) +
3610 			      data_end, old_data - data_end);
3611 		data_end = old_data;
3612 	}
3613 
3614 	/* setup the item for the new data */
3615 	for (i = 0; i < nr; i++) {
3616 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3617 		btrfs_set_item_key(leaf, &disk_key, slot + i);
3618 		item = btrfs_item_nr(leaf, slot + i);
3619 		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3620 		data_end -= data_size[i];
3621 		btrfs_set_item_size(leaf, item, data_size[i]);
3622 	}
3623 
3624 	btrfs_set_header_nritems(leaf, nritems + nr);
3625 
3626 	ret = 0;
3627 	if (slot == 0) {
3628 		struct btrfs_disk_key disk_key;
3629 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3630 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3631 	}
3632 	btrfs_unlock_up_safe(path, 1);
3633 	btrfs_mark_buffer_dirty(leaf);
3634 
3635 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3636 		btrfs_print_leaf(root, leaf);
3637 		BUG();
3638 	}
3639 	return ret;
3640 }
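
/*
 * The offset bookkeeping in the insert loop above, reduced to a toy:
 * item headers fill slots from the front while item data is carved off
 * data_end, back to front (illustrative helper, not btrfs API):
 */
static void place_items(unsigned int *offsets, unsigned int *sizes,
			const unsigned int *data_size, int nr,
			unsigned int data_end)
{
	int i;

	for (i = 0; i < nr; i++) {
		data_end -= data_size[i];	/* data grows downward */
		offsets[i] = data_end;
		sizes[i] = data_size[i];
	}
}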
3641 
3642 /*
3643  * Given a key and some data, insert items into the tree.
3644  * This does all the path init required, making room in the tree if needed.
3645  */
3646 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3647 			    struct btrfs_root *root,
3648 			    struct btrfs_path *path,
3649 			    struct btrfs_key *cpu_key, u32 *data_size,
3650 			    int nr)
3651 {
3652 	struct extent_buffer *leaf;
3653 	int ret = 0;
3654 	int slot;
3655 	int i;
3656 	u32 total_size = 0;
3657 	u32 total_data = 0;
3658 
3659 	for (i = 0; i < nr; i++)
3660 		total_data += data_size[i];
3661 
3662 	total_size = total_data + (nr * sizeof(struct btrfs_item));
3663 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3664 	if (ret == 0)
3665 		return -EEXIST;
3666 	if (ret < 0)
3667 		goto out;
3668 
3669 	leaf = path->nodes[0];
3670 	slot = path->slots[0];
3671 	BUG_ON(slot < 0);
3672 
3673 	ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3674 			       total_data, total_size, nr);
3675 
3676 out:
3677 	return ret;
3678 }
3679 
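/*
 * Usage sketch (editor's example; the keys, sizes and item type are
 * hypothetical): reserve room for two adjacent items with a single
 * search.  Note that cpu_key points at an array of nr keys, and the
 * keys must already be in sorted order.
 */
static int example_insert_two(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key keys[2];
	u32 sizes[2] = { 16, 16 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	keys[0].objectid = 256;			/* hypothetical */
	keys[0].type = BTRFS_XATTR_ITEM_KEY;	/* hypothetical */
	keys[0].offset = 0;
	keys[1] = keys[0];
	keys[1].offset = 1;

	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (ret == 0) {
		/*
		 * path->slots[0] points at the first new item; a real
		 * caller would now fill both payloads, e.g. with
		 * write_extent_buffer(), and mark the leaf dirty.
		 */
	}
	btrfs_free_path(path);
	return ret;
}
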
3680 /*
3681  * Given a key and some data, insert an item into the tree.
3682  * This does all the path init required, making room in the tree if needed.
3683  */
3684 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3685 		      *root, struct btrfs_key *cpu_key, void *data, u32
3686 		      data_size)
3687 {
3688 	int ret = 0;
3689 	struct btrfs_path *path;
3690 	struct extent_buffer *leaf;
3691 	unsigned long ptr;
3692 
3693 	path = btrfs_alloc_path();
3694 	BUG_ON(!path);
3695 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3696 	if (!ret) {
3697 		leaf = path->nodes[0];
3698 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3699 		write_extent_buffer(leaf, data, ptr, data_size);
3700 		btrfs_mark_buffer_dirty(leaf);
3701 	}
3702 	btrfs_free_path(path);
3703 	return ret;
3704 }
3705 
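/*
 * Usage sketch (editor's example; key values and payload are
 * hypothetical): btrfs_insert_item copies an in-memory buffer into a
 * freshly reserved leaf slot in one call.
 */
static int example_insert_blob(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_key key;
	char payload[8] = "example";

	key.objectid = 256;			/* hypothetical */
	key.type = BTRFS_XATTR_ITEM_KEY;	/* hypothetical */
	key.offset = 0;
	return btrfs_insert_item(trans, root, &key, payload,
				 sizeof(payload));
}
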
3706 /*
3707  * delete the pointer from a given node.
3708  *
3709  * the tree should have been previously balanced so the deletion does not
3710  * empty a node.
3711  */
3712 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3713 		   struct btrfs_path *path, int level, int slot)
3714 {
3715 	struct extent_buffer *parent = path->nodes[level];
3716 	u32 nritems;
3717 	int ret = 0;
3718 	int wret;
3719 
3720 	nritems = btrfs_header_nritems(parent);
3721 	if (slot != nritems - 1) {
3722 		memmove_extent_buffer(parent,
3723 			      btrfs_node_key_ptr_offset(slot),
3724 			      btrfs_node_key_ptr_offset(slot + 1),
3725 			      sizeof(struct btrfs_key_ptr) *
3726 			      (nritems - slot - 1));
3727 	}
3728 	nritems--;
3729 	btrfs_set_header_nritems(parent, nritems);
3730 	if (nritems == 0 && parent == root->node) {
3731 		BUG_ON(btrfs_header_level(root->node) != 1);
3732 		/* just turn the root into a leaf */
3733 		btrfs_set_header_level(root->node, 0);
3734 	} else if (slot == 0) {
3735 		struct btrfs_disk_key disk_key;
3736 
3737 		btrfs_node_key(parent, &disk_key, 0);
3738 		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3739 		if (wret)
3740 			ret = wret;
3741 	}
3742 	btrfs_mark_buffer_dirty(parent);
3743 	return ret;
3744 }
3745 
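/*
 * Worked example (editor's sketch) of the key-pointer shift above,
 * using plain integers to stand in for struct btrfs_key_ptr: deleting
 * slot 1 from a node holding 4 pointers moves slots 2 and 3 down.
 */
static int example_del_slot(void)
{
	int ptrs[4] = { 10, 20, 30, 40 };	/* stand-in key pointers */
	int nritems = 4;
	int slot = 1;

	memmove(&ptrs[slot], &ptrs[slot + 1],
		sizeof(int) * (nritems - slot - 1));
	nritems--;
	/* ptrs is now { 10, 30, 40, .. } and nritems == 3 */
	return nritems;
}
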
3746 /*
3747  * a helper function to delete the leaf pointed to by path->slots[1] and
3748  * path->nodes[1].  bytenr is the node block pointer, but since the callers
3749  * already know it, it is faster to have them pass it down than to
3750  * read it out of the node again.
3751  *
3752  * This deletes the pointer in path->nodes[1] and frees the leaf
3753  * block extent.  zero is returned if it all worked out, < 0 otherwise.
3754  *
3755  * The path must have already been set up for deleting the leaf, including
3756  * all the proper balancing.  path->nodes[1] must be locked.
3757  */
3758 noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3759 			    struct btrfs_root *root,
3760 			    struct btrfs_path *path, u64 bytenr)
3761 {
3762 	int ret;
3763 	u64 root_gen = btrfs_header_generation(path->nodes[1]);
3764 	u64 parent_start = path->nodes[1]->start;
3765 	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3766 
3767 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
3768 	if (ret)
3769 		return ret;
3770 
3771 	/*
3772 	 * btrfs_free_extent is expensive; we want to make sure we
3773 	 * aren't holding any locks when we call it
3774 	 */
3775 	btrfs_unlock_up_safe(path, 0);
3776 
3777 	ret = btrfs_free_extent(trans, root, bytenr,
3778 				btrfs_level_size(root, 0),
3779 				parent_start, parent_owner,
3780 				root_gen, 0, 1);
3781 	return ret;
3782 }
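
/*
 * Call-pattern sketch (editor's example): btrfs_del_leaf expects a
 * path that already satisfies the rules above, and the caller passes
 * the leaf's bytenr down rather than re-reading it from the node.
 */
static int example_drop_leaf(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path)
{
	struct extent_buffer *leaf = path->nodes[0];

	return btrfs_del_leaf(trans, root, path, leaf->start);
}
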
3783 /*
3784  * delete the item at the leaf level in path.  If that empties
3785  * the leaf, remove it from the tree
3786  */
3787 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3788 		    struct btrfs_path *path, int slot, int nr)
3789 {
3790 	struct extent_buffer *leaf;
3791 	struct btrfs_item *item;
3792 	int last_off;
3793 	int dsize = 0;
3794 	int ret = 0;
3795 	int wret;
3796 	int i;
3797 	u32 nritems;
3798 
3799 	leaf = path->nodes[0];
3800 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3801 
3802 	for (i = 0; i < nr; i++)
3803 		dsize += btrfs_item_size_nr(leaf, slot + i);
3804 
3805 	nritems = btrfs_header_nritems(leaf);
3806 
3807 	if (slot + nr != nritems) {
3808 		int data_end = leaf_data_end(root, leaf);
3809 
3810 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3811 			      data_end + dsize,
3812 			      btrfs_leaf_data(leaf) + data_end,
3813 			      last_off - data_end);
3814 
3815 		for (i = slot + nr; i < nritems; i++) {
3816 			u32 ioff;
3817 
3818 			item = btrfs_item_nr(leaf, i);
3819 			if (!leaf->map_token) {
3820 				map_extent_buffer(leaf, (unsigned long)item,
3821 					sizeof(struct btrfs_item),
3822 					&leaf->map_token, &leaf->kaddr,
3823 					&leaf->map_start, &leaf->map_len,
3824 					KM_USER1);
3825 			}
3826 			ioff = btrfs_item_offset(leaf, item);
3827 			btrfs_set_item_offset(leaf, item, ioff + dsize);
3828 		}
3829 
3830 		if (leaf->map_token) {
3831 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3832 			leaf->map_token = NULL;
3833 		}
3834 
3835 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3836 			      btrfs_item_nr_offset(slot + nr),
3837 			      sizeof(struct btrfs_item) *
3838 			      (nritems - slot - nr));
3839 	}
3840 	btrfs_set_header_nritems(leaf, nritems - nr);
3841 	nritems -= nr;
3842 
3843 	/* delete the leaf if we've emptied it */
3844 	if (nritems == 0) {
3845 		if (leaf == root->node) {
3846 			btrfs_set_header_level(leaf, 0);
3847 		} else {
3848 			ret = btrfs_del_leaf(trans, root, path, leaf->start);
3849 			BUG_ON(ret);
3850 		}
3851 	} else {
3852 		int used = leaf_space_used(leaf, 0, nritems);
3853 		if (slot == 0) {
3854 			struct btrfs_disk_key disk_key;
3855 
3856 			btrfs_item_key(leaf, &disk_key, 0);
3857 			wret = fixup_low_keys(trans, root, path,
3858 					      &disk_key, 1);
3859 			if (wret)
3860 				ret = wret;
3861 		}
3862 
3863 		/* delete the leaf if it is mostly empty */
3864 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
3865 		    !trans->transaction->delayed_refs.flushing) {
3866 			/* push_leaf_left fixes the path.
3867 			 * make sure the path still points to our leaf
3868 			 * for the possible call to btrfs_del_leaf below
3869 			 */
3870 			slot = path->slots[1];
3871 			extent_buffer_get(leaf);
3872 
3873 			btrfs_set_path_blocking(path);
3874 			wret = push_leaf_left(trans, root, path, 1, 1);
3875 			if (wret < 0 && wret != -ENOSPC)
3876 				ret = wret;
3877 
3878 			if (path->nodes[0] == leaf &&
3879 			    btrfs_header_nritems(leaf)) {
3880 				wret = push_leaf_right(trans, root, path, 1, 1);
3881 				if (wret < 0 && wret != -ENOSPC)
3882 					ret = wret;
3883 			}
3884 
3885 			if (btrfs_header_nritems(leaf) == 0) {
3886 				path->slots[1] = slot;
3887 				ret = btrfs_del_leaf(trans, root, path,
3888 						     leaf->start);
3889 				BUG_ON(ret);
3890 				free_extent_buffer(leaf);
3891 			} else {
3892 				/* if we're still in the path, make sure
3893 				 * we're dirty.  Otherwise, one of the
3894 				 * push_leaf functions must have already
3895 				 * dirtied this buffer
3896 				 */
3897 				if (path->nodes[0] == leaf)
3898 					btrfs_mark_buffer_dirty(leaf);
3899 				free_extent_buffer(leaf);
3900 			}
3901 		} else {
3902 			btrfs_mark_buffer_dirty(leaf);
3903 		}
3904 	}
3905 	return ret;
3906 }
3907 
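/*
 * Usage sketch (editor's example; the lookup key is supplied by the
 * caller): delete a single item found by an exact-match search.  An
 * ins_len of -1 tells btrfs_search_slot to balance for deletion.
 */
static int example_delete_one(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret == 0)
		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
	else if (ret > 0)
		ret = -ENOENT;		/* key was not in the tree */
	btrfs_free_path(path);
	return ret;
}
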
3908 /*
3909  * search the tree again to find a leaf with lesser keys
3910  * returns 0 if it found something or 1 if there are no lesser leaves.
3911  * returns < 0 on io errors.
3912  *
3913  * This may release the path, and so you may lose any locks held at the
3914  * time you call it.
3915  */
3916 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3917 {
3918 	struct btrfs_key key;
3919 	struct btrfs_disk_key found_key;
3920 	int ret;
3921 
3922 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3923 
3924 	if (key.offset > 0)
3925 		key.offset--;
3926 	else if (key.type > 0)
3927 		key.type--;
3928 	else if (key.objectid > 0)
3929 		key.objectid--;
3930 	else
3931 		return 1;
3932 
3933 	btrfs_release_path(root, path);
3934 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3935 	if (ret < 0)
3936 		return ret;
3937 	btrfs_item_key(path->nodes[0], &found_key, 0);
3938 	ret = comp_keys(&found_key, &key);
3939 	if (ret < 0)
3940 		return 0;
3941 	return 1;
3942 }
3943 
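/*
 * Usage sketch (editor's example, error handling abbreviated): step
 * one leaf backwards and look at the last key on the new leaf.
 */
static int example_step_back(struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_key *key)
{
	int ret = btrfs_prev_leaf(root, path);

	if (ret != 0)
		return ret;	/* < 0 on error, 1 if no lesser leaf */
	path->slots[0] = btrfs_header_nritems(path->nodes[0]) - 1;
	btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
	return 0;
}
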
3944 /*
3945  * A helper function to walk down the tree starting at min_key, and looking
3946  * for nodes or leaves that are either in cache or have a minimum
3947  * transaction id.  This is used by the btree defrag code and by tree logging.
3948  *
3949  * This does not cow, but it does stuff the starting key it finds back
3950  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3951  * key and get a writable path.
3952  *
3953  * This does lock as it descends, and path->keep_locks should be set
3954  * to 1 by the caller.
3955  *
3956  * This honors path->lowest_level to prevent descent past a given level
3957  * of the tree.
3958  *
3959  * min_trans indicates the oldest transaction that you are interested
3960  * in walking through.  Any nodes or leaves older than min_trans are
3961  * skipped over (without reading them).
3962  *
3963  * returns zero if something useful was found, < 0 on error and 1 if there
3964  * was nothing in the tree that matched the search criteria.
3965  */
3966 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3967 			 struct btrfs_key *max_key,
3968 			 struct btrfs_path *path, int cache_only,
3969 			 u64 min_trans)
3970 {
3971 	struct extent_buffer *cur;
3972 	struct btrfs_key found_key;
3973 	int slot;
3974 	int sret;
3975 	u32 nritems;
3976 	int level;
3977 	int ret = 1;
3978 
3979 	WARN_ON(!path->keep_locks);
3980 again:
3981 	cur = btrfs_lock_root_node(root);
3982 	level = btrfs_header_level(cur);
3983 	WARN_ON(path->nodes[level]);
3984 	path->nodes[level] = cur;
3985 	path->locks[level] = 1;
3986 
3987 	if (btrfs_header_generation(cur) < min_trans) {
3988 		ret = 1;
3989 		goto out;
3990 	}
3991 	while (1) {
3992 		nritems = btrfs_header_nritems(cur);
3993 		level = btrfs_header_level(cur);
3994 		sret = bin_search(cur, min_key, level, &slot);
3995 
3996 		/* at the lowest level, we're done; set up the path and exit */
3997 		if (level == path->lowest_level) {
3998 			if (slot >= nritems)
3999 				goto find_next_key;
4000 			ret = 0;
4001 			path->slots[level] = slot;
4002 			btrfs_item_key_to_cpu(cur, &found_key, slot);
4003 			goto out;
4004 		}
4005 		if (sret && slot > 0)
4006 			slot--;
4007 		/*
4008 		 * check this node pointer against the cache_only and
4009 		 * min_trans parameters.  If it isn't in cache or is too
4010 		 * old, skip to the next one.
4011 		 */
4012 		while (slot < nritems) {
4013 			u64 blockptr;
4014 			u64 gen;
4015 			struct extent_buffer *tmp;
4016 			struct btrfs_disk_key disk_key;
4017 
4018 			blockptr = btrfs_node_blockptr(cur, slot);
4019 			gen = btrfs_node_ptr_generation(cur, slot);
4020 			if (gen < min_trans) {
4021 				slot++;
4022 				continue;
4023 			}
4024 			if (!cache_only)
4025 				break;
4026 
4027 			if (max_key) {
4028 				btrfs_node_key(cur, &disk_key, slot);
4029 				if (comp_keys(&disk_key, max_key) >= 0) {
4030 					ret = 1;
4031 					goto out;
4032 				}
4033 			}
4034 
4035 			tmp = btrfs_find_tree_block(root, blockptr,
4036 					    btrfs_level_size(root, level - 1));
4037 
4038 			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
4039 				free_extent_buffer(tmp);
4040 				break;
4041 			}
4042 			if (tmp)
4043 				free_extent_buffer(tmp);
4044 			slot++;
4045 		}
4046 find_next_key:
4047 		/*
4048 		 * we didn't find a candidate key in this node, walk forward
4049 		 * and find another one
4050 		 */
4051 		if (slot >= nritems) {
4052 			path->slots[level] = slot;
4053 			btrfs_set_path_blocking(path);
4054 			sret = btrfs_find_next_key(root, path, min_key, level,
4055 						  cache_only, min_trans);
4056 			if (sret == 0) {
4057 				btrfs_release_path(root, path);
4058 				goto again;
4059 			} else {
4060 				goto out;
4061 			}
4062 		}
4063 		/* save our key so we can return it to the caller */
4064 		btrfs_node_key_to_cpu(cur, &found_key, slot);
4065 		path->slots[level] = slot;
4066 		if (level == path->lowest_level) {
4067 			ret = 0;
4068 			unlock_up(path, level, 1);
4069 			goto out;
4070 		}
4071 		btrfs_set_path_blocking(path);
4072 		cur = read_node_slot(root, cur, slot);
4073 
4074 		btrfs_tree_lock(cur);
4075 
4076 		path->locks[level - 1] = 1;
4077 		path->nodes[level - 1] = cur;
4078 		unlock_up(path, level, 1);
4079 		btrfs_clear_path_blocking(path, NULL);
4080 	}
4081 out:
4082 	if (ret == 0)
4083 		memcpy(min_key, &found_key, sizeof(found_key));
4084 	btrfs_set_path_blocking(path);
4085 	return ret;
4086 }
4087 
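/*
 * Usage sketch (editor's example, modeled loosely on the defrag and
 * tree-log callers; the key-advance step is simplified): visit every
 * key reachable in blocks newer than min_trans.
 */
static int example_scan_newer(struct btrfs_root *root, u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->keep_locks = 1;	/* required by btrfs_search_forward */

	min_key.objectid = 0;
	min_key.type = 0;
	min_key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, NULL, path,
					   0, min_trans);
		if (ret != 0)
			break;	/* < 0 on error, 1 when nothing newer */
		/* ... process the item min_key now describes ... */
		btrfs_release_path(root, path);
		if (min_key.offset == (u64)-1)
			break;	/* avoid wrapping the simplified advance */
		min_key.offset++;
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
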
4088 /*
4089  * this is similar to btrfs_next_leaf, but does not try to preserve
4090  * and fixup the path.  It looks for and returns the next key in the
4091  * tree based on the current path and the cache_only and min_trans
4092  * parameters.
4093  *
4094  * 0 is returned if another key is found, < 0 if there are any errors
4095  * and 1 is returned if there are no higher keys in the tree
4096  *
4097  * path->keep_locks should be set to 1 for the search done before
4098  * calling this function.
4099  */
4100 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4101 			struct btrfs_key *key, int lowest_level,
4102 			int cache_only, u64 min_trans)
4103 {
4104 	int level = lowest_level;
4105 	int slot;
4106 	struct extent_buffer *c;
4107 
4108 	WARN_ON(!path->keep_locks);
4109 	while (level < BTRFS_MAX_LEVEL) {
4110 		if (!path->nodes[level])
4111 			return 1;
4112 
4113 		slot = path->slots[level] + 1;
4114 		c = path->nodes[level];
4115 next:
4116 		if (slot >= btrfs_header_nritems(c)) {
4117 			level++;
4118 			if (level == BTRFS_MAX_LEVEL)
4119 				return 1;
4120 			continue;
4121 		}
4122 		if (level == 0)
4123 			btrfs_item_key_to_cpu(c, key, slot);
4124 		else {
4125 			u64 blockptr = btrfs_node_blockptr(c, slot);
4126 			u64 gen = btrfs_node_ptr_generation(c, slot);
4127 
4128 			if (cache_only) {
4129 				struct extent_buffer *cur;
4130 				cur = btrfs_find_tree_block(root, blockptr,
4131 					    btrfs_level_size(root, level - 1));
4132 				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
4133 					slot++;
4134 					if (cur)
4135 						free_extent_buffer(cur);
4136 					goto next;
4137 				}
4138 				free_extent_buffer(cur);
4139 			}
4140 			if (gen < min_trans) {
4141 				slot++;
4142 				goto next;
4143 			}
4144 			btrfs_node_key_to_cpu(c, key, slot);
4145 		}
4146 		return 0;
4147 	}
4148 	return 1;
4149 }
4150 
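/*
 * Usage sketch (editor's example): peek at the key that follows the
 * current slot without moving the path.  The preceding search must
 * have been made with path->keep_locks set to 1.
 */
static int example_peek_next(struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_key *next_key)
{
	/* level 0, no cache_only filter, no minimum transid */
	return btrfs_find_next_key(root, path, next_key, 0, 0, 0);
}
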
4151 /*
4152  * search the tree again to find a leaf with greater keys
4153  * returns 0 if it found something or 1 if there are no greater leaves.
4154  * returns < 0 on io errors.
4155  */
4156 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4157 {
4158 	int slot;
4159 	int level;
4160 	struct extent_buffer *c;
4161 	struct extent_buffer *next;
4162 	struct btrfs_key key;
4163 	u32 nritems;
4164 	int ret;
4165 	int old_spinning = path->leave_spinning;
4166 	int force_blocking = 0;
4167 
4168 	nritems = btrfs_header_nritems(path->nodes[0]);
4169 	if (nritems == 0)
4170 		return 1;
4171 
4172 	/*
4173 	 * we take the blocks in an order that upsets lockdep.  Using
4174 	 * blocking mode is the only way around it.
4175 	 */
4176 #ifdef CONFIG_DEBUG_LOCK_ALLOC
4177 	force_blocking = 1;
4178 #endif
4179 
4180 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4181 again:
4182 	level = 1;
4183 	next = NULL;
4184 	btrfs_release_path(root, path);
4185 
4186 	path->keep_locks = 1;
4187 
4188 	if (!force_blocking)
4189 		path->leave_spinning = 1;
4190 
4191 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4192 	path->keep_locks = 0;
4193 
4194 	if (ret < 0)
4195 		return ret;
4196 
4197 	nritems = btrfs_header_nritems(path->nodes[0]);
4198 	/*
4199 	 * by releasing the path above we dropped all our locks.  A balance
4200 	 * could have added more items next to the key that used to be
4201 	 * at the very end of the block.  So, check again here and
4202 	 * advance the path if there are now more items available.
4203 	 */
4204 	if (nritems > 0 && path->slots[0] < nritems - 1) {
4205 		path->slots[0]++;
4206 		ret = 0;
4207 		goto done;
4208 	}
4209 
4210 	while (level < BTRFS_MAX_LEVEL) {
4211 		if (!path->nodes[level]) {
4212 			ret = 1;
4213 			goto done;
4214 		}
4215 
4216 		slot = path->slots[level] + 1;
4217 		c = path->nodes[level];
4218 		if (slot >= btrfs_header_nritems(c)) {
4219 			level++;
4220 			if (level == BTRFS_MAX_LEVEL) {
4221 				ret = 1;
4222 				goto done;
4223 			}
4224 			continue;
4225 		}
4226 
4227 		if (next) {
4228 			btrfs_tree_unlock(next);
4229 			free_extent_buffer(next);
4230 		}
4231 
4232 		next = c;
4233 		ret = read_block_for_search(NULL, root, path, &next, level,
4234 					    slot, &key);
4235 		if (ret == -EAGAIN)
4236 			goto again;
4237 
4238 		if (ret < 0) {
4239 			btrfs_release_path(root, path);
4240 			goto done;
4241 		}
4242 
4243 		if (!path->skip_locking) {
4244 			ret = btrfs_try_spin_lock(next);
4245 			if (!ret) {
4246 				btrfs_set_path_blocking(path);
4247 				btrfs_tree_lock(next);
4248 				if (!force_blocking)
4249 					btrfs_clear_path_blocking(path, next);
4250 			}
4251 			if (force_blocking)
4252 				btrfs_set_lock_blocking(next);
4253 		}
4254 		break;
4255 	}
4256 	path->slots[level] = slot;
4257 	while (1) {
4258 		level--;
4259 		c = path->nodes[level];
4260 		if (path->locks[level])
4261 			btrfs_tree_unlock(c);
4262 
4263 		free_extent_buffer(c);
4264 		path->nodes[level] = next;
4265 		path->slots[level] = 0;
4266 		if (!path->skip_locking)
4267 			path->locks[level] = 1;
4268 
4269 		if (!level)
4270 			break;
4271 
4272 		ret = read_block_for_search(NULL, root, path, &next, level,
4273 					    0, &key);
4274 		if (ret == -EAGAIN)
4275 			goto again;
4276 
4277 		if (ret < 0) {
4278 			btrfs_release_path(root, path);
4279 			goto done;
4280 		}
4281 
4282 		if (!path->skip_locking) {
4283 			btrfs_assert_tree_locked(path->nodes[level]);
4284 			ret = btrfs_try_spin_lock(next);
4285 			if (!ret) {
4286 				btrfs_set_path_blocking(path);
4287 				btrfs_tree_lock(next);
4288 				if (!force_blocking)
4289 					btrfs_clear_path_blocking(path, next);
4290 			}
4291 			if (force_blocking)
4292 				btrfs_set_lock_blocking(next);
4293 		}
4294 	}
4295 	ret = 0;
4296 done:
4297 	unlock_up(path, 0, 1);
4298 	path->leave_spinning = old_spinning;
4299 	if (!old_spinning)
4300 		btrfs_set_path_blocking(path);
4301 
4302 	return ret;
4303 }
4304 
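/*
 * Usage sketch (editor's example): the standard forward-iteration
 * loop, walking every item in the tree from the lowest key onward.
 */
static int example_iterate(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = 0;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret != 0)
				break;	/* < 0 on error, 1 at end of tree */
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* ... process the item at path->slots[0] ... */
		path->slots[0]++;
	}
	if (ret > 0)
		ret = 0;	/* hitting the end of the tree is not an error */
out:
	btrfs_free_path(path);
	return ret;
}
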
4305 /*
4306  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4307  * searching until it gets past min_objectid or finds an item of 'type'
4308  *
4309  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4310  */
4311 int btrfs_previous_item(struct btrfs_root *root,
4312 			struct btrfs_path *path, u64 min_objectid,
4313 			int type)
4314 {
4315 	struct btrfs_key found_key;
4316 	struct extent_buffer *leaf;
4317 	u32 nritems;
4318 	int ret;
4319 
4320 	while (1) {
4321 		if (path->slots[0] == 0) {
4322 			btrfs_set_path_blocking(path);
4323 			ret = btrfs_prev_leaf(root, path);
4324 			if (ret != 0)
4325 				return ret;
4326 		} else {
4327 			path->slots[0]--;
4328 		}
4329 		leaf = path->nodes[0];
4330 		nritems = btrfs_header_nritems(leaf);
4331 		if (nritems == 0)
4332 			return 1;
4333 		if (path->slots[0] == nritems)
4334 			path->slots[0]--;
4335 
4336 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4337 		if (found_key.type == type)
4338 			return 0;
4339 		if (found_key.objectid < min_objectid)
4340 			break;
4341 		if (found_key.objectid == min_objectid &&
4342 		    found_key.type < type)
4343 			break;
4344 	}
4345 	return 1;
4346 }
4347
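/*
 * Usage sketch (editor's example; the objectid comes from the caller
 * and the item type chosen here is illustrative): walk backwards to
 * the closest preceding extent item at or above min_objectid.
 */
static int example_find_prev(struct btrfs_root *root,
			     struct btrfs_path *path,
			     u64 min_objectid, struct btrfs_key *key)
{
	int ret;

	ret = btrfs_previous_item(root, path, min_objectid,
				  BTRFS_EXTENT_ITEM_KEY);
	if (ret == 0)
		/* *key now describes the item that was found */
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
	return ret;
}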