xref: /openbmc/linux/fs/btrfs/ctree.c (revision 92ed1a76)
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include "ctree.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "locking.h"
26 
27 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
28 		      *root, struct btrfs_path *path, int level);
29 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
30 		      *root, struct btrfs_key *ins_key,
31 		      struct btrfs_path *path, int data_size, int extend);
32 static int push_node_left(struct btrfs_trans_handle *trans,
33 			  struct btrfs_root *root, struct extent_buffer *dst,
34 			  struct extent_buffer *src, int empty);
35 static int balance_node_right(struct btrfs_trans_handle *trans,
36 			      struct btrfs_root *root,
37 			      struct extent_buffer *dst_buf,
38 			      struct extent_buffer *src_buf);
39 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40 		   struct btrfs_path *path, int level, int slot);
41 static int setup_items_for_insert(struct btrfs_trans_handle *trans,
42 			struct btrfs_root *root, struct btrfs_path *path,
43 			struct btrfs_key *cpu_key, u32 *data_size,
44 			u32 total_data, u32 total_size, int nr);
45 
46 
47 struct btrfs_path *btrfs_alloc_path(void)
48 {
49 	struct btrfs_path *path;
50 	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
51 	if (path)
52 		path->reada = 1;
53 	return path;
54 }
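/*
 * Usage sketch (illustrative only, not part of the original file): a
 * caller normally pairs btrfs_alloc_path() with btrfs_free_path() and
 * lets the path hold locks and extent buffer references in between.
 * The variable ino below is hypothetical.
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key;
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = ino;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		... the item is at path->nodes[0], path->slots[0] ...
 *	btrfs_free_path(path);	(drops the locks and references as well)
 */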
55 
56 /*
57  * set all locked nodes in the path to blocking locks.  This should
58  * be done before scheduling
59  */
60 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
61 {
62 	int i;
63 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
64 		if (p->nodes[i] && p->locks[i])
65 			btrfs_set_lock_blocking(p->nodes[i]);
66 	}
67 }
68 
69 /*
70  * reset all the locked nodes in the path to spinning locks.
71  *
72  * held is used to keep lockdep happy, when lockdep is enabled
73  * we set held to a blocking lock before we go around and
74  * retake all the spinlocks in the path.  You can safely use NULL
75  * for held
76  */
77 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
78 					struct extent_buffer *held)
79 {
80 	int i;
81 
82 #ifdef CONFIG_DEBUG_LOCK_ALLOC
83 	/* lockdep really cares that we take all of these spinlocks
84 	 * in the right order.  If any of the locks in the path are not
85 	 * currently blocking, it is going to complain.  So, make really
86 	 * really sure by forcing the path to blocking before we clear
87 	 * the path blocking.
88 	 */
89 	if (held)
90 		btrfs_set_lock_blocking(held);
91 	btrfs_set_path_blocking(p);
92 #endif
93 
94 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
95 		if (p->nodes[i] && p->locks[i])
96 			btrfs_clear_lock_blocking(p->nodes[i]);
97 	}
98 
99 #ifdef CONFIG_DEBUG_LOCK_ALLOC
100 	if (held)
101 		btrfs_clear_lock_blocking(held);
102 #endif
103 }
104 
105 /* this also releases the path */
106 void btrfs_free_path(struct btrfs_path *p)
107 {
108 	btrfs_release_path(NULL, p);
109 	kmem_cache_free(btrfs_path_cachep, p);
110 }
111 
112 /*
113  * path release drops references on the extent buffers in the path
114  * and it drops any locks held by this path
115  *
116  * It is safe to call this on paths that have no locks or extent buffers held.
117  */
118 noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
119 {
120 	int i;
121 
122 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
123 		p->slots[i] = 0;
124 		if (!p->nodes[i])
125 			continue;
126 		if (p->locks[i]) {
127 			btrfs_tree_unlock(p->nodes[i]);
128 			p->locks[i] = 0;
129 		}
130 		free_extent_buffer(p->nodes[i]);
131 		p->nodes[i] = NULL;
132 	}
133 }
134 
135 /*
136  * safely gets a reference on the root node of a tree.  A lock
137  * is not taken, so a concurrent writer may put a different node
138  * at the root of the tree.  See btrfs_lock_root_node for the
139  * looping required.
140  *
141  * The extent buffer returned by this has a reference taken, so
142  * it won't disappear.  It may stop being the root of the tree
143  * at any time because there are no locks held.
144  */
145 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
146 {
147 	struct extent_buffer *eb;
148 	spin_lock(&root->node_lock);
149 	eb = root->node;
150 	extent_buffer_get(eb);
151 	spin_unlock(&root->node_lock);
152 	return eb;
153 }
154 
155 /* loop around taking references on and locking the root node of the
156  * tree until you end up with a lock on the root.  A locked buffer
157  * is returned, with a reference held.
158  */
159 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
160 {
161 	struct extent_buffer *eb;
162 
163 	while (1) {
164 		eb = btrfs_root_node(root);
165 		btrfs_tree_lock(eb);
166 
167 		spin_lock(&root->node_lock);
168 		if (eb == root->node) {
169 			spin_unlock(&root->node_lock);
170 			break;
171 		}
172 		spin_unlock(&root->node_lock);
173 
174 		btrfs_tree_unlock(eb);
175 		free_extent_buffer(eb);
176 	}
177 	return eb;
178 }
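/*
 * Usage sketch (illustrative, not in the original source): the caller
 * owns both the lock and the reference on the returned buffer.
 *
 *	struct extent_buffer *eb = btrfs_lock_root_node(root);
 *	... eb is guaranteed to still be root->node here ...
 *	btrfs_tree_unlock(eb);
 *	free_extent_buffer(eb);
 */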
179 
180 /* cowonly roots (everything that is not a reference counted cow subvolume)
181  * just get put onto a simple dirty list.  transaction.c walks this list to
182  * make sure they get properly updated on disk.
183  */
184 static void add_root_to_dirty_list(struct btrfs_root *root)
185 {
186 	if (root->track_dirty && list_empty(&root->dirty_list)) {
187 		list_add(&root->dirty_list,
188 			 &root->fs_info->dirty_cowonly_roots);
189 	}
190 }
191 
192 /*
193  * used by snapshot creation to make a copy of a root for a tree with
194  * a given objectid.  The buffer with the new root node is returned in
195  * cow_ret, and this func returns zero on success or a negative error code.
196  */
197 int btrfs_copy_root(struct btrfs_trans_handle *trans,
198 		      struct btrfs_root *root,
199 		      struct extent_buffer *buf,
200 		      struct extent_buffer **cow_ret, u64 new_root_objectid)
201 {
202 	struct extent_buffer *cow;
203 	int ret = 0;
204 	int level;
205 	struct btrfs_disk_key disk_key;
206 
207 	WARN_ON(root->ref_cows && trans->transid !=
208 		root->fs_info->running_transaction->transid);
209 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
210 
211 	level = btrfs_header_level(buf);
212 	if (level == 0)
213 		btrfs_item_key(buf, &disk_key, 0);
214 	else
215 		btrfs_node_key(buf, &disk_key, 0);
216 
217 	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
218 				     new_root_objectid, &disk_key, level,
219 				     buf->start, 0);
220 	if (IS_ERR(cow))
221 		return PTR_ERR(cow);
222 
223 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
224 	btrfs_set_header_bytenr(cow, cow->start);
225 	btrfs_set_header_generation(cow, trans->transid);
226 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
227 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
228 				     BTRFS_HEADER_FLAG_RELOC);
229 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
230 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
231 	else
232 		btrfs_set_header_owner(cow, new_root_objectid);
233 
234 	write_extent_buffer(cow, root->fs_info->fsid,
235 			    (unsigned long)btrfs_header_fsid(cow),
236 			    BTRFS_FSID_SIZE);
237 
238 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
239 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
240 		ret = btrfs_inc_ref(trans, root, cow, 1);
241 	else
242 		ret = btrfs_inc_ref(trans, root, cow, 0);
243 
244 	if (ret)
245 		return ret;
246 
247 	btrfs_mark_buffer_dirty(cow);
248 	*cow_ret = cow;
249 	return 0;
250 }
251 
252 /*
253  * check if the tree block can be shared by multiple trees
254  */
255 int btrfs_block_can_be_shared(struct btrfs_root *root,
256 			      struct extent_buffer *buf)
257 {
258 	/*
259 	 * Tree blocks not in reference counted trees and tree roots
260 	 * are never shared. If a block was allocated after the last
261 	 * snapshot and the block was not allocated by tree relocation,
262 	 * we know the block is not shared.
263 	 */
264 	if (root->ref_cows &&
265 	    buf != root->node && buf != root->commit_root &&
266 	    (btrfs_header_generation(buf) <=
267 	     btrfs_root_last_snapshot(&root->root_item) ||
268 	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
269 		return 1;
270 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
271 	if (root->ref_cows &&
272 	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
273 		return 1;
274 #endif
275 	return 0;
276 }
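/*
 * Example (illustrative): if a reference counted subvolume was last
 * snapshotted at generation 100, a non-root block with header generation
 * 90 may still be shared with that snapshot, so this returns 1 for it;
 * a block COWed at generation 120 (and not flagged for relocation)
 * cannot be shared, and this returns 0.
 */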
277 
278 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
279 				       struct btrfs_root *root,
280 				       struct extent_buffer *buf,
281 				       struct extent_buffer *cow,
282 				       int *last_ref)
283 {
284 	u64 refs;
285 	u64 owner;
286 	u64 flags;
287 	u64 new_flags = 0;
288 	int ret;
289 
290 	/*
291 	 * Backrefs update rules:
292 	 *
293 	 * Always use full backrefs for extent pointers in tree block
294 	 * allocated by tree relocation.
295 	 *
296 	 * If a shared tree block is no longer referenced by its owner
297 	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
298 	 * use full backrefs for extent pointers in tree block.
299 	 *
300 	 * If a tree block is being relocated
301 	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
302 	 * use full backrefs for extent pointers in the tree block.
303 	 * The reason for this is that some operations (such as dropping
304 	 * a tree) are only allowed on blocks that use full backrefs.
305 	 */
306 
307 	if (btrfs_block_can_be_shared(root, buf)) {
308 		ret = btrfs_lookup_extent_info(trans, root, buf->start,
309 					       buf->len, &refs, &flags);
310 		BUG_ON(ret);
311 		BUG_ON(refs == 0);
312 	} else {
313 		refs = 1;
314 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
315 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
316 			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
317 		else
318 			flags = 0;
319 	}
320 
321 	owner = btrfs_header_owner(buf);
322 	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
323 	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
324 
325 	if (refs > 1) {
326 		if ((owner == root->root_key.objectid ||
327 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
328 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
329 			ret = btrfs_inc_ref(trans, root, buf, 1);
330 			BUG_ON(ret);
331 
332 			if (root->root_key.objectid ==
333 			    BTRFS_TREE_RELOC_OBJECTID) {
334 				ret = btrfs_dec_ref(trans, root, buf, 0);
335 				BUG_ON(ret);
336 				ret = btrfs_inc_ref(trans, root, cow, 1);
337 				BUG_ON(ret);
338 			}
339 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
340 		} else {
341 
342 			if (root->root_key.objectid ==
343 			    BTRFS_TREE_RELOC_OBJECTID)
344 				ret = btrfs_inc_ref(trans, root, cow, 1);
345 			else
346 				ret = btrfs_inc_ref(trans, root, cow, 0);
347 			BUG_ON(ret);
348 		}
349 		if (new_flags != 0) {
350 			ret = btrfs_set_disk_extent_flags(trans, root,
351 							  buf->start,
352 							  buf->len,
353 							  new_flags, 0);
354 			BUG_ON(ret);
355 		}
356 	} else {
357 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
358 			if (root->root_key.objectid ==
359 			    BTRFS_TREE_RELOC_OBJECTID)
360 				ret = btrfs_inc_ref(trans, root, cow, 1);
361 			else
362 				ret = btrfs_inc_ref(trans, root, cow, 0);
363 			BUG_ON(ret);
364 			ret = btrfs_dec_ref(trans, root, buf, 1);
365 			BUG_ON(ret);
366 		}
367 		clean_tree_block(trans, root, buf);
368 		*last_ref = 1;
369 	}
370 	return 0;
371 }
372 
373 /*
374  * does the dirty work in cow of a single block.  The parent block (if
375  * supplied) is updated to point to the new cow copy.  The new buffer is marked
376  * dirty and returned locked.  If you modify the block it needs to be marked
377  * dirty again.
378  *
379  * search_start -- an allocation hint for the new block
380  *
381  * empty_size -- a hint that you plan on doing more cow.  This is the size in
382  * bytes the allocator should try to find free next to the block it returns.
383  * This is just a hint and may be ignored by the allocator.
384  */
385 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
386 			     struct btrfs_root *root,
387 			     struct extent_buffer *buf,
388 			     struct extent_buffer *parent, int parent_slot,
389 			     struct extent_buffer **cow_ret,
390 			     u64 search_start, u64 empty_size)
391 {
392 	struct btrfs_disk_key disk_key;
393 	struct extent_buffer *cow;
394 	int level;
395 	int last_ref = 0;
396 	int unlock_orig = 0;
397 	u64 parent_start;
398 
399 	if (*cow_ret == buf)
400 		unlock_orig = 1;
401 
402 	btrfs_assert_tree_locked(buf);
403 
404 	WARN_ON(root->ref_cows && trans->transid !=
405 		root->fs_info->running_transaction->transid);
406 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
407 
408 	level = btrfs_header_level(buf);
409 
410 	if (level == 0)
411 		btrfs_item_key(buf, &disk_key, 0);
412 	else
413 		btrfs_node_key(buf, &disk_key, 0);
414 
415 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
416 		if (parent)
417 			parent_start = parent->start;
418 		else
419 			parent_start = 0;
420 	} else
421 		parent_start = 0;
422 
423 	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
424 				     root->root_key.objectid, &disk_key,
425 				     level, search_start, empty_size);
426 	if (IS_ERR(cow))
427 		return PTR_ERR(cow);
428 
429 	/* cow is set to blocking by btrfs_init_new_buffer */
430 
431 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
432 	btrfs_set_header_bytenr(cow, cow->start);
433 	btrfs_set_header_generation(cow, trans->transid);
434 	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
435 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
436 				     BTRFS_HEADER_FLAG_RELOC);
437 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
438 		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
439 	else
440 		btrfs_set_header_owner(cow, root->root_key.objectid);
441 
442 	write_extent_buffer(cow, root->fs_info->fsid,
443 			    (unsigned long)btrfs_header_fsid(cow),
444 			    BTRFS_FSID_SIZE);
445 
446 	update_ref_for_cow(trans, root, buf, cow, &last_ref);
447 
448 	if (root->ref_cows)
449 		btrfs_reloc_cow_block(trans, root, buf, cow);
450 
451 	if (buf == root->node) {
452 		WARN_ON(parent && parent != buf);
453 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
454 		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
455 			parent_start = buf->start;
456 		else
457 			parent_start = 0;
458 
459 		spin_lock(&root->node_lock);
460 		root->node = cow;
461 		extent_buffer_get(cow);
462 		spin_unlock(&root->node_lock);
463 
464 		btrfs_free_tree_block(trans, root, buf, parent_start,
465 				      last_ref);
466 		free_extent_buffer(buf);
467 		add_root_to_dirty_list(root);
468 	} else {
469 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
470 			parent_start = parent->start;
471 		else
472 			parent_start = 0;
473 
474 		WARN_ON(trans->transid != btrfs_header_generation(parent));
475 		btrfs_set_node_blockptr(parent, parent_slot,
476 					cow->start);
477 		btrfs_set_node_ptr_generation(parent, parent_slot,
478 					      trans->transid);
479 		btrfs_mark_buffer_dirty(parent);
480 		btrfs_free_tree_block(trans, root, buf, parent_start,
481 				      last_ref);
482 	}
483 	if (unlock_orig)
484 		btrfs_tree_unlock(buf);
485 	free_extent_buffer(buf);
486 	btrfs_mark_buffer_dirty(cow);
487 	*cow_ret = cow;
488 	return 0;
489 }
490 
491 static inline int should_cow_block(struct btrfs_trans_handle *trans,
492 				   struct btrfs_root *root,
493 				   struct extent_buffer *buf)
494 {
495 	if (btrfs_header_generation(buf) == trans->transid &&
496 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
497 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
498 	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
499 		return 0;
500 	return 1;
501 }
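/*
 * Example (illustrative): a block that was already COWed earlier in the
 * running transaction (header generation == trans->transid) and has not
 * been written out yet can be modified in place, so this returns 0 and
 * btrfs_cow_block() below turns into a no-op for it.
 */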
502 
503 /*
504  * cows a single block, see __btrfs_cow_block for the real work.
505  * This version of it has extra checks so that a block isn't cow'd more than
506  * once per transaction, as long as it hasn't been written yet
507  */
508 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
509 		    struct btrfs_root *root, struct extent_buffer *buf,
510 		    struct extent_buffer *parent, int parent_slot,
511 		    struct extent_buffer **cow_ret)
512 {
513 	u64 search_start;
514 	int ret;
515 
516 	if (trans->transaction != root->fs_info->running_transaction) {
517 		printk(KERN_CRIT "trans %llu running %llu\n",
518 		       (unsigned long long)trans->transid,
519 		       (unsigned long long)
520 		       root->fs_info->running_transaction->transid);
521 		WARN_ON(1);
522 	}
523 	if (trans->transid != root->fs_info->generation) {
524 		printk(KERN_CRIT "trans %llu running %llu\n",
525 		       (unsigned long long)trans->transid,
526 		       (unsigned long long)root->fs_info->generation);
527 		WARN_ON(1);
528 	}
529 
530 	if (!should_cow_block(trans, root, buf)) {
531 		*cow_ret = buf;
532 		return 0;
533 	}
534 
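	/* hint the allocator at the 1GB-aligned region containing buf */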
535 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
536 
537 	if (parent)
538 		btrfs_set_lock_blocking(parent);
539 	btrfs_set_lock_blocking(buf);
540 
541 	ret = __btrfs_cow_block(trans, root, buf, parent,
542 				 parent_slot, cow_ret, search_start, 0);
543 	return ret;
544 }
545 
546 /*
547  * helper function for defrag to decide if two blocks pointed to by a
548  * node are actually close by
549  */
550 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
551 {
552 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
553 		return 1;
554 	if (blocknr > other && blocknr - (other + blocksize) < 32768)
555 		return 1;
556 	return 0;
557 }
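/*
 * Worked example (illustrative): with a 4K blocksize, blocknr 0 and
 * other 20000 are close because 20000 - (0 + 4096) = 15904 < 32768,
 * while other 40000 is not, since 40000 - 4096 = 35904 >= 32768.
 */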
558 
559 /*
560  * compare two keys in a memcmp fashion
561  */
562 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
563 {
564 	struct btrfs_key k1;
565 
566 	btrfs_disk_key_to_cpu(&k1, disk);
567 
568 	return btrfs_comp_cpu_keys(&k1, k2);
569 }
570 
571 /*
572  * same as comp_keys only with two btrfs_key's
573  */
574 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
575 {
576 	if (k1->objectid > k2->objectid)
577 		return 1;
578 	if (k1->objectid < k2->objectid)
579 		return -1;
580 	if (k1->type > k2->type)
581 		return 1;
582 	if (k1->type < k2->type)
583 		return -1;
584 	if (k1->offset > k2->offset)
585 		return 1;
586 	if (k1->offset < k2->offset)
587 		return -1;
588 	return 0;
589 }
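/*
 * Example (illustrative): keys compare by objectid, then type, then
 * offset, so {257 INODE_ITEM 0} < {257 EXTENT_DATA 0} < {258 INODE_ITEM 0},
 * because BTRFS_INODE_ITEM_KEY is numerically smaller than
 * BTRFS_EXTENT_DATA_KEY.
 */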
590 
591 /*
592  * this is used by the defrag code to go through all the
593  * leaves pointed to by a node and reallocate them so that
594  * disk order is close to key order
595  */
596 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
597 		       struct btrfs_root *root, struct extent_buffer *parent,
598 		       int start_slot, int cache_only, u64 *last_ret,
599 		       struct btrfs_key *progress)
600 {
601 	struct extent_buffer *cur;
602 	u64 blocknr;
603 	u64 gen;
604 	u64 search_start = *last_ret;
605 	u64 last_block = 0;
606 	u64 other;
607 	u32 parent_nritems;
608 	int end_slot;
609 	int i;
610 	int err = 0;
611 	int parent_level;
612 	int uptodate;
613 	u32 blocksize;
614 	int progress_passed = 0;
615 	struct btrfs_disk_key disk_key;
616 
617 	parent_level = btrfs_header_level(parent);
618 	if (cache_only && parent_level != 1)
619 		return 0;
620 
621 	if (trans->transaction != root->fs_info->running_transaction)
622 		WARN_ON(1);
623 	if (trans->transid != root->fs_info->generation)
624 		WARN_ON(1);
625 
626 	parent_nritems = btrfs_header_nritems(parent);
627 	blocksize = btrfs_level_size(root, parent_level - 1);
628 	end_slot = parent_nritems;
629 
630 	if (parent_nritems == 1)
631 		return 0;
632 
633 	btrfs_set_lock_blocking(parent);
634 
635 	for (i = start_slot; i < end_slot; i++) {
636 		int close = 1;
637 
638 		if (!parent->map_token) {
639 			map_extent_buffer(parent,
640 					btrfs_node_key_ptr_offset(i),
641 					sizeof(struct btrfs_key_ptr),
642 					&parent->map_token, &parent->kaddr,
643 					&parent->map_start, &parent->map_len,
644 					KM_USER1);
645 		}
646 		btrfs_node_key(parent, &disk_key, i);
647 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
648 			continue;
649 
650 		progress_passed = 1;
651 		blocknr = btrfs_node_blockptr(parent, i);
652 		gen = btrfs_node_ptr_generation(parent, i);
653 		if (last_block == 0)
654 			last_block = blocknr;
655 
656 		if (i > 0) {
657 			other = btrfs_node_blockptr(parent, i - 1);
658 			close = close_blocks(blocknr, other, blocksize);
659 		}
660 		if (!close && i < end_slot - 2) {
661 			other = btrfs_node_blockptr(parent, i + 1);
662 			close = close_blocks(blocknr, other, blocksize);
663 		}
664 		if (close) {
665 			last_block = blocknr;
666 			continue;
667 		}
668 		if (parent->map_token) {
669 			unmap_extent_buffer(parent, parent->map_token,
670 					    KM_USER1);
671 			parent->map_token = NULL;
672 		}
673 
674 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
675 		if (cur)
676 			uptodate = btrfs_buffer_uptodate(cur, gen);
677 		else
678 			uptodate = 0;
679 		if (!cur || !uptodate) {
680 			if (cache_only) {
681 				free_extent_buffer(cur);
682 				continue;
683 			}
684 			if (!cur) {
685 				cur = read_tree_block(root, blocknr,
686 							 blocksize, gen);
687 			} else if (!uptodate) {
688 				btrfs_read_buffer(cur, gen);
689 			}
690 		}
691 		if (search_start == 0)
692 			search_start = last_block;
693 
694 		btrfs_tree_lock(cur);
695 		btrfs_set_lock_blocking(cur);
696 		err = __btrfs_cow_block(trans, root, cur, parent, i,
697 					&cur, search_start,
698 					min(16 * blocksize,
699 					    (end_slot - i) * blocksize));
700 		if (err) {
701 			btrfs_tree_unlock(cur);
702 			free_extent_buffer(cur);
703 			break;
704 		}
705 		search_start = cur->start;
706 		last_block = cur->start;
707 		*last_ret = search_start;
708 		btrfs_tree_unlock(cur);
709 		free_extent_buffer(cur);
710 	}
711 	if (parent->map_token) {
712 		unmap_extent_buffer(parent, parent->map_token,
713 				    KM_USER1);
714 		parent->map_token = NULL;
715 	}
716 	return err;
717 }
718 
719 /*
720  * The leaf data grows from end-to-front in the node.
721  * This returns the address of the start of the last item,
722  * which is the end of the leaf data stack.
723  */
724 static inline unsigned int leaf_data_end(struct btrfs_root *root,
725 					 struct extent_buffer *leaf)
726 {
727 	u32 nr = btrfs_header_nritems(leaf);
728 	if (nr == 0)
729 		return BTRFS_LEAF_DATA_SIZE(root);
730 	return btrfs_item_offset_nr(leaf, nr - 1);
731 }
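/*
 * Leaf layout sketch (illustrative): item headers grow forward from the
 * start of the block while item data grows backward from the end, with
 * the free space in between:
 *
 *	[header][item 0][item 1]..[free space]..[data 1][data 0]
 *
 * so the offset of the last item's data is where the data stack stops.
 */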
732 
733 /*
734  * extra debugging checks to make sure all the items in a node are
735  * well formed and in the proper order
736  */
737 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
738 		      int level)
739 {
740 	struct extent_buffer *parent = NULL;
741 	struct extent_buffer *node = path->nodes[level];
742 	struct btrfs_disk_key parent_key;
743 	struct btrfs_disk_key node_key;
744 	int parent_slot;
745 	int slot;
746 	struct btrfs_key cpukey;
747 	u32 nritems = btrfs_header_nritems(node);
748 
749 	if (path->nodes[level + 1])
750 		parent = path->nodes[level + 1];
751 
752 	slot = path->slots[level];
753 	BUG_ON(nritems == 0);
754 	if (parent) {
755 		parent_slot = path->slots[level + 1];
756 		btrfs_node_key(parent, &parent_key, parent_slot);
757 		btrfs_node_key(node, &node_key, 0);
758 		BUG_ON(memcmp(&parent_key, &node_key,
759 			      sizeof(struct btrfs_disk_key)));
760 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
761 		       btrfs_header_bytenr(node));
762 	}
763 	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
764 	if (slot != 0) {
765 		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
766 		btrfs_node_key(node, &node_key, slot);
767 		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
768 	}
769 	if (slot < nritems - 1) {
770 		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
771 		btrfs_node_key(node, &node_key, slot);
772 		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
773 	}
774 	return 0;
775 }
776 
777 /*
778  * extra checking to make sure all the items in a leaf are
779  * well formed and in the proper order
780  */
781 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
782 		      int level)
783 {
784 	struct extent_buffer *leaf = path->nodes[level];
785 	struct extent_buffer *parent = NULL;
786 	int parent_slot;
787 	struct btrfs_key cpukey;
788 	struct btrfs_disk_key parent_key;
789 	struct btrfs_disk_key leaf_key;
790 	int slot = path->slots[0];
791 
792 	u32 nritems = btrfs_header_nritems(leaf);
793 
794 	if (path->nodes[level + 1])
795 		parent = path->nodes[level + 1];
796 
797 	if (nritems == 0)
798 		return 0;
799 
800 	if (parent) {
801 		parent_slot = path->slots[level + 1];
802 		btrfs_node_key(parent, &parent_key, parent_slot);
803 		btrfs_item_key(leaf, &leaf_key, 0);
804 
805 		BUG_ON(memcmp(&parent_key, &leaf_key,
806 		       sizeof(struct btrfs_disk_key)));
807 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
808 		       btrfs_header_bytenr(leaf));
809 	}
810 	if (slot != 0 && slot < nritems - 1) {
811 		btrfs_item_key(leaf, &leaf_key, slot);
812 		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
813 		if (comp_keys(&leaf_key, &cpukey) <= 0) {
814 			btrfs_print_leaf(root, leaf);
815 			printk(KERN_CRIT "slot %d offset bad key\n", slot);
816 			BUG_ON(1);
817 		}
818 		if (btrfs_item_offset_nr(leaf, slot - 1) !=
819 		       btrfs_item_end_nr(leaf, slot)) {
820 			btrfs_print_leaf(root, leaf);
821 			printk(KERN_CRIT "slot %d offset bad\n", slot);
822 			BUG_ON(1);
823 		}
824 	}
825 	if (slot < nritems - 1) {
826 		btrfs_item_key(leaf, &leaf_key, slot);
827 		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
828 		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
829 		if (btrfs_item_offset_nr(leaf, slot) !=
830 			btrfs_item_end_nr(leaf, slot + 1)) {
831 			btrfs_print_leaf(root, leaf);
832 			printk(KERN_CRIT "slot %d offset bad\n", slot);
833 			BUG_ON(1);
834 		}
835 	}
836 	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
837 	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
838 	return 0;
839 }
840 
841 static noinline int check_block(struct btrfs_root *root,
842 				struct btrfs_path *path, int level)
843 {
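	/* note: the early return below skips the node/leaf checks entirely */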
844 	return 0;
845 	if (level == 0)
846 		return check_leaf(root, path, level);
847 	return check_node(root, path, level);
848 }
849 
850 /*
851  * search for key in the extent_buffer.  The items start at offset p,
852  * and they are item_size apart.  There are 'max' items in p.
853  *
854  * the slot in the array is returned via slot, and it points to
855  * the place where you would insert key if it is not found in
856  * the array.
857  *
858  * slot may point to max if the key is bigger than all of the keys
859  */
860 static noinline int generic_bin_search(struct extent_buffer *eb,
861 				       unsigned long p,
862 				       int item_size, struct btrfs_key *key,
863 				       int max, int *slot)
864 {
865 	int low = 0;
866 	int high = max;
867 	int mid;
868 	int ret;
869 	struct btrfs_disk_key *tmp = NULL;
870 	struct btrfs_disk_key unaligned;
871 	unsigned long offset;
872 	char *map_token = NULL;
873 	char *kaddr = NULL;
874 	unsigned long map_start = 0;
875 	unsigned long map_len = 0;
876 	int err;
877 
878 	while (low < high) {
879 		mid = (low + high) / 2;
880 		offset = p + mid * item_size;
881 
882 		if (!map_token || offset < map_start ||
883 		    (offset + sizeof(struct btrfs_disk_key)) >
884 		    map_start + map_len) {
885 			if (map_token) {
886 				unmap_extent_buffer(eb, map_token, KM_USER0);
887 				map_token = NULL;
888 			}
889 
890 			err = map_private_extent_buffer(eb, offset,
891 						sizeof(struct btrfs_disk_key),
892 						&map_token, &kaddr,
893 						&map_start, &map_len, KM_USER0);
894 
895 			if (!err) {
896 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
897 							map_start);
898 			} else {
899 				read_extent_buffer(eb, &unaligned,
900 						   offset, sizeof(unaligned));
901 				tmp = &unaligned;
902 			}
903 
904 		} else {
905 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
906 							map_start);
907 		}
908 		ret = comp_keys(tmp, key);
909 
910 		if (ret < 0)
911 			low = mid + 1;
912 		else if (ret > 0)
913 			high = mid;
914 		else {
915 			*slot = mid;
916 			if (map_token)
917 				unmap_extent_buffer(eb, map_token, KM_USER0);
918 			return 0;
919 		}
920 	}
921 	*slot = low;
922 	if (map_token)
923 		unmap_extent_buffer(eb, map_token, KM_USER0);
924 	return 1;
925 }
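/*
 * Behaviour sketch (illustrative): for node keys {5, 10, 20}, searching
 * for 10 returns 0 with *slot == 1; searching for 15 returns 1 with
 * *slot == 2 (the insert position); searching for 30 returns 1 with
 * *slot == 3, i.e. max.
 */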
926 
927 /*
928  * simple bin_search frontend that does the right thing for
929  * leaves vs nodes
930  */
931 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
932 		      int level, int *slot)
933 {
934 	if (level == 0) {
935 		return generic_bin_search(eb,
936 					  offsetof(struct btrfs_leaf, items),
937 					  sizeof(struct btrfs_item),
938 					  key, btrfs_header_nritems(eb),
939 					  slot);
940 	} else {
941 		return generic_bin_search(eb,
942 					  offsetof(struct btrfs_node, ptrs),
943 					  sizeof(struct btrfs_key_ptr),
944 					  key, btrfs_header_nritems(eb),
945 					  slot);
946 	}
947 	return -1;
948 }
949 
950 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
951 		     int level, int *slot)
952 {
953 	return bin_search(eb, key, level, slot);
954 }
955 
956 static void root_add_used(struct btrfs_root *root, u32 size)
957 {
958 	spin_lock(&root->accounting_lock);
959 	btrfs_set_root_used(&root->root_item,
960 			    btrfs_root_used(&root->root_item) + size);
961 	spin_unlock(&root->accounting_lock);
962 }
963 
964 static void root_sub_used(struct btrfs_root *root, u32 size)
965 {
966 	spin_lock(&root->accounting_lock);
967 	btrfs_set_root_used(&root->root_item,
968 			    btrfs_root_used(&root->root_item) - size);
969 	spin_unlock(&root->accounting_lock);
970 }
971 
972 /* given a node and slot number, this reads the block it points to.  The
973  * extent buffer is returned with a reference taken (but unlocked).
974  * NULL is returned on error.
975  */
976 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
977 				   struct extent_buffer *parent, int slot)
978 {
979 	int level = btrfs_header_level(parent);
980 	if (slot < 0)
981 		return NULL;
982 	if (slot >= btrfs_header_nritems(parent))
983 		return NULL;
984 
985 	BUG_ON(level == 0);
986 
987 	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
988 		       btrfs_level_size(root, level - 1),
989 		       btrfs_node_ptr_generation(parent, slot));
990 }
991 
992 /*
993  * node level balancing, used to make sure nodes are in proper order for
994  * item deletion.  We balance from the top down, so we have to make sure
995  * that a deletion won't leave a node completely empty later on.
996  */
997 static noinline int balance_level(struct btrfs_trans_handle *trans,
998 			 struct btrfs_root *root,
999 			 struct btrfs_path *path, int level)
1000 {
1001 	struct extent_buffer *right = NULL;
1002 	struct extent_buffer *mid;
1003 	struct extent_buffer *left = NULL;
1004 	struct extent_buffer *parent = NULL;
1005 	int ret = 0;
1006 	int wret;
1007 	int pslot;
1008 	int orig_slot = path->slots[level];
1009 	u64 orig_ptr;
1010 
1011 	if (level == 0)
1012 		return 0;
1013 
1014 	mid = path->nodes[level];
1015 
1016 	WARN_ON(!path->locks[level]);
1017 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1018 
1019 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1020 
1021 	if (level < BTRFS_MAX_LEVEL - 1)
1022 		parent = path->nodes[level + 1];
1023 	pslot = path->slots[level + 1];
1024 
1025 	/*
1026 	 * deal with the case where there is only one pointer in the root
1027 	 * by promoting the node below to a root
1028 	 */
1029 	if (!parent) {
1030 		struct extent_buffer *child;
1031 
1032 		if (btrfs_header_nritems(mid) != 1)
1033 			return 0;
1034 
1035 		/* promote the child to a root */
1036 		child = read_node_slot(root, mid, 0);
1037 		BUG_ON(!child);
1038 		btrfs_tree_lock(child);
1039 		btrfs_set_lock_blocking(child);
1040 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1041 		if (ret) {
1042 			btrfs_tree_unlock(child);
1043 			free_extent_buffer(child);
1044 			goto enospc;
1045 		}
1046 
1047 		spin_lock(&root->node_lock);
1048 		root->node = child;
1049 		spin_unlock(&root->node_lock);
1050 
1051 		add_root_to_dirty_list(root);
1052 		btrfs_tree_unlock(child);
1053 
1054 		path->locks[level] = 0;
1055 		path->nodes[level] = NULL;
1056 		clean_tree_block(trans, root, mid);
1057 		btrfs_tree_unlock(mid);
1058 		/* once for the path */
1059 		free_extent_buffer(mid);
1060 
1061 		root_sub_used(root, mid->len);
1062 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1063 		/* once for the root ptr */
1064 		free_extent_buffer(mid);
1065 		return 0;
1066 	}
1067 	if (btrfs_header_nritems(mid) >
1068 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1069 		return 0;
1070 
1071 	btrfs_header_nritems(mid);
1072 
1073 	left = read_node_slot(root, parent, pslot - 1);
1074 	if (left) {
1075 		btrfs_tree_lock(left);
1076 		btrfs_set_lock_blocking(left);
1077 		wret = btrfs_cow_block(trans, root, left,
1078 				       parent, pslot - 1, &left);
1079 		if (wret) {
1080 			ret = wret;
1081 			goto enospc;
1082 		}
1083 	}
1084 	right = read_node_slot(root, parent, pslot + 1);
1085 	if (right) {
1086 		btrfs_tree_lock(right);
1087 		btrfs_set_lock_blocking(right);
1088 		wret = btrfs_cow_block(trans, root, right,
1089 				       parent, pslot + 1, &right);
1090 		if (wret) {
1091 			ret = wret;
1092 			goto enospc;
1093 		}
1094 	}
1095 
1096 	/* first, try to make some room in the middle buffer */
1097 	if (left) {
1098 		orig_slot += btrfs_header_nritems(left);
1099 		wret = push_node_left(trans, root, left, mid, 1);
1100 		if (wret < 0)
1101 			ret = wret;
1102 		btrfs_header_nritems(mid);
1103 	}
1104 
1105 	/*
1106 	 * then try to empty the right most buffer into the middle
1107 	 */
1108 	if (right) {
1109 		wret = push_node_left(trans, root, mid, right, 1);
1110 		if (wret < 0 && wret != -ENOSPC)
1111 			ret = wret;
1112 		if (btrfs_header_nritems(right) == 0) {
1113 			clean_tree_block(trans, root, right);
1114 			btrfs_tree_unlock(right);
1115 			wret = del_ptr(trans, root, path, level + 1, pslot +
1116 				       1);
1117 			if (wret)
1118 				ret = wret;
1119 			root_sub_used(root, right->len);
1120 			btrfs_free_tree_block(trans, root, right, 0, 1);
1121 			free_extent_buffer(right);
1122 			right = NULL;
1123 		} else {
1124 			struct btrfs_disk_key right_key;
1125 			btrfs_node_key(right, &right_key, 0);
1126 			btrfs_set_node_key(parent, &right_key, pslot + 1);
1127 			btrfs_mark_buffer_dirty(parent);
1128 		}
1129 	}
1130 	if (btrfs_header_nritems(mid) == 1) {
1131 		/*
1132 		 * we're not allowed to leave a node with one item in the
1133 		 * tree during a delete.  A deletion from lower in the tree
1134 		 * could try to delete the only pointer in this node.
1135 		 * So, pull some keys from the left.
1136 		 * There has to be a left pointer at this point because
1137 		 * otherwise we would have pulled some pointers from the
1138 		 * right
1139 		 */
1140 		BUG_ON(!left);
1141 		wret = balance_node_right(trans, root, mid, left);
1142 		if (wret < 0) {
1143 			ret = wret;
1144 			goto enospc;
1145 		}
1146 		if (wret == 1) {
1147 			wret = push_node_left(trans, root, left, mid, 1);
1148 			if (wret < 0)
1149 				ret = wret;
1150 		}
1151 		BUG_ON(wret == 1);
1152 	}
1153 	if (btrfs_header_nritems(mid) == 0) {
1154 		clean_tree_block(trans, root, mid);
1155 		btrfs_tree_unlock(mid);
1156 		wret = del_ptr(trans, root, path, level + 1, pslot);
1157 		if (wret)
1158 			ret = wret;
1159 		root_sub_used(root, mid->len);
1160 		btrfs_free_tree_block(trans, root, mid, 0, 1);
1161 		free_extent_buffer(mid);
1162 		mid = NULL;
1163 	} else {
1164 		/* update the parent key to reflect our changes */
1165 		struct btrfs_disk_key mid_key;
1166 		btrfs_node_key(mid, &mid_key, 0);
1167 		btrfs_set_node_key(parent, &mid_key, pslot);
1168 		btrfs_mark_buffer_dirty(parent);
1169 	}
1170 
1171 	/* update the path */
1172 	if (left) {
1173 		if (btrfs_header_nritems(left) > orig_slot) {
1174 			extent_buffer_get(left);
1175 			/* left was locked after cow */
1176 			path->nodes[level] = left;
1177 			path->slots[level + 1] -= 1;
1178 			path->slots[level] = orig_slot;
1179 			if (mid) {
1180 				btrfs_tree_unlock(mid);
1181 				free_extent_buffer(mid);
1182 			}
1183 		} else {
1184 			orig_slot -= btrfs_header_nritems(left);
1185 			path->slots[level] = orig_slot;
1186 		}
1187 	}
1188 	/* double check we haven't messed things up */
1189 	check_block(root, path, level);
1190 	if (orig_ptr !=
1191 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1192 		BUG();
1193 enospc:
1194 	if (right) {
1195 		btrfs_tree_unlock(right);
1196 		free_extent_buffer(right);
1197 	}
1198 	if (left) {
1199 		if (path->nodes[level] != left)
1200 			btrfs_tree_unlock(left);
1201 		free_extent_buffer(left);
1202 	}
1203 	return ret;
1204 }
1205 
1206 /* Node balancing for insertion.  Here we only split or push nodes around
1207  * when they are completely full.  This is also done top down, so we
1208  * have to be pessimistic.
1209  */
1210 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1211 					  struct btrfs_root *root,
1212 					  struct btrfs_path *path, int level)
1213 {
1214 	struct extent_buffer *right = NULL;
1215 	struct extent_buffer *mid;
1216 	struct extent_buffer *left = NULL;
1217 	struct extent_buffer *parent = NULL;
1218 	int ret = 0;
1219 	int wret;
1220 	int pslot;
1221 	int orig_slot = path->slots[level];
1222 
1223 	if (level == 0)
1224 		return 1;
1225 
1226 	mid = path->nodes[level];
1227 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1228 
1229 	if (level < BTRFS_MAX_LEVEL - 1)
1230 		parent = path->nodes[level + 1];
1231 	pslot = path->slots[level + 1];
1232 
1233 	if (!parent)
1234 		return 1;
1235 
1236 	left = read_node_slot(root, parent, pslot - 1);
1237 
1238 	/* first, try to make some room in the middle buffer */
1239 	if (left) {
1240 		u32 left_nr;
1241 
1242 		btrfs_tree_lock(left);
1243 		btrfs_set_lock_blocking(left);
1244 
1245 		left_nr = btrfs_header_nritems(left);
1246 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1247 			wret = 1;
1248 		} else {
1249 			ret = btrfs_cow_block(trans, root, left, parent,
1250 					      pslot - 1, &left);
1251 			if (ret)
1252 				wret = 1;
1253 			else {
1254 				wret = push_node_left(trans, root,
1255 						      left, mid, 0);
1256 			}
1257 		}
1258 		if (wret < 0)
1259 			ret = wret;
1260 		if (wret == 0) {
1261 			struct btrfs_disk_key disk_key;
1262 			orig_slot += left_nr;
1263 			btrfs_node_key(mid, &disk_key, 0);
1264 			btrfs_set_node_key(parent, &disk_key, pslot);
1265 			btrfs_mark_buffer_dirty(parent);
1266 			if (btrfs_header_nritems(left) > orig_slot) {
1267 				path->nodes[level] = left;
1268 				path->slots[level + 1] -= 1;
1269 				path->slots[level] = orig_slot;
1270 				btrfs_tree_unlock(mid);
1271 				free_extent_buffer(mid);
1272 			} else {
1273 				orig_slot -=
1274 					btrfs_header_nritems(left);
1275 				path->slots[level] = orig_slot;
1276 				btrfs_tree_unlock(left);
1277 				free_extent_buffer(left);
1278 			}
1279 			return 0;
1280 		}
1281 		btrfs_tree_unlock(left);
1282 		free_extent_buffer(left);
1283 	}
1284 	right = read_node_slot(root, parent, pslot + 1);
1285 
1286 	/*
1287 	 * then try to empty the right most buffer into the middle
1288 	 */
1289 	if (right) {
1290 		u32 right_nr;
1291 
1292 		btrfs_tree_lock(right);
1293 		btrfs_set_lock_blocking(right);
1294 
1295 		right_nr = btrfs_header_nritems(right);
1296 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1297 			wret = 1;
1298 		} else {
1299 			ret = btrfs_cow_block(trans, root, right,
1300 					      parent, pslot + 1,
1301 					      &right);
1302 			if (ret)
1303 				wret = 1;
1304 			else {
1305 				wret = balance_node_right(trans, root,
1306 							  right, mid);
1307 			}
1308 		}
1309 		if (wret < 0)
1310 			ret = wret;
1311 		if (wret == 0) {
1312 			struct btrfs_disk_key disk_key;
1313 
1314 			btrfs_node_key(right, &disk_key, 0);
1315 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
1316 			btrfs_mark_buffer_dirty(parent);
1317 
1318 			if (btrfs_header_nritems(mid) <= orig_slot) {
1319 				path->nodes[level] = right;
1320 				path->slots[level + 1] += 1;
1321 				path->slots[level] = orig_slot -
1322 					btrfs_header_nritems(mid);
1323 				btrfs_tree_unlock(mid);
1324 				free_extent_buffer(mid);
1325 			} else {
1326 				btrfs_tree_unlock(right);
1327 				free_extent_buffer(right);
1328 			}
1329 			return 0;
1330 		}
1331 		btrfs_tree_unlock(right);
1332 		free_extent_buffer(right);
1333 	}
1334 	return 1;
1335 }
1336 
1337 /*
1338  * readahead one full node of leaves, finding things that are close
1339  * to the block in 'slot', and triggering ra on them.
1340  */
1341 static void reada_for_search(struct btrfs_root *root,
1342 			     struct btrfs_path *path,
1343 			     int level, int slot, u64 objectid)
1344 {
1345 	struct extent_buffer *node;
1346 	struct btrfs_disk_key disk_key;
1347 	u32 nritems;
1348 	u64 search;
1349 	u64 target;
1350 	u64 nread = 0;
1351 	int direction = path->reada;
1352 	struct extent_buffer *eb;
1353 	u32 nr;
1354 	u32 blocksize;
1355 	u32 nscan = 0;
1356 
1357 	if (level != 1)
1358 		return;
1359 
1360 	if (!path->nodes[level])
1361 		return;
1362 
1363 	node = path->nodes[level];
1364 
1365 	search = btrfs_node_blockptr(node, slot);
1366 	blocksize = btrfs_level_size(root, level - 1);
1367 	eb = btrfs_find_tree_block(root, search, blocksize);
1368 	if (eb) {
1369 		free_extent_buffer(eb);
1370 		return;
1371 	}
1372 
1373 	target = search;
1374 
1375 	nritems = btrfs_header_nritems(node);
1376 	nr = slot;
1377 	while (1) {
1378 		if (direction < 0) {
1379 			if (nr == 0)
1380 				break;
1381 			nr--;
1382 		} else if (direction > 0) {
1383 			nr++;
1384 			if (nr >= nritems)
1385 				break;
1386 		}
1387 		if (path->reada < 0 && objectid) {
1388 			btrfs_node_key(node, &disk_key, nr);
1389 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
1390 				break;
1391 		}
1392 		search = btrfs_node_blockptr(node, nr);
1393 		if ((search <= target && target - search <= 65536) ||
1394 		    (search > target && search - target <= 65536)) {
1395 			readahead_tree_block(root, search, blocksize,
1396 				     btrfs_node_ptr_generation(node, nr));
1397 			nread += blocksize;
1398 		}
1399 		nscan++;
1400 		if ((nread > 65536 || nscan > 32))
1401 			break;
1402 	}
1403 }
1404 
1405 /*
1406  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1407  * cache
1408  */
1409 static noinline int reada_for_balance(struct btrfs_root *root,
1410 				      struct btrfs_path *path, int level)
1411 {
1412 	int slot;
1413 	int nritems;
1414 	struct extent_buffer *parent;
1415 	struct extent_buffer *eb;
1416 	u64 gen;
1417 	u64 block1 = 0;
1418 	u64 block2 = 0;
1419 	int ret = 0;
1420 	int blocksize;
1421 
1422 	parent = path->nodes[level + 1];
1423 	if (!parent)
1424 		return 0;
1425 
1426 	nritems = btrfs_header_nritems(parent);
1427 	slot = path->slots[level + 1];
1428 	blocksize = btrfs_level_size(root, level);
1429 
1430 	if (slot > 0) {
1431 		block1 = btrfs_node_blockptr(parent, slot - 1);
1432 		gen = btrfs_node_ptr_generation(parent, slot - 1);
1433 		eb = btrfs_find_tree_block(root, block1, blocksize);
1434 		if (eb && btrfs_buffer_uptodate(eb, gen))
1435 			block1 = 0;
1436 		free_extent_buffer(eb);
1437 	}
1438 	if (slot + 1 < nritems) {
1439 		block2 = btrfs_node_blockptr(parent, slot + 1);
1440 		gen = btrfs_node_ptr_generation(parent, slot + 1);
1441 		eb = btrfs_find_tree_block(root, block2, blocksize);
1442 		if (eb && btrfs_buffer_uptodate(eb, gen))
1443 			block2 = 0;
1444 		free_extent_buffer(eb);
1445 	}
1446 	if (block1 || block2) {
1447 		ret = -EAGAIN;
1448 
1449 		/* release the whole path */
1450 		btrfs_release_path(root, path);
1451 
1452 		/* read the blocks */
1453 		if (block1)
1454 			readahead_tree_block(root, block1, blocksize, 0);
1455 		if (block2)
1456 			readahead_tree_block(root, block2, blocksize, 0);
1457 
1458 		if (block1) {
1459 			eb = read_tree_block(root, block1, blocksize, 0);
1460 			free_extent_buffer(eb);
1461 		}
1462 		if (block2) {
1463 			eb = read_tree_block(root, block2, blocksize, 0);
1464 			free_extent_buffer(eb);
1465 		}
1466 	}
1467 	return ret;
1468 }
1469 
1470 
1471 /*
1472  * when we walk down the tree, it is usually safe to unlock the higher layers
1473  * in the tree.  The exceptions are when our path goes through slot 0, because
1474  * operations on the tree might require changing key pointers higher up in the
1475  * tree.
1476  *
1477  * callers might also have set path->keep_locks, which tells this code to keep
1478  * the lock if the path points to the last slot in the block.  This is part of
1479  * walking through the tree, and selecting the next slot in the higher block.
1480  *
1481  * lowest_unlock sets the lowest level in the tree we're allowed to unlock, so
1482  * if lowest_unlock is 1, level 0 won't be unlocked
1483  */
1484 static noinline void unlock_up(struct btrfs_path *path, int level,
1485 			       int lowest_unlock)
1486 {
1487 	int i;
1488 	int skip_level = level;
1489 	int no_skips = 0;
1490 	struct extent_buffer *t;
1491 
1492 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1493 		if (!path->nodes[i])
1494 			break;
1495 		if (!path->locks[i])
1496 			break;
1497 		if (!no_skips && path->slots[i] == 0) {
1498 			skip_level = i + 1;
1499 			continue;
1500 		}
1501 		if (!no_skips && path->keep_locks) {
1502 			u32 nritems;
1503 			t = path->nodes[i];
1504 			nritems = btrfs_header_nritems(t);
1505 			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1506 				skip_level = i + 1;
1507 				continue;
1508 			}
1509 		}
1510 		if (skip_level < i && i >= lowest_unlock)
1511 			no_skips = 1;
1512 
1513 		t = path->nodes[i];
1514 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1515 			btrfs_tree_unlock(t);
1516 			path->locks[i] = 0;
1517 		}
1518 	}
1519 }
1520 
1521 /*
1522  * This releases any locks held in the path starting at level and
1523  * going all the way up to the root.
1524  *
1525  * btrfs_search_slot will keep the lock held on higher nodes in a few
1526  * corner cases, such as COW of the block at slot zero in the node.  This
1527  * ignores those rules, and it should only be called when there are no
1528  * more updates to be done higher up in the tree.
1529  */
1530 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1531 {
1532 	int i;
1533 
1534 	if (path->keep_locks)
1535 		return;
1536 
1537 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1538 		if (!path->nodes[i])
1539 			continue;
1540 		if (!path->locks[i])
1541 			continue;
1542 		btrfs_tree_unlock(path->nodes[i]);
1543 		path->locks[i] = 0;
1544 	}
1545 }
1546 
1547 /*
1548  * helper function for btrfs_search_slot.  The goal is to find a block
1549  * in cache without setting the path to blocking.  If we find the block
1550  * we return zero and the path is unchanged.
1551  *
1552  * If we can't find the block, we set the path blocking and do some
1553  * reada.  -EAGAIN is returned and the search must be repeated.
1554  */
1555 static int
1556 read_block_for_search(struct btrfs_trans_handle *trans,
1557 		       struct btrfs_root *root, struct btrfs_path *p,
1558 		       struct extent_buffer **eb_ret, int level, int slot,
1559 		       struct btrfs_key *key)
1560 {
1561 	u64 blocknr;
1562 	u64 gen;
1563 	u32 blocksize;
1564 	struct extent_buffer *b = *eb_ret;
1565 	struct extent_buffer *tmp;
1566 	int ret;
1567 
1568 	blocknr = btrfs_node_blockptr(b, slot);
1569 	gen = btrfs_node_ptr_generation(b, slot);
1570 	blocksize = btrfs_level_size(root, level - 1);
1571 
1572 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1573 	if (tmp) {
1574 		if (btrfs_buffer_uptodate(tmp, 0)) {
1575 			if (btrfs_buffer_uptodate(tmp, gen)) {
1576 				/*
1577 				 * we found an up to date block without
1578 				 * sleeping, return
1579 				 * right away
1580 				 */
1581 				*eb_ret = tmp;
1582 				return 0;
1583 			}
1584 			/* the pages were up to date, but we failed
1585 			 * the generation number check.  Do a full
1586 			 * read for the generation number that is correct.
1587 			 * We must do this without dropping locks so
1588 			 * we can trust our generation number
1589 			 */
1590 			free_extent_buffer(tmp);
1591 			tmp = read_tree_block(root, blocknr, blocksize, gen);
1592 			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1593 				*eb_ret = tmp;
1594 				return 0;
1595 			}
1596 			free_extent_buffer(tmp);
1597 			btrfs_release_path(NULL, p);
1598 			return -EIO;
1599 		}
1600 	}
1601 
1602 	/*
1603 	 * reduce lock contention at high levels
1604 	 * of the btree by dropping locks before
1605 	 * we read.  Don't release the lock on the current
1606 	 * level because we need to walk this node to figure
1607 	 * out which blocks to read.
1608 	 */
1609 	btrfs_unlock_up_safe(p, level + 1);
1610 	btrfs_set_path_blocking(p);
1611 
1612 	free_extent_buffer(tmp);
1613 	if (p->reada)
1614 		reada_for_search(root, p, level, slot, key->objectid);
1615 
1616 	btrfs_release_path(NULL, p);
1617 
1618 	ret = -EAGAIN;
1619 	tmp = read_tree_block(root, blocknr, blocksize, 0);
1620 	if (tmp) {
1621 		/*
1622 		 * If the read above didn't mark this buffer up to date,
1623 		 * it will never end up being up to date.  Set ret to EIO now
1624 		 * and give up so that our caller doesn't loop forever
1625 		 * on our EAGAINs.
1626 		 */
1627 		if (!btrfs_buffer_uptodate(tmp, 0))
1628 			ret = -EIO;
1629 		free_extent_buffer(tmp);
1630 	}
1631 	return ret;
1632 }
1633 
1634 /*
1635  * helper function for btrfs_search_slot.  This does all of the checks
1636  * for node-level blocks and does any balancing required based on
1637  * the ins_len.
1638  *
1639  * If no extra work was required, zero is returned.  If we had to
1640  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1641  * start over
1642  */
1643 static int
1644 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1645 		       struct btrfs_root *root, struct btrfs_path *p,
1646 		       struct extent_buffer *b, int level, int ins_len)
1647 {
1648 	int ret;
1649 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1650 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1651 		int sret;
1652 
1653 		sret = reada_for_balance(root, p, level);
1654 		if (sret)
1655 			goto again;
1656 
1657 		btrfs_set_path_blocking(p);
1658 		sret = split_node(trans, root, p, level);
1659 		btrfs_clear_path_blocking(p, NULL);
1660 
1661 		BUG_ON(sret > 0);
1662 		if (sret) {
1663 			ret = sret;
1664 			goto done;
1665 		}
1666 		b = p->nodes[level];
1667 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
1668 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1669 		int sret;
1670 
1671 		sret = reada_for_balance(root, p, level);
1672 		if (sret)
1673 			goto again;
1674 
1675 		btrfs_set_path_blocking(p);
1676 		sret = balance_level(trans, root, p, level);
1677 		btrfs_clear_path_blocking(p, NULL);
1678 
1679 		if (sret) {
1680 			ret = sret;
1681 			goto done;
1682 		}
1683 		b = p->nodes[level];
1684 		if (!b) {
1685 			btrfs_release_path(NULL, p);
1686 			goto again;
1687 		}
1688 		BUG_ON(btrfs_header_nritems(b) == 1);
1689 	}
1690 	return 0;
1691 
1692 again:
1693 	ret = -EAGAIN;
1694 done:
1695 	return ret;
1696 }
1697 
1698 /*
1699  * look for key in the tree.  path is filled in with nodes along the way.
1700  * if key is found, we return zero and you can find the item in the leaf
1701  * level of the path (level 0)
1702  *
1703  * If the key isn't found, the path points to the slot where it should
1704  * be inserted, and 1 is returned.  If there are other errors during the
1705  * search a negative error number is returned.
1706  *
1707  * if ins_len > 0, nodes and leaves will be split as we walk down the
1708  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
1709  * possible)
1710  */
1711 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1712 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
1713 		      ins_len, int cow)
1714 {
1715 	struct extent_buffer *b;
1716 	int slot;
1717 	int ret;
1718 	int err;
1719 	int level;
1720 	int lowest_unlock = 1;
1721 	u8 lowest_level = 0;
1722 
1723 	lowest_level = p->lowest_level;
1724 	WARN_ON(lowest_level && ins_len > 0);
1725 	WARN_ON(p->nodes[0] != NULL);
1726 
1727 	if (ins_len < 0)
1728 		lowest_unlock = 2;
1729 
1730 again:
1731 	if (p->search_commit_root) {
1732 		b = root->commit_root;
1733 		extent_buffer_get(b);
1734 		if (!p->skip_locking)
1735 			btrfs_tree_lock(b);
1736 	} else {
1737 		if (p->skip_locking)
1738 			b = btrfs_root_node(root);
1739 		else
1740 			b = btrfs_lock_root_node(root);
1741 	}
1742 
1743 	while (b) {
1744 		level = btrfs_header_level(b);
1745 
1746 		/*
1747 		 * setup the path here so we can release it under lock
1748 		 * contention with the cow code
1749 		 */
1750 		p->nodes[level] = b;
1751 		if (!p->skip_locking)
1752 			p->locks[level] = 1;
1753 
1754 		if (cow) {
1755 			/*
1756 			 * if we don't really need to cow this block
1757 			 * then we don't want to set the path blocking,
1758 			 * so we test it here
1759 			 */
1760 			if (!should_cow_block(trans, root, b))
1761 				goto cow_done;
1762 
1763 			btrfs_set_path_blocking(p);
1764 
1765 			err = btrfs_cow_block(trans, root, b,
1766 					      p->nodes[level + 1],
1767 					      p->slots[level + 1], &b);
1768 			if (err) {
1769 				ret = err;
1770 				goto done;
1771 			}
1772 		}
1773 cow_done:
1774 		BUG_ON(!cow && ins_len);
1775 		if (level != btrfs_header_level(b))
1776 			WARN_ON(1);
1777 		level = btrfs_header_level(b);
1778 
1779 		p->nodes[level] = b;
1780 		if (!p->skip_locking)
1781 			p->locks[level] = 1;
1782 
1783 		btrfs_clear_path_blocking(p, NULL);
1784 
1785 		/*
1786 		 * we have a lock on b and as long as we aren't changing
1787 		 * the tree, there is no way for the items in b to change.
1788 		 * It is safe to drop the lock on our parent before we
1789 		 * go through the expensive btree search on b.
1790 		 *
1791 		 * If cow is true, then we might be changing slot zero,
1792 		 * which may require changing the parent.  So, we can't
1793 		 * drop the lock until after we know which slot we're
1794 		 * operating on.
1795 		 */
1796 		if (!cow)
1797 			btrfs_unlock_up_safe(p, level + 1);
1798 
1799 		ret = check_block(root, p, level);
1800 		if (ret) {
1801 			ret = -1;
1802 			goto done;
1803 		}
1804 
1805 		ret = bin_search(b, key, level, &slot);
1806 
1807 		if (level != 0) {
1808 			int dec = 0;
1809 			if (ret && slot > 0) {
1810 				dec = 1;
1811 				slot -= 1;
1812 			}
1813 			p->slots[level] = slot;
1814 			err = setup_nodes_for_search(trans, root, p, b, level,
1815 						     ins_len);
1816 			if (err == -EAGAIN)
1817 				goto again;
1818 			if (err) {
1819 				ret = err;
1820 				goto done;
1821 			}
1822 			b = p->nodes[level];
1823 			slot = p->slots[level];
1824 
1825 			unlock_up(p, level, lowest_unlock);
1826 
1827 			if (level == lowest_level) {
1828 				if (dec)
1829 					p->slots[level]++;
1830 				goto done;
1831 			}
1832 
1833 			err = read_block_for_search(trans, root, p,
1834 						    &b, level, slot, key);
1835 			if (err == -EAGAIN)
1836 				goto again;
1837 			if (err) {
1838 				ret = err;
1839 				goto done;
1840 			}
1841 
1842 			if (!p->skip_locking) {
1843 				btrfs_clear_path_blocking(p, NULL);
1844 				err = btrfs_try_spin_lock(b);
1845 
1846 				if (!err) {
1847 					btrfs_set_path_blocking(p);
1848 					btrfs_tree_lock(b);
1849 					btrfs_clear_path_blocking(p, b);
1850 				}
1851 			}
1852 		} else {
1853 			p->slots[level] = slot;
1854 			if (ins_len > 0 &&
1855 			    btrfs_leaf_free_space(root, b) < ins_len) {
1856 				btrfs_set_path_blocking(p);
1857 				err = split_leaf(trans, root, key,
1858 						 p, ins_len, ret == 0);
1859 				btrfs_clear_path_blocking(p, NULL);
1860 
1861 				BUG_ON(err > 0);
1862 				if (err) {
1863 					ret = err;
1864 					goto done;
1865 				}
1866 			}
1867 			if (!p->search_for_split)
1868 				unlock_up(p, level, lowest_unlock);
1869 			goto done;
1870 		}
1871 	}
1872 	ret = 1;
1873 done:
1874 	/*
1875 	 * we don't really know what they plan on doing with the path
1876 	 * from here on, so for now just mark it as blocking
1877 	 */
1878 	if (!p->leave_spinning)
1879 		btrfs_set_path_blocking(p);
1880 	if (ret < 0)
1881 		btrfs_release_path(root, p);
1882 	return ret;
1883 }
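
/*
 * A rough usage sketch for the search above (illustrative only; the
 * caller and field values are made up): a read-only lookup passes a NULL
 * trans handle and cow == 0, so none of the cow machinery runs:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key;
 *	struct btrfs_inode_item *item;
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 *				      struct btrfs_inode_item);
 *	btrfs_free_path(path);
 *
 * ret == 1 means the key wasn't found and the path points at the slot
 * where it would be inserted; ins_len > 0 makes the search reserve that
 * much leaf room for an insert, and ins_len < 0 prepares the nodes for
 * a deletion.
 */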
1884 
1885 /*
1886  * adjust the pointers going up the tree, starting at level
1887  * making sure the right key of each node points to 'key'.
1888  * This is used after shifting pointers to the left, so it stops
1889  * fixing up pointers when a given leaf/node is not in slot 0 of the
1890  * higher levels
1891  *
1892  * If this fails to write a tree block, it returns -1, but continues
1893  * fixing up the blocks in ram so the tree is consistent.
1894  */
1895 static int fixup_low_keys(struct btrfs_trans_handle *trans,
1896 			  struct btrfs_root *root, struct btrfs_path *path,
1897 			  struct btrfs_disk_key *key, int level)
1898 {
1899 	int i;
1900 	int ret = 0;
1901 	struct extent_buffer *t;
1902 
1903 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1904 		int tslot = path->slots[i];
1905 		if (!path->nodes[i])
1906 			break;
1907 		t = path->nodes[i];
1908 		btrfs_set_node_key(t, key, tslot);
1909 		btrfs_mark_buffer_dirty(path->nodes[i]);
1910 		if (tslot != 0)
1911 			break;
1912 	}
1913 	return ret;
1914 }
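
/*
 * Illustration of the walk above: a change to the key in slot 0 of a
 * leaf is also recorded in every ancestor that points to the leaf
 * through slot 0, so those copies must be rewritten too:
 *
 *	node: [10 50 90]   <- slot 0 holds a copy of the leaf's first key
 *	leaf: [10 11 12]   <- key 10 just changed
 *
 * The loop stops at the first level where the path is not in slot 0,
 * since the first key of that block, and hence everything above it,
 * is unchanged.
 */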
1915 
1916 /*
1917  * update item key.
1918  *
1919  * This function isn't completely safe. It's the caller's responsibility
1920  * to ensure that the new key won't break the key order
1921  */
1922 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1923 			    struct btrfs_root *root, struct btrfs_path *path,
1924 			    struct btrfs_key *new_key)
1925 {
1926 	struct btrfs_disk_key disk_key;
1927 	struct extent_buffer *eb;
1928 	int slot;
1929 
1930 	eb = path->nodes[0];
1931 	slot = path->slots[0];
1932 	if (slot > 0) {
1933 		btrfs_item_key(eb, &disk_key, slot - 1);
1934 		if (comp_keys(&disk_key, new_key) >= 0)
1935 			return -1;
1936 	}
1937 	if (slot < btrfs_header_nritems(eb) - 1) {
1938 		btrfs_item_key(eb, &disk_key, slot + 1);
1939 		if (comp_keys(&disk_key, new_key) <= 0)
1940 			return -1;
1941 	}
1942 
1943 	btrfs_cpu_key_to_disk(&disk_key, new_key);
1944 	btrfs_set_item_key(eb, &disk_key, slot);
1945 	btrfs_mark_buffer_dirty(eb);
1946 	if (slot == 0)
1947 		fixup_low_keys(trans, root, path, &disk_key, 1);
1948 	return 0;
1949 }
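
/*
 * Example of the ordering check above: with leaf keys
 *
 *	slot 0: (5, x, 0)   slot 1: (9, x, 0)   slot 2: (12, x, 0)
 *
 * the key in slot 1 may only be replaced by something strictly between
 * its neighbours, e.g. (10, x, 0); a new key <= (5, x, 0) or
 * >= (12, x, 0) returns -1 and the item is left alone.
 */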
1950 
1951 /*
1952  * try to push data from one node into the next node left in the
1953  * tree.
1954  *
1955  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1956  * error, and > 0 if there was no room in the left hand block.
1957  */
1958 static int push_node_left(struct btrfs_trans_handle *trans,
1959 			  struct btrfs_root *root, struct extent_buffer *dst,
1960 			  struct extent_buffer *src, int empty)
1961 {
1962 	int push_items = 0;
1963 	int src_nritems;
1964 	int dst_nritems;
1965 	int ret = 0;
1966 
1967 	src_nritems = btrfs_header_nritems(src);
1968 	dst_nritems = btrfs_header_nritems(dst);
1969 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1970 	WARN_ON(btrfs_header_generation(src) != trans->transid);
1971 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1972 
1973 	if (!empty && src_nritems <= 8)
1974 		return 1;
1975 
1976 	if (push_items <= 0)
1977 		return 1;
1978 
1979 	if (empty) {
1980 		push_items = min(src_nritems, push_items);
1981 		if (push_items < src_nritems) {
1982 			/* leave at least 8 pointers in the node if
1983 			 * we aren't going to empty it
1984 			 */
1985 			if (src_nritems - push_items < 8) {
1986 				if (push_items <= 8)
1987 					return 1;
1988 				push_items -= 8;
1989 			}
1990 		}
1991 	} else
1992 		push_items = min(src_nritems - 8, push_items);
1993 
1994 	copy_extent_buffer(dst, src,
1995 			   btrfs_node_key_ptr_offset(dst_nritems),
1996 			   btrfs_node_key_ptr_offset(0),
1997 			   push_items * sizeof(struct btrfs_key_ptr));
1998 
1999 	if (push_items < src_nritems) {
2000 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2001 				      btrfs_node_key_ptr_offset(push_items),
2002 				      (src_nritems - push_items) *
2003 				      sizeof(struct btrfs_key_ptr));
2004 	}
2005 	btrfs_set_header_nritems(src, src_nritems - push_items);
2006 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2007 	btrfs_mark_buffer_dirty(src);
2008 	btrfs_mark_buffer_dirty(dst);
2009 
2010 	return ret;
2011 }
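
/*
 * Shape of the node push above: the first push_items key pointers of src
 * are appended to dst, and whatever is left in src slides down to slot 0:
 *
 *	dst: [d0 .. dN s0 .. s(p-1)]    src: [s(p) .. s(M)]
 *
 * Only the header counts and key pointers move; the child blocks they
 * refer to are untouched.
 */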
2012 
2013 /*
2014  * try to push data from one node into the next node right in the
2015  * tree.
2016  *
2017  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2018  * error, and > 0 if there was no room in the right hand block.
2019  *
2020  * this will only push over up to half the contents of the left node
2021  */
2022 static int balance_node_right(struct btrfs_trans_handle *trans,
2023 			      struct btrfs_root *root,
2024 			      struct extent_buffer *dst,
2025 			      struct extent_buffer *src)
2026 {
2027 	int push_items = 0;
2028 	int max_push;
2029 	int src_nritems;
2030 	int dst_nritems;
2031 	int ret = 0;
2032 
2033 	WARN_ON(btrfs_header_generation(src) != trans->transid);
2034 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
2035 
2036 	src_nritems = btrfs_header_nritems(src);
2037 	dst_nritems = btrfs_header_nritems(dst);
2038 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2039 	if (push_items <= 0)
2040 		return 1;
2041 
2042 	if (src_nritems < 4)
2043 		return 1;
2044 
2045 	max_push = src_nritems / 2 + 1;
2046 	/* don't try to empty the node */
2047 	if (max_push >= src_nritems)
2048 		return 1;
2049 
2050 	if (max_push < push_items)
2051 		push_items = max_push;
2052 
2053 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2054 				      btrfs_node_key_ptr_offset(0),
2055 				      (dst_nritems) *
2056 				      sizeof(struct btrfs_key_ptr));
2057 
2058 	copy_extent_buffer(dst, src,
2059 			   btrfs_node_key_ptr_offset(0),
2060 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
2061 			   push_items * sizeof(struct btrfs_key_ptr));
2062 
2063 	btrfs_set_header_nritems(src, src_nritems - push_items);
2064 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
2065 
2066 	btrfs_mark_buffer_dirty(src);
2067 	btrfs_mark_buffer_dirty(dst);
2068 
2069 	return ret;
2070 }
2071 
2072 /*
2073  * helper function to insert a new root level in the tree.
2074  * A new node is allocated, and a single item is inserted to
2075  * point to the existing root
2076  *
2077  * returns zero on success or < 0 on failure.
2078  */
2079 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2080 			   struct btrfs_root *root,
2081 			   struct btrfs_path *path, int level)
2082 {
2083 	u64 lower_gen;
2084 	struct extent_buffer *lower;
2085 	struct extent_buffer *c;
2086 	struct extent_buffer *old;
2087 	struct btrfs_disk_key lower_key;
2088 
2089 	BUG_ON(path->nodes[level]);
2090 	BUG_ON(path->nodes[level-1] != root->node);
2091 
2092 	lower = path->nodes[level-1];
2093 	if (level == 1)
2094 		btrfs_item_key(lower, &lower_key, 0);
2095 	else
2096 		btrfs_node_key(lower, &lower_key, 0);
2097 
2098 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2099 				   root->root_key.objectid, &lower_key,
2100 				   level, root->node->start, 0);
2101 	if (IS_ERR(c))
2102 		return PTR_ERR(c);
2103 
2104 	root_add_used(root, root->nodesize);
2105 
2106 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2107 	btrfs_set_header_nritems(c, 1);
2108 	btrfs_set_header_level(c, level);
2109 	btrfs_set_header_bytenr(c, c->start);
2110 	btrfs_set_header_generation(c, trans->transid);
2111 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
2112 	btrfs_set_header_owner(c, root->root_key.objectid);
2113 
2114 	write_extent_buffer(c, root->fs_info->fsid,
2115 			    (unsigned long)btrfs_header_fsid(c),
2116 			    BTRFS_FSID_SIZE);
2117 
2118 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
2119 			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
2120 			    BTRFS_UUID_SIZE);
2121 
2122 	btrfs_set_node_key(c, &lower_key, 0);
2123 	btrfs_set_node_blockptr(c, 0, lower->start);
2124 	lower_gen = btrfs_header_generation(lower);
2125 	WARN_ON(lower_gen != trans->transid);
2126 
2127 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
2128 
2129 	btrfs_mark_buffer_dirty(c);
2130 
2131 	spin_lock(&root->node_lock);
2132 	old = root->node;
2133 	root->node = c;
2134 	spin_unlock(&root->node_lock);
2135 
2136 	/* the super has an extra ref to root->node */
2137 	free_extent_buffer(old);
2138 
2139 	add_root_to_dirty_list(root);
2140 	extent_buffer_get(c);
2141 	path->nodes[level] = c;
2142 	path->locks[level] = 1;
2143 	path->slots[level] = 0;
2144 	return 0;
2145 }
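
/*
 * Sketch of the root growth above:
 *
 *	before:  root->node -> [ old root ]
 *	after:   root->node -> [ c ] -> [ old root ]
 *
 * The new root 'c' starts with exactly one pointer (nritems == 1) whose
 * key is the first key of the old root, and the path gains one level.
 */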
2146 
2147 /*
2148  * worker function to insert a single pointer in a node.
2149  * the node should have enough room for the pointer already
2150  *
2151  * slot and level indicate where you want the key to go, and
2152  * bytenr is the block the key points to.
2153  *
2154  * returns zero on success and < 0 on any error
2155  */
2156 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2157 		      *root, struct btrfs_path *path, struct btrfs_disk_key
2158 		      *key, u64 bytenr, int slot, int level)
2159 {
2160 	struct extent_buffer *lower;
2161 	int nritems;
2162 
2163 	BUG_ON(!path->nodes[level]);
2164 	btrfs_assert_tree_locked(path->nodes[level]);
2165 	lower = path->nodes[level];
2166 	nritems = btrfs_header_nritems(lower);
2167 	BUG_ON(slot > nritems);
2168 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2169 		BUG();
2170 	if (slot != nritems) {
2171 		memmove_extent_buffer(lower,
2172 			      btrfs_node_key_ptr_offset(slot + 1),
2173 			      btrfs_node_key_ptr_offset(slot),
2174 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
2175 	}
2176 	btrfs_set_node_key(lower, key, slot);
2177 	btrfs_set_node_blockptr(lower, slot, bytenr);
2178 	WARN_ON(trans->transid == 0);
2179 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2180 	btrfs_set_header_nritems(lower, nritems + 1);
2181 	btrfs_mark_buffer_dirty(lower);
2182 	return 0;
2183 }
2184 
2185 /*
2186  * split the node at the specified level in path in two.
2187  * The path is corrected to point to the appropriate node after the split
2188  *
2189  * Before splitting, this tries to make some room in the node by pushing
2190  * left and right; if either one works, it returns right away.
2191  *
2192  * returns 0 on success and < 0 on failure
2193  */
2194 static noinline int split_node(struct btrfs_trans_handle *trans,
2195 			       struct btrfs_root *root,
2196 			       struct btrfs_path *path, int level)
2197 {
2198 	struct extent_buffer *c;
2199 	struct extent_buffer *split;
2200 	struct btrfs_disk_key disk_key;
2201 	int mid;
2202 	int ret;
2203 	int wret;
2204 	u32 c_nritems;
2205 
2206 	c = path->nodes[level];
2207 	WARN_ON(btrfs_header_generation(c) != trans->transid);
2208 	if (c == root->node) {
2209 		/* trying to split the root, let's make a new one */
2210 		ret = insert_new_root(trans, root, path, level + 1);
2211 		if (ret)
2212 			return ret;
2213 	} else {
2214 		ret = push_nodes_for_insert(trans, root, path, level);
2215 		c = path->nodes[level];
2216 		if (!ret && btrfs_header_nritems(c) <
2217 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2218 			return 0;
2219 		if (ret < 0)
2220 			return ret;
2221 	}
2222 
2223 	c_nritems = btrfs_header_nritems(c);
2224 	mid = (c_nritems + 1) / 2;
2225 	btrfs_node_key(c, &disk_key, mid);
2226 
2227 	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2228 					root->root_key.objectid,
2229 					&disk_key, level, c->start, 0);
2230 	if (IS_ERR(split))
2231 		return PTR_ERR(split);
2232 
2233 	root_add_used(root, root->nodesize);
2234 
2235 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2236 	btrfs_set_header_level(split, btrfs_header_level(c));
2237 	btrfs_set_header_bytenr(split, split->start);
2238 	btrfs_set_header_generation(split, trans->transid);
2239 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
2240 	btrfs_set_header_owner(split, root->root_key.objectid);
2241 	write_extent_buffer(split, root->fs_info->fsid,
2242 			    (unsigned long)btrfs_header_fsid(split),
2243 			    BTRFS_FSID_SIZE);
2244 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2245 			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
2246 			    BTRFS_UUID_SIZE);
2247 
2248 
2249 	copy_extent_buffer(split, c,
2250 			   btrfs_node_key_ptr_offset(0),
2251 			   btrfs_node_key_ptr_offset(mid),
2252 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2253 	btrfs_set_header_nritems(split, c_nritems - mid);
2254 	btrfs_set_header_nritems(c, mid);
2255 	ret = 0;
2256 
2257 	btrfs_mark_buffer_dirty(c);
2258 	btrfs_mark_buffer_dirty(split);
2259 
2260 	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2261 			  path->slots[level + 1] + 1,
2262 			  level + 1);
2263 	if (wret)
2264 		ret = wret;
2265 
2266 	if (path->slots[level] >= mid) {
2267 		path->slots[level] -= mid;
2268 		btrfs_tree_unlock(c);
2269 		free_extent_buffer(c);
2270 		path->nodes[level] = split;
2271 		path->slots[level + 1] += 1;
2272 	} else {
2273 		btrfs_tree_unlock(split);
2274 		free_extent_buffer(split);
2275 	}
2276 	return ret;
2277 }
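
/*
 * Sketch of the node split above: pointers [mid, nritems) move into the
 * new 'split' block and the parent gains a pointer to it just after 'c':
 *
 *	parent: [.. c split ..]
 *	c:      ptrs [0, mid)        split: ptrs [mid, nritems)
 *
 * The path is then fixed up to follow whichever half contains the slot
 * the caller was positioned on.
 */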
2278 
2279 /*
2280  * how many bytes are required to store the items in a leaf.  start
2281  * and nr indicate which items in the leaf to check.  This totals up the
2282  * space used both by the item structs and the item data
2283  */
2284 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2285 {
2286 	int data_len;
2287 	int nritems = btrfs_header_nritems(l);
2288 	int end = min(nritems, start + nr) - 1;
2289 
2290 	if (!nr)
2291 		return 0;
2292 	data_len = btrfs_item_end_nr(l, start);
2293 	data_len = data_len - btrfs_item_offset_nr(l, end);
2294 	data_len += sizeof(struct btrfs_item) * nr;
2295 	WARN_ON(data_len < 0);
2296 	return data_len;
2297 }
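
/*
 * Worked example for the math above, with made-up numbers: items with
 * (offset, size) of (3900, 100), (3800, 100) and (3700, 100), start == 0
 * and nr == 3 give
 *
 *	data_len = item_end(0) - item_offset(2) + 3 * sizeof(item header)
 *	         = 4000 - 3700 + 3 * sizeof(struct btrfs_item)
 *
 * which works because the data for consecutive items is laid out
 * contiguously, growing down from the end of the block.
 */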
2298 
2299 /*
2300  * The space between the end of the leaf items and
2301  * the start of the leaf data.  IOW, how much room
2302  * the leaf has left for both items and data
2303  */
2304 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2305 				   struct extent_buffer *leaf)
2306 {
2307 	int nritems = btrfs_header_nritems(leaf);
2308 	int ret;
2309 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2310 	if (ret < 0) {
2311 		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2312 		       "used %d nritems %d\n",
2313 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2314 		       leaf_space_used(leaf, 0, nritems), nritems);
2315 	}
2316 	return ret;
2317 }
2318 
2319 /*
2320  * min_slot controls the lowest index we're willing to push to the
2321  * right.  We'll push up to and including min_slot, but no lower
2322  */
2323 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2324 				      struct btrfs_root *root,
2325 				      struct btrfs_path *path,
2326 				      int data_size, int empty,
2327 				      struct extent_buffer *right,
2328 				      int free_space, u32 left_nritems,
2329 				      u32 min_slot)
2330 {
2331 	struct extent_buffer *left = path->nodes[0];
2332 	struct extent_buffer *upper = path->nodes[1];
2333 	struct btrfs_disk_key disk_key;
2334 	int slot;
2335 	u32 i;
2336 	int push_space = 0;
2337 	int push_items = 0;
2338 	struct btrfs_item *item;
2339 	u32 nr;
2340 	u32 right_nritems;
2341 	u32 data_end;
2342 	u32 this_item_size;
2343 
2344 	if (empty)
2345 		nr = 0;
2346 	else
2347 		nr = max_t(u32, 1, min_slot);
2348 
2349 	if (path->slots[0] >= left_nritems)
2350 		push_space += data_size;
2351 
2352 	slot = path->slots[1];
2353 	i = left_nritems - 1;
2354 	while (i >= nr) {
2355 		item = btrfs_item_nr(left, i);
2356 
2357 		if (!empty && push_items > 0) {
2358 			if (path->slots[0] > i)
2359 				break;
2360 			if (path->slots[0] == i) {
2361 				int space = btrfs_leaf_free_space(root, left);
2362 				if (space + push_space * 2 > free_space)
2363 					break;
2364 			}
2365 		}
2366 
2367 		if (path->slots[0] == i)
2368 			push_space += data_size;
2369 
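		/*
		 * map_extent_buffer caches a kmap of the page holding this
		 * item header in left->map_token so the accessors below can
		 * read it without remapping on every loop iteration; the
		 * mapping is dropped once the scan is done.
		 */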
2370 		if (!left->map_token) {
2371 			map_extent_buffer(left, (unsigned long)item,
2372 					sizeof(struct btrfs_item),
2373 					&left->map_token, &left->kaddr,
2374 					&left->map_start, &left->map_len,
2375 					KM_USER1);
2376 		}
2377 
2378 		this_item_size = btrfs_item_size(left, item);
2379 		if (this_item_size + sizeof(*item) + push_space > free_space)
2380 			break;
2381 
2382 		push_items++;
2383 		push_space += this_item_size + sizeof(*item);
2384 		if (i == 0)
2385 			break;
2386 		i--;
2387 	}
2388 	if (left->map_token) {
2389 		unmap_extent_buffer(left, left->map_token, KM_USER1);
2390 		left->map_token = NULL;
2391 	}
2392 
2393 	if (push_items == 0)
2394 		goto out_unlock;
2395 
2396 	if (!empty && push_items == left_nritems)
2397 		WARN_ON(1);
2398 
2399 	/* push left to right */
2400 	right_nritems = btrfs_header_nritems(right);
2401 
2402 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2403 	push_space -= leaf_data_end(root, left);
2404 
2405 	/* make room in the right data area */
2406 	data_end = leaf_data_end(root, right);
2407 	memmove_extent_buffer(right,
2408 			      btrfs_leaf_data(right) + data_end - push_space,
2409 			      btrfs_leaf_data(right) + data_end,
2410 			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
2411 
2412 	/* copy from the left data area */
2413 	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2414 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
2415 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
2416 		     push_space);
2417 
2418 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2419 			      btrfs_item_nr_offset(0),
2420 			      right_nritems * sizeof(struct btrfs_item));
2421 
2422 	/* copy the items from left to right */
2423 	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2424 		   btrfs_item_nr_offset(left_nritems - push_items),
2425 		   push_items * sizeof(struct btrfs_item));
2426 
2427 	/* update the item pointers */
2428 	right_nritems += push_items;
2429 	btrfs_set_header_nritems(right, right_nritems);
2430 	push_space = BTRFS_LEAF_DATA_SIZE(root);
2431 	for (i = 0; i < right_nritems; i++) {
2432 		item = btrfs_item_nr(right, i);
2433 		if (!right->map_token) {
2434 			map_extent_buffer(right, (unsigned long)item,
2435 					sizeof(struct btrfs_item),
2436 					&right->map_token, &right->kaddr,
2437 					&right->map_start, &right->map_len,
2438 					KM_USER1);
2439 		}
2440 		push_space -= btrfs_item_size(right, item);
2441 		btrfs_set_item_offset(right, item, push_space);
2442 	}
2443 
2444 	if (right->map_token) {
2445 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2446 		right->map_token = NULL;
2447 	}
2448 	left_nritems -= push_items;
2449 	btrfs_set_header_nritems(left, left_nritems);
2450 
2451 	if (left_nritems)
2452 		btrfs_mark_buffer_dirty(left);
2453 	else
2454 		clean_tree_block(trans, root, left);
2455 
2456 	btrfs_mark_buffer_dirty(right);
2457 
2458 	btrfs_item_key(right, &disk_key, 0);
2459 	btrfs_set_node_key(upper, &disk_key, slot + 1);
2460 	btrfs_mark_buffer_dirty(upper);
2461 
2462 	/* then fixup the leaf pointer in the path */
2463 	if (path->slots[0] >= left_nritems) {
2464 		path->slots[0] -= left_nritems;
2465 		if (btrfs_header_nritems(path->nodes[0]) == 0)
2466 			clean_tree_block(trans, root, path->nodes[0]);
2467 		btrfs_tree_unlock(path->nodes[0]);
2468 		free_extent_buffer(path->nodes[0]);
2469 		path->nodes[0] = right;
2470 		path->slots[1] += 1;
2471 	} else {
2472 		btrfs_tree_unlock(right);
2473 		free_extent_buffer(right);
2474 	}
2475 	return 0;
2476 
2477 out_unlock:
2478 	btrfs_tree_unlock(right);
2479 	free_extent_buffer(right);
2480 	return 1;
2481 }
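
/*
 * Leaf layout behind the push above: item headers grow up from the front
 * of the block and item data grows down from the back:
 *
 *	[ hdr0 hdr1 .. hdrN ...free... dataN .. data1 data0 ]
 *
 * Pushing right therefore prepends the moved headers to 'right' (its old
 * headers shift up by push_items) and slides the old data toward the
 * front, so the moved data can keep its position at the very back.
 */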
2482 
2483 /*
2484  * push some data in the path leaf to the right, trying to free up at
2485  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2486  *
2487  * returns 1 if the push failed because the other node didn't have enough
2488  * room, 0 if everything worked out and < 0 if there were major errors.
2489  *
2490  * this will push starting from min_slot to the end of the leaf.  It won't
2491  * push any slot lower than min_slot
2492  */
2493 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2494 			   *root, struct btrfs_path *path,
2495 			   int min_data_size, int data_size,
2496 			   int empty, u32 min_slot)
2497 {
2498 	struct extent_buffer *left = path->nodes[0];
2499 	struct extent_buffer *right;
2500 	struct extent_buffer *upper;
2501 	int slot;
2502 	int free_space;
2503 	u32 left_nritems;
2504 	int ret;
2505 
2506 	if (!path->nodes[1])
2507 		return 1;
2508 
2509 	slot = path->slots[1];
2510 	upper = path->nodes[1];
2511 	if (slot >= btrfs_header_nritems(upper) - 1)
2512 		return 1;
2513 
2514 	btrfs_assert_tree_locked(path->nodes[1]);
2515 
2516 	right = read_node_slot(root, upper, slot + 1);
2517 	btrfs_tree_lock(right);
2518 	btrfs_set_lock_blocking(right);
2519 
2520 	free_space = btrfs_leaf_free_space(root, right);
2521 	if (free_space < data_size)
2522 		goto out_unlock;
2523 
2524 	/* cow and double check */
2525 	ret = btrfs_cow_block(trans, root, right, upper,
2526 			      slot + 1, &right);
2527 	if (ret)
2528 		goto out_unlock;
2529 
2530 	free_space = btrfs_leaf_free_space(root, right);
2531 	if (free_space < data_size)
2532 		goto out_unlock;
2533 
2534 	left_nritems = btrfs_header_nritems(left);
2535 	if (left_nritems == 0)
2536 		goto out_unlock;
2537 
2538 	return __push_leaf_right(trans, root, path, min_data_size, empty,
2539 				right, free_space, left_nritems, min_slot);
2540 out_unlock:
2541 	btrfs_tree_unlock(right);
2542 	free_extent_buffer(right);
2543 	return 1;
2544 }
2545 
2546 /*
2547  * push some data in the path leaf to the left, trying to free up at
2548  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2549  *
2550  * max_slot can put a limit on how far into the leaf we'll push items.  The
2551  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2552  * items
2553  */
2554 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2555 				     struct btrfs_root *root,
2556 				     struct btrfs_path *path, int data_size,
2557 				     int empty, struct extent_buffer *left,
2558 				     int free_space, u32 right_nritems,
2559 				     u32 max_slot)
2560 {
2561 	struct btrfs_disk_key disk_key;
2562 	struct extent_buffer *right = path->nodes[0];
2563 	int i;
2564 	int push_space = 0;
2565 	int push_items = 0;
2566 	struct btrfs_item *item;
2567 	u32 old_left_nritems;
2568 	u32 nr;
2569 	int ret = 0;
2570 	int wret;
2571 	u32 this_item_size;
2572 	u32 old_left_item_size;
2573 
2574 	if (empty)
2575 		nr = min(right_nritems, max_slot);
2576 	else
2577 		nr = min(right_nritems - 1, max_slot);
2578 
2579 	for (i = 0; i < nr; i++) {
2580 		item = btrfs_item_nr(right, i);
2581 		if (!right->map_token) {
2582 			map_extent_buffer(right, (unsigned long)item,
2583 					sizeof(struct btrfs_item),
2584 					&right->map_token, &right->kaddr,
2585 					&right->map_start, &right->map_len,
2586 					KM_USER1);
2587 		}
2588 
2589 		if (!empty && push_items > 0) {
2590 			if (path->slots[0] < i)
2591 				break;
2592 			if (path->slots[0] == i) {
2593 				int space = btrfs_leaf_free_space(root, right);
2594 				if (space + push_space * 2 > free_space)
2595 					break;
2596 			}
2597 		}
2598 
2599 		if (path->slots[0] == i)
2600 			push_space += data_size;
2601 
2602 		this_item_size = btrfs_item_size(right, item);
2603 		if (this_item_size + sizeof(*item) + push_space > free_space)
2604 			break;
2605 
2606 		push_items++;
2607 		push_space += this_item_size + sizeof(*item);
2608 	}
2609 
2610 	if (right->map_token) {
2611 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2612 		right->map_token = NULL;
2613 	}
2614 
2615 	if (push_items == 0) {
2616 		ret = 1;
2617 		goto out;
2618 	}
2619 	if (!empty && push_items == btrfs_header_nritems(right))
2620 		WARN_ON(1);
2621 
2622 	/* push data from right to left */
2623 	copy_extent_buffer(left, right,
2624 			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
2625 			   btrfs_item_nr_offset(0),
2626 			   push_items * sizeof(struct btrfs_item));
2627 
2628 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
2629 		     btrfs_item_offset_nr(right, push_items - 1);
2630 
2631 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2632 		     leaf_data_end(root, left) - push_space,
2633 		     btrfs_leaf_data(right) +
2634 		     btrfs_item_offset_nr(right, push_items - 1),
2635 		     push_space);
2636 	old_left_nritems = btrfs_header_nritems(left);
2637 	BUG_ON(old_left_nritems <= 0);
2638 
2639 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2640 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2641 		u32 ioff;
2642 
2643 		item = btrfs_item_nr(left, i);
2644 		if (!left->map_token) {
2645 			map_extent_buffer(left, (unsigned long)item,
2646 					sizeof(struct btrfs_item),
2647 					&left->map_token, &left->kaddr,
2648 					&left->map_start, &left->map_len,
2649 					KM_USER1);
2650 		}
2651 
2652 		ioff = btrfs_item_offset(left, item);
2653 		btrfs_set_item_offset(left, item,
2654 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2655 	}
2656 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2657 	if (left->map_token) {
2658 		unmap_extent_buffer(left, left->map_token, KM_USER1);
2659 		left->map_token = NULL;
2660 	}
2661 
2662 	/* fixup right node */
2663 	if (push_items > right_nritems) {
2664 		printk(KERN_CRIT "push items %d nr %u\n", push_items,
2665 		       right_nritems);
2666 		WARN_ON(1);
2667 	}
2668 
2669 	if (push_items < right_nritems) {
2670 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
2671 						  leaf_data_end(root, right);
2672 		memmove_extent_buffer(right, btrfs_leaf_data(right) +
2673 				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2674 				      btrfs_leaf_data(right) +
2675 				      leaf_data_end(root, right), push_space);
2676 
2677 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2678 			      btrfs_item_nr_offset(push_items),
2679 			     (btrfs_header_nritems(right) - push_items) *
2680 			     sizeof(struct btrfs_item));
2681 	}
2682 	right_nritems -= push_items;
2683 	btrfs_set_header_nritems(right, right_nritems);
2684 	push_space = BTRFS_LEAF_DATA_SIZE(root);
2685 	for (i = 0; i < right_nritems; i++) {
2686 		item = btrfs_item_nr(right, i);
2687 
2688 		if (!right->map_token) {
2689 			map_extent_buffer(right, (unsigned long)item,
2690 					sizeof(struct btrfs_item),
2691 					&right->map_token, &right->kaddr,
2692 					&right->map_start, &right->map_len,
2693 					KM_USER1);
2694 		}
2695 
2696 		push_space = push_space - btrfs_item_size(right, item);
2697 		btrfs_set_item_offset(right, item, push_space);
2698 	}
2699 	if (right->map_token) {
2700 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2701 		right->map_token = NULL;
2702 	}
2703 
2704 	btrfs_mark_buffer_dirty(left);
2705 	if (right_nritems)
2706 		btrfs_mark_buffer_dirty(right);
2707 	else
2708 		clean_tree_block(trans, root, right);
2709 
2710 	btrfs_item_key(right, &disk_key, 0);
2711 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2712 	if (wret)
2713 		ret = wret;
2714 
2715 	/* then fixup the leaf pointer in the path */
2716 	if (path->slots[0] < push_items) {
2717 		path->slots[0] += old_left_nritems;
2718 		btrfs_tree_unlock(path->nodes[0]);
2719 		free_extent_buffer(path->nodes[0]);
2720 		path->nodes[0] = left;
2721 		path->slots[1] -= 1;
2722 	} else {
2723 		btrfs_tree_unlock(left);
2724 		free_extent_buffer(left);
2725 		path->slots[0] -= push_items;
2726 	}
2727 	BUG_ON(path->slots[0] < 0);
2728 	return ret;
2729 out:
2730 	btrfs_tree_unlock(left);
2731 	free_extent_buffer(left);
2732 	return ret;
2733 }
2734 
2735 /*
2736  * push some data in the path leaf to the left, trying to free up at
2737  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2738  *
2739  * max_slot can put a limit on how far into the leaf we'll push items.  The
2740  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2741  * items
2742  */
2743 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2744 			  *root, struct btrfs_path *path, int min_data_size,
2745 			  int data_size, int empty, u32 max_slot)
2746 {
2747 	struct extent_buffer *right = path->nodes[0];
2748 	struct extent_buffer *left;
2749 	int slot;
2750 	int free_space;
2751 	u32 right_nritems;
2752 	int ret = 0;
2753 
2754 	slot = path->slots[1];
2755 	if (slot == 0)
2756 		return 1;
2757 	if (!path->nodes[1])
2758 		return 1;
2759 
2760 	right_nritems = btrfs_header_nritems(right);
2761 	if (right_nritems == 0)
2762 		return 1;
2763 
2764 	btrfs_assert_tree_locked(path->nodes[1]);
2765 
2766 	left = read_node_slot(root, path->nodes[1], slot - 1);
2767 	btrfs_tree_lock(left);
2768 	btrfs_set_lock_blocking(left);
2769 
2770 	free_space = btrfs_leaf_free_space(root, left);
2771 	if (free_space < data_size) {
2772 		ret = 1;
2773 		goto out;
2774 	}
2775 
2776 	/* cow and double check */
2777 	ret = btrfs_cow_block(trans, root, left,
2778 			      path->nodes[1], slot - 1, &left);
2779 	if (ret) {
2780 		/* we hit -ENOSPC, but it isn't fatal here */
2781 		ret = 1;
2782 		goto out;
2783 	}
2784 
2785 	free_space = btrfs_leaf_free_space(root, left);
2786 	if (free_space < data_size) {
2787 		ret = 1;
2788 		goto out;
2789 	}
2790 
2791 	return __push_leaf_left(trans, root, path, min_data_size,
2792 			       empty, left, free_space, right_nritems,
2793 			       max_slot);
2794 out:
2795 	btrfs_tree_unlock(left);
2796 	free_extent_buffer(left);
2797 	return ret;
2798 }
2799 
2800 /*
2801  * split the path's leaf in two, making sure there is at least data_size
2802  * available for the resulting leaf level of the path.
2803  *
2804  * returns 0 if all went well and < 0 on failure.
2805  */
2806 static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2807 			       struct btrfs_root *root,
2808 			       struct btrfs_path *path,
2809 			       struct extent_buffer *l,
2810 			       struct extent_buffer *right,
2811 			       int slot, int mid, int nritems)
2812 {
2813 	int data_copy_size;
2814 	int rt_data_off;
2815 	int i;
2816 	int ret = 0;
2817 	int wret;
2818 	struct btrfs_disk_key disk_key;
2819 
2820 	nritems = nritems - mid;
2821 	btrfs_set_header_nritems(right, nritems);
2822 	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2823 
2824 	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2825 			   btrfs_item_nr_offset(mid),
2826 			   nritems * sizeof(struct btrfs_item));
2827 
2828 	copy_extent_buffer(right, l,
2829 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2830 		     data_copy_size, btrfs_leaf_data(l) +
2831 		     leaf_data_end(root, l), data_copy_size);
2832 
2833 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2834 		      btrfs_item_end_nr(l, mid);
2835 
2836 	for (i = 0; i < nritems; i++) {
2837 		struct btrfs_item *item = btrfs_item_nr(right, i);
2838 		u32 ioff;
2839 
2840 		if (!right->map_token) {
2841 			map_extent_buffer(right, (unsigned long)item,
2842 					sizeof(struct btrfs_item),
2843 					&right->map_token, &right->kaddr,
2844 					&right->map_start, &right->map_len,
2845 					KM_USER1);
2846 		}
2847 
2848 		ioff = btrfs_item_offset(right, item);
2849 		btrfs_set_item_offset(right, item, ioff + rt_data_off);
2850 	}
2851 
2852 	if (right->map_token) {
2853 		unmap_extent_buffer(right, right->map_token, KM_USER1);
2854 		right->map_token = NULL;
2855 	}
2856 
2857 	btrfs_set_header_nritems(l, mid);
2858 	ret = 0;
2859 	btrfs_item_key(right, &disk_key, 0);
2860 	wret = insert_ptr(trans, root, path, &disk_key, right->start,
2861 			  path->slots[1] + 1, 1);
2862 	if (wret)
2863 		ret = wret;
2864 
2865 	btrfs_mark_buffer_dirty(right);
2866 	btrfs_mark_buffer_dirty(l);
2867 	BUG_ON(path->slots[0] != slot);
2868 
2869 	if (mid <= slot) {
2870 		btrfs_tree_unlock(path->nodes[0]);
2871 		free_extent_buffer(path->nodes[0]);
2872 		path->nodes[0] = right;
2873 		path->slots[0] -= mid;
2874 		path->slots[1] += 1;
2875 	} else {
2876 		btrfs_tree_unlock(right);
2877 		free_extent_buffer(right);
2878 	}
2879 
2880 	BUG_ON(path->slots[0] < 0);
2881 
2882 	return ret;
2883 }
2884 
2885 /*
2886  * double splits happen when we need to insert a big item in the middle
2887  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
2888  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2889  *          A                 B                 C
2890  *
2891  * We avoid this by trying to push the items on either side of our target
2892  * into the adjacent leaves.  If all goes well we can avoid the double split
2893  * completely.
2894  */
2895 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2896 					  struct btrfs_root *root,
2897 					  struct btrfs_path *path,
2898 					  int data_size)
2899 {
2900 	int ret;
2901 	int progress = 0;
2902 	int slot;
2903 	u32 nritems;
2904 
2905 	slot = path->slots[0];
2906 
2907 	/*
2908 	 * try to push all the items after our slot into the
2909 	 * right leaf
2910 	 */
2911 	ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2912 	if (ret < 0)
2913 		return ret;
2914 
2915 	if (ret == 0)
2916 		progress++;
2917 
2918 	nritems = btrfs_header_nritems(path->nodes[0]);
2919 	/*
2920 	 * our goal is to get our slot at the start or end of a leaf.  If
2921 	 * we've done so we're done
2922 	 */
2923 	if (path->slots[0] == 0 || path->slots[0] == nritems)
2924 		return 0;
2925 
2926 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2927 		return 0;
2928 
2929 	/* try to push all the items before our slot into the left leaf */
2930 	slot = path->slots[0];
2931 	ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2932 	if (ret < 0)
2933 		return ret;
2934 
2935 	if (ret == 0)
2936 		progress++;
2937 
2938 	if (progress)
2939 		return 0;
2940 	return 1;
2941 }
2942 
2943 /*
2944  * split the path's leaf in two, making sure there is at least data_size
2945  * available for the resulting leaf level of the path.
2946  *
2947  * returns 0 if all went well and < 0 on failure.
2948  */
2949 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2950 			       struct btrfs_root *root,
2951 			       struct btrfs_key *ins_key,
2952 			       struct btrfs_path *path, int data_size,
2953 			       int extend)
2954 {
2955 	struct btrfs_disk_key disk_key;
2956 	struct extent_buffer *l;
2957 	u32 nritems;
2958 	int mid;
2959 	int slot;
2960 	struct extent_buffer *right;
2961 	int ret = 0;
2962 	int wret;
2963 	int split;
2964 	int num_doubles = 0;
2965 	int tried_avoid_double = 0;
2966 
2967 	l = path->nodes[0];
2968 	slot = path->slots[0];
2969 	if (extend && data_size + btrfs_item_size_nr(l, slot) +
2970 	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
2971 		return -EOVERFLOW;
2972 
2973 	/* first try to make some room by pushing left and right */
2974 	if (data_size) {
2975 		wret = push_leaf_right(trans, root, path, data_size,
2976 				       data_size, 0, 0);
2977 		if (wret < 0)
2978 			return wret;
2979 		if (wret) {
2980 			wret = push_leaf_left(trans, root, path, data_size,
2981 					      data_size, 0, (u32)-1);
2982 			if (wret < 0)
2983 				return wret;
2984 		}
2985 		l = path->nodes[0];
2986 
2987 		/* did the pushes work? */
2988 		if (btrfs_leaf_free_space(root, l) >= data_size)
2989 			return 0;
2990 	}
2991 
2992 	if (!path->nodes[1]) {
2993 		ret = insert_new_root(trans, root, path, 1);
2994 		if (ret)
2995 			return ret;
2996 	}
2997 again:
2998 	split = 1;
2999 	l = path->nodes[0];
3000 	slot = path->slots[0];
3001 	nritems = btrfs_header_nritems(l);
3002 	mid = (nritems + 1) / 2;
3003 
3004 	if (mid <= slot) {
3005 		if (nritems == 1 ||
3006 		    leaf_space_used(l, mid, nritems - mid) + data_size >
3007 			BTRFS_LEAF_DATA_SIZE(root)) {
3008 			if (slot >= nritems) {
3009 				split = 0;
3010 			} else {
3011 				mid = slot;
3012 				if (mid != nritems &&
3013 				    leaf_space_used(l, mid, nritems - mid) +
3014 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3015 					if (data_size && !tried_avoid_double)
3016 						goto push_for_double;
3017 					split = 2;
3018 				}
3019 			}
3020 		}
3021 	} else {
3022 		if (leaf_space_used(l, 0, mid) + data_size >
3023 			BTRFS_LEAF_DATA_SIZE(root)) {
3024 			if (!extend && data_size && slot == 0) {
3025 				split = 0;
3026 			} else if ((extend || !data_size) && slot == 0) {
3027 				mid = 1;
3028 			} else {
3029 				mid = slot;
3030 				if (mid != nritems &&
3031 				    leaf_space_used(l, mid, nritems - mid) +
3032 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3033 					if (data_size && !tried_avoid_double)
3034 						goto push_for_double;
3035 					split = 2;
3036 				}
3037 			}
3038 		}
3039 	}
3040 
3041 	if (split == 0)
3042 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
3043 	else
3044 		btrfs_item_key(l, &disk_key, mid);
3045 
3046 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3047 					root->root_key.objectid,
3048 					&disk_key, 0, l->start, 0);
3049 	if (IS_ERR(right))
3050 		return PTR_ERR(right);
3051 
3052 	root_add_used(root, root->leafsize);
3053 
3054 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
3055 	btrfs_set_header_bytenr(right, right->start);
3056 	btrfs_set_header_generation(right, trans->transid);
3057 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
3058 	btrfs_set_header_owner(right, root->root_key.objectid);
3059 	btrfs_set_header_level(right, 0);
3060 	write_extent_buffer(right, root->fs_info->fsid,
3061 			    (unsigned long)btrfs_header_fsid(right),
3062 			    BTRFS_FSID_SIZE);
3063 
3064 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
3065 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
3066 			    BTRFS_UUID_SIZE);
3067 
3068 	if (split == 0) {
3069 		if (mid <= slot) {
3070 			btrfs_set_header_nritems(right, 0);
3071 			wret = insert_ptr(trans, root, path,
3072 					  &disk_key, right->start,
3073 					  path->slots[1] + 1, 1);
3074 			if (wret)
3075 				ret = wret;
3076 
3077 			btrfs_tree_unlock(path->nodes[0]);
3078 			free_extent_buffer(path->nodes[0]);
3079 			path->nodes[0] = right;
3080 			path->slots[0] = 0;
3081 			path->slots[1] += 1;
3082 		} else {
3083 			btrfs_set_header_nritems(right, 0);
3084 			wret = insert_ptr(trans, root, path,
3085 					  &disk_key,
3086 					  right->start,
3087 					  path->slots[1], 1);
3088 			if (wret)
3089 				ret = wret;
3090 			btrfs_tree_unlock(path->nodes[0]);
3091 			free_extent_buffer(path->nodes[0]);
3092 			path->nodes[0] = right;
3093 			path->slots[0] = 0;
3094 			if (path->slots[1] == 0) {
3095 				wret = fixup_low_keys(trans, root,
3096 						path, &disk_key, 1);
3097 				if (wret)
3098 					ret = wret;
3099 			}
3100 		}
3101 		btrfs_mark_buffer_dirty(right);
3102 		return ret;
3103 	}
3104 
3105 	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3106 	BUG_ON(ret);
3107 
3108 	if (split == 2) {
3109 		BUG_ON(num_doubles != 0);
3110 		num_doubles++;
3111 		goto again;
3112 	}
3113 
3114 	return ret;
3115 
3116 push_for_double:
3117 	push_for_double_split(trans, root, path, data_size);
3118 	tried_avoid_double = 1;
3119 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3120 		return 0;
3121 	goto again;
3122 }
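
/*
 * Summary of the split variants chosen above:
 *
 *	split == 1: ordinary split, items [mid, nritems) move to the new
 *		    leaf and the path follows the half holding our slot
 *	split == 0: the insertion point is at one end of the leaf, so the
 *		    new leaf starts out empty and simply becomes the spot
 *		    where the new item will land
 *	split == 2: the item lands in the middle of a packed leaf and one
 *		    half may still be too full, so we go around again (at
 *		    most once, enforced by num_doubles)
 */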
3123 
3124 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3125 					 struct btrfs_root *root,
3126 					 struct btrfs_path *path, int ins_len)
3127 {
3128 	struct btrfs_key key;
3129 	struct extent_buffer *leaf;
3130 	struct btrfs_file_extent_item *fi;
3131 	u64 extent_len = 0;
3132 	u32 item_size;
3133 	int ret;
3134 
3135 	leaf = path->nodes[0];
3136 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3137 
3138 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3139 	       key.type != BTRFS_EXTENT_CSUM_KEY);
3140 
3141 	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3142 		return 0;
3143 
3144 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3145 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3146 		fi = btrfs_item_ptr(leaf, path->slots[0],
3147 				    struct btrfs_file_extent_item);
3148 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3149 	}
3150 	btrfs_release_path(root, path);
3151 
3152 	path->keep_locks = 1;
3153 	path->search_for_split = 1;
3154 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3155 	path->search_for_split = 0;
3156 	if (ret > 0)
3157 		ret = -EAGAIN;
3158 	if (ret < 0)
3159 		goto err;
3160 	ret = -EAGAIN;
3161 	leaf = path->nodes[0];
3162 	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3163 		goto err; /* our item got smaller, return now */
3164 
3165 	/* the leaf has changed, it now has room.  return now */
3166 	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3167 		goto err;
3168 
3169 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
3170 		fi = btrfs_item_ptr(leaf, path->slots[0],
3171 				    struct btrfs_file_extent_item);
3172 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3173 			goto err;
3174 	}
3175 
3176 	btrfs_set_path_blocking(path);
3177 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
3178 	if (ret)
3179 		goto err;
3180 
3181 	path->keep_locks = 0;
3182 	btrfs_unlock_up_safe(path, 1);
3183 	return 0;
3184 err:
3185 	path->keep_locks = 0;
3186 	return ret;
3187 }
3188 
3189 static noinline int split_item(struct btrfs_trans_handle *trans,
3190 			       struct btrfs_root *root,
3191 			       struct btrfs_path *path,
3192 			       struct btrfs_key *new_key,
3193 			       unsigned long split_offset)
3194 {
3195 	struct extent_buffer *leaf;
3196 	struct btrfs_item *item;
3197 	struct btrfs_item *new_item;
3198 	int slot;
3199 	char *buf;
3200 	u32 nritems;
3201 	u32 item_size;
3202 	u32 orig_offset;
3203 	struct btrfs_disk_key disk_key;
3204 
3205 	leaf = path->nodes[0];
3206 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3207 
3208 	btrfs_set_path_blocking(path);
3209 
3210 	item = btrfs_item_nr(leaf, path->slots[0]);
3211 	orig_offset = btrfs_item_offset(leaf, item);
3212 	item_size = btrfs_item_size(leaf, item);
3213 
3214 	buf = kmalloc(item_size, GFP_NOFS);
3215 	if (!buf)
3216 		return -ENOMEM;
3217 
3218 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3219 			    path->slots[0]), item_size);
3220 
3221 	slot = path->slots[0] + 1;
3222 	nritems = btrfs_header_nritems(leaf);
3223 	if (slot != nritems) {
3224 		/* shift the items */
3225 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3226 				btrfs_item_nr_offset(slot),
3227 				(nritems - slot) * sizeof(struct btrfs_item));
3228 	}
3229 
3230 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3231 	btrfs_set_item_key(leaf, &disk_key, slot);
3232 
3233 	new_item = btrfs_item_nr(leaf, slot);
3234 
3235 	btrfs_set_item_offset(leaf, new_item, orig_offset);
3236 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3237 
3238 	btrfs_set_item_offset(leaf, item,
3239 			      orig_offset + item_size - split_offset);
3240 	btrfs_set_item_size(leaf, item, split_offset);
3241 
3242 	btrfs_set_header_nritems(leaf, nritems + 1);
3243 
3244 	/* write the data for the start of the original item */
3245 	write_extent_buffer(leaf, buf,
3246 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
3247 			    split_offset);
3248 
3249 	/* write the data for the new item */
3250 	write_extent_buffer(leaf, buf + split_offset,
3251 			    btrfs_item_ptr_offset(leaf, slot),
3252 			    item_size - split_offset);
3253 	btrfs_mark_buffer_dirty(leaf);
3254 
3255 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
3256 	kfree(buf);
3257 	return 0;
3258 }
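
/*
 * Diagram of the item split above: the old item's data area is reused in
 * place, with the two halves swapping ends inside it.  For an item at
 * (offset O, size S):
 *
 *	before:  slot n:   offset O,                    size S
 *	after:   slot n:   offset O + S - split_offset, size split_offset
 *	         slot n+1: offset O,                    size S - split_offset
 *
 * slot n keeps the first split_offset bytes of data under the old key;
 * the new slot n+1 holds the remainder under new_key.
 */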
3259 
3260 /*
3261  * This function splits a single item into two items,
3262  * giving 'new_key' to the new item and splitting the
3263  * old one at split_offset (from the start of the item).
3264  *
3265  * The path may be released by this operation.  After
3266  * the split, the path is pointing to the old item.  The
3267  * new item is going to be in the same node as the old one.
3268  *
3269  * Note: the item being split must be small enough to live alone on
3270  * a tree block with room for one extra struct btrfs_item
3271  *
3272  * This allows us to split the item in place, keeping a lock on the
3273  * leaf the entire time.
3274  */
3275 int btrfs_split_item(struct btrfs_trans_handle *trans,
3276 		     struct btrfs_root *root,
3277 		     struct btrfs_path *path,
3278 		     struct btrfs_key *new_key,
3279 		     unsigned long split_offset)
3280 {
3281 	int ret;
3282 	ret = setup_leaf_for_split(trans, root, path,
3283 				   sizeof(struct btrfs_item));
3284 	if (ret)
3285 		return ret;
3286 
3287 	ret = split_item(trans, root, path, new_key, split_offset);
3288 	return ret;
3289 }
3290 
3291 /*
3292  * This function duplicates an item, giving 'new_key' to the new item.
3293  * It guarantees both items live in the same tree leaf and the new item
3294  * is contiguous with the original item.
3295  *
3296  * This allows us to split a file extent in place, keeping a lock on the
3297  * leaf the entire time.
3298  */
3299 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3300 			 struct btrfs_root *root,
3301 			 struct btrfs_path *path,
3302 			 struct btrfs_key *new_key)
3303 {
3304 	struct extent_buffer *leaf;
3305 	int ret;
3306 	u32 item_size;
3307 
3308 	leaf = path->nodes[0];
3309 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3310 	ret = setup_leaf_for_split(trans, root, path,
3311 				   item_size + sizeof(struct btrfs_item));
3312 	if (ret)
3313 		return ret;
3314 
3315 	path->slots[0]++;
3316 	ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
3317 				     item_size, item_size +
3318 				     sizeof(struct btrfs_item), 1);
3319 	BUG_ON(ret);
3320 
3321 	leaf = path->nodes[0];
3322 	memcpy_extent_buffer(leaf,
3323 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
3324 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3325 			     item_size);
3326 	return 0;
3327 }
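
/*
 * Note that on return the path points at the new copy: path->slots[0]
 * was advanced before setup_items_for_insert, so the original item is
 * at path->slots[0] - 1, which is exactly what the memcpy above reads.
 */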
3328 
3329 /*
3330  * make the item pointed to by the path smaller.  new_size indicates
3331  * how small to make it, and from_end tells us if we just chop bytes
3332  * off the end of the item or if we shift the item to chop bytes off
3333  * the front.
3334  */
3335 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3336 			struct btrfs_root *root,
3337 			struct btrfs_path *path,
3338 			u32 new_size, int from_end)
3339 {
3340 	int ret = 0;
3341 	int slot;
3342 	struct extent_buffer *leaf;
3343 	struct btrfs_item *item;
3344 	u32 nritems;
3345 	unsigned int data_end;
3346 	unsigned int old_data_start;
3347 	unsigned int old_size;
3348 	unsigned int size_diff;
3349 	int i;
3350 
3351 	leaf = path->nodes[0];
3352 	slot = path->slots[0];
3353 
3354 	old_size = btrfs_item_size_nr(leaf, slot);
3355 	if (old_size == new_size)
3356 		return 0;
3357 
3358 	nritems = btrfs_header_nritems(leaf);
3359 	data_end = leaf_data_end(root, leaf);
3360 
3361 	old_data_start = btrfs_item_offset_nr(leaf, slot);
3362 
3363 	size_diff = old_size - new_size;
3364 
3365 	BUG_ON(slot < 0);
3366 	BUG_ON(slot >= nritems);
3367 
3368 	/*
3369 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3370 	 */
3371 	/* first correct the data pointers */
3372 	for (i = slot; i < nritems; i++) {
3373 		u32 ioff;
3374 		item = btrfs_item_nr(leaf, i);
3375 
3376 		if (!leaf->map_token) {
3377 			map_extent_buffer(leaf, (unsigned long)item,
3378 					sizeof(struct btrfs_item),
3379 					&leaf->map_token, &leaf->kaddr,
3380 					&leaf->map_start, &leaf->map_len,
3381 					KM_USER1);
3382 		}
3383 
3384 		ioff = btrfs_item_offset(leaf, item);
3385 		btrfs_set_item_offset(leaf, item, ioff + size_diff);
3386 	}
3387 
3388 	if (leaf->map_token) {
3389 		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3390 		leaf->map_token = NULL;
3391 	}
3392 
3393 	/* shift the data */
3394 	if (from_end) {
3395 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3396 			      data_end + size_diff, btrfs_leaf_data(leaf) +
3397 			      data_end, old_data_start + new_size - data_end);
3398 	} else {
3399 		struct btrfs_disk_key disk_key;
3400 		u64 offset;
3401 
3402 		btrfs_item_key(leaf, &disk_key, slot);
3403 
3404 		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3405 			unsigned long ptr;
3406 			struct btrfs_file_extent_item *fi;
3407 
3408 			fi = btrfs_item_ptr(leaf, slot,
3409 					    struct btrfs_file_extent_item);
3410 			fi = (struct btrfs_file_extent_item *)(
3411 			     (unsigned long)fi - size_diff);
3412 
3413 			if (btrfs_file_extent_type(leaf, fi) ==
3414 			    BTRFS_FILE_EXTENT_INLINE) {
3415 				ptr = btrfs_item_ptr_offset(leaf, slot);
3416 				memmove_extent_buffer(leaf, ptr,
3417 				      (unsigned long)fi,
3418 				      offsetof(struct btrfs_file_extent_item,
3419 						 disk_bytenr));
3420 			}
3421 		}
3422 
3423 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3424 			      data_end + size_diff, btrfs_leaf_data(leaf) +
3425 			      data_end, old_data_start - data_end);
3426 
3427 		offset = btrfs_disk_key_offset(&disk_key);
3428 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3429 		btrfs_set_item_key(leaf, &disk_key, slot);
3430 		if (slot == 0)
3431 			fixup_low_keys(trans, root, path, &disk_key, 1);
3432 	}
3433 
3434 	item = btrfs_item_nr(leaf, slot);
3435 	btrfs_set_item_size(leaf, item, new_size);
3436 	btrfs_mark_buffer_dirty(leaf);
3437 
3438 	ret = 0;
3439 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3440 		btrfs_print_leaf(root, leaf);
3441 		BUG();
3442 	}
3443 	return ret;
3444 }
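
/*
 * Note on the from_end == 0 path above: when the front of an inline file
 * extent is chopped off, the bytes removed are file data, but the
 * btrfs_file_extent_item header ahead of the inline data must survive,
 * so it is memmoved up to the new item start before the generic data
 * shift runs, and the key offset is bumped by size_diff to keep the file
 * range it describes correct.
 */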
3445 
3446 /*
3447  * make the item pointed to by the path bigger, data_size is the new size.
3448  */
3449 int btrfs_extend_item(struct btrfs_trans_handle *trans,
3450 		      struct btrfs_root *root, struct btrfs_path *path,
3451 		      u32 data_size)
3452 {
3453 	int ret = 0;
3454 	int slot;
3455 	struct extent_buffer *leaf;
3456 	struct btrfs_item *item;
3457 	u32 nritems;
3458 	unsigned int data_end;
3459 	unsigned int old_data;
3460 	unsigned int old_size;
3461 	int i;
3462 
3463 	leaf = path->nodes[0];
3464 
3465 	nritems = btrfs_header_nritems(leaf);
3466 	data_end = leaf_data_end(root, leaf);
3467 
3468 	if (btrfs_leaf_free_space(root, leaf) < data_size) {
3469 		btrfs_print_leaf(root, leaf);
3470 		BUG();
3471 	}
3472 	slot = path->slots[0];
3473 	old_data = btrfs_item_end_nr(leaf, slot);
3474 
3475 	BUG_ON(slot < 0);
3476 	if (slot >= nritems) {
3477 		btrfs_print_leaf(root, leaf);
3478 		printk(KERN_CRIT "slot %d too large, nritems %d\n",
3479 		       slot, nritems);
3480 		BUG_ON(1);
3481 	}
3482 
3483 	/*
3484 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3485 	 */
3486 	/* first correct the data pointers */
3487 	for (i = slot; i < nritems; i++) {
3488 		u32 ioff;
3489 		item = btrfs_item_nr(leaf, i);
3490 
3491 		if (!leaf->map_token) {
3492 			map_extent_buffer(leaf, (unsigned long)item,
3493 					sizeof(struct btrfs_item),
3494 					&leaf->map_token, &leaf->kaddr,
3495 					&leaf->map_start, &leaf->map_len,
3496 					KM_USER1);
3497 		}
3498 		ioff = btrfs_item_offset(leaf, item);
3499 		btrfs_set_item_offset(leaf, item, ioff - data_size);
3500 	}
3501 
3502 	if (leaf->map_token) {
3503 		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3504 		leaf->map_token = NULL;
3505 	}
3506 
3507 	/* shift the data */
3508 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3509 		      data_end - data_size, btrfs_leaf_data(leaf) +
3510 		      data_end, old_data - data_end);
3511 
3512 	data_end = old_data;
3513 	old_size = btrfs_item_size_nr(leaf, slot);
3514 	item = btrfs_item_nr(leaf, slot);
3515 	btrfs_set_item_size(leaf, item, old_size + data_size);
3516 	btrfs_mark_buffer_dirty(leaf);
3517 
3518 	ret = 0;
3519 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3520 		btrfs_print_leaf(root, leaf);
3521 		BUG();
3522 	}
3523 	return ret;
3524 }
3525 
3526 /*
3527  * Given a key and some data, insert items into the tree.
3528  * This does all the path init required, making room in the tree if needed.
3529  * Returns the number of keys that were inserted, or a negative error
3530  * code on failure.
3530  */
3531 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3532 			    struct btrfs_root *root,
3533 			    struct btrfs_path *path,
3534 			    struct btrfs_key *cpu_key, u32 *data_size,
3535 			    int nr)
3536 {
3537 	struct extent_buffer *leaf;
3538 	struct btrfs_item *item;
3539 	int ret = 0;
3540 	int slot;
3541 	int i;
3542 	u32 nritems;
3543 	u32 total_data = 0;
3544 	u32 total_size = 0;
3545 	unsigned int data_end;
3546 	struct btrfs_disk_key disk_key;
3547 	struct btrfs_key found_key;
3548 
3549 	for (i = 0; i < nr; i++) {
3550 		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3551 		    BTRFS_LEAF_DATA_SIZE(root)) {
3552 			nr = i;
3553 			break;
3554 		}
3555 		total_data += data_size[i];
3556 		total_size += data_size[i] + sizeof(struct btrfs_item);
3557 	}
3558 	BUG_ON(nr == 0);
3559 
3560 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3561 	if (ret == 0)
3562 		return -EEXIST;
3563 	if (ret < 0)
3564 		goto out;
3565 
3566 	leaf = path->nodes[0];
3567 
3568 	nritems = btrfs_header_nritems(leaf);
3569 	data_end = leaf_data_end(root, leaf);
3570 
3571 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3572 		for (i = nr - 1; i >= 0; i--) {
3573 			total_data -= data_size[i];
3574 			total_size -= data_size[i] + sizeof(struct btrfs_item);
3575 			if (total_size < btrfs_leaf_free_space(root, leaf))
3576 				break;
3577 		}
3578 		nr = i;
3579 	}
3580 
3581 	slot = path->slots[0];
3582 	BUG_ON(slot < 0);
3583 
3584 	if (slot != nritems) {
3585 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3586 
3587 		item = btrfs_item_nr(leaf, slot);
3588 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3589 
3590 		/* figure out how many keys we can insert in here */
3591 		total_data = data_size[0];
3592 		for (i = 1; i < nr; i++) {
3593 			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3594 				break;
3595 			total_data += data_size[i];
3596 		}
3597 		nr = i;
3598 
3599 		if (old_data < data_end) {
3600 			btrfs_print_leaf(root, leaf);
3601 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3602 			       slot, old_data, data_end);
3603 			BUG_ON(1);
3604 		}
3605 		/*
3606 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3607 		 */
3608 		/* first correct the data pointers */
3609 		WARN_ON(leaf->map_token);
3610 		for (i = slot; i < nritems; i++) {
3611 			u32 ioff;
3612 
3613 			item = btrfs_item_nr(leaf, i);
3614 			if (!leaf->map_token) {
3615 				map_extent_buffer(leaf, (unsigned long)item,
3616 					sizeof(struct btrfs_item),
3617 					&leaf->map_token, &leaf->kaddr,
3618 					&leaf->map_start, &leaf->map_len,
3619 					KM_USER1);
3620 			}
3621 
3622 			ioff = btrfs_item_offset(leaf, item);
3623 			btrfs_set_item_offset(leaf, item, ioff - total_data);
3624 		}
3625 		if (leaf->map_token) {
3626 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3627 			leaf->map_token = NULL;
3628 		}
3629 
3630 		/* shift the items */
3631 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3632 			      btrfs_item_nr_offset(slot),
3633 			      (nritems - slot) * sizeof(struct btrfs_item));
3634 
3635 		/* shift the data */
3636 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3637 			      data_end - total_data, btrfs_leaf_data(leaf) +
3638 			      data_end, old_data - data_end);
3639 		data_end = old_data;
3640 	} else {
3641 		/*
3642 		 * this sucks but it has to be done: if we are inserting at
3643 		 * the end of the leaf, only insert 1 of the items, since we
3644 		 * have no way of knowing what's on the next leaf and we'd have
3645 		 * to drop our current locks to figure it out
3646 		 */
3647 		nr = 1;
3648 	}
3649 
3650 	/* setup the item for the new data */
3651 	for (i = 0; i < nr; i++) {
3652 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3653 		btrfs_set_item_key(leaf, &disk_key, slot + i);
3654 		item = btrfs_item_nr(leaf, slot + i);
3655 		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3656 		data_end -= data_size[i];
3657 		btrfs_set_item_size(leaf, item, data_size[i]);
3658 	}
3659 	btrfs_set_header_nritems(leaf, nritems + nr);
3660 	btrfs_mark_buffer_dirty(leaf);
3661 
3662 	ret = 0;
3663 	if (slot == 0) {
3664 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3665 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3666 	}
3667 
3668 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3669 		btrfs_print_leaf(root, leaf);
3670 		BUG();
3671 	}
3672 out:
3673 	if (!ret)
3674 		ret = nr;
3675 	return ret;
3676 }
3677 
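/*
 * Example use of the batch-insert helper above (a sketch: it returns how
 * many items it actually inserted, so callers loop until the whole batch
 * is done.  This assumes the btrfs_insert_some_items() name/signature
 * from this vintage of the code, that 'keys' and 'sizes' describe 'nr'
 * prepared items, and error handling is trimmed):
 *
 *	int done = 0;
 *	while (done < nr) {
 *		ret = btrfs_insert_some_items(trans, root, path,
 *					      keys + done, sizes + done,
 *					      nr - done);
 *		if (ret < 0)
 *			break;
 *		... write the data for items [done, done + ret) ...
 *		done += ret;
 *		btrfs_release_path(root, path);
 *	}
 */
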
3678 /*
3679  * this is a helper for btrfs_insert_empty_items; the main goal here is
3680  * to save stack depth by doing the bulk of the work in a function
3681  * that doesn't call btrfs_search_slot
3682  */
3683 static noinline_for_stack int
3684 setup_items_for_insert(struct btrfs_trans_handle *trans,
3685 		      struct btrfs_root *root, struct btrfs_path *path,
3686 		      struct btrfs_key *cpu_key, u32 *data_size,
3687 		      u32 total_data, u32 total_size, int nr)
3688 {
3689 	struct btrfs_item *item;
3690 	int i;
3691 	u32 nritems;
3692 	unsigned int data_end;
3693 	struct btrfs_disk_key disk_key;
3694 	int ret;
3695 	struct extent_buffer *leaf;
3696 	int slot;
3697 
3698 	leaf = path->nodes[0];
3699 	slot = path->slots[0];
3700 
3701 	nritems = btrfs_header_nritems(leaf);
3702 	data_end = leaf_data_end(root, leaf);
3703 
3704 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3705 		btrfs_print_leaf(root, leaf);
3706 		printk(KERN_CRIT "not enough free space: need %u, have %d\n",
3707 		       total_size, btrfs_leaf_free_space(root, leaf));
3708 		BUG();
3709 	}
3710 
3711 	if (slot != nritems) {
3712 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3713 
3714 		if (old_data < data_end) {
3715 			btrfs_print_leaf(root, leaf);
3716 			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3717 			       slot, old_data, data_end);
3718 			BUG_ON(1);
3719 		}
3720 		/*
3721 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3722 		 */
3723 		/* first correct the data pointers */
3724 		WARN_ON(leaf->map_token);
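		/*
		 * map_extent_buffer kmaps the page that holds the item
		 * headers and caches it in leaf->map_token, so the
		 * per-item offset updates below can reuse the mapping
		 * instead of kmapping on every access
		 */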
3725 		for (i = slot; i < nritems; i++) {
3726 			u32 ioff;
3727 
3728 			item = btrfs_item_nr(leaf, i);
3729 			if (!leaf->map_token) {
3730 				map_extent_buffer(leaf, (unsigned long)item,
3731 					sizeof(struct btrfs_item),
3732 					&leaf->map_token, &leaf->kaddr,
3733 					&leaf->map_start, &leaf->map_len,
3734 					KM_USER1);
3735 			}
3736 
3737 			ioff = btrfs_item_offset(leaf, item);
3738 			btrfs_set_item_offset(leaf, item, ioff - total_data);
3739 		}
3740 		if (leaf->map_token) {
3741 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3742 			leaf->map_token = NULL;
3743 		}
3744 
3745 		/* shift the items */
3746 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3747 			      btrfs_item_nr_offset(slot),
3748 			      (nritems - slot) * sizeof(struct btrfs_item));
3749 
3750 		/* shift the data */
3751 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3752 			      data_end - total_data, btrfs_leaf_data(leaf) +
3753 			      data_end, old_data - data_end);
3754 		data_end = old_data;
3755 	}
3756 
3757 	/* setup the item for the new data */
3758 	for (i = 0; i < nr; i++) {
3759 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3760 		btrfs_set_item_key(leaf, &disk_key, slot + i);
3761 		item = btrfs_item_nr(leaf, slot + i);
3762 		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3763 		data_end -= data_size[i];
3764 		btrfs_set_item_size(leaf, item, data_size[i]);
3765 	}
3766 
3767 	btrfs_set_header_nritems(leaf, nritems + nr);
3768 
3769 	ret = 0;
3770 	if (slot == 0) {
3772 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3773 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3774 	}
3775 	btrfs_unlock_up_safe(path, 1);
3776 	btrfs_mark_buffer_dirty(leaf);
3777 
3778 	if (btrfs_leaf_free_space(root, leaf) < 0) {
3779 		btrfs_print_leaf(root, leaf);
3780 		BUG();
3781 	}
3782 	return ret;
3783 }
3784 
3785 /*
3786  * Given keys and data sizes, insert empty items into the tree.
3787  * This does all the path init required, making room in the tree if needed.
3788  */
3789 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3790 			    struct btrfs_root *root,
3791 			    struct btrfs_path *path,
3792 			    struct btrfs_key *cpu_key, u32 *data_size,
3793 			    int nr)
3794 {
3795 	int ret = 0;
3796 	int slot;
3797 	int i;
3798 	u32 total_size = 0;
3799 	u32 total_data = 0;
3800 
3801 	for (i = 0; i < nr; i++)
3802 		total_data += data_size[i];
3803 
3804 	total_size = total_data + (nr * sizeof(struct btrfs_item));
3805 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3806 	if (ret == 0)
3807 		return -EEXIST;
3808 	if (ret < 0)
3809 		goto out;
3810 
3811 	slot = path->slots[0];
3812 	BUG_ON(slot < 0);
3813 
3814 	ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
3815 			       total_data, total_size, nr);
3816 
3817 out:
3818 	return ret;
3819 }
3820 
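/*
 * Example (sketch): reserving room for two adjacent items in one pass
 * and filling them in afterwards.  'keys' is assumed to hold two
 * consecutive keys; 'first' and 'second' are stand-ins for real on-disk
 * item structures and error handling is trimmed:
 *
 *	u32 sizes[2] = { sizeof(first), sizeof(second) };
 *
 *	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
 *	if (ret)
 *		goto out;
 *	leaf = path->nodes[0];
 *	write_extent_buffer(leaf, &first,
 *		    btrfs_item_ptr_offset(leaf, path->slots[0]), sizes[0]);
 *	write_extent_buffer(leaf, &second,
 *		    btrfs_item_ptr_offset(leaf, path->slots[0] + 1),
 *		    sizes[1]);
 *	btrfs_mark_buffer_dirty(leaf);
 *	btrfs_release_path(root, path);
 */
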
3821 /*
3822  * Given a key and some data, insert an item into the tree.
3823  * This does all the path init required, making room in the tree if needed.
3824  */
3825 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3826 		      *root, struct btrfs_key *cpu_key, void *data, u32
3827 		      data_size)
3828 {
3829 	int ret = 0;
3830 	struct btrfs_path *path;
3831 	struct extent_buffer *leaf;
3832 	unsigned long ptr;
3833 
3834 	path = btrfs_alloc_path();
3835 	if (!path)
		return -ENOMEM;
3836 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3837 	if (!ret) {
3838 		leaf = path->nodes[0];
3839 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3840 		write_extent_buffer(leaf, data, ptr, data_size);
3841 		btrfs_mark_buffer_dirty(leaf);
3842 	}
3843 	btrfs_free_path(path);
3844 	return ret;
3845 }
3846 
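/*
 * Example (sketch): inserting one fixed-size record with
 * btrfs_insert_item().  Assumes an open transaction, a key that does
 * not already exist, and that 'rec' is a stand-in for a real on-disk
 * item structure:
 *
 *	struct btrfs_key key;
 *
 *	key.objectid = objectid;
 *	key.type = <item type>;
 *	key.offset = 0;
 *	ret = btrfs_insert_item(trans, root, &key, &rec, sizeof(rec));
 *	if (ret)
 *		return ret;
 */
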
3847 /*
3848  * delete the pointer from a given node.
3849  *
3850  * the tree should have been previously balanced so the deletion does not
3851  * empty a node.
3852  */
3853 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3854 		   struct btrfs_path *path, int level, int slot)
3855 {
3856 	struct extent_buffer *parent = path->nodes[level];
3857 	u32 nritems;
3858 	int ret = 0;
3859 	int wret;
3860 
3861 	nritems = btrfs_header_nritems(parent);
3862 	if (slot != nritems - 1) {
3863 		memmove_extent_buffer(parent,
3864 			      btrfs_node_key_ptr_offset(slot),
3865 			      btrfs_node_key_ptr_offset(slot + 1),
3866 			      sizeof(struct btrfs_key_ptr) *
3867 			      (nritems - slot - 1));
3868 	}
3869 	nritems--;
3870 	btrfs_set_header_nritems(parent, nritems);
3871 	if (nritems == 0 && parent == root->node) {
3872 		BUG_ON(btrfs_header_level(root->node) != 1);
3873 		/* just turn the root into a leaf and break */
3874 		btrfs_set_header_level(root->node, 0);
3875 	} else if (slot == 0) {
3876 		struct btrfs_disk_key disk_key;
3877 
3878 		btrfs_node_key(parent, &disk_key, 0);
3879 		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3880 		if (wret)
3881 			ret = wret;
3882 	}
3883 	btrfs_mark_buffer_dirty(parent);
3884 	return ret;
3885 }
3886 
3887 /*
3888  * a helper function to delete the leaf pointed to by path->slots[1] and
3889  * path->nodes[1].
3890  *
3891  * This deletes the pointer in path->nodes[1] and frees the leaf
3892  * block extent.  Zero is returned if it all worked out, < 0 otherwise.
3893  *
3894  * The path must have already been setup for deleting the leaf, including
3895  * all the proper balancing.  path->nodes[1] must be locked.
3896  */
3897 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3898 				   struct btrfs_root *root,
3899 				   struct btrfs_path *path,
3900 				   struct extent_buffer *leaf)
3901 {
3902 	int ret;
3903 
3904 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3905 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
3906 	if (ret)
3907 		return ret;
3908 
3909 	/*
3910 	 * btrfs_free_extent is expensive, we want to make sure we
3911 	 * aren't holding any locks when we call it
3912 	 */
3913 	btrfs_unlock_up_safe(path, 0);
3914 
3915 	root_sub_used(root, leaf->len);
3916 
3917 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
3918 	return 0;
3919 }

3920 /*
3921  * delete the item at the leaf level in path.  If that empties
3922  * the leaf, remove it from the tree
3923  */
3924 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3925 		    struct btrfs_path *path, int slot, int nr)
3926 {
3927 	struct extent_buffer *leaf;
3928 	struct btrfs_item *item;
3929 	int last_off;
3930 	int dsize = 0;
3931 	int ret = 0;
3932 	int wret;
3933 	int i;
3934 	u32 nritems;
3935 
3936 	leaf = path->nodes[0];
3937 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3938 
3939 	for (i = 0; i < nr; i++)
3940 		dsize += btrfs_item_size_nr(leaf, slot + i);
3941 
3942 	nritems = btrfs_header_nritems(leaf);
3943 
3944 	if (slot + nr != nritems) {
3945 		int data_end = leaf_data_end(root, leaf);
3946 
3947 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3948 			      data_end + dsize,
3949 			      btrfs_leaf_data(leaf) + data_end,
3950 			      last_off - data_end);
3951 
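		/*
		 * the item data just moved dsize bytes toward the end of
		 * the leaf, so bump each remaining item's offset to match
		 */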
3952 		for (i = slot + nr; i < nritems; i++) {
3953 			u32 ioff;
3954 
3955 			item = btrfs_item_nr(leaf, i);
3956 			if (!leaf->map_token) {
3957 				map_extent_buffer(leaf, (unsigned long)item,
3958 					sizeof(struct btrfs_item),
3959 					&leaf->map_token, &leaf->kaddr,
3960 					&leaf->map_start, &leaf->map_len,
3961 					KM_USER1);
3962 			}
3963 			ioff = btrfs_item_offset(leaf, item);
3964 			btrfs_set_item_offset(leaf, item, ioff + dsize);
3965 		}
3966 
3967 		if (leaf->map_token) {
3968 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3969 			leaf->map_token = NULL;
3970 		}
3971 
3972 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3973 			      btrfs_item_nr_offset(slot + nr),
3974 			      sizeof(struct btrfs_item) *
3975 			      (nritems - slot - nr));
3976 	}
3977 	btrfs_set_header_nritems(leaf, nritems - nr);
3978 	nritems -= nr;
3979 
3980 	/* delete the leaf if we've emptied it */
3981 	if (nritems == 0) {
3982 		if (leaf == root->node) {
3983 			btrfs_set_header_level(leaf, 0);
3984 		} else {
3985 			btrfs_set_path_blocking(path);
3986 			clean_tree_block(trans, root, leaf);
3987 			ret = btrfs_del_leaf(trans, root, path, leaf);
3988 			BUG_ON(ret);
3989 		}
3990 	} else {
3991 		int used = leaf_space_used(leaf, 0, nritems);
3992 		if (slot == 0) {
3993 			struct btrfs_disk_key disk_key;
3994 
3995 			btrfs_item_key(leaf, &disk_key, 0);
3996 			wret = fixup_low_keys(trans, root, path,
3997 					      &disk_key, 1);
3998 			if (wret)
3999 				ret = wret;
4000 		}
4001 
4002 		/* delete the leaf if it is mostly empty */
4003 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4004 			/* push_leaf_left fixes the path.
4005 			 * make sure the path still points to our leaf
4006 			 * for possible call to del_ptr below
4007 			 */
4008 			slot = path->slots[1];
4009 			extent_buffer_get(leaf);
4010 
4011 			btrfs_set_path_blocking(path);
4012 			wret = push_leaf_left(trans, root, path, 1, 1,
4013 					      1, (u32)-1);
4014 			if (wret < 0 && wret != -ENOSPC)
4015 				ret = wret;
4016 
4017 			if (path->nodes[0] == leaf &&
4018 			    btrfs_header_nritems(leaf)) {
4019 				wret = push_leaf_right(trans, root, path, 1,
4020 						       1, 1, 0);
4021 				if (wret < 0 && wret != -ENOSPC)
4022 					ret = wret;
4023 			}
4024 
4025 			if (btrfs_header_nritems(leaf) == 0) {
4026 				path->slots[1] = slot;
4027 				ret = btrfs_del_leaf(trans, root, path, leaf);
4028 				BUG_ON(ret);
4029 				free_extent_buffer(leaf);
4030 			} else {
4031 				/* if we're still in the path, make sure
4032 				 * we're dirty.  Otherwise, one of the
4033 				 * push_leaf functions must have already
4034 				 * dirtied this buffer
4035 				 */
4036 				if (path->nodes[0] == leaf)
4037 					btrfs_mark_buffer_dirty(leaf);
4038 				free_extent_buffer(leaf);
4039 			}
4040 		} else {
4041 			btrfs_mark_buffer_dirty(leaf);
4042 		}
4043 	}
4044 	return ret;
4045 }
4046 
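/*
 * Example (sketch): deleting a single item that a search just found.
 * The search passes ins_len == -1 so the tree is pre-balanced for the
 * delete; error handling is trimmed:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret == 0)
 *		ret = btrfs_del_items(trans, root, path,
 *				      path->slots[0], 1);
 *	btrfs_release_path(root, path);
 */
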
4047 /*
4048  * search the tree again to find a leaf with lesser keys
4049  * returns 0 if it found something or 1 if there are no lesser leaves.
4050  * returns < 0 on io errors.
4051  *
4052  * This may release the path, and so you may lose any locks held at the
4053  * time you call it.
4054  */
4055 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4056 {
4057 	struct btrfs_key key;
4058 	struct btrfs_disk_key found_key;
4059 	int ret;
4060 
4061 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4062 
4063 	if (key.offset > 0) {
4064 		key.offset--;
4065 	} else if (key.type > 0) {
4066 		key.type--; key.offset = (u64)-1;
4067 	} else if (key.objectid > 0) {
4068 		key.objectid--; key.type = (u8)-1; key.offset = (u64)-1;
4069 	} else
4070 		return 1;
4071 
4072 	btrfs_release_path(root, path);
4073 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4074 	if (ret < 0)
4075 		return ret;
4076 	btrfs_item_key(path->nodes[0], &found_key, 0);
4077 	ret = comp_keys(&found_key, &key);
4078 	if (ret < 0)
4079 		return 0;
4080 	return 1;
4081 }
4082 
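/*
 * Example (sketch): walking items backwards, hopping to the previous
 * leaf with btrfs_prev_leaf(), mirroring btrfs_previous_item() below;
 * error handling is trimmed:
 *
 *	while (1) {
 *		if (path->slots[0] == 0) {
 *			ret = btrfs_prev_leaf(root, path);
 *			if (ret)
 *				break;	// 1: no lesser leaf, < 0: error
 *		} else {
 *			path->slots[0]--;
 *		}
 *		leaf = path->nodes[0];
 *		if (path->slots[0] == btrfs_header_nritems(leaf))
 *			path->slots[0]--;
 *		... examine the item at path->slots[0] ...
 *	}
 */
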
4083 /*
4084  * A helper function to walk down the tree starting at min_key, looking
4085  * for nodes or leaves that are either in cache or have a minimum
4086  * transaction id.  This is used by the btree defrag code and tree logging.
4087  *
4088  * This does not cow, but it does stuff the starting key it finds back
4089  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4090  * key and get a writable path.
4091  *
4092  * This does lock as it descends, and path->keep_locks should be set
4093  * to 1 by the caller.
4094  *
4095  * This honors path->lowest_level to prevent descent past a given level
4096  * of the tree.
4097  *
4098  * min_trans indicates the oldest transaction that you are interested
4099  * in walking through.  Any nodes or leaves older than min_trans are
4100  * skipped over (without reading them).
4101  *
4102  * returns zero if something useful was found, < 0 on error and 1 if there
4103  * was nothing in the tree that matched the search criteria.
4104  */
4105 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4106 			 struct btrfs_key *max_key,
4107 			 struct btrfs_path *path, int cache_only,
4108 			 u64 min_trans)
4109 {
4110 	struct extent_buffer *cur;
4111 	struct btrfs_key found_key;
4112 	int slot;
4113 	int sret;
4114 	u32 nritems;
4115 	int level;
4116 	int ret = 1;
4117 
4118 	WARN_ON(!path->keep_locks);
4119 again:
4120 	cur = btrfs_lock_root_node(root);
4121 	level = btrfs_header_level(cur);
4122 	WARN_ON(path->nodes[level]);
4123 	path->nodes[level] = cur;
4124 	path->locks[level] = 1;
4125 
4126 	if (btrfs_header_generation(cur) < min_trans) {
4127 		ret = 1;
4128 		goto out;
4129 	}
4130 	while (1) {
4131 		nritems = btrfs_header_nritems(cur);
4132 		level = btrfs_header_level(cur);
4133 		sret = bin_search(cur, min_key, level, &slot);
4134 
4135 		/* at the lowest level, we're done, setup the path and exit */
4136 		if (level == path->lowest_level) {
4137 			if (slot >= nritems)
4138 				goto find_next_key;
4139 			ret = 0;
4140 			path->slots[level] = slot;
4141 			btrfs_item_key_to_cpu(cur, &found_key, slot);
4142 			goto out;
4143 		}
4144 		if (sret && slot > 0)
4145 			slot--;
4146 		/*
4147 		 * check this node pointer against the cache_only and
4148 		 * min_trans parameters.  If it isn't in cache or is too
4149 		 * old, skip to the next one.
4150 		 */
4151 		while (slot < nritems) {
4152 			u64 blockptr;
4153 			u64 gen;
4154 			struct extent_buffer *tmp;
4155 			struct btrfs_disk_key disk_key;
4156 
4157 			blockptr = btrfs_node_blockptr(cur, slot);
4158 			gen = btrfs_node_ptr_generation(cur, slot);
4159 			if (gen < min_trans) {
4160 				slot++;
4161 				continue;
4162 			}
4163 			if (!cache_only)
4164 				break;
4165 
4166 			if (max_key) {
4167 				btrfs_node_key(cur, &disk_key, slot);
4168 				if (comp_keys(&disk_key, max_key) >= 0) {
4169 					ret = 1;
4170 					goto out;
4171 				}
4172 			}
4173 
4174 			tmp = btrfs_find_tree_block(root, blockptr,
4175 					    btrfs_level_size(root, level - 1));
4176 
4177 			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
4178 				free_extent_buffer(tmp);
4179 				break;
4180 			}
4181 			if (tmp)
4182 				free_extent_buffer(tmp);
4183 			slot++;
4184 		}
4185 find_next_key:
4186 		/*
4187 		 * we didn't find a candidate key in this node, walk forward
4188 		 * and find another one
4189 		 */
4190 		if (slot >= nritems) {
4191 			path->slots[level] = slot;
4192 			btrfs_set_path_blocking(path);
4193 			sret = btrfs_find_next_key(root, path, min_key, level,
4194 						  cache_only, min_trans);
4195 			if (sret == 0) {
4196 				btrfs_release_path(root, path);
4197 				goto again;
4198 			} else {
4199 				goto out;
4200 			}
4201 		}
4202 		/* save our key so we can return it to the caller */
4203 		btrfs_node_key_to_cpu(cur, &found_key, slot);
4204 		path->slots[level] = slot;
4205 		if (level == path->lowest_level) {
4206 			ret = 0;
4207 			unlock_up(path, level, 1);
4208 			goto out;
4209 		}
4210 		btrfs_set_path_blocking(path);
4211 		cur = read_node_slot(root, cur, slot);
4212 
4213 		btrfs_tree_lock(cur);
4214 
4215 		path->locks[level - 1] = 1;
4216 		path->nodes[level - 1] = cur;
4217 		unlock_up(path, level, 1);
4218 		btrfs_clear_path_blocking(path, NULL);
4219 	}
4220 out:
4221 	if (ret == 0)
4222 		memcpy(min_key, &found_key, sizeof(found_key));
4223 	btrfs_set_path_blocking(path);
4224 	return ret;
4225 }
4226 
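/*
 * Example (sketch): scanning for everything modified since a given
 * transid, in the style of the defrag and tree-log callers.  The key
 * advance at the bottom is simplified and assumes offset never wraps:
 *
 *	path->keep_locks = 1;
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (btrfs_search_forward(root, &min_key, NULL, path,
 *				    0, min_trans) == 0) {
 *		... process what the path now points to ...
 *		btrfs_release_path(root, path);
 *		min_key.offset++;
 *	}
 */
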
4227 /*
4228  * this is similar to btrfs_next_leaf, but does not try to preserve
4229  * and fix up the path.  It looks for and returns the next key in the
4230  * tree based on the current path and the cache_only and min_trans
4231  * parameters.
4232  *
4233  * 0 is returned if another key is found, < 0 if there are any errors
4234  * and 1 is returned if there are no higher keys in the tree
4235  *
4236  * path->keep_locks should be set to 1 on the search made before
4237  * calling this function.
4238  */
4239 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4240 			struct btrfs_key *key, int level,
4241 			int cache_only, u64 min_trans)
4242 {
4243 	int slot;
4244 	struct extent_buffer *c;
4245 
4246 	WARN_ON(!path->keep_locks);
4247 	while (level < BTRFS_MAX_LEVEL) {
4248 		if (!path->nodes[level])
4249 			return 1;
4250 
4251 		slot = path->slots[level] + 1;
4252 		c = path->nodes[level];
4253 next:
4254 		if (slot >= btrfs_header_nritems(c)) {
4255 			int ret;
4256 			int orig_lowest;
4257 			struct btrfs_key cur_key;
4258 			if (level + 1 >= BTRFS_MAX_LEVEL ||
4259 			    !path->nodes[level + 1])
4260 				return 1;
4261 
4262 			if (path->locks[level + 1]) {
4263 				level++;
4264 				continue;
4265 			}
4266 
4267 			slot = btrfs_header_nritems(c) - 1;
4268 			if (level == 0)
4269 				btrfs_item_key_to_cpu(c, &cur_key, slot);
4270 			else
4271 				btrfs_node_key_to_cpu(c, &cur_key, slot);
4272 
4273 			orig_lowest = path->lowest_level;
4274 			btrfs_release_path(root, path);
4275 			path->lowest_level = level;
4276 			ret = btrfs_search_slot(NULL, root, &cur_key, path,
4277 						0, 0);
4278 			path->lowest_level = orig_lowest;
4279 			if (ret < 0)
4280 				return ret;
4281 
4282 			c = path->nodes[level];
4283 			slot = path->slots[level];
4284 			if (ret == 0)
4285 				slot++;
4286 			goto next;
4287 		}
4288 
4289 		if (level == 0)
4290 			btrfs_item_key_to_cpu(c, key, slot);
4291 		else {
4292 			u64 blockptr = btrfs_node_blockptr(c, slot);
4293 			u64 gen = btrfs_node_ptr_generation(c, slot);
4294 
4295 			if (cache_only) {
4296 				struct extent_buffer *cur;
4297 				cur = btrfs_find_tree_block(root, blockptr,
4298 					    btrfs_level_size(root, level - 1));
4299 				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
4300 					slot++;
4301 					if (cur)
4302 						free_extent_buffer(cur);
4303 					goto next;
4304 				}
4305 				free_extent_buffer(cur);
4306 			}
4307 			if (gen < min_trans) {
4308 				slot++;
4309 				goto next;
4310 			}
4311 			btrfs_node_key_to_cpu(c, key, slot);
4312 		}
4313 		return 0;
4314 	}
4315 	return 1;
4316 }
4317 
4318 /*
4319  * search the tree again to find a leaf with greater keys
4320  * returns 0 if it found something or 1 if there are no greater leaves.
4321  * returns < 0 on io errors.
4322  */
4323 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4324 {
4325 	int slot;
4326 	int level;
4327 	struct extent_buffer *c;
4328 	struct extent_buffer *next;
4329 	struct btrfs_key key;
4330 	u32 nritems;
4331 	int ret;
4332 	int old_spinning = path->leave_spinning;
4333 	int force_blocking = 0;
4334 
4335 	nritems = btrfs_header_nritems(path->nodes[0]);
4336 	if (nritems == 0)
4337 		return 1;
4338 
4339 	/*
4340 	 * we take the blocks in an order that upsets lockdep.  Using
4341 	 * blocking mode is the only way around it.
4342 	 */
4343 #ifdef CONFIG_DEBUG_LOCK_ALLOC
4344 	force_blocking = 1;
4345 #endif
4346 
4347 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4348 again:
4349 	level = 1;
4350 	next = NULL;
4351 	btrfs_release_path(root, path);
4352 
4353 	path->keep_locks = 1;
4354 
4355 	if (!force_blocking)
4356 		path->leave_spinning = 1;
4357 
4358 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4359 	path->keep_locks = 0;
4360 
4361 	if (ret < 0)
4362 		return ret;
4363 
4364 	nritems = btrfs_header_nritems(path->nodes[0]);
4365 	/*
4366 	 * by releasing the path above we dropped all our locks.  A balance
4367 	 * could have added more items next to the key that used to be
4368 	 * at the very end of the block.  So, check again here and
4369 	 * advance the path if there are now more items available.
4370 	 */
4371 	if (nritems > 0 && path->slots[0] < nritems - 1) {
4372 		if (ret == 0)
4373 			path->slots[0]++;
4374 		ret = 0;
4375 		goto done;
4376 	}
4377 
4378 	while (level < BTRFS_MAX_LEVEL) {
4379 		if (!path->nodes[level]) {
4380 			ret = 1;
4381 			goto done;
4382 		}
4383 
4384 		slot = path->slots[level] + 1;
4385 		c = path->nodes[level];
4386 		if (slot >= btrfs_header_nritems(c)) {
4387 			level++;
4388 			if (level == BTRFS_MAX_LEVEL) {
4389 				ret = 1;
4390 				goto done;
4391 			}
4392 			continue;
4393 		}
4394 
4395 		if (next) {
4396 			btrfs_tree_unlock(next);
4397 			free_extent_buffer(next);
4398 		}
4399 
4400 		next = c;
4401 		ret = read_block_for_search(NULL, root, path, &next, level,
4402 					    slot, &key);
4403 		if (ret == -EAGAIN)
4404 			goto again;
4405 
4406 		if (ret < 0) {
4407 			btrfs_release_path(root, path);
4408 			goto done;
4409 		}
4410 
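		/*
		 * try the cheap spinning lock first; if it is contended
		 * we switch the path to blocking and take the lock the
		 * slow way, so we can sleep instead of spinning
		 */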
4411 		if (!path->skip_locking) {
4412 			ret = btrfs_try_spin_lock(next);
4413 			if (!ret) {
4414 				btrfs_set_path_blocking(path);
4415 				btrfs_tree_lock(next);
4416 				if (!force_blocking)
4417 					btrfs_clear_path_blocking(path, next);
4418 			}
4419 			if (force_blocking)
4420 				btrfs_set_lock_blocking(next);
4421 		}
4422 		break;
4423 	}
4424 	path->slots[level] = slot;
4425 	while (1) {
4426 		level--;
4427 		c = path->nodes[level];
4428 		if (path->locks[level])
4429 			btrfs_tree_unlock(c);
4430 
4431 		free_extent_buffer(c);
4432 		path->nodes[level] = next;
4433 		path->slots[level] = 0;
4434 		if (!path->skip_locking)
4435 			path->locks[level] = 1;
4436 
4437 		if (!level)
4438 			break;
4439 
4440 		ret = read_block_for_search(NULL, root, path, &next, level,
4441 					    0, &key);
4442 		if (ret == -EAGAIN)
4443 			goto again;
4444 
4445 		if (ret < 0) {
4446 			btrfs_release_path(root, path);
4447 			goto done;
4448 		}
4449 
4450 		if (!path->skip_locking) {
4451 			btrfs_assert_tree_locked(path->nodes[level]);
4452 			ret = btrfs_try_spin_lock(next);
4453 			if (!ret) {
4454 				btrfs_set_path_blocking(path);
4455 				btrfs_tree_lock(next);
4456 				if (!force_blocking)
4457 					btrfs_clear_path_blocking(path, next);
4458 			}
4459 			if (force_blocking)
4460 				btrfs_set_lock_blocking(next);
4461 		}
4462 	}
4463 	ret = 0;
4464 done:
4465 	unlock_up(path, 0, 1);
4466 	path->leave_spinning = old_spinning;
4467 	if (!old_spinning)
4468 		btrfs_set_path_blocking(path);
4469 
4470 	return ret;
4471 }
4472 
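/*
 * Example (sketch): the usual forward scan over every item in a tree,
 * advancing to the next leaf whenever the current one runs out:
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (1) {
 *		leaf = path->nodes[0];
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)	// 1: done, < 0: error
 *				break;
 *			continue;
 *		}
 *		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 *		... handle the item ...
 *		path->slots[0]++;
 *	}
 */
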
4473 /*
4474  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4475  * searching until it gets past min_objectid or finds an item of 'type'
4476  *
4477  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4478  */
4479 int btrfs_previous_item(struct btrfs_root *root,
4480 			struct btrfs_path *path, u64 min_objectid,
4481 			int type)
4482 {
4483 	struct btrfs_key found_key;
4484 	struct extent_buffer *leaf;
4485 	u32 nritems;
4486 	int ret;
4487 
4488 	while (1) {
4489 		if (path->slots[0] == 0) {
4490 			btrfs_set_path_blocking(path);
4491 			ret = btrfs_prev_leaf(root, path);
4492 			if (ret != 0)
4493 				return ret;
4494 		} else {
4495 			path->slots[0]--;
4496 		}
4497 		leaf = path->nodes[0];
4498 		nritems = btrfs_header_nritems(leaf);
4499 		if (nritems == 0)
4500 			return 1;
4501 		if (path->slots[0] == nritems)
4502 			path->slots[0]--;
4503 
4504 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4505 		if (found_key.objectid < min_objectid)
4506 			break;
4507 		if (found_key.type == type)
4508 			return 0;
4509 		if (found_key.objectid == min_objectid &&
4510 		    found_key.type < type)
4511 			break;
4512 	}
4513 	return 1;
4514 }
4515
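/*
 * Example (sketch): finding the extent item that precedes a given
 * objectid, in the style of callers in extent-tree.c.  Assumes the path
 * was positioned by a prior search at or past that objectid:
 *
 *	ret = btrfs_previous_item(root, path, objectid,
 *				  BTRFS_EXTENT_ITEM_KEY);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */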