1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007,2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/rbtree.h> 9 #include <linux/mm.h> 10 #include <linux/error-injection.h> 11 #include "ctree.h" 12 #include "disk-io.h" 13 #include "transaction.h" 14 #include "print-tree.h" 15 #include "locking.h" 16 #include "volumes.h" 17 #include "qgroup.h" 18 #include "tree-mod-log.h" 19 #include "tree-checker.h" 20 21 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root 22 *root, struct btrfs_path *path, int level); 23 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, 24 const struct btrfs_key *ins_key, struct btrfs_path *path, 25 int data_size, int extend); 26 static int push_node_left(struct btrfs_trans_handle *trans, 27 struct extent_buffer *dst, 28 struct extent_buffer *src, int empty); 29 static int balance_node_right(struct btrfs_trans_handle *trans, 30 struct extent_buffer *dst_buf, 31 struct extent_buffer *src_buf); 32 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 33 int level, int slot); 34 35 static const struct btrfs_csums { 36 u16 size; 37 const char name[10]; 38 const char driver[12]; 39 } btrfs_csums[] = { 40 [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, 41 [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, 42 [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, 43 [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", 44 .driver = "blake2b-256" }, 45 }; 46 47 int btrfs_super_csum_size(const struct btrfs_super_block *s) 48 { 49 u16 t = btrfs_super_csum_type(s); 50 /* 51 * csum type is validated at mount time 52 */ 53 return btrfs_csums[t].size; 54 } 55 56 const char *btrfs_super_csum_name(u16 csum_type) 57 { 58 /* csum type is validated at mount time */ 59 return btrfs_csums[csum_type].name; 60 } 61 62 /* 63 * Return driver name if defined, otherwise the name that's also a valid driver 64 * name 65 */ 66 const char *btrfs_super_csum_driver(u16 csum_type) 67 { 68 /* csum type is validated at mount time */ 69 return btrfs_csums[csum_type].driver[0] ? 70 btrfs_csums[csum_type].driver : 71 btrfs_csums[csum_type].name; 72 } 73 74 size_t __attribute_const__ btrfs_get_num_csums(void) 75 { 76 return ARRAY_SIZE(btrfs_csums); 77 } 78 79 struct btrfs_path *btrfs_alloc_path(void) 80 { 81 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); 82 } 83 84 /* this also releases the path */ 85 void btrfs_free_path(struct btrfs_path *p) 86 { 87 if (!p) 88 return; 89 btrfs_release_path(p); 90 kmem_cache_free(btrfs_path_cachep, p); 91 } 92 93 /* 94 * path release drops references on the extent buffers in the path 95 * and it drops any locks held by this path 96 * 97 * It is safe to call this on paths that no locks or extent buffers held. 98 */ 99 noinline void btrfs_release_path(struct btrfs_path *p) 100 { 101 int i; 102 103 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 104 p->slots[i] = 0; 105 if (!p->nodes[i]) 106 continue; 107 if (p->locks[i]) { 108 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); 109 p->locks[i] = 0; 110 } 111 free_extent_buffer(p->nodes[i]); 112 p->nodes[i] = NULL; 113 } 114 } 115 116 /* 117 * We want the transaction abort to print stack trace only for errors where the 118 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are 119 * caused by external factors. 120 */ 121 bool __cold abort_should_print_stack(int errno) 122 { 123 switch (errno) { 124 case -EIO: 125 case -EROFS: 126 case -ENOMEM: 127 return false; 128 } 129 return true; 130 } 131 132 /* 133 * safely gets a reference on the root node of a tree. A lock 134 * is not taken, so a concurrent writer may put a different node 135 * at the root of the tree. See btrfs_lock_root_node for the 136 * looping required. 137 * 138 * The extent buffer returned by this has a reference taken, so 139 * it won't disappear. It may stop being the root of the tree 140 * at any time because there are no locks held. 141 */ 142 struct extent_buffer *btrfs_root_node(struct btrfs_root *root) 143 { 144 struct extent_buffer *eb; 145 146 while (1) { 147 rcu_read_lock(); 148 eb = rcu_dereference(root->node); 149 150 /* 151 * RCU really hurts here, we could free up the root node because 152 * it was COWed but we may not get the new root node yet so do 153 * the inc_not_zero dance and if it doesn't work then 154 * synchronize_rcu and try again. 155 */ 156 if (atomic_inc_not_zero(&eb->refs)) { 157 rcu_read_unlock(); 158 break; 159 } 160 rcu_read_unlock(); 161 synchronize_rcu(); 162 } 163 return eb; 164 } 165 166 /* 167 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), 168 * just get put onto a simple dirty list. Transaction walks this list to make 169 * sure they get properly updated on disk. 170 */ 171 static void add_root_to_dirty_list(struct btrfs_root *root) 172 { 173 struct btrfs_fs_info *fs_info = root->fs_info; 174 175 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || 176 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) 177 return; 178 179 spin_lock(&fs_info->trans_lock); 180 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { 181 /* Want the extent tree to be the last on the list */ 182 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) 183 list_move_tail(&root->dirty_list, 184 &fs_info->dirty_cowonly_roots); 185 else 186 list_move(&root->dirty_list, 187 &fs_info->dirty_cowonly_roots); 188 } 189 spin_unlock(&fs_info->trans_lock); 190 } 191 192 /* 193 * used by snapshot creation to make a copy of a root for a tree with 194 * a given objectid. The buffer with the new root node is returned in 195 * cow_ret, and this func returns zero on success or a negative error code. 196 */ 197 int btrfs_copy_root(struct btrfs_trans_handle *trans, 198 struct btrfs_root *root, 199 struct extent_buffer *buf, 200 struct extent_buffer **cow_ret, u64 new_root_objectid) 201 { 202 struct btrfs_fs_info *fs_info = root->fs_info; 203 struct extent_buffer *cow; 204 int ret = 0; 205 int level; 206 struct btrfs_disk_key disk_key; 207 208 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 209 trans->transid != fs_info->running_transaction->transid); 210 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 211 trans->transid != root->last_trans); 212 213 level = btrfs_header_level(buf); 214 if (level == 0) 215 btrfs_item_key(buf, &disk_key, 0); 216 else 217 btrfs_node_key(buf, &disk_key, 0); 218 219 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, 220 &disk_key, level, buf->start, 0, 221 BTRFS_NESTING_NEW_ROOT); 222 if (IS_ERR(cow)) 223 return PTR_ERR(cow); 224 225 copy_extent_buffer_full(cow, buf); 226 btrfs_set_header_bytenr(cow, cow->start); 227 btrfs_set_header_generation(cow, trans->transid); 228 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 229 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 230 BTRFS_HEADER_FLAG_RELOC); 231 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 232 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 233 else 234 btrfs_set_header_owner(cow, new_root_objectid); 235 236 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 237 238 WARN_ON(btrfs_header_generation(buf) > trans->transid); 239 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 240 ret = btrfs_inc_ref(trans, root, cow, 1); 241 else 242 ret = btrfs_inc_ref(trans, root, cow, 0); 243 if (ret) { 244 btrfs_tree_unlock(cow); 245 free_extent_buffer(cow); 246 btrfs_abort_transaction(trans, ret); 247 return ret; 248 } 249 250 btrfs_mark_buffer_dirty(cow); 251 *cow_ret = cow; 252 return 0; 253 } 254 255 /* 256 * check if the tree block can be shared by multiple trees 257 */ 258 int btrfs_block_can_be_shared(struct btrfs_root *root, 259 struct extent_buffer *buf) 260 { 261 /* 262 * Tree blocks not in shareable trees and tree roots are never shared. 263 * If a block was allocated after the last snapshot and the block was 264 * not allocated by tree relocation, we know the block is not shared. 265 */ 266 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 267 buf != root->node && buf != root->commit_root && 268 (btrfs_header_generation(buf) <= 269 btrfs_root_last_snapshot(&root->root_item) || 270 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) 271 return 1; 272 273 return 0; 274 } 275 276 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 277 struct btrfs_root *root, 278 struct extent_buffer *buf, 279 struct extent_buffer *cow, 280 int *last_ref) 281 { 282 struct btrfs_fs_info *fs_info = root->fs_info; 283 u64 refs; 284 u64 owner; 285 u64 flags; 286 u64 new_flags = 0; 287 int ret; 288 289 /* 290 * Backrefs update rules: 291 * 292 * Always use full backrefs for extent pointers in tree block 293 * allocated by tree relocation. 294 * 295 * If a shared tree block is no longer referenced by its owner 296 * tree (btrfs_header_owner(buf) == root->root_key.objectid), 297 * use full backrefs for extent pointers in tree block. 298 * 299 * If a tree block is been relocating 300 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), 301 * use full backrefs for extent pointers in tree block. 302 * The reason for this is some operations (such as drop tree) 303 * are only allowed for blocks use full backrefs. 304 */ 305 306 if (btrfs_block_can_be_shared(root, buf)) { 307 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, 308 btrfs_header_level(buf), 1, 309 &refs, &flags); 310 if (ret) 311 return ret; 312 if (refs == 0) { 313 ret = -EROFS; 314 btrfs_handle_fs_error(fs_info, ret, NULL); 315 return ret; 316 } 317 } else { 318 refs = 1; 319 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || 320 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 321 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 322 else 323 flags = 0; 324 } 325 326 owner = btrfs_header_owner(buf); 327 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && 328 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 329 330 if (refs > 1) { 331 if ((owner == root->root_key.objectid || 332 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 333 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 334 ret = btrfs_inc_ref(trans, root, buf, 1); 335 if (ret) 336 return ret; 337 338 if (root->root_key.objectid == 339 BTRFS_TREE_RELOC_OBJECTID) { 340 ret = btrfs_dec_ref(trans, root, buf, 0); 341 if (ret) 342 return ret; 343 ret = btrfs_inc_ref(trans, root, cow, 1); 344 if (ret) 345 return ret; 346 } 347 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 348 } else { 349 350 if (root->root_key.objectid == 351 BTRFS_TREE_RELOC_OBJECTID) 352 ret = btrfs_inc_ref(trans, root, cow, 1); 353 else 354 ret = btrfs_inc_ref(trans, root, cow, 0); 355 if (ret) 356 return ret; 357 } 358 if (new_flags != 0) { 359 int level = btrfs_header_level(buf); 360 361 ret = btrfs_set_disk_extent_flags(trans, buf, 362 new_flags, level); 363 if (ret) 364 return ret; 365 } 366 } else { 367 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 368 if (root->root_key.objectid == 369 BTRFS_TREE_RELOC_OBJECTID) 370 ret = btrfs_inc_ref(trans, root, cow, 1); 371 else 372 ret = btrfs_inc_ref(trans, root, cow, 0); 373 if (ret) 374 return ret; 375 ret = btrfs_dec_ref(trans, root, buf, 1); 376 if (ret) 377 return ret; 378 } 379 btrfs_clean_tree_block(buf); 380 *last_ref = 1; 381 } 382 return 0; 383 } 384 385 /* 386 * does the dirty work in cow of a single block. The parent block (if 387 * supplied) is updated to point to the new cow copy. The new buffer is marked 388 * dirty and returned locked. If you modify the block it needs to be marked 389 * dirty again. 390 * 391 * search_start -- an allocation hint for the new block 392 * 393 * empty_size -- a hint that you plan on doing more cow. This is the size in 394 * bytes the allocator should try to find free next to the block it returns. 395 * This is just a hint and may be ignored by the allocator. 396 */ 397 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 398 struct btrfs_root *root, 399 struct extent_buffer *buf, 400 struct extent_buffer *parent, int parent_slot, 401 struct extent_buffer **cow_ret, 402 u64 search_start, u64 empty_size, 403 enum btrfs_lock_nesting nest) 404 { 405 struct btrfs_fs_info *fs_info = root->fs_info; 406 struct btrfs_disk_key disk_key; 407 struct extent_buffer *cow; 408 int level, ret; 409 int last_ref = 0; 410 int unlock_orig = 0; 411 u64 parent_start = 0; 412 413 if (*cow_ret == buf) 414 unlock_orig = 1; 415 416 btrfs_assert_tree_write_locked(buf); 417 418 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 419 trans->transid != fs_info->running_transaction->transid); 420 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && 421 trans->transid != root->last_trans); 422 423 level = btrfs_header_level(buf); 424 425 if (level == 0) 426 btrfs_item_key(buf, &disk_key, 0); 427 else 428 btrfs_node_key(buf, &disk_key, 0); 429 430 if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) 431 parent_start = parent->start; 432 433 cow = btrfs_alloc_tree_block(trans, root, parent_start, 434 root->root_key.objectid, &disk_key, level, 435 search_start, empty_size, nest); 436 if (IS_ERR(cow)) 437 return PTR_ERR(cow); 438 439 /* cow is set to blocking by btrfs_init_new_buffer */ 440 441 copy_extent_buffer_full(cow, buf); 442 btrfs_set_header_bytenr(cow, cow->start); 443 btrfs_set_header_generation(cow, trans->transid); 444 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 445 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 446 BTRFS_HEADER_FLAG_RELOC); 447 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 448 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 449 else 450 btrfs_set_header_owner(cow, root->root_key.objectid); 451 452 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); 453 454 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); 455 if (ret) { 456 btrfs_tree_unlock(cow); 457 free_extent_buffer(cow); 458 btrfs_abort_transaction(trans, ret); 459 return ret; 460 } 461 462 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 463 ret = btrfs_reloc_cow_block(trans, root, buf, cow); 464 if (ret) { 465 btrfs_tree_unlock(cow); 466 free_extent_buffer(cow); 467 btrfs_abort_transaction(trans, ret); 468 return ret; 469 } 470 } 471 472 if (buf == root->node) { 473 WARN_ON(parent && parent != buf); 474 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || 475 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 476 parent_start = buf->start; 477 478 atomic_inc(&cow->refs); 479 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); 480 BUG_ON(ret < 0); 481 rcu_assign_pointer(root->node, cow); 482 483 btrfs_free_tree_block(trans, btrfs_root_id(root), buf, 484 parent_start, last_ref); 485 free_extent_buffer(buf); 486 add_root_to_dirty_list(root); 487 } else { 488 WARN_ON(trans->transid != btrfs_header_generation(parent)); 489 btrfs_tree_mod_log_insert_key(parent, parent_slot, 490 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 491 btrfs_set_node_blockptr(parent, parent_slot, 492 cow->start); 493 btrfs_set_node_ptr_generation(parent, parent_slot, 494 trans->transid); 495 btrfs_mark_buffer_dirty(parent); 496 if (last_ref) { 497 ret = btrfs_tree_mod_log_free_eb(buf); 498 if (ret) { 499 btrfs_tree_unlock(cow); 500 free_extent_buffer(cow); 501 btrfs_abort_transaction(trans, ret); 502 return ret; 503 } 504 } 505 btrfs_free_tree_block(trans, btrfs_root_id(root), buf, 506 parent_start, last_ref); 507 } 508 if (unlock_orig) 509 btrfs_tree_unlock(buf); 510 free_extent_buffer_stale(buf); 511 btrfs_mark_buffer_dirty(cow); 512 *cow_ret = cow; 513 return 0; 514 } 515 516 static inline int should_cow_block(struct btrfs_trans_handle *trans, 517 struct btrfs_root *root, 518 struct extent_buffer *buf) 519 { 520 if (btrfs_is_testing(root->fs_info)) 521 return 0; 522 523 /* Ensure we can see the FORCE_COW bit */ 524 smp_mb__before_atomic(); 525 526 /* 527 * We do not need to cow a block if 528 * 1) this block is not created or changed in this transaction; 529 * 2) this block does not belong to TREE_RELOC tree; 530 * 3) the root is not forced COW. 531 * 532 * What is forced COW: 533 * when we create snapshot during committing the transaction, 534 * after we've finished copying src root, we must COW the shared 535 * block to ensure the metadata consistency. 536 */ 537 if (btrfs_header_generation(buf) == trans->transid && 538 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && 539 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 540 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && 541 !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) 542 return 0; 543 return 1; 544 } 545 546 /* 547 * cows a single block, see __btrfs_cow_block for the real work. 548 * This version of it has extra checks so that a block isn't COWed more than 549 * once per transaction, as long as it hasn't been written yet 550 */ 551 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, 552 struct btrfs_root *root, struct extent_buffer *buf, 553 struct extent_buffer *parent, int parent_slot, 554 struct extent_buffer **cow_ret, 555 enum btrfs_lock_nesting nest) 556 { 557 struct btrfs_fs_info *fs_info = root->fs_info; 558 u64 search_start; 559 int ret; 560 561 if (test_bit(BTRFS_ROOT_DELETING, &root->state)) 562 btrfs_err(fs_info, 563 "COW'ing blocks on a fs root that's being dropped"); 564 565 if (trans->transaction != fs_info->running_transaction) 566 WARN(1, KERN_CRIT "trans %llu running %llu\n", 567 trans->transid, 568 fs_info->running_transaction->transid); 569 570 if (trans->transid != fs_info->generation) 571 WARN(1, KERN_CRIT "trans %llu running %llu\n", 572 trans->transid, fs_info->generation); 573 574 if (!should_cow_block(trans, root, buf)) { 575 *cow_ret = buf; 576 return 0; 577 } 578 579 search_start = buf->start & ~((u64)SZ_1G - 1); 580 581 /* 582 * Before CoWing this block for later modification, check if it's 583 * the subtree root and do the delayed subtree trace if needed. 584 * 585 * Also We don't care about the error, as it's handled internally. 586 */ 587 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); 588 ret = __btrfs_cow_block(trans, root, buf, parent, 589 parent_slot, cow_ret, search_start, 0, nest); 590 591 trace_btrfs_cow_block(root, buf, *cow_ret); 592 593 return ret; 594 } 595 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); 596 597 /* 598 * helper function for defrag to decide if two blocks pointed to by a 599 * node are actually close by 600 */ 601 static int close_blocks(u64 blocknr, u64 other, u32 blocksize) 602 { 603 if (blocknr < other && other - (blocknr + blocksize) < 32768) 604 return 1; 605 if (blocknr > other && blocknr - (other + blocksize) < 32768) 606 return 1; 607 return 0; 608 } 609 610 #ifdef __LITTLE_ENDIAN 611 612 /* 613 * Compare two keys, on little-endian the disk order is same as CPU order and 614 * we can avoid the conversion. 615 */ 616 static int comp_keys(const struct btrfs_disk_key *disk_key, 617 const struct btrfs_key *k2) 618 { 619 const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; 620 621 return btrfs_comp_cpu_keys(k1, k2); 622 } 623 624 #else 625 626 /* 627 * compare two keys in a memcmp fashion 628 */ 629 static int comp_keys(const struct btrfs_disk_key *disk, 630 const struct btrfs_key *k2) 631 { 632 struct btrfs_key k1; 633 634 btrfs_disk_key_to_cpu(&k1, disk); 635 636 return btrfs_comp_cpu_keys(&k1, k2); 637 } 638 #endif 639 640 /* 641 * same as comp_keys only with two btrfs_key's 642 */ 643 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) 644 { 645 if (k1->objectid > k2->objectid) 646 return 1; 647 if (k1->objectid < k2->objectid) 648 return -1; 649 if (k1->type > k2->type) 650 return 1; 651 if (k1->type < k2->type) 652 return -1; 653 if (k1->offset > k2->offset) 654 return 1; 655 if (k1->offset < k2->offset) 656 return -1; 657 return 0; 658 } 659 660 /* 661 * this is used by the defrag code to go through all the 662 * leaves pointed to by a node and reallocate them so that 663 * disk order is close to key order 664 */ 665 int btrfs_realloc_node(struct btrfs_trans_handle *trans, 666 struct btrfs_root *root, struct extent_buffer *parent, 667 int start_slot, u64 *last_ret, 668 struct btrfs_key *progress) 669 { 670 struct btrfs_fs_info *fs_info = root->fs_info; 671 struct extent_buffer *cur; 672 u64 blocknr; 673 u64 search_start = *last_ret; 674 u64 last_block = 0; 675 u64 other; 676 u32 parent_nritems; 677 int end_slot; 678 int i; 679 int err = 0; 680 u32 blocksize; 681 int progress_passed = 0; 682 struct btrfs_disk_key disk_key; 683 684 WARN_ON(trans->transaction != fs_info->running_transaction); 685 WARN_ON(trans->transid != fs_info->generation); 686 687 parent_nritems = btrfs_header_nritems(parent); 688 blocksize = fs_info->nodesize; 689 end_slot = parent_nritems - 1; 690 691 if (parent_nritems <= 1) 692 return 0; 693 694 for (i = start_slot; i <= end_slot; i++) { 695 int close = 1; 696 697 btrfs_node_key(parent, &disk_key, i); 698 if (!progress_passed && comp_keys(&disk_key, progress) < 0) 699 continue; 700 701 progress_passed = 1; 702 blocknr = btrfs_node_blockptr(parent, i); 703 if (last_block == 0) 704 last_block = blocknr; 705 706 if (i > 0) { 707 other = btrfs_node_blockptr(parent, i - 1); 708 close = close_blocks(blocknr, other, blocksize); 709 } 710 if (!close && i < end_slot) { 711 other = btrfs_node_blockptr(parent, i + 1); 712 close = close_blocks(blocknr, other, blocksize); 713 } 714 if (close) { 715 last_block = blocknr; 716 continue; 717 } 718 719 cur = btrfs_read_node_slot(parent, i); 720 if (IS_ERR(cur)) 721 return PTR_ERR(cur); 722 if (search_start == 0) 723 search_start = last_block; 724 725 btrfs_tree_lock(cur); 726 err = __btrfs_cow_block(trans, root, cur, parent, i, 727 &cur, search_start, 728 min(16 * blocksize, 729 (end_slot - i) * blocksize), 730 BTRFS_NESTING_COW); 731 if (err) { 732 btrfs_tree_unlock(cur); 733 free_extent_buffer(cur); 734 break; 735 } 736 search_start = cur->start; 737 last_block = cur->start; 738 *last_ret = search_start; 739 btrfs_tree_unlock(cur); 740 free_extent_buffer(cur); 741 } 742 return err; 743 } 744 745 /* 746 * Search for a key in the given extent_buffer. 747 * 748 * The lower boundary for the search is specified by the slot number @low. Use a 749 * value of 0 to search over the whole extent buffer. 750 * 751 * The slot in the extent buffer is returned via @slot. If the key exists in the 752 * extent buffer, then @slot will point to the slot where the key is, otherwise 753 * it points to the slot where you would insert the key. 754 * 755 * Slot may point to the total number of items (i.e. one position beyond the last 756 * key) if the key is bigger than the last key in the extent buffer. 757 */ 758 static noinline int generic_bin_search(struct extent_buffer *eb, int low, 759 const struct btrfs_key *key, int *slot) 760 { 761 unsigned long p; 762 int item_size; 763 int high = btrfs_header_nritems(eb); 764 int ret; 765 const int key_size = sizeof(struct btrfs_disk_key); 766 767 if (low > high) { 768 btrfs_err(eb->fs_info, 769 "%s: low (%d) > high (%d) eb %llu owner %llu level %d", 770 __func__, low, high, eb->start, 771 btrfs_header_owner(eb), btrfs_header_level(eb)); 772 return -EINVAL; 773 } 774 775 if (btrfs_header_level(eb) == 0) { 776 p = offsetof(struct btrfs_leaf, items); 777 item_size = sizeof(struct btrfs_item); 778 } else { 779 p = offsetof(struct btrfs_node, ptrs); 780 item_size = sizeof(struct btrfs_key_ptr); 781 } 782 783 while (low < high) { 784 unsigned long oip; 785 unsigned long offset; 786 struct btrfs_disk_key *tmp; 787 struct btrfs_disk_key unaligned; 788 int mid; 789 790 mid = (low + high) / 2; 791 offset = p + mid * item_size; 792 oip = offset_in_page(offset); 793 794 if (oip + key_size <= PAGE_SIZE) { 795 const unsigned long idx = get_eb_page_index(offset); 796 char *kaddr = page_address(eb->pages[idx]); 797 798 oip = get_eb_offset_in_page(eb, offset); 799 tmp = (struct btrfs_disk_key *)(kaddr + oip); 800 } else { 801 read_extent_buffer(eb, &unaligned, offset, key_size); 802 tmp = &unaligned; 803 } 804 805 ret = comp_keys(tmp, key); 806 807 if (ret < 0) 808 low = mid + 1; 809 else if (ret > 0) 810 high = mid; 811 else { 812 *slot = mid; 813 return 0; 814 } 815 } 816 *slot = low; 817 return 1; 818 } 819 820 /* 821 * Simple binary search on an extent buffer. Works for both leaves and nodes, and 822 * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). 823 */ 824 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, 825 int *slot) 826 { 827 return generic_bin_search(eb, 0, key, slot); 828 } 829 830 static void root_add_used(struct btrfs_root *root, u32 size) 831 { 832 spin_lock(&root->accounting_lock); 833 btrfs_set_root_used(&root->root_item, 834 btrfs_root_used(&root->root_item) + size); 835 spin_unlock(&root->accounting_lock); 836 } 837 838 static void root_sub_used(struct btrfs_root *root, u32 size) 839 { 840 spin_lock(&root->accounting_lock); 841 btrfs_set_root_used(&root->root_item, 842 btrfs_root_used(&root->root_item) - size); 843 spin_unlock(&root->accounting_lock); 844 } 845 846 /* given a node and slot number, this reads the blocks it points to. The 847 * extent buffer is returned with a reference taken (but unlocked). 848 */ 849 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, 850 int slot) 851 { 852 int level = btrfs_header_level(parent); 853 struct extent_buffer *eb; 854 struct btrfs_key first_key; 855 856 if (slot < 0 || slot >= btrfs_header_nritems(parent)) 857 return ERR_PTR(-ENOENT); 858 859 BUG_ON(level == 0); 860 861 btrfs_node_key_to_cpu(parent, &first_key, slot); 862 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), 863 btrfs_header_owner(parent), 864 btrfs_node_ptr_generation(parent, slot), 865 level - 1, &first_key); 866 if (IS_ERR(eb)) 867 return eb; 868 if (!extent_buffer_uptodate(eb)) { 869 free_extent_buffer(eb); 870 return ERR_PTR(-EIO); 871 } 872 873 return eb; 874 } 875 876 /* 877 * node level balancing, used to make sure nodes are in proper order for 878 * item deletion. We balance from the top down, so we have to make sure 879 * that a deletion won't leave an node completely empty later on. 880 */ 881 static noinline int balance_level(struct btrfs_trans_handle *trans, 882 struct btrfs_root *root, 883 struct btrfs_path *path, int level) 884 { 885 struct btrfs_fs_info *fs_info = root->fs_info; 886 struct extent_buffer *right = NULL; 887 struct extent_buffer *mid; 888 struct extent_buffer *left = NULL; 889 struct extent_buffer *parent = NULL; 890 int ret = 0; 891 int wret; 892 int pslot; 893 int orig_slot = path->slots[level]; 894 u64 orig_ptr; 895 896 ASSERT(level > 0); 897 898 mid = path->nodes[level]; 899 900 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); 901 WARN_ON(btrfs_header_generation(mid) != trans->transid); 902 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 904 905 if (level < BTRFS_MAX_LEVEL - 1) { 906 parent = path->nodes[level + 1]; 907 pslot = path->slots[level + 1]; 908 } 909 910 /* 911 * deal with the case where there is only one pointer in the root 912 * by promoting the node below to a root 913 */ 914 if (!parent) { 915 struct extent_buffer *child; 916 917 if (btrfs_header_nritems(mid) != 1) 918 return 0; 919 920 /* promote the child to a root */ 921 child = btrfs_read_node_slot(mid, 0); 922 if (IS_ERR(child)) { 923 ret = PTR_ERR(child); 924 btrfs_handle_fs_error(fs_info, ret, NULL); 925 goto enospc; 926 } 927 928 btrfs_tree_lock(child); 929 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 930 BTRFS_NESTING_COW); 931 if (ret) { 932 btrfs_tree_unlock(child); 933 free_extent_buffer(child); 934 goto enospc; 935 } 936 937 ret = btrfs_tree_mod_log_insert_root(root->node, child, true); 938 BUG_ON(ret < 0); 939 rcu_assign_pointer(root->node, child); 940 941 add_root_to_dirty_list(root); 942 btrfs_tree_unlock(child); 943 944 path->locks[level] = 0; 945 path->nodes[level] = NULL; 946 btrfs_clean_tree_block(mid); 947 btrfs_tree_unlock(mid); 948 /* once for the path */ 949 free_extent_buffer(mid); 950 951 root_sub_used(root, mid->len); 952 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); 953 /* once for the root ptr */ 954 free_extent_buffer_stale(mid); 955 return 0; 956 } 957 if (btrfs_header_nritems(mid) > 958 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) 959 return 0; 960 961 left = btrfs_read_node_slot(parent, pslot - 1); 962 if (IS_ERR(left)) 963 left = NULL; 964 965 if (left) { 966 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 967 wret = btrfs_cow_block(trans, root, left, 968 parent, pslot - 1, &left, 969 BTRFS_NESTING_LEFT_COW); 970 if (wret) { 971 ret = wret; 972 goto enospc; 973 } 974 } 975 976 right = btrfs_read_node_slot(parent, pslot + 1); 977 if (IS_ERR(right)) 978 right = NULL; 979 980 if (right) { 981 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 982 wret = btrfs_cow_block(trans, root, right, 983 parent, pslot + 1, &right, 984 BTRFS_NESTING_RIGHT_COW); 985 if (wret) { 986 ret = wret; 987 goto enospc; 988 } 989 } 990 991 /* first, try to make some room in the middle buffer */ 992 if (left) { 993 orig_slot += btrfs_header_nritems(left); 994 wret = push_node_left(trans, left, mid, 1); 995 if (wret < 0) 996 ret = wret; 997 } 998 999 /* 1000 * then try to empty the right most buffer into the middle 1001 */ 1002 if (right) { 1003 wret = push_node_left(trans, mid, right, 1); 1004 if (wret < 0 && wret != -ENOSPC) 1005 ret = wret; 1006 if (btrfs_header_nritems(right) == 0) { 1007 btrfs_clean_tree_block(right); 1008 btrfs_tree_unlock(right); 1009 del_ptr(root, path, level + 1, pslot + 1); 1010 root_sub_used(root, right->len); 1011 btrfs_free_tree_block(trans, btrfs_root_id(root), right, 1012 0, 1); 1013 free_extent_buffer_stale(right); 1014 right = NULL; 1015 } else { 1016 struct btrfs_disk_key right_key; 1017 btrfs_node_key(right, &right_key, 0); 1018 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1019 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1020 BUG_ON(ret < 0); 1021 btrfs_set_node_key(parent, &right_key, pslot + 1); 1022 btrfs_mark_buffer_dirty(parent); 1023 } 1024 } 1025 if (btrfs_header_nritems(mid) == 1) { 1026 /* 1027 * we're not allowed to leave a node with one item in the 1028 * tree during a delete. A deletion from lower in the tree 1029 * could try to delete the only pointer in this node. 1030 * So, pull some keys from the left. 1031 * There has to be a left pointer at this point because 1032 * otherwise we would have pulled some pointers from the 1033 * right 1034 */ 1035 if (!left) { 1036 ret = -EROFS; 1037 btrfs_handle_fs_error(fs_info, ret, NULL); 1038 goto enospc; 1039 } 1040 wret = balance_node_right(trans, mid, left); 1041 if (wret < 0) { 1042 ret = wret; 1043 goto enospc; 1044 } 1045 if (wret == 1) { 1046 wret = push_node_left(trans, left, mid, 1); 1047 if (wret < 0) 1048 ret = wret; 1049 } 1050 BUG_ON(wret == 1); 1051 } 1052 if (btrfs_header_nritems(mid) == 0) { 1053 btrfs_clean_tree_block(mid); 1054 btrfs_tree_unlock(mid); 1055 del_ptr(root, path, level + 1, pslot); 1056 root_sub_used(root, mid->len); 1057 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); 1058 free_extent_buffer_stale(mid); 1059 mid = NULL; 1060 } else { 1061 /* update the parent key to reflect our changes */ 1062 struct btrfs_disk_key mid_key; 1063 btrfs_node_key(mid, &mid_key, 0); 1064 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1065 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1066 BUG_ON(ret < 0); 1067 btrfs_set_node_key(parent, &mid_key, pslot); 1068 btrfs_mark_buffer_dirty(parent); 1069 } 1070 1071 /* update the path */ 1072 if (left) { 1073 if (btrfs_header_nritems(left) > orig_slot) { 1074 atomic_inc(&left->refs); 1075 /* left was locked after cow */ 1076 path->nodes[level] = left; 1077 path->slots[level + 1] -= 1; 1078 path->slots[level] = orig_slot; 1079 if (mid) { 1080 btrfs_tree_unlock(mid); 1081 free_extent_buffer(mid); 1082 } 1083 } else { 1084 orig_slot -= btrfs_header_nritems(left); 1085 path->slots[level] = orig_slot; 1086 } 1087 } 1088 /* double check we haven't messed things up */ 1089 if (orig_ptr != 1090 btrfs_node_blockptr(path->nodes[level], path->slots[level])) 1091 BUG(); 1092 enospc: 1093 if (right) { 1094 btrfs_tree_unlock(right); 1095 free_extent_buffer(right); 1096 } 1097 if (left) { 1098 if (path->nodes[level] != left) 1099 btrfs_tree_unlock(left); 1100 free_extent_buffer(left); 1101 } 1102 return ret; 1103 } 1104 1105 /* Node balancing for insertion. Here we only split or push nodes around 1106 * when they are completely full. This is also done top down, so we 1107 * have to be pessimistic. 1108 */ 1109 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, 1110 struct btrfs_root *root, 1111 struct btrfs_path *path, int level) 1112 { 1113 struct btrfs_fs_info *fs_info = root->fs_info; 1114 struct extent_buffer *right = NULL; 1115 struct extent_buffer *mid; 1116 struct extent_buffer *left = NULL; 1117 struct extent_buffer *parent = NULL; 1118 int ret = 0; 1119 int wret; 1120 int pslot; 1121 int orig_slot = path->slots[level]; 1122 1123 if (level == 0) 1124 return 1; 1125 1126 mid = path->nodes[level]; 1127 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1128 1129 if (level < BTRFS_MAX_LEVEL - 1) { 1130 parent = path->nodes[level + 1]; 1131 pslot = path->slots[level + 1]; 1132 } 1133 1134 if (!parent) 1135 return 1; 1136 1137 left = btrfs_read_node_slot(parent, pslot - 1); 1138 if (IS_ERR(left)) 1139 left = NULL; 1140 1141 /* first, try to make some room in the middle buffer */ 1142 if (left) { 1143 u32 left_nr; 1144 1145 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 1146 1147 left_nr = btrfs_header_nritems(left); 1148 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1149 wret = 1; 1150 } else { 1151 ret = btrfs_cow_block(trans, root, left, parent, 1152 pslot - 1, &left, 1153 BTRFS_NESTING_LEFT_COW); 1154 if (ret) 1155 wret = 1; 1156 else { 1157 wret = push_node_left(trans, left, mid, 0); 1158 } 1159 } 1160 if (wret < 0) 1161 ret = wret; 1162 if (wret == 0) { 1163 struct btrfs_disk_key disk_key; 1164 orig_slot += left_nr; 1165 btrfs_node_key(mid, &disk_key, 0); 1166 ret = btrfs_tree_mod_log_insert_key(parent, pslot, 1167 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1168 BUG_ON(ret < 0); 1169 btrfs_set_node_key(parent, &disk_key, pslot); 1170 btrfs_mark_buffer_dirty(parent); 1171 if (btrfs_header_nritems(left) > orig_slot) { 1172 path->nodes[level] = left; 1173 path->slots[level + 1] -= 1; 1174 path->slots[level] = orig_slot; 1175 btrfs_tree_unlock(mid); 1176 free_extent_buffer(mid); 1177 } else { 1178 orig_slot -= 1179 btrfs_header_nritems(left); 1180 path->slots[level] = orig_slot; 1181 btrfs_tree_unlock(left); 1182 free_extent_buffer(left); 1183 } 1184 return 0; 1185 } 1186 btrfs_tree_unlock(left); 1187 free_extent_buffer(left); 1188 } 1189 right = btrfs_read_node_slot(parent, pslot + 1); 1190 if (IS_ERR(right)) 1191 right = NULL; 1192 1193 /* 1194 * then try to empty the right most buffer into the middle 1195 */ 1196 if (right) { 1197 u32 right_nr; 1198 1199 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 1200 1201 right_nr = btrfs_header_nritems(right); 1202 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 1203 wret = 1; 1204 } else { 1205 ret = btrfs_cow_block(trans, root, right, 1206 parent, pslot + 1, 1207 &right, BTRFS_NESTING_RIGHT_COW); 1208 if (ret) 1209 wret = 1; 1210 else { 1211 wret = balance_node_right(trans, right, mid); 1212 } 1213 } 1214 if (wret < 0) 1215 ret = wret; 1216 if (wret == 0) { 1217 struct btrfs_disk_key disk_key; 1218 1219 btrfs_node_key(right, &disk_key, 0); 1220 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, 1221 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); 1222 BUG_ON(ret < 0); 1223 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1224 btrfs_mark_buffer_dirty(parent); 1225 1226 if (btrfs_header_nritems(mid) <= orig_slot) { 1227 path->nodes[level] = right; 1228 path->slots[level + 1] += 1; 1229 path->slots[level] = orig_slot - 1230 btrfs_header_nritems(mid); 1231 btrfs_tree_unlock(mid); 1232 free_extent_buffer(mid); 1233 } else { 1234 btrfs_tree_unlock(right); 1235 free_extent_buffer(right); 1236 } 1237 return 0; 1238 } 1239 btrfs_tree_unlock(right); 1240 free_extent_buffer(right); 1241 } 1242 return 1; 1243 } 1244 1245 /* 1246 * readahead one full node of leaves, finding things that are close 1247 * to the block in 'slot', and triggering ra on them. 1248 */ 1249 static void reada_for_search(struct btrfs_fs_info *fs_info, 1250 struct btrfs_path *path, 1251 int level, int slot, u64 objectid) 1252 { 1253 struct extent_buffer *node; 1254 struct btrfs_disk_key disk_key; 1255 u32 nritems; 1256 u64 search; 1257 u64 target; 1258 u64 nread = 0; 1259 u64 nread_max; 1260 u32 nr; 1261 u32 blocksize; 1262 u32 nscan = 0; 1263 1264 if (level != 1 && path->reada != READA_FORWARD_ALWAYS) 1265 return; 1266 1267 if (!path->nodes[level]) 1268 return; 1269 1270 node = path->nodes[level]; 1271 1272 /* 1273 * Since the time between visiting leaves is much shorter than the time 1274 * between visiting nodes, limit read ahead of nodes to 1, to avoid too 1275 * much IO at once (possibly random). 1276 */ 1277 if (path->reada == READA_FORWARD_ALWAYS) { 1278 if (level > 1) 1279 nread_max = node->fs_info->nodesize; 1280 else 1281 nread_max = SZ_128K; 1282 } else { 1283 nread_max = SZ_64K; 1284 } 1285 1286 search = btrfs_node_blockptr(node, slot); 1287 blocksize = fs_info->nodesize; 1288 if (path->reada != READA_FORWARD_ALWAYS) { 1289 struct extent_buffer *eb; 1290 1291 eb = find_extent_buffer(fs_info, search); 1292 if (eb) { 1293 free_extent_buffer(eb); 1294 return; 1295 } 1296 } 1297 1298 target = search; 1299 1300 nritems = btrfs_header_nritems(node); 1301 nr = slot; 1302 1303 while (1) { 1304 if (path->reada == READA_BACK) { 1305 if (nr == 0) 1306 break; 1307 nr--; 1308 } else if (path->reada == READA_FORWARD || 1309 path->reada == READA_FORWARD_ALWAYS) { 1310 nr++; 1311 if (nr >= nritems) 1312 break; 1313 } 1314 if (path->reada == READA_BACK && objectid) { 1315 btrfs_node_key(node, &disk_key, nr); 1316 if (btrfs_disk_key_objectid(&disk_key) != objectid) 1317 break; 1318 } 1319 search = btrfs_node_blockptr(node, nr); 1320 if (path->reada == READA_FORWARD_ALWAYS || 1321 (search <= target && target - search <= 65536) || 1322 (search > target && search - target <= 65536)) { 1323 btrfs_readahead_node_child(node, nr); 1324 nread += blocksize; 1325 } 1326 nscan++; 1327 if (nread > nread_max || nscan > 32) 1328 break; 1329 } 1330 } 1331 1332 static noinline void reada_for_balance(struct btrfs_path *path, int level) 1333 { 1334 struct extent_buffer *parent; 1335 int slot; 1336 int nritems; 1337 1338 parent = path->nodes[level + 1]; 1339 if (!parent) 1340 return; 1341 1342 nritems = btrfs_header_nritems(parent); 1343 slot = path->slots[level + 1]; 1344 1345 if (slot > 0) 1346 btrfs_readahead_node_child(parent, slot - 1); 1347 if (slot + 1 < nritems) 1348 btrfs_readahead_node_child(parent, slot + 1); 1349 } 1350 1351 1352 /* 1353 * when we walk down the tree, it is usually safe to unlock the higher layers 1354 * in the tree. The exceptions are when our path goes through slot 0, because 1355 * operations on the tree might require changing key pointers higher up in the 1356 * tree. 1357 * 1358 * callers might also have set path->keep_locks, which tells this code to keep 1359 * the lock if the path points to the last slot in the block. This is part of 1360 * walking through the tree, and selecting the next slot in the higher block. 1361 * 1362 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 1363 * if lowest_unlock is 1, level 0 won't be unlocked 1364 */ 1365 static noinline void unlock_up(struct btrfs_path *path, int level, 1366 int lowest_unlock, int min_write_lock_level, 1367 int *write_lock_level) 1368 { 1369 int i; 1370 int skip_level = level; 1371 bool check_skip = true; 1372 1373 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 1374 if (!path->nodes[i]) 1375 break; 1376 if (!path->locks[i]) 1377 break; 1378 1379 if (check_skip) { 1380 if (path->slots[i] == 0) { 1381 skip_level = i + 1; 1382 continue; 1383 } 1384 1385 if (path->keep_locks) { 1386 u32 nritems; 1387 1388 nritems = btrfs_header_nritems(path->nodes[i]); 1389 if (nritems < 1 || path->slots[i] >= nritems - 1) { 1390 skip_level = i + 1; 1391 continue; 1392 } 1393 } 1394 } 1395 1396 if (i >= lowest_unlock && i > skip_level) { 1397 check_skip = false; 1398 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 1399 path->locks[i] = 0; 1400 if (write_lock_level && 1401 i > min_write_lock_level && 1402 i <= *write_lock_level) { 1403 *write_lock_level = i - 1; 1404 } 1405 } 1406 } 1407 } 1408 1409 /* 1410 * Helper function for btrfs_search_slot() and other functions that do a search 1411 * on a btree. The goal is to find a tree block in the cache (the radix tree at 1412 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read 1413 * its pages from disk. 1414 * 1415 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the 1416 * whole btree search, starting again from the current root node. 1417 */ 1418 static int 1419 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, 1420 struct extent_buffer **eb_ret, int level, int slot, 1421 const struct btrfs_key *key) 1422 { 1423 struct btrfs_fs_info *fs_info = root->fs_info; 1424 u64 blocknr; 1425 u64 gen; 1426 struct extent_buffer *tmp; 1427 struct btrfs_key first_key; 1428 int ret; 1429 int parent_level; 1430 bool unlock_up; 1431 1432 unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); 1433 blocknr = btrfs_node_blockptr(*eb_ret, slot); 1434 gen = btrfs_node_ptr_generation(*eb_ret, slot); 1435 parent_level = btrfs_header_level(*eb_ret); 1436 btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); 1437 1438 /* 1439 * If we need to read an extent buffer from disk and we are holding locks 1440 * on upper level nodes, we unlock all the upper nodes before reading the 1441 * extent buffer, and then return -EAGAIN to the caller as it needs to 1442 * restart the search. We don't release the lock on the current level 1443 * because we need to walk this node to figure out which blocks to read. 1444 */ 1445 tmp = find_extent_buffer(fs_info, blocknr); 1446 if (tmp) { 1447 if (p->reada == READA_FORWARD_ALWAYS) 1448 reada_for_search(fs_info, p, level, slot, key->objectid); 1449 1450 /* first we do an atomic uptodate check */ 1451 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 1452 /* 1453 * Do extra check for first_key, eb can be stale due to 1454 * being cached, read from scrub, or have multiple 1455 * parents (shared tree blocks). 1456 */ 1457 if (btrfs_verify_level_key(tmp, 1458 parent_level - 1, &first_key, gen)) { 1459 free_extent_buffer(tmp); 1460 return -EUCLEAN; 1461 } 1462 *eb_ret = tmp; 1463 return 0; 1464 } 1465 1466 if (p->nowait) { 1467 free_extent_buffer(tmp); 1468 return -EAGAIN; 1469 } 1470 1471 if (unlock_up) 1472 btrfs_unlock_up_safe(p, level + 1); 1473 1474 /* now we're allowed to do a blocking uptodate check */ 1475 ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); 1476 if (ret) { 1477 free_extent_buffer(tmp); 1478 btrfs_release_path(p); 1479 return -EIO; 1480 } 1481 if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { 1482 free_extent_buffer(tmp); 1483 btrfs_release_path(p); 1484 return -EUCLEAN; 1485 } 1486 1487 if (unlock_up) 1488 ret = -EAGAIN; 1489 1490 goto out; 1491 } else if (p->nowait) { 1492 return -EAGAIN; 1493 } 1494 1495 if (unlock_up) { 1496 btrfs_unlock_up_safe(p, level + 1); 1497 ret = -EAGAIN; 1498 } else { 1499 ret = 0; 1500 } 1501 1502 if (p->reada != READA_NONE) 1503 reada_for_search(fs_info, p, level, slot, key->objectid); 1504 1505 tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, 1506 gen, parent_level - 1, &first_key); 1507 if (IS_ERR(tmp)) { 1508 btrfs_release_path(p); 1509 return PTR_ERR(tmp); 1510 } 1511 /* 1512 * If the read above didn't mark this buffer up to date, 1513 * it will never end up being up to date. Set ret to EIO now 1514 * and give up so that our caller doesn't loop forever 1515 * on our EAGAINs. 1516 */ 1517 if (!extent_buffer_uptodate(tmp)) 1518 ret = -EIO; 1519 1520 out: 1521 if (ret == 0) { 1522 *eb_ret = tmp; 1523 } else { 1524 free_extent_buffer(tmp); 1525 btrfs_release_path(p); 1526 } 1527 1528 return ret; 1529 } 1530 1531 /* 1532 * helper function for btrfs_search_slot. This does all of the checks 1533 * for node-level blocks and does any balancing required based on 1534 * the ins_len. 1535 * 1536 * If no extra work was required, zero is returned. If we had to 1537 * drop the path, -EAGAIN is returned and btrfs_search_slot must 1538 * start over 1539 */ 1540 static int 1541 setup_nodes_for_search(struct btrfs_trans_handle *trans, 1542 struct btrfs_root *root, struct btrfs_path *p, 1543 struct extent_buffer *b, int level, int ins_len, 1544 int *write_lock_level) 1545 { 1546 struct btrfs_fs_info *fs_info = root->fs_info; 1547 int ret = 0; 1548 1549 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1550 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { 1551 1552 if (*write_lock_level < level + 1) { 1553 *write_lock_level = level + 1; 1554 btrfs_release_path(p); 1555 return -EAGAIN; 1556 } 1557 1558 reada_for_balance(p, level); 1559 ret = split_node(trans, root, p, level); 1560 1561 b = p->nodes[level]; 1562 } else if (ins_len < 0 && btrfs_header_nritems(b) < 1563 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { 1564 1565 if (*write_lock_level < level + 1) { 1566 *write_lock_level = level + 1; 1567 btrfs_release_path(p); 1568 return -EAGAIN; 1569 } 1570 1571 reada_for_balance(p, level); 1572 ret = balance_level(trans, root, p, level); 1573 if (ret) 1574 return ret; 1575 1576 b = p->nodes[level]; 1577 if (!b) { 1578 btrfs_release_path(p); 1579 return -EAGAIN; 1580 } 1581 BUG_ON(btrfs_header_nritems(b) == 1); 1582 } 1583 return ret; 1584 } 1585 1586 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, 1587 u64 iobjectid, u64 ioff, u8 key_type, 1588 struct btrfs_key *found_key) 1589 { 1590 int ret; 1591 struct btrfs_key key; 1592 struct extent_buffer *eb; 1593 1594 ASSERT(path); 1595 ASSERT(found_key); 1596 1597 key.type = key_type; 1598 key.objectid = iobjectid; 1599 key.offset = ioff; 1600 1601 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 1602 if (ret < 0) 1603 return ret; 1604 1605 eb = path->nodes[0]; 1606 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { 1607 ret = btrfs_next_leaf(fs_root, path); 1608 if (ret) 1609 return ret; 1610 eb = path->nodes[0]; 1611 } 1612 1613 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); 1614 if (found_key->type != key.type || 1615 found_key->objectid != key.objectid) 1616 return 1; 1617 1618 return 0; 1619 } 1620 1621 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, 1622 struct btrfs_path *p, 1623 int write_lock_level) 1624 { 1625 struct extent_buffer *b; 1626 int root_lock = 0; 1627 int level = 0; 1628 1629 if (p->search_commit_root) { 1630 b = root->commit_root; 1631 atomic_inc(&b->refs); 1632 level = btrfs_header_level(b); 1633 /* 1634 * Ensure that all callers have set skip_locking when 1635 * p->search_commit_root = 1. 1636 */ 1637 ASSERT(p->skip_locking == 1); 1638 1639 goto out; 1640 } 1641 1642 if (p->skip_locking) { 1643 b = btrfs_root_node(root); 1644 level = btrfs_header_level(b); 1645 goto out; 1646 } 1647 1648 /* We try very hard to do read locks on the root */ 1649 root_lock = BTRFS_READ_LOCK; 1650 1651 /* 1652 * If the level is set to maximum, we can skip trying to get the read 1653 * lock. 1654 */ 1655 if (write_lock_level < BTRFS_MAX_LEVEL) { 1656 /* 1657 * We don't know the level of the root node until we actually 1658 * have it read locked 1659 */ 1660 if (p->nowait) { 1661 b = btrfs_try_read_lock_root_node(root); 1662 if (IS_ERR(b)) 1663 return b; 1664 } else { 1665 b = btrfs_read_lock_root_node(root); 1666 } 1667 level = btrfs_header_level(b); 1668 if (level > write_lock_level) 1669 goto out; 1670 1671 /* Whoops, must trade for write lock */ 1672 btrfs_tree_read_unlock(b); 1673 free_extent_buffer(b); 1674 } 1675 1676 b = btrfs_lock_root_node(root); 1677 root_lock = BTRFS_WRITE_LOCK; 1678 1679 /* The level might have changed, check again */ 1680 level = btrfs_header_level(b); 1681 1682 out: 1683 /* 1684 * The root may have failed to write out at some point, and thus is no 1685 * longer valid, return an error in this case. 1686 */ 1687 if (!extent_buffer_uptodate(b)) { 1688 if (root_lock) 1689 btrfs_tree_unlock_rw(b, root_lock); 1690 free_extent_buffer(b); 1691 return ERR_PTR(-EIO); 1692 } 1693 1694 p->nodes[level] = b; 1695 if (!p->skip_locking) 1696 p->locks[level] = root_lock; 1697 /* 1698 * Callers are responsible for dropping b's references. 1699 */ 1700 return b; 1701 } 1702 1703 /* 1704 * Replace the extent buffer at the lowest level of the path with a cloned 1705 * version. The purpose is to be able to use it safely, after releasing the 1706 * commit root semaphore, even if relocation is happening in parallel, the 1707 * transaction used for relocation is committed and the extent buffer is 1708 * reallocated in the next transaction. 1709 * 1710 * This is used in a context where the caller does not prevent transaction 1711 * commits from happening, either by holding a transaction handle or holding 1712 * some lock, while it's doing searches through a commit root. 1713 * At the moment it's only used for send operations. 1714 */ 1715 static int finish_need_commit_sem_search(struct btrfs_path *path) 1716 { 1717 const int i = path->lowest_level; 1718 const int slot = path->slots[i]; 1719 struct extent_buffer *lowest = path->nodes[i]; 1720 struct extent_buffer *clone; 1721 1722 ASSERT(path->need_commit_sem); 1723 1724 if (!lowest) 1725 return 0; 1726 1727 lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); 1728 1729 clone = btrfs_clone_extent_buffer(lowest); 1730 if (!clone) 1731 return -ENOMEM; 1732 1733 btrfs_release_path(path); 1734 path->nodes[i] = clone; 1735 path->slots[i] = slot; 1736 1737 return 0; 1738 } 1739 1740 static inline int search_for_key_slot(struct extent_buffer *eb, 1741 int search_low_slot, 1742 const struct btrfs_key *key, 1743 int prev_cmp, 1744 int *slot) 1745 { 1746 /* 1747 * If a previous call to btrfs_bin_search() on a parent node returned an 1748 * exact match (prev_cmp == 0), we can safely assume the target key will 1749 * always be at slot 0 on lower levels, since each key pointer 1750 * (struct btrfs_key_ptr) refers to the lowest key accessible from the 1751 * subtree it points to. Thus we can skip searching lower levels. 1752 */ 1753 if (prev_cmp == 0) { 1754 *slot = 0; 1755 return 0; 1756 } 1757 1758 return generic_bin_search(eb, search_low_slot, key, slot); 1759 } 1760 1761 static int search_leaf(struct btrfs_trans_handle *trans, 1762 struct btrfs_root *root, 1763 const struct btrfs_key *key, 1764 struct btrfs_path *path, 1765 int ins_len, 1766 int prev_cmp) 1767 { 1768 struct extent_buffer *leaf = path->nodes[0]; 1769 int leaf_free_space = -1; 1770 int search_low_slot = 0; 1771 int ret; 1772 bool do_bin_search = true; 1773 1774 /* 1775 * If we are doing an insertion, the leaf has enough free space and the 1776 * destination slot for the key is not slot 0, then we can unlock our 1777 * write lock on the parent, and any other upper nodes, before doing the 1778 * binary search on the leaf (with search_for_key_slot()), allowing other 1779 * tasks to lock the parent and any other upper nodes. 1780 */ 1781 if (ins_len > 0) { 1782 /* 1783 * Cache the leaf free space, since we will need it later and it 1784 * will not change until then. 1785 */ 1786 leaf_free_space = btrfs_leaf_free_space(leaf); 1787 1788 /* 1789 * !path->locks[1] means we have a single node tree, the leaf is 1790 * the root of the tree. 1791 */ 1792 if (path->locks[1] && leaf_free_space >= ins_len) { 1793 struct btrfs_disk_key first_key; 1794 1795 ASSERT(btrfs_header_nritems(leaf) > 0); 1796 btrfs_item_key(leaf, &first_key, 0); 1797 1798 /* 1799 * Doing the extra comparison with the first key is cheap, 1800 * taking into account that the first key is very likely 1801 * already in a cache line because it immediately follows 1802 * the extent buffer's header and we have recently accessed 1803 * the header's level field. 1804 */ 1805 ret = comp_keys(&first_key, key); 1806 if (ret < 0) { 1807 /* 1808 * The first key is smaller than the key we want 1809 * to insert, so we are safe to unlock all upper 1810 * nodes and we have to do the binary search. 1811 * 1812 * We do use btrfs_unlock_up_safe() and not 1813 * unlock_up() because the later does not unlock 1814 * nodes with a slot of 0 - we can safely unlock 1815 * any node even if its slot is 0 since in this 1816 * case the key does not end up at slot 0 of the 1817 * leaf and there's no need to split the leaf. 1818 */ 1819 btrfs_unlock_up_safe(path, 1); 1820 search_low_slot = 1; 1821 } else { 1822 /* 1823 * The first key is >= then the key we want to 1824 * insert, so we can skip the binary search as 1825 * the target key will be at slot 0. 1826 * 1827 * We can not unlock upper nodes when the key is 1828 * less than the first key, because we will need 1829 * to update the key at slot 0 of the parent node 1830 * and possibly of other upper nodes too. 1831 * If the key matches the first key, then we can 1832 * unlock all the upper nodes, using 1833 * btrfs_unlock_up_safe() instead of unlock_up() 1834 * as stated above. 1835 */ 1836 if (ret == 0) 1837 btrfs_unlock_up_safe(path, 1); 1838 /* 1839 * ret is already 0 or 1, matching the result of 1840 * a btrfs_bin_search() call, so there is no need 1841 * to adjust it. 1842 */ 1843 do_bin_search = false; 1844 path->slots[0] = 0; 1845 } 1846 } 1847 } 1848 1849 if (do_bin_search) { 1850 ret = search_for_key_slot(leaf, search_low_slot, key, 1851 prev_cmp, &path->slots[0]); 1852 if (ret < 0) 1853 return ret; 1854 } 1855 1856 if (ins_len > 0) { 1857 /* 1858 * Item key already exists. In this case, if we are allowed to 1859 * insert the item (for example, in dir_item case, item key 1860 * collision is allowed), it will be merged with the original 1861 * item. Only the item size grows, no new btrfs item will be 1862 * added. If search_for_extension is not set, ins_len already 1863 * accounts the size btrfs_item, deduct it here so leaf space 1864 * check will be correct. 1865 */ 1866 if (ret == 0 && !path->search_for_extension) { 1867 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1868 ins_len -= sizeof(struct btrfs_item); 1869 } 1870 1871 ASSERT(leaf_free_space >= 0); 1872 1873 if (leaf_free_space < ins_len) { 1874 int err; 1875 1876 err = split_leaf(trans, root, key, path, ins_len, 1877 (ret == 0)); 1878 ASSERT(err <= 0); 1879 if (WARN_ON(err > 0)) 1880 err = -EUCLEAN; 1881 if (err) 1882 ret = err; 1883 } 1884 } 1885 1886 return ret; 1887 } 1888 1889 /* 1890 * btrfs_search_slot - look for a key in a tree and perform necessary 1891 * modifications to preserve tree invariants. 1892 * 1893 * @trans: Handle of transaction, used when modifying the tree 1894 * @p: Holds all btree nodes along the search path 1895 * @root: The root node of the tree 1896 * @key: The key we are looking for 1897 * @ins_len: Indicates purpose of search: 1898 * >0 for inserts it's size of item inserted (*) 1899 * <0 for deletions 1900 * 0 for plain searches, not modifying the tree 1901 * 1902 * (*) If size of item inserted doesn't include 1903 * sizeof(struct btrfs_item), then p->search_for_extension must 1904 * be set. 1905 * @cow: boolean should CoW operations be performed. Must always be 1 1906 * when modifying the tree. 1907 * 1908 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 1909 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 1910 * 1911 * If @key is found, 0 is returned and you can find the item in the leaf level 1912 * of the path (level 0) 1913 * 1914 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 1915 * points to the slot where it should be inserted 1916 * 1917 * If an error is encountered while searching the tree a negative error number 1918 * is returned 1919 */ 1920 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1921 const struct btrfs_key *key, struct btrfs_path *p, 1922 int ins_len, int cow) 1923 { 1924 struct btrfs_fs_info *fs_info = root->fs_info; 1925 struct extent_buffer *b; 1926 int slot; 1927 int ret; 1928 int err; 1929 int level; 1930 int lowest_unlock = 1; 1931 /* everything at write_lock_level or lower must be write locked */ 1932 int write_lock_level = 0; 1933 u8 lowest_level = 0; 1934 int min_write_lock_level; 1935 int prev_cmp; 1936 1937 lowest_level = p->lowest_level; 1938 WARN_ON(lowest_level && ins_len > 0); 1939 WARN_ON(p->nodes[0] != NULL); 1940 BUG_ON(!cow && ins_len); 1941 1942 /* 1943 * For now only allow nowait for read only operations. There's no 1944 * strict reason why we can't, we just only need it for reads so it's 1945 * only implemented for reads. 1946 */ 1947 ASSERT(!p->nowait || !cow); 1948 1949 if (ins_len < 0) { 1950 lowest_unlock = 2; 1951 1952 /* when we are removing items, we might have to go up to level 1953 * two as we update tree pointers Make sure we keep write 1954 * for those levels as well 1955 */ 1956 write_lock_level = 2; 1957 } else if (ins_len > 0) { 1958 /* 1959 * for inserting items, make sure we have a write lock on 1960 * level 1 so we can update keys 1961 */ 1962 write_lock_level = 1; 1963 } 1964 1965 if (!cow) 1966 write_lock_level = -1; 1967 1968 if (cow && (p->keep_locks || p->lowest_level)) 1969 write_lock_level = BTRFS_MAX_LEVEL; 1970 1971 min_write_lock_level = write_lock_level; 1972 1973 if (p->need_commit_sem) { 1974 ASSERT(p->search_commit_root); 1975 if (p->nowait) { 1976 if (!down_read_trylock(&fs_info->commit_root_sem)) 1977 return -EAGAIN; 1978 } else { 1979 down_read(&fs_info->commit_root_sem); 1980 } 1981 } 1982 1983 again: 1984 prev_cmp = -1; 1985 b = btrfs_search_slot_get_root(root, p, write_lock_level); 1986 if (IS_ERR(b)) { 1987 ret = PTR_ERR(b); 1988 goto done; 1989 } 1990 1991 while (b) { 1992 int dec = 0; 1993 1994 level = btrfs_header_level(b); 1995 1996 if (cow) { 1997 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 1998 1999 /* 2000 * if we don't really need to cow this block 2001 * then we don't want to set the path blocking, 2002 * so we test it here 2003 */ 2004 if (!should_cow_block(trans, root, b)) 2005 goto cow_done; 2006 2007 /* 2008 * must have write locks on this node and the 2009 * parent 2010 */ 2011 if (level > write_lock_level || 2012 (level + 1 > write_lock_level && 2013 level + 1 < BTRFS_MAX_LEVEL && 2014 p->nodes[level + 1])) { 2015 write_lock_level = level + 1; 2016 btrfs_release_path(p); 2017 goto again; 2018 } 2019 2020 if (last_level) 2021 err = btrfs_cow_block(trans, root, b, NULL, 0, 2022 &b, 2023 BTRFS_NESTING_COW); 2024 else 2025 err = btrfs_cow_block(trans, root, b, 2026 p->nodes[level + 1], 2027 p->slots[level + 1], &b, 2028 BTRFS_NESTING_COW); 2029 if (err) { 2030 ret = err; 2031 goto done; 2032 } 2033 } 2034 cow_done: 2035 p->nodes[level] = b; 2036 2037 /* 2038 * we have a lock on b and as long as we aren't changing 2039 * the tree, there is no way to for the items in b to change. 2040 * It is safe to drop the lock on our parent before we 2041 * go through the expensive btree search on b. 2042 * 2043 * If we're inserting or deleting (ins_len != 0), then we might 2044 * be changing slot zero, which may require changing the parent. 2045 * So, we can't drop the lock until after we know which slot 2046 * we're operating on. 2047 */ 2048 if (!ins_len && !p->keep_locks) { 2049 int u = level + 1; 2050 2051 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2052 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2053 p->locks[u] = 0; 2054 } 2055 } 2056 2057 if (level == 0) { 2058 if (ins_len > 0) 2059 ASSERT(write_lock_level >= 1); 2060 2061 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2062 if (!p->search_for_split) 2063 unlock_up(p, level, lowest_unlock, 2064 min_write_lock_level, NULL); 2065 goto done; 2066 } 2067 2068 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2069 if (ret < 0) 2070 goto done; 2071 prev_cmp = ret; 2072 2073 if (ret && slot > 0) { 2074 dec = 1; 2075 slot--; 2076 } 2077 p->slots[level] = slot; 2078 err = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2079 &write_lock_level); 2080 if (err == -EAGAIN) 2081 goto again; 2082 if (err) { 2083 ret = err; 2084 goto done; 2085 } 2086 b = p->nodes[level]; 2087 slot = p->slots[level]; 2088 2089 /* 2090 * Slot 0 is special, if we change the key we have to update 2091 * the parent pointer which means we must have a write lock on 2092 * the parent 2093 */ 2094 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2095 write_lock_level = level + 1; 2096 btrfs_release_path(p); 2097 goto again; 2098 } 2099 2100 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2101 &write_lock_level); 2102 2103 if (level == lowest_level) { 2104 if (dec) 2105 p->slots[level]++; 2106 goto done; 2107 } 2108 2109 err = read_block_for_search(root, p, &b, level, slot, key); 2110 if (err == -EAGAIN) 2111 goto again; 2112 if (err) { 2113 ret = err; 2114 goto done; 2115 } 2116 2117 if (!p->skip_locking) { 2118 level = btrfs_header_level(b); 2119 2120 btrfs_maybe_reset_lockdep_class(root, b); 2121 2122 if (level <= write_lock_level) { 2123 btrfs_tree_lock(b); 2124 p->locks[level] = BTRFS_WRITE_LOCK; 2125 } else { 2126 if (p->nowait) { 2127 if (!btrfs_try_tree_read_lock(b)) { 2128 free_extent_buffer(b); 2129 ret = -EAGAIN; 2130 goto done; 2131 } 2132 } else { 2133 btrfs_tree_read_lock(b); 2134 } 2135 p->locks[level] = BTRFS_READ_LOCK; 2136 } 2137 p->nodes[level] = b; 2138 } 2139 } 2140 ret = 1; 2141 done: 2142 if (ret < 0 && !p->skip_release_on_error) 2143 btrfs_release_path(p); 2144 2145 if (p->need_commit_sem) { 2146 int ret2; 2147 2148 ret2 = finish_need_commit_sem_search(p); 2149 up_read(&fs_info->commit_root_sem); 2150 if (ret2) 2151 ret = ret2; 2152 } 2153 2154 return ret; 2155 } 2156 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2157 2158 /* 2159 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2160 * current state of the tree together with the operations recorded in the tree 2161 * modification log to search for the key in a previous version of this tree, as 2162 * denoted by the time_seq parameter. 2163 * 2164 * Naturally, there is no support for insert, delete or cow operations. 2165 * 2166 * The resulting path and return value will be set up as if we called 2167 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 2168 */ 2169 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2170 struct btrfs_path *p, u64 time_seq) 2171 { 2172 struct btrfs_fs_info *fs_info = root->fs_info; 2173 struct extent_buffer *b; 2174 int slot; 2175 int ret; 2176 int err; 2177 int level; 2178 int lowest_unlock = 1; 2179 u8 lowest_level = 0; 2180 2181 lowest_level = p->lowest_level; 2182 WARN_ON(p->nodes[0] != NULL); 2183 ASSERT(!p->nowait); 2184 2185 if (p->search_commit_root) { 2186 BUG_ON(time_seq); 2187 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2188 } 2189 2190 again: 2191 b = btrfs_get_old_root(root, time_seq); 2192 if (!b) { 2193 ret = -EIO; 2194 goto done; 2195 } 2196 level = btrfs_header_level(b); 2197 p->locks[level] = BTRFS_READ_LOCK; 2198 2199 while (b) { 2200 int dec = 0; 2201 2202 level = btrfs_header_level(b); 2203 p->nodes[level] = b; 2204 2205 /* 2206 * we have a lock on b and as long as we aren't changing 2207 * the tree, there is no way to for the items in b to change. 2208 * It is safe to drop the lock on our parent before we 2209 * go through the expensive btree search on b. 2210 */ 2211 btrfs_unlock_up_safe(p, level + 1); 2212 2213 ret = btrfs_bin_search(b, key, &slot); 2214 if (ret < 0) 2215 goto done; 2216 2217 if (level == 0) { 2218 p->slots[level] = slot; 2219 unlock_up(p, level, lowest_unlock, 0, NULL); 2220 goto done; 2221 } 2222 2223 if (ret && slot > 0) { 2224 dec = 1; 2225 slot--; 2226 } 2227 p->slots[level] = slot; 2228 unlock_up(p, level, lowest_unlock, 0, NULL); 2229 2230 if (level == lowest_level) { 2231 if (dec) 2232 p->slots[level]++; 2233 goto done; 2234 } 2235 2236 err = read_block_for_search(root, p, &b, level, slot, key); 2237 if (err == -EAGAIN) 2238 goto again; 2239 if (err) { 2240 ret = err; 2241 goto done; 2242 } 2243 2244 level = btrfs_header_level(b); 2245 btrfs_tree_read_lock(b); 2246 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); 2247 if (!b) { 2248 ret = -ENOMEM; 2249 goto done; 2250 } 2251 p->locks[level] = BTRFS_READ_LOCK; 2252 p->nodes[level] = b; 2253 } 2254 ret = 1; 2255 done: 2256 if (ret < 0) 2257 btrfs_release_path(p); 2258 2259 return ret; 2260 } 2261 2262 /* 2263 * helper to use instead of search slot if no exact match is needed but 2264 * instead the next or previous item should be returned. 2265 * When find_higher is true, the next higher item is returned, the next lower 2266 * otherwise. 2267 * When return_any and find_higher are both true, and no higher item is found, 2268 * return the next lower instead. 2269 * When return_any is true and find_higher is false, and no lower item is found, 2270 * return the next higher instead. 2271 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2272 * < 0 on error 2273 */ 2274 int btrfs_search_slot_for_read(struct btrfs_root *root, 2275 const struct btrfs_key *key, 2276 struct btrfs_path *p, int find_higher, 2277 int return_any) 2278 { 2279 int ret; 2280 struct extent_buffer *leaf; 2281 2282 again: 2283 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2284 if (ret <= 0) 2285 return ret; 2286 /* 2287 * a return value of 1 means the path is at the position where the 2288 * item should be inserted. Normally this is the next bigger item, 2289 * but in case the previous item is the last in a leaf, path points 2290 * to the first free slot in the previous leaf, i.e. at an invalid 2291 * item. 2292 */ 2293 leaf = p->nodes[0]; 2294 2295 if (find_higher) { 2296 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2297 ret = btrfs_next_leaf(root, p); 2298 if (ret <= 0) 2299 return ret; 2300 if (!return_any) 2301 return 1; 2302 /* 2303 * no higher item found, return the next 2304 * lower instead 2305 */ 2306 return_any = 0; 2307 find_higher = 0; 2308 btrfs_release_path(p); 2309 goto again; 2310 } 2311 } else { 2312 if (p->slots[0] == 0) { 2313 ret = btrfs_prev_leaf(root, p); 2314 if (ret < 0) 2315 return ret; 2316 if (!ret) { 2317 leaf = p->nodes[0]; 2318 if (p->slots[0] == btrfs_header_nritems(leaf)) 2319 p->slots[0]--; 2320 return 0; 2321 } 2322 if (!return_any) 2323 return 1; 2324 /* 2325 * no lower item found, return the next 2326 * higher instead 2327 */ 2328 return_any = 0; 2329 find_higher = 1; 2330 btrfs_release_path(p); 2331 goto again; 2332 } else { 2333 --p->slots[0]; 2334 } 2335 } 2336 return 0; 2337 } 2338 2339 /* 2340 * Execute search and call btrfs_previous_item to traverse backwards if the item 2341 * was not found. 2342 * 2343 * Return 0 if found, 1 if not found and < 0 if error. 2344 */ 2345 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2346 struct btrfs_path *path) 2347 { 2348 int ret; 2349 2350 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2351 if (ret > 0) 2352 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2353 2354 if (ret == 0) 2355 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2356 2357 return ret; 2358 } 2359 2360 /** 2361 * Search for a valid slot for the given path. 2362 * 2363 * @root: The root node of the tree. 2364 * @key: Will contain a valid item if found. 2365 * @path: The starting point to validate the slot. 2366 * 2367 * Return: 0 if the item is valid 2368 * 1 if not found 2369 * <0 if error. 2370 */ 2371 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2372 struct btrfs_path *path) 2373 { 2374 while (1) { 2375 int ret; 2376 const int slot = path->slots[0]; 2377 const struct extent_buffer *leaf = path->nodes[0]; 2378 2379 /* This is where we start walking the path. */ 2380 if (slot >= btrfs_header_nritems(leaf)) { 2381 /* 2382 * If we've reached the last slot in this leaf we need 2383 * to go to the next leaf and reset the path. 2384 */ 2385 ret = btrfs_next_leaf(root, path); 2386 if (ret) 2387 return ret; 2388 continue; 2389 } 2390 /* Store the found, valid item in @key. */ 2391 btrfs_item_key_to_cpu(leaf, key, slot); 2392 break; 2393 } 2394 return 0; 2395 } 2396 2397 /* 2398 * adjust the pointers going up the tree, starting at level 2399 * making sure the right key of each node is points to 'key'. 2400 * This is used after shifting pointers to the left, so it stops 2401 * fixing up pointers when a given leaf/node is not in slot 0 of the 2402 * higher levels 2403 * 2404 */ 2405 static void fixup_low_keys(struct btrfs_path *path, 2406 struct btrfs_disk_key *key, int level) 2407 { 2408 int i; 2409 struct extent_buffer *t; 2410 int ret; 2411 2412 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2413 int tslot = path->slots[i]; 2414 2415 if (!path->nodes[i]) 2416 break; 2417 t = path->nodes[i]; 2418 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2419 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); 2420 BUG_ON(ret < 0); 2421 btrfs_set_node_key(t, key, tslot); 2422 btrfs_mark_buffer_dirty(path->nodes[i]); 2423 if (tslot != 0) 2424 break; 2425 } 2426 } 2427 2428 /* 2429 * update item key. 2430 * 2431 * This function isn't completely safe. It's the caller's responsibility 2432 * that the new key won't break the order 2433 */ 2434 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, 2435 struct btrfs_path *path, 2436 const struct btrfs_key *new_key) 2437 { 2438 struct btrfs_disk_key disk_key; 2439 struct extent_buffer *eb; 2440 int slot; 2441 2442 eb = path->nodes[0]; 2443 slot = path->slots[0]; 2444 if (slot > 0) { 2445 btrfs_item_key(eb, &disk_key, slot - 1); 2446 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { 2447 btrfs_crit(fs_info, 2448 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2449 slot, btrfs_disk_key_objectid(&disk_key), 2450 btrfs_disk_key_type(&disk_key), 2451 btrfs_disk_key_offset(&disk_key), 2452 new_key->objectid, new_key->type, 2453 new_key->offset); 2454 btrfs_print_leaf(eb); 2455 BUG(); 2456 } 2457 } 2458 if (slot < btrfs_header_nritems(eb) - 1) { 2459 btrfs_item_key(eb, &disk_key, slot + 1); 2460 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { 2461 btrfs_crit(fs_info, 2462 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2463 slot, btrfs_disk_key_objectid(&disk_key), 2464 btrfs_disk_key_type(&disk_key), 2465 btrfs_disk_key_offset(&disk_key), 2466 new_key->objectid, new_key->type, 2467 new_key->offset); 2468 btrfs_print_leaf(eb); 2469 BUG(); 2470 } 2471 } 2472 2473 btrfs_cpu_key_to_disk(&disk_key, new_key); 2474 btrfs_set_item_key(eb, &disk_key, slot); 2475 btrfs_mark_buffer_dirty(eb); 2476 if (slot == 0) 2477 fixup_low_keys(path, &disk_key, 1); 2478 } 2479 2480 /* 2481 * Check key order of two sibling extent buffers. 2482 * 2483 * Return true if something is wrong. 2484 * Return false if everything is fine. 2485 * 2486 * Tree-checker only works inside one tree block, thus the following 2487 * corruption can not be detected by tree-checker: 2488 * 2489 * Leaf @left | Leaf @right 2490 * -------------------------------------------------------------- 2491 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2492 * 2493 * Key f6 in leaf @left itself is valid, but not valid when the next 2494 * key in leaf @right is 7. 2495 * This can only be checked at tree block merge time. 2496 * And since tree checker has ensured all key order in each tree block 2497 * is correct, we only need to bother the last key of @left and the first 2498 * key of @right. 2499 */ 2500 static bool check_sibling_keys(struct extent_buffer *left, 2501 struct extent_buffer *right) 2502 { 2503 struct btrfs_key left_last; 2504 struct btrfs_key right_first; 2505 int level = btrfs_header_level(left); 2506 int nr_left = btrfs_header_nritems(left); 2507 int nr_right = btrfs_header_nritems(right); 2508 2509 /* No key to check in one of the tree blocks */ 2510 if (!nr_left || !nr_right) 2511 return false; 2512 2513 if (level) { 2514 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2515 btrfs_node_key_to_cpu(right, &right_first, 0); 2516 } else { 2517 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2518 btrfs_item_key_to_cpu(right, &right_first, 0); 2519 } 2520 2521 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { 2522 btrfs_crit(left->fs_info, 2523 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2524 left_last.objectid, left_last.type, 2525 left_last.offset, right_first.objectid, 2526 right_first.type, right_first.offset); 2527 return true; 2528 } 2529 return false; 2530 } 2531 2532 /* 2533 * try to push data from one node into the next node left in the 2534 * tree. 2535 * 2536 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2537 * error, and > 0 if there was no room in the left hand block. 2538 */ 2539 static int push_node_left(struct btrfs_trans_handle *trans, 2540 struct extent_buffer *dst, 2541 struct extent_buffer *src, int empty) 2542 { 2543 struct btrfs_fs_info *fs_info = trans->fs_info; 2544 int push_items = 0; 2545 int src_nritems; 2546 int dst_nritems; 2547 int ret = 0; 2548 2549 src_nritems = btrfs_header_nritems(src); 2550 dst_nritems = btrfs_header_nritems(dst); 2551 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2552 WARN_ON(btrfs_header_generation(src) != trans->transid); 2553 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2554 2555 if (!empty && src_nritems <= 8) 2556 return 1; 2557 2558 if (push_items <= 0) 2559 return 1; 2560 2561 if (empty) { 2562 push_items = min(src_nritems, push_items); 2563 if (push_items < src_nritems) { 2564 /* leave at least 8 pointers in the node if 2565 * we aren't going to empty it 2566 */ 2567 if (src_nritems - push_items < 8) { 2568 if (push_items <= 8) 2569 return 1; 2570 push_items -= 8; 2571 } 2572 } 2573 } else 2574 push_items = min(src_nritems - 8, push_items); 2575 2576 /* dst is the left eb, src is the middle eb */ 2577 if (check_sibling_keys(dst, src)) { 2578 ret = -EUCLEAN; 2579 btrfs_abort_transaction(trans, ret); 2580 return ret; 2581 } 2582 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2583 if (ret) { 2584 btrfs_abort_transaction(trans, ret); 2585 return ret; 2586 } 2587 copy_extent_buffer(dst, src, 2588 btrfs_node_key_ptr_offset(dst_nritems), 2589 btrfs_node_key_ptr_offset(0), 2590 push_items * sizeof(struct btrfs_key_ptr)); 2591 2592 if (push_items < src_nritems) { 2593 /* 2594 * Don't call btrfs_tree_mod_log_insert_move() here, key removal 2595 * was already fully logged by btrfs_tree_mod_log_eb_copy() above. 2596 */ 2597 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2598 btrfs_node_key_ptr_offset(push_items), 2599 (src_nritems - push_items) * 2600 sizeof(struct btrfs_key_ptr)); 2601 } 2602 btrfs_set_header_nritems(src, src_nritems - push_items); 2603 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2604 btrfs_mark_buffer_dirty(src); 2605 btrfs_mark_buffer_dirty(dst); 2606 2607 return ret; 2608 } 2609 2610 /* 2611 * try to push data from one node into the next node right in the 2612 * tree. 2613 * 2614 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2615 * error, and > 0 if there was no room in the right hand block. 2616 * 2617 * this will only push up to 1/2 the contents of the left node over 2618 */ 2619 static int balance_node_right(struct btrfs_trans_handle *trans, 2620 struct extent_buffer *dst, 2621 struct extent_buffer *src) 2622 { 2623 struct btrfs_fs_info *fs_info = trans->fs_info; 2624 int push_items = 0; 2625 int max_push; 2626 int src_nritems; 2627 int dst_nritems; 2628 int ret = 0; 2629 2630 WARN_ON(btrfs_header_generation(src) != trans->transid); 2631 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2632 2633 src_nritems = btrfs_header_nritems(src); 2634 dst_nritems = btrfs_header_nritems(dst); 2635 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2636 if (push_items <= 0) 2637 return 1; 2638 2639 if (src_nritems < 4) 2640 return 1; 2641 2642 max_push = src_nritems / 2 + 1; 2643 /* don't try to empty the node */ 2644 if (max_push >= src_nritems) 2645 return 1; 2646 2647 if (max_push < push_items) 2648 push_items = max_push; 2649 2650 /* dst is the right eb, src is the middle eb */ 2651 if (check_sibling_keys(src, dst)) { 2652 ret = -EUCLEAN; 2653 btrfs_abort_transaction(trans, ret); 2654 return ret; 2655 } 2656 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); 2657 BUG_ON(ret < 0); 2658 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2659 btrfs_node_key_ptr_offset(0), 2660 (dst_nritems) * 2661 sizeof(struct btrfs_key_ptr)); 2662 2663 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2664 push_items); 2665 if (ret) { 2666 btrfs_abort_transaction(trans, ret); 2667 return ret; 2668 } 2669 copy_extent_buffer(dst, src, 2670 btrfs_node_key_ptr_offset(0), 2671 btrfs_node_key_ptr_offset(src_nritems - push_items), 2672 push_items * sizeof(struct btrfs_key_ptr)); 2673 2674 btrfs_set_header_nritems(src, src_nritems - push_items); 2675 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2676 2677 btrfs_mark_buffer_dirty(src); 2678 btrfs_mark_buffer_dirty(dst); 2679 2680 return ret; 2681 } 2682 2683 /* 2684 * helper function to insert a new root level in the tree. 2685 * A new node is allocated, and a single item is inserted to 2686 * point to the existing root 2687 * 2688 * returns zero on success or < 0 on failure. 2689 */ 2690 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2691 struct btrfs_root *root, 2692 struct btrfs_path *path, int level) 2693 { 2694 struct btrfs_fs_info *fs_info = root->fs_info; 2695 u64 lower_gen; 2696 struct extent_buffer *lower; 2697 struct extent_buffer *c; 2698 struct extent_buffer *old; 2699 struct btrfs_disk_key lower_key; 2700 int ret; 2701 2702 BUG_ON(path->nodes[level]); 2703 BUG_ON(path->nodes[level-1] != root->node); 2704 2705 lower = path->nodes[level-1]; 2706 if (level == 1) 2707 btrfs_item_key(lower, &lower_key, 0); 2708 else 2709 btrfs_node_key(lower, &lower_key, 0); 2710 2711 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 2712 &lower_key, level, root->node->start, 0, 2713 BTRFS_NESTING_NEW_ROOT); 2714 if (IS_ERR(c)) 2715 return PTR_ERR(c); 2716 2717 root_add_used(root, fs_info->nodesize); 2718 2719 btrfs_set_header_nritems(c, 1); 2720 btrfs_set_node_key(c, &lower_key, 0); 2721 btrfs_set_node_blockptr(c, 0, lower->start); 2722 lower_gen = btrfs_header_generation(lower); 2723 WARN_ON(lower_gen != trans->transid); 2724 2725 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2726 2727 btrfs_mark_buffer_dirty(c); 2728 2729 old = root->node; 2730 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2731 BUG_ON(ret < 0); 2732 rcu_assign_pointer(root->node, c); 2733 2734 /* the super has an extra ref to root->node */ 2735 free_extent_buffer(old); 2736 2737 add_root_to_dirty_list(root); 2738 atomic_inc(&c->refs); 2739 path->nodes[level] = c; 2740 path->locks[level] = BTRFS_WRITE_LOCK; 2741 path->slots[level] = 0; 2742 return 0; 2743 } 2744 2745 /* 2746 * worker function to insert a single pointer in a node. 2747 * the node should have enough room for the pointer already 2748 * 2749 * slot and level indicate where you want the key to go, and 2750 * blocknr is the block the key points to. 2751 */ 2752 static void insert_ptr(struct btrfs_trans_handle *trans, 2753 struct btrfs_path *path, 2754 struct btrfs_disk_key *key, u64 bytenr, 2755 int slot, int level) 2756 { 2757 struct extent_buffer *lower; 2758 int nritems; 2759 int ret; 2760 2761 BUG_ON(!path->nodes[level]); 2762 btrfs_assert_tree_write_locked(path->nodes[level]); 2763 lower = path->nodes[level]; 2764 nritems = btrfs_header_nritems(lower); 2765 BUG_ON(slot > nritems); 2766 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2767 if (slot != nritems) { 2768 if (level) { 2769 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2770 slot, nritems - slot); 2771 BUG_ON(ret < 0); 2772 } 2773 memmove_extent_buffer(lower, 2774 btrfs_node_key_ptr_offset(slot + 1), 2775 btrfs_node_key_ptr_offset(slot), 2776 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2777 } 2778 if (level) { 2779 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2780 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); 2781 BUG_ON(ret < 0); 2782 } 2783 btrfs_set_node_key(lower, key, slot); 2784 btrfs_set_node_blockptr(lower, slot, bytenr); 2785 WARN_ON(trans->transid == 0); 2786 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2787 btrfs_set_header_nritems(lower, nritems + 1); 2788 btrfs_mark_buffer_dirty(lower); 2789 } 2790 2791 /* 2792 * split the node at the specified level in path in two. 2793 * The path is corrected to point to the appropriate node after the split 2794 * 2795 * Before splitting this tries to make some room in the node by pushing 2796 * left and right, if either one works, it returns right away. 2797 * 2798 * returns 0 on success and < 0 on failure 2799 */ 2800 static noinline int split_node(struct btrfs_trans_handle *trans, 2801 struct btrfs_root *root, 2802 struct btrfs_path *path, int level) 2803 { 2804 struct btrfs_fs_info *fs_info = root->fs_info; 2805 struct extent_buffer *c; 2806 struct extent_buffer *split; 2807 struct btrfs_disk_key disk_key; 2808 int mid; 2809 int ret; 2810 u32 c_nritems; 2811 2812 c = path->nodes[level]; 2813 WARN_ON(btrfs_header_generation(c) != trans->transid); 2814 if (c == root->node) { 2815 /* 2816 * trying to split the root, lets make a new one 2817 * 2818 * tree mod log: We don't log_removal old root in 2819 * insert_new_root, because that root buffer will be kept as a 2820 * normal node. We are going to log removal of half of the 2821 * elements below with btrfs_tree_mod_log_eb_copy(). We're 2822 * holding a tree lock on the buffer, which is why we cannot 2823 * race with other tree_mod_log users. 2824 */ 2825 ret = insert_new_root(trans, root, path, level + 1); 2826 if (ret) 2827 return ret; 2828 } else { 2829 ret = push_nodes_for_insert(trans, root, path, level); 2830 c = path->nodes[level]; 2831 if (!ret && btrfs_header_nritems(c) < 2832 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 2833 return 0; 2834 if (ret < 0) 2835 return ret; 2836 } 2837 2838 c_nritems = btrfs_header_nritems(c); 2839 mid = (c_nritems + 1) / 2; 2840 btrfs_node_key(c, &disk_key, mid); 2841 2842 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 2843 &disk_key, level, c->start, 0, 2844 BTRFS_NESTING_SPLIT); 2845 if (IS_ERR(split)) 2846 return PTR_ERR(split); 2847 2848 root_add_used(root, fs_info->nodesize); 2849 ASSERT(btrfs_header_level(c) == level); 2850 2851 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 2852 if (ret) { 2853 btrfs_abort_transaction(trans, ret); 2854 return ret; 2855 } 2856 copy_extent_buffer(split, c, 2857 btrfs_node_key_ptr_offset(0), 2858 btrfs_node_key_ptr_offset(mid), 2859 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 2860 btrfs_set_header_nritems(split, c_nritems - mid); 2861 btrfs_set_header_nritems(c, mid); 2862 2863 btrfs_mark_buffer_dirty(c); 2864 btrfs_mark_buffer_dirty(split); 2865 2866 insert_ptr(trans, path, &disk_key, split->start, 2867 path->slots[level + 1] + 1, level + 1); 2868 2869 if (path->slots[level] >= mid) { 2870 path->slots[level] -= mid; 2871 btrfs_tree_unlock(c); 2872 free_extent_buffer(c); 2873 path->nodes[level] = split; 2874 path->slots[level + 1] += 1; 2875 } else { 2876 btrfs_tree_unlock(split); 2877 free_extent_buffer(split); 2878 } 2879 return 0; 2880 } 2881 2882 /* 2883 * how many bytes are required to store the items in a leaf. start 2884 * and nr indicate which items in the leaf to check. This totals up the 2885 * space used both by the item structs and the item data 2886 */ 2887 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 2888 { 2889 int data_len; 2890 int nritems = btrfs_header_nritems(l); 2891 int end = min(nritems, start + nr) - 1; 2892 2893 if (!nr) 2894 return 0; 2895 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 2896 data_len = data_len - btrfs_item_offset(l, end); 2897 data_len += sizeof(struct btrfs_item) * nr; 2898 WARN_ON(data_len < 0); 2899 return data_len; 2900 } 2901 2902 /* 2903 * The space between the end of the leaf items and 2904 * the start of the leaf data. IOW, how much room 2905 * the leaf has left for both items and data 2906 */ 2907 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) 2908 { 2909 struct btrfs_fs_info *fs_info = leaf->fs_info; 2910 int nritems = btrfs_header_nritems(leaf); 2911 int ret; 2912 2913 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 2914 if (ret < 0) { 2915 btrfs_crit(fs_info, 2916 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 2917 ret, 2918 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 2919 leaf_space_used(leaf, 0, nritems), nritems); 2920 } 2921 return ret; 2922 } 2923 2924 /* 2925 * min slot controls the lowest index we're willing to push to the 2926 * right. We'll push up to and including min_slot, but no lower 2927 */ 2928 static noinline int __push_leaf_right(struct btrfs_path *path, 2929 int data_size, int empty, 2930 struct extent_buffer *right, 2931 int free_space, u32 left_nritems, 2932 u32 min_slot) 2933 { 2934 struct btrfs_fs_info *fs_info = right->fs_info; 2935 struct extent_buffer *left = path->nodes[0]; 2936 struct extent_buffer *upper = path->nodes[1]; 2937 struct btrfs_map_token token; 2938 struct btrfs_disk_key disk_key; 2939 int slot; 2940 u32 i; 2941 int push_space = 0; 2942 int push_items = 0; 2943 u32 nr; 2944 u32 right_nritems; 2945 u32 data_end; 2946 u32 this_item_size; 2947 2948 if (empty) 2949 nr = 0; 2950 else 2951 nr = max_t(u32, 1, min_slot); 2952 2953 if (path->slots[0] >= left_nritems) 2954 push_space += data_size; 2955 2956 slot = path->slots[1]; 2957 i = left_nritems - 1; 2958 while (i >= nr) { 2959 if (!empty && push_items > 0) { 2960 if (path->slots[0] > i) 2961 break; 2962 if (path->slots[0] == i) { 2963 int space = btrfs_leaf_free_space(left); 2964 2965 if (space + push_space * 2 > free_space) 2966 break; 2967 } 2968 } 2969 2970 if (path->slots[0] == i) 2971 push_space += data_size; 2972 2973 this_item_size = btrfs_item_size(left, i); 2974 if (this_item_size + sizeof(struct btrfs_item) + 2975 push_space > free_space) 2976 break; 2977 2978 push_items++; 2979 push_space += this_item_size + sizeof(struct btrfs_item); 2980 if (i == 0) 2981 break; 2982 i--; 2983 } 2984 2985 if (push_items == 0) 2986 goto out_unlock; 2987 2988 WARN_ON(!empty && push_items == left_nritems); 2989 2990 /* push left to right */ 2991 right_nritems = btrfs_header_nritems(right); 2992 2993 push_space = btrfs_item_data_end(left, left_nritems - push_items); 2994 push_space -= leaf_data_end(left); 2995 2996 /* make room in the right data area */ 2997 data_end = leaf_data_end(right); 2998 memmove_extent_buffer(right, 2999 BTRFS_LEAF_DATA_OFFSET + data_end - push_space, 3000 BTRFS_LEAF_DATA_OFFSET + data_end, 3001 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3002 3003 /* copy from the left data area */ 3004 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + 3005 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3006 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), 3007 push_space); 3008 3009 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 3010 btrfs_item_nr_offset(0), 3011 right_nritems * sizeof(struct btrfs_item)); 3012 3013 /* copy the items from left to right */ 3014 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 3015 btrfs_item_nr_offset(left_nritems - push_items), 3016 push_items * sizeof(struct btrfs_item)); 3017 3018 /* update the item pointers */ 3019 btrfs_init_map_token(&token, right); 3020 right_nritems += push_items; 3021 btrfs_set_header_nritems(right, right_nritems); 3022 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3023 for (i = 0; i < right_nritems; i++) { 3024 push_space -= btrfs_token_item_size(&token, i); 3025 btrfs_set_token_item_offset(&token, i, push_space); 3026 } 3027 3028 left_nritems -= push_items; 3029 btrfs_set_header_nritems(left, left_nritems); 3030 3031 if (left_nritems) 3032 btrfs_mark_buffer_dirty(left); 3033 else 3034 btrfs_clean_tree_block(left); 3035 3036 btrfs_mark_buffer_dirty(right); 3037 3038 btrfs_item_key(right, &disk_key, 0); 3039 btrfs_set_node_key(upper, &disk_key, slot + 1); 3040 btrfs_mark_buffer_dirty(upper); 3041 3042 /* then fixup the leaf pointer in the path */ 3043 if (path->slots[0] >= left_nritems) { 3044 path->slots[0] -= left_nritems; 3045 if (btrfs_header_nritems(path->nodes[0]) == 0) 3046 btrfs_clean_tree_block(path->nodes[0]); 3047 btrfs_tree_unlock(path->nodes[0]); 3048 free_extent_buffer(path->nodes[0]); 3049 path->nodes[0] = right; 3050 path->slots[1] += 1; 3051 } else { 3052 btrfs_tree_unlock(right); 3053 free_extent_buffer(right); 3054 } 3055 return 0; 3056 3057 out_unlock: 3058 btrfs_tree_unlock(right); 3059 free_extent_buffer(right); 3060 return 1; 3061 } 3062 3063 /* 3064 * push some data in the path leaf to the right, trying to free up at 3065 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3066 * 3067 * returns 1 if the push failed because the other node didn't have enough 3068 * room, 0 if everything worked out and < 0 if there were major errors. 3069 * 3070 * this will push starting from min_slot to the end of the leaf. It won't 3071 * push any slot lower than min_slot 3072 */ 3073 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3074 *root, struct btrfs_path *path, 3075 int min_data_size, int data_size, 3076 int empty, u32 min_slot) 3077 { 3078 struct extent_buffer *left = path->nodes[0]; 3079 struct extent_buffer *right; 3080 struct extent_buffer *upper; 3081 int slot; 3082 int free_space; 3083 u32 left_nritems; 3084 int ret; 3085 3086 if (!path->nodes[1]) 3087 return 1; 3088 3089 slot = path->slots[1]; 3090 upper = path->nodes[1]; 3091 if (slot >= btrfs_header_nritems(upper) - 1) 3092 return 1; 3093 3094 btrfs_assert_tree_write_locked(path->nodes[1]); 3095 3096 right = btrfs_read_node_slot(upper, slot + 1); 3097 /* 3098 * slot + 1 is not valid or we fail to read the right node, 3099 * no big deal, just return. 3100 */ 3101 if (IS_ERR(right)) 3102 return 1; 3103 3104 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 3105 3106 free_space = btrfs_leaf_free_space(right); 3107 if (free_space < data_size) 3108 goto out_unlock; 3109 3110 ret = btrfs_cow_block(trans, root, right, upper, 3111 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3112 if (ret) 3113 goto out_unlock; 3114 3115 left_nritems = btrfs_header_nritems(left); 3116 if (left_nritems == 0) 3117 goto out_unlock; 3118 3119 if (check_sibling_keys(left, right)) { 3120 ret = -EUCLEAN; 3121 btrfs_tree_unlock(right); 3122 free_extent_buffer(right); 3123 return ret; 3124 } 3125 if (path->slots[0] == left_nritems && !empty) { 3126 /* Key greater than all keys in the leaf, right neighbor has 3127 * enough room for it and we're not emptying our leaf to delete 3128 * it, therefore use right neighbor to insert the new item and 3129 * no need to touch/dirty our left leaf. */ 3130 btrfs_tree_unlock(left); 3131 free_extent_buffer(left); 3132 path->nodes[0] = right; 3133 path->slots[0] = 0; 3134 path->slots[1]++; 3135 return 0; 3136 } 3137 3138 return __push_leaf_right(path, min_data_size, empty, 3139 right, free_space, left_nritems, min_slot); 3140 out_unlock: 3141 btrfs_tree_unlock(right); 3142 free_extent_buffer(right); 3143 return 1; 3144 } 3145 3146 /* 3147 * push some data in the path leaf to the left, trying to free up at 3148 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3149 * 3150 * max_slot can put a limit on how far into the leaf we'll push items. The 3151 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 3152 * items 3153 */ 3154 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, 3155 int empty, struct extent_buffer *left, 3156 int free_space, u32 right_nritems, 3157 u32 max_slot) 3158 { 3159 struct btrfs_fs_info *fs_info = left->fs_info; 3160 struct btrfs_disk_key disk_key; 3161 struct extent_buffer *right = path->nodes[0]; 3162 int i; 3163 int push_space = 0; 3164 int push_items = 0; 3165 u32 old_left_nritems; 3166 u32 nr; 3167 int ret = 0; 3168 u32 this_item_size; 3169 u32 old_left_item_size; 3170 struct btrfs_map_token token; 3171 3172 if (empty) 3173 nr = min(right_nritems, max_slot); 3174 else 3175 nr = min(right_nritems - 1, max_slot); 3176 3177 for (i = 0; i < nr; i++) { 3178 if (!empty && push_items > 0) { 3179 if (path->slots[0] < i) 3180 break; 3181 if (path->slots[0] == i) { 3182 int space = btrfs_leaf_free_space(right); 3183 3184 if (space + push_space * 2 > free_space) 3185 break; 3186 } 3187 } 3188 3189 if (path->slots[0] == i) 3190 push_space += data_size; 3191 3192 this_item_size = btrfs_item_size(right, i); 3193 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3194 free_space) 3195 break; 3196 3197 push_items++; 3198 push_space += this_item_size + sizeof(struct btrfs_item); 3199 } 3200 3201 if (push_items == 0) { 3202 ret = 1; 3203 goto out; 3204 } 3205 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3206 3207 /* push data from right to left */ 3208 copy_extent_buffer(left, right, 3209 btrfs_item_nr_offset(btrfs_header_nritems(left)), 3210 btrfs_item_nr_offset(0), 3211 push_items * sizeof(struct btrfs_item)); 3212 3213 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3214 btrfs_item_offset(right, push_items - 1); 3215 3216 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + 3217 leaf_data_end(left) - push_space, 3218 BTRFS_LEAF_DATA_OFFSET + 3219 btrfs_item_offset(right, push_items - 1), 3220 push_space); 3221 old_left_nritems = btrfs_header_nritems(left); 3222 BUG_ON(old_left_nritems <= 0); 3223 3224 btrfs_init_map_token(&token, left); 3225 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3226 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3227 u32 ioff; 3228 3229 ioff = btrfs_token_item_offset(&token, i); 3230 btrfs_set_token_item_offset(&token, i, 3231 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3232 } 3233 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3234 3235 /* fixup right node */ 3236 if (push_items > right_nritems) 3237 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3238 right_nritems); 3239 3240 if (push_items < right_nritems) { 3241 push_space = btrfs_item_offset(right, push_items - 1) - 3242 leaf_data_end(right); 3243 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + 3244 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3245 BTRFS_LEAF_DATA_OFFSET + 3246 leaf_data_end(right), push_space); 3247 3248 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 3249 btrfs_item_nr_offset(push_items), 3250 (btrfs_header_nritems(right) - push_items) * 3251 sizeof(struct btrfs_item)); 3252 } 3253 3254 btrfs_init_map_token(&token, right); 3255 right_nritems -= push_items; 3256 btrfs_set_header_nritems(right, right_nritems); 3257 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3258 for (i = 0; i < right_nritems; i++) { 3259 push_space = push_space - btrfs_token_item_size(&token, i); 3260 btrfs_set_token_item_offset(&token, i, push_space); 3261 } 3262 3263 btrfs_mark_buffer_dirty(left); 3264 if (right_nritems) 3265 btrfs_mark_buffer_dirty(right); 3266 else 3267 btrfs_clean_tree_block(right); 3268 3269 btrfs_item_key(right, &disk_key, 0); 3270 fixup_low_keys(path, &disk_key, 1); 3271 3272 /* then fixup the leaf pointer in the path */ 3273 if (path->slots[0] < push_items) { 3274 path->slots[0] += old_left_nritems; 3275 btrfs_tree_unlock(path->nodes[0]); 3276 free_extent_buffer(path->nodes[0]); 3277 path->nodes[0] = left; 3278 path->slots[1] -= 1; 3279 } else { 3280 btrfs_tree_unlock(left); 3281 free_extent_buffer(left); 3282 path->slots[0] -= push_items; 3283 } 3284 BUG_ON(path->slots[0] < 0); 3285 return ret; 3286 out: 3287 btrfs_tree_unlock(left); 3288 free_extent_buffer(left); 3289 return ret; 3290 } 3291 3292 /* 3293 * push some data in the path leaf to the left, trying to free up at 3294 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3295 * 3296 * max_slot can put a limit on how far into the leaf we'll push items. The 3297 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3298 * items 3299 */ 3300 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3301 *root, struct btrfs_path *path, int min_data_size, 3302 int data_size, int empty, u32 max_slot) 3303 { 3304 struct extent_buffer *right = path->nodes[0]; 3305 struct extent_buffer *left; 3306 int slot; 3307 int free_space; 3308 u32 right_nritems; 3309 int ret = 0; 3310 3311 slot = path->slots[1]; 3312 if (slot == 0) 3313 return 1; 3314 if (!path->nodes[1]) 3315 return 1; 3316 3317 right_nritems = btrfs_header_nritems(right); 3318 if (right_nritems == 0) 3319 return 1; 3320 3321 btrfs_assert_tree_write_locked(path->nodes[1]); 3322 3323 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3324 /* 3325 * slot - 1 is not valid or we fail to read the left node, 3326 * no big deal, just return. 3327 */ 3328 if (IS_ERR(left)) 3329 return 1; 3330 3331 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3332 3333 free_space = btrfs_leaf_free_space(left); 3334 if (free_space < data_size) { 3335 ret = 1; 3336 goto out; 3337 } 3338 3339 ret = btrfs_cow_block(trans, root, left, 3340 path->nodes[1], slot - 1, &left, 3341 BTRFS_NESTING_LEFT_COW); 3342 if (ret) { 3343 /* we hit -ENOSPC, but it isn't fatal here */ 3344 if (ret == -ENOSPC) 3345 ret = 1; 3346 goto out; 3347 } 3348 3349 if (check_sibling_keys(left, right)) { 3350 ret = -EUCLEAN; 3351 goto out; 3352 } 3353 return __push_leaf_left(path, min_data_size, 3354 empty, left, free_space, right_nritems, 3355 max_slot); 3356 out: 3357 btrfs_tree_unlock(left); 3358 free_extent_buffer(left); 3359 return ret; 3360 } 3361 3362 /* 3363 * split the path's leaf in two, making sure there is at least data_size 3364 * available for the resulting leaf level of the path. 3365 */ 3366 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3367 struct btrfs_path *path, 3368 struct extent_buffer *l, 3369 struct extent_buffer *right, 3370 int slot, int mid, int nritems) 3371 { 3372 struct btrfs_fs_info *fs_info = trans->fs_info; 3373 int data_copy_size; 3374 int rt_data_off; 3375 int i; 3376 struct btrfs_disk_key disk_key; 3377 struct btrfs_map_token token; 3378 3379 nritems = nritems - mid; 3380 btrfs_set_header_nritems(right, nritems); 3381 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3382 3383 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3384 btrfs_item_nr_offset(mid), 3385 nritems * sizeof(struct btrfs_item)); 3386 3387 copy_extent_buffer(right, l, 3388 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - 3389 data_copy_size, BTRFS_LEAF_DATA_OFFSET + 3390 leaf_data_end(l), data_copy_size); 3391 3392 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3393 3394 btrfs_init_map_token(&token, right); 3395 for (i = 0; i < nritems; i++) { 3396 u32 ioff; 3397 3398 ioff = btrfs_token_item_offset(&token, i); 3399 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); 3400 } 3401 3402 btrfs_set_header_nritems(l, mid); 3403 btrfs_item_key(right, &disk_key, 0); 3404 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3405 3406 btrfs_mark_buffer_dirty(right); 3407 btrfs_mark_buffer_dirty(l); 3408 BUG_ON(path->slots[0] != slot); 3409 3410 if (mid <= slot) { 3411 btrfs_tree_unlock(path->nodes[0]); 3412 free_extent_buffer(path->nodes[0]); 3413 path->nodes[0] = right; 3414 path->slots[0] -= mid; 3415 path->slots[1] += 1; 3416 } else { 3417 btrfs_tree_unlock(right); 3418 free_extent_buffer(right); 3419 } 3420 3421 BUG_ON(path->slots[0] < 0); 3422 } 3423 3424 /* 3425 * double splits happen when we need to insert a big item in the middle 3426 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3427 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3428 * A B C 3429 * 3430 * We avoid this by trying to push the items on either side of our target 3431 * into the adjacent leaves. If all goes well we can avoid the double split 3432 * completely. 3433 */ 3434 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3435 struct btrfs_root *root, 3436 struct btrfs_path *path, 3437 int data_size) 3438 { 3439 int ret; 3440 int progress = 0; 3441 int slot; 3442 u32 nritems; 3443 int space_needed = data_size; 3444 3445 slot = path->slots[0]; 3446 if (slot < btrfs_header_nritems(path->nodes[0])) 3447 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3448 3449 /* 3450 * try to push all the items after our slot into the 3451 * right leaf 3452 */ 3453 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3454 if (ret < 0) 3455 return ret; 3456 3457 if (ret == 0) 3458 progress++; 3459 3460 nritems = btrfs_header_nritems(path->nodes[0]); 3461 /* 3462 * our goal is to get our slot at the start or end of a leaf. If 3463 * we've done so we're done 3464 */ 3465 if (path->slots[0] == 0 || path->slots[0] == nritems) 3466 return 0; 3467 3468 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3469 return 0; 3470 3471 /* try to push all the items before our slot into the next leaf */ 3472 slot = path->slots[0]; 3473 space_needed = data_size; 3474 if (slot > 0) 3475 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3476 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3477 if (ret < 0) 3478 return ret; 3479 3480 if (ret == 0) 3481 progress++; 3482 3483 if (progress) 3484 return 0; 3485 return 1; 3486 } 3487 3488 /* 3489 * split the path's leaf in two, making sure there is at least data_size 3490 * available for the resulting leaf level of the path. 3491 * 3492 * returns 0 if all went well and < 0 on failure. 3493 */ 3494 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3495 struct btrfs_root *root, 3496 const struct btrfs_key *ins_key, 3497 struct btrfs_path *path, int data_size, 3498 int extend) 3499 { 3500 struct btrfs_disk_key disk_key; 3501 struct extent_buffer *l; 3502 u32 nritems; 3503 int mid; 3504 int slot; 3505 struct extent_buffer *right; 3506 struct btrfs_fs_info *fs_info = root->fs_info; 3507 int ret = 0; 3508 int wret; 3509 int split; 3510 int num_doubles = 0; 3511 int tried_avoid_double = 0; 3512 3513 l = path->nodes[0]; 3514 slot = path->slots[0]; 3515 if (extend && data_size + btrfs_item_size(l, slot) + 3516 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3517 return -EOVERFLOW; 3518 3519 /* first try to make some room by pushing left and right */ 3520 if (data_size && path->nodes[1]) { 3521 int space_needed = data_size; 3522 3523 if (slot < btrfs_header_nritems(l)) 3524 space_needed -= btrfs_leaf_free_space(l); 3525 3526 wret = push_leaf_right(trans, root, path, space_needed, 3527 space_needed, 0, 0); 3528 if (wret < 0) 3529 return wret; 3530 if (wret) { 3531 space_needed = data_size; 3532 if (slot > 0) 3533 space_needed -= btrfs_leaf_free_space(l); 3534 wret = push_leaf_left(trans, root, path, space_needed, 3535 space_needed, 0, (u32)-1); 3536 if (wret < 0) 3537 return wret; 3538 } 3539 l = path->nodes[0]; 3540 3541 /* did the pushes work? */ 3542 if (btrfs_leaf_free_space(l) >= data_size) 3543 return 0; 3544 } 3545 3546 if (!path->nodes[1]) { 3547 ret = insert_new_root(trans, root, path, 1); 3548 if (ret) 3549 return ret; 3550 } 3551 again: 3552 split = 1; 3553 l = path->nodes[0]; 3554 slot = path->slots[0]; 3555 nritems = btrfs_header_nritems(l); 3556 mid = (nritems + 1) / 2; 3557 3558 if (mid <= slot) { 3559 if (nritems == 1 || 3560 leaf_space_used(l, mid, nritems - mid) + data_size > 3561 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3562 if (slot >= nritems) { 3563 split = 0; 3564 } else { 3565 mid = slot; 3566 if (mid != nritems && 3567 leaf_space_used(l, mid, nritems - mid) + 3568 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3569 if (data_size && !tried_avoid_double) 3570 goto push_for_double; 3571 split = 2; 3572 } 3573 } 3574 } 3575 } else { 3576 if (leaf_space_used(l, 0, mid) + data_size > 3577 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3578 if (!extend && data_size && slot == 0) { 3579 split = 0; 3580 } else if ((extend || !data_size) && slot == 0) { 3581 mid = 1; 3582 } else { 3583 mid = slot; 3584 if (mid != nritems && 3585 leaf_space_used(l, mid, nritems - mid) + 3586 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3587 if (data_size && !tried_avoid_double) 3588 goto push_for_double; 3589 split = 2; 3590 } 3591 } 3592 } 3593 } 3594 3595 if (split == 0) 3596 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3597 else 3598 btrfs_item_key(l, &disk_key, mid); 3599 3600 /* 3601 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double 3602 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3603 * subclasses, which is 8 at the time of this patch, and we've maxed it 3604 * out. In the future we could add a 3605 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3606 * use BTRFS_NESTING_NEW_ROOT. 3607 */ 3608 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3609 &disk_key, 0, l->start, 0, 3610 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3611 BTRFS_NESTING_SPLIT); 3612 if (IS_ERR(right)) 3613 return PTR_ERR(right); 3614 3615 root_add_used(root, fs_info->nodesize); 3616 3617 if (split == 0) { 3618 if (mid <= slot) { 3619 btrfs_set_header_nritems(right, 0); 3620 insert_ptr(trans, path, &disk_key, 3621 right->start, path->slots[1] + 1, 1); 3622 btrfs_tree_unlock(path->nodes[0]); 3623 free_extent_buffer(path->nodes[0]); 3624 path->nodes[0] = right; 3625 path->slots[0] = 0; 3626 path->slots[1] += 1; 3627 } else { 3628 btrfs_set_header_nritems(right, 0); 3629 insert_ptr(trans, path, &disk_key, 3630 right->start, path->slots[1], 1); 3631 btrfs_tree_unlock(path->nodes[0]); 3632 free_extent_buffer(path->nodes[0]); 3633 path->nodes[0] = right; 3634 path->slots[0] = 0; 3635 if (path->slots[1] == 0) 3636 fixup_low_keys(path, &disk_key, 1); 3637 } 3638 /* 3639 * We create a new leaf 'right' for the required ins_len and 3640 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3641 * the content of ins_len to 'right'. 3642 */ 3643 return ret; 3644 } 3645 3646 copy_for_split(trans, path, l, right, slot, mid, nritems); 3647 3648 if (split == 2) { 3649 BUG_ON(num_doubles != 0); 3650 num_doubles++; 3651 goto again; 3652 } 3653 3654 return 0; 3655 3656 push_for_double: 3657 push_for_double_split(trans, root, path, data_size); 3658 tried_avoid_double = 1; 3659 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3660 return 0; 3661 goto again; 3662 } 3663 3664 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3665 struct btrfs_root *root, 3666 struct btrfs_path *path, int ins_len) 3667 { 3668 struct btrfs_key key; 3669 struct extent_buffer *leaf; 3670 struct btrfs_file_extent_item *fi; 3671 u64 extent_len = 0; 3672 u32 item_size; 3673 int ret; 3674 3675 leaf = path->nodes[0]; 3676 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3677 3678 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3679 key.type != BTRFS_EXTENT_CSUM_KEY); 3680 3681 if (btrfs_leaf_free_space(leaf) >= ins_len) 3682 return 0; 3683 3684 item_size = btrfs_item_size(leaf, path->slots[0]); 3685 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3686 fi = btrfs_item_ptr(leaf, path->slots[0], 3687 struct btrfs_file_extent_item); 3688 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3689 } 3690 btrfs_release_path(path); 3691 3692 path->keep_locks = 1; 3693 path->search_for_split = 1; 3694 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3695 path->search_for_split = 0; 3696 if (ret > 0) 3697 ret = -EAGAIN; 3698 if (ret < 0) 3699 goto err; 3700 3701 ret = -EAGAIN; 3702 leaf = path->nodes[0]; 3703 /* if our item isn't there, return now */ 3704 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3705 goto err; 3706 3707 /* the leaf has changed, it now has room. return now */ 3708 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3709 goto err; 3710 3711 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3712 fi = btrfs_item_ptr(leaf, path->slots[0], 3713 struct btrfs_file_extent_item); 3714 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3715 goto err; 3716 } 3717 3718 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3719 if (ret) 3720 goto err; 3721 3722 path->keep_locks = 0; 3723 btrfs_unlock_up_safe(path, 1); 3724 return 0; 3725 err: 3726 path->keep_locks = 0; 3727 return ret; 3728 } 3729 3730 static noinline int split_item(struct btrfs_path *path, 3731 const struct btrfs_key *new_key, 3732 unsigned long split_offset) 3733 { 3734 struct extent_buffer *leaf; 3735 int orig_slot, slot; 3736 char *buf; 3737 u32 nritems; 3738 u32 item_size; 3739 u32 orig_offset; 3740 struct btrfs_disk_key disk_key; 3741 3742 leaf = path->nodes[0]; 3743 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); 3744 3745 orig_slot = path->slots[0]; 3746 orig_offset = btrfs_item_offset(leaf, path->slots[0]); 3747 item_size = btrfs_item_size(leaf, path->slots[0]); 3748 3749 buf = kmalloc(item_size, GFP_NOFS); 3750 if (!buf) 3751 return -ENOMEM; 3752 3753 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3754 path->slots[0]), item_size); 3755 3756 slot = path->slots[0] + 1; 3757 nritems = btrfs_header_nritems(leaf); 3758 if (slot != nritems) { 3759 /* shift the items */ 3760 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 3761 btrfs_item_nr_offset(slot), 3762 (nritems - slot) * sizeof(struct btrfs_item)); 3763 } 3764 3765 btrfs_cpu_key_to_disk(&disk_key, new_key); 3766 btrfs_set_item_key(leaf, &disk_key, slot); 3767 3768 btrfs_set_item_offset(leaf, slot, orig_offset); 3769 btrfs_set_item_size(leaf, slot, item_size - split_offset); 3770 3771 btrfs_set_item_offset(leaf, orig_slot, 3772 orig_offset + item_size - split_offset); 3773 btrfs_set_item_size(leaf, orig_slot, split_offset); 3774 3775 btrfs_set_header_nritems(leaf, nritems + 1); 3776 3777 /* write the data for the start of the original item */ 3778 write_extent_buffer(leaf, buf, 3779 btrfs_item_ptr_offset(leaf, path->slots[0]), 3780 split_offset); 3781 3782 /* write the data for the new item */ 3783 write_extent_buffer(leaf, buf + split_offset, 3784 btrfs_item_ptr_offset(leaf, slot), 3785 item_size - split_offset); 3786 btrfs_mark_buffer_dirty(leaf); 3787 3788 BUG_ON(btrfs_leaf_free_space(leaf) < 0); 3789 kfree(buf); 3790 return 0; 3791 } 3792 3793 /* 3794 * This function splits a single item into two items, 3795 * giving 'new_key' to the new item and splitting the 3796 * old one at split_offset (from the start of the item). 3797 * 3798 * The path may be released by this operation. After 3799 * the split, the path is pointing to the old item. The 3800 * new item is going to be in the same node as the old one. 3801 * 3802 * Note, the item being split must be smaller enough to live alone on 3803 * a tree block with room for one extra struct btrfs_item 3804 * 3805 * This allows us to split the item in place, keeping a lock on the 3806 * leaf the entire time. 3807 */ 3808 int btrfs_split_item(struct btrfs_trans_handle *trans, 3809 struct btrfs_root *root, 3810 struct btrfs_path *path, 3811 const struct btrfs_key *new_key, 3812 unsigned long split_offset) 3813 { 3814 int ret; 3815 ret = setup_leaf_for_split(trans, root, path, 3816 sizeof(struct btrfs_item)); 3817 if (ret) 3818 return ret; 3819 3820 ret = split_item(path, new_key, split_offset); 3821 return ret; 3822 } 3823 3824 /* 3825 * make the item pointed to by the path smaller. new_size indicates 3826 * how small to make it, and from_end tells us if we just chop bytes 3827 * off the end of the item or if we shift the item to chop bytes off 3828 * the front. 3829 */ 3830 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) 3831 { 3832 int slot; 3833 struct extent_buffer *leaf; 3834 u32 nritems; 3835 unsigned int data_end; 3836 unsigned int old_data_start; 3837 unsigned int old_size; 3838 unsigned int size_diff; 3839 int i; 3840 struct btrfs_map_token token; 3841 3842 leaf = path->nodes[0]; 3843 slot = path->slots[0]; 3844 3845 old_size = btrfs_item_size(leaf, slot); 3846 if (old_size == new_size) 3847 return; 3848 3849 nritems = btrfs_header_nritems(leaf); 3850 data_end = leaf_data_end(leaf); 3851 3852 old_data_start = btrfs_item_offset(leaf, slot); 3853 3854 size_diff = old_size - new_size; 3855 3856 BUG_ON(slot < 0); 3857 BUG_ON(slot >= nritems); 3858 3859 /* 3860 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3861 */ 3862 /* first correct the data pointers */ 3863 btrfs_init_map_token(&token, leaf); 3864 for (i = slot; i < nritems; i++) { 3865 u32 ioff; 3866 3867 ioff = btrfs_token_item_offset(&token, i); 3868 btrfs_set_token_item_offset(&token, i, ioff + size_diff); 3869 } 3870 3871 /* shift the data */ 3872 if (from_end) { 3873 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3874 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3875 data_end, old_data_start + new_size - data_end); 3876 } else { 3877 struct btrfs_disk_key disk_key; 3878 u64 offset; 3879 3880 btrfs_item_key(leaf, &disk_key, slot); 3881 3882 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 3883 unsigned long ptr; 3884 struct btrfs_file_extent_item *fi; 3885 3886 fi = btrfs_item_ptr(leaf, slot, 3887 struct btrfs_file_extent_item); 3888 fi = (struct btrfs_file_extent_item *)( 3889 (unsigned long)fi - size_diff); 3890 3891 if (btrfs_file_extent_type(leaf, fi) == 3892 BTRFS_FILE_EXTENT_INLINE) { 3893 ptr = btrfs_item_ptr_offset(leaf, slot); 3894 memmove_extent_buffer(leaf, ptr, 3895 (unsigned long)fi, 3896 BTRFS_FILE_EXTENT_INLINE_DATA_START); 3897 } 3898 } 3899 3900 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3901 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3902 data_end, old_data_start - data_end); 3903 3904 offset = btrfs_disk_key_offset(&disk_key); 3905 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 3906 btrfs_set_item_key(leaf, &disk_key, slot); 3907 if (slot == 0) 3908 fixup_low_keys(path, &disk_key, 1); 3909 } 3910 3911 btrfs_set_item_size(leaf, slot, new_size); 3912 btrfs_mark_buffer_dirty(leaf); 3913 3914 if (btrfs_leaf_free_space(leaf) < 0) { 3915 btrfs_print_leaf(leaf); 3916 BUG(); 3917 } 3918 } 3919 3920 /* 3921 * make the item pointed to by the path bigger, data_size is the added size. 3922 */ 3923 void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 3924 { 3925 int slot; 3926 struct extent_buffer *leaf; 3927 u32 nritems; 3928 unsigned int data_end; 3929 unsigned int old_data; 3930 unsigned int old_size; 3931 int i; 3932 struct btrfs_map_token token; 3933 3934 leaf = path->nodes[0]; 3935 3936 nritems = btrfs_header_nritems(leaf); 3937 data_end = leaf_data_end(leaf); 3938 3939 if (btrfs_leaf_free_space(leaf) < data_size) { 3940 btrfs_print_leaf(leaf); 3941 BUG(); 3942 } 3943 slot = path->slots[0]; 3944 old_data = btrfs_item_data_end(leaf, slot); 3945 3946 BUG_ON(slot < 0); 3947 if (slot >= nritems) { 3948 btrfs_print_leaf(leaf); 3949 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 3950 slot, nritems); 3951 BUG(); 3952 } 3953 3954 /* 3955 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3956 */ 3957 /* first correct the data pointers */ 3958 btrfs_init_map_token(&token, leaf); 3959 for (i = slot; i < nritems; i++) { 3960 u32 ioff; 3961 3962 ioff = btrfs_token_item_offset(&token, i); 3963 btrfs_set_token_item_offset(&token, i, ioff - data_size); 3964 } 3965 3966 /* shift the data */ 3967 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3968 data_end - data_size, BTRFS_LEAF_DATA_OFFSET + 3969 data_end, old_data - data_end); 3970 3971 data_end = old_data; 3972 old_size = btrfs_item_size(leaf, slot); 3973 btrfs_set_item_size(leaf, slot, old_size + data_size); 3974 btrfs_mark_buffer_dirty(leaf); 3975 3976 if (btrfs_leaf_free_space(leaf) < 0) { 3977 btrfs_print_leaf(leaf); 3978 BUG(); 3979 } 3980 } 3981 3982 /** 3983 * setup_items_for_insert - Helper called before inserting one or more items 3984 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work 3985 * in a function that doesn't call btrfs_search_slot 3986 * 3987 * @root: root we are inserting items to 3988 * @path: points to the leaf/slot where we are going to insert new items 3989 * @batch: information about the batch of items to insert 3990 */ 3991 static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 3992 const struct btrfs_item_batch *batch) 3993 { 3994 struct btrfs_fs_info *fs_info = root->fs_info; 3995 int i; 3996 u32 nritems; 3997 unsigned int data_end; 3998 struct btrfs_disk_key disk_key; 3999 struct extent_buffer *leaf; 4000 int slot; 4001 struct btrfs_map_token token; 4002 u32 total_size; 4003 4004 /* 4005 * Before anything else, update keys in the parent and other ancestors 4006 * if needed, then release the write locks on them, so that other tasks 4007 * can use them while we modify the leaf. 4008 */ 4009 if (path->slots[0] == 0) { 4010 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); 4011 fixup_low_keys(path, &disk_key, 1); 4012 } 4013 btrfs_unlock_up_safe(path, 1); 4014 4015 leaf = path->nodes[0]; 4016 slot = path->slots[0]; 4017 4018 nritems = btrfs_header_nritems(leaf); 4019 data_end = leaf_data_end(leaf); 4020 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4021 4022 if (btrfs_leaf_free_space(leaf) < total_size) { 4023 btrfs_print_leaf(leaf); 4024 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4025 total_size, btrfs_leaf_free_space(leaf)); 4026 BUG(); 4027 } 4028 4029 btrfs_init_map_token(&token, leaf); 4030 if (slot != nritems) { 4031 unsigned int old_data = btrfs_item_data_end(leaf, slot); 4032 4033 if (old_data < data_end) { 4034 btrfs_print_leaf(leaf); 4035 btrfs_crit(fs_info, 4036 "item at slot %d with data offset %u beyond data end of leaf %u", 4037 slot, old_data, data_end); 4038 BUG(); 4039 } 4040 /* 4041 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4042 */ 4043 /* first correct the data pointers */ 4044 for (i = slot; i < nritems; i++) { 4045 u32 ioff; 4046 4047 ioff = btrfs_token_item_offset(&token, i); 4048 btrfs_set_token_item_offset(&token, i, 4049 ioff - batch->total_data_size); 4050 } 4051 /* shift the items */ 4052 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), 4053 btrfs_item_nr_offset(slot), 4054 (nritems - slot) * sizeof(struct btrfs_item)); 4055 4056 /* shift the data */ 4057 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4058 data_end - batch->total_data_size, 4059 BTRFS_LEAF_DATA_OFFSET + data_end, 4060 old_data - data_end); 4061 data_end = old_data; 4062 } 4063 4064 /* setup the item for the new data */ 4065 for (i = 0; i < batch->nr; i++) { 4066 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 4067 btrfs_set_item_key(leaf, &disk_key, slot + i); 4068 data_end -= batch->data_sizes[i]; 4069 btrfs_set_token_item_offset(&token, slot + i, data_end); 4070 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); 4071 } 4072 4073 btrfs_set_header_nritems(leaf, nritems + batch->nr); 4074 btrfs_mark_buffer_dirty(leaf); 4075 4076 if (btrfs_leaf_free_space(leaf) < 0) { 4077 btrfs_print_leaf(leaf); 4078 BUG(); 4079 } 4080 } 4081 4082 /* 4083 * Insert a new item into a leaf. 4084 * 4085 * @root: The root of the btree. 4086 * @path: A path pointing to the target leaf and slot. 4087 * @key: The key of the new item. 4088 * @data_size: The size of the data associated with the new key. 4089 */ 4090 void btrfs_setup_item_for_insert(struct btrfs_root *root, 4091 struct btrfs_path *path, 4092 const struct btrfs_key *key, 4093 u32 data_size) 4094 { 4095 struct btrfs_item_batch batch; 4096 4097 batch.keys = key; 4098 batch.data_sizes = &data_size; 4099 batch.total_data_size = data_size; 4100 batch.nr = 1; 4101 4102 setup_items_for_insert(root, path, &batch); 4103 } 4104 4105 /* 4106 * Given a key and some data, insert items into the tree. 4107 * This does all the path init required, making room in the tree if needed. 4108 */ 4109 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4110 struct btrfs_root *root, 4111 struct btrfs_path *path, 4112 const struct btrfs_item_batch *batch) 4113 { 4114 int ret = 0; 4115 int slot; 4116 u32 total_size; 4117 4118 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4119 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 4120 if (ret == 0) 4121 return -EEXIST; 4122 if (ret < 0) 4123 return ret; 4124 4125 slot = path->slots[0]; 4126 BUG_ON(slot < 0); 4127 4128 setup_items_for_insert(root, path, batch); 4129 return 0; 4130 } 4131 4132 /* 4133 * Given a key and some data, insert an item into the tree. 4134 * This does all the path init required, making room in the tree if needed. 4135 */ 4136 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4137 const struct btrfs_key *cpu_key, void *data, 4138 u32 data_size) 4139 { 4140 int ret = 0; 4141 struct btrfs_path *path; 4142 struct extent_buffer *leaf; 4143 unsigned long ptr; 4144 4145 path = btrfs_alloc_path(); 4146 if (!path) 4147 return -ENOMEM; 4148 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4149 if (!ret) { 4150 leaf = path->nodes[0]; 4151 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4152 write_extent_buffer(leaf, data, ptr, data_size); 4153 btrfs_mark_buffer_dirty(leaf); 4154 } 4155 btrfs_free_path(path); 4156 return ret; 4157 } 4158 4159 /* 4160 * This function duplicates an item, giving 'new_key' to the new item. 4161 * It guarantees both items live in the same tree leaf and the new item is 4162 * contiguous with the original item. 4163 * 4164 * This allows us to split a file extent in place, keeping a lock on the leaf 4165 * the entire time. 4166 */ 4167 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4168 struct btrfs_root *root, 4169 struct btrfs_path *path, 4170 const struct btrfs_key *new_key) 4171 { 4172 struct extent_buffer *leaf; 4173 int ret; 4174 u32 item_size; 4175 4176 leaf = path->nodes[0]; 4177 item_size = btrfs_item_size(leaf, path->slots[0]); 4178 ret = setup_leaf_for_split(trans, root, path, 4179 item_size + sizeof(struct btrfs_item)); 4180 if (ret) 4181 return ret; 4182 4183 path->slots[0]++; 4184 btrfs_setup_item_for_insert(root, path, new_key, item_size); 4185 leaf = path->nodes[0]; 4186 memcpy_extent_buffer(leaf, 4187 btrfs_item_ptr_offset(leaf, path->slots[0]), 4188 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4189 item_size); 4190 return 0; 4191 } 4192 4193 /* 4194 * delete the pointer from a given node. 4195 * 4196 * the tree should have been previously balanced so the deletion does not 4197 * empty a node. 4198 */ 4199 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 4200 int level, int slot) 4201 { 4202 struct extent_buffer *parent = path->nodes[level]; 4203 u32 nritems; 4204 int ret; 4205 4206 nritems = btrfs_header_nritems(parent); 4207 if (slot != nritems - 1) { 4208 if (level) { 4209 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4210 slot + 1, nritems - slot - 1); 4211 BUG_ON(ret < 0); 4212 } 4213 memmove_extent_buffer(parent, 4214 btrfs_node_key_ptr_offset(slot), 4215 btrfs_node_key_ptr_offset(slot + 1), 4216 sizeof(struct btrfs_key_ptr) * 4217 (nritems - slot - 1)); 4218 } else if (level) { 4219 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4220 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); 4221 BUG_ON(ret < 0); 4222 } 4223 4224 nritems--; 4225 btrfs_set_header_nritems(parent, nritems); 4226 if (nritems == 0 && parent == root->node) { 4227 BUG_ON(btrfs_header_level(root->node) != 1); 4228 /* just turn the root into a leaf and break */ 4229 btrfs_set_header_level(root->node, 0); 4230 } else if (slot == 0) { 4231 struct btrfs_disk_key disk_key; 4232 4233 btrfs_node_key(parent, &disk_key, 0); 4234 fixup_low_keys(path, &disk_key, level + 1); 4235 } 4236 btrfs_mark_buffer_dirty(parent); 4237 } 4238 4239 /* 4240 * a helper function to delete the leaf pointed to by path->slots[1] and 4241 * path->nodes[1]. 4242 * 4243 * This deletes the pointer in path->nodes[1] and frees the leaf 4244 * block extent. zero is returned if it all worked out, < 0 otherwise. 4245 * 4246 * The path must have already been setup for deleting the leaf, including 4247 * all the proper balancing. path->nodes[1] must be locked. 4248 */ 4249 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, 4250 struct btrfs_root *root, 4251 struct btrfs_path *path, 4252 struct extent_buffer *leaf) 4253 { 4254 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4255 del_ptr(root, path, 1, path->slots[1]); 4256 4257 /* 4258 * btrfs_free_extent is expensive, we want to make sure we 4259 * aren't holding any locks when we call it 4260 */ 4261 btrfs_unlock_up_safe(path, 0); 4262 4263 root_sub_used(root, leaf->len); 4264 4265 atomic_inc(&leaf->refs); 4266 btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); 4267 free_extent_buffer_stale(leaf); 4268 } 4269 /* 4270 * delete the item at the leaf level in path. If that empties 4271 * the leaf, remove it from the tree 4272 */ 4273 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4274 struct btrfs_path *path, int slot, int nr) 4275 { 4276 struct btrfs_fs_info *fs_info = root->fs_info; 4277 struct extent_buffer *leaf; 4278 int ret = 0; 4279 int wret; 4280 u32 nritems; 4281 4282 leaf = path->nodes[0]; 4283 nritems = btrfs_header_nritems(leaf); 4284 4285 if (slot + nr != nritems) { 4286 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); 4287 const int data_end = leaf_data_end(leaf); 4288 struct btrfs_map_token token; 4289 u32 dsize = 0; 4290 int i; 4291 4292 for (i = 0; i < nr; i++) 4293 dsize += btrfs_item_size(leaf, slot + i); 4294 4295 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4296 data_end + dsize, 4297 BTRFS_LEAF_DATA_OFFSET + data_end, 4298 last_off - data_end); 4299 4300 btrfs_init_map_token(&token, leaf); 4301 for (i = slot + nr; i < nritems; i++) { 4302 u32 ioff; 4303 4304 ioff = btrfs_token_item_offset(&token, i); 4305 btrfs_set_token_item_offset(&token, i, ioff + dsize); 4306 } 4307 4308 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 4309 btrfs_item_nr_offset(slot + nr), 4310 sizeof(struct btrfs_item) * 4311 (nritems - slot - nr)); 4312 } 4313 btrfs_set_header_nritems(leaf, nritems - nr); 4314 nritems -= nr; 4315 4316 /* delete the leaf if we've emptied it */ 4317 if (nritems == 0) { 4318 if (leaf == root->node) { 4319 btrfs_set_header_level(leaf, 0); 4320 } else { 4321 btrfs_clean_tree_block(leaf); 4322 btrfs_del_leaf(trans, root, path, leaf); 4323 } 4324 } else { 4325 int used = leaf_space_used(leaf, 0, nritems); 4326 if (slot == 0) { 4327 struct btrfs_disk_key disk_key; 4328 4329 btrfs_item_key(leaf, &disk_key, 0); 4330 fixup_low_keys(path, &disk_key, 1); 4331 } 4332 4333 /* 4334 * Try to delete the leaf if it is mostly empty. We do this by 4335 * trying to move all its items into its left and right neighbours. 4336 * If we can't move all the items, then we don't delete it - it's 4337 * not ideal, but future insertions might fill the leaf with more 4338 * items, or items from other leaves might be moved later into our 4339 * leaf due to deletions on those leaves. 4340 */ 4341 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4342 u32 min_push_space; 4343 4344 /* push_leaf_left fixes the path. 4345 * make sure the path still points to our leaf 4346 * for possible call to del_ptr below 4347 */ 4348 slot = path->slots[1]; 4349 atomic_inc(&leaf->refs); 4350 /* 4351 * We want to be able to at least push one item to the 4352 * left neighbour leaf, and that's the first item. 4353 */ 4354 min_push_space = sizeof(struct btrfs_item) + 4355 btrfs_item_size(leaf, 0); 4356 wret = push_leaf_left(trans, root, path, 0, 4357 min_push_space, 1, (u32)-1); 4358 if (wret < 0 && wret != -ENOSPC) 4359 ret = wret; 4360 4361 if (path->nodes[0] == leaf && 4362 btrfs_header_nritems(leaf)) { 4363 /* 4364 * If we were not able to push all items from our 4365 * leaf to its left neighbour, then attempt to 4366 * either push all the remaining items to the 4367 * right neighbour or none. There's no advantage 4368 * in pushing only some items, instead of all, as 4369 * it's pointless to end up with a leaf having 4370 * too few items while the neighbours can be full 4371 * or nearly full. 4372 */ 4373 nritems = btrfs_header_nritems(leaf); 4374 min_push_space = leaf_space_used(leaf, 0, nritems); 4375 wret = push_leaf_right(trans, root, path, 0, 4376 min_push_space, 1, 0); 4377 if (wret < 0 && wret != -ENOSPC) 4378 ret = wret; 4379 } 4380 4381 if (btrfs_header_nritems(leaf) == 0) { 4382 path->slots[1] = slot; 4383 btrfs_del_leaf(trans, root, path, leaf); 4384 free_extent_buffer(leaf); 4385 ret = 0; 4386 } else { 4387 /* if we're still in the path, make sure 4388 * we're dirty. Otherwise, one of the 4389 * push_leaf functions must have already 4390 * dirtied this buffer 4391 */ 4392 if (path->nodes[0] == leaf) 4393 btrfs_mark_buffer_dirty(leaf); 4394 free_extent_buffer(leaf); 4395 } 4396 } else { 4397 btrfs_mark_buffer_dirty(leaf); 4398 } 4399 } 4400 return ret; 4401 } 4402 4403 /* 4404 * search the tree again to find a leaf with lesser keys 4405 * returns 0 if it found something or 1 if there are no lesser leaves. 4406 * returns < 0 on io errors. 4407 * 4408 * This may release the path, and so you may lose any locks held at the 4409 * time you call it. 4410 */ 4411 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 4412 { 4413 struct btrfs_key key; 4414 struct btrfs_disk_key found_key; 4415 int ret; 4416 4417 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 4418 4419 if (key.offset > 0) { 4420 key.offset--; 4421 } else if (key.type > 0) { 4422 key.type--; 4423 key.offset = (u64)-1; 4424 } else if (key.objectid > 0) { 4425 key.objectid--; 4426 key.type = (u8)-1; 4427 key.offset = (u64)-1; 4428 } else { 4429 return 1; 4430 } 4431 4432 btrfs_release_path(path); 4433 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4434 if (ret < 0) 4435 return ret; 4436 btrfs_item_key(path->nodes[0], &found_key, 0); 4437 ret = comp_keys(&found_key, &key); 4438 /* 4439 * We might have had an item with the previous key in the tree right 4440 * before we released our path. And after we released our path, that 4441 * item might have been pushed to the first slot (0) of the leaf we 4442 * were holding due to a tree balance. Alternatively, an item with the 4443 * previous key can exist as the only element of a leaf (big fat item). 4444 * Therefore account for these 2 cases, so that our callers (like 4445 * btrfs_previous_item) don't miss an existing item with a key matching 4446 * the previous key we computed above. 4447 */ 4448 if (ret <= 0) 4449 return 0; 4450 return 1; 4451 } 4452 4453 /* 4454 * A helper function to walk down the tree starting at min_key, and looking 4455 * for nodes or leaves that are have a minimum transaction id. 4456 * This is used by the btree defrag code, and tree logging 4457 * 4458 * This does not cow, but it does stuff the starting key it finds back 4459 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4460 * key and get a writable path. 4461 * 4462 * This honors path->lowest_level to prevent descent past a given level 4463 * of the tree. 4464 * 4465 * min_trans indicates the oldest transaction that you are interested 4466 * in walking through. Any nodes or leaves older than min_trans are 4467 * skipped over (without reading them). 4468 * 4469 * returns zero if something useful was found, < 0 on error and 1 if there 4470 * was nothing in the tree that matched the search criteria. 4471 */ 4472 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4473 struct btrfs_path *path, 4474 u64 min_trans) 4475 { 4476 struct extent_buffer *cur; 4477 struct btrfs_key found_key; 4478 int slot; 4479 int sret; 4480 u32 nritems; 4481 int level; 4482 int ret = 1; 4483 int keep_locks = path->keep_locks; 4484 4485 ASSERT(!path->nowait); 4486 path->keep_locks = 1; 4487 again: 4488 cur = btrfs_read_lock_root_node(root); 4489 level = btrfs_header_level(cur); 4490 WARN_ON(path->nodes[level]); 4491 path->nodes[level] = cur; 4492 path->locks[level] = BTRFS_READ_LOCK; 4493 4494 if (btrfs_header_generation(cur) < min_trans) { 4495 ret = 1; 4496 goto out; 4497 } 4498 while (1) { 4499 nritems = btrfs_header_nritems(cur); 4500 level = btrfs_header_level(cur); 4501 sret = btrfs_bin_search(cur, min_key, &slot); 4502 if (sret < 0) { 4503 ret = sret; 4504 goto out; 4505 } 4506 4507 /* at the lowest level, we're done, setup the path and exit */ 4508 if (level == path->lowest_level) { 4509 if (slot >= nritems) 4510 goto find_next_key; 4511 ret = 0; 4512 path->slots[level] = slot; 4513 btrfs_item_key_to_cpu(cur, &found_key, slot); 4514 goto out; 4515 } 4516 if (sret && slot > 0) 4517 slot--; 4518 /* 4519 * check this node pointer against the min_trans parameters. 4520 * If it is too old, skip to the next one. 4521 */ 4522 while (slot < nritems) { 4523 u64 gen; 4524 4525 gen = btrfs_node_ptr_generation(cur, slot); 4526 if (gen < min_trans) { 4527 slot++; 4528 continue; 4529 } 4530 break; 4531 } 4532 find_next_key: 4533 /* 4534 * we didn't find a candidate key in this node, walk forward 4535 * and find another one 4536 */ 4537 if (slot >= nritems) { 4538 path->slots[level] = slot; 4539 sret = btrfs_find_next_key(root, path, min_key, level, 4540 min_trans); 4541 if (sret == 0) { 4542 btrfs_release_path(path); 4543 goto again; 4544 } else { 4545 goto out; 4546 } 4547 } 4548 /* save our key for returning back */ 4549 btrfs_node_key_to_cpu(cur, &found_key, slot); 4550 path->slots[level] = slot; 4551 if (level == path->lowest_level) { 4552 ret = 0; 4553 goto out; 4554 } 4555 cur = btrfs_read_node_slot(cur, slot); 4556 if (IS_ERR(cur)) { 4557 ret = PTR_ERR(cur); 4558 goto out; 4559 } 4560 4561 btrfs_tree_read_lock(cur); 4562 4563 path->locks[level - 1] = BTRFS_READ_LOCK; 4564 path->nodes[level - 1] = cur; 4565 unlock_up(path, level, 1, 0, NULL); 4566 } 4567 out: 4568 path->keep_locks = keep_locks; 4569 if (ret == 0) { 4570 btrfs_unlock_up_safe(path, path->lowest_level + 1); 4571 memcpy(min_key, &found_key, sizeof(found_key)); 4572 } 4573 return ret; 4574 } 4575 4576 /* 4577 * this is similar to btrfs_next_leaf, but does not try to preserve 4578 * and fixup the path. It looks for and returns the next key in the 4579 * tree based on the current path and the min_trans parameters. 4580 * 4581 * 0 is returned if another key is found, < 0 if there are any errors 4582 * and 1 is returned if there are no higher keys in the tree 4583 * 4584 * path->keep_locks should be set to 1 on the search made before 4585 * calling this function. 4586 */ 4587 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4588 struct btrfs_key *key, int level, u64 min_trans) 4589 { 4590 int slot; 4591 struct extent_buffer *c; 4592 4593 WARN_ON(!path->keep_locks && !path->skip_locking); 4594 while (level < BTRFS_MAX_LEVEL) { 4595 if (!path->nodes[level]) 4596 return 1; 4597 4598 slot = path->slots[level] + 1; 4599 c = path->nodes[level]; 4600 next: 4601 if (slot >= btrfs_header_nritems(c)) { 4602 int ret; 4603 int orig_lowest; 4604 struct btrfs_key cur_key; 4605 if (level + 1 >= BTRFS_MAX_LEVEL || 4606 !path->nodes[level + 1]) 4607 return 1; 4608 4609 if (path->locks[level + 1] || path->skip_locking) { 4610 level++; 4611 continue; 4612 } 4613 4614 slot = btrfs_header_nritems(c) - 1; 4615 if (level == 0) 4616 btrfs_item_key_to_cpu(c, &cur_key, slot); 4617 else 4618 btrfs_node_key_to_cpu(c, &cur_key, slot); 4619 4620 orig_lowest = path->lowest_level; 4621 btrfs_release_path(path); 4622 path->lowest_level = level; 4623 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4624 0, 0); 4625 path->lowest_level = orig_lowest; 4626 if (ret < 0) 4627 return ret; 4628 4629 c = path->nodes[level]; 4630 slot = path->slots[level]; 4631 if (ret == 0) 4632 slot++; 4633 goto next; 4634 } 4635 4636 if (level == 0) 4637 btrfs_item_key_to_cpu(c, key, slot); 4638 else { 4639 u64 gen = btrfs_node_ptr_generation(c, slot); 4640 4641 if (gen < min_trans) { 4642 slot++; 4643 goto next; 4644 } 4645 btrfs_node_key_to_cpu(c, key, slot); 4646 } 4647 return 0; 4648 } 4649 return 1; 4650 } 4651 4652 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4653 u64 time_seq) 4654 { 4655 int slot; 4656 int level; 4657 struct extent_buffer *c; 4658 struct extent_buffer *next; 4659 struct btrfs_fs_info *fs_info = root->fs_info; 4660 struct btrfs_key key; 4661 bool need_commit_sem = false; 4662 u32 nritems; 4663 int ret; 4664 int i; 4665 4666 /* 4667 * The nowait semantics are used only for write paths, where we don't 4668 * use the tree mod log and sequence numbers. 4669 */ 4670 if (time_seq) 4671 ASSERT(!path->nowait); 4672 4673 nritems = btrfs_header_nritems(path->nodes[0]); 4674 if (nritems == 0) 4675 return 1; 4676 4677 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4678 again: 4679 level = 1; 4680 next = NULL; 4681 btrfs_release_path(path); 4682 4683 path->keep_locks = 1; 4684 4685 if (time_seq) { 4686 ret = btrfs_search_old_slot(root, &key, path, time_seq); 4687 } else { 4688 if (path->need_commit_sem) { 4689 path->need_commit_sem = 0; 4690 need_commit_sem = true; 4691 if (path->nowait) { 4692 if (!down_read_trylock(&fs_info->commit_root_sem)) { 4693 ret = -EAGAIN; 4694 goto done; 4695 } 4696 } else { 4697 down_read(&fs_info->commit_root_sem); 4698 } 4699 } 4700 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4701 } 4702 path->keep_locks = 0; 4703 4704 if (ret < 0) 4705 goto done; 4706 4707 nritems = btrfs_header_nritems(path->nodes[0]); 4708 /* 4709 * by releasing the path above we dropped all our locks. A balance 4710 * could have added more items next to the key that used to be 4711 * at the very end of the block. So, check again here and 4712 * advance the path if there are now more items available. 4713 */ 4714 if (nritems > 0 && path->slots[0] < nritems - 1) { 4715 if (ret == 0) 4716 path->slots[0]++; 4717 ret = 0; 4718 goto done; 4719 } 4720 /* 4721 * So the above check misses one case: 4722 * - after releasing the path above, someone has removed the item that 4723 * used to be at the very end of the block, and balance between leafs 4724 * gets another one with bigger key.offset to replace it. 4725 * 4726 * This one should be returned as well, or we can get leaf corruption 4727 * later(esp. in __btrfs_drop_extents()). 4728 * 4729 * And a bit more explanation about this check, 4730 * with ret > 0, the key isn't found, the path points to the slot 4731 * where it should be inserted, so the path->slots[0] item must be the 4732 * bigger one. 4733 */ 4734 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { 4735 ret = 0; 4736 goto done; 4737 } 4738 4739 while (level < BTRFS_MAX_LEVEL) { 4740 if (!path->nodes[level]) { 4741 ret = 1; 4742 goto done; 4743 } 4744 4745 slot = path->slots[level] + 1; 4746 c = path->nodes[level]; 4747 if (slot >= btrfs_header_nritems(c)) { 4748 level++; 4749 if (level == BTRFS_MAX_LEVEL) { 4750 ret = 1; 4751 goto done; 4752 } 4753 continue; 4754 } 4755 4756 4757 /* 4758 * Our current level is where we're going to start from, and to 4759 * make sure lockdep doesn't complain we need to drop our locks 4760 * and nodes from 0 to our current level. 4761 */ 4762 for (i = 0; i < level; i++) { 4763 if (path->locks[level]) { 4764 btrfs_tree_read_unlock(path->nodes[i]); 4765 path->locks[i] = 0; 4766 } 4767 free_extent_buffer(path->nodes[i]); 4768 path->nodes[i] = NULL; 4769 } 4770 4771 next = c; 4772 ret = read_block_for_search(root, path, &next, level, 4773 slot, &key); 4774 if (ret == -EAGAIN && !path->nowait) 4775 goto again; 4776 4777 if (ret < 0) { 4778 btrfs_release_path(path); 4779 goto done; 4780 } 4781 4782 if (!path->skip_locking) { 4783 ret = btrfs_try_tree_read_lock(next); 4784 if (!ret && path->nowait) { 4785 ret = -EAGAIN; 4786 goto done; 4787 } 4788 if (!ret && time_seq) { 4789 /* 4790 * If we don't get the lock, we may be racing 4791 * with push_leaf_left, holding that lock while 4792 * itself waiting for the leaf we've currently 4793 * locked. To solve this situation, we give up 4794 * on our lock and cycle. 4795 */ 4796 free_extent_buffer(next); 4797 btrfs_release_path(path); 4798 cond_resched(); 4799 goto again; 4800 } 4801 if (!ret) 4802 btrfs_tree_read_lock(next); 4803 } 4804 break; 4805 } 4806 path->slots[level] = slot; 4807 while (1) { 4808 level--; 4809 path->nodes[level] = next; 4810 path->slots[level] = 0; 4811 if (!path->skip_locking) 4812 path->locks[level] = BTRFS_READ_LOCK; 4813 if (!level) 4814 break; 4815 4816 ret = read_block_for_search(root, path, &next, level, 4817 0, &key); 4818 if (ret == -EAGAIN && !path->nowait) 4819 goto again; 4820 4821 if (ret < 0) { 4822 btrfs_release_path(path); 4823 goto done; 4824 } 4825 4826 if (!path->skip_locking) { 4827 if (path->nowait) { 4828 if (!btrfs_try_tree_read_lock(next)) { 4829 ret = -EAGAIN; 4830 goto done; 4831 } 4832 } else { 4833 btrfs_tree_read_lock(next); 4834 } 4835 } 4836 } 4837 ret = 0; 4838 done: 4839 unlock_up(path, 0, 1, 0, NULL); 4840 if (need_commit_sem) { 4841 int ret2; 4842 4843 path->need_commit_sem = 1; 4844 ret2 = finish_need_commit_sem_search(path); 4845 up_read(&fs_info->commit_root_sem); 4846 if (ret2) 4847 ret = ret2; 4848 } 4849 4850 return ret; 4851 } 4852 4853 /* 4854 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 4855 * searching until it gets past min_objectid or finds an item of 'type' 4856 * 4857 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4858 */ 4859 int btrfs_previous_item(struct btrfs_root *root, 4860 struct btrfs_path *path, u64 min_objectid, 4861 int type) 4862 { 4863 struct btrfs_key found_key; 4864 struct extent_buffer *leaf; 4865 u32 nritems; 4866 int ret; 4867 4868 while (1) { 4869 if (path->slots[0] == 0) { 4870 ret = btrfs_prev_leaf(root, path); 4871 if (ret != 0) 4872 return ret; 4873 } else { 4874 path->slots[0]--; 4875 } 4876 leaf = path->nodes[0]; 4877 nritems = btrfs_header_nritems(leaf); 4878 if (nritems == 0) 4879 return 1; 4880 if (path->slots[0] == nritems) 4881 path->slots[0]--; 4882 4883 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4884 if (found_key.objectid < min_objectid) 4885 break; 4886 if (found_key.type == type) 4887 return 0; 4888 if (found_key.objectid == min_objectid && 4889 found_key.type < type) 4890 break; 4891 } 4892 return 1; 4893 } 4894 4895 /* 4896 * search in extent tree to find a previous Metadata/Data extent item with 4897 * min objecitd. 4898 * 4899 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4900 */ 4901 int btrfs_previous_extent_item(struct btrfs_root *root, 4902 struct btrfs_path *path, u64 min_objectid) 4903 { 4904 struct btrfs_key found_key; 4905 struct extent_buffer *leaf; 4906 u32 nritems; 4907 int ret; 4908 4909 while (1) { 4910 if (path->slots[0] == 0) { 4911 ret = btrfs_prev_leaf(root, path); 4912 if (ret != 0) 4913 return ret; 4914 } else { 4915 path->slots[0]--; 4916 } 4917 leaf = path->nodes[0]; 4918 nritems = btrfs_header_nritems(leaf); 4919 if (nritems == 0) 4920 return 1; 4921 if (path->slots[0] == nritems) 4922 path->slots[0]--; 4923 4924 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4925 if (found_key.objectid < min_objectid) 4926 break; 4927 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 4928 found_key.type == BTRFS_METADATA_ITEM_KEY) 4929 return 0; 4930 if (found_key.objectid == min_objectid && 4931 found_key.type < BTRFS_EXTENT_ITEM_KEY) 4932 break; 4933 } 4934 return 1; 4935 } 4936