// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* This also releases the path. */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * Path release drops references on the extent buffers in the path
 * and it drops any locks held by this path.
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
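
/*
 * Typical path lifecycle, as a minimal sketch (a hypothetical caller, with
 * error handling trimmed and 'ino'/'root' assumed): allocate a path, search,
 * read the result, then release both locks and references.
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = ino;		// hypothetical inode number
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0) {
 *		// item found at path->nodes[0] / path->slots[0]
 *	}
 *	btrfs_free_path(path);		// also calls btrfs_release_path()
 *	return ret;
 */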

/*
 * Safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list. The transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
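
/*
 * Minimal usage sketch for btrfs_root_node(): the returned buffer only
 * carries a reference, not a lock, so the caller must drop that reference
 * and must not assume the node is still the root afterwards.
 *
 *	struct extent_buffer *eb;
 *
 *	eb = btrfs_root_node(root);
 *	// eb is safe against being freed, but may no longer be the root
 *	// by the time we look at it
 *	free_extent_buffer(eb);
 */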

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * Check if the tree block can be shared by multiple trees.
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
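
/*
 * A worked example of the sharing test above, with hypothetical numbers:
 * if a subvolume's last snapshot was taken in generation 100, a block
 * written in generation 90 (<= 100) is still reachable from that snapshot
 * and may be shared, while a block COWed in generation 120 (> 100) can
 * only be referenced by the live tree and is known not to be shared,
 * unless it carries the RELOC header flag.
 */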

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * Does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
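
/*
 * Minimal COW-before-modify sketch (a hypothetical caller, error handling
 * trimmed): a block must be COWed in the current transaction before it may
 * be modified, and the caller must hold a write lock on it.
 *
 *	struct extent_buffer *eb = path->nodes[level];
 *
 *	ret = btrfs_cow_block(trans, root, eb, path->nodes[level + 1],
 *			      path->slots[level + 1], &eb, BTRFS_NESTING_COW);
 *	if (ret)
 *		return ret;
 *	// eb now points to a block owned by this transaction; modify it
 *	// and mark it dirty again if changed
 */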

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * COWs a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * Helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by.
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}
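
/*
 * A worked example of close_blocks(), with hypothetical byte numbers and a
 * 16 KiB nodesize: blocknr = 1,000,000 and other = 1,040,000 gives
 * other - (blocknr + 16384) = 23,616 < 32768, so the blocks count as close;
 * with other = 1,100,000 the gap is 83,616 and they do not.
 */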

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys. On little-endian, the disk order is the same as the CPU
 * order and we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * Compare two keys in a memcmp fashion.
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * Same as comp_keys, only with two btrfs_key's.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

/*
 * This is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
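
/*
 * Key ordering sketch: keys compare by (objectid, type, offset), in that
 * order, so for the hypothetical keys below btrfs_comp_cpu_keys() returns
 * -1, because the objectids match and BTRFS_INODE_ITEM_KEY (1) sorts before
 * BTRFS_INODE_REF_KEY (12).
 *
 *	struct btrfs_key a = { .objectid = 256, .type = BTRFS_INODE_ITEM_KEY,
 *			       .offset = 0 };
 *	struct btrfs_key b = { .objectid = 256, .type = BTRFS_INODE_REF_KEY,
 *			       .offset = 256 };
 *
 *	int cmp = btrfs_comp_cpu_keys(&a, &b);	// cmp == -1
 */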

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @low. Use
 * a value of 0 to search over the whole extent buffer.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in
 * the extent buffer, then @slot will point to the slot where the key is,
 * otherwise it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the
 * last key) if the key is bigger than the last key in the extent buffer.
 */
static noinline int generic_bin_search(struct extent_buffer *eb, int low,
				       const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	int high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * Simple binary search on an extent buffer. Works for both leaves and nodes,
 * and always searches over the whole range of keys (slot 0 to slot
 * 'nritems - 1').
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	return generic_bin_search(eb, 0, key, slot);
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}
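
/*
 * Usage sketch for btrfs_bin_search() (hypothetical 'eb' and 'key'): a
 * return of 0 means an exact match at @slot, a return of 1 means @slot is
 * the insertion position, which may equal nritems when the key sorts past
 * the end of the buffer.
 *
 *	int slot;
 *	int ret = btrfs_bin_search(eb, &key, &slot);
 *
 *	if (ret == 0)
 *		// key found at 'slot'
 *	else if (ret == 1 && slot < btrfs_header_nritems(eb))
 *		// key absent; 'slot' holds the first key greater than it
 */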

/*
 * Given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * Node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right.
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
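
/*
 * The sibling handling above follows a common pattern, sketched here
 * (hypothetical caller, error handling trimmed): read a child with a
 * reference only, take the tree lock, COW it, and always pair the lock
 * with an unlock and the reference with a free.
 *
 *	struct extent_buffer *child = btrfs_read_node_slot(parent, slot);
 *
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 *	btrfs_tree_lock(child);
 *	ret = btrfs_cow_block(trans, root, child, parent, slot, &child,
 *			      BTRFS_NESTING_COW);
 *	...
 *	btrfs_tree_unlock(child);
 *	free_extent_buffer(child);
 */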

/* Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full. This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * Readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}
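
/*
 * Callers opt into this readahead by setting path->reada before searching,
 * e.g. (a hypothetical sequential scan, error handling trimmed):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	path->reada = READA_FORWARD;	// or READA_BACK for reverse scans
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 */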

/*
 * When we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in
 * the tree.
 *
 * Callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. So
 * if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot. The goal is to find a block
 * in cache without setting the path to blocking. If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada. -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * Reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read. Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
			      gen, parent_level - 1, &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date. Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}

/*
 * Helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
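
/*
 * Usage sketch for btrfs_find_item() (hypothetical 'ino' and root): find
 * the first item with the given objectid and type at or after offset 0,
 * regardless of its exact offset. A return of 0 means found_key holds the
 * key that matched.
 *
 *	struct btrfs_key found_key;
 *
 *	ret = btrfs_find_item(fs_root, path, ino, 0, BTRFS_INODE_REF_KEY,
 *			      &found_key);
 *	if (ret == 0)
 *		// item is at path->nodes[0] / path->slots[0]
 *	btrfs_release_path(path);
 */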

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}
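
/*
 * A commit root search is opted into via path flags before calling
 * btrfs_search_slot(), e.g. (a minimal sketch; skip_locking is required
 * with search_commit_root, and need_commit_sem additionally requires
 * search_commit_root, as the ASSERTs above and below enforce):
 *
 *	path->search_commit_root = 1;
 *	path->skip_locking = 1;
 *	path->need_commit_sem = 1;	// take commit_root_sem around search
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 */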

static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned
	 * an exact match (prev_cmp == 0), we can safely assume the target key
	 * will always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to. Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return generic_bin_search(eb, search_low_slot, key, slot);
}

static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing
	 * the binary search on the leaf (with search_for_key_slot()), allowing
	 * other tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and
		 * it will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf
		 * is the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is
			 * cheap, taking into account that the first key is
			 * very likely already in a cache line because it
			 * immediately follows the extent buffer's header and
			 * we have recently accessed the header's level field.
			 */
			ret = comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we
				 * want to insert, so we are safe to unlock all
				 * upper nodes and we have to do the binary
				 * search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the latter does not
				 * unlock nodes with a slot of 0 - we can
				 * safely unlock any node even if its slot is
				 * 0 since in this case the key does not end up
				 * at slot 0 of the leaf and there's no need to
				 * split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is greater than or equal to
				 * the key we want to insert, so we can skip
				 * the binary search as the target key will be
				 * at slot 0.
				 *
				 * We cannot unlock upper nodes when the key is
				 * less than the first key, because we will
				 * need to update the key at slot 0 of the
				 * parent node and possibly of other upper
				 * nodes too. If the key matches the first key,
				 * then we can unlock all the upper nodes,
				 * using btrfs_unlock_up_safe() instead of
				 * unlock_up() as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result
				 * of a btrfs_bin_search() call, so there is no
				 * need to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists. In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
		 * item. Only the item size grows, no new btrfs item will be
		 * added. If search_for_extension is not set, ins_len already
		 * accounts for the size of btrfs_item, deduct it here so leaf
		 * space check will be correct.
		 */
		if (ret == 0 && !path->search_for_extension) {
			ASSERT(ins_len >= sizeof(struct btrfs_item));
			ins_len -= sizeof(struct btrfs_item);
		}

		ASSERT(leaf_free_space >= 0);

		if (leaf_free_space < ins_len) {
			int err;

			err = split_leaf(trans, root, key, path, ins_len,
					 (ret == 0));
			ASSERT(err <= 0);
			if (WARN_ON(err > 0))
				err = -EUCLEAN;
			if (err)
				ret = err;
		}
	}

	return ret;
}

/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search:
 *              >0  for inserts it's size of item inserted (*)
 *              <0  for deletions
 *               0  for plain searches, not modifying the tree
 *
 *              (*) If size of item inserted doesn't include
 *              sizeof(struct btrfs_item), then p->search_for_extension must
 *              be set.
 * @cow:	boolean should CoW operations be performed. Must always be 1
 *		when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/*
		 * When we are removing items, we might have to go up to level
		 * two as we update tree pointers. Make sure we keep write
		 * locks for those levels as well.
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

	if (p->need_commit_sem) {
		ASSERT(p->search_commit_root);
		down_read(&fs_info->commit_root_sem);
	}

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;

		/*
		 * We have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		if (level == 0) {
			if (ins_len > 0)
				ASSERT(write_lock_level >= 1);

			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}

		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
		if (ret < 0)
			goto done;
		prev_cmp = ret;

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				btrfs_tree_read_lock(b);
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);

	if (p->need_commit_sem) {
		int ret2;

		ret2 = finish_need_commit_sem_search(p);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
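
/*
 * Usage sketch for btrfs_search_slot() in its three modes (hypothetical
 * caller, error handling trimmed; 'size' is the item's data size):
 *
 *	// plain lookup: no transaction, no COW
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *
 *	// insertion: reserve leaf space for the item and its header,
 *	// COWing as we go
 *	ret = btrfs_search_slot(trans, root, &key, path,
 *				size + sizeof(struct btrfs_item), 1);
 *
 *	// deletion: allow nodes to be merged on the way down
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 */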
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);

	if (p->need_commit_sem) {
		int ret2;

		ret2 = finish_need_commit_sem_search(p);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree,
 * as denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}

/*
 * helper to use instead of btrfs_search_slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is
 * found, return the next higher instead.
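 *
 * Illustrative sketch (not from the original source): a caller that wants
 * the first item at or after @key, falling back to the closest lower item
 * when nothing higher exists, might do:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 *	btrfs_free_path(path);
 *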
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error.
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}

/*
 * Execute search and call btrfs_previous_item to traverse backwards if the
 * item was not found.
 *
 * Return 0 if found, 1 if not found and < 0 if error.
 */
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
			   struct btrfs_path *path)
{
	int ret;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0)
		ret = btrfs_previous_item(root, path, key->objectid, key->type);

	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

	return ret;
}

/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
 */
static void fixup_low_keys(struct btrfs_path *path,
			   struct btrfs_disk_key *key, int level)
{
	int i;
	struct extent_buffer *t;
	int ret;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		int tslot = path->slots[i];

		if (!path->nodes[i])
			break;
		t = path->nodes[i];
		ret = btrfs_tree_mod_log_insert_key(t, tslot,
				BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
		BUG_ON(ret < 0);
		btrfs_set_node_key(t, key, tslot);
		btrfs_mark_buffer_dirty(path->nodes[i]);
		if (tslot != 0)
			break;
	}
}

/*
 * update item key.
 *
 * This function isn't completely safe.
 * It's the caller's responsibility to ensure that the new key doesn't break
 * the key order.
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}

/*
 * Check key order of two sibling extent buffers.
 *
 * Return true if something is wrong.
 * Return false if everything is fine.
 *
 * Tree-checker only works inside one tree block, thus the following
 * corruption can not be detected by tree-checker:
 *
 * Leaf @left			| Leaf @right
 * --------------------------------------------------------------
 * | 1 | 2 | 3 | 4 | 5 | f6 |	| 7 | 8 |
 *
 * Key f6 in leaf @left itself is valid, but not valid when the next
 * key in leaf @right is 7.
 * This can only be checked at tree block merge time.
 * And since tree checker has ensured all key order in each tree block
 * is correct, we only need to compare the last key of @left and the first
 * key of @right.
 */
static bool check_sibling_keys(struct extent_buffer *left,
			       struct extent_buffer *right)
{
	struct btrfs_key left_last;
	struct btrfs_key right_first;
	int level = btrfs_header_level(left);
	int nr_left = btrfs_header_nritems(left);
	int nr_right = btrfs_header_nritems(right);

	/* No key to check in one of the tree blocks */
	if (!nr_left || !nr_right)
		return false;

	if (level) {
		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_node_key_to_cpu(right, &right_first, 0);
	} else {
		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_item_key_to_cpu(right, &right_first, 0);
	}

	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
		btrfs_crit(left->fs_info,
"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
			   left_last.objectid, left_last.type,
			   left_last.offset, right_first.objectid,
			   right_first.type, right_first.offset);
		return true;
	}
	return false;
}

/*
 * try to push data from one node into the next node left in the
 * tree.
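 *
 * Illustrative sketch (not from the original source): pushing moves key
 * pointers from the head of @src onto the tail of @dst and shifts the
 * remainder of @src down:
 *
 *	dst: [ A B C . . ]	src: [ D E F G ]
 *		(push 2)
 *	dst: [ A B C D E ]	src: [ F G . . ]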
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
		 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
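 *
 * For example (illustrative, not from the original source): with 10
 * pointers in the source node, at most 10 / 2 + 1 = 6 can move, and since
 * that cap is always below src_nritems the source is never fully drained.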
 *
 * this will only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* dst is the right eb, src is the middle eb */
	if (check_sibling_keys(src, dst)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
			      btrfs_node_key_ptr_offset(0),
			      (dst_nritems) *
			      sizeof(struct btrfs_key_ptr));

	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
					 push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}

/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
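 *
 * Illustrative sketch (not from the original source), growing a tree of
 * height 1 to height 2:
 *
 *	before:	root -> [ leaf ]
 *	after:	root -> [ node ] -> [ leaf ]
 *
 * where the new node contains a single key/blockptr pair referring to the
 * old root.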
2572 */ 2573 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2574 struct btrfs_root *root, 2575 struct btrfs_path *path, int level) 2576 { 2577 struct btrfs_fs_info *fs_info = root->fs_info; 2578 u64 lower_gen; 2579 struct extent_buffer *lower; 2580 struct extent_buffer *c; 2581 struct extent_buffer *old; 2582 struct btrfs_disk_key lower_key; 2583 int ret; 2584 2585 BUG_ON(path->nodes[level]); 2586 BUG_ON(path->nodes[level-1] != root->node); 2587 2588 lower = path->nodes[level-1]; 2589 if (level == 1) 2590 btrfs_item_key(lower, &lower_key, 0); 2591 else 2592 btrfs_node_key(lower, &lower_key, 0); 2593 2594 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 2595 &lower_key, level, root->node->start, 0, 2596 BTRFS_NESTING_NEW_ROOT); 2597 if (IS_ERR(c)) 2598 return PTR_ERR(c); 2599 2600 root_add_used(root, fs_info->nodesize); 2601 2602 btrfs_set_header_nritems(c, 1); 2603 btrfs_set_node_key(c, &lower_key, 0); 2604 btrfs_set_node_blockptr(c, 0, lower->start); 2605 lower_gen = btrfs_header_generation(lower); 2606 WARN_ON(lower_gen != trans->transid); 2607 2608 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2609 2610 btrfs_mark_buffer_dirty(c); 2611 2612 old = root->node; 2613 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2614 BUG_ON(ret < 0); 2615 rcu_assign_pointer(root->node, c); 2616 2617 /* the super has an extra ref to root->node */ 2618 free_extent_buffer(old); 2619 2620 add_root_to_dirty_list(root); 2621 atomic_inc(&c->refs); 2622 path->nodes[level] = c; 2623 path->locks[level] = BTRFS_WRITE_LOCK; 2624 path->slots[level] = 0; 2625 return 0; 2626 } 2627 2628 /* 2629 * worker function to insert a single pointer in a node. 2630 * the node should have enough room for the pointer already 2631 * 2632 * slot and level indicate where you want the key to go, and 2633 * blocknr is the block the key points to. 2634 */ 2635 static void insert_ptr(struct btrfs_trans_handle *trans, 2636 struct btrfs_path *path, 2637 struct btrfs_disk_key *key, u64 bytenr, 2638 int slot, int level) 2639 { 2640 struct extent_buffer *lower; 2641 int nritems; 2642 int ret; 2643 2644 BUG_ON(!path->nodes[level]); 2645 btrfs_assert_tree_write_locked(path->nodes[level]); 2646 lower = path->nodes[level]; 2647 nritems = btrfs_header_nritems(lower); 2648 BUG_ON(slot > nritems); 2649 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2650 if (slot != nritems) { 2651 if (level) { 2652 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2653 slot, nritems - slot); 2654 BUG_ON(ret < 0); 2655 } 2656 memmove_extent_buffer(lower, 2657 btrfs_node_key_ptr_offset(slot + 1), 2658 btrfs_node_key_ptr_offset(slot), 2659 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2660 } 2661 if (level) { 2662 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2663 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); 2664 BUG_ON(ret < 0); 2665 } 2666 btrfs_set_node_key(lower, key, slot); 2667 btrfs_set_node_blockptr(lower, slot, bytenr); 2668 WARN_ON(trans->transid == 0); 2669 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2670 btrfs_set_header_nritems(lower, nritems + 1); 2671 btrfs_mark_buffer_dirty(lower); 2672 } 2673 2674 /* 2675 * split the node at the specified level in path in two. 2676 * The path is corrected to point to the appropriate node after the split 2677 * 2678 * Before splitting this tries to make some room in the node by pushing 2679 * left and right, if either one works, it returns right away. 
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, let's make a new one
		 *
		 * tree mod log: We don't log the removal of the old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
		 * holding a tree lock on the buffer, which is why we cannot
		 * race with other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return 0;
}

/*
 * how many bytes are required to store the items in a leaf. start
 * and nr indicate which items in the leaf to check. This totals up the
 * space used both by the item structs and the item data
 */
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
	int data_len;
	int nritems = btrfs_header_nritems(l);
	int end = min(nritems, start + nr) - 1;

	if (!nr)
		return 0;
	data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
	data_len = data_len - btrfs_item_offset(l, end);
	data_len += sizeof(struct btrfs_item) * nr;
	WARN_ON(data_len < 0);
	return data_len;
}

/*
 * The space between the end of the leaf items and
 * the start of the leaf data.
 * IOW, how much room the leaf has left for both items and data.
 */
noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	int nritems = btrfs_header_nritems(leaf);
	int ret;

	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
		btrfs_crit(fs_info,
			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
			   ret,
			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
			   leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
}

/*
 * min slot controls the lowest index we're willing to push to the
 * right. We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	while (i >= nr) {
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, i);
		if (this_item_size + sizeof(struct btrfs_item) +
		    push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	push_space = btrfs_item_data_end(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
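	/*
	 * Recompute the data offset of every item in the right leaf: the
	 * pushed items now occupy the first slots and all of the data was
	 * repacked against the end of the leaf.
	 */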
	for (i = 0; i < right_nritems; i++) {
		push_space -= btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes. returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf. It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_write_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
	if (ret)
		goto out_unlock;

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
		return ret;
	}
	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf.
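		 * No items are moved in this case, the path simply switches
		 * over to the right leaf.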
		 */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(path, min_data_size, empty,
				 right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes. returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items. The
 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
 * items
 */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	for (i = 0; i < nr; i++) {
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, i);
		if (this_item_size + sizeof(struct btrfs_item) + push_space >
		    free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		     right_nritems);

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		push_space = push_space - btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clean_tree_block(right);

	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes. returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items. The
 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_write_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left,
			      BTRFS_NESTING_LEFT_COW);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		goto out;
	}
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * Copy the items in leaf 'l' from slot 'mid' onwards into the freshly
 * allocated leaf 'right', and fix up the path so it points at the correct
 * leaf after the split.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);

	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
	}

	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}

/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf. A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves. If all goes well we can avoid the double split
 * completely.
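 *
 * Illustrative sketch (not from the original source): pushing the items
 * after our target into the right neighbor (and/or the ones before it into
 * the left neighbor) can leave the target at the edge of its leaf, where a
 * single ordinary split is enough:
 *
 *	leaf: [ 0 .. slot-1 | target ]	right neighbor: [ slot+1 .. N ]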
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf. If
	 * we've done so we're done
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the next leaf */
	slot = path->slots[0];
	space_needed = data_size;
	if (slot > 0)
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	return 1;
}

/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	if (extend && data_size + btrfs_item_size(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/*
		 * did the pushes work?
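		 * If a neighbor absorbed enough items, the leaf may now have
		 * room for the new item and we can avoid splitting it.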
		 */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	/*
	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
	 * subclasses, which is 8 at the time of this patch, and we've maxed it
	 * out. In the future we could add a
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, 0, l->start, 0,
				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after
		 * copying the new item's content (of size ins_len) into
		 * 'right'.
3537 */ 3538 return ret; 3539 } 3540 3541 copy_for_split(trans, path, l, right, slot, mid, nritems); 3542 3543 if (split == 2) { 3544 BUG_ON(num_doubles != 0); 3545 num_doubles++; 3546 goto again; 3547 } 3548 3549 return 0; 3550 3551 push_for_double: 3552 push_for_double_split(trans, root, path, data_size); 3553 tried_avoid_double = 1; 3554 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3555 return 0; 3556 goto again; 3557 } 3558 3559 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3560 struct btrfs_root *root, 3561 struct btrfs_path *path, int ins_len) 3562 { 3563 struct btrfs_key key; 3564 struct extent_buffer *leaf; 3565 struct btrfs_file_extent_item *fi; 3566 u64 extent_len = 0; 3567 u32 item_size; 3568 int ret; 3569 3570 leaf = path->nodes[0]; 3571 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3572 3573 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3574 key.type != BTRFS_EXTENT_CSUM_KEY); 3575 3576 if (btrfs_leaf_free_space(leaf) >= ins_len) 3577 return 0; 3578 3579 item_size = btrfs_item_size(leaf, path->slots[0]); 3580 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3581 fi = btrfs_item_ptr(leaf, path->slots[0], 3582 struct btrfs_file_extent_item); 3583 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3584 } 3585 btrfs_release_path(path); 3586 3587 path->keep_locks = 1; 3588 path->search_for_split = 1; 3589 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3590 path->search_for_split = 0; 3591 if (ret > 0) 3592 ret = -EAGAIN; 3593 if (ret < 0) 3594 goto err; 3595 3596 ret = -EAGAIN; 3597 leaf = path->nodes[0]; 3598 /* if our item isn't there, return now */ 3599 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3600 goto err; 3601 3602 /* the leaf has changed, it now has room. return now */ 3603 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3604 goto err; 3605 3606 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3607 fi = btrfs_item_ptr(leaf, path->slots[0], 3608 struct btrfs_file_extent_item); 3609 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3610 goto err; 3611 } 3612 3613 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3614 if (ret) 3615 goto err; 3616 3617 path->keep_locks = 0; 3618 btrfs_unlock_up_safe(path, 1); 3619 return 0; 3620 err: 3621 path->keep_locks = 0; 3622 return ret; 3623 } 3624 3625 static noinline int split_item(struct btrfs_path *path, 3626 const struct btrfs_key *new_key, 3627 unsigned long split_offset) 3628 { 3629 struct extent_buffer *leaf; 3630 int orig_slot, slot; 3631 char *buf; 3632 u32 nritems; 3633 u32 item_size; 3634 u32 orig_offset; 3635 struct btrfs_disk_key disk_key; 3636 3637 leaf = path->nodes[0]; 3638 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); 3639 3640 orig_slot = path->slots[0]; 3641 orig_offset = btrfs_item_offset(leaf, path->slots[0]); 3642 item_size = btrfs_item_size(leaf, path->slots[0]); 3643 3644 buf = kmalloc(item_size, GFP_NOFS); 3645 if (!buf) 3646 return -ENOMEM; 3647 3648 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3649 path->slots[0]), item_size); 3650 3651 slot = path->slots[0] + 1; 3652 nritems = btrfs_header_nritems(leaf); 3653 if (slot != nritems) { 3654 /* shift the items */ 3655 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 3656 btrfs_item_nr_offset(slot), 3657 (nritems - slot) * sizeof(struct btrfs_item)); 3658 } 3659 3660 btrfs_cpu_key_to_disk(&disk_key, new_key); 3661 btrfs_set_item_key(leaf, &disk_key, slot); 3662 3663 btrfs_set_item_offset(leaf, slot, orig_offset); 3664 btrfs_set_item_size(leaf, slot, 
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	btrfs_set_item_offset(leaf, orig_slot,
				 orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation. After
 * the split, the path is pointing to the old item. The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(path, new_key, split_offset);
	return ret;
}

/*
 * make the item pointed to by the path smaller. new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	btrfs_set_item_size(leaf, slot, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_data_end(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff - data_size);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/**
 * setup_items_for_insert - Helper called before inserting one or more items
 * to a leaf. Main purpose is to save stack depth by doing the bulk of the
 * work in a function that doesn't call btrfs_search_slot
 *
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @batch:	information about the batch of items to insert
 */
static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
		"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i,
						    ioff - batch->total_data_size);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
				      data_end - batch->total_data_size,
				      BTRFS_LEAF_DATA_OFFSET + data_end,
				      old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < batch->nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		data_end -= batch->data_sizes[i];
		btrfs_set_token_item_offset(&token, slot + i, data_end);
		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + batch->nr);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Insert a new item into a leaf.
 *
 * @root:	The root of the btree.
 * @path:	A path pointing to the target leaf and slot.
 * @key:	The key of the new item.
 * @data_size:	The size of the data associated with the new key.
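 *
 * Illustrative use (not from the original source), assuming @path already
 * points at the destination slot and the leaf has enough free space:
 *
 *	btrfs_setup_item_for_insert(root, path, &key, data_size);
 *	write_extent_buffer(path->nodes[0], data,
 *			    btrfs_item_ptr_offset(path->nodes[0],
 *						  path->slots[0]),
 *			    data_size);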

/*
 * Insert a new item into a leaf.
 *
 * @root:      The root of the btree.
 * @path:      A path pointing to the target leaf and slot.
 * @key:       The key of the new item.
 * @data_size: The size of the data associated with the new key.
 */
void btrfs_setup_item_for_insert(struct btrfs_root *root,
				 struct btrfs_path *path,
				 const struct btrfs_key *key,
				 u32 data_size)
{
	struct btrfs_item_batch batch;

	batch.keys = key;
	batch.data_sizes = &data_size;
	batch.total_data_size = data_size;
	batch.nr = 1;

	setup_items_for_insert(root, path, &batch);
}

/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_item_batch *batch)
{
	int ret = 0;
	int slot;
	u32 total_size;

	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(root, path, batch);
	return 0;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item is
 * contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the leaf
 * the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	btrfs_setup_item_for_insert(root, path, new_key, item_size);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
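
/*
 * Illustrative sketch (not part of the original file): inserting one small
 * item with btrfs_insert_item(), which allocates its own path and copies the
 * caller's buffer into the leaf. The example_insert_blob() name is made up;
 * 'key' must not already exist or -EEXIST is returned.
 */
static int __maybe_unused example_insert_blob(struct btrfs_trans_handle *trans,
					      struct btrfs_root *root,
					      const struct btrfs_key *key,
					      u64 blob)
{
	return btrfs_insert_item(trans, root, key, &blob, sizeof(blob));
}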

/*
 * Delete the pointer at the given slot from the given node.
 *
 * The tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}

/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf block extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing. path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}

/*
 * Delete the item(s) at the leaf level in path. If that empties the leaf,
 * remove it from the tree.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i, ioff + dsize);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty. Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
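
/*
 * Illustrative sketch (not part of the original file): deleting two adjacent
 * items starting at 'key'. The example_delete_pair() name is made up; it
 * assumes the item right after 'key' exists and should go away with it.
 * Passing ins_len == -1 to btrfs_search_slot() signals a deletion so the
 * search can rebalance nodes on the way down.
 */
static int __maybe_unused example_delete_pair(struct btrfs_trans_handle *trans,
					      struct btrfs_root *root,
					      const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret) {
		ret = ret > 0 ? -ENOENT : ret;
		goto out;
	}

	ret = btrfs_del_items(trans, root, path, path->slots[0], 2);
out:
	btrfs_free_path(path);
	return ret;
}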

/*
 * search the tree again to find a leaf with lesser keys
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}
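
/*
 * Illustrative sketch (not part of the original file): stepping back one
 * leaf and landing on its last item, which is roughly what
 * btrfs_previous_item() below does internally. The
 * example_step_to_prev_leaf() name is made up and the sketch assumes the
 * leaf we land on is not empty.
 */
static int __maybe_unused example_step_to_prev_leaf(struct btrfs_root *root,
						    struct btrfs_path *path)
{
	int ret;

	/* 0: moved to a lesser leaf, 1: no lesser leaves, < 0: IO error */
	ret = btrfs_prev_leaf(root, path);
	if (ret == 0)
		path->slots[0] = btrfs_header_nritems(path->nodes[0]) - 1;
	return ret;
}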

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through. Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path. It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;

			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
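
/*
 * Illustrative sketch (not part of the original file): driving
 * btrfs_search_forward() in a loop to visit everything newer than
 * 'min_trans'. The example_walk_newer_than() name is made up; the key
 * advancement mirrors the offset -> type -> objectid cascade used by real
 * callers such as the tree-log code.
 */
static int __maybe_unused example_walk_newer_than(struct btrfs_root *root,
						  u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key = { 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		/* on 0, min_key holds a key inside a new-enough block */
		ret = btrfs_search_forward(root, &min_key, path, min_trans);
		if (ret) {
			if (ret == 1)
				ret = 0;	/* walked off the end */
			break;
		}

		/* ... process path->nodes[0] / path->slots[0] here ... */

		btrfs_release_path(path);

		/* advance past the key we were just handed back */
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
		} else if (min_key.type < (u8)-1) {
			min_key.type++;
			min_key.offset = 0;
		} else if (min_key.objectid < (u64)-1) {
			min_key.objectid++;
			min_key.type = 0;
			min_key.offset = 0;
		} else {
			break;
		}
	}
	btrfs_free_path(path);
	return ret;
}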

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			path->need_commit_sem = 0;
			need_commit_sem = true;
			down_read(&fs_info->commit_root_sem);
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks. A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block. So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between
	 *   leaves gets another one with a bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check:
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[i]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking)
			btrfs_tree_read_lock(next);
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}

/*
 * search in the extent tree to find a previous Metadata/Data extent item
 * with min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			       struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
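
/*
 * Illustrative sketch (not part of the original file): finding the inode
 * item of 'ino' by searching one key past everything that belongs to it and
 * then walking backwards with btrfs_previous_item(). The
 * example_find_inode_item() name is made up. Returns 0 and leaves the path
 * pointing at the item when it is found, 1 when it does not exist.
 */
static int __maybe_unused example_find_inode_item(struct btrfs_root *root,
						  struct btrfs_path *path,
						  u64 ino)
{
	struct btrfs_key key;
	int ret;

	/* the largest possible key for objectid 'ino' */
	key.objectid = ino;
	key.type = (u8)-1;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	return btrfs_previous_item(root, path, ino, BTRFS_INODE_ITEM_KEY);
}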