// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

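/*
 * Illustrative sketch (added, not part of the original file): the typical
 * lifecycle of a path, as used by callers all over btrfs, looks roughly
 * like this:
 *
 *	struct btrfs_path *path;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...use path->nodes[0] / path->slots[0]...
 *	btrfs_free_path(path);	(also releases locks and buffer refs)
 */
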
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

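/*
 * Illustrative note (added, not in the original): a caller of
 * btrfs_root_node() owns exactly one reference on the returned buffer
 * and must drop it when done, e.g.:
 *
 *	eb = btrfs_root_node(root);
 *	...inspect eb, or lock it and re-check it is still the root...
 *	free_extent_buffer(eb);
 */
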
/*
 * Cow-only roots (not-shareable trees, everything that is not a subvolume
 * or reloc root) just get put onto a simple dirty list.  The transaction
 * walks this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error
 * code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

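/*
 * Worked example (added for illustration): in a shareable tree whose
 * root_item records last_snapshot == 100, a non-root block with header
 * generation 90 may still be referenced by that snapshot, so it is
 * reported as possibly shared; a block with generation 150 was created
 * after the snapshot and, unless it carries the RELOC flag, is not.
 */
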
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

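/*
 * Summary of the above (added for illustration), keyed on the reference
 * count of @buf:
 *
 *	refs > 1:  the old block stays live in some other tree, so the
 *		   new copy needs references added (and the old block
 *		   may need converting to FULL_BACKREF);
 *	refs == 1: the old block dies with this COW, so it is cleaned
 *		   and *last_ref is set for the caller to free it.
 */
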
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is
 * marked dirty and returned locked.  If you modify the block it needs to
 * be marked dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			  "COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid,
		     fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

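/*
 * Example (added for illustration): with a 16KiB nodesize,
 * close_blocks(0, SZ_32K, SZ_16K) returns 1 because the gap between the
 * two blocks is 32K - (0 + 16K) = 16K, below the 32KiB threshold; blocks
 * further apart than that are not worth keeping together during defrag.
 */
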
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

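/*
 * Example (added for illustration): keys compare as (objectid, type,
 * offset) tuples, so for
 *
 *	k1 = { .objectid = 256, .type = 1, .offset = 100 }
 *	k2 = { .objectid = 256, .type = 2, .offset = 0 }
 *
 * btrfs_comp_cpu_keys(&k1, &k2) returns -1: the smaller type decides
 * before the offsets are even looked at.
 */
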
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * Slot may point to the total number of items if the key is bigger than
 * all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key, int *slot)
{
	int low = 0;
	int high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
			  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	if (btrfs_header_level(eb) == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item), key, slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr), key, slot);
}

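/*
 * Example (added for illustration): searching a block whose keys are
 * [10, 20, 30] behaves as follows:
 *
 *	key 20 -> returns 0, *slot = 1 (exact match)
 *	key 25 -> returns 1, *slot = 2 (insertion position)
 *	key 40 -> returns 1, *slot = 3 (== nritems, past the end)
 */
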
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}

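/*
 * Note (added for illustration): reada_for_balance() only queues the two
 * siblings that balance_level() or push_nodes_for_insert() might pull
 * from, i.e. slots (slot - 1) and (slot + 1) in the parent, so at most
 * two speculative reads are issued per balance attempt.
 */
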
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
			      gen, parent_level - 1, &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			atomic_inc(&b->refs);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

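/*
 * Note (added for illustration): whichever branch was taken above, the
 * resulting lock (if any) is recorded in p->locks[level], which is what
 * allows btrfs_release_path() to later drop it; commit-root and
 * skip_locking searches record no lock at all.
 */
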
/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search:
 *              >0  for inserts it's size of item inserted (*)
 *              <0  for deletions
 *               0  for plain searches, not modifying the tree
 *
 *              (*) If size of item inserted doesn't include
 *              sizeof(struct btrfs_item), then p->search_for_extension must
 *              be set.
 * @cow:	boolean indicating whether CoW operations should be performed.
 *		Must always be 1 when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well.
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			     level + 1 < BTRFS_MAX_LEVEL &&
			     p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		/*
		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
		 * we can safely assume the target key will always be in slot 0
		 * on lower levels due to the invariants BTRFS' btree provides,
		 * namely that a btrfs_key_ptr entry always points to the
		 * lowest key in the child node, thus we can skip searching
		 * lower levels
		 */
		if (prev_cmp == 0) {
			slot = 0;
			ret = 0;
		} else {
			ret = btrfs_bin_search(b, key, &slot);
			prev_cmp = ret;
			if (ret < 0)
				goto done;
		}

		if (level == 0) {
			p->slots[level] = slot;
			/*
			 * Item key already exists. In this case, if we are
			 * allowed to insert the item (for example, in dir_item
			 * case, item key collision is allowed), it will be
			 * merged with the original item. Only the item size
			 * grows, no new btrfs item will be added. If
			 * search_for_extension is not set, ins_len already
			 * accounts the size btrfs_item, deduct it here so leaf
			 * space check will be correct.
			 */
			if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
				ASSERT(ins_len >= sizeof(struct btrfs_item));
				ins_len -= sizeof(struct btrfs_item);
			}
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				btrfs_tree_read_lock(b);
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);

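/*
 * Usage sketch (added for illustration, not part of the original file):
 * a plain lookup of a single item typically looks like:
 *
 *	struct btrfs_key key = {
 *		.objectid = ino,
 *		.type = BTRFS_INODE_ITEM_KEY,
 *		.offset = 0,
 *	};
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)		error
 *	else if (ret == 0)	found at path->nodes[0], path->slots[0]
 *	else			not found; slot is the insert position
 */
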
/*
 * Like btrfs_search_slot, this looks for a key in the given tree.  It uses
 * the current state of the tree together with the operations recorded in
 * the tree modification log to search for the key in a previous version of
 * this tree, as denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}

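/*
 * Note (added for illustration): the time_seq values passed here are
 * typically obtained via btrfs_get_tree_mod_seq(), e.g. by the backref
 * walking code, which pins the tree modification log so the rewind
 * performed above stays valid for the duration of the search.
 */
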
2037  * The function returns 0 if any item is found, 1 if none is found (tree
2038  * empty), and < 0 on error.
2039  */
2040 int btrfs_search_slot_for_read(struct btrfs_root *root,
2041 			       const struct btrfs_key *key,
2042 			       struct btrfs_path *p, int find_higher,
2043 			       int return_any)
2044 {
2045 	int ret;
2046 	struct extent_buffer *leaf;
2047
2048 again:
2049 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2050 	if (ret <= 0)
2051 		return ret;
2052 	/*
2053 	 * a return value of 1 means the path is at the position where the
2054 	 * item should be inserted. Normally this is the next bigger item,
2055 	 * but in case the previous item is the last in a leaf, path points
2056 	 * to the first free slot in the previous leaf, i.e. at an invalid
2057 	 * item.
2058 	 */
2059 	leaf = p->nodes[0];
2060
2061 	if (find_higher) {
2062 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2063 			ret = btrfs_next_leaf(root, p);
2064 			if (ret <= 0)
2065 				return ret;
2066 			if (!return_any)
2067 				return 1;
2068 			/*
2069 			 * no higher item found, return the next
2070 			 * lower instead
2071 			 */
2072 			return_any = 0;
2073 			find_higher = 0;
2074 			btrfs_release_path(p);
2075 			goto again;
2076 		}
2077 	} else {
2078 		if (p->slots[0] == 0) {
2079 			ret = btrfs_prev_leaf(root, p);
2080 			if (ret < 0)
2081 				return ret;
2082 			if (!ret) {
2083 				leaf = p->nodes[0];
2084 				if (p->slots[0] == btrfs_header_nritems(leaf))
2085 					p->slots[0]--;
2086 				return 0;
2087 			}
2088 			if (!return_any)
2089 				return 1;
2090 			/*
2091 			 * no lower item found, return the next
2092 			 * higher instead
2093 			 */
2094 			return_any = 0;
2095 			find_higher = 1;
2096 			btrfs_release_path(p);
2097 			goto again;
2098 		} else {
2099 			--p->slots[0];
2100 		}
2101 	}
2102 	return 0;
2103 }
2104
2105 /*
2106  * Execute a search and call btrfs_previous_item to traverse backwards if
2107  * the item was not found.
2108  *
2109  * Return 0 if found, 1 if not found and < 0 on error.
2110  */
2111 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2112 			   struct btrfs_path *path)
2113 {
2114 	int ret;
2115
2116 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2117 	if (ret > 0)
2118 		ret = btrfs_previous_item(root, path, key->objectid, key->type);
2119
2120 	if (ret == 0)
2121 		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2122
2123 	return ret;
2124 }
2125
2126 /*
2127  * adjust the pointers going up the tree, starting at level,
2128  * making sure the right key of each node points to 'key'.
2129  * This is used after shifting pointers to the left, so it stops
2130  * fixing up pointers when a given leaf/node is not in slot 0 of the
2131  * higher levels.
2132  *
2133  */
2134 static void fixup_low_keys(struct btrfs_path *path,
2135 			   struct btrfs_disk_key *key, int level)
2136 {
2137 	int i;
2138 	struct extent_buffer *t;
2139 	int ret;
2140
2141 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2142 		int tslot = path->slots[i];
2143
2144 		if (!path->nodes[i])
2145 			break;
2146 		t = path->nodes[i];
2147 		ret = btrfs_tree_mod_log_insert_key(t, tslot,
2148 				BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2149 		BUG_ON(ret < 0);
2150 		btrfs_set_node_key(t, key, tslot);
2151 		btrfs_mark_buffer_dirty(path->nodes[i]);
2152 		if (tslot != 0)
2153 			break;
2154 	}
2155 }
2156
2157 /*
2158  * update item key.
2159  *
2160  * This function isn't completely safe. It's the caller's responsibility
2161  * to ensure that the new key won't break the key order.
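 *
 * A typical use is updating a file extent item's key offset after trimming
 * the front of the extent (a sketch; the names are illustrative):
 *
 *	new_key.objectid = inode_objectid;
 *	new_key.type = BTRFS_EXTENT_DATA_KEY;
 *	new_key.offset = new_file_offset;
 *	btrfs_set_item_key_safe(fs_info, path, &new_key);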
2162  */
2163 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2164 			     struct btrfs_path *path,
2165 			     const struct btrfs_key *new_key)
2166 {
2167 	struct btrfs_disk_key disk_key;
2168 	struct extent_buffer *eb;
2169 	int slot;
2170
2171 	eb = path->nodes[0];
2172 	slot = path->slots[0];
2173 	if (slot > 0) {
2174 		btrfs_item_key(eb, &disk_key, slot - 1);
2175 		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2176 			btrfs_crit(fs_info,
2177 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2178 				   slot, btrfs_disk_key_objectid(&disk_key),
2179 				   btrfs_disk_key_type(&disk_key),
2180 				   btrfs_disk_key_offset(&disk_key),
2181 				   new_key->objectid, new_key->type,
2182 				   new_key->offset);
2183 			btrfs_print_leaf(eb);
2184 			BUG();
2185 		}
2186 	}
2187 	if (slot < btrfs_header_nritems(eb) - 1) {
2188 		btrfs_item_key(eb, &disk_key, slot + 1);
2189 		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2190 			btrfs_crit(fs_info,
2191 		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2192 				   slot, btrfs_disk_key_objectid(&disk_key),
2193 				   btrfs_disk_key_type(&disk_key),
2194 				   btrfs_disk_key_offset(&disk_key),
2195 				   new_key->objectid, new_key->type,
2196 				   new_key->offset);
2197 			btrfs_print_leaf(eb);
2198 			BUG();
2199 		}
2200 	}
2201
2202 	btrfs_cpu_key_to_disk(&disk_key, new_key);
2203 	btrfs_set_item_key(eb, &disk_key, slot);
2204 	btrfs_mark_buffer_dirty(eb);
2205 	if (slot == 0)
2206 		fixup_low_keys(path, &disk_key, 1);
2207 }
2208
2209 /*
2210  * Check key order of two sibling extent buffers.
2211  *
2212  * Return true if something is wrong.
2213  * Return false if everything is fine.
2214  *
2215  * Tree-checker only works inside one tree block, thus the following
2216  * corruption cannot be detected by tree-checker:
2217  *
2218  * Leaf @left			| Leaf @right
2219  * --------------------------------------------------------------
2220  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2221  *
2222  * Key f6 in leaf @left itself is valid, but not valid when the next
2223  * key in leaf @right is 7.
2224  * This can only be checked at tree block merge time.
2225  * And since the tree checker has ensured the key order in each tree block
2226  * is correct, we only need to check the last key of @left against the first
2227  * key of @right.
2228  */
2229 static bool check_sibling_keys(struct extent_buffer *left,
2230 			       struct extent_buffer *right)
2231 {
2232 	struct btrfs_key left_last;
2233 	struct btrfs_key right_first;
2234 	int level = btrfs_header_level(left);
2235 	int nr_left = btrfs_header_nritems(left);
2236 	int nr_right = btrfs_header_nritems(right);
2237
2238 	/* No key to check in one of the tree blocks */
2239 	if (!nr_left || !nr_right)
2240 		return false;
2241
2242 	if (level) {
2243 		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2244 		btrfs_node_key_to_cpu(right, &right_first, 0);
2245 	} else {
2246 		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2247 		btrfs_item_key_to_cpu(right, &right_first, 0);
2248 	}
2249
2250 	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2251 		btrfs_crit(left->fs_info,
2252 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2253 			   left_last.objectid, left_last.type,
2254 			   left_last.offset, right_first.objectid,
2255 			   right_first.type, right_first.offset);
2256 		return true;
2257 	}
2258 	return false;
2259 }
2260
2261 /*
2262  * try to push data from one node into the next node left in the
2263  * tree.
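 *
 * For example (a sketch that ignores the minimum-fill heuristics in the
 * code below): with dst = [A B] and src = [C D E F], pushing two pointers
 * left yields dst = [A B C D] and src = [E F].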
2264 * 2265 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2266 * error, and > 0 if there was no room in the left hand block. 2267 */ 2268 static int push_node_left(struct btrfs_trans_handle *trans, 2269 struct extent_buffer *dst, 2270 struct extent_buffer *src, int empty) 2271 { 2272 struct btrfs_fs_info *fs_info = trans->fs_info; 2273 int push_items = 0; 2274 int src_nritems; 2275 int dst_nritems; 2276 int ret = 0; 2277 2278 src_nritems = btrfs_header_nritems(src); 2279 dst_nritems = btrfs_header_nritems(dst); 2280 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2281 WARN_ON(btrfs_header_generation(src) != trans->transid); 2282 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2283 2284 if (!empty && src_nritems <= 8) 2285 return 1; 2286 2287 if (push_items <= 0) 2288 return 1; 2289 2290 if (empty) { 2291 push_items = min(src_nritems, push_items); 2292 if (push_items < src_nritems) { 2293 /* leave at least 8 pointers in the node if 2294 * we aren't going to empty it 2295 */ 2296 if (src_nritems - push_items < 8) { 2297 if (push_items <= 8) 2298 return 1; 2299 push_items -= 8; 2300 } 2301 } 2302 } else 2303 push_items = min(src_nritems - 8, push_items); 2304 2305 /* dst is the left eb, src is the middle eb */ 2306 if (check_sibling_keys(dst, src)) { 2307 ret = -EUCLEAN; 2308 btrfs_abort_transaction(trans, ret); 2309 return ret; 2310 } 2311 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2312 if (ret) { 2313 btrfs_abort_transaction(trans, ret); 2314 return ret; 2315 } 2316 copy_extent_buffer(dst, src, 2317 btrfs_node_key_ptr_offset(dst_nritems), 2318 btrfs_node_key_ptr_offset(0), 2319 push_items * sizeof(struct btrfs_key_ptr)); 2320 2321 if (push_items < src_nritems) { 2322 /* 2323 * Don't call btrfs_tree_mod_log_insert_move() here, key removal 2324 * was already fully logged by btrfs_tree_mod_log_eb_copy() above. 2325 */ 2326 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2327 btrfs_node_key_ptr_offset(push_items), 2328 (src_nritems - push_items) * 2329 sizeof(struct btrfs_key_ptr)); 2330 } 2331 btrfs_set_header_nritems(src, src_nritems - push_items); 2332 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2333 btrfs_mark_buffer_dirty(src); 2334 btrfs_mark_buffer_dirty(dst); 2335 2336 return ret; 2337 } 2338 2339 /* 2340 * try to push data from one node into the next node right in the 2341 * tree. 2342 * 2343 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2344 * error, and > 0 if there was no room in the right hand block. 
2345 * 2346 * this will only push up to 1/2 the contents of the left node over 2347 */ 2348 static int balance_node_right(struct btrfs_trans_handle *trans, 2349 struct extent_buffer *dst, 2350 struct extent_buffer *src) 2351 { 2352 struct btrfs_fs_info *fs_info = trans->fs_info; 2353 int push_items = 0; 2354 int max_push; 2355 int src_nritems; 2356 int dst_nritems; 2357 int ret = 0; 2358 2359 WARN_ON(btrfs_header_generation(src) != trans->transid); 2360 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2361 2362 src_nritems = btrfs_header_nritems(src); 2363 dst_nritems = btrfs_header_nritems(dst); 2364 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2365 if (push_items <= 0) 2366 return 1; 2367 2368 if (src_nritems < 4) 2369 return 1; 2370 2371 max_push = src_nritems / 2 + 1; 2372 /* don't try to empty the node */ 2373 if (max_push >= src_nritems) 2374 return 1; 2375 2376 if (max_push < push_items) 2377 push_items = max_push; 2378 2379 /* dst is the right eb, src is the middle eb */ 2380 if (check_sibling_keys(src, dst)) { 2381 ret = -EUCLEAN; 2382 btrfs_abort_transaction(trans, ret); 2383 return ret; 2384 } 2385 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); 2386 BUG_ON(ret < 0); 2387 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2388 btrfs_node_key_ptr_offset(0), 2389 (dst_nritems) * 2390 sizeof(struct btrfs_key_ptr)); 2391 2392 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2393 push_items); 2394 if (ret) { 2395 btrfs_abort_transaction(trans, ret); 2396 return ret; 2397 } 2398 copy_extent_buffer(dst, src, 2399 btrfs_node_key_ptr_offset(0), 2400 btrfs_node_key_ptr_offset(src_nritems - push_items), 2401 push_items * sizeof(struct btrfs_key_ptr)); 2402 2403 btrfs_set_header_nritems(src, src_nritems - push_items); 2404 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2405 2406 btrfs_mark_buffer_dirty(src); 2407 btrfs_mark_buffer_dirty(dst); 2408 2409 return ret; 2410 } 2411 2412 /* 2413 * helper function to insert a new root level in the tree. 2414 * A new node is allocated, and a single item is inserted to 2415 * point to the existing root 2416 * 2417 * returns zero on success or < 0 on failure. 
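 *
 * For example (sketch): if the old root was a full leaf L, the new root
 * becomes a level 1 node with a single key pointer to L; the tree grows
 * one level and L can then be split like any other block.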
2418  */
2419 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2420 				    struct btrfs_root *root,
2421 				    struct btrfs_path *path, int level)
2422 {
2423 	struct btrfs_fs_info *fs_info = root->fs_info;
2424 	u64 lower_gen;
2425 	struct extent_buffer *lower;
2426 	struct extent_buffer *c;
2427 	struct extent_buffer *old;
2428 	struct btrfs_disk_key lower_key;
2429 	int ret;
2430
2431 	BUG_ON(path->nodes[level]);
2432 	BUG_ON(path->nodes[level-1] != root->node);
2433
2434 	lower = path->nodes[level-1];
2435 	if (level == 1)
2436 		btrfs_item_key(lower, &lower_key, 0);
2437 	else
2438 		btrfs_node_key(lower, &lower_key, 0);
2439
2440 	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2441 				   &lower_key, level, root->node->start, 0,
2442 				   BTRFS_NESTING_NEW_ROOT);
2443 	if (IS_ERR(c))
2444 		return PTR_ERR(c);
2445
2446 	root_add_used(root, fs_info->nodesize);
2447
2448 	btrfs_set_header_nritems(c, 1);
2449 	btrfs_set_node_key(c, &lower_key, 0);
2450 	btrfs_set_node_blockptr(c, 0, lower->start);
2451 	lower_gen = btrfs_header_generation(lower);
2452 	WARN_ON(lower_gen != trans->transid);
2453
2454 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
2455
2456 	btrfs_mark_buffer_dirty(c);
2457
2458 	old = root->node;
2459 	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2460 	BUG_ON(ret < 0);
2461 	rcu_assign_pointer(root->node, c);
2462
2463 	/* the super has an extra ref to root->node */
2464 	free_extent_buffer(old);
2465
2466 	add_root_to_dirty_list(root);
2467 	atomic_inc(&c->refs);
2468 	path->nodes[level] = c;
2469 	path->locks[level] = BTRFS_WRITE_LOCK;
2470 	path->slots[level] = 0;
2471 	return 0;
2472 }
2473
2474 /*
2475  * worker function to insert a single pointer in a node.
2476  * the node should have enough room for the pointer already
2477  *
2478  * slot and level indicate where you want the key to go, and
2479  * bytenr is the block the key points to.
2480  */
2481 static void insert_ptr(struct btrfs_trans_handle *trans,
2482 		       struct btrfs_path *path,
2483 		       struct btrfs_disk_key *key, u64 bytenr,
2484 		       int slot, int level)
2485 {
2486 	struct extent_buffer *lower;
2487 	int nritems;
2488 	int ret;
2489
2490 	BUG_ON(!path->nodes[level]);
2491 	btrfs_assert_tree_write_locked(path->nodes[level]);
2492 	lower = path->nodes[level];
2493 	nritems = btrfs_header_nritems(lower);
2494 	BUG_ON(slot > nritems);
2495 	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2496 	if (slot != nritems) {
2497 		if (level) {
2498 			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2499 					slot, nritems - slot);
2500 			BUG_ON(ret < 0);
2501 		}
2502 		memmove_extent_buffer(lower,
2503 			      btrfs_node_key_ptr_offset(slot + 1),
2504 			      btrfs_node_key_ptr_offset(slot),
2505 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
2506 	}
2507 	if (level) {
2508 		ret = btrfs_tree_mod_log_insert_key(lower, slot,
2509 				BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
2510 		BUG_ON(ret < 0);
2511 	}
2512 	btrfs_set_node_key(lower, key, slot);
2513 	btrfs_set_node_blockptr(lower, slot, bytenr);
2514 	WARN_ON(trans->transid == 0);
2515 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2516 	btrfs_set_header_nritems(lower, nritems + 1);
2517 	btrfs_mark_buffer_dirty(lower);
2518 }
2519
2520 /*
2521  * split the node at the specified level in path in two.
2522  * The path is corrected to point to the appropriate node after the split
2523  *
2524  * Before splitting this tries to make some room in the node by pushing
2525  * left and right; if either one works, it returns right away.
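 *
 * For example (sketch): a node holding the pointers [A B C D E F] splits
 * at mid = (6 + 1) / 2 = 3, keeping [A B C] in the original block, moving
 * [D E F] into the new block and inserting key D into the parent to point
 * at the new block.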
2526  *
2527  * returns 0 on success and < 0 on failure
2528  */
2529 static noinline int split_node(struct btrfs_trans_handle *trans,
2530 			       struct btrfs_root *root,
2531 			       struct btrfs_path *path, int level)
2532 {
2533 	struct btrfs_fs_info *fs_info = root->fs_info;
2534 	struct extent_buffer *c;
2535 	struct extent_buffer *split;
2536 	struct btrfs_disk_key disk_key;
2537 	int mid;
2538 	int ret;
2539 	u32 c_nritems;
2540
2541 	c = path->nodes[level];
2542 	WARN_ON(btrfs_header_generation(c) != trans->transid);
2543 	if (c == root->node) {
2544 		/*
2545 		 * trying to split the root, let's make a new one
2546 		 *
2547 		 * tree mod log: we don't log the removal of the old root in
2548 		 * insert_new_root, because that root buffer will be kept as a
2549 		 * normal node. We are going to log removal of half of the
2550 		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2551 		 * holding a tree lock on the buffer, which is why we cannot
2552 		 * race with other tree_mod_log users.
2553 		 */
2554 		ret = insert_new_root(trans, root, path, level + 1);
2555 		if (ret)
2556 			return ret;
2557 	} else {
2558 		ret = push_nodes_for_insert(trans, root, path, level);
2559 		c = path->nodes[level];
2560 		if (!ret && btrfs_header_nritems(c) <
2561 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2562 			return 0;
2563 		if (ret < 0)
2564 			return ret;
2565 	}
2566
2567 	c_nritems = btrfs_header_nritems(c);
2568 	mid = (c_nritems + 1) / 2;
2569 	btrfs_node_key(c, &disk_key, mid);
2570
2571 	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2572 				       &disk_key, level, c->start, 0,
2573 				       BTRFS_NESTING_SPLIT);
2574 	if (IS_ERR(split))
2575 		return PTR_ERR(split);
2576
2577 	root_add_used(root, fs_info->nodesize);
2578 	ASSERT(btrfs_header_level(c) == level);
2579
2580 	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2581 	if (ret) {
2582 		btrfs_abort_transaction(trans, ret);
2583 		return ret;
2584 	}
2585 	copy_extent_buffer(split, c,
2586 			   btrfs_node_key_ptr_offset(0),
2587 			   btrfs_node_key_ptr_offset(mid),
2588 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2589 	btrfs_set_header_nritems(split, c_nritems - mid);
2590 	btrfs_set_header_nritems(c, mid);
2591
2592 	btrfs_mark_buffer_dirty(c);
2593 	btrfs_mark_buffer_dirty(split);
2594
2595 	insert_ptr(trans, path, &disk_key, split->start,
2596 		   path->slots[level + 1] + 1, level + 1);
2597
2598 	if (path->slots[level] >= mid) {
2599 		path->slots[level] -= mid;
2600 		btrfs_tree_unlock(c);
2601 		free_extent_buffer(c);
2602 		path->nodes[level] = split;
2603 		path->slots[level + 1] += 1;
2604 	} else {
2605 		btrfs_tree_unlock(split);
2606 		free_extent_buffer(split);
2607 	}
2608 	return 0;
2609 }
2610
2611 /*
2612  * how many bytes are required to store the items in a leaf. start
2613  * and nr indicate which items in the leaf to check.
This totals up the 2614 * space used both by the item structs and the item data 2615 */ 2616 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 2617 { 2618 struct btrfs_item *start_item; 2619 struct btrfs_item *end_item; 2620 int data_len; 2621 int nritems = btrfs_header_nritems(l); 2622 int end = min(nritems, start + nr) - 1; 2623 2624 if (!nr) 2625 return 0; 2626 start_item = btrfs_item_nr(start); 2627 end_item = btrfs_item_nr(end); 2628 data_len = btrfs_item_offset(l, start_item) + 2629 btrfs_item_size(l, start_item); 2630 data_len = data_len - btrfs_item_offset(l, end_item); 2631 data_len += sizeof(struct btrfs_item) * nr; 2632 WARN_ON(data_len < 0); 2633 return data_len; 2634 } 2635 2636 /* 2637 * The space between the end of the leaf items and 2638 * the start of the leaf data. IOW, how much room 2639 * the leaf has left for both items and data 2640 */ 2641 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) 2642 { 2643 struct btrfs_fs_info *fs_info = leaf->fs_info; 2644 int nritems = btrfs_header_nritems(leaf); 2645 int ret; 2646 2647 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 2648 if (ret < 0) { 2649 btrfs_crit(fs_info, 2650 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 2651 ret, 2652 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 2653 leaf_space_used(leaf, 0, nritems), nritems); 2654 } 2655 return ret; 2656 } 2657 2658 /* 2659 * min slot controls the lowest index we're willing to push to the 2660 * right. We'll push up to and including min_slot, but no lower 2661 */ 2662 static noinline int __push_leaf_right(struct btrfs_path *path, 2663 int data_size, int empty, 2664 struct extent_buffer *right, 2665 int free_space, u32 left_nritems, 2666 u32 min_slot) 2667 { 2668 struct btrfs_fs_info *fs_info = right->fs_info; 2669 struct extent_buffer *left = path->nodes[0]; 2670 struct extent_buffer *upper = path->nodes[1]; 2671 struct btrfs_map_token token; 2672 struct btrfs_disk_key disk_key; 2673 int slot; 2674 u32 i; 2675 int push_space = 0; 2676 int push_items = 0; 2677 struct btrfs_item *item; 2678 u32 nr; 2679 u32 right_nritems; 2680 u32 data_end; 2681 u32 this_item_size; 2682 2683 if (empty) 2684 nr = 0; 2685 else 2686 nr = max_t(u32, 1, min_slot); 2687 2688 if (path->slots[0] >= left_nritems) 2689 push_space += data_size; 2690 2691 slot = path->slots[1]; 2692 i = left_nritems - 1; 2693 while (i >= nr) { 2694 item = btrfs_item_nr(i); 2695 2696 if (!empty && push_items > 0) { 2697 if (path->slots[0] > i) 2698 break; 2699 if (path->slots[0] == i) { 2700 int space = btrfs_leaf_free_space(left); 2701 2702 if (space + push_space * 2 > free_space) 2703 break; 2704 } 2705 } 2706 2707 if (path->slots[0] == i) 2708 push_space += data_size; 2709 2710 this_item_size = btrfs_item_size(left, item); 2711 if (this_item_size + sizeof(*item) + push_space > free_space) 2712 break; 2713 2714 push_items++; 2715 push_space += this_item_size + sizeof(*item); 2716 if (i == 0) 2717 break; 2718 i--; 2719 } 2720 2721 if (push_items == 0) 2722 goto out_unlock; 2723 2724 WARN_ON(!empty && push_items == left_nritems); 2725 2726 /* push left to right */ 2727 right_nritems = btrfs_header_nritems(right); 2728 2729 push_space = btrfs_item_end_nr(left, left_nritems - push_items); 2730 push_space -= leaf_data_end(left); 2731 2732 /* make room in the right data area */ 2733 data_end = leaf_data_end(right); 2734 memmove_extent_buffer(right, 2735 BTRFS_LEAF_DATA_OFFSET + data_end - push_space, 2736 BTRFS_LEAF_DATA_OFFSET + data_end, 2737 
BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 2738 2739 /* copy from the left data area */ 2740 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + 2741 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 2742 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), 2743 push_space); 2744 2745 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 2746 btrfs_item_nr_offset(0), 2747 right_nritems * sizeof(struct btrfs_item)); 2748 2749 /* copy the items from left to right */ 2750 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 2751 btrfs_item_nr_offset(left_nritems - push_items), 2752 push_items * sizeof(struct btrfs_item)); 2753 2754 /* update the item pointers */ 2755 btrfs_init_map_token(&token, right); 2756 right_nritems += push_items; 2757 btrfs_set_header_nritems(right, right_nritems); 2758 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 2759 for (i = 0; i < right_nritems; i++) { 2760 item = btrfs_item_nr(i); 2761 push_space -= btrfs_token_item_size(&token, item); 2762 btrfs_set_token_item_offset(&token, item, push_space); 2763 } 2764 2765 left_nritems -= push_items; 2766 btrfs_set_header_nritems(left, left_nritems); 2767 2768 if (left_nritems) 2769 btrfs_mark_buffer_dirty(left); 2770 else 2771 btrfs_clean_tree_block(left); 2772 2773 btrfs_mark_buffer_dirty(right); 2774 2775 btrfs_item_key(right, &disk_key, 0); 2776 btrfs_set_node_key(upper, &disk_key, slot + 1); 2777 btrfs_mark_buffer_dirty(upper); 2778 2779 /* then fixup the leaf pointer in the path */ 2780 if (path->slots[0] >= left_nritems) { 2781 path->slots[0] -= left_nritems; 2782 if (btrfs_header_nritems(path->nodes[0]) == 0) 2783 btrfs_clean_tree_block(path->nodes[0]); 2784 btrfs_tree_unlock(path->nodes[0]); 2785 free_extent_buffer(path->nodes[0]); 2786 path->nodes[0] = right; 2787 path->slots[1] += 1; 2788 } else { 2789 btrfs_tree_unlock(right); 2790 free_extent_buffer(right); 2791 } 2792 return 0; 2793 2794 out_unlock: 2795 btrfs_tree_unlock(right); 2796 free_extent_buffer(right); 2797 return 1; 2798 } 2799 2800 /* 2801 * push some data in the path leaf to the right, trying to free up at 2802 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2803 * 2804 * returns 1 if the push failed because the other node didn't have enough 2805 * room, 0 if everything worked out and < 0 if there were major errors. 2806 * 2807 * this will push starting from min_slot to the end of the leaf. It won't 2808 * push any slot lower than min_slot 2809 */ 2810 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2811 *root, struct btrfs_path *path, 2812 int min_data_size, int data_size, 2813 int empty, u32 min_slot) 2814 { 2815 struct extent_buffer *left = path->nodes[0]; 2816 struct extent_buffer *right; 2817 struct extent_buffer *upper; 2818 int slot; 2819 int free_space; 2820 u32 left_nritems; 2821 int ret; 2822 2823 if (!path->nodes[1]) 2824 return 1; 2825 2826 slot = path->slots[1]; 2827 upper = path->nodes[1]; 2828 if (slot >= btrfs_header_nritems(upper) - 1) 2829 return 1; 2830 2831 btrfs_assert_tree_write_locked(path->nodes[1]); 2832 2833 right = btrfs_read_node_slot(upper, slot + 1); 2834 /* 2835 * slot + 1 is not valid or we fail to read the right node, 2836 * no big deal, just return. 
2837 */ 2838 if (IS_ERR(right)) 2839 return 1; 2840 2841 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 2842 2843 free_space = btrfs_leaf_free_space(right); 2844 if (free_space < data_size) 2845 goto out_unlock; 2846 2847 /* cow and double check */ 2848 ret = btrfs_cow_block(trans, root, right, upper, 2849 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 2850 if (ret) 2851 goto out_unlock; 2852 2853 free_space = btrfs_leaf_free_space(right); 2854 if (free_space < data_size) 2855 goto out_unlock; 2856 2857 left_nritems = btrfs_header_nritems(left); 2858 if (left_nritems == 0) 2859 goto out_unlock; 2860 2861 if (check_sibling_keys(left, right)) { 2862 ret = -EUCLEAN; 2863 btrfs_tree_unlock(right); 2864 free_extent_buffer(right); 2865 return ret; 2866 } 2867 if (path->slots[0] == left_nritems && !empty) { 2868 /* Key greater than all keys in the leaf, right neighbor has 2869 * enough room for it and we're not emptying our leaf to delete 2870 * it, therefore use right neighbor to insert the new item and 2871 * no need to touch/dirty our left leaf. */ 2872 btrfs_tree_unlock(left); 2873 free_extent_buffer(left); 2874 path->nodes[0] = right; 2875 path->slots[0] = 0; 2876 path->slots[1]++; 2877 return 0; 2878 } 2879 2880 return __push_leaf_right(path, min_data_size, empty, 2881 right, free_space, left_nritems, min_slot); 2882 out_unlock: 2883 btrfs_tree_unlock(right); 2884 free_extent_buffer(right); 2885 return 1; 2886 } 2887 2888 /* 2889 * push some data in the path leaf to the left, trying to free up at 2890 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2891 * 2892 * max_slot can put a limit on how far into the leaf we'll push items. The 2893 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 2894 * items 2895 */ 2896 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, 2897 int empty, struct extent_buffer *left, 2898 int free_space, u32 right_nritems, 2899 u32 max_slot) 2900 { 2901 struct btrfs_fs_info *fs_info = left->fs_info; 2902 struct btrfs_disk_key disk_key; 2903 struct extent_buffer *right = path->nodes[0]; 2904 int i; 2905 int push_space = 0; 2906 int push_items = 0; 2907 struct btrfs_item *item; 2908 u32 old_left_nritems; 2909 u32 nr; 2910 int ret = 0; 2911 u32 this_item_size; 2912 u32 old_left_item_size; 2913 struct btrfs_map_token token; 2914 2915 if (empty) 2916 nr = min(right_nritems, max_slot); 2917 else 2918 nr = min(right_nritems - 1, max_slot); 2919 2920 for (i = 0; i < nr; i++) { 2921 item = btrfs_item_nr(i); 2922 2923 if (!empty && push_items > 0) { 2924 if (path->slots[0] < i) 2925 break; 2926 if (path->slots[0] == i) { 2927 int space = btrfs_leaf_free_space(right); 2928 2929 if (space + push_space * 2 > free_space) 2930 break; 2931 } 2932 } 2933 2934 if (path->slots[0] == i) 2935 push_space += data_size; 2936 2937 this_item_size = btrfs_item_size(right, item); 2938 if (this_item_size + sizeof(*item) + push_space > free_space) 2939 break; 2940 2941 push_items++; 2942 push_space += this_item_size + sizeof(*item); 2943 } 2944 2945 if (push_items == 0) { 2946 ret = 1; 2947 goto out; 2948 } 2949 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 2950 2951 /* push data from right to left */ 2952 copy_extent_buffer(left, right, 2953 btrfs_item_nr_offset(btrfs_header_nritems(left)), 2954 btrfs_item_nr_offset(0), 2955 push_items * sizeof(struct btrfs_item)); 2956 2957 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 2958 btrfs_item_offset_nr(right, push_items - 1); 2959 2960 copy_extent_buffer(left, right, 
BTRFS_LEAF_DATA_OFFSET + 2961 leaf_data_end(left) - push_space, 2962 BTRFS_LEAF_DATA_OFFSET + 2963 btrfs_item_offset_nr(right, push_items - 1), 2964 push_space); 2965 old_left_nritems = btrfs_header_nritems(left); 2966 BUG_ON(old_left_nritems <= 0); 2967 2968 btrfs_init_map_token(&token, left); 2969 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); 2970 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 2971 u32 ioff; 2972 2973 item = btrfs_item_nr(i); 2974 2975 ioff = btrfs_token_item_offset(&token, item); 2976 btrfs_set_token_item_offset(&token, item, 2977 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 2978 } 2979 btrfs_set_header_nritems(left, old_left_nritems + push_items); 2980 2981 /* fixup right node */ 2982 if (push_items > right_nritems) 2983 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 2984 right_nritems); 2985 2986 if (push_items < right_nritems) { 2987 push_space = btrfs_item_offset_nr(right, push_items - 1) - 2988 leaf_data_end(right); 2989 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + 2990 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 2991 BTRFS_LEAF_DATA_OFFSET + 2992 leaf_data_end(right), push_space); 2993 2994 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 2995 btrfs_item_nr_offset(push_items), 2996 (btrfs_header_nritems(right) - push_items) * 2997 sizeof(struct btrfs_item)); 2998 } 2999 3000 btrfs_init_map_token(&token, right); 3001 right_nritems -= push_items; 3002 btrfs_set_header_nritems(right, right_nritems); 3003 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3004 for (i = 0; i < right_nritems; i++) { 3005 item = btrfs_item_nr(i); 3006 3007 push_space = push_space - btrfs_token_item_size(&token, item); 3008 btrfs_set_token_item_offset(&token, item, push_space); 3009 } 3010 3011 btrfs_mark_buffer_dirty(left); 3012 if (right_nritems) 3013 btrfs_mark_buffer_dirty(right); 3014 else 3015 btrfs_clean_tree_block(right); 3016 3017 btrfs_item_key(right, &disk_key, 0); 3018 fixup_low_keys(path, &disk_key, 1); 3019 3020 /* then fixup the leaf pointer in the path */ 3021 if (path->slots[0] < push_items) { 3022 path->slots[0] += old_left_nritems; 3023 btrfs_tree_unlock(path->nodes[0]); 3024 free_extent_buffer(path->nodes[0]); 3025 path->nodes[0] = left; 3026 path->slots[1] -= 1; 3027 } else { 3028 btrfs_tree_unlock(left); 3029 free_extent_buffer(left); 3030 path->slots[0] -= push_items; 3031 } 3032 BUG_ON(path->slots[0] < 0); 3033 return ret; 3034 out: 3035 btrfs_tree_unlock(left); 3036 free_extent_buffer(left); 3037 return ret; 3038 } 3039 3040 /* 3041 * push some data in the path leaf to the left, trying to free up at 3042 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3043 * 3044 * max_slot can put a limit on how far into the leaf we'll push items. The 3045 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us push all the 3046 * items 3047 */ 3048 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3049 *root, struct btrfs_path *path, int min_data_size, 3050 int data_size, int empty, u32 max_slot) 3051 { 3052 struct extent_buffer *right = path->nodes[0]; 3053 struct extent_buffer *left; 3054 int slot; 3055 int free_space; 3056 u32 right_nritems; 3057 int ret = 0; 3058 3059 slot = path->slots[1]; 3060 if (slot == 0) 3061 return 1; 3062 if (!path->nodes[1]) 3063 return 1; 3064 3065 right_nritems = btrfs_header_nritems(right); 3066 if (right_nritems == 0) 3067 return 1; 3068 3069 btrfs_assert_tree_write_locked(path->nodes[1]); 3070 3071 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3072 /* 3073 * slot - 1 is not valid or we fail to read the left node, 3074 * no big deal, just return. 3075 */ 3076 if (IS_ERR(left)) 3077 return 1; 3078 3079 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3080 3081 free_space = btrfs_leaf_free_space(left); 3082 if (free_space < data_size) { 3083 ret = 1; 3084 goto out; 3085 } 3086 3087 /* cow and double check */ 3088 ret = btrfs_cow_block(trans, root, left, 3089 path->nodes[1], slot - 1, &left, 3090 BTRFS_NESTING_LEFT_COW); 3091 if (ret) { 3092 /* we hit -ENOSPC, but it isn't fatal here */ 3093 if (ret == -ENOSPC) 3094 ret = 1; 3095 goto out; 3096 } 3097 3098 free_space = btrfs_leaf_free_space(left); 3099 if (free_space < data_size) { 3100 ret = 1; 3101 goto out; 3102 } 3103 3104 if (check_sibling_keys(left, right)) { 3105 ret = -EUCLEAN; 3106 goto out; 3107 } 3108 return __push_leaf_left(path, min_data_size, 3109 empty, left, free_space, right_nritems, 3110 max_slot); 3111 out: 3112 btrfs_tree_unlock(left); 3113 free_extent_buffer(left); 3114 return ret; 3115 } 3116 3117 /* 3118 * split the path's leaf in two, making sure there is at least data_size 3119 * available for the resulting leaf level of the path. 
3120 */ 3121 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3122 struct btrfs_path *path, 3123 struct extent_buffer *l, 3124 struct extent_buffer *right, 3125 int slot, int mid, int nritems) 3126 { 3127 struct btrfs_fs_info *fs_info = trans->fs_info; 3128 int data_copy_size; 3129 int rt_data_off; 3130 int i; 3131 struct btrfs_disk_key disk_key; 3132 struct btrfs_map_token token; 3133 3134 nritems = nritems - mid; 3135 btrfs_set_header_nritems(right, nritems); 3136 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); 3137 3138 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3139 btrfs_item_nr_offset(mid), 3140 nritems * sizeof(struct btrfs_item)); 3141 3142 copy_extent_buffer(right, l, 3143 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - 3144 data_copy_size, BTRFS_LEAF_DATA_OFFSET + 3145 leaf_data_end(l), data_copy_size); 3146 3147 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); 3148 3149 btrfs_init_map_token(&token, right); 3150 for (i = 0; i < nritems; i++) { 3151 struct btrfs_item *item = btrfs_item_nr(i); 3152 u32 ioff; 3153 3154 ioff = btrfs_token_item_offset(&token, item); 3155 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); 3156 } 3157 3158 btrfs_set_header_nritems(l, mid); 3159 btrfs_item_key(right, &disk_key, 0); 3160 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3161 3162 btrfs_mark_buffer_dirty(right); 3163 btrfs_mark_buffer_dirty(l); 3164 BUG_ON(path->slots[0] != slot); 3165 3166 if (mid <= slot) { 3167 btrfs_tree_unlock(path->nodes[0]); 3168 free_extent_buffer(path->nodes[0]); 3169 path->nodes[0] = right; 3170 path->slots[0] -= mid; 3171 path->slots[1] += 1; 3172 } else { 3173 btrfs_tree_unlock(right); 3174 free_extent_buffer(right); 3175 } 3176 3177 BUG_ON(path->slots[0] < 0); 3178 } 3179 3180 /* 3181 * double splits happen when we need to insert a big item in the middle 3182 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3183 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3184 * A B C 3185 * 3186 * We avoid this by trying to push the items on either side of our target 3187 * into the adjacent leaves. If all goes well we can avoid the double split 3188 * completely. 3189 */ 3190 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3191 struct btrfs_root *root, 3192 struct btrfs_path *path, 3193 int data_size) 3194 { 3195 int ret; 3196 int progress = 0; 3197 int slot; 3198 u32 nritems; 3199 int space_needed = data_size; 3200 3201 slot = path->slots[0]; 3202 if (slot < btrfs_header_nritems(path->nodes[0])) 3203 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3204 3205 /* 3206 * try to push all the items after our slot into the 3207 * right leaf 3208 */ 3209 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3210 if (ret < 0) 3211 return ret; 3212 3213 if (ret == 0) 3214 progress++; 3215 3216 nritems = btrfs_header_nritems(path->nodes[0]); 3217 /* 3218 * our goal is to get our slot at the start or end of a leaf. 
If 3219 * we've done so we're done 3220 */ 3221 if (path->slots[0] == 0 || path->slots[0] == nritems) 3222 return 0; 3223 3224 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3225 return 0; 3226 3227 /* try to push all the items before our slot into the next leaf */ 3228 slot = path->slots[0]; 3229 space_needed = data_size; 3230 if (slot > 0) 3231 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3232 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3233 if (ret < 0) 3234 return ret; 3235 3236 if (ret == 0) 3237 progress++; 3238 3239 if (progress) 3240 return 0; 3241 return 1; 3242 } 3243 3244 /* 3245 * split the path's leaf in two, making sure there is at least data_size 3246 * available for the resulting leaf level of the path. 3247 * 3248 * returns 0 if all went well and < 0 on failure. 3249 */ 3250 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3251 struct btrfs_root *root, 3252 const struct btrfs_key *ins_key, 3253 struct btrfs_path *path, int data_size, 3254 int extend) 3255 { 3256 struct btrfs_disk_key disk_key; 3257 struct extent_buffer *l; 3258 u32 nritems; 3259 int mid; 3260 int slot; 3261 struct extent_buffer *right; 3262 struct btrfs_fs_info *fs_info = root->fs_info; 3263 int ret = 0; 3264 int wret; 3265 int split; 3266 int num_doubles = 0; 3267 int tried_avoid_double = 0; 3268 3269 l = path->nodes[0]; 3270 slot = path->slots[0]; 3271 if (extend && data_size + btrfs_item_size_nr(l, slot) + 3272 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3273 return -EOVERFLOW; 3274 3275 /* first try to make some room by pushing left and right */ 3276 if (data_size && path->nodes[1]) { 3277 int space_needed = data_size; 3278 3279 if (slot < btrfs_header_nritems(l)) 3280 space_needed -= btrfs_leaf_free_space(l); 3281 3282 wret = push_leaf_right(trans, root, path, space_needed, 3283 space_needed, 0, 0); 3284 if (wret < 0) 3285 return wret; 3286 if (wret) { 3287 space_needed = data_size; 3288 if (slot > 0) 3289 space_needed -= btrfs_leaf_free_space(l); 3290 wret = push_leaf_left(trans, root, path, space_needed, 3291 space_needed, 0, (u32)-1); 3292 if (wret < 0) 3293 return wret; 3294 } 3295 l = path->nodes[0]; 3296 3297 /* did the pushes work? 
 */
3298 		if (btrfs_leaf_free_space(l) >= data_size)
3299 			return 0;
3300 	}
3301
3302 	if (!path->nodes[1]) {
3303 		ret = insert_new_root(trans, root, path, 1);
3304 		if (ret)
3305 			return ret;
3306 	}
3307 again:
3308 	split = 1;
3309 	l = path->nodes[0];
3310 	slot = path->slots[0];
3311 	nritems = btrfs_header_nritems(l);
3312 	mid = (nritems + 1) / 2;
3313
3314 	if (mid <= slot) {
3315 		if (nritems == 1 ||
3316 		    leaf_space_used(l, mid, nritems - mid) + data_size >
3317 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
3318 			if (slot >= nritems) {
3319 				split = 0;
3320 			} else {
3321 				mid = slot;
3322 				if (mid != nritems &&
3323 				    leaf_space_used(l, mid, nritems - mid) +
3324 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3325 					if (data_size && !tried_avoid_double)
3326 						goto push_for_double;
3327 					split = 2;
3328 				}
3329 			}
3330 		}
3331 	} else {
3332 		if (leaf_space_used(l, 0, mid) + data_size >
3333 			BTRFS_LEAF_DATA_SIZE(fs_info)) {
3334 			if (!extend && data_size && slot == 0) {
3335 				split = 0;
3336 			} else if ((extend || !data_size) && slot == 0) {
3337 				mid = 1;
3338 			} else {
3339 				mid = slot;
3340 				if (mid != nritems &&
3341 				    leaf_space_used(l, mid, nritems - mid) +
3342 				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3343 					if (data_size && !tried_avoid_double)
3344 						goto push_for_double;
3345 					split = 2;
3346 				}
3347 			}
3348 		}
3349 	}
3350
3351 	if (split == 0)
3352 		btrfs_cpu_key_to_disk(&disk_key, ins_key);
3353 	else
3354 		btrfs_item_key(l, &disk_key, mid);
3355
3356 	/*
3357 	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3358 	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3359 	 * subclasses, which is 8 at the time of this patch, and we've maxed it
3360 	 * out. In the future we could add a
3361 	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3362 	 * use BTRFS_NESTING_NEW_ROOT.
3363 	 */
3364 	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3365 				       &disk_key, 0, l->start, 0,
3366 				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
3367 				       BTRFS_NESTING_SPLIT);
3368 	if (IS_ERR(right))
3369 		return PTR_ERR(right);
3370
3371 	root_add_used(root, fs_info->nodesize);
3372
3373 	if (split == 0) {
3374 		if (mid <= slot) {
3375 			btrfs_set_header_nritems(right, 0);
3376 			insert_ptr(trans, path, &disk_key,
3377 				   right->start, path->slots[1] + 1, 1);
3378 			btrfs_tree_unlock(path->nodes[0]);
3379 			free_extent_buffer(path->nodes[0]);
3380 			path->nodes[0] = right;
3381 			path->slots[0] = 0;
3382 			path->slots[1] += 1;
3383 		} else {
3384 			btrfs_set_header_nritems(right, 0);
3385 			insert_ptr(trans, path, &disk_key,
3386 				   right->start, path->slots[1], 1);
3387 			btrfs_tree_unlock(path->nodes[0]);
3388 			free_extent_buffer(path->nodes[0]);
3389 			path->nodes[0] = right;
3390 			path->slots[0] = 0;
3391 			if (path->slots[1] == 0)
3392 				fixup_low_keys(path, &disk_key, 1);
3393 		}
3394 		/*
3395 		 * We create a new leaf 'right' for the required ins_len and
3396 		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3397 		 * the new item's content to 'right'.
3398 */ 3399 return ret; 3400 } 3401 3402 copy_for_split(trans, path, l, right, slot, mid, nritems); 3403 3404 if (split == 2) { 3405 BUG_ON(num_doubles != 0); 3406 num_doubles++; 3407 goto again; 3408 } 3409 3410 return 0; 3411 3412 push_for_double: 3413 push_for_double_split(trans, root, path, data_size); 3414 tried_avoid_double = 1; 3415 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3416 return 0; 3417 goto again; 3418 } 3419 3420 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3421 struct btrfs_root *root, 3422 struct btrfs_path *path, int ins_len) 3423 { 3424 struct btrfs_key key; 3425 struct extent_buffer *leaf; 3426 struct btrfs_file_extent_item *fi; 3427 u64 extent_len = 0; 3428 u32 item_size; 3429 int ret; 3430 3431 leaf = path->nodes[0]; 3432 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3433 3434 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3435 key.type != BTRFS_EXTENT_CSUM_KEY); 3436 3437 if (btrfs_leaf_free_space(leaf) >= ins_len) 3438 return 0; 3439 3440 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3441 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3442 fi = btrfs_item_ptr(leaf, path->slots[0], 3443 struct btrfs_file_extent_item); 3444 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3445 } 3446 btrfs_release_path(path); 3447 3448 path->keep_locks = 1; 3449 path->search_for_split = 1; 3450 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3451 path->search_for_split = 0; 3452 if (ret > 0) 3453 ret = -EAGAIN; 3454 if (ret < 0) 3455 goto err; 3456 3457 ret = -EAGAIN; 3458 leaf = path->nodes[0]; 3459 /* if our item isn't there, return now */ 3460 if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3461 goto err; 3462 3463 /* the leaf has changed, it now has room. return now */ 3464 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3465 goto err; 3466 3467 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3468 fi = btrfs_item_ptr(leaf, path->slots[0], 3469 struct btrfs_file_extent_item); 3470 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3471 goto err; 3472 } 3473 3474 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3475 if (ret) 3476 goto err; 3477 3478 path->keep_locks = 0; 3479 btrfs_unlock_up_safe(path, 1); 3480 return 0; 3481 err: 3482 path->keep_locks = 0; 3483 return ret; 3484 } 3485 3486 static noinline int split_item(struct btrfs_path *path, 3487 const struct btrfs_key *new_key, 3488 unsigned long split_offset) 3489 { 3490 struct extent_buffer *leaf; 3491 struct btrfs_item *item; 3492 struct btrfs_item *new_item; 3493 int slot; 3494 char *buf; 3495 u32 nritems; 3496 u32 item_size; 3497 u32 orig_offset; 3498 struct btrfs_disk_key disk_key; 3499 3500 leaf = path->nodes[0]; 3501 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); 3502 3503 item = btrfs_item_nr(path->slots[0]); 3504 orig_offset = btrfs_item_offset(leaf, item); 3505 item_size = btrfs_item_size(leaf, item); 3506 3507 buf = kmalloc(item_size, GFP_NOFS); 3508 if (!buf) 3509 return -ENOMEM; 3510 3511 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3512 path->slots[0]), item_size); 3513 3514 slot = path->slots[0] + 1; 3515 nritems = btrfs_header_nritems(leaf); 3516 if (slot != nritems) { 3517 /* shift the items */ 3518 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 3519 btrfs_item_nr_offset(slot), 3520 (nritems - slot) * sizeof(struct btrfs_item)); 3521 } 3522 3523 btrfs_cpu_key_to_disk(&disk_key, new_key); 3524 btrfs_set_item_key(leaf, &disk_key, slot); 3525 3526 new_item = btrfs_item_nr(slot); 3527 3528 
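	/*
	 * Data layout after the split (summary): the new item at 'slot'
	 * keeps the original data offset and receives the tail
	 * (item_size - split_offset) bytes, while the original item keeps
	 * the first split_offset bytes and its data offset grows by the
	 * size of the tail.
	 */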
	btrfs_set_item_offset(leaf, new_item, orig_offset);
3529 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3530
3531 	btrfs_set_item_offset(leaf, item,
3532 			      orig_offset + item_size - split_offset);
3533 	btrfs_set_item_size(leaf, item, split_offset);
3534
3535 	btrfs_set_header_nritems(leaf, nritems + 1);
3536
3537 	/* write the data for the start of the original item */
3538 	write_extent_buffer(leaf, buf,
3539 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
3540 			    split_offset);
3541
3542 	/* write the data for the new item */
3543 	write_extent_buffer(leaf, buf + split_offset,
3544 			    btrfs_item_ptr_offset(leaf, slot),
3545 			    item_size - split_offset);
3546 	btrfs_mark_buffer_dirty(leaf);
3547
3548 	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3549 	kfree(buf);
3550 	return 0;
3551 }
3552
3553 /*
3554  * This function splits a single item into two items,
3555  * giving 'new_key' to the new item and splitting the
3556  * old one at split_offset (from the start of the item).
3557  *
3558  * The path may be released by this operation. After
3559  * the split, the path is pointing to the old item. The
3560  * new item is going to be in the same node as the old one.
3561  *
3562  * Note, the item being split must be small enough to live alone on
3563  * a tree block with room for one extra struct btrfs_item.
3564  *
3565  * This allows us to split the item in place, keeping a lock on the
3566  * leaf the entire time.
3567  */
3568 int btrfs_split_item(struct btrfs_trans_handle *trans,
3569 		     struct btrfs_root *root,
3570 		     struct btrfs_path *path,
3571 		     const struct btrfs_key *new_key,
3572 		     unsigned long split_offset)
3573 {
3574 	int ret;
3575 	ret = setup_leaf_for_split(trans, root, path,
3576 				   sizeof(struct btrfs_item));
3577 	if (ret)
3578 		return ret;
3579
3580 	ret = split_item(path, new_key, split_offset);
3581 	return ret;
3582 }
3583
3584 /*
3585  * make the item pointed to by the path smaller. new_size indicates
3586  * how small to make it, and from_end tells us if we just chop bytes
3587  * off the end of the item or if we shift the item to chop bytes off
3588  * the front.
3589  */
3590 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3591 {
3592 	int slot;
3593 	struct extent_buffer *leaf;
3594 	struct btrfs_item *item;
3595 	u32 nritems;
3596 	unsigned int data_end;
3597 	unsigned int old_data_start;
3598 	unsigned int old_size;
3599 	unsigned int size_diff;
3600 	int i;
3601 	struct btrfs_map_token token;
3602
3603 	leaf = path->nodes[0];
3604 	slot = path->slots[0];
3605
3606 	old_size = btrfs_item_size_nr(leaf, slot);
3607 	if (old_size == new_size)
3608 		return;
3609
3610 	nritems = btrfs_header_nritems(leaf);
3611 	data_end = leaf_data_end(leaf);
3612
3613 	old_data_start = btrfs_item_offset_nr(leaf, slot);
3614
3615 	size_diff = old_size - new_size;
3616
3617 	BUG_ON(slot < 0);
3618 	BUG_ON(slot >= nritems);
3619
3620 	/*
3621 	 * item0..itemN ... dataN.offset..dataN.size ..
data0.size 3622 */ 3623 /* first correct the data pointers */ 3624 btrfs_init_map_token(&token, leaf); 3625 for (i = slot; i < nritems; i++) { 3626 u32 ioff; 3627 item = btrfs_item_nr(i); 3628 3629 ioff = btrfs_token_item_offset(&token, item); 3630 btrfs_set_token_item_offset(&token, item, ioff + size_diff); 3631 } 3632 3633 /* shift the data */ 3634 if (from_end) { 3635 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3636 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3637 data_end, old_data_start + new_size - data_end); 3638 } else { 3639 struct btrfs_disk_key disk_key; 3640 u64 offset; 3641 3642 btrfs_item_key(leaf, &disk_key, slot); 3643 3644 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 3645 unsigned long ptr; 3646 struct btrfs_file_extent_item *fi; 3647 3648 fi = btrfs_item_ptr(leaf, slot, 3649 struct btrfs_file_extent_item); 3650 fi = (struct btrfs_file_extent_item *)( 3651 (unsigned long)fi - size_diff); 3652 3653 if (btrfs_file_extent_type(leaf, fi) == 3654 BTRFS_FILE_EXTENT_INLINE) { 3655 ptr = btrfs_item_ptr_offset(leaf, slot); 3656 memmove_extent_buffer(leaf, ptr, 3657 (unsigned long)fi, 3658 BTRFS_FILE_EXTENT_INLINE_DATA_START); 3659 } 3660 } 3661 3662 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3663 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3664 data_end, old_data_start - data_end); 3665 3666 offset = btrfs_disk_key_offset(&disk_key); 3667 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 3668 btrfs_set_item_key(leaf, &disk_key, slot); 3669 if (slot == 0) 3670 fixup_low_keys(path, &disk_key, 1); 3671 } 3672 3673 item = btrfs_item_nr(slot); 3674 btrfs_set_item_size(leaf, item, new_size); 3675 btrfs_mark_buffer_dirty(leaf); 3676 3677 if (btrfs_leaf_free_space(leaf) < 0) { 3678 btrfs_print_leaf(leaf); 3679 BUG(); 3680 } 3681 } 3682 3683 /* 3684 * make the item pointed to by the path bigger, data_size is the added size. 3685 */ 3686 void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 3687 { 3688 int slot; 3689 struct extent_buffer *leaf; 3690 struct btrfs_item *item; 3691 u32 nritems; 3692 unsigned int data_end; 3693 unsigned int old_data; 3694 unsigned int old_size; 3695 int i; 3696 struct btrfs_map_token token; 3697 3698 leaf = path->nodes[0]; 3699 3700 nritems = btrfs_header_nritems(leaf); 3701 data_end = leaf_data_end(leaf); 3702 3703 if (btrfs_leaf_free_space(leaf) < data_size) { 3704 btrfs_print_leaf(leaf); 3705 BUG(); 3706 } 3707 slot = path->slots[0]; 3708 old_data = btrfs_item_end_nr(leaf, slot); 3709 3710 BUG_ON(slot < 0); 3711 if (slot >= nritems) { 3712 btrfs_print_leaf(leaf); 3713 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 3714 slot, nritems); 3715 BUG(); 3716 } 3717 3718 /* 3719 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 3720 */ 3721 /* first correct the data pointers */ 3722 btrfs_init_map_token(&token, leaf); 3723 for (i = slot; i < nritems; i++) { 3724 u32 ioff; 3725 item = btrfs_item_nr(i); 3726 3727 ioff = btrfs_token_item_offset(&token, item); 3728 btrfs_set_token_item_offset(&token, item, ioff - data_size); 3729 } 3730 3731 /* shift the data */ 3732 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3733 data_end - data_size, BTRFS_LEAF_DATA_OFFSET + 3734 data_end, old_data - data_end); 3735 3736 data_end = old_data; 3737 old_size = btrfs_item_size_nr(leaf, slot); 3738 item = btrfs_item_nr(slot); 3739 btrfs_set_item_size(leaf, item, old_size + data_size); 3740 btrfs_mark_buffer_dirty(leaf); 3741 3742 if (btrfs_leaf_free_space(leaf) < 0) { 3743 btrfs_print_leaf(leaf); 3744 BUG(); 3745 } 3746 } 3747 3748 /** 3749 * setup_items_for_insert - Helper called before inserting one or more items 3750 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work 3751 * in a function that doesn't call btrfs_search_slot 3752 * 3753 * @root: root we are inserting items to 3754 * @path: points to the leaf/slot where we are going to insert new items 3755 * @batch: information about the batch of items to insert 3756 */ 3757 static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 3758 const struct btrfs_item_batch *batch) 3759 { 3760 struct btrfs_fs_info *fs_info = root->fs_info; 3761 struct btrfs_item *item; 3762 int i; 3763 u32 nritems; 3764 unsigned int data_end; 3765 struct btrfs_disk_key disk_key; 3766 struct extent_buffer *leaf; 3767 int slot; 3768 struct btrfs_map_token token; 3769 u32 total_size; 3770 3771 /* 3772 * Before anything else, update keys in the parent and other ancestors 3773 * if needed, then release the write locks on them, so that other tasks 3774 * can use them while we modify the leaf. 3775 */ 3776 if (path->slots[0] == 0) { 3777 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); 3778 fixup_low_keys(path, &disk_key, 1); 3779 } 3780 btrfs_unlock_up_safe(path, 1); 3781 3782 leaf = path->nodes[0]; 3783 slot = path->slots[0]; 3784 3785 nritems = btrfs_header_nritems(leaf); 3786 data_end = leaf_data_end(leaf); 3787 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 3788 3789 if (btrfs_leaf_free_space(leaf) < total_size) { 3790 btrfs_print_leaf(leaf); 3791 btrfs_crit(fs_info, "not enough freespace need %u have %d", 3792 total_size, btrfs_leaf_free_space(leaf)); 3793 BUG(); 3794 } 3795 3796 btrfs_init_map_token(&token, leaf); 3797 if (slot != nritems) { 3798 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3799 3800 if (old_data < data_end) { 3801 btrfs_print_leaf(leaf); 3802 btrfs_crit(fs_info, 3803 "item at slot %d with data offset %u beyond data end of leaf %u", 3804 slot, old_data, data_end); 3805 BUG(); 3806 } 3807 /* 3808 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 3809 */ 3810 /* first correct the data pointers */ 3811 for (i = slot; i < nritems; i++) { 3812 u32 ioff; 3813 3814 item = btrfs_item_nr(i); 3815 ioff = btrfs_token_item_offset(&token, item); 3816 btrfs_set_token_item_offset(&token, item, 3817 ioff - batch->total_data_size); 3818 } 3819 /* shift the items */ 3820 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), 3821 btrfs_item_nr_offset(slot), 3822 (nritems - slot) * sizeof(struct btrfs_item)); 3823 3824 /* shift the data */ 3825 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3826 data_end - batch->total_data_size, 3827 BTRFS_LEAF_DATA_OFFSET + data_end, 3828 old_data - data_end); 3829 data_end = old_data; 3830 } 3831 3832 /* setup the item for the new data */ 3833 for (i = 0; i < batch->nr; i++) { 3834 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 3835 btrfs_set_item_key(leaf, &disk_key, slot + i); 3836 item = btrfs_item_nr(slot + i); 3837 data_end -= batch->data_sizes[i]; 3838 btrfs_set_token_item_offset(&token, item, data_end); 3839 btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); 3840 } 3841 3842 btrfs_set_header_nritems(leaf, nritems + batch->nr); 3843 btrfs_mark_buffer_dirty(leaf); 3844 3845 if (btrfs_leaf_free_space(leaf) < 0) { 3846 btrfs_print_leaf(leaf); 3847 BUG(); 3848 } 3849 } 3850 3851 /* 3852 * Insert a new item into a leaf. 3853 * 3854 * @root: The root of the btree. 3855 * @path: A path pointing to the target leaf and slot. 3856 * @key: The key of the new item. 3857 * @data_size: The size of the data associated with the new key. 3858 */ 3859 void btrfs_setup_item_for_insert(struct btrfs_root *root, 3860 struct btrfs_path *path, 3861 const struct btrfs_key *key, 3862 u32 data_size) 3863 { 3864 struct btrfs_item_batch batch; 3865 3866 batch.keys = key; 3867 batch.data_sizes = &data_size; 3868 batch.total_data_size = data_size; 3869 batch.nr = 1; 3870 3871 setup_items_for_insert(root, path, &batch); 3872 } 3873 3874 /* 3875 * Given a key and some data, insert items into the tree. 3876 * This does all the path init required, making room in the tree if needed. 3877 */ 3878 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 3879 struct btrfs_root *root, 3880 struct btrfs_path *path, 3881 const struct btrfs_item_batch *batch) 3882 { 3883 int ret = 0; 3884 int slot; 3885 u32 total_size; 3886 3887 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 3888 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 3889 if (ret == 0) 3890 return -EEXIST; 3891 if (ret < 0) 3892 return ret; 3893 3894 slot = path->slots[0]; 3895 BUG_ON(slot < 0); 3896 3897 setup_items_for_insert(root, path, batch); 3898 return 0; 3899 } 3900 3901 /* 3902 * Given a key and some data, insert an item into the tree. 3903 * This does all the path init required, making room in the tree if needed. 
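 *
 * A minimal sketch (the key fields and the 'data' variable here are
 * illustrative assumptions, not taken from real callers):
 *
 *	struct btrfs_key key;
 *
 *	key.objectid = objectid;
 *	key.type = BTRFS_TEMPORARY_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_insert_item(trans, root, &key, &data, sizeof(data));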
3904  */
3905 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3906 		      const struct btrfs_key *cpu_key, void *data,
3907 		      u32 data_size)
3908 {
3909 	int ret = 0;
3910 	struct btrfs_path *path;
3911 	struct extent_buffer *leaf;
3912 	unsigned long ptr;
3913
3914 	path = btrfs_alloc_path();
3915 	if (!path)
3916 		return -ENOMEM;
3917 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3918 	if (!ret) {
3919 		leaf = path->nodes[0];
3920 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3921 		write_extent_buffer(leaf, data, ptr, data_size);
3922 		btrfs_mark_buffer_dirty(leaf);
3923 	}
3924 	btrfs_free_path(path);
3925 	return ret;
3926 }
3927
3928 /*
3929  * This function duplicates an item, giving 'new_key' to the new item.
3930  * It guarantees both items live in the same tree leaf and the new item is
3931  * contiguous with the original item.
3932  *
3933  * This allows us to split a file extent in place, keeping a lock on the leaf
3934  * the entire time.
3935  */
3936 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3937 			 struct btrfs_root *root,
3938 			 struct btrfs_path *path,
3939 			 const struct btrfs_key *new_key)
3940 {
3941 	struct extent_buffer *leaf;
3942 	int ret;
3943 	u32 item_size;
3944
3945 	leaf = path->nodes[0];
3946 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3947 	ret = setup_leaf_for_split(trans, root, path,
3948 				   item_size + sizeof(struct btrfs_item));
3949 	if (ret)
3950 		return ret;
3951
3952 	path->slots[0]++;
3953 	btrfs_setup_item_for_insert(root, path, new_key, item_size);
3954 	leaf = path->nodes[0];
3955 	memcpy_extent_buffer(leaf,
3956 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
3957 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3958 			     item_size);
3959 	return 0;
3960 }
3961
3962 /*
3963  * delete the pointer from a given node.
3964  *
3965  * the tree should have been previously balanced so the deletion does not
3966  * empty a node.
3967  */
3968 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
3969 		    int level, int slot)
3970 {
3971 	struct extent_buffer *parent = path->nodes[level];
3972 	u32 nritems;
3973 	int ret;
3974
3975 	nritems = btrfs_header_nritems(parent);
3976 	if (slot != nritems - 1) {
3977 		if (level) {
3978 			ret = btrfs_tree_mod_log_insert_move(parent, slot,
3979 					slot + 1, nritems - slot - 1);
3980 			BUG_ON(ret < 0);
3981 		}
3982 		memmove_extent_buffer(parent,
3983 			      btrfs_node_key_ptr_offset(slot),
3984 			      btrfs_node_key_ptr_offset(slot + 1),
3985 			      sizeof(struct btrfs_key_ptr) *
3986 			      (nritems - slot - 1));
3987 	} else if (level) {
3988 		ret = btrfs_tree_mod_log_insert_key(parent, slot,
3989 				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
3990 		BUG_ON(ret < 0);
3991 	}
3992
3993 	nritems--;
3994 	btrfs_set_header_nritems(parent, nritems);
3995 	if (nritems == 0 && parent == root->node) {
3996 		BUG_ON(btrfs_header_level(root->node) != 1);
3997 		/* just turn the root into a leaf and break */
3998 		btrfs_set_header_level(root->node, 0);
3999 	} else if (slot == 0) {
4000 		struct btrfs_disk_key disk_key;
4001
4002 		btrfs_node_key(parent, &disk_key, 0);
4003 		fixup_low_keys(path, &disk_key, level + 1);
4004 	}
4005 	btrfs_mark_buffer_dirty(parent);
4006 }
4007
4008 /*
4009  * a helper function to delete the leaf pointed to by path->slots[1] and
4010  * path->nodes[1].
4011  *
4012  * This deletes the pointer in path->nodes[1] and frees the leaf
4013  * block extent.
4014  *
4015  * The path must have already been setup for deleting the leaf, including
4016  * all the proper balancing.

/*
 * Delete the pointer from a given node.
 *
 * The tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}

/*
 * A helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf block
 * extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing, and path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}

/*
 * delete 'nr' items starting at 'slot' from the leaf in the path.  If that
 * empties the leaf, remove it from the tree.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item, ioff + dsize);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);

		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/*
			 * push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to btrfs_del_leaf below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}
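
			/*
			 * The pushes above may have emptied our leaf or moved
			 * the path off of it; the extra reference taken on
			 * 'leaf' before pushing keeps the buffer alive until
			 * the checks below are done with it.
			 */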
			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/*
				 * if we're still in the path, make sure we're
				 * dirty.  Otherwise, one of the push_leaf
				 * functions must have already dirtied this
				 * buffer.
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}

/*
 * Search the tree again to find a leaf with lesser keys.
 * Returns 0 if it found something or 1 if there are no lesser leaves.
 * Returns < 0 on I/O errors.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}
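
/*
 * Example (illustrative sketch; it mirrors the loop in btrfs_previous_item()
 * further below): step backwards one slot, crossing leaf boundaries with
 * btrfs_prev_leaf().  'root' and 'path' come from a prior search.
 *
 *	if (path->slots[0] == 0) {
 *		ret = btrfs_prev_leaf(root, path);
 *		if (ret != 0)
 *			return ret;	// 1: no lesser leaf, < 0: error
 *	} else {
 *		path->slots[0]--;
 *	}
 *	// the slot may still need clamping to nritems - 1 afterwards, as
 *	// btrfs_previous_item() does
 */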

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
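
/*
 * Example (an illustrative sketch of a defrag/log style scan; all variables
 * are hypothetical): visit items whose containing nodes are at least as new
 * as 'min_trans'.  btrfs_search_forward() stuffs the key it finds back into
 * 'min_key', so the caller must advance past it before re-searching.
 *
 *	struct btrfs_key min_key = { 0 };
 *
 *	path->lowest_level = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		// process the item at path->nodes[0], path->slots[0]
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;		// avoid overflow when advancing
 *		min_key.offset++;	// advance past the returned key
 *	}
 */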

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree.
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;

			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
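
/*
 * Example (mirrors the call made from btrfs_search_forward() above; the
 * variables are from that context): after a search done with
 * path->keep_locks = 1, peek at the next usable key at 'level' and restart
 * the search from it.
 *
 *	if (btrfs_find_next_key(root, path, min_key, level, min_trans) == 0) {
 *		btrfs_release_path(path);
 *		goto again;	// re-search starting at the updated min_key
 *	}
 */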

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int i;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between
	 *   leaves gets another one with a bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check:
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[i]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked.  To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking)
			btrfs_tree_read_lock(next);
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);

	return ret;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}
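
/*
 * Example (hypothetical caller): find the closest preceding item of a given
 * type for an objectid.  'root', 'path' and 'objectid' are assumed to come
 * from a prior search that may have landed one slot past the item.
 *
 *	ret = btrfs_previous_item(root, path, objectid, BTRFS_INODE_REF_KEY);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */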
4601 * 4602 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4603 */ 4604 int btrfs_previous_extent_item(struct btrfs_root *root, 4605 struct btrfs_path *path, u64 min_objectid) 4606 { 4607 struct btrfs_key found_key; 4608 struct extent_buffer *leaf; 4609 u32 nritems; 4610 int ret; 4611 4612 while (1) { 4613 if (path->slots[0] == 0) { 4614 ret = btrfs_prev_leaf(root, path); 4615 if (ret != 0) 4616 return ret; 4617 } else { 4618 path->slots[0]--; 4619 } 4620 leaf = path->nodes[0]; 4621 nritems = btrfs_header_nritems(leaf); 4622 if (nritems == 0) 4623 return 1; 4624 if (path->slots[0] == nritems) 4625 path->slots[0]--; 4626 4627 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4628 if (found_key.objectid < min_objectid) 4629 break; 4630 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 4631 found_key.type == BTRFS_METADATA_ITEM_KEY) 4632 return 0; 4633 if (found_key.objectid == min_objectid && 4634 found_key.type < BTRFS_EXTENT_ITEM_KEY) 4635 break; 4636 } 4637 return 1; 4638 } 4639