// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node.  This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}
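/*
 * Illustrative note (not part of the original file): a leaf is laid out as
 *
 *	[ header | item 0 | item 1 | ... -> free space <- ... | data 1 | data 0 ]
 *
 * The item array grows forward from the end of the header while item data
 * grows backward from the end of the block, and the offsets stored in
 * struct btrfs_item are relative to the start of the item area rather than
 * the start of the extent buffer.  That is the adjustment the leaf data
 * helpers here perform.
 */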
/*
 * Copy item data from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @dst:	destination leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/* csum type is validated at mount time */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
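/*
 * Illustrative usage sketch (not part of the original file); error handling
 * and the surrounding transaction logic are elided:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	// ... read items via path->nodes[0] and path->slots[0] ...
 *	btrfs_free_path(path);
 *
 * btrfs_free_path() calls btrfs_release_path() internally, so any locks and
 * extent buffer references still held by the path are dropped before the
 * path structure itself is freed.
 */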
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int errno)
{
	switch (errno) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cow-only roots (not-shareable trees, everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list.  Transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
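/*
 * Note: btrfs_root_node() pairs with the rcu_assign_pointer(root->node, cow)
 * done when the root is COWed (see __btrfs_cow_block() below), which is why
 * the inc_not_zero retry loop is needed.  The dirty list maintained by
 * add_root_to_dirty_list() is consumed at transaction commit time (see
 * commit_cowonly_roots() in transaction.c), where the updated root items are
 * written back.
 */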
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
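/*
 * Illustrative example (not part of the original file): after a snapshot is
 * created, every block whose generation is <= root_item.last_snapshot is
 * reachable from both the source subvolume and the snapshot, so the check
 * above reports it as shareable.  COWing such a block must then fix up
 * backrefs instead of simply freeing the old copy, which is the job of
 * update_ref_for_cow() below.
 */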
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}
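/*
 * Summary note for the COW path below: allocate a new block, copy the
 * contents into it, fix up the copy's header (bytenr, generation, owner),
 * let update_ref_for_cow() settle the backrefs, then point the parent (or
 * the root pointer when COWing the root) at the copy and free the original
 * if the last reference to it was dropped.
 */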
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
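		/*
		 * Note: if this transaction dropped the last reference to
		 * @buf, record its removal in the tree mod log first, so that
		 * tree mod log rewinds never hand out a freed buffer.
		 */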
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);
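	/*
	 * Note: the hint computed above is the start of the 1GiB-aligned
	 * region containing the original block, so COW copies tend to land
	 * close to the blocks they replace.
	 */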
	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
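/*
 * Illustrative example (not part of the original file): keys sort
 * lexicographically by (objectid, type, offset), so for a given inode
 * (objectid 256, say) its INODE_ITEM (type 1) sorts before its DIR_ITEMs
 * (type 84), which in turn sort among themselves by offset.  This ordering
 * keeps all items of one object adjacent within the tree.
 */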
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer. Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/*
 * Given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}
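/*
 * Note: the btrfs_tree_parent_check filled in above records what the parent
 * expects of the child (level, pointer generation, owner and first key), so
 * read_tree_block() can reject stale or misplaced blocks instead of silently
 * following a corrupted pointer.
 */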
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
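/*
 * Note: balance_level() above serves the deletion path and merges or
 * redistributes sparsely populated nodes, while push_nodes_for_insert()
 * below serves the insertion path and only moves pointers around when a
 * node is completely full.  The two are intentionally asymmetric.
 */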
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}
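/*
 * Note: reada_for_balance() warms the cache with the two siblings of the
 * node at @level because node balancing (balance_level() or
 * push_nodes_for_insert()) is likely to read both of them immediately
 * afterwards.
 */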
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}
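/*
 * Note on the -EAGAIN contract above: whenever upper level locks had to be
 * dropped in order to do blocking IO, read_block_for_search() releases the
 * path and returns -EAGAIN so btrfs_search_slot() restarts the descent from
 * the root.  A return of 0 hands the caller the child buffer without having
 * given up any lock it still needed.
 */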
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		u64 iobjectid, u64 ioff, u8 key_type,
		struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
			found_key->objectid != key.objectid)
		return 1;

	return 0;
}

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}
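/*
 * Note: the clone made above is private to the path, so once the commit
 * root semaphore is released no locking is needed to keep reading it.  The
 * worst case is a slightly stale view of the commit root, which the send
 * code (the only user at the moment) tolerates by design.
 */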
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}

static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing the
	 * binary search on the leaf (with search_for_key_slot()), allowing other
	 * tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and it
		 * will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf is
		 * the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is cheap,
			 * taking into account that the first key is very likely
			 * already in a cache line because it immediately follows
			 * the extent buffer's header and we have recently accessed
			 * the header's level field.
			 */
			ret = comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we want
				 * to insert, so we are safe to unlock all upper
				 * nodes and we have to do the binary search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the latter does not unlock
				 * nodes with a slot of 0 - we can safely unlock
				 * any node even if its slot is 0 since in this
				 * case the key does not end up at slot 0 of the
				 * leaf and there's no need to split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is >= the key we want to
				 * insert, so we can skip the binary search as
				 * the target key will be at slot 0.
				 *
				 * We cannot unlock upper nodes when the key is
				 * less than the first key, because we will need
				 * to update the key at slot 0 of the parent node
				 * and possibly of other upper nodes too.
				 * If the key matches the first key, then we can
				 * unlock all the upper nodes, using
				 * btrfs_unlock_up_safe() instead of unlock_up()
				 * as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result of
				 * a btrfs_bin_search() call, so there is no need
				 * to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists. In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
		 * item.
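		 * (Directory items are one such case: entries whose names
		 * hash to the same value share a single key, and the new
		 * entry is appended to the existing item's data.)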
		 * Only the item size grows, no new btrfs item will be added.
		 * If search_for_extension is not set, ins_len already accounts
		 * for the size of a struct btrfs_item, so deduct it here to
		 * keep the leaf space check correct.
		 */
		if (ret == 0 && !path->search_for_extension) {
			ASSERT(ins_len >= sizeof(struct btrfs_item));
			ins_len -= sizeof(struct btrfs_item);
		}

		ASSERT(leaf_free_space >= 0);

		if (leaf_free_space < ins_len) {
			int err;

			err = split_leaf(trans, root, key, path, ins_len,
					 (ret == 0));
			ASSERT(err <= 0);
			if (WARN_ON(err > 0))
				err = -EUCLEAN;
			if (err)
				ret = err;
		}
	}

	return ret;
}

/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search:
 *		>0 for inserts, it's the size of the item inserted (*)
 *		<0 for deletions
 *		 0 for plain searches, not modifying the tree
 *
 *		(*) If the size of the item inserted doesn't include
 *		sizeof(struct btrfs_item), then p->search_for_extension must
 *		be set.
 * @cow:	boolean indicating whether CoW operations should be performed.
 *		Must always be 1 when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	might_sleep();

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	/*
	 * For now only allow nowait for read only operations. There's no
	 * strict reason why we can't, we just only need it for reads so it's
	 * only implemented for reads.
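	 *
	 * A nowait search that cannot take a lock or read a block without
	 * blocking bails out with -EAGAIN instead of sleeping (see the
	 * down_read_trylock() and btrfs_try_tree_read_lock() uses below).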
	 */
	ASSERT(!p->nowait || !cow);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/*
		 * When we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well.
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

	if (p->need_commit_sem) {
		ASSERT(p->search_commit_root);
		if (p->nowait) {
			if (!down_read_trylock(&fs_info->commit_root_sem))
				return -EAGAIN;
		} else {
			down_read(&fs_info->commit_root_sem);
		}
	}

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			     level + 1 < BTRFS_MAX_LEVEL &&
			     p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;

		/*
		 * We have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
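		 *
		 * For example, an insertion that lands in slot 0 of the leaf
		 * changes the leaf's first key, which must be propagated to
		 * the parent's key pointer (see fixup_low_keys()).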
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		if (level == 0) {
			if (ins_len > 0)
				ASSERT(write_lock_level >= 1);

			ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}

		ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
		if (ret < 0)
			goto done;
		prev_cmp = ret;

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);

			btrfs_maybe_reset_lockdep_class(root, b);

			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				if (p->nowait) {
					if (!btrfs_try_tree_read_lock(b)) {
						free_extent_buffer(b);
						ret = -EAGAIN;
						goto done;
					}
				} else {
					btrfs_tree_read_lock(b);
				}
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);

	if (p->need_commit_sem) {
		int ret2;

		ret2 = finish_need_commit_sem_search(p);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
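 *
 * A rough usage sketch (illustrative only, error handling elided; the
 * sequence number comes from the tree mod log API):
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	ret = btrfs_search_old_slot(root, &key, path, seq);
 *	...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);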
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);
	ASSERT(!p->nowait);

	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * We have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, 0, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}

/*
 * Search the tree again to find a leaf with smaller keys.
 * Returns 0 if it found something.
 * Returns 1 if there are no smaller keys.
 * Returns < 0 on error.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_key orig_key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
	orig_key = key;

	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret <= 0)
		return ret;

	/*
	 * Previous key not found.
	 * Even if we were at slot 0 of the leaf we had before releasing the
	 * path and calling btrfs_search_slot(), we now may be in a slot
	 * pointing to the same original key - this can happen if, after we
	 * released the path, one or more items were moved from a sibling
	 * leaf into the front of the leaf we had due to an insertion (see
	 * push_leaf_right()).
	 * If we hit this case and our slot is > 0, just decrement the slot
	 * so that the caller does not process the same key again, which may
	 * or may not break the caller, depending on its logic.
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
		ret = comp_keys(&found_key, &orig_key);
		if (ret == 0) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				return 0;
			}
			/*
			 * At slot 0, same key as before, it means orig_key is
			 * the lowest, leftmost, key in the tree. We're done.
			 */
			return 1;
		}
	}

	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}

/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error.
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * A return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
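	 *
	 * Either way the slot may need to be adjusted before it can be
	 * dereferenced, which is what the find_higher and !find_higher
	 * branches below take care of.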
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}

/*
 * Execute search and call btrfs_previous_item to traverse backwards if the item
 * was not found.
 *
 * Return 0 if found, 1 if not found and < 0 if error.
 */
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
			   struct btrfs_path *path)
{
	int ret;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0)
		ret = btrfs_previous_item(root, path, key->objectid, key->type);

	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

	return ret;
}

/*
 * Search for a valid slot for the given path.
 *
 * @root:	The root node of the tree.
 * @key:	Will contain a valid item if found.
 * @path:	The starting point to validate the slot.
 *
 * Return: 0  if the item is valid
 *         1  if not found
 *         <0 if error.
 */
int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
			      struct btrfs_path *path)
{
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		int ret;

		ret = btrfs_next_leaf(root, path);
		if (ret)
			return ret;
	}

	btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
	return 0;
}

/*
 * Adjust the pointers going up the tree, starting at level, making sure the
 * right key of each node points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels.
 */
static void fixup_low_keys(struct btrfs_path *path,
			   struct btrfs_disk_key *key, int level)
{
	int i;
	struct extent_buffer *t;
	int ret;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		int tslot = path->slots[i];

		if (!path->nodes[i])
			break;
		t = path->nodes[i];
		ret = btrfs_tree_mod_log_insert_key(t, tslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		BUG_ON(ret < 0);
		btrfs_set_node_key(t, key, tslot);
		btrfs_mark_buffer_dirty(path->nodes[i]);
		if (tslot != 0)
			break;
	}
}

/*
 * update item key.
 *
 * This function isn't completely safe.
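 * It only validates the new key against the item's immediate neighbours
 * within the same leaf (see the comp_keys() checks below); ordering
 * relative to other leaves is not verified.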
 * It's the caller's responsibility to ensure that the new key won't break
 * the key order.
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_print_leaf(eb);
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			BUG();
		}
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_print_leaf(eb);
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}

/*
 * Check key order of two sibling extent buffers.
 *
 * Return true if something is wrong.
 * Return false if everything is fine.
 *
 * Tree-checker only works inside one tree block, thus the following
 * corruption cannot be detected by tree-checker:
 *
 * Leaf @left			| Leaf @right
 * --------------------------------------------------------------
 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
 *
 * Key f6 in leaf @left itself is valid, but not valid when the next
 * key in leaf @right is 7.
 * This can only be checked at tree block merge time.
 * And since tree checker has ensured all key order in each tree block
 * is correct, we only need to check the last key of @left and the first
 * key of @right.
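 * (This makes it a cheap O(1) check, done by the push and balance helpers
 * below before any keys are moved between siblings.)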
 */
static bool check_sibling_keys(struct extent_buffer *left,
			       struct extent_buffer *right)
{
	struct btrfs_key left_last;
	struct btrfs_key right_first;
	int level = btrfs_header_level(left);
	int nr_left = btrfs_header_nritems(left);
	int nr_right = btrfs_header_nritems(right);

	/* No key to check in one of the tree blocks */
	if (!nr_left || !nr_right)
		return false;

	if (level) {
		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_node_key_to_cpu(right, &right_first, 0);
	} else {
		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_item_key_to_cpu(right, &right_first, 0);
	}

	if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
		btrfs_crit(left->fs_info, "left extent buffer:");
		btrfs_print_tree(left, false);
		btrfs_crit(left->fs_info, "right extent buffer:");
		btrfs_print_tree(right, false);
		btrfs_crit(left->fs_info,
"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
			   left_last.objectid, left_last.type,
			   left_last.offset, right_first.objectid,
			   right_first.type, right_first.offset);
		return true;
	}
	return false;
}

/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/*
			 * leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst, dst_nritems),
			   btrfs_node_key_ptr_offset(src, 0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * btrfs_tree_mod_log_eb_copy handles logging the move, so we
		 * don't need to do an explicit tree mod log operation for it.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
				      btrfs_node_key_ptr_offset(src, push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* dst is the right eb, src is the middle eb */
	if (check_sibling_keys(src, dst)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	/*
	 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
	 * need to do an explicit tree mod log operation for it.
	 */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
			      btrfs_node_key_ptr_offset(dst, 0),
			      (dst_nritems) *
			      sizeof(struct btrfs_key_ptr));

	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
					 push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst, 0),
			   btrfs_node_key_ptr_offset(src, src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}

/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
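 *
 * On success the tree is one level taller and path->nodes[level] references
 * the new root, write locked and with an extra reference held.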
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	old = root->node;
	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
	if (ret < 0) {
		btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		return ret;
	}
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}

/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static int insert_ptr(struct btrfs_trans_handle *trans,
		      struct btrfs_path *path,
		      struct btrfs_disk_key *key, u64 bytenr,
		      int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_write_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
					slot, nritems - slot);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		memmove_extent_buffer(lower,
				      btrfs_node_key_ptr_offset(lower, slot + 1),
				      btrfs_node_key_ptr_offset(lower, slot),
				      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = btrfs_tree_mod_log_insert_key(lower, slot,
						    BTRFS_MOD_LOG_KEY_ADD);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);

	return 0;
}

/*
 * split the node at the specified level in path in two.
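 * Roughly: key pointers [0, mid) stay in the original node, pointers
 * [mid, nritems) move to a newly allocated right sibling, and a pointer to
 * the new sibling is inserted in the parent.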
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, let's make a new one
		 *
		 * tree mod log: We don't log the removal of the old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
		 * holding a tree lock on the buffer, which is why we cannot
		 * race with other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(split, 0),
			   btrfs_node_key_ptr_offset(c, mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	ret = insert_ptr(trans, path, &disk_key, split->start,
			 path->slots[level + 1] + 1, level + 1);
	if (ret < 0) {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
		return ret;
	}

	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return 0;
}

/*
 * how many bytes are required to store the items in a leaf. start
 * and nr indicate which items in the leaf to check.
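 * (That is, the items in the half-open range [start, start + nr).)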
 * This totals up the
 * space used both by the item structs and the item data.
 */
static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
{
	int data_len;
	int nritems = btrfs_header_nritems(l);
	int end = min(nritems, start + nr) - 1;

	if (!nr)
		return 0;
	data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
	data_len = data_len - btrfs_item_offset(l, end);
	data_len += sizeof(struct btrfs_item) * nr;
	WARN_ON(data_len < 0);
	return data_len;
}

/*
 * The space between the end of the leaf items and
 * the start of the leaf data. IOW, how much room
 * the leaf has left for both items and data
 */
int btrfs_leaf_free_space(const struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	int nritems = btrfs_header_nritems(leaf);
	int ret;

	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
		btrfs_crit(fs_info,
			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
			   ret,
			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
			   leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
}

/*
 * min slot controls the lowest index we're willing to push to the
 * right. We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	while (i >= nr) {
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, i);
		if (this_item_size + sizeof(struct btrfs_item) +
		    push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	push_space = btrfs_item_data_end(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_leaf_data(right, data_end - push_space, data_end,
			  BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		       leaf_data_end(left), push_space);

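	/*
	 * Shift the item headers already in the right leaf to make room at
	 * the front for the headers of the pushed items.
	 */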
	memmove_leaf_items(right, push_items, 0, right_nritems);

	/* copy the items from left to right */
	copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		push_space -= btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clear_buffer_dirty(trans, left);

	btrfs_mark_buffer_dirty(right);

	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clear_buffer_dirty(trans, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes. returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.
 * It won't
 * push any slot lower than min_slot.
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_write_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	if (IS_ERR(right))
		return PTR_ERR(right);

	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
	if (ret)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
		return ret;
	}
	if (path->slots[0] == left_nritems && !empty) {
		/*
		 * Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf.
		 */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(trans, path, min_data_size, empty, right,
				 free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes. returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items. The
 * item at 'max_slot' won't be touched.
 * Use (u32)-1 to make us do all the
 * items.
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	for (i = 0; i < nr; i++) {
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, i);
		if (this_item_size + sizeof(struct btrfs_item) + push_space >
		    free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(struct btrfs_item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);

	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset(right, push_items - 1);

	copy_leaf_data(left, right, leaf_data_end(left) - push_space,
		       btrfs_item_offset(right, push_items - 1), push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		     right_nritems);

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset(right, push_items - 1) -
			     leaf_data_end(right);
		memmove_leaf_data(right,
				  BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				  leaf_data_end(right), push_space);

		memmove_leaf_items(right, 0, push_items,
				   btrfs_header_nritems(right) - push_items);
	}

	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		push_space = push_space - btrfs_token_item_size(&token, i);
		btrfs_set_token_item_offset(&token, i, push_space);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clear_buffer_dirty(trans, right);

	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes. returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items. The
 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_write_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	if (IS_ERR(left))
		return PTR_ERR(left);

	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left,
			      BTRFS_NESTING_LEFT_COW);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	return __push_leaf_left(trans, path, min_data_size, empty, left,
				free_space, right_nritems, max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
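 *
 * This is the worker for split_leaf(): it moves the items from @mid onwards
 * into the new right leaf, rewrites their data offsets, and links the new
 * leaf into the parent with insert_ptr().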
 */
static noinline int copy_for_split(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct extent_buffer *l,
				   struct extent_buffer *right,
				   int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	int ret;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);

	copy_leaf_items(right, l, 0, mid, nritems);

	copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
		       leaf_data_end(l), data_copy_size);

	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);

	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
	}

	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
	if (ret < 0)
		return ret;

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);

	return 0;
}

/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf. A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves. If all goes well we can avoid the double split
 * completely.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf.
	 * If we've done so we're done.
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the next leaf */
	slot = path->slots[0];
	space_needed = data_size;
	if (slot > 0)
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	return 1;
}

/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	if (extend && data_size + btrfs_item_size(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
		    BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
		    BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	/*
	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
	 * subclasses, which is 8 at the time of this patch, and we've maxed it
	 * out. In the future we could add a
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, 0, l->start, 0,
				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			ret = insert_ptr(trans, path, &disk_key,
					 right->start, path->slots[1] + 1, 1);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				return ret;
			}
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			ret = insert_ptr(trans, path, &disk_key,
					 right->start, path->slots[1], 1);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				return ret;
			}
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
	if (ret < 0) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
		return ret;
	}

	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}

static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}

static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	int orig_slot, slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/*
	 * Shouldn't happen because the caller must have previously called
	 * setup_leaf_for_split() to make room for the new item in the leaf.
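	 * (splitting adds exactly one more struct btrfs_item header, so one
	 * header's worth of free space is all that is required here)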
	 */
	if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
		return -ENOSPC;

	orig_slot = path->slots[0];
	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
	item_size = btrfs_item_size(leaf, path->slots[0]);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			   path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	btrfs_set_item_offset(leaf, slot, orig_offset);
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	btrfs_set_item_offset(leaf, orig_slot,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note: the item being split must be small enough to live alone on
 * a tree block, with room for one extra struct btrfs_item.
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;

	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(path, new_key, split_offset);
	return ret;
}

/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
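 *
 * Example (illustrative only): shrink the current item's data to 16 bytes,
 * discarding everything past byte 16:
 *
 *	btrfs_truncate_item(path, 16, 1);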
 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
					(unsigned long)fi,
					BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_leaf_data(leaf, data_end + size_diff, data_end,
				  old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	btrfs_set_item_size(leaf, slot, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_data_end(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
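	 *
	 * (item headers grow from the front of the leaf towards the end,
	 * item data grows from the end towards the front, so extending an
	 * item's data means sliding every later item's data towards the
	 * leaf header)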
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff - data_size);
	}

	/* shift the data */
	memmove_leaf_data(leaf, data_end - data_size, data_end,
			  old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Make space in the leaf before inserting one or more items.
 *
 * @root:  root we are inserting items to
 * @path:  points to the leaf/slot where we are going to insert new items
 * @batch: information about the batch of items to insert
 *
 * Main purpose is to save stack depth by doing the bulk of the work in a
 * function that doesn't call btrfs_search_slot
 */
static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
		"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
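		 *
		 * (the existing data between data_end and old_data is slid
		 * towards the leaf header by total_data_size bytes, and the
		 * affected item offsets are decreased to match, making room
		 * for the new items' data)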
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i,
						    ioff - batch->total_data_size);
		}
		/* shift the items */
		memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);

		/* shift the data */
		memmove_leaf_data(leaf, data_end - batch->total_data_size,
				  data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < batch->nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		data_end -= batch->data_sizes[i];
		btrfs_set_token_item_offset(&token, slot + i, data_end);
		btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + batch->nr);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Insert a new item into a leaf.
 *
 * @root:      The root of the btree.
 * @path:      A path pointing to the target leaf and slot.
 * @key:       The key of the new item.
 * @data_size: The size of the data associated with the new key.
 */
void btrfs_setup_item_for_insert(struct btrfs_root *root,
				 struct btrfs_path *path,
				 const struct btrfs_key *key,
				 u32 data_size)
{
	struct btrfs_item_batch batch;

	batch.keys = key;
	batch.data_sizes = &data_size;
	batch.total_data_size = data_size;
	batch.nr = 1;

	setup_items_for_insert(root, path, &batch);
}

/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_item_batch *batch)
{
	int ret = 0;
	int slot;
	u32 total_size;

	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(root, path, batch);
	return 0;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item is
 * contiguous with the original item.
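 * (the duplicate is created at path->slots[0] + 1 and the path is left
 * pointing at it)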
4446 * 4447 * This allows us to split a file extent in place, keeping a lock on the leaf 4448 * the entire time. 4449 */ 4450 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4451 struct btrfs_root *root, 4452 struct btrfs_path *path, 4453 const struct btrfs_key *new_key) 4454 { 4455 struct extent_buffer *leaf; 4456 int ret; 4457 u32 item_size; 4458 4459 leaf = path->nodes[0]; 4460 item_size = btrfs_item_size(leaf, path->slots[0]); 4461 ret = setup_leaf_for_split(trans, root, path, 4462 item_size + sizeof(struct btrfs_item)); 4463 if (ret) 4464 return ret; 4465 4466 path->slots[0]++; 4467 btrfs_setup_item_for_insert(root, path, new_key, item_size); 4468 leaf = path->nodes[0]; 4469 memcpy_extent_buffer(leaf, 4470 btrfs_item_ptr_offset(leaf, path->slots[0]), 4471 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4472 item_size); 4473 return 0; 4474 } 4475 4476 /* 4477 * delete the pointer from a given node. 4478 * 4479 * the tree should have been previously balanced so the deletion does not 4480 * empty a node. 4481 * 4482 * This is exported for use inside btrfs-progs, don't un-export it. 4483 */ 4484 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4485 struct btrfs_path *path, int level, int slot) 4486 { 4487 struct extent_buffer *parent = path->nodes[level]; 4488 u32 nritems; 4489 int ret; 4490 4491 nritems = btrfs_header_nritems(parent); 4492 if (slot != nritems - 1) { 4493 if (level) { 4494 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4495 slot + 1, nritems - slot - 1); 4496 if (ret < 0) { 4497 btrfs_abort_transaction(trans, ret); 4498 return ret; 4499 } 4500 } 4501 memmove_extent_buffer(parent, 4502 btrfs_node_key_ptr_offset(parent, slot), 4503 btrfs_node_key_ptr_offset(parent, slot + 1), 4504 sizeof(struct btrfs_key_ptr) * 4505 (nritems - slot - 1)); 4506 } else if (level) { 4507 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4508 BTRFS_MOD_LOG_KEY_REMOVE); 4509 if (ret < 0) { 4510 btrfs_abort_transaction(trans, ret); 4511 return ret; 4512 } 4513 } 4514 4515 nritems--; 4516 btrfs_set_header_nritems(parent, nritems); 4517 if (nritems == 0 && parent == root->node) { 4518 BUG_ON(btrfs_header_level(root->node) != 1); 4519 /* just turn the root into a leaf and break */ 4520 btrfs_set_header_level(root->node, 0); 4521 } else if (slot == 0) { 4522 struct btrfs_disk_key disk_key; 4523 4524 btrfs_node_key(parent, &disk_key, 0); 4525 fixup_low_keys(path, &disk_key, level + 1); 4526 } 4527 btrfs_mark_buffer_dirty(parent); 4528 return 0; 4529 } 4530 4531 /* 4532 * a helper function to delete the leaf pointed to by path->slots[1] and 4533 * path->nodes[1]. 4534 * 4535 * This deletes the pointer in path->nodes[1] and frees the leaf 4536 * block extent. zero is returned if it all worked out, < 0 otherwise. 4537 * 4538 * The path must have already been setup for deleting the leaf, including 4539 * all the proper balancing. path->nodes[1] must be locked. 
 */
static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *leaf)
{
	int ret;

	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret < 0)
		return ret;

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
	return 0;
}

/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	int ret = 0;
	int wret;
	u32 nritems;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
		const int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;
		u32 dsize = 0;
		int i;

		for (i = 0; i < nr; i++)
			dsize += btrfs_item_size(leaf, slot + i);

		memmove_leaf_data(leaf, data_end + dsize, data_end,
				  last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i, ioff + dsize);
		}

		memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clear_buffer_dirty(trans, leaf);
			ret = btrfs_del_leaf(trans, root, path, leaf);
			if (ret < 0)
				return ret;
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);

		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/*
		 * Try to delete the leaf if it is mostly empty.  We do this by
		 * trying to move all its items into its left and right neighbours.
		 * If we can't move all the items, then we don't delete it - it's
		 * not ideal, but future insertions might fill the leaf with more
		 * items, or items from other leaves might be moved later into our
		 * leaf due to deletions on those leaves.
		 */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			u32 min_push_space;

			/*
			 * push_leaf_left() fixes the path.  Make sure the path
			 * still points to our leaf for a possible call to
			 * btrfs_del_ptr() below.
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);
			/*
			 * We want to be able to at least push one item to the
			 * left neighbour leaf, and that's the first item.
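			 * Its data plus one struct btrfs_item header is
			 * therefore the minimum amount of space we ask
			 * push_leaf_left() to move.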
			 */
			min_push_space = sizeof(struct btrfs_item) +
				btrfs_item_size(leaf, 0);
			wret = push_leaf_left(trans, root, path, 0,
					      min_push_space, 1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				/*
				 * If we were not able to push all items from our
				 * leaf to its left neighbour, then attempt to
				 * either push all the remaining items to the
				 * right neighbour or none. There's no advantage
				 * in pushing only some items, instead of all, as
				 * it's pointless to end up with a leaf having
				 * too few items while the neighbours can be full
				 * or nearly full.
				 */
				nritems = btrfs_header_nritems(leaf);
				min_push_space = leaf_space_used(leaf, 0, nritems);
				wret = push_leaf_right(trans, root, path, 0,
						       min_push_space, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf);
				if (ret < 0)
					return ret;
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/*
				 * if we're still in the path, make sure we're
				 * dirty.  Otherwise, one of the push_leaf
				 * functions must have already dirtied this
				 * buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
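 *
 * Example (illustrative sketch): visit everything modified in or after
 * transaction @min_trans, with @min_key zeroed before the first call and
 * path->lowest_level left at 0 (key advancement simplified, offset
 * overflow ignored here):
 *
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		process_leaf(path->nodes[0]);	// hypothetical callback
 *		min_key.offset++;
 *		btrfs_release_path(path);
 *	}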
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	ASSERT(!path->nowait);
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, 0, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameter.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
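 * (without keep_locks the upper nodes that this function walks through
 * may already have been unlocked and dropped from the path, see the
 * WARN_ON at the top of the function)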
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;

			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	/*
	 * The nowait semantics are used only for write paths, where we don't
	 * use the tree mod log and sequence numbers.
	 */
	if (time_seq)
		ASSERT(!path->nowait);

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			path->need_commit_sem = 0;
			need_commit_sem = true;
			if (path->nowait) {
				if (!down_read_trylock(&fs_info->commit_root_sem)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				down_read(&fs_info->commit_root_sem);
			}
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
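	 * (when ret == 0 the old last key was found again and path->slots[0]
	 * points at it, so the next item lives one slot further on, hence
	 * the increment below)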
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leaves
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * A bit more explanation about this check: with ret > 0 the key wasn't
	 * found, the path points to the slot where it would be inserted, so
	 * the item at path->slots[0] must be the bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[i]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && path->nowait) {
				ret = -EAGAIN;
				goto done;
			}
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked.  To solve this situation, we give up
				 * on our lock and cycle.
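				 * cond_resched() below gives whoever holds
				 * the lock a chance to make progress before
				 * we retry from the top.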
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			if (path->nowait) {
				if (!btrfs_try_tree_read_lock(next)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				btrfs_tree_read_lock(next);
			}
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}

int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		return btrfs_next_old_leaf(root, path, time_seq);
	return 0;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}

/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
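 * (i.e. walk backwards from the current position until an EXTENT_ITEM or
 * METADATA_ITEM key whose objectid is still >= min_objectid is found)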
5152 * 5153 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5154 */ 5155 int btrfs_previous_extent_item(struct btrfs_root *root, 5156 struct btrfs_path *path, u64 min_objectid) 5157 { 5158 struct btrfs_key found_key; 5159 struct extent_buffer *leaf; 5160 u32 nritems; 5161 int ret; 5162 5163 while (1) { 5164 if (path->slots[0] == 0) { 5165 ret = btrfs_prev_leaf(root, path); 5166 if (ret != 0) 5167 return ret; 5168 } else { 5169 path->slots[0]--; 5170 } 5171 leaf = path->nodes[0]; 5172 nritems = btrfs_header_nritems(leaf); 5173 if (nritems == 0) 5174 return 1; 5175 if (path->slots[0] == nritems) 5176 path->slots[0]--; 5177 5178 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5179 if (found_key.objectid < min_objectid) 5180 break; 5181 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 5182 found_key.type == BTRFS_METADATA_ITEM_KEY) 5183 return 0; 5184 if (found_key.objectid == min_objectid && 5185 found_key.type < BTRFS_EXTENT_ITEM_KEY) 5186 break; 5187 } 5188 return 1; 5189 } 5190 5191 int __init btrfs_ctree_init(void) 5192 { 5193 btrfs_path_cachep = kmem_cache_create("btrfs_path", 5194 sizeof(struct btrfs_path), 0, 5195 SLAB_MEM_SPREAD, NULL); 5196 if (!btrfs_path_cachep) 5197 return -ENOMEM; 5198 return 0; 5199 } 5200 5201 void __cold btrfs_ctree_exit(void) 5202 { 5203 kmem_cache_destroy(btrfs_path_cachep); 5204 } 5205