// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node.  This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}
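/*
 * Illustrative sketch of the leaf layout assumed by the helpers above and
 * below (not authoritative, see the structure definitions in ctree.h):
 *
 *   [ header | item 0 | item 1 | ... | free space ... | data 1 | data 0 ]
 *             ^ the item array grows forward            ^ item data grows
 *               from the header                           backward from the
 *                                                         end of the leaf
 *
 * Each struct btrfs_item records the offset and size of its data, so
 * leaf_data_end() is simply the data offset of the last item, or the full
 * leaf data size when the leaf is empty.
 */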
/*
 * Copy item data from @src into @dst at the given offsets.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.  The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf.  This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf:	leaf that we're moving items within
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item position.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
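/*
 * Typical path lifecycle, as a hedged example (error handling trimmed for
 * brevity, the key values are made up for illustration):
 *
 *	struct btrfs_path *path;
 *	struct btrfs_key key = { .objectid = ino,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	// ... use path->nodes[0] / path->slots[0] when ret == 0 ...
 *	btrfs_free_path(path);	// also releases locks and buffer refs
 */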
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int errno)
{
	switch (errno) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cow-only roots (not-shareable trees, everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list.  Transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
		if (buf != root->commit_root)
			return 1;

		/*
		 * An extent buffer that used to be the commit root may still be
		 * shared because the tree height may have increased and it
		 * became a child of a higher level root.  This can happen when
		 * snapshotting a subvolume created in the current transaction.
		 */
		if (btrfs_header_generation(buf) == trans->transid)
			return 1;
	}

	return 0;
}
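/*
 * Hedged example of the sharing rule above: if a subvolume's last snapshot
 * was taken in generation 100, a block written in generation 90 may still be
 * referenced by that snapshot and is considered potentially shared, while a
 * block COWed in generation 101 or later can only be referenced by the
 * current tree (unless it was allocated by relocation, which is why the
 * RELOC header flag is also checked).  The generation numbers here are made
 * up for illustration.
 */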
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as dropping
	 * a tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
		     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
		btrfs_crit(fs_info,
"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
			   buf->start, btrfs_header_level(buf),
			   btrfs_root_id(root), refs, flags);
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}
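/*
 * Rough summary of what update_ref_for_cow() does, as a sketch only (the
 * code above is authoritative):
 *
 *   refs > 1, owner tree still references buf, no FULL_BACKREF yet:
 *       inc refs on buf's children (full backref) and set FULL_BACKREF
 *   refs > 1, otherwise:
 *       inc refs on cow's children (shared or normal backref)
 *   refs == 1 and buf carries FULL_BACKREF:
 *       inc refs through cow, then drop buf's references
 *   refs == 1, plain case:
 *       buf is simply abandoned: it is marked clean and *last_ref is set
 */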
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		btrfs_set_node_blockptr(parent, parent_slot, cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED).  If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_keys
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
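/*
 * Keys therefore sort lexicographically on (objectid, type, offset).  For
 * example (values made up for illustration):
 *
 *   (256, BTRFS_INODE_ITEM_KEY, 0) < (256, BTRFS_DIR_ITEM_KEY, 123)
 *	because the objectids match and the inode item type is numerically
 *	smaller, while
 *   (256, BTRFS_DIR_ITEM_KEY, 1) < (257, BTRFS_INODE_ITEM_KEY, 0)
 *	because the objectid is compared first.
 */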
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED).  If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer.  Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot.  If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
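/*
 * Sketch of the usual way callers consume btrfs_bin_search() results (the
 * variable names are made up):
 *
 *	int slot;
 *	int ret = btrfs_bin_search(eb, 0, &key, &slot);
 *
 *	if (ret < 0)
 *		return ret;	// corrupted buffer, the -EINVAL case above
 *	if (ret == 0)
 *		// exact match, the key lives at @slot
 *	else
 *		// not found, @slot is where the key would be inserted,
 *		// possibly equal to btrfs_header_nritems(eb)
 */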
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/*
 * given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right.
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/*
 * Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -= btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}


/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree.  The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search.  We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date.  Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}
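/*
 * Illustrative sketch of the retry contract above: btrfs_search_slot()
 * treats -EAGAIN from read_block_for_search() as "the path was unlocked
 * under us", releases the path and restarts the whole descent from the
 * root.  Roughly:
 *
 *	again:
 *		b = btrfs_search_slot_get_root(root, p, write_lock_level);
 *		...
 *		err = read_block_for_search(root, p, &b, level, slot, key);
 *		if (err == -EAGAIN)
 *			goto again;
 *
 * (Simplified; the real loop in btrfs_search_slot() below carries more
 * state.)
 */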
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
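/*
 * btrfs_find_item() above doubles as a sketch of the common search pattern:
 * search with ins_len == 0 and cow == 0, and when the returned slot is past
 * the last item of the leaf, step to the next leaf with btrfs_next_leaf()
 * before reading the key.  Callers that scan a key range simply repeat the
 * "check slot, maybe btrfs_next_leaf(), advance path->slots[0]" sequence in
 * a loop.
 */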
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked.
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version.  The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to.  Thus we can skip searching lower levels.
1964 */ 1965 if (prev_cmp == 0) { 1966 *slot = 0; 1967 return 0; 1968 } 1969 1970 return btrfs_bin_search(eb, search_low_slot, key, slot); 1971 } 1972 1973 static int search_leaf(struct btrfs_trans_handle *trans, 1974 struct btrfs_root *root, 1975 const struct btrfs_key *key, 1976 struct btrfs_path *path, 1977 int ins_len, 1978 int prev_cmp) 1979 { 1980 struct extent_buffer *leaf = path->nodes[0]; 1981 int leaf_free_space = -1; 1982 int search_low_slot = 0; 1983 int ret; 1984 bool do_bin_search = true; 1985 1986 /* 1987 * If we are doing an insertion, the leaf has enough free space and the 1988 * destination slot for the key is not slot 0, then we can unlock our 1989 * write lock on the parent, and any other upper nodes, before doing the 1990 * binary search on the leaf (with search_for_key_slot()), allowing other 1991 * tasks to lock the parent and any other upper nodes. 1992 */ 1993 if (ins_len > 0) { 1994 /* 1995 * Cache the leaf free space, since we will need it later and it 1996 * will not change until then. 1997 */ 1998 leaf_free_space = btrfs_leaf_free_space(leaf); 1999 2000 /* 2001 * !path->locks[1] means we have a single node tree, the leaf is 2002 * the root of the tree. 2003 */ 2004 if (path->locks[1] && leaf_free_space >= ins_len) { 2005 struct btrfs_disk_key first_key; 2006 2007 ASSERT(btrfs_header_nritems(leaf) > 0); 2008 btrfs_item_key(leaf, &first_key, 0); 2009 2010 /* 2011 * Doing the extra comparison with the first key is cheap, 2012 * taking into account that the first key is very likely 2013 * already in a cache line because it immediately follows 2014 * the extent buffer's header and we have recently accessed 2015 * the header's level field. 2016 */ 2017 ret = comp_keys(&first_key, key); 2018 if (ret < 0) { 2019 /* 2020 * The first key is smaller than the key we want 2021 * to insert, so we are safe to unlock all upper 2022 * nodes and we have to do the binary search. 2023 * 2024 * We do use btrfs_unlock_up_safe() and not 2025 * unlock_up() because the latter does not unlock 2026 * nodes with a slot of 0 - we can safely unlock 2027 * any node even if its slot is 0 since in this 2028 * case the key does not end up at slot 0 of the 2029 * leaf and there's no need to split the leaf. 2030 */ 2031 btrfs_unlock_up_safe(path, 1); 2032 search_low_slot = 1; 2033 } else { 2034 /* 2035 * The first key is >= the key we want to 2036 * insert, so we can skip the binary search as 2037 * the target key will be at slot 0. 2038 * 2039 * We cannot unlock upper nodes when the key is 2040 * less than the first key, because we will need 2041 * to update the key at slot 0 of the parent node 2042 * and possibly of other upper nodes too. 2043 * If the key matches the first key, then we can 2044 * unlock all the upper nodes, using 2045 * btrfs_unlock_up_safe() instead of unlock_up() 2046 * as stated above. 2047 */ 2048 if (ret == 0) 2049 btrfs_unlock_up_safe(path, 1); 2050 /* 2051 * ret is already 0 or 1, matching the result of 2052 * a btrfs_bin_search() call, so there is no need 2053 * to adjust it. 2054 */ 2055 do_bin_search = false; 2056 path->slots[0] = 0; 2057 } 2058 } 2059 } 2060 2061 if (do_bin_search) { 2062 ret = search_for_key_slot(leaf, search_low_slot, key, 2063 prev_cmp, &path->slots[0]); 2064 if (ret < 0) 2065 return ret; 2066 } 2067 2068 if (ins_len > 0) { 2069 /* 2070 * Item key already exists. In this case, if we are allowed to 2071 * insert the item (for example, in dir_item case, item key 2072 * collision is allowed), it will be merged with the original 2073 * item.
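 * (Directory items are the classic example: entries whose names hash to
 * the same key offset share one item and are appended to its payload.)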
Only the item size grows, no new btrfs item will be 2074 * added. If search_for_extension is not set, ins_len already 2075 * accounts for the size of struct btrfs_item; deduct it here so the 2076 * leaf space check will be correct. 2077 */ 2078 if (ret == 0 && !path->search_for_extension) { 2079 ASSERT(ins_len >= sizeof(struct btrfs_item)); 2080 ins_len -= sizeof(struct btrfs_item); 2081 } 2082 2083 ASSERT(leaf_free_space >= 0); 2084 2085 if (leaf_free_space < ins_len) { 2086 int err; 2087 2088 err = split_leaf(trans, root, key, path, ins_len, 2089 (ret == 0)); 2090 ASSERT(err <= 0); 2091 if (WARN_ON(err > 0)) 2092 err = -EUCLEAN; 2093 if (err) 2094 ret = err; 2095 } 2096 } 2097 2098 return ret; 2099 } 2100 2101 /* 2102 * btrfs_search_slot - look for a key in a tree and perform necessary 2103 * modifications to preserve tree invariants. 2104 * 2105 * @trans: Handle of transaction, used when modifying the tree 2106 * @root: The root of the tree we are searching in 2107 * @key: The key we are looking for 2108 * @p: Holds all btree nodes along the search path 2109 * @ins_len: Indicates purpose of search: 2110 * >0 for inserts it's the size of the item inserted (*) 2111 * <0 for deletions 2112 * 0 for plain searches, not modifying the tree 2113 * 2114 * (*) If the size of the item inserted doesn't include 2115 * sizeof(struct btrfs_item), then p->search_for_extension must 2116 * be set. 2117 * @cow: Boolean indicating whether CoW operations should be performed. Must 2118 * always be 1 when modifying the tree. 2119 * 2120 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 2121 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 2122 * 2123 * If @key is found, 0 is returned and you can find the item in the leaf level 2124 * of the path (level 0) 2125 * 2126 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 2127 * points to the slot where it should be inserted 2128 * 2129 * If an error is encountered while searching the tree a negative error number 2130 * is returned 2131 */ 2132 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2133 const struct btrfs_key *key, struct btrfs_path *p, 2134 int ins_len, int cow) 2135 { 2136 struct btrfs_fs_info *fs_info = root->fs_info; 2137 struct extent_buffer *b; 2138 int slot; 2139 int ret; 2140 int err; 2141 int level; 2142 int lowest_unlock = 1; 2143 /* everything at write_lock_level or lower must be write locked */ 2144 int write_lock_level = 0; 2145 u8 lowest_level = 0; 2146 int min_write_lock_level; 2147 int prev_cmp; 2148 2149 might_sleep(); 2150 2151 lowest_level = p->lowest_level; 2152 WARN_ON(lowest_level && ins_len > 0); 2153 WARN_ON(p->nodes[0] != NULL); 2154 BUG_ON(!cow && ins_len); 2155 2156 /* 2157 * For now only allow nowait for read only operations. There's no 2158 * strict reason why we can't, but we only need it for reads, so it's 2159 * only implemented for reads.
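 * (A sketch of the nowait contract, as implemented below: every place
 * that would block, the commit_root_sem down_read, the root node lock
 * and the per-level read locks, is replaced by a trylock that bails out
 * with -EAGAIN for the caller to retry.)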
2160 */ 2161 ASSERT(!p->nowait || !cow); 2162 2163 if (ins_len < 0) { 2164 lowest_unlock = 2; 2165 2166 /* when we are removing items, we might have to go up to level 2167 * two as we update tree pointers. Make sure we keep write 2168 * locks for those levels as well 2169 */ 2170 write_lock_level = 2; 2171 } else if (ins_len > 0) { 2172 /* 2173 * for inserting items, make sure we have a write lock on 2174 * level 1 so we can update keys 2175 */ 2176 write_lock_level = 1; 2177 } 2178 2179 if (!cow) 2180 write_lock_level = -1; 2181 2182 if (cow && (p->keep_locks || p->lowest_level)) 2183 write_lock_level = BTRFS_MAX_LEVEL; 2184 2185 min_write_lock_level = write_lock_level; 2186 2187 if (p->need_commit_sem) { 2188 ASSERT(p->search_commit_root); 2189 if (p->nowait) { 2190 if (!down_read_trylock(&fs_info->commit_root_sem)) 2191 return -EAGAIN; 2192 } else { 2193 down_read(&fs_info->commit_root_sem); 2194 } 2195 } 2196 2197 again: 2198 prev_cmp = -1; 2199 b = btrfs_search_slot_get_root(root, p, write_lock_level); 2200 if (IS_ERR(b)) { 2201 ret = PTR_ERR(b); 2202 goto done; 2203 } 2204 2205 while (b) { 2206 int dec = 0; 2207 2208 level = btrfs_header_level(b); 2209 2210 if (cow) { 2211 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 2212 2213 /* 2214 * if we don't really need to cow this block 2215 * then we don't want to set the path blocking, 2216 * so we test it here 2217 */ 2218 if (!should_cow_block(trans, root, b)) 2219 goto cow_done; 2220 2221 /* 2222 * must have write locks on this node and the 2223 * parent 2224 */ 2225 if (level > write_lock_level || 2226 (level + 1 > write_lock_level && 2227 level + 1 < BTRFS_MAX_LEVEL && 2228 p->nodes[level + 1])) { 2229 write_lock_level = level + 1; 2230 btrfs_release_path(p); 2231 goto again; 2232 } 2233 2234 if (last_level) 2235 err = btrfs_cow_block(trans, root, b, NULL, 0, 2236 &b, 2237 BTRFS_NESTING_COW); 2238 else 2239 err = btrfs_cow_block(trans, root, b, 2240 p->nodes[level + 1], 2241 p->slots[level + 1], &b, 2242 BTRFS_NESTING_COW); 2243 if (err) { 2244 ret = err; 2245 goto done; 2246 } 2247 } 2248 cow_done: 2249 p->nodes[level] = b; 2250 2251 /* 2252 * we have a lock on b and as long as we aren't changing 2253 * the tree, there is no way for the items in b to change. 2254 * It is safe to drop the lock on our parent before we 2255 * go through the expensive btree search on b. 2256 * 2257 * If we're inserting or deleting (ins_len != 0), then we might 2258 * be changing slot zero, which may require changing the parent. 2259 * So, we can't drop the lock until after we know which slot 2260 * we're operating on.
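 * (If we do end up at slot zero of a leaf during an insert, the key in
 * the parent, and possibly in higher nodes too, must be rewritten,
 * which is what fixup_low_keys() does elsewhere in this file.)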
2261 */ 2262 if (!ins_len && !p->keep_locks) { 2263 int u = level + 1; 2264 2265 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2266 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2267 p->locks[u] = 0; 2268 } 2269 } 2270 2271 if (level == 0) { 2272 if (ins_len > 0) 2273 ASSERT(write_lock_level >= 1); 2274 2275 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2276 if (!p->search_for_split) 2277 unlock_up(p, level, lowest_unlock, 2278 min_write_lock_level, NULL); 2279 goto done; 2280 } 2281 2282 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2283 if (ret < 0) 2284 goto done; 2285 prev_cmp = ret; 2286 2287 if (ret && slot > 0) { 2288 dec = 1; 2289 slot--; 2290 } 2291 p->slots[level] = slot; 2292 err = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2293 &write_lock_level); 2294 if (err == -EAGAIN) 2295 goto again; 2296 if (err) { 2297 ret = err; 2298 goto done; 2299 } 2300 b = p->nodes[level]; 2301 slot = p->slots[level]; 2302 2303 /* 2304 * Slot 0 is special, if we change the key we have to update 2305 * the parent pointer which means we must have a write lock on 2306 * the parent 2307 */ 2308 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2309 write_lock_level = level + 1; 2310 btrfs_release_path(p); 2311 goto again; 2312 } 2313 2314 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2315 &write_lock_level); 2316 2317 if (level == lowest_level) { 2318 if (dec) 2319 p->slots[level]++; 2320 goto done; 2321 } 2322 2323 err = read_block_for_search(root, p, &b, level, slot, key); 2324 if (err == -EAGAIN) 2325 goto again; 2326 if (err) { 2327 ret = err; 2328 goto done; 2329 } 2330 2331 if (!p->skip_locking) { 2332 level = btrfs_header_level(b); 2333 2334 btrfs_maybe_reset_lockdep_class(root, b); 2335 2336 if (level <= write_lock_level) { 2337 btrfs_tree_lock(b); 2338 p->locks[level] = BTRFS_WRITE_LOCK; 2339 } else { 2340 if (p->nowait) { 2341 if (!btrfs_try_tree_read_lock(b)) { 2342 free_extent_buffer(b); 2343 ret = -EAGAIN; 2344 goto done; 2345 } 2346 } else { 2347 btrfs_tree_read_lock(b); 2348 } 2349 p->locks[level] = BTRFS_READ_LOCK; 2350 } 2351 p->nodes[level] = b; 2352 } 2353 } 2354 ret = 1; 2355 done: 2356 if (ret < 0 && !p->skip_release_on_error) 2357 btrfs_release_path(p); 2358 2359 if (p->need_commit_sem) { 2360 int ret2; 2361 2362 ret2 = finish_need_commit_sem_search(p); 2363 up_read(&fs_info->commit_root_sem); 2364 if (ret2) 2365 ret = ret2; 2366 } 2367 2368 return ret; 2369 } 2370 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2371 2372 /* 2373 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2374 * current state of the tree together with the operations recorded in the tree 2375 * modification log to search for the key in a previous version of this tree, as 2376 * denoted by the time_seq parameter. 2377 * 2378 * Naturally, there is no support for insert, delete or cow operations. 2379 * 2380 * The resulting path and return value will be set up as if we called 2381 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 
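 * A minimal usage sketch (hypothetical caller, error handling elided):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key found;
 *	int ret;
 *
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
 *	btrfs_free_path(path);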
2382 */ 2383 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2384 struct btrfs_path *p, u64 time_seq) 2385 { 2386 struct btrfs_fs_info *fs_info = root->fs_info; 2387 struct extent_buffer *b; 2388 int slot; 2389 int ret; 2390 int err; 2391 int level; 2392 int lowest_unlock = 1; 2393 u8 lowest_level = 0; 2394 2395 lowest_level = p->lowest_level; 2396 WARN_ON(p->nodes[0] != NULL); 2397 ASSERT(!p->nowait); 2398 2399 if (p->search_commit_root) { 2400 BUG_ON(time_seq); 2401 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2402 } 2403 2404 again: 2405 b = btrfs_get_old_root(root, time_seq); 2406 if (!b) { 2407 ret = -EIO; 2408 goto done; 2409 } 2410 level = btrfs_header_level(b); 2411 p->locks[level] = BTRFS_READ_LOCK; 2412 2413 while (b) { 2414 int dec = 0; 2415 2416 level = btrfs_header_level(b); 2417 p->nodes[level] = b; 2418 2419 /* 2420 * we have a lock on b and as long as we aren't changing 2421 * the tree, there is no way for the items in b to change. 2422 * It is safe to drop the lock on our parent before we 2423 * go through the expensive btree search on b. 2424 */ 2425 btrfs_unlock_up_safe(p, level + 1); 2426 2427 ret = btrfs_bin_search(b, 0, key, &slot); 2428 if (ret < 0) 2429 goto done; 2430 2431 if (level == 0) { 2432 p->slots[level] = slot; 2433 unlock_up(p, level, lowest_unlock, 0, NULL); 2434 goto done; 2435 } 2436 2437 if (ret && slot > 0) { 2438 dec = 1; 2439 slot--; 2440 } 2441 p->slots[level] = slot; 2442 unlock_up(p, level, lowest_unlock, 0, NULL); 2443 2444 if (level == lowest_level) { 2445 if (dec) 2446 p->slots[level]++; 2447 goto done; 2448 } 2449 2450 err = read_block_for_search(root, p, &b, level, slot, key); 2451 if (err == -EAGAIN) 2452 goto again; 2453 if (err) { 2454 ret = err; 2455 goto done; 2456 } 2457 2458 level = btrfs_header_level(b); 2459 btrfs_tree_read_lock(b); 2460 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); 2461 if (!b) { 2462 ret = -ENOMEM; 2463 goto done; 2464 } 2465 p->locks[level] = BTRFS_READ_LOCK; 2466 p->nodes[level] = b; 2467 } 2468 ret = 1; 2469 done: 2470 if (ret < 0) 2471 btrfs_release_path(p); 2472 2473 return ret; 2474 } 2475 2476 /* 2477 * Search the tree again to find a leaf with smaller keys. 2478 * Returns 0 if it found something. 2479 * Returns 1 if there are no smaller keys. 2480 * Returns < 0 on error. 2481 * 2482 * This may release the path, and so you may lose any locks held at the 2483 * time you call it. 2484 */ 2485 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 2486 { 2487 struct btrfs_key key; 2488 struct btrfs_key orig_key; 2489 struct btrfs_disk_key found_key; 2490 int ret; 2491 2492 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 2493 orig_key = key; 2494 2495 if (key.offset > 0) { 2496 key.offset--; 2497 } else if (key.type > 0) { 2498 key.type--; 2499 key.offset = (u64)-1; 2500 } else if (key.objectid > 0) { 2501 key.objectid--; 2502 key.type = (u8)-1; 2503 key.offset = (u64)-1; 2504 } else { 2505 return 1; 2506 } 2507 2508 btrfs_release_path(path); 2509 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2510 if (ret <= 0) 2511 return ret; 2512 2513 /* 2514 * Previous key not found.
Even if we were at slot 0 of the leaf we had 2515 * before releasing the path and calling btrfs_search_slot(), we now may 2516 * be in a slot pointing to the same original key - this can happen if 2517 * after we released the path, one or more items were moved from a 2518 * sibling leaf into the front of the leaf we had due to an insertion 2519 * (see push_leaf_right()). 2520 * If we hit this case and our slot is > 0, just decrement the slot 2521 * so that the caller does not process the same key again, which may or 2522 * may not break the caller, depending on its logic. 2523 */ 2524 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 2525 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); 2526 ret = comp_keys(&found_key, &orig_key); 2527 if (ret == 0) { 2528 if (path->slots[0] > 0) { 2529 path->slots[0]--; 2530 return 0; 2531 } 2532 /* 2533 * At slot 0, same key as before, it means orig_key is 2534 * the lowest, leftmost, key in the tree. We're done. 2535 */ 2536 return 1; 2537 } 2538 } 2539 2540 btrfs_item_key(path->nodes[0], &found_key, 0); 2541 ret = comp_keys(&found_key, &key); 2542 /* 2543 * We might have had an item with the previous key in the tree right 2544 * before we released our path. And after we released our path, that 2545 * item might have been pushed to the first slot (0) of the leaf we 2546 * were holding due to a tree balance. Alternatively, an item with the 2547 * previous key can exist as the only element of a leaf (big fat item). 2548 * Therefore account for these 2 cases, so that our callers (like 2549 * btrfs_previous_item) don't miss an existing item with a key matching 2550 * the previous key we computed above. 2551 */ 2552 if (ret <= 0) 2553 return 0; 2554 return 1; 2555 } 2556 2557 /* 2558 * helper to use instead of search slot if no exact match is needed but 2559 * instead the next or previous item should be returned. 2560 * When find_higher is true, the next higher item is returned, the next lower 2561 * otherwise. 2562 * When return_any and find_higher are both true, and no higher item is found, 2563 * return the next lower instead. 2564 * When return_any is true and find_higher is false, and no lower item is found, 2565 * return the next higher instead. 2566 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2567 * < 0 on error 2568 */ 2569 int btrfs_search_slot_for_read(struct btrfs_root *root, 2570 const struct btrfs_key *key, 2571 struct btrfs_path *p, int find_higher, 2572 int return_any) 2573 { 2574 int ret; 2575 struct extent_buffer *leaf; 2576 2577 again: 2578 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2579 if (ret <= 0) 2580 return ret; 2581 /* 2582 * a return value of 1 means the path is at the position where the 2583 * item should be inserted. Normally this is the next bigger item, 2584 * but in case the previous item is the last in a leaf, path points 2585 * to the first free slot in the previous leaf, i.e. at an invalid 2586 * item.
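 * (Worked example with hypothetical keys: with items {5, 10} in the tree
 * and a search for 7, btrfs_search_slot() returns 1 with the path at the
 * slot of 10; find_higher returns the item 10 directly, while
 * !find_higher steps back one slot to return 5.)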
2587 */ 2588 leaf = p->nodes[0]; 2589 2590 if (find_higher) { 2591 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2592 ret = btrfs_next_leaf(root, p); 2593 if (ret <= 0) 2594 return ret; 2595 if (!return_any) 2596 return 1; 2597 /* 2598 * no higher item found, return the next 2599 * lower instead 2600 */ 2601 return_any = 0; 2602 find_higher = 0; 2603 btrfs_release_path(p); 2604 goto again; 2605 } 2606 } else { 2607 if (p->slots[0] == 0) { 2608 ret = btrfs_prev_leaf(root, p); 2609 if (ret < 0) 2610 return ret; 2611 if (!ret) { 2612 leaf = p->nodes[0]; 2613 if (p->slots[0] == btrfs_header_nritems(leaf)) 2614 p->slots[0]--; 2615 return 0; 2616 } 2617 if (!return_any) 2618 return 1; 2619 /* 2620 * no lower item found, return the next 2621 * higher instead 2622 */ 2623 return_any = 0; 2624 find_higher = 1; 2625 btrfs_release_path(p); 2626 goto again; 2627 } else { 2628 --p->slots[0]; 2629 } 2630 } 2631 return 0; 2632 } 2633 2634 /* 2635 * Execute search and call btrfs_previous_item to traverse backwards if the item 2636 * was not found. 2637 * 2638 * Return 0 if found, 1 if not found and < 0 if error. 2639 */ 2640 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2641 struct btrfs_path *path) 2642 { 2643 int ret; 2644 2645 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2646 if (ret > 0) 2647 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2648 2649 if (ret == 0) 2650 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2651 2652 return ret; 2653 } 2654 2655 /* 2656 * Search for a valid slot for the given path. 2657 * 2658 * @root: The root node of the tree. 2659 * @key: Will contain a valid item if found. 2660 * @path: The starting point to validate the slot. 2661 * 2662 * Return: 0 if the item is valid 2663 * 1 if not found 2664 * <0 if error. 2665 */ 2666 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2667 struct btrfs_path *path) 2668 { 2669 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2670 int ret; 2671 2672 ret = btrfs_next_leaf(root, path); 2673 if (ret) 2674 return ret; 2675 } 2676 2677 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2678 return 0; 2679 } 2680 2681 /* 2682 * adjust the pointers going up the tree, starting at @level, 2683 * making sure the right key of each node points to 'key'. 2684 * This is used after shifting pointers to the left, so it stops 2685 * fixing up pointers when a given leaf/node is not in slot 0 of the 2686 * higher levels 2687 * 2688 */ 2689 static void fixup_low_keys(struct btrfs_trans_handle *trans, 2690 struct btrfs_path *path, 2691 struct btrfs_disk_key *key, int level) 2692 { 2693 int i; 2694 struct extent_buffer *t; 2695 int ret; 2696 2697 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2698 int tslot = path->slots[i]; 2699 2700 if (!path->nodes[i]) 2701 break; 2702 t = path->nodes[i]; 2703 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2704 BTRFS_MOD_LOG_KEY_REPLACE); 2705 BUG_ON(ret < 0); 2706 btrfs_set_node_key(t, key, tslot); 2707 btrfs_mark_buffer_dirty(trans, path->nodes[i]); 2708 if (tslot != 0) 2709 break; 2710 } 2711 } 2712 2713 /* 2714 * update item key. 2715 * 2716 * This function isn't completely safe.
It's the caller's responsibility to ensure 2717 * that the new key won't break the order 2718 */ 2719 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, 2720 struct btrfs_path *path, 2721 const struct btrfs_key *new_key) 2722 { 2723 struct btrfs_fs_info *fs_info = trans->fs_info; 2724 struct btrfs_disk_key disk_key; 2725 struct extent_buffer *eb; 2726 int slot; 2727 2728 eb = path->nodes[0]; 2729 slot = path->slots[0]; 2730 if (slot > 0) { 2731 btrfs_item_key(eb, &disk_key, slot - 1); 2732 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { 2733 btrfs_print_leaf(eb); 2734 btrfs_crit(fs_info, 2735 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2736 slot, btrfs_disk_key_objectid(&disk_key), 2737 btrfs_disk_key_type(&disk_key), 2738 btrfs_disk_key_offset(&disk_key), 2739 new_key->objectid, new_key->type, 2740 new_key->offset); 2741 BUG(); 2742 } 2743 } 2744 if (slot < btrfs_header_nritems(eb) - 1) { 2745 btrfs_item_key(eb, &disk_key, slot + 1); 2746 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { 2747 btrfs_print_leaf(eb); 2748 btrfs_crit(fs_info, 2749 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2750 slot, btrfs_disk_key_objectid(&disk_key), 2751 btrfs_disk_key_type(&disk_key), 2752 btrfs_disk_key_offset(&disk_key), 2753 new_key->objectid, new_key->type, 2754 new_key->offset); 2755 BUG(); 2756 } 2757 } 2758 2759 btrfs_cpu_key_to_disk(&disk_key, new_key); 2760 btrfs_set_item_key(eb, &disk_key, slot); 2761 btrfs_mark_buffer_dirty(trans, eb); 2762 if (slot == 0) 2763 fixup_low_keys(trans, path, &disk_key, 1); 2764 } 2765 2766 /* 2767 * Check key order of two sibling extent buffers. 2768 * 2769 * Return true if something is wrong. 2770 * Return false if everything is fine. 2771 * 2772 * Tree-checker only works inside one tree block, thus the following 2773 * corruption cannot be detected by tree-checker: 2774 * 2775 * Leaf @left | Leaf @right 2776 * -------------------------------------------------------------- 2777 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2778 * 2779 * Key f6 in leaf @left itself is valid, but not valid when the next 2780 * key in leaf @right is 7. 2781 * This can only be checked at tree block merge time. 2782 * And since tree checker has ensured all key order in each tree block 2783 * is correct, we only need to check the last key of @left and the first 2784 * key of @right.
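 * (Callers treat a failure here as on-disk corruption: they set -EUCLEAN
 * and abort the transaction, see push_node_left() and friends below.)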
2785 */ 2786 static bool check_sibling_keys(struct extent_buffer *left, 2787 struct extent_buffer *right) 2788 { 2789 struct btrfs_key left_last; 2790 struct btrfs_key right_first; 2791 int level = btrfs_header_level(left); 2792 int nr_left = btrfs_header_nritems(left); 2793 int nr_right = btrfs_header_nritems(right); 2794 2795 /* No key to check in one of the tree blocks */ 2796 if (!nr_left || !nr_right) 2797 return false; 2798 2799 if (level) { 2800 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2801 btrfs_node_key_to_cpu(right, &right_first, 0); 2802 } else { 2803 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2804 btrfs_item_key_to_cpu(right, &right_first, 0); 2805 } 2806 2807 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2808 btrfs_crit(left->fs_info, "left extent buffer:"); 2809 btrfs_print_tree(left, false); 2810 btrfs_crit(left->fs_info, "right extent buffer:"); 2811 btrfs_print_tree(right, false); 2812 btrfs_crit(left->fs_info, 2813 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2814 left_last.objectid, left_last.type, 2815 left_last.offset, right_first.objectid, 2816 right_first.type, right_first.offset); 2817 return true; 2818 } 2819 return false; 2820 } 2821 2822 /* 2823 * try to push data from one node into the next node left in the 2824 * tree. 2825 * 2826 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2827 * error, and > 0 if there was no room in the left hand block. 2828 */ 2829 static int push_node_left(struct btrfs_trans_handle *trans, 2830 struct extent_buffer *dst, 2831 struct extent_buffer *src, int empty) 2832 { 2833 struct btrfs_fs_info *fs_info = trans->fs_info; 2834 int push_items = 0; 2835 int src_nritems; 2836 int dst_nritems; 2837 int ret = 0; 2838 2839 src_nritems = btrfs_header_nritems(src); 2840 dst_nritems = btrfs_header_nritems(dst); 2841 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2842 WARN_ON(btrfs_header_generation(src) != trans->transid); 2843 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2844 2845 if (!empty && src_nritems <= 8) 2846 return 1; 2847 2848 if (push_items <= 0) 2849 return 1; 2850 2851 if (empty) { 2852 push_items = min(src_nritems, push_items); 2853 if (push_items < src_nritems) { 2854 /* leave at least 8 pointers in the node if 2855 * we aren't going to empty it 2856 */ 2857 if (src_nritems - push_items < 8) { 2858 if (push_items <= 8) 2859 return 1; 2860 push_items -= 8; 2861 } 2862 } 2863 } else 2864 push_items = min(src_nritems - 8, push_items); 2865 2866 /* dst is the left eb, src is the middle eb */ 2867 if (check_sibling_keys(dst, src)) { 2868 ret = -EUCLEAN; 2869 btrfs_abort_transaction(trans, ret); 2870 return ret; 2871 } 2872 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2873 if (ret) { 2874 btrfs_abort_transaction(trans, ret); 2875 return ret; 2876 } 2877 copy_extent_buffer(dst, src, 2878 btrfs_node_key_ptr_offset(dst, dst_nritems), 2879 btrfs_node_key_ptr_offset(src, 0), 2880 push_items * sizeof(struct btrfs_key_ptr)); 2881 2882 if (push_items < src_nritems) { 2883 /* 2884 * btrfs_tree_mod_log_eb_copy handles logging the move, so we 2885 * don't need to do an explicit tree mod log operation for it. 
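 * The memmove below shifts the pointers that remain in @src down to
 * slot 0, closing the gap left by the @push_items entries we just
 * copied into @dst.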
2886 */ 2887 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2888 btrfs_node_key_ptr_offset(src, push_items), 2889 (src_nritems - push_items) * 2890 sizeof(struct btrfs_key_ptr)); 2891 } 2892 btrfs_set_header_nritems(src, src_nritems - push_items); 2893 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2894 btrfs_mark_buffer_dirty(trans, src); 2895 btrfs_mark_buffer_dirty(trans, dst); 2896 2897 return ret; 2898 } 2899 2900 /* 2901 * try to push data from one node into the next node right in the 2902 * tree. 2903 * 2904 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2905 * error, and > 0 if there was no room in the right hand block. 2906 * 2907 * this will only push up to 1/2 the contents of the left node over 2908 */ 2909 static int balance_node_right(struct btrfs_trans_handle *trans, 2910 struct extent_buffer *dst, 2911 struct extent_buffer *src) 2912 { 2913 struct btrfs_fs_info *fs_info = trans->fs_info; 2914 int push_items = 0; 2915 int max_push; 2916 int src_nritems; 2917 int dst_nritems; 2918 int ret = 0; 2919 2920 WARN_ON(btrfs_header_generation(src) != trans->transid); 2921 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2922 2923 src_nritems = btrfs_header_nritems(src); 2924 dst_nritems = btrfs_header_nritems(dst); 2925 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2926 if (push_items <= 0) 2927 return 1; 2928 2929 if (src_nritems < 4) 2930 return 1; 2931 2932 max_push = src_nritems / 2 + 1; 2933 /* don't try to empty the node */ 2934 if (max_push >= src_nritems) 2935 return 1; 2936 2937 if (max_push < push_items) 2938 push_items = max_push; 2939 2940 /* dst is the right eb, src is the middle eb */ 2941 if (check_sibling_keys(src, dst)) { 2942 ret = -EUCLEAN; 2943 btrfs_abort_transaction(trans, ret); 2944 return ret; 2945 } 2946 2947 /* 2948 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't 2949 * need to do an explicit tree mod log operation for it. 2950 */ 2951 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2952 btrfs_node_key_ptr_offset(dst, 0), 2953 (dst_nritems) * 2954 sizeof(struct btrfs_key_ptr)); 2955 2956 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2957 push_items); 2958 if (ret) { 2959 btrfs_abort_transaction(trans, ret); 2960 return ret; 2961 } 2962 copy_extent_buffer(dst, src, 2963 btrfs_node_key_ptr_offset(dst, 0), 2964 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2965 push_items * sizeof(struct btrfs_key_ptr)); 2966 2967 btrfs_set_header_nritems(src, src_nritems - push_items); 2968 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2969 2970 btrfs_mark_buffer_dirty(trans, src); 2971 btrfs_mark_buffer_dirty(trans, dst); 2972 2973 return ret; 2974 } 2975 2976 /* 2977 * helper function to insert a new root level in the tree. 2978 * A new node is allocated, and a single item is inserted to 2979 * point to the existing root 2980 * 2981 * returns zero on success or < 0 on failure. 
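 * (On success the tree becomes one level taller: the new node holds a
 * single pointer to the old root, and it is left write locked in
 * path->nodes[level].)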
2982 */ 2983 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2984 struct btrfs_root *root, 2985 struct btrfs_path *path, int level) 2986 { 2987 struct btrfs_fs_info *fs_info = root->fs_info; 2988 u64 lower_gen; 2989 struct extent_buffer *lower; 2990 struct extent_buffer *c; 2991 struct extent_buffer *old; 2992 struct btrfs_disk_key lower_key; 2993 int ret; 2994 2995 BUG_ON(path->nodes[level]); 2996 BUG_ON(path->nodes[level-1] != root->node); 2997 2998 lower = path->nodes[level-1]; 2999 if (level == 1) 3000 btrfs_item_key(lower, &lower_key, 0); 3001 else 3002 btrfs_node_key(lower, &lower_key, 0); 3003 3004 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3005 &lower_key, level, root->node->start, 0, 3006 BTRFS_NESTING_NEW_ROOT); 3007 if (IS_ERR(c)) 3008 return PTR_ERR(c); 3009 3010 root_add_used(root, fs_info->nodesize); 3011 3012 btrfs_set_header_nritems(c, 1); 3013 btrfs_set_node_key(c, &lower_key, 0); 3014 btrfs_set_node_blockptr(c, 0, lower->start); 3015 lower_gen = btrfs_header_generation(lower); 3016 WARN_ON(lower_gen != trans->transid); 3017 3018 btrfs_set_node_ptr_generation(c, 0, lower_gen); 3019 3020 btrfs_mark_buffer_dirty(trans, c); 3021 3022 old = root->node; 3023 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 3024 if (ret < 0) { 3025 btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); 3026 btrfs_tree_unlock(c); 3027 free_extent_buffer(c); 3028 return ret; 3029 } 3030 rcu_assign_pointer(root->node, c); 3031 3032 /* the super has an extra ref to root->node */ 3033 free_extent_buffer(old); 3034 3035 add_root_to_dirty_list(root); 3036 atomic_inc(&c->refs); 3037 path->nodes[level] = c; 3038 path->locks[level] = BTRFS_WRITE_LOCK; 3039 path->slots[level] = 0; 3040 return 0; 3041 } 3042 3043 /* 3044 * worker function to insert a single pointer in a node. 3045 * the node should have enough room for the pointer already 3046 * 3047 * slot and level indicate where you want the key to go, and 3048 * blocknr is the block the key points to. 3049 */ 3050 static int insert_ptr(struct btrfs_trans_handle *trans, 3051 struct btrfs_path *path, 3052 struct btrfs_disk_key *key, u64 bytenr, 3053 int slot, int level) 3054 { 3055 struct extent_buffer *lower; 3056 int nritems; 3057 int ret; 3058 3059 BUG_ON(!path->nodes[level]); 3060 btrfs_assert_tree_write_locked(path->nodes[level]); 3061 lower = path->nodes[level]; 3062 nritems = btrfs_header_nritems(lower); 3063 BUG_ON(slot > nritems); 3064 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 3065 if (slot != nritems) { 3066 if (level) { 3067 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 3068 slot, nritems - slot); 3069 if (ret < 0) { 3070 btrfs_abort_transaction(trans, ret); 3071 return ret; 3072 } 3073 } 3074 memmove_extent_buffer(lower, 3075 btrfs_node_key_ptr_offset(lower, slot + 1), 3076 btrfs_node_key_ptr_offset(lower, slot), 3077 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 3078 } 3079 if (level) { 3080 ret = btrfs_tree_mod_log_insert_key(lower, slot, 3081 BTRFS_MOD_LOG_KEY_ADD); 3082 if (ret < 0) { 3083 btrfs_abort_transaction(trans, ret); 3084 return ret; 3085 } 3086 } 3087 btrfs_set_node_key(lower, key, slot); 3088 btrfs_set_node_blockptr(lower, slot, bytenr); 3089 WARN_ON(trans->transid == 0); 3090 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 3091 btrfs_set_header_nritems(lower, nritems + 1); 3092 btrfs_mark_buffer_dirty(trans, lower); 3093 3094 return 0; 3095 } 3096 3097 /* 3098 * split the node at the specified level in path in two. 
3099 * The path is corrected to point to the appropriate node after the split 3100 * 3101 * Before splitting this tries to make some room in the node by pushing 3102 * left and right, if either one works, it returns right away. 3103 * 3104 * returns 0 on success and < 0 on failure 3105 */ 3106 static noinline int split_node(struct btrfs_trans_handle *trans, 3107 struct btrfs_root *root, 3108 struct btrfs_path *path, int level) 3109 { 3110 struct btrfs_fs_info *fs_info = root->fs_info; 3111 struct extent_buffer *c; 3112 struct extent_buffer *split; 3113 struct btrfs_disk_key disk_key; 3114 int mid; 3115 int ret; 3116 u32 c_nritems; 3117 3118 c = path->nodes[level]; 3119 WARN_ON(btrfs_header_generation(c) != trans->transid); 3120 if (c == root->node) { 3121 /* 3122 * trying to split the root, let's make a new one 3123 * 3124 * tree mod log: We don't log the removal of the old root in 3125 * insert_new_root(), because that root buffer will be kept as a 3126 * normal node. We are going to log removal of half of the 3127 * elements below with btrfs_tree_mod_log_eb_copy(). We're 3128 * holding a tree lock on the buffer, which is why we cannot 3129 * race with other tree_mod_log users. 3130 */ 3131 ret = insert_new_root(trans, root, path, level + 1); 3132 if (ret) 3133 return ret; 3134 } else { 3135 ret = push_nodes_for_insert(trans, root, path, level); 3136 c = path->nodes[level]; 3137 if (!ret && btrfs_header_nritems(c) < 3138 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 3139 return 0; 3140 if (ret < 0) 3141 return ret; 3142 } 3143 3144 c_nritems = btrfs_header_nritems(c); 3145 mid = (c_nritems + 1) / 2; 3146 btrfs_node_key(c, &disk_key, mid); 3147 3148 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3149 &disk_key, level, c->start, 0, 3150 BTRFS_NESTING_SPLIT); 3151 if (IS_ERR(split)) 3152 return PTR_ERR(split); 3153 3154 root_add_used(root, fs_info->nodesize); 3155 ASSERT(btrfs_header_level(c) == level); 3156 3157 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 3158 if (ret) { 3159 btrfs_tree_unlock(split); 3160 free_extent_buffer(split); 3161 btrfs_abort_transaction(trans, ret); 3162 return ret; 3163 } 3164 copy_extent_buffer(split, c, 3165 btrfs_node_key_ptr_offset(split, 0), 3166 btrfs_node_key_ptr_offset(c, mid), 3167 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3168 btrfs_set_header_nritems(split, c_nritems - mid); 3169 btrfs_set_header_nritems(c, mid); 3170 3171 btrfs_mark_buffer_dirty(trans, c); 3172 btrfs_mark_buffer_dirty(trans, split); 3173 3174 ret = insert_ptr(trans, path, &disk_key, split->start, 3175 path->slots[level + 1] + 1, level + 1); 3176 if (ret < 0) { 3177 btrfs_tree_unlock(split); 3178 free_extent_buffer(split); 3179 return ret; 3180 } 3181 3182 if (path->slots[level] >= mid) { 3183 path->slots[level] -= mid; 3184 btrfs_tree_unlock(c); 3185 free_extent_buffer(c); 3186 path->nodes[level] = split; 3187 path->slots[level + 1] += 1; 3188 } else { 3189 btrfs_tree_unlock(split); 3190 free_extent_buffer(split); 3191 } 3192 return 0; 3193 } 3194 3195 /* 3196 * how many bytes are required to store the items in a leaf. start 3197 * and nr indicate which items in the leaf to check.
This totals up the 3198 * space used both by the item structs and the item data 3199 */ 3200 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3201 { 3202 int data_len; 3203 int nritems = btrfs_header_nritems(l); 3204 int end = min(nritems, start + nr) - 1; 3205 3206 if (!nr) 3207 return 0; 3208 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3209 data_len = data_len - btrfs_item_offset(l, end); 3210 data_len += sizeof(struct btrfs_item) * nr; 3211 WARN_ON(data_len < 0); 3212 return data_len; 3213 } 3214 3215 /* 3216 * The space between the end of the leaf items and 3217 * the start of the leaf data. IOW, how much room 3218 * the leaf has left for both items and data 3219 */ 3220 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3221 { 3222 struct btrfs_fs_info *fs_info = leaf->fs_info; 3223 int nritems = btrfs_header_nritems(leaf); 3224 int ret; 3225 3226 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3227 if (ret < 0) { 3228 btrfs_crit(fs_info, 3229 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3230 ret, 3231 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3232 leaf_space_used(leaf, 0, nritems), nritems); 3233 } 3234 return ret; 3235 } 3236 3237 /* 3238 * min slot controls the lowest index we're willing to push to the 3239 * right. We'll push up to and including min_slot, but no lower 3240 */ 3241 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3242 struct btrfs_path *path, 3243 int data_size, int empty, 3244 struct extent_buffer *right, 3245 int free_space, u32 left_nritems, 3246 u32 min_slot) 3247 { 3248 struct btrfs_fs_info *fs_info = right->fs_info; 3249 struct extent_buffer *left = path->nodes[0]; 3250 struct extent_buffer *upper = path->nodes[1]; 3251 struct btrfs_map_token token; 3252 struct btrfs_disk_key disk_key; 3253 int slot; 3254 u32 i; 3255 int push_space = 0; 3256 int push_items = 0; 3257 u32 nr; 3258 u32 right_nritems; 3259 u32 data_end; 3260 u32 this_item_size; 3261 3262 if (empty) 3263 nr = 0; 3264 else 3265 nr = max_t(u32, 1, min_slot); 3266 3267 if (path->slots[0] >= left_nritems) 3268 push_space += data_size; 3269 3270 slot = path->slots[1]; 3271 i = left_nritems - 1; 3272 while (i >= nr) { 3273 if (!empty && push_items > 0) { 3274 if (path->slots[0] > i) 3275 break; 3276 if (path->slots[0] == i) { 3277 int space = btrfs_leaf_free_space(left); 3278 3279 if (space + push_space * 2 > free_space) 3280 break; 3281 } 3282 } 3283 3284 if (path->slots[0] == i) 3285 push_space += data_size; 3286 3287 this_item_size = btrfs_item_size(left, i); 3288 if (this_item_size + sizeof(struct btrfs_item) + 3289 push_space > free_space) 3290 break; 3291 3292 push_items++; 3293 push_space += this_item_size + sizeof(struct btrfs_item); 3294 if (i == 0) 3295 break; 3296 i--; 3297 } 3298 3299 if (push_items == 0) 3300 goto out_unlock; 3301 3302 WARN_ON(!empty && push_items == left_nritems); 3303 3304 /* push left to right */ 3305 right_nritems = btrfs_header_nritems(right); 3306 3307 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3308 push_space -= leaf_data_end(left); 3309 3310 /* make room in the right data area */ 3311 data_end = leaf_data_end(right); 3312 memmove_leaf_data(right, data_end - push_space, data_end, 3313 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3314 3315 /* copy from the left data area */ 3316 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3317 leaf_data_end(left), push_space); 3318 3319 
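/* shift the existing items in the right leaf up by push_items slots to make room for the pushed ones */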
memmove_leaf_items(right, push_items, 0, right_nritems); 3320 3321 /* copy the items from left to right */ 3322 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3323 3324 /* update the item pointers */ 3325 btrfs_init_map_token(&token, right); 3326 right_nritems += push_items; 3327 btrfs_set_header_nritems(right, right_nritems); 3328 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3329 for (i = 0; i < right_nritems; i++) { 3330 push_space -= btrfs_token_item_size(&token, i); 3331 btrfs_set_token_item_offset(&token, i, push_space); 3332 } 3333 3334 left_nritems -= push_items; 3335 btrfs_set_header_nritems(left, left_nritems); 3336 3337 if (left_nritems) 3338 btrfs_mark_buffer_dirty(trans, left); 3339 else 3340 btrfs_clear_buffer_dirty(trans, left); 3341 3342 btrfs_mark_buffer_dirty(trans, right); 3343 3344 btrfs_item_key(right, &disk_key, 0); 3345 btrfs_set_node_key(upper, &disk_key, slot + 1); 3346 btrfs_mark_buffer_dirty(trans, upper); 3347 3348 /* then fixup the leaf pointer in the path */ 3349 if (path->slots[0] >= left_nritems) { 3350 path->slots[0] -= left_nritems; 3351 if (btrfs_header_nritems(path->nodes[0]) == 0) 3352 btrfs_clear_buffer_dirty(trans, path->nodes[0]); 3353 btrfs_tree_unlock(path->nodes[0]); 3354 free_extent_buffer(path->nodes[0]); 3355 path->nodes[0] = right; 3356 path->slots[1] += 1; 3357 } else { 3358 btrfs_tree_unlock(right); 3359 free_extent_buffer(right); 3360 } 3361 return 0; 3362 3363 out_unlock: 3364 btrfs_tree_unlock(right); 3365 free_extent_buffer(right); 3366 return 1; 3367 } 3368 3369 /* 3370 * push some data in the path leaf to the right, trying to free up at 3371 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3372 * 3373 * returns 1 if the push failed because the other node didn't have enough 3374 * room, 0 if everything worked out and < 0 if there were major errors. 3375 * 3376 * this will push starting from min_slot to the end of the leaf. 
It won't 3377 * push any slot lower than min_slot 3378 */ 3379 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3380 *root, struct btrfs_path *path, 3381 int min_data_size, int data_size, 3382 int empty, u32 min_slot) 3383 { 3384 struct extent_buffer *left = path->nodes[0]; 3385 struct extent_buffer *right; 3386 struct extent_buffer *upper; 3387 int slot; 3388 int free_space; 3389 u32 left_nritems; 3390 int ret; 3391 3392 if (!path->nodes[1]) 3393 return 1; 3394 3395 slot = path->slots[1]; 3396 upper = path->nodes[1]; 3397 if (slot >= btrfs_header_nritems(upper) - 1) 3398 return 1; 3399 3400 btrfs_assert_tree_write_locked(path->nodes[1]); 3401 3402 right = btrfs_read_node_slot(upper, slot + 1); 3403 if (IS_ERR(right)) 3404 return PTR_ERR(right); 3405 3406 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 3407 3408 free_space = btrfs_leaf_free_space(right); 3409 if (free_space < data_size) 3410 goto out_unlock; 3411 3412 ret = btrfs_cow_block(trans, root, right, upper, 3413 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3414 if (ret) 3415 goto out_unlock; 3416 3417 left_nritems = btrfs_header_nritems(left); 3418 if (left_nritems == 0) 3419 goto out_unlock; 3420 3421 if (check_sibling_keys(left, right)) { 3422 ret = -EUCLEAN; 3423 btrfs_abort_transaction(trans, ret); 3424 btrfs_tree_unlock(right); 3425 free_extent_buffer(right); 3426 return ret; 3427 } 3428 if (path->slots[0] == left_nritems && !empty) { 3429 /* Key greater than all keys in the leaf, right neighbor has 3430 * enough room for it and we're not emptying our leaf to delete 3431 * it, therefore use right neighbor to insert the new item and 3432 * no need to touch/dirty our left leaf. */ 3433 btrfs_tree_unlock(left); 3434 free_extent_buffer(left); 3435 path->nodes[0] = right; 3436 path->slots[0] = 0; 3437 path->slots[1]++; 3438 return 0; 3439 } 3440 3441 return __push_leaf_right(trans, path, min_data_size, empty, right, 3442 free_space, left_nritems, min_slot); 3443 out_unlock: 3444 btrfs_tree_unlock(right); 3445 free_extent_buffer(right); 3446 return 1; 3447 } 3448 3449 /* 3450 * push some data in the path leaf to the left, trying to free up at 3451 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3452 * 3453 * max_slot can put a limit on how far into the leaf we'll push items. The 3454 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the 3455 * items 3456 */ 3457 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3458 struct btrfs_path *path, int data_size, 3459 int empty, struct extent_buffer *left, 3460 int free_space, u32 right_nritems, 3461 u32 max_slot) 3462 { 3463 struct btrfs_fs_info *fs_info = left->fs_info; 3464 struct btrfs_disk_key disk_key; 3465 struct extent_buffer *right = path->nodes[0]; 3466 int i; 3467 int push_space = 0; 3468 int push_items = 0; 3469 u32 old_left_nritems; 3470 u32 nr; 3471 int ret = 0; 3472 u32 this_item_size; 3473 u32 old_left_item_size; 3474 struct btrfs_map_token token; 3475 3476 if (empty) 3477 nr = min(right_nritems, max_slot); 3478 else 3479 nr = min(right_nritems - 1, max_slot); 3480 3481 for (i = 0; i < nr; i++) { 3482 if (!empty && push_items > 0) { 3483 if (path->slots[0] < i) 3484 break; 3485 if (path->slots[0] == i) { 3486 int space = btrfs_leaf_free_space(right); 3487 3488 if (space + push_space * 2 > free_space) 3489 break; 3490 } 3491 } 3492 3493 if (path->slots[0] == i) 3494 push_space += data_size; 3495 3496 this_item_size = btrfs_item_size(right, i); 3497 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3498 free_space) 3499 break; 3500 3501 push_items++; 3502 push_space += this_item_size + sizeof(struct btrfs_item); 3503 } 3504 3505 if (push_items == 0) { 3506 ret = 1; 3507 goto out; 3508 } 3509 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3510 3511 /* push data from right to left */ 3512 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3513 3514 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3515 btrfs_item_offset(right, push_items - 1); 3516 3517 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3518 btrfs_item_offset(right, push_items - 1), push_space); 3519 old_left_nritems = btrfs_header_nritems(left); 3520 BUG_ON(old_left_nritems <= 0); 3521 3522 btrfs_init_map_token(&token, left); 3523 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3524 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3525 u32 ioff; 3526 3527 ioff = btrfs_token_item_offset(&token, i); 3528 btrfs_set_token_item_offset(&token, i, 3529 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3530 } 3531 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3532 3533 /* fixup right node */ 3534 if (push_items > right_nritems) 3535 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3536 right_nritems); 3537 3538 if (push_items < right_nritems) { 3539 push_space = btrfs_item_offset(right, push_items - 1) - 3540 leaf_data_end(right); 3541 memmove_leaf_data(right, 3542 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3543 leaf_data_end(right), push_space); 3544 3545 memmove_leaf_items(right, 0, push_items, 3546 btrfs_header_nritems(right) - push_items); 3547 } 3548 3549 btrfs_init_map_token(&token, right); 3550 right_nritems -= push_items; 3551 btrfs_set_header_nritems(right, right_nritems); 3552 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3553 for (i = 0; i < right_nritems; i++) { 3554 push_space = push_space - btrfs_token_item_size(&token, i); 3555 btrfs_set_token_item_offset(&token, i, push_space); 3556 } 3557 3558 btrfs_mark_buffer_dirty(trans, left); 3559 if (right_nritems) 3560 btrfs_mark_buffer_dirty(trans, right); 3561 else 3562 btrfs_clear_buffer_dirty(trans, right); 3563 3564 btrfs_item_key(right, &disk_key, 0); 3565 fixup_low_keys(trans, path, &disk_key, 1); 3566 3567 /* then fixup the leaf pointer in the path */ 3568 if 
(path->slots[0] < push_items) { 3569 path->slots[0] += old_left_nritems; 3570 btrfs_tree_unlock(path->nodes[0]); 3571 free_extent_buffer(path->nodes[0]); 3572 path->nodes[0] = left; 3573 path->slots[1] -= 1; 3574 } else { 3575 btrfs_tree_unlock(left); 3576 free_extent_buffer(left); 3577 path->slots[0] -= push_items; 3578 } 3579 BUG_ON(path->slots[0] < 0); 3580 return ret; 3581 out: 3582 btrfs_tree_unlock(left); 3583 free_extent_buffer(left); 3584 return ret; 3585 } 3586 3587 /* 3588 * push some data in the path leaf to the left, trying to free up at 3589 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3590 * 3591 * max_slot can put a limit on how far into the leaf we'll push items. The 3592 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3593 * items 3594 */ 3595 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3596 *root, struct btrfs_path *path, int min_data_size, 3597 int data_size, int empty, u32 max_slot) 3598 { 3599 struct extent_buffer *right = path->nodes[0]; 3600 struct extent_buffer *left; 3601 int slot; 3602 int free_space; 3603 u32 right_nritems; 3604 int ret = 0; 3605 3606 slot = path->slots[1]; 3607 if (slot == 0) 3608 return 1; 3609 if (!path->nodes[1]) 3610 return 1; 3611 3612 right_nritems = btrfs_header_nritems(right); 3613 if (right_nritems == 0) 3614 return 1; 3615 3616 btrfs_assert_tree_write_locked(path->nodes[1]); 3617 3618 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3619 if (IS_ERR(left)) 3620 return PTR_ERR(left); 3621 3622 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3623 3624 free_space = btrfs_leaf_free_space(left); 3625 if (free_space < data_size) { 3626 ret = 1; 3627 goto out; 3628 } 3629 3630 ret = btrfs_cow_block(trans, root, left, 3631 path->nodes[1], slot - 1, &left, 3632 BTRFS_NESTING_LEFT_COW); 3633 if (ret) { 3634 /* we hit -ENOSPC, but it isn't fatal here */ 3635 if (ret == -ENOSPC) 3636 ret = 1; 3637 goto out; 3638 } 3639 3640 if (check_sibling_keys(left, right)) { 3641 ret = -EUCLEAN; 3642 btrfs_abort_transaction(trans, ret); 3643 goto out; 3644 } 3645 return __push_leaf_left(trans, path, min_data_size, empty, left, 3646 free_space, right_nritems, max_slot); 3647 out: 3648 btrfs_tree_unlock(left); 3649 free_extent_buffer(left); 3650 return ret; 3651 } 3652 3653 /* 3654 * split the path's leaf in two, making sure there is at least data_size 3655 * available for the resulting leaf level of the path. 
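 * (copy_for_split() moves the items from @mid onwards into the new right
 * leaf, then re-bases each copied item's data offset by rt_data_off,
 * since the data now sits at the end of a different block.)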
3656 */ 3657 static noinline int copy_for_split(struct btrfs_trans_handle *trans, 3658 struct btrfs_path *path, 3659 struct extent_buffer *l, 3660 struct extent_buffer *right, 3661 int slot, int mid, int nritems) 3662 { 3663 struct btrfs_fs_info *fs_info = trans->fs_info; 3664 int data_copy_size; 3665 int rt_data_off; 3666 int i; 3667 int ret; 3668 struct btrfs_disk_key disk_key; 3669 struct btrfs_map_token token; 3670 3671 nritems = nritems - mid; 3672 btrfs_set_header_nritems(right, nritems); 3673 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3674 3675 copy_leaf_items(right, l, 0, mid, nritems); 3676 3677 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3678 leaf_data_end(l), data_copy_size); 3679 3680 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3681 3682 btrfs_init_map_token(&token, right); 3683 for (i = 0; i < nritems; i++) { 3684 u32 ioff; 3685 3686 ioff = btrfs_token_item_offset(&token, i); 3687 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); 3688 } 3689 3690 btrfs_set_header_nritems(l, mid); 3691 btrfs_item_key(right, &disk_key, 0); 3692 ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3693 if (ret < 0) 3694 return ret; 3695 3696 btrfs_mark_buffer_dirty(trans, right); 3697 btrfs_mark_buffer_dirty(trans, l); 3698 BUG_ON(path->slots[0] != slot); 3699 3700 if (mid <= slot) { 3701 btrfs_tree_unlock(path->nodes[0]); 3702 free_extent_buffer(path->nodes[0]); 3703 path->nodes[0] = right; 3704 path->slots[0] -= mid; 3705 path->slots[1] += 1; 3706 } else { 3707 btrfs_tree_unlock(right); 3708 free_extent_buffer(right); 3709 } 3710 3711 BUG_ON(path->slots[0] < 0); 3712 3713 return 0; 3714 } 3715 3716 /* 3717 * double splits happen when we need to insert a big item in the middle 3718 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3719 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3720 * A B C 3721 * 3722 * We avoid this by trying to push the items on either side of our target 3723 * into the adjacent leaves. If all goes well we can avoid the double split 3724 * completely. 3725 */ 3726 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3727 struct btrfs_root *root, 3728 struct btrfs_path *path, 3729 int data_size) 3730 { 3731 int ret; 3732 int progress = 0; 3733 int slot; 3734 u32 nritems; 3735 int space_needed = data_size; 3736 3737 slot = path->slots[0]; 3738 if (slot < btrfs_header_nritems(path->nodes[0])) 3739 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3740 3741 /* 3742 * try to push all the items after our slot into the 3743 * right leaf 3744 */ 3745 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3746 if (ret < 0) 3747 return ret; 3748 3749 if (ret == 0) 3750 progress++; 3751 3752 nritems = btrfs_header_nritems(path->nodes[0]); 3753 /* 3754 * our goal is to get our slot at the start or end of a leaf. 
If 3755 * we've done so we're done 3756 */ 3757 if (path->slots[0] == 0 || path->slots[0] == nritems) 3758 return 0; 3759 3760 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3761 return 0; 3762 3763 /* try to push all the items before our slot into the next leaf */ 3764 slot = path->slots[0]; 3765 space_needed = data_size; 3766 if (slot > 0) 3767 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3768 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3769 if (ret < 0) 3770 return ret; 3771 3772 if (ret == 0) 3773 progress++; 3774 3775 if (progress) 3776 return 0; 3777 return 1; 3778 } 3779 3780 /* 3781 * split the path's leaf in two, making sure there is at least data_size 3782 * available for the resulting leaf level of the path. 3783 * 3784 * returns 0 if all went well and < 0 on failure. 3785 */ 3786 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3787 struct btrfs_root *root, 3788 const struct btrfs_key *ins_key, 3789 struct btrfs_path *path, int data_size, 3790 int extend) 3791 { 3792 struct btrfs_disk_key disk_key; 3793 struct extent_buffer *l; 3794 u32 nritems; 3795 int mid; 3796 int slot; 3797 struct extent_buffer *right; 3798 struct btrfs_fs_info *fs_info = root->fs_info; 3799 int ret = 0; 3800 int wret; 3801 int split; 3802 int num_doubles = 0; 3803 int tried_avoid_double = 0; 3804 3805 l = path->nodes[0]; 3806 slot = path->slots[0]; 3807 if (extend && data_size + btrfs_item_size(l, slot) + 3808 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3809 return -EOVERFLOW; 3810 3811 /* first try to make some room by pushing left and right */ 3812 if (data_size && path->nodes[1]) { 3813 int space_needed = data_size; 3814 3815 if (slot < btrfs_header_nritems(l)) 3816 space_needed -= btrfs_leaf_free_space(l); 3817 3818 wret = push_leaf_right(trans, root, path, space_needed, 3819 space_needed, 0, 0); 3820 if (wret < 0) 3821 return wret; 3822 if (wret) { 3823 space_needed = data_size; 3824 if (slot > 0) 3825 space_needed -= btrfs_leaf_free_space(l); 3826 wret = push_leaf_left(trans, root, path, space_needed, 3827 space_needed, 0, (u32)-1); 3828 if (wret < 0) 3829 return wret; 3830 } 3831 l = path->nodes[0]; 3832 3833 /* did the pushes work? 
*/ 3834 if (btrfs_leaf_free_space(l) >= data_size) 3835 return 0; 3836 } 3837 3838 if (!path->nodes[1]) { 3839 ret = insert_new_root(trans, root, path, 1); 3840 if (ret) 3841 return ret; 3842 } 3843 again: 3844 split = 1; 3845 l = path->nodes[0]; 3846 slot = path->slots[0]; 3847 nritems = btrfs_header_nritems(l); 3848 mid = (nritems + 1) / 2; 3849 3850 if (mid <= slot) { 3851 if (nritems == 1 || 3852 leaf_space_used(l, mid, nritems - mid) + data_size > 3853 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3854 if (slot >= nritems) { 3855 split = 0; 3856 } else { 3857 mid = slot; 3858 if (mid != nritems && 3859 leaf_space_used(l, mid, nritems - mid) + 3860 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3861 if (data_size && !tried_avoid_double) 3862 goto push_for_double; 3863 split = 2; 3864 } 3865 } 3866 } 3867 } else { 3868 if (leaf_space_used(l, 0, mid) + data_size > 3869 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3870 if (!extend && data_size && slot == 0) { 3871 split = 0; 3872 } else if ((extend || !data_size) && slot == 0) { 3873 mid = 1; 3874 } else { 3875 mid = slot; 3876 if (mid != nritems && 3877 leaf_space_used(l, mid, nritems - mid) + 3878 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3879 if (data_size && !tried_avoid_double) 3880 goto push_for_double; 3881 split = 2; 3882 } 3883 } 3884 } 3885 } 3886 3887 if (split == 0) 3888 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3889 else 3890 btrfs_item_key(l, &disk_key, mid); 3891 3892 /* 3893 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double 3894 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3895 * subclasses, which is 8 at the time of this patch, and we've maxed it 3896 * out. In the future we could add a 3897 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3898 * use BTRFS_NESTING_NEW_ROOT. 3899 */ 3900 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3901 &disk_key, 0, l->start, 0, 3902 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3903 BTRFS_NESTING_SPLIT); 3904 if (IS_ERR(right)) 3905 return PTR_ERR(right); 3906 3907 root_add_used(root, fs_info->nodesize); 3908 3909 if (split == 0) { 3910 if (mid <= slot) { 3911 btrfs_set_header_nritems(right, 0); 3912 ret = insert_ptr(trans, path, &disk_key, 3913 right->start, path->slots[1] + 1, 1); 3914 if (ret < 0) { 3915 btrfs_tree_unlock(right); 3916 free_extent_buffer(right); 3917 return ret; 3918 } 3919 btrfs_tree_unlock(path->nodes[0]); 3920 free_extent_buffer(path->nodes[0]); 3921 path->nodes[0] = right; 3922 path->slots[0] = 0; 3923 path->slots[1] += 1; 3924 } else { 3925 btrfs_set_header_nritems(right, 0); 3926 ret = insert_ptr(trans, path, &disk_key, 3927 right->start, path->slots[1], 1); 3928 if (ret < 0) { 3929 btrfs_tree_unlock(right); 3930 free_extent_buffer(right); 3931 return ret; 3932 } 3933 btrfs_tree_unlock(path->nodes[0]); 3934 free_extent_buffer(path->nodes[0]); 3935 path->nodes[0] = right; 3936 path->slots[0] = 0; 3937 if (path->slots[1] == 0) 3938 fixup_low_keys(trans, path, &disk_key, 1); 3939 } 3940 /* 3941 * We create a new leaf 'right' for the required ins_len and 3942 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3943 * the content of ins_len to 'right'.
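 * (split == 0 is the case where no existing items had to move: the new
 * leaf starts out empty, nritems == 0, and the pending insert that
 * triggered this split will populate it.)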
3944 */ 3945 return ret; 3946 } 3947 3948 ret = copy_for_split(trans, path, l, right, slot, mid, nritems); 3949 if (ret < 0) { 3950 btrfs_tree_unlock(right); 3951 free_extent_buffer(right); 3952 return ret; 3953 } 3954 3955 if (split == 2) { 3956 BUG_ON(num_doubles != 0); 3957 num_doubles++; 3958 goto again; 3959 } 3960 3961 return 0; 3962 3963 push_for_double: 3964 push_for_double_split(trans, root, path, data_size); 3965 tried_avoid_double = 1; 3966 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3967 return 0; 3968 goto again; 3969 } 3970 3971 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3972 struct btrfs_root *root, 3973 struct btrfs_path *path, int ins_len) 3974 { 3975 struct btrfs_key key; 3976 struct extent_buffer *leaf; 3977 struct btrfs_file_extent_item *fi; 3978 u64 extent_len = 0; 3979 u32 item_size; 3980 int ret; 3981 3982 leaf = path->nodes[0]; 3983 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3984 3985 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3986 key.type != BTRFS_EXTENT_CSUM_KEY); 3987 3988 if (btrfs_leaf_free_space(leaf) >= ins_len) 3989 return 0; 3990 3991 item_size = btrfs_item_size(leaf, path->slots[0]); 3992 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3993 fi = btrfs_item_ptr(leaf, path->slots[0], 3994 struct btrfs_file_extent_item); 3995 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3996 } 3997 btrfs_release_path(path); 3998 3999 path->keep_locks = 1; 4000 path->search_for_split = 1; 4001 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4002 path->search_for_split = 0; 4003 if (ret > 0) 4004 ret = -EAGAIN; 4005 if (ret < 0) 4006 goto err; 4007 4008 ret = -EAGAIN; 4009 leaf = path->nodes[0]; 4010 /* if our item isn't there, return now */ 4011 if (item_size != btrfs_item_size(leaf, path->slots[0])) 4012 goto err; 4013 4014 /* the leaf has changed, it now has room. return now */ 4015 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 4016 goto err; 4017 4018 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4019 fi = btrfs_item_ptr(leaf, path->slots[0], 4020 struct btrfs_file_extent_item); 4021 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 4022 goto err; 4023 } 4024 4025 ret = split_leaf(trans, root, &key, path, ins_len, 1); 4026 if (ret) 4027 goto err; 4028 4029 path->keep_locks = 0; 4030 btrfs_unlock_up_safe(path, 1); 4031 return 0; 4032 err: 4033 path->keep_locks = 0; 4034 return ret; 4035 } 4036 4037 static noinline int split_item(struct btrfs_trans_handle *trans, 4038 struct btrfs_path *path, 4039 const struct btrfs_key *new_key, 4040 unsigned long split_offset) 4041 { 4042 struct extent_buffer *leaf; 4043 int orig_slot, slot; 4044 char *buf; 4045 u32 nritems; 4046 u32 item_size; 4047 u32 orig_offset; 4048 struct btrfs_disk_key disk_key; 4049 4050 leaf = path->nodes[0]; 4051 /* 4052 * Shouldn't happen because the caller must have previously called 4053 * setup_leaf_for_split() to make room for the new item in the leaf. 
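 * Splitting adds only one more item header while the total item data size
 * stays the same, so sizeof(struct btrfs_item) of free space is all that
 * split_item() needs.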
 */
	if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
		return -ENOSPC;

	orig_slot = path->slots[0];
	orig_offset = btrfs_item_offset(leaf, path->slots[0]);
	item_size = btrfs_item_size(leaf, path->slots[0]);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			   path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	btrfs_set_item_offset(leaf, slot, orig_offset);
	btrfs_set_item_size(leaf, slot, item_size - split_offset);

	btrfs_set_item_offset(leaf, orig_slot,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, orig_slot, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(trans, leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation. After
 * the split, the path is pointing to the old item. The
 * new item is going to be in the same node as the old one.
 *
 * Note: the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item.
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(trans, path, new_key, split_offset);
	return ret;
}

/*
 * make the item pointed to by the path smaller. new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
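 *
 * Illustrative sketch (hypothetical caller and values, not taken from this
 * file): keep only the first 64 bytes of the item's data, chopping the rest
 * off the end:
 *
 *	btrfs_truncate_item(trans, path, 64, 1);
 *
 * With from_end == 0 the tail of the data is kept instead, and the item's
 * key offset is bumped by the number of bytes removed from the front.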
4140 */ 4141 void btrfs_truncate_item(struct btrfs_trans_handle *trans, 4142 struct btrfs_path *path, u32 new_size, int from_end) 4143 { 4144 int slot; 4145 struct extent_buffer *leaf; 4146 u32 nritems; 4147 unsigned int data_end; 4148 unsigned int old_data_start; 4149 unsigned int old_size; 4150 unsigned int size_diff; 4151 int i; 4152 struct btrfs_map_token token; 4153 4154 leaf = path->nodes[0]; 4155 slot = path->slots[0]; 4156 4157 old_size = btrfs_item_size(leaf, slot); 4158 if (old_size == new_size) 4159 return; 4160 4161 nritems = btrfs_header_nritems(leaf); 4162 data_end = leaf_data_end(leaf); 4163 4164 old_data_start = btrfs_item_offset(leaf, slot); 4165 4166 size_diff = old_size - new_size; 4167 4168 BUG_ON(slot < 0); 4169 BUG_ON(slot >= nritems); 4170 4171 /* 4172 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4173 */ 4174 /* first correct the data pointers */ 4175 btrfs_init_map_token(&token, leaf); 4176 for (i = slot; i < nritems; i++) { 4177 u32 ioff; 4178 4179 ioff = btrfs_token_item_offset(&token, i); 4180 btrfs_set_token_item_offset(&token, i, ioff + size_diff); 4181 } 4182 4183 /* shift the data */ 4184 if (from_end) { 4185 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4186 old_data_start + new_size - data_end); 4187 } else { 4188 struct btrfs_disk_key disk_key; 4189 u64 offset; 4190 4191 btrfs_item_key(leaf, &disk_key, slot); 4192 4193 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4194 unsigned long ptr; 4195 struct btrfs_file_extent_item *fi; 4196 4197 fi = btrfs_item_ptr(leaf, slot, 4198 struct btrfs_file_extent_item); 4199 fi = (struct btrfs_file_extent_item *)( 4200 (unsigned long)fi - size_diff); 4201 4202 if (btrfs_file_extent_type(leaf, fi) == 4203 BTRFS_FILE_EXTENT_INLINE) { 4204 ptr = btrfs_item_ptr_offset(leaf, slot); 4205 memmove_extent_buffer(leaf, ptr, 4206 (unsigned long)fi, 4207 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4208 } 4209 } 4210 4211 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4212 old_data_start - data_end); 4213 4214 offset = btrfs_disk_key_offset(&disk_key); 4215 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4216 btrfs_set_item_key(leaf, &disk_key, slot); 4217 if (slot == 0) 4218 fixup_low_keys(trans, path, &disk_key, 1); 4219 } 4220 4221 btrfs_set_item_size(leaf, slot, new_size); 4222 btrfs_mark_buffer_dirty(trans, leaf); 4223 4224 if (btrfs_leaf_free_space(leaf) < 0) { 4225 btrfs_print_leaf(leaf); 4226 BUG(); 4227 } 4228 } 4229 4230 /* 4231 * make the item pointed to by the path bigger, data_size is the added size. 4232 */ 4233 void btrfs_extend_item(struct btrfs_trans_handle *trans, 4234 struct btrfs_path *path, u32 data_size) 4235 { 4236 int slot; 4237 struct extent_buffer *leaf; 4238 u32 nritems; 4239 unsigned int data_end; 4240 unsigned int old_data; 4241 unsigned int old_size; 4242 int i; 4243 struct btrfs_map_token token; 4244 4245 leaf = path->nodes[0]; 4246 4247 nritems = btrfs_header_nritems(leaf); 4248 data_end = leaf_data_end(leaf); 4249 4250 if (btrfs_leaf_free_space(leaf) < data_size) { 4251 btrfs_print_leaf(leaf); 4252 BUG(); 4253 } 4254 slot = path->slots[0]; 4255 old_data = btrfs_item_data_end(leaf, slot); 4256 4257 BUG_ON(slot < 0); 4258 if (slot >= nritems) { 4259 btrfs_print_leaf(leaf); 4260 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 4261 slot, nritems); 4262 BUG(); 4263 } 4264 4265 /* 4266 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, i);
		btrfs_set_token_item_offset(&token, i, ioff - data_size);
	}

	/* shift the data */
	memmove_leaf_data(leaf, data_end - data_size, data_end,
			  old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size(leaf, slot);
	btrfs_set_item_size(leaf, slot, old_size + data_size);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Make space in the leaf before inserting one or more items.
 *
 * @trans:	transaction handle
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @batch:	information about the batch of items to insert
 *
 * Main purpose is to save stack depth by doing the bulk of the work in a
 * function that doesn't call btrfs_search_slot
 */
static void setup_items_for_insert(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct btrfs_path *path,
				   const struct btrfs_item_batch *batch)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;

	/*
	 * Before anything else, update keys in the parent and other ancestors
	 * if needed, then release the write locks on them, so that other tasks
	 * can use them while we modify the leaf.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
		fixup_low_keys(trans, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);
	total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_data_end(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info,
	"item at slot %d with data offset %u beyond data end of leaf %u",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size ..
data0.size 4355 */ 4356 /* first correct the data pointers */ 4357 for (i = slot; i < nritems; i++) { 4358 u32 ioff; 4359 4360 ioff = btrfs_token_item_offset(&token, i); 4361 btrfs_set_token_item_offset(&token, i, 4362 ioff - batch->total_data_size); 4363 } 4364 /* shift the items */ 4365 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); 4366 4367 /* shift the data */ 4368 memmove_leaf_data(leaf, data_end - batch->total_data_size, 4369 data_end, old_data - data_end); 4370 data_end = old_data; 4371 } 4372 4373 /* setup the item for the new data */ 4374 for (i = 0; i < batch->nr; i++) { 4375 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 4376 btrfs_set_item_key(leaf, &disk_key, slot + i); 4377 data_end -= batch->data_sizes[i]; 4378 btrfs_set_token_item_offset(&token, slot + i, data_end); 4379 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); 4380 } 4381 4382 btrfs_set_header_nritems(leaf, nritems + batch->nr); 4383 btrfs_mark_buffer_dirty(trans, leaf); 4384 4385 if (btrfs_leaf_free_space(leaf) < 0) { 4386 btrfs_print_leaf(leaf); 4387 BUG(); 4388 } 4389 } 4390 4391 /* 4392 * Insert a new item into a leaf. 4393 * 4394 * @trans: Transaction handle. 4395 * @root: The root of the btree. 4396 * @path: A path pointing to the target leaf and slot. 4397 * @key: The key of the new item. 4398 * @data_size: The size of the data associated with the new key. 4399 */ 4400 void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, 4401 struct btrfs_root *root, 4402 struct btrfs_path *path, 4403 const struct btrfs_key *key, 4404 u32 data_size) 4405 { 4406 struct btrfs_item_batch batch; 4407 4408 batch.keys = key; 4409 batch.data_sizes = &data_size; 4410 batch.total_data_size = data_size; 4411 batch.nr = 1; 4412 4413 setup_items_for_insert(trans, root, path, &batch); 4414 } 4415 4416 /* 4417 * Given a key and some data, insert items into the tree. 4418 * This does all the path init required, making room in the tree if needed. 4419 */ 4420 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4421 struct btrfs_root *root, 4422 struct btrfs_path *path, 4423 const struct btrfs_item_batch *batch) 4424 { 4425 int ret = 0; 4426 int slot; 4427 u32 total_size; 4428 4429 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4430 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 4431 if (ret == 0) 4432 return -EEXIST; 4433 if (ret < 0) 4434 return ret; 4435 4436 slot = path->slots[0]; 4437 BUG_ON(slot < 0); 4438 4439 setup_items_for_insert(trans, root, path, batch); 4440 return 0; 4441 } 4442 4443 /* 4444 * Given a key and some data, insert an item into the tree. 4445 * This does all the path init required, making room in the tree if needed. 4446 */ 4447 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4448 const struct btrfs_key *cpu_key, void *data, 4449 u32 data_size) 4450 { 4451 int ret = 0; 4452 struct btrfs_path *path; 4453 struct extent_buffer *leaf; 4454 unsigned long ptr; 4455 4456 path = btrfs_alloc_path(); 4457 if (!path) 4458 return -ENOMEM; 4459 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4460 if (!ret) { 4461 leaf = path->nodes[0]; 4462 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4463 write_extent_buffer(leaf, data, ptr, data_size); 4464 btrfs_mark_buffer_dirty(trans, leaf); 4465 } 4466 btrfs_free_path(path); 4467 return ret; 4468 } 4469 4470 /* 4471 * This function duplicates an item, giving 'new_key' to the new item. 
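 *
 * Illustrative sketch (hypothetical caller; new_key and new_offset are made
 * up): duplicate the item at path->slots[0] under the same key but with a
 * different offset:
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *	new_key.offset = new_offset;
 *	ret = btrfs_duplicate_item(trans, root, path, &new_key);
 *
 * On success, path->slots[0] points to the new copy.
 *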
4472 * It guarantees both items live in the same tree leaf and the new item is 4473 * contiguous with the original item. 4474 * 4475 * This allows us to split a file extent in place, keeping a lock on the leaf 4476 * the entire time. 4477 */ 4478 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4479 struct btrfs_root *root, 4480 struct btrfs_path *path, 4481 const struct btrfs_key *new_key) 4482 { 4483 struct extent_buffer *leaf; 4484 int ret; 4485 u32 item_size; 4486 4487 leaf = path->nodes[0]; 4488 item_size = btrfs_item_size(leaf, path->slots[0]); 4489 ret = setup_leaf_for_split(trans, root, path, 4490 item_size + sizeof(struct btrfs_item)); 4491 if (ret) 4492 return ret; 4493 4494 path->slots[0]++; 4495 btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); 4496 leaf = path->nodes[0]; 4497 memcpy_extent_buffer(leaf, 4498 btrfs_item_ptr_offset(leaf, path->slots[0]), 4499 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4500 item_size); 4501 return 0; 4502 } 4503 4504 /* 4505 * delete the pointer from a given node. 4506 * 4507 * the tree should have been previously balanced so the deletion does not 4508 * empty a node. 4509 * 4510 * This is exported for use inside btrfs-progs, don't un-export it. 4511 */ 4512 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4513 struct btrfs_path *path, int level, int slot) 4514 { 4515 struct extent_buffer *parent = path->nodes[level]; 4516 u32 nritems; 4517 int ret; 4518 4519 nritems = btrfs_header_nritems(parent); 4520 if (slot != nritems - 1) { 4521 if (level) { 4522 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4523 slot + 1, nritems - slot - 1); 4524 if (ret < 0) { 4525 btrfs_abort_transaction(trans, ret); 4526 return ret; 4527 } 4528 } 4529 memmove_extent_buffer(parent, 4530 btrfs_node_key_ptr_offset(parent, slot), 4531 btrfs_node_key_ptr_offset(parent, slot + 1), 4532 sizeof(struct btrfs_key_ptr) * 4533 (nritems - slot - 1)); 4534 } else if (level) { 4535 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4536 BTRFS_MOD_LOG_KEY_REMOVE); 4537 if (ret < 0) { 4538 btrfs_abort_transaction(trans, ret); 4539 return ret; 4540 } 4541 } 4542 4543 nritems--; 4544 btrfs_set_header_nritems(parent, nritems); 4545 if (nritems == 0 && parent == root->node) { 4546 BUG_ON(btrfs_header_level(root->node) != 1); 4547 /* just turn the root into a leaf and break */ 4548 btrfs_set_header_level(root->node, 0); 4549 } else if (slot == 0) { 4550 struct btrfs_disk_key disk_key; 4551 4552 btrfs_node_key(parent, &disk_key, 0); 4553 fixup_low_keys(trans, path, &disk_key, level + 1); 4554 } 4555 btrfs_mark_buffer_dirty(trans, parent); 4556 return 0; 4557 } 4558 4559 /* 4560 * a helper function to delete the leaf pointed to by path->slots[1] and 4561 * path->nodes[1]. 4562 * 4563 * This deletes the pointer in path->nodes[1] and frees the leaf 4564 * block extent. zero is returned if it all worked out, < 0 otherwise. 4565 * 4566 * The path must have already been setup for deleting the leaf, including 4567 * all the proper balancing. path->nodes[1] must be locked. 
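 *
 * (btrfs_del_items() below is the only caller and is responsible for
 * arranging both of those preconditions.)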
 */
static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *leaf)
{
	int ret;

	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret < 0)
		return ret;

	/*
	 * Freeing the tree block is expensive, so make sure we aren't
	 * holding any locks when we call btrfs_free_tree_block() below.
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
	return 0;
}

/*
 * delete the item at the leaf level in path. If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	int ret = 0;
	int wret;
	u32 nritems;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
		const int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;
		u32 dsize = 0;
		int i;

		for (i = 0; i < nr; i++)
			dsize += btrfs_item_size(leaf, slot + i);

		memmove_leaf_data(leaf, data_end + dsize, data_end,
				  last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			ioff = btrfs_token_item_offset(&token, i);
			btrfs_set_token_item_offset(&token, i, ioff + dsize);
		}

		memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clear_buffer_dirty(trans, leaf);
			ret = btrfs_del_leaf(trans, root, path, leaf);
			if (ret < 0)
				return ret;
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(trans, path, &disk_key, 1);
		}

		/*
		 * Try to delete the leaf if it is mostly empty. We do this by
		 * trying to move all its items into its left and right neighbours.
		 * If we can't move all the items, then we don't delete it - it's
		 * not ideal, but future insertions might fill the leaf with more
		 * items, or items from other leaves might be moved later into our
		 * leaf due to deletions on those leaves.
		 */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			u32 min_push_space;

			/*
			 * push_leaf_left() fixes the path, so make sure the
			 * path still points to our leaf for a possible call
			 * to btrfs_del_ptr() below.
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);
			/*
			 * We want to be able to at least push one item to the
			 * left neighbour leaf, and that's the first item.
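 * That is the size of item 0's data plus one struct btrfs_item header,
 * which is what min_push_space is set to below.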
 */
			min_push_space = sizeof(struct btrfs_item) +
				btrfs_item_size(leaf, 0);
			wret = push_leaf_left(trans, root, path, 0,
					      min_push_space, 1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				/*
				 * If we were not able to push all items from our
				 * leaf to its left neighbour, then attempt to
				 * either push all the remaining items to the
				 * right neighbour or none. There's no advantage
				 * in pushing only some items, instead of all, as
				 * it's pointless to end up with a leaf having
				 * too few items while the neighbours can be full
				 * or nearly full.
				 */
				nritems = btrfs_header_nritems(leaf);
				min_push_space = leaf_space_used(leaf, 0, nritems);
				wret = push_leaf_right(trans, root, path, 0,
						       min_push_space, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf);
				if (ret < 0)
					return ret;
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/*
				 * If we're still in the path, make sure we're
				 * dirty. Otherwise, one of the push_leaf
				 * functions must have already dirtied this
				 * buffer.
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(trans, leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(trans, leaf);
		}
	}
	return ret;
}

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code and by tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through. Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
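 *
 * Illustrative sketch of a caller (hypothetical; process_leaf() is made up,
 * the real users are the defrag and tree-log code):
 *
 *	struct btrfs_key min_key = { 0 };
 *
 *	path->lowest_level = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		process_leaf(path->nodes[0], path->slots[0]);
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;
 *		min_key.offset++;
 *	}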
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	ASSERT(!path->nowait);
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, 0, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, set up the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fix up the path. It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameter.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
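 *
 * (btrfs_search_forward() above is one such caller: it sets keep_locks to 1
 * and uses this helper to move forward when it runs off the end of a node.)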
4860 */ 4861 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4862 struct btrfs_key *key, int level, u64 min_trans) 4863 { 4864 int slot; 4865 struct extent_buffer *c; 4866 4867 WARN_ON(!path->keep_locks && !path->skip_locking); 4868 while (level < BTRFS_MAX_LEVEL) { 4869 if (!path->nodes[level]) 4870 return 1; 4871 4872 slot = path->slots[level] + 1; 4873 c = path->nodes[level]; 4874 next: 4875 if (slot >= btrfs_header_nritems(c)) { 4876 int ret; 4877 int orig_lowest; 4878 struct btrfs_key cur_key; 4879 if (level + 1 >= BTRFS_MAX_LEVEL || 4880 !path->nodes[level + 1]) 4881 return 1; 4882 4883 if (path->locks[level + 1] || path->skip_locking) { 4884 level++; 4885 continue; 4886 } 4887 4888 slot = btrfs_header_nritems(c) - 1; 4889 if (level == 0) 4890 btrfs_item_key_to_cpu(c, &cur_key, slot); 4891 else 4892 btrfs_node_key_to_cpu(c, &cur_key, slot); 4893 4894 orig_lowest = path->lowest_level; 4895 btrfs_release_path(path); 4896 path->lowest_level = level; 4897 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4898 0, 0); 4899 path->lowest_level = orig_lowest; 4900 if (ret < 0) 4901 return ret; 4902 4903 c = path->nodes[level]; 4904 slot = path->slots[level]; 4905 if (ret == 0) 4906 slot++; 4907 goto next; 4908 } 4909 4910 if (level == 0) 4911 btrfs_item_key_to_cpu(c, key, slot); 4912 else { 4913 u64 gen = btrfs_node_ptr_generation(c, slot); 4914 4915 if (gen < min_trans) { 4916 slot++; 4917 goto next; 4918 } 4919 btrfs_node_key_to_cpu(c, key, slot); 4920 } 4921 return 0; 4922 } 4923 return 1; 4924 } 4925 4926 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4927 u64 time_seq) 4928 { 4929 int slot; 4930 int level; 4931 struct extent_buffer *c; 4932 struct extent_buffer *next; 4933 struct btrfs_fs_info *fs_info = root->fs_info; 4934 struct btrfs_key key; 4935 bool need_commit_sem = false; 4936 u32 nritems; 4937 int ret; 4938 int i; 4939 4940 /* 4941 * The nowait semantics are used only for write paths, where we don't 4942 * use the tree mod log and sequence numbers. 4943 */ 4944 if (time_seq) 4945 ASSERT(!path->nowait); 4946 4947 nritems = btrfs_header_nritems(path->nodes[0]); 4948 if (nritems == 0) 4949 return 1; 4950 4951 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4952 again: 4953 level = 1; 4954 next = NULL; 4955 btrfs_release_path(path); 4956 4957 path->keep_locks = 1; 4958 4959 if (time_seq) { 4960 ret = btrfs_search_old_slot(root, &key, path, time_seq); 4961 } else { 4962 if (path->need_commit_sem) { 4963 path->need_commit_sem = 0; 4964 need_commit_sem = true; 4965 if (path->nowait) { 4966 if (!down_read_trylock(&fs_info->commit_root_sem)) { 4967 ret = -EAGAIN; 4968 goto done; 4969 } 4970 } else { 4971 down_read(&fs_info->commit_root_sem); 4972 } 4973 } 4974 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4975 } 4976 path->keep_locks = 0; 4977 4978 if (ret < 0) 4979 goto done; 4980 4981 nritems = btrfs_header_nritems(path->nodes[0]); 4982 /* 4983 * by releasing the path above we dropped all our locks. A balance 4984 * could have added more items next to the key that used to be 4985 * at the very end of the block. So, check again here and 4986 * advance the path if there are now more items available. 
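 *
 * ret == 0 from the search above means the old last key still exists, so
 * advancing one slot moves past it; ret > 0 means path->slots[0] already
 * points at the next bigger key.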
 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leaves
	 *   gets another one with a bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (especially in __btrfs_drop_extents()).
	 *
	 * A bit more explanation about this check: with ret > 0 the key was
	 * not found, so the path points to the slot where it would be
	 * inserted, which means the item at path->slots[0] must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			if (path->locks[i]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && path->nowait) {
				ret = -EAGAIN;
				goto done;
			}
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
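 * The cond_resched() below gives the other task a chance to make
 * progress before we retry the whole search.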
 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN && !path->nowait)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			if (path->nowait) {
				if (!btrfs_try_tree_read_lock(next)) {
					ret = -EAGAIN;
					goto done;
				}
			} else {
				btrfs_tree_read_lock(next);
			}
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}

int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
	path->slots[0]++;
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
		return btrfs_next_old_leaf(root, path, time_seq);
	return 0;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}

/*
 * search in the extent tree to find a previous Metadata/Data extent item with
 * a minimum objectid.
5180 * 5181 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5182 */ 5183 int btrfs_previous_extent_item(struct btrfs_root *root, 5184 struct btrfs_path *path, u64 min_objectid) 5185 { 5186 struct btrfs_key found_key; 5187 struct extent_buffer *leaf; 5188 u32 nritems; 5189 int ret; 5190 5191 while (1) { 5192 if (path->slots[0] == 0) { 5193 ret = btrfs_prev_leaf(root, path); 5194 if (ret != 0) 5195 return ret; 5196 } else { 5197 path->slots[0]--; 5198 } 5199 leaf = path->nodes[0]; 5200 nritems = btrfs_header_nritems(leaf); 5201 if (nritems == 0) 5202 return 1; 5203 if (path->slots[0] == nritems) 5204 path->slots[0]--; 5205 5206 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5207 if (found_key.objectid < min_objectid) 5208 break; 5209 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 5210 found_key.type == BTRFS_METADATA_ITEM_KEY) 5211 return 0; 5212 if (found_key.objectid == min_objectid && 5213 found_key.type < BTRFS_EXTENT_ITEM_KEY) 5214 break; 5215 } 5216 return 1; 5217 } 5218 5219 int __init btrfs_ctree_init(void) 5220 { 5221 btrfs_path_cachep = kmem_cache_create("btrfs_path", 5222 sizeof(struct btrfs_path), 0, 5223 SLAB_MEM_SPREAD, NULL); 5224 if (!btrfs_path_cachep) 5225 return -ENOMEM; 5226 return 0; 5227 } 5228 5229 void __cold btrfs_ctree_exit(void) 5230 { 5231 kmem_cache_destroy(btrfs_path_cachep); 5232 } 5233