// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node. This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf.
 * The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @dst:	destination leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given @offset.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/* csum type is validated at mount time */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}
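
/*
 * Illustrative sketch of the usual path lifecycle: allocate, search, use the
 * leaf/slot the path points to, then free. Here 'root', 'ino', 'found' and
 * 'ret' are assumed to be provided/declared by the caller and error handling
 * is trimmed; this is only an example of the API usage, not code from below.
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key = { .objectid = ino,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
 *	btrfs_free_path(path);
 */
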
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int errno)
{
	switch (errno) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list. The transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			  "COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid,
		     fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_keys
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
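
/*
 * Keys are compared field by field: objectid first, then type, then offset.
 * As an illustrative example (assuming the usual item type values, where
 * BTRFS_INODE_ITEM_KEY is numerically smaller than BTRFS_DIR_ITEM_KEY), the
 * key (256, BTRFS_INODE_ITEM_KEY, 0) sorts before (256, BTRFS_DIR_ITEM_KEY,
 * <hash>), and both sort before any key with objectid 257. The binary search
 * below relies on the items in a leaf/node being stored in this order.
 */
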
/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer. Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto enospc;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto enospc;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full. This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}


/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
 * So if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date. Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}
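
/*
 * Note on the -EAGAIN contract above: when read_block_for_search() (or
 * setup_nodes_for_search() below) returns -EAGAIN the path has already been
 * released, and the caller is expected to restart the whole descent from the
 * root. btrfs_search_slot() does this with its 'again:' label, roughly:
 *
 *	again:
 *		b = btrfs_search_slot_get_root(root, p, write_lock_level);
 *		...
 *		err = read_block_for_search(root, p, &b, level, slot, key);
 *		if (err == -EAGAIN)
 *			goto again;
 *
 * (illustrative sketch only; the real loop below carries more state)
 */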

/*
 * helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to. Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}

static int search_leaf(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       const struct btrfs_key *key,
		       struct btrfs_path *path,
		       int ins_len,
		       int prev_cmp)
{
	struct extent_buffer *leaf = path->nodes[0];
	int leaf_free_space = -1;
	int search_low_slot = 0;
	int ret;
	bool do_bin_search = true;

	/*
	 * If we are doing an insertion, the leaf has enough free space and the
	 * destination slot for the key is not slot 0, then we can unlock our
	 * write lock on the parent, and any other upper nodes, before doing the
	 * binary search on the leaf (with search_for_key_slot()), allowing other
	 * tasks to lock the parent and any other upper nodes.
	 */
	if (ins_len > 0) {
		/*
		 * Cache the leaf free space, since we will need it later and it
		 * will not change until then.
		 */
		leaf_free_space = btrfs_leaf_free_space(leaf);

		/*
		 * !path->locks[1] means we have a single node tree, the leaf is
		 * the root of the tree.
		 */
		if (path->locks[1] && leaf_free_space >= ins_len) {
			struct btrfs_disk_key first_key;

			ASSERT(btrfs_header_nritems(leaf) > 0);
			btrfs_item_key(leaf, &first_key, 0);

			/*
			 * Doing the extra comparison with the first key is cheap,
			 * taking into account that the first key is very likely
			 * already in a cache line because it immediately follows
			 * the extent buffer's header and we have recently accessed
			 * the header's level field.
			 */
			ret = comp_keys(&first_key, key);
			if (ret < 0) {
				/*
				 * The first key is smaller than the key we want
				 * to insert, so we are safe to unlock all upper
				 * nodes and we have to do the binary search.
				 *
				 * We do use btrfs_unlock_up_safe() and not
				 * unlock_up() because the latter does not unlock
				 * nodes with a slot of 0 - we can safely unlock
				 * any node even if its slot is 0 since in this
				 * case the key does not end up at slot 0 of the
				 * leaf and there's no need to split the leaf.
				 */
				btrfs_unlock_up_safe(path, 1);
				search_low_slot = 1;
			} else {
				/*
				 * The first key is >= the key we want to
				 * insert, so we can skip the binary search as
				 * the target key will be at slot 0.
				 *
				 * We cannot unlock upper nodes when the key is
				 * less than the first key, because we will need
				 * to update the key at slot 0 of the parent node
				 * and possibly of other upper nodes too.
				 * If the key matches the first key, then we can
				 * unlock all the upper nodes, using
				 * btrfs_unlock_up_safe() instead of unlock_up()
				 * as stated above.
				 */
				if (ret == 0)
					btrfs_unlock_up_safe(path, 1);
				/*
				 * ret is already 0 or 1, matching the result of
				 * a btrfs_bin_search() call, so there is no need
				 * to adjust it.
				 */
				do_bin_search = false;
				path->slots[0] = 0;
			}
		}
	}

	if (do_bin_search) {
		ret = search_for_key_slot(leaf, search_low_slot, key,
					  prev_cmp, &path->slots[0]);
		if (ret < 0)
			return ret;
	}

	if (ins_len > 0) {
		/*
		 * Item key already exists. In this case, if we are allowed to
		 * insert the item (for example, in dir_item case, item key
		 * collision is allowed), it will be merged with the original
Only the item size grows, no new btrfs item will be 1982 * added. If search_for_extension is not set, ins_len already 1983 * accounts the size btrfs_item, deduct it here so leaf space 1984 * check will be correct. 1985 */ 1986 if (ret == 0 && !path->search_for_extension) { 1987 ASSERT(ins_len >= sizeof(struct btrfs_item)); 1988 ins_len -= sizeof(struct btrfs_item); 1989 } 1990 1991 ASSERT(leaf_free_space >= 0); 1992 1993 if (leaf_free_space < ins_len) { 1994 int err; 1995 1996 err = split_leaf(trans, root, key, path, ins_len, 1997 (ret == 0)); 1998 ASSERT(err <= 0); 1999 if (WARN_ON(err > 0)) 2000 err = -EUCLEAN; 2001 if (err) 2002 ret = err; 2003 } 2004 } 2005 2006 return ret; 2007 } 2008 2009 /* 2010 * btrfs_search_slot - look for a key in a tree and perform necessary 2011 * modifications to preserve tree invariants. 2012 * 2013 * @trans: Handle of transaction, used when modifying the tree 2014 * @p: Holds all btree nodes along the search path 2015 * @root: The root node of the tree 2016 * @key: The key we are looking for 2017 * @ins_len: Indicates purpose of search: 2018 * >0 for inserts it's size of item inserted (*) 2019 * <0 for deletions 2020 * 0 for plain searches, not modifying the tree 2021 * 2022 * (*) If size of item inserted doesn't include 2023 * sizeof(struct btrfs_item), then p->search_for_extension must 2024 * be set. 2025 * @cow: boolean should CoW operations be performed. Must always be 1 2026 * when modifying the tree. 2027 * 2028 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 2029 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible) 2030 * 2031 * If @key is found, 0 is returned and you can find the item in the leaf level 2032 * of the path (level 0) 2033 * 2034 * If @key isn't found, 1 is returned and the leaf level of the path (level 0) 2035 * points to the slot where it should be inserted 2036 * 2037 * If an error is encountered while searching the tree a negative error number 2038 * is returned 2039 */ 2040 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2041 const struct btrfs_key *key, struct btrfs_path *p, 2042 int ins_len, int cow) 2043 { 2044 struct btrfs_fs_info *fs_info = root->fs_info; 2045 struct extent_buffer *b; 2046 int slot; 2047 int ret; 2048 int err; 2049 int level; 2050 int lowest_unlock = 1; 2051 /* everything at write_lock_level or lower must be write locked */ 2052 int write_lock_level = 0; 2053 u8 lowest_level = 0; 2054 int min_write_lock_level; 2055 int prev_cmp; 2056 2057 might_sleep(); 2058 2059 lowest_level = p->lowest_level; 2060 WARN_ON(lowest_level && ins_len > 0); 2061 WARN_ON(p->nodes[0] != NULL); 2062 BUG_ON(!cow && ins_len); 2063 2064 /* 2065 * For now only allow nowait for read only operations. There's no 2066 * strict reason why we can't, we just only need it for reads so it's 2067 * only implemented for reads. 
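 *
 * With p->nowait set, the lock acquisitions below use the try-lock
 * variants (down_read_trylock() on the commit root semaphore,
 * btrfs_try_tree_read_lock(), and btrfs_try_read_lock_root_node() via
 * btrfs_search_slot_get_root()) and the search bails out with -EAGAIN
 * instead of blocking.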
2068 */ 2069 ASSERT(!p->nowait || !cow); 2070 2071 if (ins_len < 0) { 2072 lowest_unlock = 2; 2073 2074 /* when we are removing items, we might have to go up to level 2075 * two as we update tree pointers Make sure we keep write 2076 * for those levels as well 2077 */ 2078 write_lock_level = 2; 2079 } else if (ins_len > 0) { 2080 /* 2081 * for inserting items, make sure we have a write lock on 2082 * level 1 so we can update keys 2083 */ 2084 write_lock_level = 1; 2085 } 2086 2087 if (!cow) 2088 write_lock_level = -1; 2089 2090 if (cow && (p->keep_locks || p->lowest_level)) 2091 write_lock_level = BTRFS_MAX_LEVEL; 2092 2093 min_write_lock_level = write_lock_level; 2094 2095 if (p->need_commit_sem) { 2096 ASSERT(p->search_commit_root); 2097 if (p->nowait) { 2098 if (!down_read_trylock(&fs_info->commit_root_sem)) 2099 return -EAGAIN; 2100 } else { 2101 down_read(&fs_info->commit_root_sem); 2102 } 2103 } 2104 2105 again: 2106 prev_cmp = -1; 2107 b = btrfs_search_slot_get_root(root, p, write_lock_level); 2108 if (IS_ERR(b)) { 2109 ret = PTR_ERR(b); 2110 goto done; 2111 } 2112 2113 while (b) { 2114 int dec = 0; 2115 2116 level = btrfs_header_level(b); 2117 2118 if (cow) { 2119 bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); 2120 2121 /* 2122 * if we don't really need to cow this block 2123 * then we don't want to set the path blocking, 2124 * so we test it here 2125 */ 2126 if (!should_cow_block(trans, root, b)) 2127 goto cow_done; 2128 2129 /* 2130 * must have write locks on this node and the 2131 * parent 2132 */ 2133 if (level > write_lock_level || 2134 (level + 1 > write_lock_level && 2135 level + 1 < BTRFS_MAX_LEVEL && 2136 p->nodes[level + 1])) { 2137 write_lock_level = level + 1; 2138 btrfs_release_path(p); 2139 goto again; 2140 } 2141 2142 if (last_level) 2143 err = btrfs_cow_block(trans, root, b, NULL, 0, 2144 &b, 2145 BTRFS_NESTING_COW); 2146 else 2147 err = btrfs_cow_block(trans, root, b, 2148 p->nodes[level + 1], 2149 p->slots[level + 1], &b, 2150 BTRFS_NESTING_COW); 2151 if (err) { 2152 ret = err; 2153 goto done; 2154 } 2155 } 2156 cow_done: 2157 p->nodes[level] = b; 2158 2159 /* 2160 * we have a lock on b and as long as we aren't changing 2161 * the tree, there is no way to for the items in b to change. 2162 * It is safe to drop the lock on our parent before we 2163 * go through the expensive btree search on b. 2164 * 2165 * If we're inserting or deleting (ins_len != 0), then we might 2166 * be changing slot zero, which may require changing the parent. 2167 * So, we can't drop the lock until after we know which slot 2168 * we're operating on. 
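 *
 * (A slot zero change means the parent's key pointer for this block,
 * and possibly key pointers further up, must be updated too - that is
 * what fixup_low_keys() does, and it needs those nodes write locked.)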
2169 */ 2170 if (!ins_len && !p->keep_locks) { 2171 int u = level + 1; 2172 2173 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2174 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2175 p->locks[u] = 0; 2176 } 2177 } 2178 2179 if (level == 0) { 2180 if (ins_len > 0) 2181 ASSERT(write_lock_level >= 1); 2182 2183 ret = search_leaf(trans, root, key, p, ins_len, prev_cmp); 2184 if (!p->search_for_split) 2185 unlock_up(p, level, lowest_unlock, 2186 min_write_lock_level, NULL); 2187 goto done; 2188 } 2189 2190 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot); 2191 if (ret < 0) 2192 goto done; 2193 prev_cmp = ret; 2194 2195 if (ret && slot > 0) { 2196 dec = 1; 2197 slot--; 2198 } 2199 p->slots[level] = slot; 2200 err = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2201 &write_lock_level); 2202 if (err == -EAGAIN) 2203 goto again; 2204 if (err) { 2205 ret = err; 2206 goto done; 2207 } 2208 b = p->nodes[level]; 2209 slot = p->slots[level]; 2210 2211 /* 2212 * Slot 0 is special, if we change the key we have to update 2213 * the parent pointer which means we must have a write lock on 2214 * the parent 2215 */ 2216 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2217 write_lock_level = level + 1; 2218 btrfs_release_path(p); 2219 goto again; 2220 } 2221 2222 unlock_up(p, level, lowest_unlock, min_write_lock_level, 2223 &write_lock_level); 2224 2225 if (level == lowest_level) { 2226 if (dec) 2227 p->slots[level]++; 2228 goto done; 2229 } 2230 2231 err = read_block_for_search(root, p, &b, level, slot, key); 2232 if (err == -EAGAIN) 2233 goto again; 2234 if (err) { 2235 ret = err; 2236 goto done; 2237 } 2238 2239 if (!p->skip_locking) { 2240 level = btrfs_header_level(b); 2241 2242 btrfs_maybe_reset_lockdep_class(root, b); 2243 2244 if (level <= write_lock_level) { 2245 btrfs_tree_lock(b); 2246 p->locks[level] = BTRFS_WRITE_LOCK; 2247 } else { 2248 if (p->nowait) { 2249 if (!btrfs_try_tree_read_lock(b)) { 2250 free_extent_buffer(b); 2251 ret = -EAGAIN; 2252 goto done; 2253 } 2254 } else { 2255 btrfs_tree_read_lock(b); 2256 } 2257 p->locks[level] = BTRFS_READ_LOCK; 2258 } 2259 p->nodes[level] = b; 2260 } 2261 } 2262 ret = 1; 2263 done: 2264 if (ret < 0 && !p->skip_release_on_error) 2265 btrfs_release_path(p); 2266 2267 if (p->need_commit_sem) { 2268 int ret2; 2269 2270 ret2 = finish_need_commit_sem_search(p); 2271 up_read(&fs_info->commit_root_sem); 2272 if (ret2) 2273 ret = ret2; 2274 } 2275 2276 return ret; 2277 } 2278 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); 2279 2280 /* 2281 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2282 * current state of the tree together with the operations recorded in the tree 2283 * modification log to search for the key in a previous version of this tree, as 2284 * denoted by the time_seq parameter. 2285 * 2286 * Naturally, there is no support for insert, delete or cow operations. 2287 * 2288 * The resulting path and return value will be set up as if we called 2289 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 
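 *
 * A minimal, illustrative call sequence (a sketch, not copied from an
 * existing caller), assuming time_seq was obtained from the tree mod log:
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	if (ret < 0)
 *		goto out;
 *	if (ret == 0)
 *		... item is at path->nodes[0], slot path->slots[0] ...
 * out:
 *	btrfs_free_path(path);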
2290 */ 2291 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, 2292 struct btrfs_path *p, u64 time_seq) 2293 { 2294 struct btrfs_fs_info *fs_info = root->fs_info; 2295 struct extent_buffer *b; 2296 int slot; 2297 int ret; 2298 int err; 2299 int level; 2300 int lowest_unlock = 1; 2301 u8 lowest_level = 0; 2302 2303 lowest_level = p->lowest_level; 2304 WARN_ON(p->nodes[0] != NULL); 2305 ASSERT(!p->nowait); 2306 2307 if (p->search_commit_root) { 2308 BUG_ON(time_seq); 2309 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2310 } 2311 2312 again: 2313 b = btrfs_get_old_root(root, time_seq); 2314 if (!b) { 2315 ret = -EIO; 2316 goto done; 2317 } 2318 level = btrfs_header_level(b); 2319 p->locks[level] = BTRFS_READ_LOCK; 2320 2321 while (b) { 2322 int dec = 0; 2323 2324 level = btrfs_header_level(b); 2325 p->nodes[level] = b; 2326 2327 /* 2328 * we have a lock on b and as long as we aren't changing 2329 * the tree, there is no way to for the items in b to change. 2330 * It is safe to drop the lock on our parent before we 2331 * go through the expensive btree search on b. 2332 */ 2333 btrfs_unlock_up_safe(p, level + 1); 2334 2335 ret = btrfs_bin_search(b, 0, key, &slot); 2336 if (ret < 0) 2337 goto done; 2338 2339 if (level == 0) { 2340 p->slots[level] = slot; 2341 unlock_up(p, level, lowest_unlock, 0, NULL); 2342 goto done; 2343 } 2344 2345 if (ret && slot > 0) { 2346 dec = 1; 2347 slot--; 2348 } 2349 p->slots[level] = slot; 2350 unlock_up(p, level, lowest_unlock, 0, NULL); 2351 2352 if (level == lowest_level) { 2353 if (dec) 2354 p->slots[level]++; 2355 goto done; 2356 } 2357 2358 err = read_block_for_search(root, p, &b, level, slot, key); 2359 if (err == -EAGAIN) 2360 goto again; 2361 if (err) { 2362 ret = err; 2363 goto done; 2364 } 2365 2366 level = btrfs_header_level(b); 2367 btrfs_tree_read_lock(b); 2368 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); 2369 if (!b) { 2370 ret = -ENOMEM; 2371 goto done; 2372 } 2373 p->locks[level] = BTRFS_READ_LOCK; 2374 p->nodes[level] = b; 2375 } 2376 ret = 1; 2377 done: 2378 if (ret < 0) 2379 btrfs_release_path(p); 2380 2381 return ret; 2382 } 2383 2384 /* 2385 * Search the tree again to find a leaf with smaller keys. 2386 * Returns 0 if it found something. 2387 * Returns 1 if there are no smaller keys. 2388 * Returns < 0 on error. 2389 * 2390 * This may release the path, and so you may lose any locks held at the 2391 * time you call it. 2392 */ 2393 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 2394 { 2395 struct btrfs_key key; 2396 struct btrfs_key orig_key; 2397 struct btrfs_disk_key found_key; 2398 int ret; 2399 2400 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 2401 orig_key = key; 2402 2403 if (key.offset > 0) { 2404 key.offset--; 2405 } else if (key.type > 0) { 2406 key.type--; 2407 key.offset = (u64)-1; 2408 } else if (key.objectid > 0) { 2409 key.objectid--; 2410 key.type = (u8)-1; 2411 key.offset = (u64)-1; 2412 } else { 2413 return 1; 2414 } 2415 2416 btrfs_release_path(path); 2417 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2418 if (ret <= 0) 2419 return ret; 2420 2421 /* 2422 * Previous key not found. 
Even if we were at slot 0 of the leaf we had 2423 * before releasing the path and calling btrfs_search_slot(), we now may 2424 * be in a slot pointing to the same original key - this can happen if 2425 * after we released the path, one or more items were moved from a 2426 * sibling leaf into the front of the leaf we had due to an insertion 2427 * (see push_leaf_right()). 2428 * If we hit this case and our slot is > 0, just decrement the slot 2429 * so that the caller does not process the same key again, which may or 2430 * may not break the caller, depending on its logic. 2431 */ 2432 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 2433 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); 2434 ret = comp_keys(&found_key, &orig_key); 2435 if (ret == 0) { 2436 if (path->slots[0] > 0) { 2437 path->slots[0]--; 2438 return 0; 2439 } 2440 /* 2441 * At slot 0, same key as before, it means orig_key is 2442 * the lowest, leftmost, key in the tree. We're done. 2443 */ 2444 return 1; 2445 } 2446 } 2447 2448 btrfs_item_key(path->nodes[0], &found_key, 0); 2449 ret = comp_keys(&found_key, &key); 2450 /* 2451 * We might have had an item with the previous key in the tree right 2452 * before we released our path. And after we released our path, that 2453 * item might have been pushed to the first slot (0) of the leaf we 2454 * were holding due to a tree balance. Alternatively, an item with the 2455 * previous key can exist as the only element of a leaf (big fat item). 2456 * Therefore account for these 2 cases, so that our callers (like 2457 * btrfs_previous_item) don't miss an existing item with a key matching 2458 * the previous key we computed above. 2459 */ 2460 if (ret <= 0) 2461 return 0; 2462 return 1; 2463 } 2464 2465 /* 2466 * helper to use instead of btrfs_search_slot() if no exact match is needed but 2467 * instead the next or previous item should be returned. 2468 * When find_higher is true, the next higher item is returned, the next lower 2469 * otherwise. 2470 * When return_any and find_higher are both true, and no higher item is found, 2471 * return the next lower instead. 2472 * When return_any is true and find_higher is false, and no lower item is found, 2473 * return the next higher instead. 2474 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2475 * < 0 on error 2476 */ 2477 int btrfs_search_slot_for_read(struct btrfs_root *root, 2478 const struct btrfs_key *key, 2479 struct btrfs_path *p, int find_higher, 2480 int return_any) 2481 { 2482 int ret; 2483 struct extent_buffer *leaf; 2484 2485 again: 2486 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2487 if (ret <= 0) 2488 return ret; 2489 /* 2490 * a return value of 1 means the path is at the position where the 2491 * item should be inserted. Normally this is the next bigger item, 2492 * but in case the previous item is the last in a leaf, path points 2493 * to the first free slot in the previous leaf, i.e. at an invalid 2494 * item.
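 *
 * Hence the checks below: when looking for a higher item we may have to
 * advance to the next leaf if the slot is past the last item, and when
 * looking for a lower one we step back to the previous leaf if the slot
 * is 0.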
2495 */ 2496 leaf = p->nodes[0]; 2497 2498 if (find_higher) { 2499 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2500 ret = btrfs_next_leaf(root, p); 2501 if (ret <= 0) 2502 return ret; 2503 if (!return_any) 2504 return 1; 2505 /* 2506 * no higher item found, return the next 2507 * lower instead 2508 */ 2509 return_any = 0; 2510 find_higher = 0; 2511 btrfs_release_path(p); 2512 goto again; 2513 } 2514 } else { 2515 if (p->slots[0] == 0) { 2516 ret = btrfs_prev_leaf(root, p); 2517 if (ret < 0) 2518 return ret; 2519 if (!ret) { 2520 leaf = p->nodes[0]; 2521 if (p->slots[0] == btrfs_header_nritems(leaf)) 2522 p->slots[0]--; 2523 return 0; 2524 } 2525 if (!return_any) 2526 return 1; 2527 /* 2528 * no lower item found, return the next 2529 * higher instead 2530 */ 2531 return_any = 0; 2532 find_higher = 1; 2533 btrfs_release_path(p); 2534 goto again; 2535 } else { 2536 --p->slots[0]; 2537 } 2538 } 2539 return 0; 2540 } 2541 2542 /* 2543 * Execute search and call btrfs_previous_item to traverse backwards if the item 2544 * was not found. 2545 * 2546 * Return 0 if found, 1 if not found and < 0 if error. 2547 */ 2548 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 2549 struct btrfs_path *path) 2550 { 2551 int ret; 2552 2553 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 2554 if (ret > 0) 2555 ret = btrfs_previous_item(root, path, key->objectid, key->type); 2556 2557 if (ret == 0) 2558 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2559 2560 return ret; 2561 } 2562 2563 /* 2564 * Search for a valid slot for the given path. 2565 * 2566 * @root: The root node of the tree. 2567 * @key: Will contain a valid item if found. 2568 * @path: The starting point to validate the slot. 2569 * 2570 * Return: 0 if the item is valid 2571 * 1 if not found 2572 * <0 if error. 2573 */ 2574 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2575 struct btrfs_path *path) 2576 { 2577 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2578 int ret; 2579 2580 ret = btrfs_next_leaf(root, path); 2581 if (ret) 2582 return ret; 2583 } 2584 2585 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2586 return 0; 2587 } 2588 2589 /* 2590 * adjust the pointers going up the tree, starting at level 2591 * making sure the right key of each node is points to 'key'. 2592 * This is used after shifting pointers to the left, so it stops 2593 * fixing up pointers when a given leaf/node is not in slot 0 of the 2594 * higher levels 2595 * 2596 */ 2597 static void fixup_low_keys(struct btrfs_path *path, 2598 struct btrfs_disk_key *key, int level) 2599 { 2600 int i; 2601 struct extent_buffer *t; 2602 int ret; 2603 2604 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2605 int tslot = path->slots[i]; 2606 2607 if (!path->nodes[i]) 2608 break; 2609 t = path->nodes[i]; 2610 ret = btrfs_tree_mod_log_insert_key(t, tslot, 2611 BTRFS_MOD_LOG_KEY_REPLACE); 2612 BUG_ON(ret < 0); 2613 btrfs_set_node_key(t, key, tslot); 2614 btrfs_mark_buffer_dirty(path->nodes[i]); 2615 if (tslot != 0) 2616 break; 2617 } 2618 } 2619 2620 /* 2621 * update item key. 2622 * 2623 * This function isn't completely safe. 
It's the caller's responsibility 2624 * that the new key won't break the order 2625 */ 2626 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, 2627 struct btrfs_path *path, 2628 const struct btrfs_key *new_key) 2629 { 2630 struct btrfs_disk_key disk_key; 2631 struct extent_buffer *eb; 2632 int slot; 2633 2634 eb = path->nodes[0]; 2635 slot = path->slots[0]; 2636 if (slot > 0) { 2637 btrfs_item_key(eb, &disk_key, slot - 1); 2638 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { 2639 btrfs_print_leaf(eb); 2640 btrfs_crit(fs_info, 2641 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2642 slot, btrfs_disk_key_objectid(&disk_key), 2643 btrfs_disk_key_type(&disk_key), 2644 btrfs_disk_key_offset(&disk_key), 2645 new_key->objectid, new_key->type, 2646 new_key->offset); 2647 BUG(); 2648 } 2649 } 2650 if (slot < btrfs_header_nritems(eb) - 1) { 2651 btrfs_item_key(eb, &disk_key, slot + 1); 2652 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { 2653 btrfs_print_leaf(eb); 2654 btrfs_crit(fs_info, 2655 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2656 slot, btrfs_disk_key_objectid(&disk_key), 2657 btrfs_disk_key_type(&disk_key), 2658 btrfs_disk_key_offset(&disk_key), 2659 new_key->objectid, new_key->type, 2660 new_key->offset); 2661 BUG(); 2662 } 2663 } 2664 2665 btrfs_cpu_key_to_disk(&disk_key, new_key); 2666 btrfs_set_item_key(eb, &disk_key, slot); 2667 btrfs_mark_buffer_dirty(eb); 2668 if (slot == 0) 2669 fixup_low_keys(path, &disk_key, 1); 2670 } 2671 2672 /* 2673 * Check key order of two sibling extent buffers. 2674 * 2675 * Return true if something is wrong. 2676 * Return false if everything is fine. 2677 * 2678 * Tree-checker only works inside one tree block, thus the following 2679 * corruption can not be detected by tree-checker: 2680 * 2681 * Leaf @left | Leaf @right 2682 * -------------------------------------------------------------- 2683 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2684 * 2685 * Key f6 in leaf @left itself is valid, but not valid when the next 2686 * key in leaf @right is 7. 2687 * This can only be checked at tree block merge time. 2688 * And since tree checker has ensured all key order in each tree block 2689 * is correct, we only need to bother the last key of @left and the first 2690 * key of @right. 
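 *
 * Callers treat a true return as corruption: they give up on the push or
 * merge, set -EUCLEAN and abort the transaction.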
2691 */ 2692 static bool check_sibling_keys(struct extent_buffer *left, 2693 struct extent_buffer *right) 2694 { 2695 struct btrfs_key left_last; 2696 struct btrfs_key right_first; 2697 int level = btrfs_header_level(left); 2698 int nr_left = btrfs_header_nritems(left); 2699 int nr_right = btrfs_header_nritems(right); 2700 2701 /* No key to check in one of the tree blocks */ 2702 if (!nr_left || !nr_right) 2703 return false; 2704 2705 if (level) { 2706 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2707 btrfs_node_key_to_cpu(right, &right_first, 0); 2708 } else { 2709 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2710 btrfs_item_key_to_cpu(right, &right_first, 0); 2711 } 2712 2713 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { 2714 btrfs_crit(left->fs_info, "left extent buffer:"); 2715 btrfs_print_tree(left, false); 2716 btrfs_crit(left->fs_info, "right extent buffer:"); 2717 btrfs_print_tree(right, false); 2718 btrfs_crit(left->fs_info, 2719 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2720 left_last.objectid, left_last.type, 2721 left_last.offset, right_first.objectid, 2722 right_first.type, right_first.offset); 2723 return true; 2724 } 2725 return false; 2726 } 2727 2728 /* 2729 * try to push data from one node into the next node left in the 2730 * tree. 2731 * 2732 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2733 * error, and > 0 if there was no room in the left hand block. 2734 */ 2735 static int push_node_left(struct btrfs_trans_handle *trans, 2736 struct extent_buffer *dst, 2737 struct extent_buffer *src, int empty) 2738 { 2739 struct btrfs_fs_info *fs_info = trans->fs_info; 2740 int push_items = 0; 2741 int src_nritems; 2742 int dst_nritems; 2743 int ret = 0; 2744 2745 src_nritems = btrfs_header_nritems(src); 2746 dst_nritems = btrfs_header_nritems(dst); 2747 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2748 WARN_ON(btrfs_header_generation(src) != trans->transid); 2749 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2750 2751 if (!empty && src_nritems <= 8) 2752 return 1; 2753 2754 if (push_items <= 0) 2755 return 1; 2756 2757 if (empty) { 2758 push_items = min(src_nritems, push_items); 2759 if (push_items < src_nritems) { 2760 /* leave at least 8 pointers in the node if 2761 * we aren't going to empty it 2762 */ 2763 if (src_nritems - push_items < 8) { 2764 if (push_items <= 8) 2765 return 1; 2766 push_items -= 8; 2767 } 2768 } 2769 } else 2770 push_items = min(src_nritems - 8, push_items); 2771 2772 /* dst is the left eb, src is the middle eb */ 2773 if (check_sibling_keys(dst, src)) { 2774 ret = -EUCLEAN; 2775 btrfs_abort_transaction(trans, ret); 2776 return ret; 2777 } 2778 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2779 if (ret) { 2780 btrfs_abort_transaction(trans, ret); 2781 return ret; 2782 } 2783 copy_extent_buffer(dst, src, 2784 btrfs_node_key_ptr_offset(dst, dst_nritems), 2785 btrfs_node_key_ptr_offset(src, 0), 2786 push_items * sizeof(struct btrfs_key_ptr)); 2787 2788 if (push_items < src_nritems) { 2789 /* 2790 * Don't call btrfs_tree_mod_log_insert_move() here, key removal 2791 * was already fully logged by btrfs_tree_mod_log_eb_copy() above. 
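 *
 * The memmove below only compacts the pointers remaining in @src down to
 * slot 0; the pushed pointers were already copied into @dst by the
 * copy_extent_buffer() call above.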
2792 */ 2793 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), 2794 btrfs_node_key_ptr_offset(src, push_items), 2795 (src_nritems - push_items) * 2796 sizeof(struct btrfs_key_ptr)); 2797 } 2798 btrfs_set_header_nritems(src, src_nritems - push_items); 2799 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2800 btrfs_mark_buffer_dirty(src); 2801 btrfs_mark_buffer_dirty(dst); 2802 2803 return ret; 2804 } 2805 2806 /* 2807 * try to push data from one node into the next node right in the 2808 * tree. 2809 * 2810 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2811 * error, and > 0 if there was no room in the right hand block. 2812 * 2813 * this will only push up to 1/2 the contents of the left node over 2814 */ 2815 static int balance_node_right(struct btrfs_trans_handle *trans, 2816 struct extent_buffer *dst, 2817 struct extent_buffer *src) 2818 { 2819 struct btrfs_fs_info *fs_info = trans->fs_info; 2820 int push_items = 0; 2821 int max_push; 2822 int src_nritems; 2823 int dst_nritems; 2824 int ret = 0; 2825 2826 WARN_ON(btrfs_header_generation(src) != trans->transid); 2827 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2828 2829 src_nritems = btrfs_header_nritems(src); 2830 dst_nritems = btrfs_header_nritems(dst); 2831 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2832 if (push_items <= 0) 2833 return 1; 2834 2835 if (src_nritems < 4) 2836 return 1; 2837 2838 max_push = src_nritems / 2 + 1; 2839 /* don't try to empty the node */ 2840 if (max_push >= src_nritems) 2841 return 1; 2842 2843 if (max_push < push_items) 2844 push_items = max_push; 2845 2846 /* dst is the right eb, src is the middle eb */ 2847 if (check_sibling_keys(src, dst)) { 2848 ret = -EUCLEAN; 2849 btrfs_abort_transaction(trans, ret); 2850 return ret; 2851 } 2852 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); 2853 BUG_ON(ret < 0); 2854 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), 2855 btrfs_node_key_ptr_offset(dst, 0), 2856 (dst_nritems) * 2857 sizeof(struct btrfs_key_ptr)); 2858 2859 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2860 push_items); 2861 if (ret) { 2862 btrfs_abort_transaction(trans, ret); 2863 return ret; 2864 } 2865 copy_extent_buffer(dst, src, 2866 btrfs_node_key_ptr_offset(dst, 0), 2867 btrfs_node_key_ptr_offset(src, src_nritems - push_items), 2868 push_items * sizeof(struct btrfs_key_ptr)); 2869 2870 btrfs_set_header_nritems(src, src_nritems - push_items); 2871 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2872 2873 btrfs_mark_buffer_dirty(src); 2874 btrfs_mark_buffer_dirty(dst); 2875 2876 return ret; 2877 } 2878 2879 /* 2880 * helper function to insert a new root level in the tree. 2881 * A new node is allocated, and a single item is inserted to 2882 * point to the existing root 2883 * 2884 * returns zero on success or < 0 on failure. 
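 *
 * On success the new root contains a single key pointer to the old root
 * block, and the path grows by one level: path->nodes[level] is the new
 * root, write locked, with slot 0.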
2885 */ 2886 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2887 struct btrfs_root *root, 2888 struct btrfs_path *path, int level) 2889 { 2890 struct btrfs_fs_info *fs_info = root->fs_info; 2891 u64 lower_gen; 2892 struct extent_buffer *lower; 2893 struct extent_buffer *c; 2894 struct extent_buffer *old; 2895 struct btrfs_disk_key lower_key; 2896 int ret; 2897 2898 BUG_ON(path->nodes[level]); 2899 BUG_ON(path->nodes[level-1] != root->node); 2900 2901 lower = path->nodes[level-1]; 2902 if (level == 1) 2903 btrfs_item_key(lower, &lower_key, 0); 2904 else 2905 btrfs_node_key(lower, &lower_key, 0); 2906 2907 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 2908 &lower_key, level, root->node->start, 0, 2909 BTRFS_NESTING_NEW_ROOT); 2910 if (IS_ERR(c)) 2911 return PTR_ERR(c); 2912 2913 root_add_used(root, fs_info->nodesize); 2914 2915 btrfs_set_header_nritems(c, 1); 2916 btrfs_set_node_key(c, &lower_key, 0); 2917 btrfs_set_node_blockptr(c, 0, lower->start); 2918 lower_gen = btrfs_header_generation(lower); 2919 WARN_ON(lower_gen != trans->transid); 2920 2921 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2922 2923 btrfs_mark_buffer_dirty(c); 2924 2925 old = root->node; 2926 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2927 BUG_ON(ret < 0); 2928 rcu_assign_pointer(root->node, c); 2929 2930 /* the super has an extra ref to root->node */ 2931 free_extent_buffer(old); 2932 2933 add_root_to_dirty_list(root); 2934 atomic_inc(&c->refs); 2935 path->nodes[level] = c; 2936 path->locks[level] = BTRFS_WRITE_LOCK; 2937 path->slots[level] = 0; 2938 return 0; 2939 } 2940 2941 /* 2942 * worker function to insert a single pointer in a node. 2943 * the node should have enough room for the pointer already 2944 * 2945 * slot and level indicate where you want the key to go, and 2946 * blocknr is the block the key points to. 2947 */ 2948 static void insert_ptr(struct btrfs_trans_handle *trans, 2949 struct btrfs_path *path, 2950 struct btrfs_disk_key *key, u64 bytenr, 2951 int slot, int level) 2952 { 2953 struct extent_buffer *lower; 2954 int nritems; 2955 int ret; 2956 2957 BUG_ON(!path->nodes[level]); 2958 btrfs_assert_tree_write_locked(path->nodes[level]); 2959 lower = path->nodes[level]; 2960 nritems = btrfs_header_nritems(lower); 2961 BUG_ON(slot > nritems); 2962 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2963 if (slot != nritems) { 2964 if (level) { 2965 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2966 slot, nritems - slot); 2967 BUG_ON(ret < 0); 2968 } 2969 memmove_extent_buffer(lower, 2970 btrfs_node_key_ptr_offset(lower, slot + 1), 2971 btrfs_node_key_ptr_offset(lower, slot), 2972 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2973 } 2974 if (level) { 2975 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2976 BTRFS_MOD_LOG_KEY_ADD); 2977 BUG_ON(ret < 0); 2978 } 2979 btrfs_set_node_key(lower, key, slot); 2980 btrfs_set_node_blockptr(lower, slot, bytenr); 2981 WARN_ON(trans->transid == 0); 2982 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2983 btrfs_set_header_nritems(lower, nritems + 1); 2984 btrfs_mark_buffer_dirty(lower); 2985 } 2986 2987 /* 2988 * split the node at the specified level in path in two. 2989 * The path is corrected to point to the appropriate node after the split 2990 * 2991 * Before splitting this tries to make some room in the node by pushing 2992 * left and right, if either one works, it returns right away. 
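 * (The pushing is done through push_nodes_for_insert(); if that leaves
 * the node with enough free slots, the split below is skipped.)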
2993 * 2994 * returns 0 on success and < 0 on failure 2995 */ 2996 static noinline int split_node(struct btrfs_trans_handle *trans, 2997 struct btrfs_root *root, 2998 struct btrfs_path *path, int level) 2999 { 3000 struct btrfs_fs_info *fs_info = root->fs_info; 3001 struct extent_buffer *c; 3002 struct extent_buffer *split; 3003 struct btrfs_disk_key disk_key; 3004 int mid; 3005 int ret; 3006 u32 c_nritems; 3007 3008 c = path->nodes[level]; 3009 WARN_ON(btrfs_header_generation(c) != trans->transid); 3010 if (c == root->node) { 3011 /* 3012 * trying to split the root, lets make a new one 3013 * 3014 * tree mod log: We don't log_removal old root in 3015 * insert_new_root, because that root buffer will be kept as a 3016 * normal node. We are going to log removal of half of the 3017 * elements below with btrfs_tree_mod_log_eb_copy(). We're 3018 * holding a tree lock on the buffer, which is why we cannot 3019 * race with other tree_mod_log users. 3020 */ 3021 ret = insert_new_root(trans, root, path, level + 1); 3022 if (ret) 3023 return ret; 3024 } else { 3025 ret = push_nodes_for_insert(trans, root, path, level); 3026 c = path->nodes[level]; 3027 if (!ret && btrfs_header_nritems(c) < 3028 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) 3029 return 0; 3030 if (ret < 0) 3031 return ret; 3032 } 3033 3034 c_nritems = btrfs_header_nritems(c); 3035 mid = (c_nritems + 1) / 2; 3036 btrfs_node_key(c, &disk_key, mid); 3037 3038 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3039 &disk_key, level, c->start, 0, 3040 BTRFS_NESTING_SPLIT); 3041 if (IS_ERR(split)) 3042 return PTR_ERR(split); 3043 3044 root_add_used(root, fs_info->nodesize); 3045 ASSERT(btrfs_header_level(c) == level); 3046 3047 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); 3048 if (ret) { 3049 btrfs_abort_transaction(trans, ret); 3050 return ret; 3051 } 3052 copy_extent_buffer(split, c, 3053 btrfs_node_key_ptr_offset(split, 0), 3054 btrfs_node_key_ptr_offset(c, mid), 3055 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3056 btrfs_set_header_nritems(split, c_nritems - mid); 3057 btrfs_set_header_nritems(c, mid); 3058 3059 btrfs_mark_buffer_dirty(c); 3060 btrfs_mark_buffer_dirty(split); 3061 3062 insert_ptr(trans, path, &disk_key, split->start, 3063 path->slots[level + 1] + 1, level + 1); 3064 3065 if (path->slots[level] >= mid) { 3066 path->slots[level] -= mid; 3067 btrfs_tree_unlock(c); 3068 free_extent_buffer(c); 3069 path->nodes[level] = split; 3070 path->slots[level + 1] += 1; 3071 } else { 3072 btrfs_tree_unlock(split); 3073 free_extent_buffer(split); 3074 } 3075 return 0; 3076 } 3077 3078 /* 3079 * how many bytes are required to store the items in a leaf. start 3080 * and nr indicate which items in the leaf to check. This totals up the 3081 * space used both by the item structs and the item data 3082 */ 3083 static int leaf_space_used(const struct extent_buffer *l, int start, int nr) 3084 { 3085 int data_len; 3086 int nritems = btrfs_header_nritems(l); 3087 int end = min(nritems, start + nr) - 1; 3088 3089 if (!nr) 3090 return 0; 3091 data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start); 3092 data_len = data_len - btrfs_item_offset(l, end); 3093 data_len += sizeof(struct btrfs_item) * nr; 3094 WARN_ON(data_len < 0); 3095 return data_len; 3096 } 3097 3098 /* 3099 * The space between the end of the leaf items and 3100 * the start of the leaf data. 
IOW, how much room 3101 * the leaf has left for both items and data 3102 */ 3103 int btrfs_leaf_free_space(const struct extent_buffer *leaf) 3104 { 3105 struct btrfs_fs_info *fs_info = leaf->fs_info; 3106 int nritems = btrfs_header_nritems(leaf); 3107 int ret; 3108 3109 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3110 if (ret < 0) { 3111 btrfs_crit(fs_info, 3112 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3113 ret, 3114 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3115 leaf_space_used(leaf, 0, nritems), nritems); 3116 } 3117 return ret; 3118 } 3119 3120 /* 3121 * min slot controls the lowest index we're willing to push to the 3122 * right. We'll push up to and including min_slot, but no lower 3123 */ 3124 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3125 struct btrfs_path *path, 3126 int data_size, int empty, 3127 struct extent_buffer *right, 3128 int free_space, u32 left_nritems, 3129 u32 min_slot) 3130 { 3131 struct btrfs_fs_info *fs_info = right->fs_info; 3132 struct extent_buffer *left = path->nodes[0]; 3133 struct extent_buffer *upper = path->nodes[1]; 3134 struct btrfs_map_token token; 3135 struct btrfs_disk_key disk_key; 3136 int slot; 3137 u32 i; 3138 int push_space = 0; 3139 int push_items = 0; 3140 u32 nr; 3141 u32 right_nritems; 3142 u32 data_end; 3143 u32 this_item_size; 3144 3145 if (empty) 3146 nr = 0; 3147 else 3148 nr = max_t(u32, 1, min_slot); 3149 3150 if (path->slots[0] >= left_nritems) 3151 push_space += data_size; 3152 3153 slot = path->slots[1]; 3154 i = left_nritems - 1; 3155 while (i >= nr) { 3156 if (!empty && push_items > 0) { 3157 if (path->slots[0] > i) 3158 break; 3159 if (path->slots[0] == i) { 3160 int space = btrfs_leaf_free_space(left); 3161 3162 if (space + push_space * 2 > free_space) 3163 break; 3164 } 3165 } 3166 3167 if (path->slots[0] == i) 3168 push_space += data_size; 3169 3170 this_item_size = btrfs_item_size(left, i); 3171 if (this_item_size + sizeof(struct btrfs_item) + 3172 push_space > free_space) 3173 break; 3174 3175 push_items++; 3176 push_space += this_item_size + sizeof(struct btrfs_item); 3177 if (i == 0) 3178 break; 3179 i--; 3180 } 3181 3182 if (push_items == 0) 3183 goto out_unlock; 3184 3185 WARN_ON(!empty && push_items == left_nritems); 3186 3187 /* push left to right */ 3188 right_nritems = btrfs_header_nritems(right); 3189 3190 push_space = btrfs_item_data_end(left, left_nritems - push_items); 3191 push_space -= leaf_data_end(left); 3192 3193 /* make room in the right data area */ 3194 data_end = leaf_data_end(right); 3195 memmove_leaf_data(right, data_end - push_space, data_end, 3196 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3197 3198 /* copy from the left data area */ 3199 copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3200 leaf_data_end(left), push_space); 3201 3202 memmove_leaf_items(right, push_items, 0, right_nritems); 3203 3204 /* copy the items from left to right */ 3205 copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); 3206 3207 /* update the item pointers */ 3208 btrfs_init_map_token(&token, right); 3209 right_nritems += push_items; 3210 btrfs_set_header_nritems(right, right_nritems); 3211 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3212 for (i = 0; i < right_nritems; i++) { 3213 push_space -= btrfs_token_item_size(&token, i); 3214 btrfs_set_token_item_offset(&token, i, push_space); 3215 } 3216 3217 left_nritems -= push_items; 3218 btrfs_set_header_nritems(left, left_nritems); 3219 3220 if 
(left_nritems) 3221 btrfs_mark_buffer_dirty(left); 3222 else 3223 btrfs_clear_buffer_dirty(trans, left); 3224 3225 btrfs_mark_buffer_dirty(right); 3226 3227 btrfs_item_key(right, &disk_key, 0); 3228 btrfs_set_node_key(upper, &disk_key, slot + 1); 3229 btrfs_mark_buffer_dirty(upper); 3230 3231 /* then fixup the leaf pointer in the path */ 3232 if (path->slots[0] >= left_nritems) { 3233 path->slots[0] -= left_nritems; 3234 if (btrfs_header_nritems(path->nodes[0]) == 0) 3235 btrfs_clear_buffer_dirty(trans, path->nodes[0]); 3236 btrfs_tree_unlock(path->nodes[0]); 3237 free_extent_buffer(path->nodes[0]); 3238 path->nodes[0] = right; 3239 path->slots[1] += 1; 3240 } else { 3241 btrfs_tree_unlock(right); 3242 free_extent_buffer(right); 3243 } 3244 return 0; 3245 3246 out_unlock: 3247 btrfs_tree_unlock(right); 3248 free_extent_buffer(right); 3249 return 1; 3250 } 3251 3252 /* 3253 * push some data in the path leaf to the right, trying to free up at 3254 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3255 * 3256 * returns 1 if the push failed because the other node didn't have enough 3257 * room, 0 if everything worked out and < 0 if there were major errors. 3258 * 3259 * this will push starting from min_slot to the end of the leaf. It won't 3260 * push any slot lower than min_slot 3261 */ 3262 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3263 *root, struct btrfs_path *path, 3264 int min_data_size, int data_size, 3265 int empty, u32 min_slot) 3266 { 3267 struct extent_buffer *left = path->nodes[0]; 3268 struct extent_buffer *right; 3269 struct extent_buffer *upper; 3270 int slot; 3271 int free_space; 3272 u32 left_nritems; 3273 int ret; 3274 3275 if (!path->nodes[1]) 3276 return 1; 3277 3278 slot = path->slots[1]; 3279 upper = path->nodes[1]; 3280 if (slot >= btrfs_header_nritems(upper) - 1) 3281 return 1; 3282 3283 btrfs_assert_tree_write_locked(path->nodes[1]); 3284 3285 right = btrfs_read_node_slot(upper, slot + 1); 3286 if (IS_ERR(right)) 3287 return PTR_ERR(right); 3288 3289 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 3290 3291 free_space = btrfs_leaf_free_space(right); 3292 if (free_space < data_size) 3293 goto out_unlock; 3294 3295 ret = btrfs_cow_block(trans, root, right, upper, 3296 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 3297 if (ret) 3298 goto out_unlock; 3299 3300 left_nritems = btrfs_header_nritems(left); 3301 if (left_nritems == 0) 3302 goto out_unlock; 3303 3304 if (check_sibling_keys(left, right)) { 3305 ret = -EUCLEAN; 3306 btrfs_abort_transaction(trans, ret); 3307 btrfs_tree_unlock(right); 3308 free_extent_buffer(right); 3309 return ret; 3310 } 3311 if (path->slots[0] == left_nritems && !empty) { 3312 /* Key greater than all keys in the leaf, right neighbor has 3313 * enough room for it and we're not emptying our leaf to delete 3314 * it, therefore use right neighbor to insert the new item and 3315 * no need to touch/dirty our left leaf. */ 3316 btrfs_tree_unlock(left); 3317 free_extent_buffer(left); 3318 path->nodes[0] = right; 3319 path->slots[0] = 0; 3320 path->slots[1]++; 3321 return 0; 3322 } 3323 3324 return __push_leaf_right(trans, path, min_data_size, empty, right, 3325 free_space, left_nritems, min_slot); 3326 out_unlock: 3327 btrfs_tree_unlock(right); 3328 free_extent_buffer(right); 3329 return 1; 3330 } 3331 3332 /* 3333 * push some data in the path leaf to the left, trying to free up at 3334 * least data_size bytes. 
returns zero if the push worked, nonzero otherwise 3335 * 3336 * max_slot can put a limit on how far into the leaf we'll push items. The 3337 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 3338 * items 3339 */ 3340 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3341 struct btrfs_path *path, int data_size, 3342 int empty, struct extent_buffer *left, 3343 int free_space, u32 right_nritems, 3344 u32 max_slot) 3345 { 3346 struct btrfs_fs_info *fs_info = left->fs_info; 3347 struct btrfs_disk_key disk_key; 3348 struct extent_buffer *right = path->nodes[0]; 3349 int i; 3350 int push_space = 0; 3351 int push_items = 0; 3352 u32 old_left_nritems; 3353 u32 nr; 3354 int ret = 0; 3355 u32 this_item_size; 3356 u32 old_left_item_size; 3357 struct btrfs_map_token token; 3358 3359 if (empty) 3360 nr = min(right_nritems, max_slot); 3361 else 3362 nr = min(right_nritems - 1, max_slot); 3363 3364 for (i = 0; i < nr; i++) { 3365 if (!empty && push_items > 0) { 3366 if (path->slots[0] < i) 3367 break; 3368 if (path->slots[0] == i) { 3369 int space = btrfs_leaf_free_space(right); 3370 3371 if (space + push_space * 2 > free_space) 3372 break; 3373 } 3374 } 3375 3376 if (path->slots[0] == i) 3377 push_space += data_size; 3378 3379 this_item_size = btrfs_item_size(right, i); 3380 if (this_item_size + sizeof(struct btrfs_item) + push_space > 3381 free_space) 3382 break; 3383 3384 push_items++; 3385 push_space += this_item_size + sizeof(struct btrfs_item); 3386 } 3387 3388 if (push_items == 0) { 3389 ret = 1; 3390 goto out; 3391 } 3392 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3393 3394 /* push data from right to left */ 3395 copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); 3396 3397 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3398 btrfs_item_offset(right, push_items - 1); 3399 3400 copy_leaf_data(left, right, leaf_data_end(left) - push_space, 3401 btrfs_item_offset(right, push_items - 1), push_space); 3402 old_left_nritems = btrfs_header_nritems(left); 3403 BUG_ON(old_left_nritems <= 0); 3404 3405 btrfs_init_map_token(&token, left); 3406 old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); 3407 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3408 u32 ioff; 3409 3410 ioff = btrfs_token_item_offset(&token, i); 3411 btrfs_set_token_item_offset(&token, i, 3412 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 3413 } 3414 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3415 3416 /* fixup right node */ 3417 if (push_items > right_nritems) 3418 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3419 right_nritems); 3420 3421 if (push_items < right_nritems) { 3422 push_space = btrfs_item_offset(right, push_items - 1) - 3423 leaf_data_end(right); 3424 memmove_leaf_data(right, 3425 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3426 leaf_data_end(right), push_space); 3427 3428 memmove_leaf_items(right, 0, push_items, 3429 btrfs_header_nritems(right) - push_items); 3430 } 3431 3432 btrfs_init_map_token(&token, right); 3433 right_nritems -= push_items; 3434 btrfs_set_header_nritems(right, right_nritems); 3435 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3436 for (i = 0; i < right_nritems; i++) { 3437 push_space = push_space - btrfs_token_item_size(&token, i); 3438 btrfs_set_token_item_offset(&token, i, push_space); 3439 } 3440 3441 btrfs_mark_buffer_dirty(left); 3442 if (right_nritems) 3443 btrfs_mark_buffer_dirty(right); 3444 else 3445 btrfs_clear_buffer_dirty(trans, 
right); 3446 3447 btrfs_item_key(right, &disk_key, 0); 3448 fixup_low_keys(path, &disk_key, 1); 3449 3450 /* then fixup the leaf pointer in the path */ 3451 if (path->slots[0] < push_items) { 3452 path->slots[0] += old_left_nritems; 3453 btrfs_tree_unlock(path->nodes[0]); 3454 free_extent_buffer(path->nodes[0]); 3455 path->nodes[0] = left; 3456 path->slots[1] -= 1; 3457 } else { 3458 btrfs_tree_unlock(left); 3459 free_extent_buffer(left); 3460 path->slots[0] -= push_items; 3461 } 3462 BUG_ON(path->slots[0] < 0); 3463 return ret; 3464 out: 3465 btrfs_tree_unlock(left); 3466 free_extent_buffer(left); 3467 return ret; 3468 } 3469 3470 /* 3471 * push some data in the path leaf to the left, trying to free up at 3472 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3473 * 3474 * max_slot can put a limit on how far into the leaf we'll push items. The 3475 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3476 * items 3477 */ 3478 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3479 *root, struct btrfs_path *path, int min_data_size, 3480 int data_size, int empty, u32 max_slot) 3481 { 3482 struct extent_buffer *right = path->nodes[0]; 3483 struct extent_buffer *left; 3484 int slot; 3485 int free_space; 3486 u32 right_nritems; 3487 int ret = 0; 3488 3489 slot = path->slots[1]; 3490 if (slot == 0) 3491 return 1; 3492 if (!path->nodes[1]) 3493 return 1; 3494 3495 right_nritems = btrfs_header_nritems(right); 3496 if (right_nritems == 0) 3497 return 1; 3498 3499 btrfs_assert_tree_write_locked(path->nodes[1]); 3500 3501 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3502 if (IS_ERR(left)) 3503 return PTR_ERR(left); 3504 3505 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3506 3507 free_space = btrfs_leaf_free_space(left); 3508 if (free_space < data_size) { 3509 ret = 1; 3510 goto out; 3511 } 3512 3513 ret = btrfs_cow_block(trans, root, left, 3514 path->nodes[1], slot - 1, &left, 3515 BTRFS_NESTING_LEFT_COW); 3516 if (ret) { 3517 /* we hit -ENOSPC, but it isn't fatal here */ 3518 if (ret == -ENOSPC) 3519 ret = 1; 3520 goto out; 3521 } 3522 3523 if (check_sibling_keys(left, right)) { 3524 ret = -EUCLEAN; 3525 btrfs_abort_transaction(trans, ret); 3526 goto out; 3527 } 3528 return __push_leaf_left(trans, path, min_data_size, empty, left, 3529 free_space, right_nritems, max_slot); 3530 out: 3531 btrfs_tree_unlock(left); 3532 free_extent_buffer(left); 3533 return ret; 3534 } 3535 3536 /* 3537 * split the path's leaf in two, making sure there is at least data_size 3538 * available for the resulting leaf level of the path. 
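 *
 * copy_for_split() below does the actual work: items [mid, nritems) of
 * @l are copied into the new @right leaf, @l is truncated to @mid items,
 * and the new leaf is linked into the parent through insert_ptr().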
3539 */ 3540 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3541 struct btrfs_path *path, 3542 struct extent_buffer *l, 3543 struct extent_buffer *right, 3544 int slot, int mid, int nritems) 3545 { 3546 struct btrfs_fs_info *fs_info = trans->fs_info; 3547 int data_copy_size; 3548 int rt_data_off; 3549 int i; 3550 struct btrfs_disk_key disk_key; 3551 struct btrfs_map_token token; 3552 3553 nritems = nritems - mid; 3554 btrfs_set_header_nritems(right, nritems); 3555 data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); 3556 3557 copy_leaf_items(right, l, 0, mid, nritems); 3558 3559 copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, 3560 leaf_data_end(l), data_copy_size); 3561 3562 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); 3563 3564 btrfs_init_map_token(&token, right); 3565 for (i = 0; i < nritems; i++) { 3566 u32 ioff; 3567 3568 ioff = btrfs_token_item_offset(&token, i); 3569 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); 3570 } 3571 3572 btrfs_set_header_nritems(l, mid); 3573 btrfs_item_key(right, &disk_key, 0); 3574 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3575 3576 btrfs_mark_buffer_dirty(right); 3577 btrfs_mark_buffer_dirty(l); 3578 BUG_ON(path->slots[0] != slot); 3579 3580 if (mid <= slot) { 3581 btrfs_tree_unlock(path->nodes[0]); 3582 free_extent_buffer(path->nodes[0]); 3583 path->nodes[0] = right; 3584 path->slots[0] -= mid; 3585 path->slots[1] += 1; 3586 } else { 3587 btrfs_tree_unlock(right); 3588 free_extent_buffer(right); 3589 } 3590 3591 BUG_ON(path->slots[0] < 0); 3592 } 3593 3594 /* 3595 * double splits happen when we need to insert a big item in the middle 3596 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3597 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3598 * A B C 3599 * 3600 * We avoid this by trying to push the items on either side of our target 3601 * into the adjacent leaves. If all goes well we can avoid the double split 3602 * completely. 3603 */ 3604 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3605 struct btrfs_root *root, 3606 struct btrfs_path *path, 3607 int data_size) 3608 { 3609 int ret; 3610 int progress = 0; 3611 int slot; 3612 u32 nritems; 3613 int space_needed = data_size; 3614 3615 slot = path->slots[0]; 3616 if (slot < btrfs_header_nritems(path->nodes[0])) 3617 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3618 3619 /* 3620 * try to push all the items after our slot into the 3621 * right leaf 3622 */ 3623 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3624 if (ret < 0) 3625 return ret; 3626 3627 if (ret == 0) 3628 progress++; 3629 3630 nritems = btrfs_header_nritems(path->nodes[0]); 3631 /* 3632 * our goal is to get our slot at the start or end of a leaf. 
If 3633 * we've done so we're done 3634 */ 3635 if (path->slots[0] == 0 || path->slots[0] == nritems) 3636 return 0; 3637 3638 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3639 return 0; 3640 3641 /* try to push all the items before our slot into the next leaf */ 3642 slot = path->slots[0]; 3643 space_needed = data_size; 3644 if (slot > 0) 3645 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3646 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3647 if (ret < 0) 3648 return ret; 3649 3650 if (ret == 0) 3651 progress++; 3652 3653 if (progress) 3654 return 0; 3655 return 1; 3656 } 3657 3658 /* 3659 * split the path's leaf in two, making sure there is at least data_size 3660 * available for the resulting leaf level of the path. 3661 * 3662 * returns 0 if all went well and < 0 on failure. 3663 */ 3664 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3665 struct btrfs_root *root, 3666 const struct btrfs_key *ins_key, 3667 struct btrfs_path *path, int data_size, 3668 int extend) 3669 { 3670 struct btrfs_disk_key disk_key; 3671 struct extent_buffer *l; 3672 u32 nritems; 3673 int mid; 3674 int slot; 3675 struct extent_buffer *right; 3676 struct btrfs_fs_info *fs_info = root->fs_info; 3677 int ret = 0; 3678 int wret; 3679 int split; 3680 int num_doubles = 0; 3681 int tried_avoid_double = 0; 3682 3683 l = path->nodes[0]; 3684 slot = path->slots[0]; 3685 if (extend && data_size + btrfs_item_size(l, slot) + 3686 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3687 return -EOVERFLOW; 3688 3689 /* first try to make some room by pushing left and right */ 3690 if (data_size && path->nodes[1]) { 3691 int space_needed = data_size; 3692 3693 if (slot < btrfs_header_nritems(l)) 3694 space_needed -= btrfs_leaf_free_space(l); 3695 3696 wret = push_leaf_right(trans, root, path, space_needed, 3697 space_needed, 0, 0); 3698 if (wret < 0) 3699 return wret; 3700 if (wret) { 3701 space_needed = data_size; 3702 if (slot > 0) 3703 space_needed -= btrfs_leaf_free_space(l); 3704 wret = push_leaf_left(trans, root, path, space_needed, 3705 space_needed, 0, (u32)-1); 3706 if (wret < 0) 3707 return wret; 3708 } 3709 l = path->nodes[0]; 3710 3711 /* did the pushes work? 
*/ 3712 if (btrfs_leaf_free_space(l) >= data_size) 3713 return 0; 3714 } 3715 3716 if (!path->nodes[1]) { 3717 ret = insert_new_root(trans, root, path, 1); 3718 if (ret) 3719 return ret; 3720 } 3721 again: 3722 split = 1; 3723 l = path->nodes[0]; 3724 slot = path->slots[0]; 3725 nritems = btrfs_header_nritems(l); 3726 mid = (nritems + 1) / 2; 3727 3728 if (mid <= slot) { 3729 if (nritems == 1 || 3730 leaf_space_used(l, mid, nritems - mid) + data_size > 3731 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3732 if (slot >= nritems) { 3733 split = 0; 3734 } else { 3735 mid = slot; 3736 if (mid != nritems && 3737 leaf_space_used(l, mid, nritems - mid) + 3738 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3739 if (data_size && !tried_avoid_double) 3740 goto push_for_double; 3741 split = 2; 3742 } 3743 } 3744 } 3745 } else { 3746 if (leaf_space_used(l, 0, mid) + data_size > 3747 BTRFS_LEAF_DATA_SIZE(fs_info)) { 3748 if (!extend && data_size && slot == 0) { 3749 split = 0; 3750 } else if ((extend || !data_size) && slot == 0) { 3751 mid = 1; 3752 } else { 3753 mid = slot; 3754 if (mid != nritems && 3755 leaf_space_used(l, mid, nritems - mid) + 3756 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 3757 if (data_size && !tried_avoid_double) 3758 goto push_for_double; 3759 split = 2; 3760 } 3761 } 3762 } 3763 } 3764 3765 if (split == 0) 3766 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3767 else 3768 btrfs_item_key(l, &disk_key, mid); 3769 3770 /* 3771 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double 3772 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES 3773 * subclasses, which is 8 at the time of this patch, and we've maxed it 3774 * out. In the future we could add a 3775 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just 3776 * use BTRFS_NESTING_NEW_ROOT. 3777 */ 3778 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 3779 &disk_key, 0, l->start, 0, 3780 num_doubles ? BTRFS_NESTING_NEW_ROOT : 3781 BTRFS_NESTING_SPLIT); 3782 if (IS_ERR(right)) 3783 return PTR_ERR(right); 3784 3785 root_add_used(root, fs_info->nodesize); 3786 3787 if (split == 0) { 3788 if (mid <= slot) { 3789 btrfs_set_header_nritems(right, 0); 3790 insert_ptr(trans, path, &disk_key, 3791 right->start, path->slots[1] + 1, 1); 3792 btrfs_tree_unlock(path->nodes[0]); 3793 free_extent_buffer(path->nodes[0]); 3794 path->nodes[0] = right; 3795 path->slots[0] = 0; 3796 path->slots[1] += 1; 3797 } else { 3798 btrfs_set_header_nritems(right, 0); 3799 insert_ptr(trans, path, &disk_key, 3800 right->start, path->slots[1], 1); 3801 btrfs_tree_unlock(path->nodes[0]); 3802 free_extent_buffer(path->nodes[0]); 3803 path->nodes[0] = right; 3804 path->slots[0] = 0; 3805 if (path->slots[1] == 0) 3806 fixup_low_keys(path, &disk_key, 1); 3807 } 3808 /* 3809 * We create a new leaf 'right' for the required ins_len and 3810 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 3811 * the content of ins_len to 'right'. 
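 *
 * That is why we return right below instead of falling through to
 * copy_for_split().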
3812 */ 3813 return ret; 3814 } 3815 3816 copy_for_split(trans, path, l, right, slot, mid, nritems); 3817 3818 if (split == 2) { 3819 BUG_ON(num_doubles != 0); 3820 num_doubles++; 3821 goto again; 3822 } 3823 3824 return 0; 3825 3826 push_for_double: 3827 push_for_double_split(trans, root, path, data_size); 3828 tried_avoid_double = 1; 3829 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3830 return 0; 3831 goto again; 3832 } 3833 3834 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3835 struct btrfs_root *root, 3836 struct btrfs_path *path, int ins_len) 3837 { 3838 struct btrfs_key key; 3839 struct extent_buffer *leaf; 3840 struct btrfs_file_extent_item *fi; 3841 u64 extent_len = 0; 3842 u32 item_size; 3843 int ret; 3844 3845 leaf = path->nodes[0]; 3846 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3847 3848 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3849 key.type != BTRFS_EXTENT_CSUM_KEY); 3850 3851 if (btrfs_leaf_free_space(leaf) >= ins_len) 3852 return 0; 3853 3854 item_size = btrfs_item_size(leaf, path->slots[0]); 3855 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3856 fi = btrfs_item_ptr(leaf, path->slots[0], 3857 struct btrfs_file_extent_item); 3858 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3859 } 3860 btrfs_release_path(path); 3861 3862 path->keep_locks = 1; 3863 path->search_for_split = 1; 3864 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3865 path->search_for_split = 0; 3866 if (ret > 0) 3867 ret = -EAGAIN; 3868 if (ret < 0) 3869 goto err; 3870 3871 ret = -EAGAIN; 3872 leaf = path->nodes[0]; 3873 /* if our item isn't there, return now */ 3874 if (item_size != btrfs_item_size(leaf, path->slots[0])) 3875 goto err; 3876 3877 /* the leaf has changed, it now has room. return now */ 3878 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3879 goto err; 3880 3881 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3882 fi = btrfs_item_ptr(leaf, path->slots[0], 3883 struct btrfs_file_extent_item); 3884 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3885 goto err; 3886 } 3887 3888 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3889 if (ret) 3890 goto err; 3891 3892 path->keep_locks = 0; 3893 btrfs_unlock_up_safe(path, 1); 3894 return 0; 3895 err: 3896 path->keep_locks = 0; 3897 return ret; 3898 } 3899 3900 static noinline int split_item(struct btrfs_path *path, 3901 const struct btrfs_key *new_key, 3902 unsigned long split_offset) 3903 { 3904 struct extent_buffer *leaf; 3905 int orig_slot, slot; 3906 char *buf; 3907 u32 nritems; 3908 u32 item_size; 3909 u32 orig_offset; 3910 struct btrfs_disk_key disk_key; 3911 3912 leaf = path->nodes[0]; 3913 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); 3914 3915 orig_slot = path->slots[0]; 3916 orig_offset = btrfs_item_offset(leaf, path->slots[0]); 3917 item_size = btrfs_item_size(leaf, path->slots[0]); 3918 3919 buf = kmalloc(item_size, GFP_NOFS); 3920 if (!buf) 3921 return -ENOMEM; 3922 3923 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3924 path->slots[0]), item_size); 3925 3926 slot = path->slots[0] + 1; 3927 nritems = btrfs_header_nritems(leaf); 3928 if (slot != nritems) { 3929 /* shift the items */ 3930 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); 3931 } 3932 3933 btrfs_cpu_key_to_disk(&disk_key, new_key); 3934 btrfs_set_item_key(leaf, &disk_key, slot); 3935 3936 btrfs_set_item_offset(leaf, slot, orig_offset); 3937 btrfs_set_item_size(leaf, slot, item_size - split_offset); 3938 3939 btrfs_set_item_offset(leaf, orig_slot, 3940 orig_offset 
+ item_size - split_offset); 3941 btrfs_set_item_size(leaf, orig_slot, split_offset); 3942 3943 btrfs_set_header_nritems(leaf, nritems + 1); 3944 3945 /* write the data for the start of the original item */ 3946 write_extent_buffer(leaf, buf, 3947 btrfs_item_ptr_offset(leaf, path->slots[0]), 3948 split_offset); 3949 3950 /* write the data for the new item */ 3951 write_extent_buffer(leaf, buf + split_offset, 3952 btrfs_item_ptr_offset(leaf, slot), 3953 item_size - split_offset); 3954 btrfs_mark_buffer_dirty(leaf); 3955 3956 BUG_ON(btrfs_leaf_free_space(leaf) < 0); 3957 kfree(buf); 3958 return 0; 3959 } 3960 3961 /* 3962 * This function splits a single item into two items, 3963 * giving 'new_key' to the new item and splitting the 3964 * old one at split_offset (from the start of the item). 3965 * 3966 * The path may be released by this operation. After 3967 * the split, the path is pointing to the old item. The 3968 * new item is going to be in the same node as the old one. 3969 * 3970 * Note, the item being split must be small enough to live alone on 3971 * a tree block with room for one extra struct btrfs_item 3972 * 3973 * This allows us to split the item in place, keeping a lock on the 3974 * leaf the entire time. 3975 */ 3976 int btrfs_split_item(struct btrfs_trans_handle *trans, 3977 struct btrfs_root *root, 3978 struct btrfs_path *path, 3979 const struct btrfs_key *new_key, 3980 unsigned long split_offset) 3981 { 3982 int ret; 3983 ret = setup_leaf_for_split(trans, root, path, 3984 sizeof(struct btrfs_item)); 3985 if (ret) 3986 return ret; 3987 3988 ret = split_item(path, new_key, split_offset); 3989 return ret; 3990 } 3991 3992 /* 3993 * make the item pointed to by the path smaller. new_size indicates 3994 * how small to make it, and from_end tells us if we just chop bytes 3995 * off the end of the item or if we shift the item to chop bytes off 3996 * the front. 3997 */ 3998 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) 3999 { 4000 int slot; 4001 struct extent_buffer *leaf; 4002 u32 nritems; 4003 unsigned int data_end; 4004 unsigned int old_data_start; 4005 unsigned int old_size; 4006 unsigned int size_diff; 4007 int i; 4008 struct btrfs_map_token token; 4009 4010 leaf = path->nodes[0]; 4011 slot = path->slots[0]; 4012 4013 old_size = btrfs_item_size(leaf, slot); 4014 if (old_size == new_size) 4015 return; 4016 4017 nritems = btrfs_header_nritems(leaf); 4018 data_end = leaf_data_end(leaf); 4019 4020 old_data_start = btrfs_item_offset(leaf, slot); 4021 4022 size_diff = old_size - new_size; 4023 4024 BUG_ON(slot < 0); 4025 BUG_ON(slot >= nritems); 4026 4027 /* 4028 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4029 */ 4030 /* first correct the data pointers */ 4031 btrfs_init_map_token(&token, leaf); 4032 for (i = slot; i < nritems; i++) { 4033 u32 ioff; 4034 4035 ioff = btrfs_token_item_offset(&token, i); 4036 btrfs_set_token_item_offset(&token, i, ioff + size_diff); 4037 } 4038 4039 /* shift the data */ 4040 if (from_end) { 4041 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4042 old_data_start + new_size - data_end); 4043 } else { 4044 struct btrfs_disk_key disk_key; 4045 u64 offset; 4046 4047 btrfs_item_key(leaf, &disk_key, slot); 4048 4049 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4050 unsigned long ptr; 4051 struct btrfs_file_extent_item *fi; 4052 4053 fi = btrfs_item_ptr(leaf, slot, 4054 struct btrfs_file_extent_item); 4055 fi = (struct btrfs_file_extent_item *)( 4056 (unsigned long)fi - size_diff); 4057 4058 if (btrfs_file_extent_type(leaf, fi) == 4059 BTRFS_FILE_EXTENT_INLINE) { 4060 ptr = btrfs_item_ptr_offset(leaf, slot); 4061 memmove_extent_buffer(leaf, ptr, 4062 (unsigned long)fi, 4063 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4064 } 4065 } 4066 4067 memmove_leaf_data(leaf, data_end + size_diff, data_end, 4068 old_data_start - data_end); 4069 4070 offset = btrfs_disk_key_offset(&disk_key); 4071 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4072 btrfs_set_item_key(leaf, &disk_key, slot); 4073 if (slot == 0) 4074 fixup_low_keys(path, &disk_key, 1); 4075 } 4076 4077 btrfs_set_item_size(leaf, slot, new_size); 4078 btrfs_mark_buffer_dirty(leaf); 4079 4080 if (btrfs_leaf_free_space(leaf) < 0) { 4081 btrfs_print_leaf(leaf); 4082 BUG(); 4083 } 4084 } 4085 4086 /* 4087 * make the item pointed to by the path bigger, data_size is the added size. 4088 */ 4089 void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 4090 { 4091 int slot; 4092 struct extent_buffer *leaf; 4093 u32 nritems; 4094 unsigned int data_end; 4095 unsigned int old_data; 4096 unsigned int old_size; 4097 int i; 4098 struct btrfs_map_token token; 4099 4100 leaf = path->nodes[0]; 4101 4102 nritems = btrfs_header_nritems(leaf); 4103 data_end = leaf_data_end(leaf); 4104 4105 if (btrfs_leaf_free_space(leaf) < data_size) { 4106 btrfs_print_leaf(leaf); 4107 BUG(); 4108 } 4109 slot = path->slots[0]; 4110 old_data = btrfs_item_data_end(leaf, slot); 4111 4112 BUG_ON(slot < 0); 4113 if (slot >= nritems) { 4114 btrfs_print_leaf(leaf); 4115 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 4116 slot, nritems); 4117 BUG(); 4118 } 4119 4120 /* 4121 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4122 */ 4123 /* first correct the data pointers */ 4124 btrfs_init_map_token(&token, leaf); 4125 for (i = slot; i < nritems; i++) { 4126 u32 ioff; 4127 4128 ioff = btrfs_token_item_offset(&token, i); 4129 btrfs_set_token_item_offset(&token, i, ioff - data_size); 4130 } 4131 4132 /* shift the data */ 4133 memmove_leaf_data(leaf, data_end - data_size, data_end, 4134 old_data - data_end); 4135 4136 data_end = old_data; 4137 old_size = btrfs_item_size(leaf, slot); 4138 btrfs_set_item_size(leaf, slot, old_size + data_size); 4139 btrfs_mark_buffer_dirty(leaf); 4140 4141 if (btrfs_leaf_free_space(leaf) < 0) { 4142 btrfs_print_leaf(leaf); 4143 BUG(); 4144 } 4145 } 4146 4147 /* 4148 * Make space in the node before inserting one or more items. 
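* The caller must have already reserved enough free space in the leaf for the whole batch; if there is not enough room this function prints the leaf and BUGs rather than failing gracefully.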
4149 * 4150 * @root: root we are inserting items to 4151 * @path: points to the leaf/slot where we are going to insert new items 4152 * @batch: information about the batch of items to insert 4153 * 4154 * Main purpose is to save stack depth by doing the bulk of the work in a 4155 * function that doesn't call btrfs_search_slot 4156 */ 4157 static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 4158 const struct btrfs_item_batch *batch) 4159 { 4160 struct btrfs_fs_info *fs_info = root->fs_info; 4161 int i; 4162 u32 nritems; 4163 unsigned int data_end; 4164 struct btrfs_disk_key disk_key; 4165 struct extent_buffer *leaf; 4166 int slot; 4167 struct btrfs_map_token token; 4168 u32 total_size; 4169 4170 /* 4171 * Before anything else, update keys in the parent and other ancestors 4172 * if needed, then release the write locks on them, so that other tasks 4173 * can use them while we modify the leaf. 4174 */ 4175 if (path->slots[0] == 0) { 4176 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); 4177 fixup_low_keys(path, &disk_key, 1); 4178 } 4179 btrfs_unlock_up_safe(path, 1); 4180 4181 leaf = path->nodes[0]; 4182 slot = path->slots[0]; 4183 4184 nritems = btrfs_header_nritems(leaf); 4185 data_end = leaf_data_end(leaf); 4186 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4187 4188 if (btrfs_leaf_free_space(leaf) < total_size) { 4189 btrfs_print_leaf(leaf); 4190 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4191 total_size, btrfs_leaf_free_space(leaf)); 4192 BUG(); 4193 } 4194 4195 btrfs_init_map_token(&token, leaf); 4196 if (slot != nritems) { 4197 unsigned int old_data = btrfs_item_data_end(leaf, slot); 4198 4199 if (old_data < data_end) { 4200 btrfs_print_leaf(leaf); 4201 btrfs_crit(fs_info, 4202 "item at slot %d with data offset %u beyond data end of leaf %u", 4203 slot, old_data, data_end); 4204 BUG(); 4205 } 4206 /* 4207 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4208 */ 4209 /* first correct the data pointers */ 4210 for (i = slot; i < nritems; i++) { 4211 u32 ioff; 4212 4213 ioff = btrfs_token_item_offset(&token, i); 4214 btrfs_set_token_item_offset(&token, i, 4215 ioff - batch->total_data_size); 4216 } 4217 /* shift the items */ 4218 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); 4219 4220 /* shift the data */ 4221 memmove_leaf_data(leaf, data_end - batch->total_data_size, 4222 data_end, old_data - data_end); 4223 data_end = old_data; 4224 } 4225 4226 /* setup the item for the new data */ 4227 for (i = 0; i < batch->nr; i++) { 4228 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); 4229 btrfs_set_item_key(leaf, &disk_key, slot + i); 4230 data_end -= batch->data_sizes[i]; 4231 btrfs_set_token_item_offset(&token, slot + i, data_end); 4232 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); 4233 } 4234 4235 btrfs_set_header_nritems(leaf, nritems + batch->nr); 4236 btrfs_mark_buffer_dirty(leaf); 4237 4238 if (btrfs_leaf_free_space(leaf) < 0) { 4239 btrfs_print_leaf(leaf); 4240 BUG(); 4241 } 4242 } 4243 4244 /* 4245 * Insert a new item into a leaf. 4246 * 4247 * @root: The root of the btree. 4248 * @path: A path pointing to the target leaf and slot. 4249 * @key: The key of the new item. 4250 * @data_size: The size of the data associated with the new key. 
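* This is a thin wrapper that builds a single-item btrfs_item_batch and hands it to setup_items_for_insert().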
4251 */ 4252 void btrfs_setup_item_for_insert(struct btrfs_root *root, 4253 struct btrfs_path *path, 4254 const struct btrfs_key *key, 4255 u32 data_size) 4256 { 4257 struct btrfs_item_batch batch; 4258 4259 batch.keys = key; 4260 batch.data_sizes = &data_size; 4261 batch.total_data_size = data_size; 4262 batch.nr = 1; 4263 4264 setup_items_for_insert(root, path, &batch); 4265 } 4266 4267 /* 4268 * Given a key and some data, insert items into the tree. 4269 * This does all the path init required, making room in the tree if needed. 4270 */ 4271 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4272 struct btrfs_root *root, 4273 struct btrfs_path *path, 4274 const struct btrfs_item_batch *batch) 4275 { 4276 int ret = 0; 4277 int slot; 4278 u32 total_size; 4279 4280 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); 4281 ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); 4282 if (ret == 0) 4283 return -EEXIST; 4284 if (ret < 0) 4285 return ret; 4286 4287 slot = path->slots[0]; 4288 BUG_ON(slot < 0); 4289 4290 setup_items_for_insert(root, path, batch); 4291 return 0; 4292 } 4293 4294 /* 4295 * Given a key and some data, insert an item into the tree. 4296 * This does all the path init required, making room in the tree if needed. 4297 */ 4298 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4299 const struct btrfs_key *cpu_key, void *data, 4300 u32 data_size) 4301 { 4302 int ret = 0; 4303 struct btrfs_path *path; 4304 struct extent_buffer *leaf; 4305 unsigned long ptr; 4306 4307 path = btrfs_alloc_path(); 4308 if (!path) 4309 return -ENOMEM; 4310 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4311 if (!ret) { 4312 leaf = path->nodes[0]; 4313 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4314 write_extent_buffer(leaf, data, ptr, data_size); 4315 btrfs_mark_buffer_dirty(leaf); 4316 } 4317 btrfs_free_path(path); 4318 return ret; 4319 } 4320 4321 /* 4322 * This function duplicates an item, giving 'new_key' to the new item. 4323 * It guarantees both items live in the same tree leaf and the new item is 4324 * contiguous with the original item. 4325 * 4326 * This allows us to split a file extent in place, keeping a lock on the leaf 4327 * the entire time. 4328 */ 4329 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4330 struct btrfs_root *root, 4331 struct btrfs_path *path, 4332 const struct btrfs_key *new_key) 4333 { 4334 struct extent_buffer *leaf; 4335 int ret; 4336 u32 item_size; 4337 4338 leaf = path->nodes[0]; 4339 item_size = btrfs_item_size(leaf, path->slots[0]); 4340 ret = setup_leaf_for_split(trans, root, path, 4341 item_size + sizeof(struct btrfs_item)); 4342 if (ret) 4343 return ret; 4344 4345 path->slots[0]++; 4346 btrfs_setup_item_for_insert(root, path, new_key, item_size); 4347 leaf = path->nodes[0]; 4348 memcpy_extent_buffer(leaf, 4349 btrfs_item_ptr_offset(leaf, path->slots[0]), 4350 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4351 item_size); 4352 return 0; 4353 } 4354 4355 /* 4356 * delete the pointer from a given node. 4357 * 4358 * the tree should have been previously balanced so the deletion does not 4359 * empty a node. 
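* The one exception is the root: if removing the last pointer would leave the root node empty, the root is simply turned into an empty leaf instead.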
4360 */ 4361 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 4362 int level, int slot) 4363 { 4364 struct extent_buffer *parent = path->nodes[level]; 4365 u32 nritems; 4366 int ret; 4367 4368 nritems = btrfs_header_nritems(parent); 4369 if (slot != nritems - 1) { 4370 if (level) { 4371 ret = btrfs_tree_mod_log_insert_move(parent, slot, 4372 slot + 1, nritems - slot - 1); 4373 BUG_ON(ret < 0); 4374 } 4375 memmove_extent_buffer(parent, 4376 btrfs_node_key_ptr_offset(parent, slot), 4377 btrfs_node_key_ptr_offset(parent, slot + 1), 4378 sizeof(struct btrfs_key_ptr) * 4379 (nritems - slot - 1)); 4380 } else if (level) { 4381 ret = btrfs_tree_mod_log_insert_key(parent, slot, 4382 BTRFS_MOD_LOG_KEY_REMOVE); 4383 BUG_ON(ret < 0); 4384 } 4385 4386 nritems--; 4387 btrfs_set_header_nritems(parent, nritems); 4388 if (nritems == 0 && parent == root->node) { 4389 BUG_ON(btrfs_header_level(root->node) != 1); 4390 /* just turn the root into a leaf and break */ 4391 btrfs_set_header_level(root->node, 0); 4392 } else if (slot == 0) { 4393 struct btrfs_disk_key disk_key; 4394 4395 btrfs_node_key(parent, &disk_key, 0); 4396 fixup_low_keys(path, &disk_key, level + 1); 4397 } 4398 btrfs_mark_buffer_dirty(parent); 4399 } 4400 4401 /* 4402 * a helper function to delete the leaf pointed to by path->slots[1] and 4403 * path->nodes[1]. 4404 * 4405 * This deletes the pointer in path->nodes[1] and frees the leaf 4406 * block extent. 4407 * 4408 * The path must have already been setup for deleting the leaf, including 4409 * all the proper balancing. path->nodes[1] must be locked. 4410 */ 4411 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, 4412 struct btrfs_root *root, 4413 struct btrfs_path *path, 4414 struct extent_buffer *leaf) 4415 { 4416 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4417 del_ptr(root, path, 1, path->slots[1]); 4418 4419 /* 4420 * btrfs_free_extent is expensive, we want to make sure we 4421 * aren't holding any locks when we call it 4422 */ 4423 btrfs_unlock_up_safe(path, 0); 4424 4425 root_sub_used(root, leaf->len); 4426 4427 atomic_inc(&leaf->refs); 4428 btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); 4429 free_extent_buffer_stale(leaf); 4430 } 4431 /* 4432 * delete the items at the leaf level in path. 
If that empties 4433 * the leaf, remove it from the tree 4434 */ 4435 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4436 struct btrfs_path *path, int slot, int nr) 4437 { 4438 struct btrfs_fs_info *fs_info = root->fs_info; 4439 struct extent_buffer *leaf; 4440 int ret = 0; 4441 int wret; 4442 u32 nritems; 4443 4444 leaf = path->nodes[0]; 4445 nritems = btrfs_header_nritems(leaf); 4446 4447 if (slot + nr != nritems) { 4448 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); 4449 const int data_end = leaf_data_end(leaf); 4450 struct btrfs_map_token token; 4451 u32 dsize = 0; 4452 int i; 4453 4454 for (i = 0; i < nr; i++) 4455 dsize += btrfs_item_size(leaf, slot + i); 4456 4457 memmove_leaf_data(leaf, data_end + dsize, data_end, 4458 last_off - data_end); 4459 4460 btrfs_init_map_token(&token, leaf); 4461 for (i = slot + nr; i < nritems; i++) { 4462 u32 ioff; 4463 4464 ioff = btrfs_token_item_offset(&token, i); 4465 btrfs_set_token_item_offset(&token, i, ioff + dsize); 4466 } 4467 4468 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); 4469 } 4470 btrfs_set_header_nritems(leaf, nritems - nr); 4471 nritems -= nr; 4472 4473 /* delete the leaf if we've emptied it */ 4474 if (nritems == 0) { 4475 if (leaf == root->node) { 4476 btrfs_set_header_level(leaf, 0); 4477 } else { 4478 btrfs_clear_buffer_dirty(trans, leaf); 4479 btrfs_del_leaf(trans, root, path, leaf); 4480 } 4481 } else { 4482 int used = leaf_space_used(leaf, 0, nritems); 4483 if (slot == 0) { 4484 struct btrfs_disk_key disk_key; 4485 4486 btrfs_item_key(leaf, &disk_key, 0); 4487 fixup_low_keys(path, &disk_key, 1); 4488 } 4489 4490 /* 4491 * Try to delete the leaf if it is mostly empty. We do this by 4492 * trying to move all its items into its left and right neighbours. 4493 * If we can't move all the items, then we don't delete it - it's 4494 * not ideal, but future insertions might fill the leaf with more 4495 * items, or items from other leaves might be moved later into our 4496 * leaf due to deletions on those leaves. 4497 */ 4498 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4499 u32 min_push_space; 4500 4501 /* push_leaf_left fixes the path. 4502 * make sure the path still points to our leaf 4503 * for possible call to del_ptr below 4504 */ 4505 slot = path->slots[1]; 4506 atomic_inc(&leaf->refs); 4507 /* 4508 * We want to be able to at least push one item to the 4509 * left neighbour leaf, and that's the first item. 4510 */ 4511 min_push_space = sizeof(struct btrfs_item) + 4512 btrfs_item_size(leaf, 0); 4513 wret = push_leaf_left(trans, root, path, 0, 4514 min_push_space, 1, (u32)-1); 4515 if (wret < 0 && wret != -ENOSPC) 4516 ret = wret; 4517 4518 if (path->nodes[0] == leaf && 4519 btrfs_header_nritems(leaf)) { 4520 /* 4521 * If we were not able to push all items from our 4522 * leaf to its left neighbour, then attempt to 4523 * either push all the remaining items to the 4524 * right neighbour or none. There's no advantage 4525 * in pushing only some items, instead of all, as 4526 * it's pointless to end up with a leaf having 4527 * too few items while the neighbours can be full 4528 * or nearly full. 
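* If both pushes together end up emptying the leaf, it is removed from the tree by btrfs_del_leaf() just below.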
4529 */ 4530 nritems = btrfs_header_nritems(leaf); 4531 min_push_space = leaf_space_used(leaf, 0, nritems); 4532 wret = push_leaf_right(trans, root, path, 0, 4533 min_push_space, 1, 0); 4534 if (wret < 0 && wret != -ENOSPC) 4535 ret = wret; 4536 } 4537 4538 if (btrfs_header_nritems(leaf) == 0) { 4539 path->slots[1] = slot; 4540 btrfs_del_leaf(trans, root, path, leaf); 4541 free_extent_buffer(leaf); 4542 ret = 0; 4543 } else { 4544 /* if we're still in the path, make sure 4545 * we're dirty. Otherwise, one of the 4546 * push_leaf functions must have already 4547 * dirtied this buffer 4548 */ 4549 if (path->nodes[0] == leaf) 4550 btrfs_mark_buffer_dirty(leaf); 4551 free_extent_buffer(leaf); 4552 } 4553 } else { 4554 btrfs_mark_buffer_dirty(leaf); 4555 } 4556 } 4557 return ret; 4558 } 4559 4560 /* 4561 * A helper function to walk down the tree starting at min_key, and looking 4562 * for nodes or leaves that have a minimum transaction id. 4563 * This is used by the btree defrag code, and tree logging. 4564 * 4565 * This does not cow, but it does stuff the starting key it finds back 4566 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4567 * key and get a writable path. 4568 * 4569 * This honors path->lowest_level to prevent descent past a given level 4570 * of the tree. 4571 * 4572 * min_trans indicates the oldest transaction that you are interested 4573 * in walking through. Any nodes or leaves older than min_trans are 4574 * skipped over (without reading them). 4575 * 4576 * returns zero if something useful was found, < 0 on error and 1 if there 4577 * was nothing in the tree that matched the search criteria. 4578 */ 4579 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4580 struct btrfs_path *path, 4581 u64 min_trans) 4582 { 4583 struct extent_buffer *cur; 4584 struct btrfs_key found_key; 4585 int slot; 4586 int sret; 4587 u32 nritems; 4588 int level; 4589 int ret = 1; 4590 int keep_locks = path->keep_locks; 4591 4592 ASSERT(!path->nowait); 4593 path->keep_locks = 1; 4594 again: 4595 cur = btrfs_read_lock_root_node(root); 4596 level = btrfs_header_level(cur); 4597 WARN_ON(path->nodes[level]); 4598 path->nodes[level] = cur; 4599 path->locks[level] = BTRFS_READ_LOCK; 4600 4601 if (btrfs_header_generation(cur) < min_trans) { 4602 ret = 1; 4603 goto out; 4604 } 4605 while (1) { 4606 nritems = btrfs_header_nritems(cur); 4607 level = btrfs_header_level(cur); 4608 sret = btrfs_bin_search(cur, 0, min_key, &slot); 4609 if (sret < 0) { 4610 ret = sret; 4611 goto out; 4612 } 4613 4614 /* at the lowest level, we're done, setup the path and exit */ 4615 if (level == path->lowest_level) { 4616 if (slot >= nritems) 4617 goto find_next_key; 4618 ret = 0; 4619 path->slots[level] = slot; 4620 btrfs_item_key_to_cpu(cur, &found_key, slot); 4621 goto out; 4622 } 4623 if (sret && slot > 0) 4624 slot--; 4625 /* 4626 * check this node pointer against the min_trans parameter. 4627 * If it is too old, skip to the next one. 
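* Node pointers record the generation of the child block they point to, so stale subtrees can be skipped without reading them from disk.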
4628 */ 4629 while (slot < nritems) { 4630 u64 gen; 4631 4632 gen = btrfs_node_ptr_generation(cur, slot); 4633 if (gen < min_trans) { 4634 slot++; 4635 continue; 4636 } 4637 break; 4638 } 4639 find_next_key: 4640 /* 4641 * we didn't find a candidate key in this node, walk forward 4642 * and find another one 4643 */ 4644 if (slot >= nritems) { 4645 path->slots[level] = slot; 4646 sret = btrfs_find_next_key(root, path, min_key, level, 4647 min_trans); 4648 if (sret == 0) { 4649 btrfs_release_path(path); 4650 goto again; 4651 } else { 4652 goto out; 4653 } 4654 } 4655 /* save our key for returning back */ 4656 btrfs_node_key_to_cpu(cur, &found_key, slot); 4657 path->slots[level] = slot; 4658 if (level == path->lowest_level) { 4659 ret = 0; 4660 goto out; 4661 } 4662 cur = btrfs_read_node_slot(cur, slot); 4663 if (IS_ERR(cur)) { 4664 ret = PTR_ERR(cur); 4665 goto out; 4666 } 4667 4668 btrfs_tree_read_lock(cur); 4669 4670 path->locks[level - 1] = BTRFS_READ_LOCK; 4671 path->nodes[level - 1] = cur; 4672 unlock_up(path, level, 1, 0, NULL); 4673 } 4674 out: 4675 path->keep_locks = keep_locks; 4676 if (ret == 0) { 4677 btrfs_unlock_up_safe(path, path->lowest_level + 1); 4678 memcpy(min_key, &found_key, sizeof(found_key)); 4679 } 4680 return ret; 4681 } 4682 4683 /* 4684 * this is similar to btrfs_next_leaf, but does not try to preserve 4685 * and fixup the path. It looks for and returns the next key in the 4686 * tree based on the current path and the min_trans parameters. 4687 * 4688 * 0 is returned if another key is found, < 0 if there are any errors 4689 * and 1 is returned if there are no higher keys in the tree 4690 * 4691 * path->keep_locks should be set to 1 on the search made before 4692 * calling this function. 4693 */ 4694 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4695 struct btrfs_key *key, int level, u64 min_trans) 4696 { 4697 int slot; 4698 struct extent_buffer *c; 4699 4700 WARN_ON(!path->keep_locks && !path->skip_locking); 4701 while (level < BTRFS_MAX_LEVEL) { 4702 if (!path->nodes[level]) 4703 return 1; 4704 4705 slot = path->slots[level] + 1; 4706 c = path->nodes[level]; 4707 next: 4708 if (slot >= btrfs_header_nritems(c)) { 4709 int ret; 4710 int orig_lowest; 4711 struct btrfs_key cur_key; 4712 if (level + 1 >= BTRFS_MAX_LEVEL || 4713 !path->nodes[level + 1]) 4714 return 1; 4715 4716 if (path->locks[level + 1] || path->skip_locking) { 4717 level++; 4718 continue; 4719 } 4720 4721 slot = btrfs_header_nritems(c) - 1; 4722 if (level == 0) 4723 btrfs_item_key_to_cpu(c, &cur_key, slot); 4724 else 4725 btrfs_node_key_to_cpu(c, &cur_key, slot); 4726 4727 orig_lowest = path->lowest_level; 4728 btrfs_release_path(path); 4729 path->lowest_level = level; 4730 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4731 0, 0); 4732 path->lowest_level = orig_lowest; 4733 if (ret < 0) 4734 return ret; 4735 4736 c = path->nodes[level]; 4737 slot = path->slots[level]; 4738 if (ret == 0) 4739 slot++; 4740 goto next; 4741 } 4742 4743 if (level == 0) 4744 btrfs_item_key_to_cpu(c, key, slot); 4745 else { 4746 u64 gen = btrfs_node_ptr_generation(c, slot); 4747 4748 if (gen < min_trans) { 4749 slot++; 4750 goto next; 4751 } 4752 btrfs_node_key_to_cpu(c, key, slot); 4753 } 4754 return 0; 4755 } 4756 return 1; 4757 } 4758 4759 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 4760 u64 time_seq) 4761 { 4762 int slot; 4763 int level; 4764 struct extent_buffer *c; 4765 struct extent_buffer *next; 4766 struct btrfs_fs_info *fs_info = root->fs_info; 4767 
struct btrfs_key key; 4768 bool need_commit_sem = false; 4769 u32 nritems; 4770 int ret; 4771 int i; 4772 4773 /* 4774 * The nowait semantics are used only for write paths, where we don't 4775 * use the tree mod log and sequence numbers. 4776 */ 4777 if (time_seq) 4778 ASSERT(!path->nowait); 4779 4780 nritems = btrfs_header_nritems(path->nodes[0]); 4781 if (nritems == 0) 4782 return 1; 4783 4784 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4785 again: 4786 level = 1; 4787 next = NULL; 4788 btrfs_release_path(path); 4789 4790 path->keep_locks = 1; 4791 4792 if (time_seq) { 4793 ret = btrfs_search_old_slot(root, &key, path, time_seq); 4794 } else { 4795 if (path->need_commit_sem) { 4796 path->need_commit_sem = 0; 4797 need_commit_sem = true; 4798 if (path->nowait) { 4799 if (!down_read_trylock(&fs_info->commit_root_sem)) { 4800 ret = -EAGAIN; 4801 goto done; 4802 } 4803 } else { 4804 down_read(&fs_info->commit_root_sem); 4805 } 4806 } 4807 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4808 } 4809 path->keep_locks = 0; 4810 4811 if (ret < 0) 4812 goto done; 4813 4814 nritems = btrfs_header_nritems(path->nodes[0]); 4815 /* 4816 * by releasing the path above we dropped all our locks. A balance 4817 * could have added more items next to the key that used to be 4818 * at the very end of the block. So, check again here and 4819 * advance the path if there are now more items available. 4820 */ 4821 if (nritems > 0 && path->slots[0] < nritems - 1) { 4822 if (ret == 0) 4823 path->slots[0]++; 4824 ret = 0; 4825 goto done; 4826 } 4827 /* 4828 * So the above check misses one case: 4829 * - after releasing the path above, someone has removed the item that 4830 * used to be at the very end of the block, and balance between leaves 4831 * gets another one with bigger key.offset to replace it. 4832 * 4833 * This one should be returned as well, or we can get leaf corruption 4834 * later (esp. in __btrfs_drop_extents()). 4835 * 4836 * And a bit more explanation about this check: 4837 * with ret > 0, the key isn't found, the path points to the slot 4838 * where it should be inserted, so the path->slots[0] item must be the 4839 * bigger one. 4840 */ 4841 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { 4842 ret = 0; 4843 goto done; 4844 } 4845 4846 while (level < BTRFS_MAX_LEVEL) { 4847 if (!path->nodes[level]) { 4848 ret = 1; 4849 goto done; 4850 } 4851 4852 slot = path->slots[level] + 1; 4853 c = path->nodes[level]; 4854 if (slot >= btrfs_header_nritems(c)) { 4855 level++; 4856 if (level == BTRFS_MAX_LEVEL) { 4857 ret = 1; 4858 goto done; 4859 } 4860 continue; 4861 } 4862 4863 4864 /* 4865 * Our current level is where we're going to start from, and to 4866 * make sure lockdep doesn't complain we need to drop our locks 4867 * and nodes from 0 to our current level. 
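* The levels below the new starting level are re-read and re-locked as we walk back down in the descent loop further below.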
4868 */ 4869 for (i = 0; i < level; i++) { 4870 if (path->locks[level]) { 4871 btrfs_tree_read_unlock(path->nodes[i]); 4872 path->locks[i] = 0; 4873 } 4874 free_extent_buffer(path->nodes[i]); 4875 path->nodes[i] = NULL; 4876 } 4877 4878 next = c; 4879 ret = read_block_for_search(root, path, &next, level, 4880 slot, &key); 4881 if (ret == -EAGAIN && !path->nowait) 4882 goto again; 4883 4884 if (ret < 0) { 4885 btrfs_release_path(path); 4886 goto done; 4887 } 4888 4889 if (!path->skip_locking) { 4890 ret = btrfs_try_tree_read_lock(next); 4891 if (!ret && path->nowait) { 4892 ret = -EAGAIN; 4893 goto done; 4894 } 4895 if (!ret && time_seq) { 4896 /* 4897 * If we don't get the lock, we may be racing 4898 * with push_leaf_left, holding that lock while 4899 * itself waiting for the leaf we've currently 4900 * locked. To solve this situation, we give up 4901 * on our lock and cycle. 4902 */ 4903 free_extent_buffer(next); 4904 btrfs_release_path(path); 4905 cond_resched(); 4906 goto again; 4907 } 4908 if (!ret) 4909 btrfs_tree_read_lock(next); 4910 } 4911 break; 4912 } 4913 path->slots[level] = slot; 4914 while (1) { 4915 level--; 4916 path->nodes[level] = next; 4917 path->slots[level] = 0; 4918 if (!path->skip_locking) 4919 path->locks[level] = BTRFS_READ_LOCK; 4920 if (!level) 4921 break; 4922 4923 ret = read_block_for_search(root, path, &next, level, 4924 0, &key); 4925 if (ret == -EAGAIN && !path->nowait) 4926 goto again; 4927 4928 if (ret < 0) { 4929 btrfs_release_path(path); 4930 goto done; 4931 } 4932 4933 if (!path->skip_locking) { 4934 if (path->nowait) { 4935 if (!btrfs_try_tree_read_lock(next)) { 4936 ret = -EAGAIN; 4937 goto done; 4938 } 4939 } else { 4940 btrfs_tree_read_lock(next); 4941 } 4942 } 4943 } 4944 ret = 0; 4945 done: 4946 unlock_up(path, 0, 1, 0, NULL); 4947 if (need_commit_sem) { 4948 int ret2; 4949 4950 path->need_commit_sem = 1; 4951 ret2 = finish_need_commit_sem_search(path); 4952 up_read(&fs_info->commit_root_sem); 4953 if (ret2) 4954 ret = ret2; 4955 } 4956 4957 return ret; 4958 } 4959 4960 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) 4961 { 4962 path->slots[0]++; 4963 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 4964 return btrfs_next_old_leaf(root, path, time_seq); 4965 return 0; 4966 } 4967 4968 /* 4969 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 4970 * searching until it gets past min_objectid or finds an item of 'type'. 4971 * 4972 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4973 */ 4974 int btrfs_previous_item(struct btrfs_root *root, 4975 struct btrfs_path *path, u64 min_objectid, 4976 int type) 4977 { 4978 struct btrfs_key found_key; 4979 struct extent_buffer *leaf; 4980 u32 nritems; 4981 int ret; 4982 4983 while (1) { 4984 if (path->slots[0] == 0) { 4985 ret = btrfs_prev_leaf(root, path); 4986 if (ret != 0) 4987 return ret; 4988 } else { 4989 path->slots[0]--; 4990 } 4991 leaf = path->nodes[0]; 4992 nritems = btrfs_header_nritems(leaf); 4993 if (nritems == 0) 4994 return 1; 4995 if (path->slots[0] == nritems) 4996 path->slots[0]--; 4997 4998 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4999 if (found_key.objectid < min_objectid) 5000 break; 5001 if (found_key.type == type) 5002 return 0; 5003 if (found_key.objectid == min_objectid && 5004 found_key.type < type) 5005 break; 5006 } 5007 return 1; 5008 } 5009 5010 /* 5011 * search in extent tree to find a previous Metadata/Data extent item with 5012 * min objectid. 
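* Both BTRFS_EXTENT_ITEM_KEY and BTRFS_METADATA_ITEM_KEY items are accepted as a match.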
5013 * 5014 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5015 */ 5016 int btrfs_previous_extent_item(struct btrfs_root *root, 5017 struct btrfs_path *path, u64 min_objectid) 5018 { 5019 struct btrfs_key found_key; 5020 struct extent_buffer *leaf; 5021 u32 nritems; 5022 int ret; 5023 5024 while (1) { 5025 if (path->slots[0] == 0) { 5026 ret = btrfs_prev_leaf(root, path); 5027 if (ret != 0) 5028 return ret; 5029 } else { 5030 path->slots[0]--; 5031 } 5032 leaf = path->nodes[0]; 5033 nritems = btrfs_header_nritems(leaf); 5034 if (nritems == 0) 5035 return 1; 5036 if (path->slots[0] == nritems) 5037 path->slots[0]--; 5038 5039 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5040 if (found_key.objectid < min_objectid) 5041 break; 5042 if (found_key.type == BTRFS_EXTENT_ITEM_KEY || 5043 found_key.type == BTRFS_METADATA_ITEM_KEY) 5044 return 0; 5045 if (found_key.objectid == min_objectid && 5046 found_key.type < BTRFS_EXTENT_ITEM_KEY) 5047 break; 5048 } 5049 return 1; 5050 } 5051 5052 int __init btrfs_ctree_init(void) 5053 { 5054 btrfs_path_cachep = kmem_cache_create("btrfs_path", 5055 sizeof(struct btrfs_path), 0, 5056 SLAB_MEM_SPREAD, NULL); 5057 if (!btrfs_path_cachep) 5058 return -ENOMEM; 5059 return 0; 5060 } 5061 5062 void __cold btrfs_ctree_exit(void) 5063 { 5064 kmem_cache_destroy(btrfs_path_cachep); 5065 } 5066