// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);

	/* csum type is validated at mount time */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
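
/*
 * Typical path lifecycle (an illustrative sketch, not part of the original
 * file; the key values and root are placeholders):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key = { .objectid = 256,
 *				 .type = BTRFS_INODE_ITEM_KEY, .offset = 0 };
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		... item found at path->nodes[0], path->slots[0] ...
 *	btrfs_free_path(path);		(this also releases the path)
 */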

/*
 * safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list. The transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
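
/*
 * Usage sketch for btrfs_root_node() (illustrative, not from the original
 * file): the caller owns one reference on the returned buffer and must drop
 * it, and must not assume the buffer is still the root afterwards.
 *
 *	struct extent_buffer *eb = btrfs_root_node(root);
 *
 *	... inspect eb, e.g. btrfs_header_level(eb) ...
 *	free_extent_buffer(eb);
 */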

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error
 * code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
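
/*
 * Worked example (illustrative numbers, not from the original file): if a
 * subvolume's last snapshot was taken in generation 100, a block written in
 * generation 90 may still be referenced by that snapshot and therefore counts
 * as potentially shared, while a block written in generation 101 that wasn't
 * produced by relocation is known to be exclusive to this tree.
 */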

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop
	 * tree) are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size,
					  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size, nest);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *parent, int parent_slot,
				      struct extent_buffer **cow_ret,
				      u64 search_start, u64 empty_size,
				      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
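
/*
 * Illustrative reading of should_cow_block() (not part of the original file):
 * a block already COWed earlier in this transaction has
 * btrfs_header_generation(buf) == trans->transid and no WRITTEN flag, so a
 * second modification in the same transaction can reuse it in place; once the
 * block has been written out (WRITTEN flag set), the next modification must
 * COW it again.
 */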

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
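
/*
 * Example of the resulting key order (illustrative, not from the original
 * file): keys sort as the tuple (objectid, type, offset), so
 *
 *	(256, BTRFS_INODE_ITEM_KEY, 0)
 *	  < (256, BTRFS_DIR_ITEM_KEY, <name hash>)
 *	  < (257, BTRFS_INODE_ITEM_KEY, 0)
 *
 * because BTRFS_INODE_ITEM_KEY (1) is smaller than BTRFS_DIR_ITEM_KEY (84),
 * and objectid is compared before type.
 */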

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer. The items start at offset p,
 * and they are item_size apart. There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	if (btrfs_header_level(eb) == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}
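
/*
 * Return convention sketch (illustrative, not from the original file): for a
 * leaf holding keys A < B < C in slots 0..2, searching for B returns 0 with
 * *slot == 1 (exact match); searching for a key between B and C returns 1
 * with *slot == 2 (the insertion point); searching for a key greater than C
 * returns 1 with *slot == 3, i.e. equal to nritems.
 */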

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/*
 * given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}
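
/*
 * Usage sketch (illustrative, not from the original file): the caller owns
 * the reference on success and must drop it; errors come back as ERR_PTR
 * values.
 *
 *	struct extent_buffer *child = btrfs_read_node_slot(parent, slot);
 *
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 *	... use child, locking it first if needed ...
 *	free_extent_buffer(child);
 */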

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full. This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	eb = find_extent_buffer(fs_info, search);
	if (eb) {
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}
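
/*
 * Usage sketch (illustrative, not from the original file): callers opt into
 * readahead by setting path->reada before searching, e.g. for a forward scan
 * over many leaves:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	path->reada = READA_FORWARD;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...
 */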

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;

			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * helper function for btrfs_search_slot. The goal is to find a block
 * in cache without setting the path to blocking. If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada. -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read. Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
			      gen, parent_level - 1, &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date. Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}

/*
 * helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}
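
/*
 * Usage sketch (illustrative, not from the original file): look up the first
 * item of a given type for an objectid, tolerating a non-exact offset match.
 *
 *	struct btrfs_key found;
 *	int ret = btrfs_find_item(fs_root, path, ino, 0,
 *				  BTRFS_INODE_REF_KEY, &found);
 *
 * 0 means an item with matching objectid and type was positioned at
 * path->slots[0], 1 means no such item exists, < 0 is an error.
 */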
1621 /* 1622 * The commit roots are read only so we always do read locks, 1623 * and we always must hold the commit_root_sem when doing 1624 * searches on them, the only exception is send where we don't 1625 * want to block transaction commits for a long time, so 1626 * we need to clone the commit root in order to avoid races 1627 * with transaction commits that create a snapshot of one of 1628 * the roots used by a send operation. 1629 */ 1630 if (p->need_commit_sem) { 1631 down_read(&fs_info->commit_root_sem); 1632 b = btrfs_clone_extent_buffer(root->commit_root); 1633 up_read(&fs_info->commit_root_sem); 1634 if (!b) 1635 return ERR_PTR(-ENOMEM); 1636 1637 } else { 1638 b = root->commit_root; 1639 atomic_inc(&b->refs); 1640 } 1641 level = btrfs_header_level(b); 1642 /* 1643 * Ensure that all callers have set skip_locking when 1644 * p->search_commit_root = 1. 1645 */ 1646 ASSERT(p->skip_locking == 1); 1647 1648 goto out; 1649 } 1650 1651 if (p->skip_locking) { 1652 b = btrfs_root_node(root); 1653 level = btrfs_header_level(b); 1654 goto out; 1655 } 1656 1657 /* 1658 * If the level is set to maximum, we can skip trying to get the read 1659 * lock. 1660 */ 1661 if (write_lock_level < BTRFS_MAX_LEVEL) { 1662 /* 1663 * We don't know the level of the root node until we actually 1664 * have it read locked 1665 */ 1666 b = btrfs_read_lock_root_node(root); 1667 level = btrfs_header_level(b); 1668 if (level > write_lock_level) 1669 goto out; 1670 1671 /* Whoops, must trade for write lock */ 1672 btrfs_tree_read_unlock(b); 1673 free_extent_buffer(b); 1674 } 1675 1676 b = btrfs_lock_root_node(root); 1677 root_lock = BTRFS_WRITE_LOCK; 1678 1679 /* The level might have changed, check again */ 1680 level = btrfs_header_level(b); 1681 1682 out: 1683 p->nodes[level] = b; 1684 if (!p->skip_locking) 1685 p->locks[level] = root_lock; 1686 /* 1687 * Callers are responsible for dropping b's references. 1688 */ 1689 return b; 1690 } 1691 1692 1693 /* 1694 * btrfs_search_slot - look for a key in a tree and perform necessary 1695 * modifications to preserve tree invariants. 1696 * 1697 * @trans: Handle of transaction, used when modifying the tree 1698 * @p: Holds all btree nodes along the search path 1699 * @root: The root node of the tree 1700 * @key: The key we are looking for 1701 * @ins_len: Indicates purpose of search: 1702 * >0 for inserts it's size of item inserted (*) 1703 * <0 for deletions 1704 * 0 for plain searches, not modifying the tree 1705 * 1706 * (*) If size of item inserted doesn't include 1707 * sizeof(struct btrfs_item), then p->search_for_extension must 1708 * be set. 1709 * @cow: boolean should CoW operations be performed. Must always be 1 1710 * when modifying the tree. 1711 * 1712 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 

/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search:
 *              >0  for inserts it's size of item inserted (*)
 *              <0  for deletions
 *               0  for plain searches, not modifying the tree
 *
 *              (*) If size of item inserted doesn't include
 *              sizeof(struct btrfs_item), then p->search_for_extension must
 *              be set.
 * @cow:	boolean should CoW operations be performed. Must always be 1
 *		when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers. Make sure we keep write
		 * locks for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		/*
		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
		 * we can safely assume the target key will always be in slot 0
		 * on lower levels due to the invariants BTRFS' btree provides,
		 * namely that a btrfs_key_ptr entry always points to the
		 * lowest key in the child node, thus we can skip searching
		 * lower levels
		 */
		if (prev_cmp == 0) {
			slot = 0;
			ret = 0;
		} else {
			ret = btrfs_bin_search(b, key, &slot);
			prev_cmp = ret;
			if (ret < 0)
				goto done;
		}

		if (level == 0) {
			p->slots[level] = slot;
			/*
			 * Item key already exists. In this case, if we are
			 * allowed to insert the item (for example, in dir_item
			 * case, item key collision is allowed), it will be
			 * merged with the original item. Only the item size
			 * grows, no new btrfs item will be added. If
			 * search_for_extension is not set, ins_len already
			 * accounts the size btrfs_item, deduct it here so leaf
			 * space check will be correct.
			 */
			if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
				ASSERT(ins_len >= sizeof(struct btrfs_item));
				ins_len -= sizeof(struct btrfs_item);
			}
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				btrfs_tree_read_lock(b);
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
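
/*
 * Usage sketch (illustrative, not from the original file): a read-only scan
 * of all items with a given objectid, advancing across leaf boundaries with
 * btrfs_next_leaf().
 *
 *	key.objectid = ino; key.type = 0; key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (1) {
 *		struct extent_buffer *leaf = path->nodes[0];
 *
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)
 *				break;
 *			continue;
 *		}
 *		btrfs_item_key_to_cpu(leaf, &found, path->slots[0]);
 *		if (found.objectid != ino)
 *			break;
 *		... process the item ...
 *		path->slots[0]++;
 *	}
 */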

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree,
 * as denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
It uses the
1969 * current state of the tree together with the operations recorded in the tree
1970 * modification log to search for the key in a previous version of this tree, as
1971 * denoted by the time_seq parameter.
1972 *
1973 * Naturally, there is no support for insert, delete or cow operations.
1974 *
1975 * The resulting path and return value will be set up as if we called
1976 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1977 */
1978 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
1979 struct btrfs_path *p, u64 time_seq)
1980 {
1981 struct btrfs_fs_info *fs_info = root->fs_info;
1982 struct extent_buffer *b;
1983 int slot;
1984 int ret;
1985 int err;
1986 int level;
1987 int lowest_unlock = 1;
1988 u8 lowest_level = 0;
1989
1990 lowest_level = p->lowest_level;
1991 WARN_ON(p->nodes[0] != NULL);
1992
1993 if (p->search_commit_root) {
1994 BUG_ON(time_seq);
1995 return btrfs_search_slot(NULL, root, key, p, 0, 0);
1996 }
1997
1998 again:
1999 b = btrfs_get_old_root(root, time_seq);
2000 if (!b) {
2001 ret = -EIO;
2002 goto done;
2003 }
2004 level = btrfs_header_level(b);
2005 p->locks[level] = BTRFS_READ_LOCK;
2006
2007 while (b) {
2008 int dec = 0;
2009
2010 level = btrfs_header_level(b);
2011 p->nodes[level] = b;
2012
2013 /*
2014 * we have a lock on b and as long as we aren't changing
2015 * the tree, there is no way for the items in b to change.
2016 * It is safe to drop the lock on our parent before we
2017 * go through the expensive btree search on b.
2018 */
2019 btrfs_unlock_up_safe(p, level + 1);
2020
2021 ret = btrfs_bin_search(b, key, &slot);
2022 if (ret < 0)
2023 goto done;
2024
2025 if (level == 0) {
2026 p->slots[level] = slot;
2027 unlock_up(p, level, lowest_unlock, 0, NULL);
2028 goto done;
2029 }
2030
2031 if (ret && slot > 0) {
2032 dec = 1;
2033 slot--;
2034 }
2035 p->slots[level] = slot;
2036 unlock_up(p, level, lowest_unlock, 0, NULL);
2037
2038 if (level == lowest_level) {
2039 if (dec)
2040 p->slots[level]++;
2041 goto done;
2042 }
2043
2044 err = read_block_for_search(root, p, &b, level, slot, key);
2045 if (err == -EAGAIN)
2046 goto again;
2047 if (err) {
2048 ret = err;
2049 goto done;
2050 }
2051
2052 level = btrfs_header_level(b);
2053 btrfs_tree_read_lock(b);
2054 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2055 if (!b) {
2056 ret = -ENOMEM;
2057 goto done;
2058 }
2059 p->locks[level] = BTRFS_READ_LOCK;
2060 p->nodes[level] = b;
2061 }
2062 ret = 1;
2063 done:
2064 if (ret < 0)
2065 btrfs_release_path(p);
2066
2067 return ret;
2068 }
2069
2070 /*
2071 * helper to use instead of btrfs_search_slot if no exact match is needed but
2072 * instead the next or previous item should be returned.
2073 * When find_higher is true, the next higher item is returned, the next lower
2074 * otherwise.
2075 * When return_any and find_higher are both true, and no higher item is found,
2076 * return the next lower instead.
2077 * When return_any is true and find_higher is false, and no lower item is found,
2078 * return the next higher instead.
2079 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2080 * < 0 on error
2081 */
2082 int btrfs_search_slot_for_read(struct btrfs_root *root,
2083 const struct btrfs_key *key,
2084 struct btrfs_path *p, int find_higher,
2085 int return_any)
2086 {
2087 int ret;
2088 struct extent_buffer *leaf;
2089
2090 again:
2091 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2092 if (ret <= 0)
2093 return ret;
2094 /*
2095 * a return value of 1 means the path is at the position where the
2096 * item should be inserted. Normally this is the next bigger item,
2097 * but in case the previous item is the last in a leaf, path points
2098 * to the first free slot in the previous leaf, i.e. at an invalid
2099 * item.
2100 */
2101 leaf = p->nodes[0];
2102
2103 if (find_higher) {
2104 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2105 ret = btrfs_next_leaf(root, p);
2106 if (ret <= 0)
2107 return ret;
2108 if (!return_any)
2109 return 1;
2110 /*
2111 * no higher item found, return the next
2112 * lower instead
2113 */
2114 return_any = 0;
2115 find_higher = 0;
2116 btrfs_release_path(p);
2117 goto again;
2118 }
2119 } else {
2120 if (p->slots[0] == 0) {
2121 ret = btrfs_prev_leaf(root, p);
2122 if (ret < 0)
2123 return ret;
2124 if (!ret) {
2125 leaf = p->nodes[0];
2126 if (p->slots[0] == btrfs_header_nritems(leaf))
2127 p->slots[0]--;
2128 return 0;
2129 }
2130 if (!return_any)
2131 return 1;
2132 /*
2133 * no lower item found, return the next
2134 * higher instead
2135 */
2136 return_any = 0;
2137 find_higher = 1;
2138 btrfs_release_path(p);
2139 goto again;
2140 } else {
2141 --p->slots[0];
2142 }
2143 }
2144 return 0;
2145 }
2146
2147 /*
2148 * adjust the pointers going up the tree, starting at 'level',
2149 * making sure the right key of each node points to 'key'.
2150 * This is used after shifting pointers to the left, so it stops
2151 * fixing up pointers when a given leaf/node is not in slot 0 of the
2152 * higher levels
2153 *
2154 */
2155 static void fixup_low_keys(struct btrfs_path *path,
2156 struct btrfs_disk_key *key, int level)
2157 {
2158 int i;
2159 struct extent_buffer *t;
2160 int ret;
2161
2162 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2163 int tslot = path->slots[i];
2164
2165 if (!path->nodes[i])
2166 break;
2167 t = path->nodes[i];
2168 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2169 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2170 BUG_ON(ret < 0);
2171 btrfs_set_node_key(t, key, tslot);
2172 btrfs_mark_buffer_dirty(path->nodes[i]);
2173 if (tslot != 0)
2174 break;
2175 }
2176 }
2177
2178 /*
2179 * update item key.
2180 *
2181 * This function isn't completely safe.
It's the caller's responsibility 2182 * that the new key won't break the order 2183 */ 2184 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, 2185 struct btrfs_path *path, 2186 const struct btrfs_key *new_key) 2187 { 2188 struct btrfs_disk_key disk_key; 2189 struct extent_buffer *eb; 2190 int slot; 2191 2192 eb = path->nodes[0]; 2193 slot = path->slots[0]; 2194 if (slot > 0) { 2195 btrfs_item_key(eb, &disk_key, slot - 1); 2196 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { 2197 btrfs_crit(fs_info, 2198 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2199 slot, btrfs_disk_key_objectid(&disk_key), 2200 btrfs_disk_key_type(&disk_key), 2201 btrfs_disk_key_offset(&disk_key), 2202 new_key->objectid, new_key->type, 2203 new_key->offset); 2204 btrfs_print_leaf(eb); 2205 BUG(); 2206 } 2207 } 2208 if (slot < btrfs_header_nritems(eb) - 1) { 2209 btrfs_item_key(eb, &disk_key, slot + 1); 2210 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { 2211 btrfs_crit(fs_info, 2212 "slot %u key (%llu %u %llu) new key (%llu %u %llu)", 2213 slot, btrfs_disk_key_objectid(&disk_key), 2214 btrfs_disk_key_type(&disk_key), 2215 btrfs_disk_key_offset(&disk_key), 2216 new_key->objectid, new_key->type, 2217 new_key->offset); 2218 btrfs_print_leaf(eb); 2219 BUG(); 2220 } 2221 } 2222 2223 btrfs_cpu_key_to_disk(&disk_key, new_key); 2224 btrfs_set_item_key(eb, &disk_key, slot); 2225 btrfs_mark_buffer_dirty(eb); 2226 if (slot == 0) 2227 fixup_low_keys(path, &disk_key, 1); 2228 } 2229 2230 /* 2231 * Check key order of two sibling extent buffers. 2232 * 2233 * Return true if something is wrong. 2234 * Return false if everything is fine. 2235 * 2236 * Tree-checker only works inside one tree block, thus the following 2237 * corruption can not be detected by tree-checker: 2238 * 2239 * Leaf @left | Leaf @right 2240 * -------------------------------------------------------------- 2241 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | 2242 * 2243 * Key f6 in leaf @left itself is valid, but not valid when the next 2244 * key in leaf @right is 7. 2245 * This can only be checked at tree block merge time. 2246 * And since tree checker has ensured all key order in each tree block 2247 * is correct, we only need to bother the last key of @left and the first 2248 * key of @right. 2249 */ 2250 static bool check_sibling_keys(struct extent_buffer *left, 2251 struct extent_buffer *right) 2252 { 2253 struct btrfs_key left_last; 2254 struct btrfs_key right_first; 2255 int level = btrfs_header_level(left); 2256 int nr_left = btrfs_header_nritems(left); 2257 int nr_right = btrfs_header_nritems(right); 2258 2259 /* No key to check in one of the tree blocks */ 2260 if (!nr_left || !nr_right) 2261 return false; 2262 2263 if (level) { 2264 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); 2265 btrfs_node_key_to_cpu(right, &right_first, 0); 2266 } else { 2267 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); 2268 btrfs_item_key_to_cpu(right, &right_first, 0); 2269 } 2270 2271 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { 2272 btrfs_crit(left->fs_info, 2273 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", 2274 left_last.objectid, left_last.type, 2275 left_last.offset, right_first.objectid, 2276 right_first.type, right_first.offset); 2277 return true; 2278 } 2279 return false; 2280 } 2281 2282 /* 2283 * try to push data from one node into the next node left in the 2284 * tree. 
2285 * 2286 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 2287 * error, and > 0 if there was no room in the left hand block. 2288 */ 2289 static int push_node_left(struct btrfs_trans_handle *trans, 2290 struct extent_buffer *dst, 2291 struct extent_buffer *src, int empty) 2292 { 2293 struct btrfs_fs_info *fs_info = trans->fs_info; 2294 int push_items = 0; 2295 int src_nritems; 2296 int dst_nritems; 2297 int ret = 0; 2298 2299 src_nritems = btrfs_header_nritems(src); 2300 dst_nritems = btrfs_header_nritems(dst); 2301 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2302 WARN_ON(btrfs_header_generation(src) != trans->transid); 2303 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2304 2305 if (!empty && src_nritems <= 8) 2306 return 1; 2307 2308 if (push_items <= 0) 2309 return 1; 2310 2311 if (empty) { 2312 push_items = min(src_nritems, push_items); 2313 if (push_items < src_nritems) { 2314 /* leave at least 8 pointers in the node if 2315 * we aren't going to empty it 2316 */ 2317 if (src_nritems - push_items < 8) { 2318 if (push_items <= 8) 2319 return 1; 2320 push_items -= 8; 2321 } 2322 } 2323 } else 2324 push_items = min(src_nritems - 8, push_items); 2325 2326 /* dst is the left eb, src is the middle eb */ 2327 if (check_sibling_keys(dst, src)) { 2328 ret = -EUCLEAN; 2329 btrfs_abort_transaction(trans, ret); 2330 return ret; 2331 } 2332 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); 2333 if (ret) { 2334 btrfs_abort_transaction(trans, ret); 2335 return ret; 2336 } 2337 copy_extent_buffer(dst, src, 2338 btrfs_node_key_ptr_offset(dst_nritems), 2339 btrfs_node_key_ptr_offset(0), 2340 push_items * sizeof(struct btrfs_key_ptr)); 2341 2342 if (push_items < src_nritems) { 2343 /* 2344 * Don't call btrfs_tree_mod_log_insert_move() here, key removal 2345 * was already fully logged by btrfs_tree_mod_log_eb_copy() above. 2346 */ 2347 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2348 btrfs_node_key_ptr_offset(push_items), 2349 (src_nritems - push_items) * 2350 sizeof(struct btrfs_key_ptr)); 2351 } 2352 btrfs_set_header_nritems(src, src_nritems - push_items); 2353 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2354 btrfs_mark_buffer_dirty(src); 2355 btrfs_mark_buffer_dirty(dst); 2356 2357 return ret; 2358 } 2359 2360 /* 2361 * try to push data from one node into the next node right in the 2362 * tree. 2363 * 2364 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 2365 * error, and > 0 if there was no room in the right hand block. 
2366 * 2367 * this will only push up to 1/2 the contents of the left node over 2368 */ 2369 static int balance_node_right(struct btrfs_trans_handle *trans, 2370 struct extent_buffer *dst, 2371 struct extent_buffer *src) 2372 { 2373 struct btrfs_fs_info *fs_info = trans->fs_info; 2374 int push_items = 0; 2375 int max_push; 2376 int src_nritems; 2377 int dst_nritems; 2378 int ret = 0; 2379 2380 WARN_ON(btrfs_header_generation(src) != trans->transid); 2381 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2382 2383 src_nritems = btrfs_header_nritems(src); 2384 dst_nritems = btrfs_header_nritems(dst); 2385 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 2386 if (push_items <= 0) 2387 return 1; 2388 2389 if (src_nritems < 4) 2390 return 1; 2391 2392 max_push = src_nritems / 2 + 1; 2393 /* don't try to empty the node */ 2394 if (max_push >= src_nritems) 2395 return 1; 2396 2397 if (max_push < push_items) 2398 push_items = max_push; 2399 2400 /* dst is the right eb, src is the middle eb */ 2401 if (check_sibling_keys(src, dst)) { 2402 ret = -EUCLEAN; 2403 btrfs_abort_transaction(trans, ret); 2404 return ret; 2405 } 2406 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); 2407 BUG_ON(ret < 0); 2408 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2409 btrfs_node_key_ptr_offset(0), 2410 (dst_nritems) * 2411 sizeof(struct btrfs_key_ptr)); 2412 2413 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, 2414 push_items); 2415 if (ret) { 2416 btrfs_abort_transaction(trans, ret); 2417 return ret; 2418 } 2419 copy_extent_buffer(dst, src, 2420 btrfs_node_key_ptr_offset(0), 2421 btrfs_node_key_ptr_offset(src_nritems - push_items), 2422 push_items * sizeof(struct btrfs_key_ptr)); 2423 2424 btrfs_set_header_nritems(src, src_nritems - push_items); 2425 btrfs_set_header_nritems(dst, dst_nritems + push_items); 2426 2427 btrfs_mark_buffer_dirty(src); 2428 btrfs_mark_buffer_dirty(dst); 2429 2430 return ret; 2431 } 2432 2433 /* 2434 * helper function to insert a new root level in the tree. 2435 * A new node is allocated, and a single item is inserted to 2436 * point to the existing root 2437 * 2438 * returns zero on success or < 0 on failure. 
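 *
 * Schematically (an illustration, not from the original comment):
 *
 *	before:    root            after:     C (new root, level N+1)
 *	           (level N)                  |
 *	                                      root (level N, slot 0 of C)
 *
 * The old root keeps its block; it simply becomes the single child of
 * the new root node.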
2439 */ 2440 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 2441 struct btrfs_root *root, 2442 struct btrfs_path *path, int level) 2443 { 2444 struct btrfs_fs_info *fs_info = root->fs_info; 2445 u64 lower_gen; 2446 struct extent_buffer *lower; 2447 struct extent_buffer *c; 2448 struct extent_buffer *old; 2449 struct btrfs_disk_key lower_key; 2450 int ret; 2451 2452 BUG_ON(path->nodes[level]); 2453 BUG_ON(path->nodes[level-1] != root->node); 2454 2455 lower = path->nodes[level-1]; 2456 if (level == 1) 2457 btrfs_item_key(lower, &lower_key, 0); 2458 else 2459 btrfs_node_key(lower, &lower_key, 0); 2460 2461 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, 2462 root->node->start, 0, 2463 BTRFS_NESTING_NEW_ROOT); 2464 if (IS_ERR(c)) 2465 return PTR_ERR(c); 2466 2467 root_add_used(root, fs_info->nodesize); 2468 2469 btrfs_set_header_nritems(c, 1); 2470 btrfs_set_node_key(c, &lower_key, 0); 2471 btrfs_set_node_blockptr(c, 0, lower->start); 2472 lower_gen = btrfs_header_generation(lower); 2473 WARN_ON(lower_gen != trans->transid); 2474 2475 btrfs_set_node_ptr_generation(c, 0, lower_gen); 2476 2477 btrfs_mark_buffer_dirty(c); 2478 2479 old = root->node; 2480 ret = btrfs_tree_mod_log_insert_root(root->node, c, false); 2481 BUG_ON(ret < 0); 2482 rcu_assign_pointer(root->node, c); 2483 2484 /* the super has an extra ref to root->node */ 2485 free_extent_buffer(old); 2486 2487 add_root_to_dirty_list(root); 2488 atomic_inc(&c->refs); 2489 path->nodes[level] = c; 2490 path->locks[level] = BTRFS_WRITE_LOCK; 2491 path->slots[level] = 0; 2492 return 0; 2493 } 2494 2495 /* 2496 * worker function to insert a single pointer in a node. 2497 * the node should have enough room for the pointer already 2498 * 2499 * slot and level indicate where you want the key to go, and 2500 * blocknr is the block the key points to. 2501 */ 2502 static void insert_ptr(struct btrfs_trans_handle *trans, 2503 struct btrfs_path *path, 2504 struct btrfs_disk_key *key, u64 bytenr, 2505 int slot, int level) 2506 { 2507 struct extent_buffer *lower; 2508 int nritems; 2509 int ret; 2510 2511 BUG_ON(!path->nodes[level]); 2512 btrfs_assert_tree_locked(path->nodes[level]); 2513 lower = path->nodes[level]; 2514 nritems = btrfs_header_nritems(lower); 2515 BUG_ON(slot > nritems); 2516 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); 2517 if (slot != nritems) { 2518 if (level) { 2519 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, 2520 slot, nritems - slot); 2521 BUG_ON(ret < 0); 2522 } 2523 memmove_extent_buffer(lower, 2524 btrfs_node_key_ptr_offset(slot + 1), 2525 btrfs_node_key_ptr_offset(slot), 2526 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2527 } 2528 if (level) { 2529 ret = btrfs_tree_mod_log_insert_key(lower, slot, 2530 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); 2531 BUG_ON(ret < 0); 2532 } 2533 btrfs_set_node_key(lower, key, slot); 2534 btrfs_set_node_blockptr(lower, slot, bytenr); 2535 WARN_ON(trans->transid == 0); 2536 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 2537 btrfs_set_header_nritems(lower, nritems + 1); 2538 btrfs_mark_buffer_dirty(lower); 2539 } 2540 2541 /* 2542 * split the node at the specified level in path in two. 2543 * The path is corrected to point to the appropriate node after the split 2544 * 2545 * Before splitting this tries to make some room in the node by pushing 2546 * left and right, if either one works, it returns right away. 
2547 *
2548 * returns 0 on success and < 0 on failure
2549 */
2550 static noinline int split_node(struct btrfs_trans_handle *trans,
2551 struct btrfs_root *root,
2552 struct btrfs_path *path, int level)
2553 {
2554 struct btrfs_fs_info *fs_info = root->fs_info;
2555 struct extent_buffer *c;
2556 struct extent_buffer *split;
2557 struct btrfs_disk_key disk_key;
2558 int mid;
2559 int ret;
2560 u32 c_nritems;
2561
2562 c = path->nodes[level];
2563 WARN_ON(btrfs_header_generation(c) != trans->transid);
2564 if (c == root->node) {
2565 /*
2566 * trying to split the root, let's make a new one
2567 *
2568 * tree mod log: We don't log the removal of the old root in
2569 * insert_new_root, because that root buffer will be kept as a
2570 * normal node. We are going to log removal of half of the
2571 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2572 * holding a tree lock on the buffer, which is why we cannot
2573 * race with other tree_mod_log users.
2574 */
2575 ret = insert_new_root(trans, root, path, level + 1);
2576 if (ret)
2577 return ret;
2578 } else {
2579 ret = push_nodes_for_insert(trans, root, path, level);
2580 c = path->nodes[level];
2581 if (!ret && btrfs_header_nritems(c) <
2582 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2583 return 0;
2584 if (ret < 0)
2585 return ret;
2586 }
2587
2588 c_nritems = btrfs_header_nritems(c);
2589 mid = (c_nritems + 1) / 2;
2590 btrfs_node_key(c, &disk_key, mid);
2591
2592 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
2593 c->start, 0, BTRFS_NESTING_SPLIT);
2594 if (IS_ERR(split))
2595 return PTR_ERR(split);
2596
2597 root_add_used(root, fs_info->nodesize);
2598 ASSERT(btrfs_header_level(c) == level);
2599
2600 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2601 if (ret) {
2602 btrfs_abort_transaction(trans, ret);
2603 return ret;
2604 }
2605 copy_extent_buffer(split, c,
2606 btrfs_node_key_ptr_offset(0),
2607 btrfs_node_key_ptr_offset(mid),
2608 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2609 btrfs_set_header_nritems(split, c_nritems - mid);
2610 btrfs_set_header_nritems(c, mid);
2611
2612 btrfs_mark_buffer_dirty(c);
2613 btrfs_mark_buffer_dirty(split);
2614
2615 insert_ptr(trans, path, &disk_key, split->start,
2616 path->slots[level + 1] + 1, level + 1);
2617
2618 if (path->slots[level] >= mid) {
2619 path->slots[level] -= mid;
2620 btrfs_tree_unlock(c);
2621 free_extent_buffer(c);
2622 path->nodes[level] = split;
2623 path->slots[level + 1] += 1;
2624 } else {
2625 btrfs_tree_unlock(split);
2626 free_extent_buffer(split);
2627 }
2628 return 0;
2629 }
2630
2631 /*
2632 * how many bytes are required to store the items in a leaf. start
2633 * and nr indicate which items in the leaf to check. This totals up the
2634 * space used both by the item structs and the item data
2635 */
2636 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2637 {
2638 struct btrfs_item *start_item;
2639 struct btrfs_item *end_item;
2640 int data_len;
2641 int nritems = btrfs_header_nritems(l);
2642 int end = min(nritems, start + nr) - 1;
2643
2644 if (!nr)
2645 return 0;
2646 start_item = btrfs_item_nr(start);
2647 end_item = btrfs_item_nr(end);
2648 data_len = btrfs_item_offset(l, start_item) +
2649 btrfs_item_size(l, start_item);
2650 data_len = data_len - btrfs_item_offset(l, end_item);
2651 data_len += sizeof(struct btrfs_item) * nr;
2652 WARN_ON(data_len < 0);
2653 return data_len;
2654 }
2655
2656 /*
2657 * The space between the end of the leaf items and
2658 * the start of the leaf data.
IOW, how much room 2659 * the leaf has left for both items and data 2660 */ 2661 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) 2662 { 2663 struct btrfs_fs_info *fs_info = leaf->fs_info; 2664 int nritems = btrfs_header_nritems(leaf); 2665 int ret; 2666 2667 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 2668 if (ret < 0) { 2669 btrfs_crit(fs_info, 2670 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 2671 ret, 2672 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 2673 leaf_space_used(leaf, 0, nritems), nritems); 2674 } 2675 return ret; 2676 } 2677 2678 /* 2679 * min slot controls the lowest index we're willing to push to the 2680 * right. We'll push up to and including min_slot, but no lower 2681 */ 2682 static noinline int __push_leaf_right(struct btrfs_path *path, 2683 int data_size, int empty, 2684 struct extent_buffer *right, 2685 int free_space, u32 left_nritems, 2686 u32 min_slot) 2687 { 2688 struct btrfs_fs_info *fs_info = right->fs_info; 2689 struct extent_buffer *left = path->nodes[0]; 2690 struct extent_buffer *upper = path->nodes[1]; 2691 struct btrfs_map_token token; 2692 struct btrfs_disk_key disk_key; 2693 int slot; 2694 u32 i; 2695 int push_space = 0; 2696 int push_items = 0; 2697 struct btrfs_item *item; 2698 u32 nr; 2699 u32 right_nritems; 2700 u32 data_end; 2701 u32 this_item_size; 2702 2703 if (empty) 2704 nr = 0; 2705 else 2706 nr = max_t(u32, 1, min_slot); 2707 2708 if (path->slots[0] >= left_nritems) 2709 push_space += data_size; 2710 2711 slot = path->slots[1]; 2712 i = left_nritems - 1; 2713 while (i >= nr) { 2714 item = btrfs_item_nr(i); 2715 2716 if (!empty && push_items > 0) { 2717 if (path->slots[0] > i) 2718 break; 2719 if (path->slots[0] == i) { 2720 int space = btrfs_leaf_free_space(left); 2721 2722 if (space + push_space * 2 > free_space) 2723 break; 2724 } 2725 } 2726 2727 if (path->slots[0] == i) 2728 push_space += data_size; 2729 2730 this_item_size = btrfs_item_size(left, item); 2731 if (this_item_size + sizeof(*item) + push_space > free_space) 2732 break; 2733 2734 push_items++; 2735 push_space += this_item_size + sizeof(*item); 2736 if (i == 0) 2737 break; 2738 i--; 2739 } 2740 2741 if (push_items == 0) 2742 goto out_unlock; 2743 2744 WARN_ON(!empty && push_items == left_nritems); 2745 2746 /* push left to right */ 2747 right_nritems = btrfs_header_nritems(right); 2748 2749 push_space = btrfs_item_end_nr(left, left_nritems - push_items); 2750 push_space -= leaf_data_end(left); 2751 2752 /* make room in the right data area */ 2753 data_end = leaf_data_end(right); 2754 memmove_extent_buffer(right, 2755 BTRFS_LEAF_DATA_OFFSET + data_end - push_space, 2756 BTRFS_LEAF_DATA_OFFSET + data_end, 2757 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 2758 2759 /* copy from the left data area */ 2760 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + 2761 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 2762 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), 2763 push_space); 2764 2765 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 2766 btrfs_item_nr_offset(0), 2767 right_nritems * sizeof(struct btrfs_item)); 2768 2769 /* copy the items from left to right */ 2770 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 2771 btrfs_item_nr_offset(left_nritems - push_items), 2772 push_items * sizeof(struct btrfs_item)); 2773 2774 /* update the item pointers */ 2775 btrfs_init_map_token(&token, right); 2776 right_nritems += push_items; 2777 btrfs_set_header_nritems(right, right_nritems); 2778 push_space = 
BTRFS_LEAF_DATA_SIZE(fs_info); 2779 for (i = 0; i < right_nritems; i++) { 2780 item = btrfs_item_nr(i); 2781 push_space -= btrfs_token_item_size(&token, item); 2782 btrfs_set_token_item_offset(&token, item, push_space); 2783 } 2784 2785 left_nritems -= push_items; 2786 btrfs_set_header_nritems(left, left_nritems); 2787 2788 if (left_nritems) 2789 btrfs_mark_buffer_dirty(left); 2790 else 2791 btrfs_clean_tree_block(left); 2792 2793 btrfs_mark_buffer_dirty(right); 2794 2795 btrfs_item_key(right, &disk_key, 0); 2796 btrfs_set_node_key(upper, &disk_key, slot + 1); 2797 btrfs_mark_buffer_dirty(upper); 2798 2799 /* then fixup the leaf pointer in the path */ 2800 if (path->slots[0] >= left_nritems) { 2801 path->slots[0] -= left_nritems; 2802 if (btrfs_header_nritems(path->nodes[0]) == 0) 2803 btrfs_clean_tree_block(path->nodes[0]); 2804 btrfs_tree_unlock(path->nodes[0]); 2805 free_extent_buffer(path->nodes[0]); 2806 path->nodes[0] = right; 2807 path->slots[1] += 1; 2808 } else { 2809 btrfs_tree_unlock(right); 2810 free_extent_buffer(right); 2811 } 2812 return 0; 2813 2814 out_unlock: 2815 btrfs_tree_unlock(right); 2816 free_extent_buffer(right); 2817 return 1; 2818 } 2819 2820 /* 2821 * push some data in the path leaf to the right, trying to free up at 2822 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2823 * 2824 * returns 1 if the push failed because the other node didn't have enough 2825 * room, 0 if everything worked out and < 0 if there were major errors. 2826 * 2827 * this will push starting from min_slot to the end of the leaf. It won't 2828 * push any slot lower than min_slot 2829 */ 2830 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2831 *root, struct btrfs_path *path, 2832 int min_data_size, int data_size, 2833 int empty, u32 min_slot) 2834 { 2835 struct extent_buffer *left = path->nodes[0]; 2836 struct extent_buffer *right; 2837 struct extent_buffer *upper; 2838 int slot; 2839 int free_space; 2840 u32 left_nritems; 2841 int ret; 2842 2843 if (!path->nodes[1]) 2844 return 1; 2845 2846 slot = path->slots[1]; 2847 upper = path->nodes[1]; 2848 if (slot >= btrfs_header_nritems(upper) - 1) 2849 return 1; 2850 2851 btrfs_assert_tree_locked(path->nodes[1]); 2852 2853 right = btrfs_read_node_slot(upper, slot + 1); 2854 /* 2855 * slot + 1 is not valid or we fail to read the right node, 2856 * no big deal, just return. 2857 */ 2858 if (IS_ERR(right)) 2859 return 1; 2860 2861 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); 2862 2863 free_space = btrfs_leaf_free_space(right); 2864 if (free_space < data_size) 2865 goto out_unlock; 2866 2867 /* cow and double check */ 2868 ret = btrfs_cow_block(trans, root, right, upper, 2869 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 2870 if (ret) 2871 goto out_unlock; 2872 2873 free_space = btrfs_leaf_free_space(right); 2874 if (free_space < data_size) 2875 goto out_unlock; 2876 2877 left_nritems = btrfs_header_nritems(left); 2878 if (left_nritems == 0) 2879 goto out_unlock; 2880 2881 if (check_sibling_keys(left, right)) { 2882 ret = -EUCLEAN; 2883 btrfs_tree_unlock(right); 2884 free_extent_buffer(right); 2885 return ret; 2886 } 2887 if (path->slots[0] == left_nritems && !empty) { 2888 /* Key greater than all keys in the leaf, right neighbor has 2889 * enough room for it and we're not emptying our leaf to delete 2890 * it, therefore use right neighbor to insert the new item and 2891 * no need to touch/dirty our left leaf. 
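 *
 * (E.g. workloads that insert strictly increasing keys end up here
 * again and again: each new key is beyond the last slot, so the item
 * goes straight into the fresh right leaf.)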
*/ 2892 btrfs_tree_unlock(left); 2893 free_extent_buffer(left); 2894 path->nodes[0] = right; 2895 path->slots[0] = 0; 2896 path->slots[1]++; 2897 return 0; 2898 } 2899 2900 return __push_leaf_right(path, min_data_size, empty, 2901 right, free_space, left_nritems, min_slot); 2902 out_unlock: 2903 btrfs_tree_unlock(right); 2904 free_extent_buffer(right); 2905 return 1; 2906 } 2907 2908 /* 2909 * push some data in the path leaf to the left, trying to free up at 2910 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2911 * 2912 * max_slot can put a limit on how far into the leaf we'll push items. The 2913 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 2914 * items 2915 */ 2916 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, 2917 int empty, struct extent_buffer *left, 2918 int free_space, u32 right_nritems, 2919 u32 max_slot) 2920 { 2921 struct btrfs_fs_info *fs_info = left->fs_info; 2922 struct btrfs_disk_key disk_key; 2923 struct extent_buffer *right = path->nodes[0]; 2924 int i; 2925 int push_space = 0; 2926 int push_items = 0; 2927 struct btrfs_item *item; 2928 u32 old_left_nritems; 2929 u32 nr; 2930 int ret = 0; 2931 u32 this_item_size; 2932 u32 old_left_item_size; 2933 struct btrfs_map_token token; 2934 2935 if (empty) 2936 nr = min(right_nritems, max_slot); 2937 else 2938 nr = min(right_nritems - 1, max_slot); 2939 2940 for (i = 0; i < nr; i++) { 2941 item = btrfs_item_nr(i); 2942 2943 if (!empty && push_items > 0) { 2944 if (path->slots[0] < i) 2945 break; 2946 if (path->slots[0] == i) { 2947 int space = btrfs_leaf_free_space(right); 2948 2949 if (space + push_space * 2 > free_space) 2950 break; 2951 } 2952 } 2953 2954 if (path->slots[0] == i) 2955 push_space += data_size; 2956 2957 this_item_size = btrfs_item_size(right, item); 2958 if (this_item_size + sizeof(*item) + push_space > free_space) 2959 break; 2960 2961 push_items++; 2962 push_space += this_item_size + sizeof(*item); 2963 } 2964 2965 if (push_items == 0) { 2966 ret = 1; 2967 goto out; 2968 } 2969 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 2970 2971 /* push data from right to left */ 2972 copy_extent_buffer(left, right, 2973 btrfs_item_nr_offset(btrfs_header_nritems(left)), 2974 btrfs_item_nr_offset(0), 2975 push_items * sizeof(struct btrfs_item)); 2976 2977 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 2978 btrfs_item_offset_nr(right, push_items - 1); 2979 2980 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + 2981 leaf_data_end(left) - push_space, 2982 BTRFS_LEAF_DATA_OFFSET + 2983 btrfs_item_offset_nr(right, push_items - 1), 2984 push_space); 2985 old_left_nritems = btrfs_header_nritems(left); 2986 BUG_ON(old_left_nritems <= 0); 2987 2988 btrfs_init_map_token(&token, left); 2989 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); 2990 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 2991 u32 ioff; 2992 2993 item = btrfs_item_nr(i); 2994 2995 ioff = btrfs_token_item_offset(&token, item); 2996 btrfs_set_token_item_offset(&token, item, 2997 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); 2998 } 2999 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3000 3001 /* fixup right node */ 3002 if (push_items > right_nritems) 3003 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3004 right_nritems); 3005 3006 if (push_items < right_nritems) { 3007 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3008 leaf_data_end(right); 3009 
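/*
 * Close the gap left by the data that moved into the left leaf:
 * slide the remaining data of the right leaf back against the end
 * of its data area.
 */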
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + 3010 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3011 BTRFS_LEAF_DATA_OFFSET + 3012 leaf_data_end(right), push_space); 3013 3014 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 3015 btrfs_item_nr_offset(push_items), 3016 (btrfs_header_nritems(right) - push_items) * 3017 sizeof(struct btrfs_item)); 3018 } 3019 3020 btrfs_init_map_token(&token, right); 3021 right_nritems -= push_items; 3022 btrfs_set_header_nritems(right, right_nritems); 3023 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3024 for (i = 0; i < right_nritems; i++) { 3025 item = btrfs_item_nr(i); 3026 3027 push_space = push_space - btrfs_token_item_size(&token, item); 3028 btrfs_set_token_item_offset(&token, item, push_space); 3029 } 3030 3031 btrfs_mark_buffer_dirty(left); 3032 if (right_nritems) 3033 btrfs_mark_buffer_dirty(right); 3034 else 3035 btrfs_clean_tree_block(right); 3036 3037 btrfs_item_key(right, &disk_key, 0); 3038 fixup_low_keys(path, &disk_key, 1); 3039 3040 /* then fixup the leaf pointer in the path */ 3041 if (path->slots[0] < push_items) { 3042 path->slots[0] += old_left_nritems; 3043 btrfs_tree_unlock(path->nodes[0]); 3044 free_extent_buffer(path->nodes[0]); 3045 path->nodes[0] = left; 3046 path->slots[1] -= 1; 3047 } else { 3048 btrfs_tree_unlock(left); 3049 free_extent_buffer(left); 3050 path->slots[0] -= push_items; 3051 } 3052 BUG_ON(path->slots[0] < 0); 3053 return ret; 3054 out: 3055 btrfs_tree_unlock(left); 3056 free_extent_buffer(left); 3057 return ret; 3058 } 3059 3060 /* 3061 * push some data in the path leaf to the left, trying to free up at 3062 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3063 * 3064 * max_slot can put a limit on how far into the leaf we'll push items. The 3065 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3066 * items 3067 */ 3068 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3069 *root, struct btrfs_path *path, int min_data_size, 3070 int data_size, int empty, u32 max_slot) 3071 { 3072 struct extent_buffer *right = path->nodes[0]; 3073 struct extent_buffer *left; 3074 int slot; 3075 int free_space; 3076 u32 right_nritems; 3077 int ret = 0; 3078 3079 slot = path->slots[1]; 3080 if (slot == 0) 3081 return 1; 3082 if (!path->nodes[1]) 3083 return 1; 3084 3085 right_nritems = btrfs_header_nritems(right); 3086 if (right_nritems == 0) 3087 return 1; 3088 3089 btrfs_assert_tree_locked(path->nodes[1]); 3090 3091 left = btrfs_read_node_slot(path->nodes[1], slot - 1); 3092 /* 3093 * slot - 1 is not valid or we fail to read the left node, 3094 * no big deal, just return. 
3095 */ 3096 if (IS_ERR(left)) 3097 return 1; 3098 3099 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); 3100 3101 free_space = btrfs_leaf_free_space(left); 3102 if (free_space < data_size) { 3103 ret = 1; 3104 goto out; 3105 } 3106 3107 /* cow and double check */ 3108 ret = btrfs_cow_block(trans, root, left, 3109 path->nodes[1], slot - 1, &left, 3110 BTRFS_NESTING_LEFT_COW); 3111 if (ret) { 3112 /* we hit -ENOSPC, but it isn't fatal here */ 3113 if (ret == -ENOSPC) 3114 ret = 1; 3115 goto out; 3116 } 3117 3118 free_space = btrfs_leaf_free_space(left); 3119 if (free_space < data_size) { 3120 ret = 1; 3121 goto out; 3122 } 3123 3124 if (check_sibling_keys(left, right)) { 3125 ret = -EUCLEAN; 3126 goto out; 3127 } 3128 return __push_leaf_left(path, min_data_size, 3129 empty, left, free_space, right_nritems, 3130 max_slot); 3131 out: 3132 btrfs_tree_unlock(left); 3133 free_extent_buffer(left); 3134 return ret; 3135 } 3136 3137 /* 3138 * split the path's leaf in two, making sure there is at least data_size 3139 * available for the resulting leaf level of the path. 3140 */ 3141 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3142 struct btrfs_path *path, 3143 struct extent_buffer *l, 3144 struct extent_buffer *right, 3145 int slot, int mid, int nritems) 3146 { 3147 struct btrfs_fs_info *fs_info = trans->fs_info; 3148 int data_copy_size; 3149 int rt_data_off; 3150 int i; 3151 struct btrfs_disk_key disk_key; 3152 struct btrfs_map_token token; 3153 3154 nritems = nritems - mid; 3155 btrfs_set_header_nritems(right, nritems); 3156 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); 3157 3158 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3159 btrfs_item_nr_offset(mid), 3160 nritems * sizeof(struct btrfs_item)); 3161 3162 copy_extent_buffer(right, l, 3163 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - 3164 data_copy_size, BTRFS_LEAF_DATA_OFFSET + 3165 leaf_data_end(l), data_copy_size); 3166 3167 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); 3168 3169 btrfs_init_map_token(&token, right); 3170 for (i = 0; i < nritems; i++) { 3171 struct btrfs_item *item = btrfs_item_nr(i); 3172 u32 ioff; 3173 3174 ioff = btrfs_token_item_offset(&token, item); 3175 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); 3176 } 3177 3178 btrfs_set_header_nritems(l, mid); 3179 btrfs_item_key(right, &disk_key, 0); 3180 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); 3181 3182 btrfs_mark_buffer_dirty(right); 3183 btrfs_mark_buffer_dirty(l); 3184 BUG_ON(path->slots[0] != slot); 3185 3186 if (mid <= slot) { 3187 btrfs_tree_unlock(path->nodes[0]); 3188 free_extent_buffer(path->nodes[0]); 3189 path->nodes[0] = right; 3190 path->slots[0] -= mid; 3191 path->slots[1] += 1; 3192 } else { 3193 btrfs_tree_unlock(right); 3194 free_extent_buffer(right); 3195 } 3196 3197 BUG_ON(path->slots[0] < 0); 3198 } 3199 3200 /* 3201 * double splits happen when we need to insert a big item in the middle 3202 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3203 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3204 * A B C 3205 * 3206 * We avoid this by trying to push the items on either side of our target 3207 * into the adjacent leaves. If all goes well we can avoid the double split 3208 * completely. 
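 *
 * For example (illustrative): if everything after the target slot fits
 * into the right neighbor, the target becomes the last item in its leaf
 * and a single ordinary split at that boundary is all that is needed.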
3209 */ 3210 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3211 struct btrfs_root *root, 3212 struct btrfs_path *path, 3213 int data_size) 3214 { 3215 int ret; 3216 int progress = 0; 3217 int slot; 3218 u32 nritems; 3219 int space_needed = data_size; 3220 3221 slot = path->slots[0]; 3222 if (slot < btrfs_header_nritems(path->nodes[0])) 3223 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3224 3225 /* 3226 * try to push all the items after our slot into the 3227 * right leaf 3228 */ 3229 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 3230 if (ret < 0) 3231 return ret; 3232 3233 if (ret == 0) 3234 progress++; 3235 3236 nritems = btrfs_header_nritems(path->nodes[0]); 3237 /* 3238 * our goal is to get our slot at the start or end of a leaf. If 3239 * we've done so we're done 3240 */ 3241 if (path->slots[0] == 0 || path->slots[0] == nritems) 3242 return 0; 3243 3244 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3245 return 0; 3246 3247 /* try to push all the items before our slot into the next leaf */ 3248 slot = path->slots[0]; 3249 space_needed = data_size; 3250 if (slot > 0) 3251 space_needed -= btrfs_leaf_free_space(path->nodes[0]); 3252 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 3253 if (ret < 0) 3254 return ret; 3255 3256 if (ret == 0) 3257 progress++; 3258 3259 if (progress) 3260 return 0; 3261 return 1; 3262 } 3263 3264 /* 3265 * split the path's leaf in two, making sure there is at least data_size 3266 * available for the resulting leaf level of the path. 3267 * 3268 * returns 0 if all went well and < 0 on failure. 3269 */ 3270 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3271 struct btrfs_root *root, 3272 const struct btrfs_key *ins_key, 3273 struct btrfs_path *path, int data_size, 3274 int extend) 3275 { 3276 struct btrfs_disk_key disk_key; 3277 struct extent_buffer *l; 3278 u32 nritems; 3279 int mid; 3280 int slot; 3281 struct extent_buffer *right; 3282 struct btrfs_fs_info *fs_info = root->fs_info; 3283 int ret = 0; 3284 int wret; 3285 int split; 3286 int num_doubles = 0; 3287 int tried_avoid_double = 0; 3288 3289 l = path->nodes[0]; 3290 slot = path->slots[0]; 3291 if (extend && data_size + btrfs_item_size_nr(l, slot) + 3292 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 3293 return -EOVERFLOW; 3294 3295 /* first try to make some room by pushing left and right */ 3296 if (data_size && path->nodes[1]) { 3297 int space_needed = data_size; 3298 3299 if (slot < btrfs_header_nritems(l)) 3300 space_needed -= btrfs_leaf_free_space(l); 3301 3302 wret = push_leaf_right(trans, root, path, space_needed, 3303 space_needed, 0, 0); 3304 if (wret < 0) 3305 return wret; 3306 if (wret) { 3307 space_needed = data_size; 3308 if (slot > 0) 3309 space_needed -= btrfs_leaf_free_space(l); 3310 wret = push_leaf_left(trans, root, path, space_needed, 3311 space_needed, 0, (u32)-1); 3312 if (wret < 0) 3313 return wret; 3314 } 3315 l = path->nodes[0]; 3316 3317 /* did the pushes work? 
*/
3318 if (btrfs_leaf_free_space(l) >= data_size)
3319 return 0;
3320 }
3321
3322 if (!path->nodes[1]) {
3323 ret = insert_new_root(trans, root, path, 1);
3324 if (ret)
3325 return ret;
3326 }
3327 again:
3328 split = 1;
3329 l = path->nodes[0];
3330 slot = path->slots[0];
3331 nritems = btrfs_header_nritems(l);
3332 mid = (nritems + 1) / 2;
3333
3334 if (mid <= slot) {
3335 if (nritems == 1 ||
3336 leaf_space_used(l, mid, nritems - mid) + data_size >
3337 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3338 if (slot >= nritems) {
3339 split = 0;
3340 } else {
3341 mid = slot;
3342 if (mid != nritems &&
3343 leaf_space_used(l, mid, nritems - mid) +
3344 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3345 if (data_size && !tried_avoid_double)
3346 goto push_for_double;
3347 split = 2;
3348 }
3349 }
3350 }
3351 } else {
3352 if (leaf_space_used(l, 0, mid) + data_size >
3353 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3354 if (!extend && data_size && slot == 0) {
3355 split = 0;
3356 } else if ((extend || !data_size) && slot == 0) {
3357 mid = 1;
3358 } else {
3359 mid = slot;
3360 if (mid != nritems &&
3361 leaf_space_used(l, mid, nritems - mid) +
3362 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3363 if (data_size && !tried_avoid_double)
3364 goto push_for_double;
3365 split = 2;
3366 }
3367 }
3368 }
3369 }
3370
3371 if (split == 0)
3372 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3373 else
3374 btrfs_item_key(l, &disk_key, mid);
3375
3376 /*
3377 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3378 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3379 * subclasses, which is 8 at the time of this patch, and we've maxed it
3380 * out. In the future we could add a
3381 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3382 * use BTRFS_NESTING_NEW_ROOT.
3383 */
3384 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
3385 l->start, 0, num_doubles ?
3386 BTRFS_NESTING_NEW_ROOT :
3387 BTRFS_NESTING_SPLIT);
3388 if (IS_ERR(right))
3389 return PTR_ERR(right);
3390
3391 root_add_used(root, fs_info->nodesize);
3392
3393 if (split == 0) {
3394 if (mid <= slot) {
3395 btrfs_set_header_nritems(right, 0);
3396 insert_ptr(trans, path, &disk_key,
3397 right->start, path->slots[1] + 1, 1);
3398 btrfs_tree_unlock(path->nodes[0]);
3399 free_extent_buffer(path->nodes[0]);
3400 path->nodes[0] = right;
3401 path->slots[0] = 0;
3402 path->slots[1] += 1;
3403 } else {
3404 btrfs_set_header_nritems(right, 0);
3405 insert_ptr(trans, path, &disk_key,
3406 right->start, path->slots[1], 1);
3407 btrfs_tree_unlock(path->nodes[0]);
3408 free_extent_buffer(path->nodes[0]);
3409 path->nodes[0] = right;
3410 path->slots[0] = 0;
3411 if (path->slots[1] == 0)
3412 fixup_low_keys(path, &disk_key, 1);
3413 }
3414 /*
3415 * We create a new leaf 'right' for the required ins_len and
3416 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3417 * the new item's content into 'right'.
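 *
 * (At this point 'right' is still empty; the caller that asked for
 * ins_len bytes of room is expected to fill in the new item and
 * dirty the buffer.)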
3418 */ 3419 return ret; 3420 } 3421 3422 copy_for_split(trans, path, l, right, slot, mid, nritems); 3423 3424 if (split == 2) { 3425 BUG_ON(num_doubles != 0); 3426 num_doubles++; 3427 goto again; 3428 } 3429 3430 return 0; 3431 3432 push_for_double: 3433 push_for_double_split(trans, root, path, data_size); 3434 tried_avoid_double = 1; 3435 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) 3436 return 0; 3437 goto again; 3438 } 3439 3440 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3441 struct btrfs_root *root, 3442 struct btrfs_path *path, int ins_len) 3443 { 3444 struct btrfs_key key; 3445 struct extent_buffer *leaf; 3446 struct btrfs_file_extent_item *fi; 3447 u64 extent_len = 0; 3448 u32 item_size; 3449 int ret; 3450 3451 leaf = path->nodes[0]; 3452 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3453 3454 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 3455 key.type != BTRFS_EXTENT_CSUM_KEY); 3456 3457 if (btrfs_leaf_free_space(leaf) >= ins_len) 3458 return 0; 3459 3460 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3461 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3462 fi = btrfs_item_ptr(leaf, path->slots[0], 3463 struct btrfs_file_extent_item); 3464 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 3465 } 3466 btrfs_release_path(path); 3467 3468 path->keep_locks = 1; 3469 path->search_for_split = 1; 3470 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3471 path->search_for_split = 0; 3472 if (ret > 0) 3473 ret = -EAGAIN; 3474 if (ret < 0) 3475 goto err; 3476 3477 ret = -EAGAIN; 3478 leaf = path->nodes[0]; 3479 /* if our item isn't there, return now */ 3480 if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3481 goto err; 3482 3483 /* the leaf has changed, it now has room. return now */ 3484 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) 3485 goto err; 3486 3487 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3488 fi = btrfs_item_ptr(leaf, path->slots[0], 3489 struct btrfs_file_extent_item); 3490 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 3491 goto err; 3492 } 3493 3494 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3495 if (ret) 3496 goto err; 3497 3498 path->keep_locks = 0; 3499 btrfs_unlock_up_safe(path, 1); 3500 return 0; 3501 err: 3502 path->keep_locks = 0; 3503 return ret; 3504 } 3505 3506 static noinline int split_item(struct btrfs_path *path, 3507 const struct btrfs_key *new_key, 3508 unsigned long split_offset) 3509 { 3510 struct extent_buffer *leaf; 3511 struct btrfs_item *item; 3512 struct btrfs_item *new_item; 3513 int slot; 3514 char *buf; 3515 u32 nritems; 3516 u32 item_size; 3517 u32 orig_offset; 3518 struct btrfs_disk_key disk_key; 3519 3520 leaf = path->nodes[0]; 3521 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); 3522 3523 item = btrfs_item_nr(path->slots[0]); 3524 orig_offset = btrfs_item_offset(leaf, item); 3525 item_size = btrfs_item_size(leaf, item); 3526 3527 buf = kmalloc(item_size, GFP_NOFS); 3528 if (!buf) 3529 return -ENOMEM; 3530 3531 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3532 path->slots[0]), item_size); 3533 3534 slot = path->slots[0] + 1; 3535 nritems = btrfs_header_nritems(leaf); 3536 if (slot != nritems) { 3537 /* shift the items */ 3538 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 3539 btrfs_item_nr_offset(slot), 3540 (nritems - slot) * sizeof(struct btrfs_item)); 3541 } 3542 3543 btrfs_cpu_key_to_disk(&disk_key, new_key); 3544 btrfs_set_item_key(leaf, &disk_key, slot); 3545 3546 new_item = btrfs_item_nr(slot); 3547 3548 
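/*
 * Both halves reuse the original item's data footprint: the new
 * item (the tail of the data, from split_offset on) keeps the low
 * end at orig_offset, while the original item is shrunk to the
 * first split_offset bytes and moved up to the high end.
 */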
btrfs_set_item_offset(leaf, new_item, orig_offset);
3549 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3550
3551 btrfs_set_item_offset(leaf, item,
3552 orig_offset + item_size - split_offset);
3553 btrfs_set_item_size(leaf, item, split_offset);
3554
3555 btrfs_set_header_nritems(leaf, nritems + 1);
3556
3557 /* write the data for the start of the original item */
3558 write_extent_buffer(leaf, buf,
3559 btrfs_item_ptr_offset(leaf, path->slots[0]),
3560 split_offset);
3561
3562 /* write the data for the new item */
3563 write_extent_buffer(leaf, buf + split_offset,
3564 btrfs_item_ptr_offset(leaf, slot),
3565 item_size - split_offset);
3566 btrfs_mark_buffer_dirty(leaf);
3567
3568 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3569 kfree(buf);
3570 return 0;
3571 }
3572
3573 /*
3574 * This function splits a single item into two items,
3575 * giving 'new_key' to the new item and splitting the
3576 * old one at split_offset (from the start of the item).
3577 *
3578 * The path may be released by this operation. After
3579 * the split, the path is pointing to the old item. The
3580 * new item is going to be in the same node as the old one.
3581 *
3582 * Note, the item being split must be small enough to live alone on
3583 * a tree block with room for one extra struct btrfs_item
3584 *
3585 * This allows us to split the item in place, keeping a lock on the
3586 * leaf the entire time.
3587 */
3588 int btrfs_split_item(struct btrfs_trans_handle *trans,
3589 struct btrfs_root *root,
3590 struct btrfs_path *path,
3591 const struct btrfs_key *new_key,
3592 unsigned long split_offset)
3593 {
3594 int ret;
3595 ret = setup_leaf_for_split(trans, root, path,
3596 sizeof(struct btrfs_item));
3597 if (ret)
3598 return ret;
3599
3600 ret = split_item(path, new_key, split_offset);
3601 return ret;
3602 }
3603
3604 /*
3605 * This function duplicates an item, giving 'new_key' to the new item.
3606 * It guarantees both items live in the same tree leaf and the new item
3607 * is contiguous with the original item.
3608 *
3609 * This allows us to split a file extent in place, keeping a lock on the
3610 * leaf the entire time.
3611 */
3612 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3613 struct btrfs_root *root,
3614 struct btrfs_path *path,
3615 const struct btrfs_key *new_key)
3616 {
3617 struct extent_buffer *leaf;
3618 int ret;
3619 u32 item_size;
3620
3621 leaf = path->nodes[0];
3622 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3623 ret = setup_leaf_for_split(trans, root, path,
3624 item_size + sizeof(struct btrfs_item));
3625 if (ret)
3626 return ret;
3627
3628 path->slots[0]++;
3629 setup_items_for_insert(root, path, new_key, &item_size, 1);
3630 leaf = path->nodes[0];
3631 memcpy_extent_buffer(leaf,
3632 btrfs_item_ptr_offset(leaf, path->slots[0]),
3633 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3634 item_size);
3635 return 0;
3636 }
3637
3638 /*
3639 * make the item pointed to by the path smaller. new_size indicates
3640 * how small to make it, and from_end tells us if we just chop bytes
3641 * off the end of the item or if we shift the item to chop bytes off
3642 * the front.
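 *
 * For the from_end == 0 case note that the key changes too: chopping N
 * bytes off the front bumps the key's offset by N, since the item now
 * logically starts N bytes later.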
3643 */ 3644 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) 3645 { 3646 int slot; 3647 struct extent_buffer *leaf; 3648 struct btrfs_item *item; 3649 u32 nritems; 3650 unsigned int data_end; 3651 unsigned int old_data_start; 3652 unsigned int old_size; 3653 unsigned int size_diff; 3654 int i; 3655 struct btrfs_map_token token; 3656 3657 leaf = path->nodes[0]; 3658 slot = path->slots[0]; 3659 3660 old_size = btrfs_item_size_nr(leaf, slot); 3661 if (old_size == new_size) 3662 return; 3663 3664 nritems = btrfs_header_nritems(leaf); 3665 data_end = leaf_data_end(leaf); 3666 3667 old_data_start = btrfs_item_offset_nr(leaf, slot); 3668 3669 size_diff = old_size - new_size; 3670 3671 BUG_ON(slot < 0); 3672 BUG_ON(slot >= nritems); 3673 3674 /* 3675 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3676 */ 3677 /* first correct the data pointers */ 3678 btrfs_init_map_token(&token, leaf); 3679 for (i = slot; i < nritems; i++) { 3680 u32 ioff; 3681 item = btrfs_item_nr(i); 3682 3683 ioff = btrfs_token_item_offset(&token, item); 3684 btrfs_set_token_item_offset(&token, item, ioff + size_diff); 3685 } 3686 3687 /* shift the data */ 3688 if (from_end) { 3689 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3690 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3691 data_end, old_data_start + new_size - data_end); 3692 } else { 3693 struct btrfs_disk_key disk_key; 3694 u64 offset; 3695 3696 btrfs_item_key(leaf, &disk_key, slot); 3697 3698 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 3699 unsigned long ptr; 3700 struct btrfs_file_extent_item *fi; 3701 3702 fi = btrfs_item_ptr(leaf, slot, 3703 struct btrfs_file_extent_item); 3704 fi = (struct btrfs_file_extent_item *)( 3705 (unsigned long)fi - size_diff); 3706 3707 if (btrfs_file_extent_type(leaf, fi) == 3708 BTRFS_FILE_EXTENT_INLINE) { 3709 ptr = btrfs_item_ptr_offset(leaf, slot); 3710 memmove_extent_buffer(leaf, ptr, 3711 (unsigned long)fi, 3712 BTRFS_FILE_EXTENT_INLINE_DATA_START); 3713 } 3714 } 3715 3716 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3717 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 3718 data_end, old_data_start - data_end); 3719 3720 offset = btrfs_disk_key_offset(&disk_key); 3721 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 3722 btrfs_set_item_key(leaf, &disk_key, slot); 3723 if (slot == 0) 3724 fixup_low_keys(path, &disk_key, 1); 3725 } 3726 3727 item = btrfs_item_nr(slot); 3728 btrfs_set_item_size(leaf, item, new_size); 3729 btrfs_mark_buffer_dirty(leaf); 3730 3731 if (btrfs_leaf_free_space(leaf) < 0) { 3732 btrfs_print_leaf(leaf); 3733 BUG(); 3734 } 3735 } 3736 3737 /* 3738 * make the item pointed to by the path bigger, data_size is the added size. 
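 *
 * The new bytes show up at the end of the item's data. A typical caller
 * pattern, as a sketch (names assumed, not from this file):
 *
 *	old_size = btrfs_item_size_nr(leaf, path->slots[0]);
 *	btrfs_extend_item(path, new_bytes);
 *	write_extent_buffer(leaf, src,
 *			    btrfs_item_ptr_offset(leaf, path->slots[0]) +
 *			    old_size, new_bytes);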
3739 */ 3740 void btrfs_extend_item(struct btrfs_path *path, u32 data_size) 3741 { 3742 int slot; 3743 struct extent_buffer *leaf; 3744 struct btrfs_item *item; 3745 u32 nritems; 3746 unsigned int data_end; 3747 unsigned int old_data; 3748 unsigned int old_size; 3749 int i; 3750 struct btrfs_map_token token; 3751 3752 leaf = path->nodes[0]; 3753 3754 nritems = btrfs_header_nritems(leaf); 3755 data_end = leaf_data_end(leaf); 3756 3757 if (btrfs_leaf_free_space(leaf) < data_size) { 3758 btrfs_print_leaf(leaf); 3759 BUG(); 3760 } 3761 slot = path->slots[0]; 3762 old_data = btrfs_item_end_nr(leaf, slot); 3763 3764 BUG_ON(slot < 0); 3765 if (slot >= nritems) { 3766 btrfs_print_leaf(leaf); 3767 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", 3768 slot, nritems); 3769 BUG(); 3770 } 3771 3772 /* 3773 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3774 */ 3775 /* first correct the data pointers */ 3776 btrfs_init_map_token(&token, leaf); 3777 for (i = slot; i < nritems; i++) { 3778 u32 ioff; 3779 item = btrfs_item_nr(i); 3780 3781 ioff = btrfs_token_item_offset(&token, item); 3782 btrfs_set_token_item_offset(&token, item, ioff - data_size); 3783 } 3784 3785 /* shift the data */ 3786 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3787 data_end - data_size, BTRFS_LEAF_DATA_OFFSET + 3788 data_end, old_data - data_end); 3789 3790 data_end = old_data; 3791 old_size = btrfs_item_size_nr(leaf, slot); 3792 item = btrfs_item_nr(slot); 3793 btrfs_set_item_size(leaf, item, old_size + data_size); 3794 btrfs_mark_buffer_dirty(leaf); 3795 3796 if (btrfs_leaf_free_space(leaf) < 0) { 3797 btrfs_print_leaf(leaf); 3798 BUG(); 3799 } 3800 } 3801 3802 /** 3803 * setup_items_for_insert - Helper called before inserting one or more items 3804 * to a leaf. 
Main purpose is to save stack depth by doing the bulk of the work 3805 * in a function that doesn't call btrfs_search_slot 3806 * 3807 * @root: root we are inserting items to 3808 * @path: points to the leaf/slot where we are going to insert new items 3809 * @cpu_key: array of keys for items to be inserted 3810 * @data_size: size of the body of each item we are going to insert 3811 * @nr: size of @cpu_key/@data_size arrays 3812 */ 3813 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 3814 const struct btrfs_key *cpu_key, u32 *data_size, 3815 int nr) 3816 { 3817 struct btrfs_fs_info *fs_info = root->fs_info; 3818 struct btrfs_item *item; 3819 int i; 3820 u32 nritems; 3821 unsigned int data_end; 3822 struct btrfs_disk_key disk_key; 3823 struct extent_buffer *leaf; 3824 int slot; 3825 struct btrfs_map_token token; 3826 u32 total_size; 3827 u32 total_data = 0; 3828 3829 for (i = 0; i < nr; i++) 3830 total_data += data_size[i]; 3831 total_size = total_data + (nr * sizeof(struct btrfs_item)); 3832 3833 if (path->slots[0] == 0) { 3834 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 3835 fixup_low_keys(path, &disk_key, 1); 3836 } 3837 btrfs_unlock_up_safe(path, 1); 3838 3839 leaf = path->nodes[0]; 3840 slot = path->slots[0]; 3841 3842 nritems = btrfs_header_nritems(leaf); 3843 data_end = leaf_data_end(leaf); 3844 3845 if (btrfs_leaf_free_space(leaf) < total_size) { 3846 btrfs_print_leaf(leaf); 3847 btrfs_crit(fs_info, "not enough freespace need %u have %d", 3848 total_size, btrfs_leaf_free_space(leaf)); 3849 BUG(); 3850 } 3851 3852 btrfs_init_map_token(&token, leaf); 3853 if (slot != nritems) { 3854 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 3855 3856 if (old_data < data_end) { 3857 btrfs_print_leaf(leaf); 3858 btrfs_crit(fs_info, 3859 "item at slot %d with data offset %u beyond data end of leaf %u", 3860 slot, old_data, data_end); 3861 BUG(); 3862 } 3863 /* 3864 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3865 */ 3866 /* first correct the data pointers */ 3867 for (i = slot; i < nritems; i++) { 3868 u32 ioff; 3869 3870 item = btrfs_item_nr(i); 3871 ioff = btrfs_token_item_offset(&token, item); 3872 btrfs_set_token_item_offset(&token, item, 3873 ioff - total_data); 3874 } 3875 /* shift the items */ 3876 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3877 btrfs_item_nr_offset(slot), 3878 (nritems - slot) * sizeof(struct btrfs_item)); 3879 3880 /* shift the data */ 3881 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 3882 data_end - total_data, BTRFS_LEAF_DATA_OFFSET + 3883 data_end, old_data - data_end); 3884 data_end = old_data; 3885 } 3886 3887 /* setup the item for the new data */ 3888 for (i = 0; i < nr; i++) { 3889 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); 3890 btrfs_set_item_key(leaf, &disk_key, slot + i); 3891 item = btrfs_item_nr(slot + i); 3892 data_end -= data_size[i]; 3893 btrfs_set_token_item_offset(&token, item, data_end); 3894 btrfs_set_token_item_size(&token, item, data_size[i]); 3895 } 3896 3897 btrfs_set_header_nritems(leaf, nritems + nr); 3898 btrfs_mark_buffer_dirty(leaf); 3899 3900 if (btrfs_leaf_free_space(leaf) < 0) { 3901 btrfs_print_leaf(leaf); 3902 BUG(); 3903 } 3904 } 3905 3906 /* 3907 * Given a key and some data, insert items into the tree. 3908 * This does all the path init required, making room in the tree if needed. 
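 *
 * On success the path points at the first of the newly reserved items;
 * the caller is expected to copy the item bodies in and mark the leaf
 * dirty, as btrfs_insert_item() below does for the single item case.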
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     const struct btrfs_key *cpu_key, u32 *data_size,
			     int nr)
{
	int ret = 0;
	int slot;
	int i;
	u32 total_size = 0;
	u32 total_data = 0;

	for (i = 0; i < nr; i++)
		total_data += data_size[i];

	total_size = total_data + (nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(root, path, cpu_key, data_size, nr);
	return 0;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;
}
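
/*
 * Usage sketch contrasting the two insert paths (illustrative; 'key' and
 * 'val' are hypothetical): btrfs_insert_item() above is the fire-and-forget
 * form, while a caller that still needs the leaf locked afterwards inserts
 * the empty item and fills it in place:
 *
 *	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(val));
 *	if (ret)
 *		goto out;
 *	write_extent_buffer(path->nodes[0], &val,
 *			    btrfs_item_ptr_offset(path->nodes[0],
 *						  path->slots[0]),
 *			    sizeof(val));
 *	btrfs_mark_buffer_dirty(path->nodes[0]);
 *	// path still points at the new item here
 */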
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}

/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent. zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing. path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}

/*
 * delete the item at the leaf level in path. If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item, ioff + dsize);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty. Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
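
/*
 * Usage sketch for btrfs_del_items() (illustrative; 'key' is hypothetical):
 * a single-item delete finds the item with a COW search (negative ins_len
 * asks btrfs_search_slot to balance for deletion) and removes it at the
 * slot the search returned:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret < 0)
 *		return ret;
 *	ASSERT(ret == 0);	// the key must exist
 *	ret = btrfs_del_item(trans, root, path);	// the nr == 1 wrapper
 *	btrfs_release_path(path);
 */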
/*
 * search the tree again to find a leaf with lesser keys
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}
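
/*
 * Worked example of the previous-key computation above: for key
 * (objectid=256, type=108, offset=0) the offset is already 0, so we borrow
 * from type and saturate offset, yielding (256, 107, (u64)-1). Only when
 * type is also 0 do we borrow from objectid, and (0, 0, 0) has no
 * predecessor at all, hence the "return 1". The search then lands on the
 * last item that sorts at or before the computed key.
 */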
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through. Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						   min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
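
/*
 * Usage sketch for btrfs_search_forward() (illustrative, in the style of
 * the defrag/tree-log callers): scan everything touched since 'min_trans',
 * restarting from just past what was returned:
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (1) {
 *		ret = btrfs_search_forward(root, &min_key, path, min_trans);
 *		if (ret)	// < 0 on error, 1 when nothing newer is left
 *			break;
 *		// ... process the result at path->slots[0] ...
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;		// hypothetical overflow guard
 *		min_key.offset++;	// simplistic; real callers bump
 *					// whichever key component suits them
 *	}
 */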
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path. It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}

/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	return btrfs_next_old_leaf(root, path, 0);
}
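
/*
 * Usage sketch for btrfs_next_leaf() (illustrative): the canonical pattern
 * for walking every item of a tree in key order:
 *
 *	key.objectid = 0; key.type = 0; key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (1) {
 *		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)	// 1 means end of tree, < 0 error
 *				break;
 *			continue;
 *		}
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *		// ... process the item at path->slots[0] ...
 *		path->slots[0]++;
 *	}
 *	btrfs_release_path(path);
 */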
4466 * 4467 * And a bit more explanation about this check, 4468 * with ret > 0, the key isn't found, the path points to the slot 4469 * where it should be inserted, so the path->slots[0] item must be the 4470 * bigger one. 4471 */ 4472 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { 4473 ret = 0; 4474 goto done; 4475 } 4476 4477 while (level < BTRFS_MAX_LEVEL) { 4478 if (!path->nodes[level]) { 4479 ret = 1; 4480 goto done; 4481 } 4482 4483 slot = path->slots[level] + 1; 4484 c = path->nodes[level]; 4485 if (slot >= btrfs_header_nritems(c)) { 4486 level++; 4487 if (level == BTRFS_MAX_LEVEL) { 4488 ret = 1; 4489 goto done; 4490 } 4491 continue; 4492 } 4493 4494 4495 /* 4496 * Our current level is where we're going to start from, and to 4497 * make sure lockdep doesn't complain we need to drop our locks 4498 * and nodes from 0 to our current level. 4499 */ 4500 for (i = 0; i < level; i++) { 4501 if (path->locks[level]) { 4502 btrfs_tree_read_unlock(path->nodes[i]); 4503 path->locks[i] = 0; 4504 } 4505 free_extent_buffer(path->nodes[i]); 4506 path->nodes[i] = NULL; 4507 } 4508 4509 next = c; 4510 ret = read_block_for_search(root, path, &next, level, 4511 slot, &key); 4512 if (ret == -EAGAIN) 4513 goto again; 4514 4515 if (ret < 0) { 4516 btrfs_release_path(path); 4517 goto done; 4518 } 4519 4520 if (!path->skip_locking) { 4521 ret = btrfs_try_tree_read_lock(next); 4522 if (!ret && time_seq) { 4523 /* 4524 * If we don't get the lock, we may be racing 4525 * with push_leaf_left, holding that lock while 4526 * itself waiting for the leaf we've currently 4527 * locked. To solve this situation, we give up 4528 * on our lock and cycle. 4529 */ 4530 free_extent_buffer(next); 4531 btrfs_release_path(path); 4532 cond_resched(); 4533 goto again; 4534 } 4535 if (!ret) 4536 btrfs_tree_read_lock(next); 4537 } 4538 break; 4539 } 4540 path->slots[level] = slot; 4541 while (1) { 4542 level--; 4543 path->nodes[level] = next; 4544 path->slots[level] = 0; 4545 if (!path->skip_locking) 4546 path->locks[level] = BTRFS_READ_LOCK; 4547 if (!level) 4548 break; 4549 4550 ret = read_block_for_search(root, path, &next, level, 4551 0, &key); 4552 if (ret == -EAGAIN) 4553 goto again; 4554 4555 if (ret < 0) { 4556 btrfs_release_path(path); 4557 goto done; 4558 } 4559 4560 if (!path->skip_locking) 4561 btrfs_tree_read_lock(next); 4562 } 4563 ret = 0; 4564 done: 4565 unlock_up(path, 0, 1, 0, NULL); 4566 4567 return ret; 4568 } 4569 4570 /* 4571 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 4572 * searching until it gets past min_objectid or finds an item of 'type' 4573 * 4574 * returns 0 if something is found, 1 if nothing was found and < 0 on error 4575 */ 4576 int btrfs_previous_item(struct btrfs_root *root, 4577 struct btrfs_path *path, u64 min_objectid, 4578 int type) 4579 { 4580 struct btrfs_key found_key; 4581 struct extent_buffer *leaf; 4582 u32 nritems; 4583 int ret; 4584 4585 while (1) { 4586 if (path->slots[0] == 0) { 4587 ret = btrfs_prev_leaf(root, path); 4588 if (ret != 0) 4589 return ret; 4590 } else { 4591 path->slots[0]--; 4592 } 4593 leaf = path->nodes[0]; 4594 nritems = btrfs_header_nritems(leaf); 4595 if (nritems == 0) 4596 return 1; 4597 if (path->slots[0] == nritems) 4598 path->slots[0]--; 4599 4600 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4601 if (found_key.objectid < min_objectid) 4602 break; 4603 if (found_key.type == type) 4604 return 0; 4605 if (found_key.objectid == min_objectid && 4606 found_key.type < type) 4607 break; 4608 } 4609 return 1; 
/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}