// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * Set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling.
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
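/*
 * A typical path lifecycle, sketched for orientation (the search call is
 * btrfs_search_slot(), defined later in this file; error handling trimmed):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 *	...use path->nodes[0] / path->slots[0]...
 *	btrfs_free_path(path);	releases locks and buffer refs, then frees
 */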
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of the
 * tree until you end up with a read lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* Cowonly roots (everything that is not a reference-counted COW subvolume)
 * just get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
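/*
 * Sketch of how the sequence numbers below are typically consumed (the
 * exact pattern lives in callers such as the backref walking code, not
 * here):
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	...query an old version of a tree at sequence elem.seq...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While elem sits on tree_mod_seq_list it acts as a blocker: log entries
 * newer than the oldest remaining blocker are preserved for the reader.
 */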
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set.  So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
/*
 * Determines if logging can be omitted.  Returns 1 if it can.  Otherwise, it
 * returns zero with the tree_mod_log_lock acquired.  The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
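/*
 * The helpers above combine into a fixed pattern, shown here as a sketch
 * (this is the shape of tree_mod_log_insert_key() and of the insert
 * helpers that follow):
 *
 *	if (!tree_mod_need_log(fs_info, eb))	unlocked fast path: no readers
 *		return 0;
 *	...allocate tree_mod_elem(s) with GFP_NOFS, outside the lock...
 *	if (tree_mod_dont_log(fs_info, eb))	recheck; returns 0 with the
 *		goto free;			write lock held, or 1 if the
 *						last reader went away
 *	__tree_mod_log_insert(fs_info, tm);	requires the write lock
 *	write_unlock(&fs_info->tree_mod_log_lock);
 */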
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item).  any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item).  any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared.  If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
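/*
 * Illustration with made-up numbers: in a subvolume whose last snapshot was
 * taken at generation 150, a non-root block written at generation 100
 * already existed when that snapshot was created, so it may be shared with
 * the snapshot; a block written at generation 200 can only be reachable
 * from the current tree and is known not to be shared.
 */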
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
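/*
 * Worked example (illustrative): COWing a block with refs == 2 that is
 * still owned by this root and has no FULL_BACKREF flag set.  The original
 * block stays referenced by the other tree, so the code above adds a full
 * backref to the original (btrfs_inc_ref(.., buf, 1)), records
 * BTRFS_BLOCK_FLAG_FULL_BACKREF in its extent flags and leaves *last_ref
 * at 0.  Only when refs == 1 is the original cleaned and *last_ref set so
 * that the caller frees it.
 */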
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	/*
	 * If we are COWing a node/leaf from the extent, chunk or device trees,
	 * make sure that we do not finish block group creation of pending block
	 * groups.  We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk.  Creation of a pending block
	 * group modifies the extent, chunk and device trees, therefore we could
	 * deadlock with ourselves since we are holding a lock on an extent
	 * buffer that btrfs_create_pending_block_groups() may try to COW later.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root)
		trans->can_flush_pending_bgs = false;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	trans->can_flush_pending_bgs = true;
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all).  this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it.  this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root.  normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
/*
 * tm is a pointer to the first operation to rewind within eb.  then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification.  as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special.  for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root.  in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root.  we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
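/*
 * Example of the inversion above (illustrative): if the two most recent
 * logged operations for a node were, oldest first, REPLACE(slot 1) and
 * ADD(slot 4), the rewind visits them newest first: the ADD is undone by
 * decrementing nritems back past slot 4, then the REPLACE is undone by
 * restoring the saved key/blockptr/generation into slot 1.
 */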
/*
 * Called with eb read locked.  If the buffer cannot be rewound, the same buffer
 * is returned.  If rewind operations happen, a fresh buffer is returned.  The
 * returned buffer is always read-locked.  If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value.  If there are no changes, the current root->root_node is returned.  If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done.  In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}
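/*
 * How time_seq is consumed end to end, as a sketch: btrfs_search_old_slot()
 * (later in this file) starts from get_old_root() and then applies
 * tree_mod_log_rewind() to every node it descends through, so the whole
 * walk observes the tree as it was when btrfs_get_tree_mod_seq() handed
 * out that sequence number.
 */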
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid,
		     fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
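/*
 * Example ordering (illustrative values): the key (256 DIR_ITEM 1234)
 * sorts before (256 EXTENT_DATA 0) because the objectids tie and
 * BTRFS_DIR_ITEM_KEY (84) is smaller than BTRFS_EXTENT_DATA_KEY (108);
 * offset only breaks ties when both objectid and type are equal.
 */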
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1, &first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
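/*
 * Illustration of the "close" test used above (made-up numbers): with a
 * 16KiB nodesize, blocks at 0 and 40960 count as close because the gap
 * 40960 - (0 + 16384) = 24576 is below the 32KiB threshold in
 * close_blocks(), so defrag leaves them alone; a gap of 32KiB or more
 * triggers the COW relocation.
 */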
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}
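/*
 * Usage sketch for the search above: a caller typically does
 *
 *	ret = btrfs_bin_search(eb, &key, level, &slot);
 *
 * and gets ret == 0 with 'slot' at the exact match, or ret == 1 with
 * 'slot' at the position where the key would have to be inserted
 * (possibly equal to nritems).  A negative return signals an error.
 */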
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
static noinline struct extent_buffer *
read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
	       int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(fs_info, mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, fs_info, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, fs_info, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			clean_tree_block(fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, fs_info, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, fs_info, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
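/*
 * balance_level() above protects deletions by keeping nodes from becoming
 * nearly empty on the way down.  push_nodes_for_insert() below is the
 * insertion-time counterpart; per its comment it only splits or pushes
 * nodes that are completely full, and since the walk is top down it has
 * to be pessimistic about what the lower levels will need.
 */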

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, fs_info,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, fs_info,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	eb = find_extent_buffer(fs_info, search);
	if (eb) {
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			readahead_tree_block(fs_info, search);
			nread += blocksize;
		}
		nscan++;
		if ((nread > 65536 || nscan > 32))
			break;
	}
}

static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
				       struct btrfs_path *path, int level)
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0) {
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
		eb = find_extent_buffer(fs_info, block1);
		/*
		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
		 * don't want to return EAGAIN here.  That will loop
		 * forever
		 */
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
			block1 = 0;
		free_extent_buffer(eb);
	}
	if (slot + 1 < nritems) {
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
		eb = find_extent_buffer(fs_info, block2);
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
			block2 = 0;
		free_extent_buffer(eb);
	}

	if (block1)
		readahead_tree_block(fs_info, block1);
	if (block2)
		readahead_tree_block(fs_info, block2);
}
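
/*
 * Illustration of the readahead window used above: reada_for_search()
 * only issues readahead for blocks whose bytenr is within 64K (65536
 * bytes) of the target block, and it gives up once it has queued more
 * than 64K worth of blocks or inspected more than 32 slots, whichever
 * comes first.
 */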

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
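
/*
 * Example of the slot 0 rule in unlock_up(): if the path went through
 * slot 0 at level 1, inserting a new lowest key in the leaf also means
 * rewriting the key pointer at level 1 (and higher, if that node is
 * itself at slot 0 of its parent), so those levels must stay locked.
 * A path through a middle slot can unlock everything above
 * lowest_unlock right away.
 */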

/*
 * This releases any locks held in the path starting at level and
 * going all the way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few
 * corner cases, such as COW of the block at slot zero in the node.  This
 * ignores those rules, and it should only be called when there are no
 * more updates to be done higher up in the tree.
 */
noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
		path->locks[i] = 0;
	}
}

/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	parent_level = btrfs_header_level(b);
	btrfs_node_key_to_cpu(b, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}
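
/*
 * Callers handle the -EAGAIN above by restarting the whole search from
 * the root, e.g. the pattern used by btrfs_search_slot() below (sketch):
 *
 *	again:
 *		b = btrfs_search_slot_get_root(root, p, write_lock_level);
 *		while (b) {
 *			...
 *			err = read_block_for_search(root, p, &b, level,
 *						    slot, key);
 *			if (err == -EAGAIN)
 *				goto again;
 *			...
 *		}
 */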

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}

static void key_search_validate(struct extent_buffer *b,
				const struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;

	btrfs_cpu_key_to_disk(&disk_key, key);

	if (level == 0)
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_leaf, items[0].key),
		    sizeof(disk_key)));
	else
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_node, ptrs[0].key),
		    sizeof(disk_key)));
#endif
}

static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp != 0) {
		*prev_cmp = btrfs_bin_search(b, key, level, slot);
		return *prev_cmp;
	}

	key_search_validate(b, key, level);
	*slot = 0;

	return 0;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		u64 iobjectid, u64 ioff, u8 key_type,
		struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
			found_key->objectid != key.objectid)
		return 1;

	return 0;
}
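
/*
 * Minimal usage sketch for btrfs_find_item() (illustrative; the caller,
 * objectid and key type are made up):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key found;
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_find_item(fs_root, path, objectid, 0,
 *			      BTRFS_INODE_ITEM_KEY, &found);
 *	btrfs_free_path(path);
 *
 * ret == 0 means 'found' was filled in and path->slots[0] pointed at the
 * item before the path was freed, ret == 1 means no matching item, and
 * ret < 0 is an error.
 */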

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/* The commit roots are read only so we always do read locks */
		if (p->need_commit_sem)
			down_read(&fs_info->commit_root_sem);
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (p->need_commit_sem)
			up_read(&fs_info->commit_root_sem);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search, for inserts it is 1, for
 *		deletions it's -1. 0 for plain searches
 * @cow:	boolean should CoW operations be performed. Must always be 1
 *		when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave the path with blocking locks to avoid a massive
		 * number of lock context switches; this is done on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level != 0) {
			int dec = 0;
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
						     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
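
/*
 * A typical read-only lookup with btrfs_search_slot() looks roughly
 * like this (illustrative sketch; the objectid is made up):
 *
 *	struct btrfs_key key;
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *
 * On ret == 0 the item is at path->nodes[0], path->slots[0]; on ret == 1
 * the slot is where the key would be inserted.  Modifying callers pass a
 * transaction handle instead of NULL, cow == 1 and a nonzero ins_len.
 */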

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
			}
			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}

/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted.  Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}

/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
 *
 */
static void fixup_low_keys(struct btrfs_path *path,
			   struct btrfs_disk_key *key, int level)
{
	int i;
	struct extent_buffer *t;
	int ret;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		int tslot = path->slots[i];

		if (!path->nodes[i])
			break;
		t = path->nodes[i];
		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
				GFP_ATOMIC);
		BUG_ON(ret < 0);
		btrfs_set_node_key(t, key, tslot);
		btrfs_mark_buffer_dirty(path->nodes[i]);
		if (tslot != 0)
			break;
	}
}

/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
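
/*
 * Example of the fixup_low_keys() stopping rule used above: if the leaf
 * sits at slot 0 of its parent, but the parent sits at slot 3 of the
 * grandparent, only the parent's key pointer is rewritten.  The walk
 * stops at the first level that is not at slot 0, because changing a
 * key that is not the lowest one in a block never affects the levels
 * above it.
 */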

/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
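
/*
 * A worked example of the non-empty case above: with src_nritems == 20
 * and room for 100 more pointers in dst, push_items starts at 100 and
 * is clamped to src_nritems - 8 == 12, so 12 pointers move left and the
 * source keeps 8, matching the "leave at least 8" rule.
 */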

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
			      btrfs_node_key_ptr_offset(0),
			      (dst_nritems) *
			      sizeof(struct btrfs_key_ptr));

	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
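
/*
 * A worked example of the cap above: with src_nritems == 21, max_push
 * is 21 / 2 + 1 == 11, so at most 11 pointers move right and the source
 * keeps at least 10; sources with fewer than 4 pointers are never
 * touched at all.
 */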

/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	old = root->node;
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}

/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	if (slot != nritems) {
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
					nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
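
/*
 * Illustration of insert_new_root(): the tree grows upward, not
 * sideways.  Before: root -> [old node].  After: root -> [new node with
 * a single pointer] -> [old node].  Callers such as split_node() below
 * then have a parent with room for the pointer to the second half of
 * the split.
 */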

/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, let's make a new one
		 *
		 * tree mod log: We don't log removal of the old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	insert_ptr(trans, fs_info, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}

/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
	struct btrfs_item *start_item;
	struct btrfs_item *end_item;
	struct btrfs_map_token token;
	int data_len;
	int nritems = btrfs_header_nritems(l);
	int end = min(nritems, start + nr) - 1;

	if (!nr)
		return 0;
	btrfs_init_map_token(&token);
	start_item = btrfs_item_nr(start);
	end_item = btrfs_item_nr(end);
	data_len = btrfs_token_item_offset(l, start_item, &token) +
		btrfs_token_item_size(l, start_item, &token);
	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
	data_len += sizeof(struct btrfs_item) * nr;
	WARN_ON(data_len < 0);
	return data_len;
}

/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf)
{
	int nritems = btrfs_header_nritems(leaf);
	int ret;

	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
		btrfs_crit(fs_info,
			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
			   ret,
			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
			   leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
}
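
/*
 * A worked example of the accounting above: BTRFS_LEAF_DATA_SIZE() is
 * the node size minus the leaf header, and every item costs
 * sizeof(struct btrfs_item) (25 bytes on disk) of metadata plus its
 * data bytes.  Ten items with 100 bytes of data each therefore consume
 * 10 * (25 + 100) == 1250 bytes of the leaf.
 */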

/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(fs_info, left);

	/* make room in the right data area */
	data_end = leaf_data_end(fs_info, right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
		     push_space);

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(fs_info, left);

	btrfs_mark_buffer_dirty(right);

	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(fs_info, upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(fs_info, path, min_data_size, empty,
				 right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(fs_info, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		     right_nritems);

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(fs_info, right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(fs_info, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(fs_info, right);

	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = read_node_slot(fs_info, path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	return __push_leaf_left(fs_info, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4000 */ 4001 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 4002 struct btrfs_fs_info *fs_info, 4003 struct btrfs_path *path, 4004 struct extent_buffer *l, 4005 struct extent_buffer *right, 4006 int slot, int mid, int nritems) 4007 { 4008 int data_copy_size; 4009 int rt_data_off; 4010 int i; 4011 struct btrfs_disk_key disk_key; 4012 struct btrfs_map_token token; 4013 4014 btrfs_init_map_token(&token); 4015 4016 nritems = nritems - mid; 4017 btrfs_set_header_nritems(right, nritems); 4018 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l); 4019 4020 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 4021 btrfs_item_nr_offset(mid), 4022 nritems * sizeof(struct btrfs_item)); 4023 4024 copy_extent_buffer(right, l, 4025 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - 4026 data_copy_size, BTRFS_LEAF_DATA_OFFSET + 4027 leaf_data_end(fs_info, l), data_copy_size); 4028 4029 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); 4030 4031 for (i = 0; i < nritems; i++) { 4032 struct btrfs_item *item = btrfs_item_nr(i); 4033 u32 ioff; 4034 4035 ioff = btrfs_token_item_offset(right, item, &token); 4036 btrfs_set_token_item_offset(right, item, 4037 ioff + rt_data_off, &token); 4038 } 4039 4040 btrfs_set_header_nritems(l, mid); 4041 btrfs_item_key(right, &disk_key, 0); 4042 insert_ptr(trans, fs_info, path, &disk_key, right->start, 4043 path->slots[1] + 1, 1); 4044 4045 btrfs_mark_buffer_dirty(right); 4046 btrfs_mark_buffer_dirty(l); 4047 BUG_ON(path->slots[0] != slot); 4048 4049 if (mid <= slot) { 4050 btrfs_tree_unlock(path->nodes[0]); 4051 free_extent_buffer(path->nodes[0]); 4052 path->nodes[0] = right; 4053 path->slots[0] -= mid; 4054 path->slots[1] += 1; 4055 } else { 4056 btrfs_tree_unlock(right); 4057 free_extent_buffer(right); 4058 } 4059 4060 BUG_ON(path->slots[0] < 0); 4061 } 4062 4063 /* 4064 * double splits happen when we need to insert a big item in the middle 4065 * of a leaf. A double split can leave us with 3 mostly empty leaves: 4066 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 4067 * A B C 4068 * 4069 * We avoid this by trying to push the items on either side of our target 4070 * into the adjacent leaves. If all goes well we can avoid the double split 4071 * completely. 4072 */ 4073 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 4074 struct btrfs_root *root, 4075 struct btrfs_path *path, 4076 int data_size) 4077 { 4078 struct btrfs_fs_info *fs_info = root->fs_info; 4079 int ret; 4080 int progress = 0; 4081 int slot; 4082 u32 nritems; 4083 int space_needed = data_size; 4084 4085 slot = path->slots[0]; 4086 if (slot < btrfs_header_nritems(path->nodes[0])) 4087 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); 4088 4089 /* 4090 * try to push all the items after our slot into the 4091 * right leaf 4092 */ 4093 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 4094 if (ret < 0) 4095 return ret; 4096 4097 if (ret == 0) 4098 progress++; 4099 4100 nritems = btrfs_header_nritems(path->nodes[0]); 4101 /* 4102 * our goal is to get our slot at the start or end of a leaf. 
If 4103 * we've done so we're done 4104 */ 4105 if (path->slots[0] == 0 || path->slots[0] == nritems) 4106 return 0; 4107 4108 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) 4109 return 0; 4110 4111 /* try to push all the items before our slot into the next leaf */ 4112 slot = path->slots[0]; 4113 space_needed = data_size; 4114 if (slot > 0) 4115 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); 4116 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 4117 if (ret < 0) 4118 return ret; 4119 4120 if (ret == 0) 4121 progress++; 4122 4123 if (progress) 4124 return 0; 4125 return 1; 4126 } 4127 4128 /* 4129 * split the path's leaf in two, making sure there is at least data_size 4130 * available for the resulting leaf level of the path. 4131 * 4132 * returns 0 if all went well and < 0 on failure. 4133 */ 4134 static noinline int split_leaf(struct btrfs_trans_handle *trans, 4135 struct btrfs_root *root, 4136 const struct btrfs_key *ins_key, 4137 struct btrfs_path *path, int data_size, 4138 int extend) 4139 { 4140 struct btrfs_disk_key disk_key; 4141 struct extent_buffer *l; 4142 u32 nritems; 4143 int mid; 4144 int slot; 4145 struct extent_buffer *right; 4146 struct btrfs_fs_info *fs_info = root->fs_info; 4147 int ret = 0; 4148 int wret; 4149 int split; 4150 int num_doubles = 0; 4151 int tried_avoid_double = 0; 4152 4153 l = path->nodes[0]; 4154 slot = path->slots[0]; 4155 if (extend && data_size + btrfs_item_size_nr(l, slot) + 4156 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 4157 return -EOVERFLOW; 4158 4159 /* first try to make some room by pushing left and right */ 4160 if (data_size && path->nodes[1]) { 4161 int space_needed = data_size; 4162 4163 if (slot < btrfs_header_nritems(l)) 4164 space_needed -= btrfs_leaf_free_space(fs_info, l); 4165 4166 wret = push_leaf_right(trans, root, path, space_needed, 4167 space_needed, 0, 0); 4168 if (wret < 0) 4169 return wret; 4170 if (wret) { 4171 space_needed = data_size; 4172 if (slot > 0) 4173 space_needed -= btrfs_leaf_free_space(fs_info, 4174 l); 4175 wret = push_leaf_left(trans, root, path, space_needed, 4176 space_needed, 0, (u32)-1); 4177 if (wret < 0) 4178 return wret; 4179 } 4180 l = path->nodes[0]; 4181 4182 /* did the pushes work? 
*/ 4183 if (btrfs_leaf_free_space(fs_info, l) >= data_size) 4184 return 0; 4185 } 4186 4187 if (!path->nodes[1]) { 4188 ret = insert_new_root(trans, root, path, 1); 4189 if (ret) 4190 return ret; 4191 } 4192 again: 4193 split = 1; 4194 l = path->nodes[0]; 4195 slot = path->slots[0]; 4196 nritems = btrfs_header_nritems(l); 4197 mid = (nritems + 1) / 2; 4198 4199 if (mid <= slot) { 4200 if (nritems == 1 || 4201 leaf_space_used(l, mid, nritems - mid) + data_size > 4202 BTRFS_LEAF_DATA_SIZE(fs_info)) { 4203 if (slot >= nritems) { 4204 split = 0; 4205 } else { 4206 mid = slot; 4207 if (mid != nritems && 4208 leaf_space_used(l, mid, nritems - mid) + 4209 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 4210 if (data_size && !tried_avoid_double) 4211 goto push_for_double; 4212 split = 2; 4213 } 4214 } 4215 } 4216 } else { 4217 if (leaf_space_used(l, 0, mid) + data_size > 4218 BTRFS_LEAF_DATA_SIZE(fs_info)) { 4219 if (!extend && data_size && slot == 0) { 4220 split = 0; 4221 } else if ((extend || !data_size) && slot == 0) { 4222 mid = 1; 4223 } else { 4224 mid = slot; 4225 if (mid != nritems && 4226 leaf_space_used(l, mid, nritems - mid) + 4227 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 4228 if (data_size && !tried_avoid_double) 4229 goto push_for_double; 4230 split = 2; 4231 } 4232 } 4233 } 4234 } 4235 4236 if (split == 0) 4237 btrfs_cpu_key_to_disk(&disk_key, ins_key); 4238 else 4239 btrfs_item_key(l, &disk_key, mid); 4240 4241 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, 4242 &disk_key, 0, l->start, 0); 4243 if (IS_ERR(right)) 4244 return PTR_ERR(right); 4245 4246 root_add_used(root, fs_info->nodesize); 4247 4248 if (split == 0) { 4249 if (mid <= slot) { 4250 btrfs_set_header_nritems(right, 0); 4251 insert_ptr(trans, fs_info, path, &disk_key, 4252 right->start, path->slots[1] + 1, 1); 4253 btrfs_tree_unlock(path->nodes[0]); 4254 free_extent_buffer(path->nodes[0]); 4255 path->nodes[0] = right; 4256 path->slots[0] = 0; 4257 path->slots[1] += 1; 4258 } else { 4259 btrfs_set_header_nritems(right, 0); 4260 insert_ptr(trans, fs_info, path, &disk_key, 4261 right->start, path->slots[1], 1); 4262 btrfs_tree_unlock(path->nodes[0]); 4263 free_extent_buffer(path->nodes[0]); 4264 path->nodes[0] = right; 4265 path->slots[0] = 0; 4266 if (path->slots[1] == 0) 4267 fixup_low_keys(path, &disk_key, 1); 4268 } 4269 /* 4270 * We create a new leaf 'right' for the required ins_len and 4271 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 4272 * the content of ins_len to 'right'. 
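		 *
		 * In other words, in the split == 0 case no existing items
		 * move at all: the new leaf starts out empty (nritems == 0
		 * above) and will only receive the item the caller is about
		 * to insert.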
4273 */ 4274 return ret; 4275 } 4276 4277 copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems); 4278 4279 if (split == 2) { 4280 BUG_ON(num_doubles != 0); 4281 num_doubles++; 4282 goto again; 4283 } 4284 4285 return 0; 4286 4287 push_for_double: 4288 push_for_double_split(trans, root, path, data_size); 4289 tried_avoid_double = 1; 4290 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) 4291 return 0; 4292 goto again; 4293 } 4294 4295 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 4296 struct btrfs_root *root, 4297 struct btrfs_path *path, int ins_len) 4298 { 4299 struct btrfs_fs_info *fs_info = root->fs_info; 4300 struct btrfs_key key; 4301 struct extent_buffer *leaf; 4302 struct btrfs_file_extent_item *fi; 4303 u64 extent_len = 0; 4304 u32 item_size; 4305 int ret; 4306 4307 leaf = path->nodes[0]; 4308 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4309 4310 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 4311 key.type != BTRFS_EXTENT_CSUM_KEY); 4312 4313 if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len) 4314 return 0; 4315 4316 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 4317 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4318 fi = btrfs_item_ptr(leaf, path->slots[0], 4319 struct btrfs_file_extent_item); 4320 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 4321 } 4322 btrfs_release_path(path); 4323 4324 path->keep_locks = 1; 4325 path->search_for_split = 1; 4326 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4327 path->search_for_split = 0; 4328 if (ret > 0) 4329 ret = -EAGAIN; 4330 if (ret < 0) 4331 goto err; 4332 4333 ret = -EAGAIN; 4334 leaf = path->nodes[0]; 4335 /* if our item isn't there, return now */ 4336 if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) 4337 goto err; 4338 4339 /* the leaf has changed, it now has room. 
return now */
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}

static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			   path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
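 *
 * For example (illustrative sizes only): splitting a 100 byte item at
 * split_offset 40 leaves the original item holding bytes [0, 40) of
 * the old data, while the new item, carrying 'new_key', holds the
 * remaining bytes [40, 100) in the same leaf.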
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(root->fs_info, path, new_key, split_offset);
	return ret;
}

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item
 * is contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	setup_items_for_insert(root, path, new_key, &item_size,
			       item_size, item_size +
			       sizeof(struct btrfs_item), 1);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}

/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size ..
data0.size 4541 */ 4542 /* first correct the data pointers */ 4543 for (i = slot; i < nritems; i++) { 4544 u32 ioff; 4545 item = btrfs_item_nr(i); 4546 4547 ioff = btrfs_token_item_offset(leaf, item, &token); 4548 btrfs_set_token_item_offset(leaf, item, 4549 ioff + size_diff, &token); 4550 } 4551 4552 /* shift the data */ 4553 if (from_end) { 4554 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4555 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 4556 data_end, old_data_start + new_size - data_end); 4557 } else { 4558 struct btrfs_disk_key disk_key; 4559 u64 offset; 4560 4561 btrfs_item_key(leaf, &disk_key, slot); 4562 4563 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4564 unsigned long ptr; 4565 struct btrfs_file_extent_item *fi; 4566 4567 fi = btrfs_item_ptr(leaf, slot, 4568 struct btrfs_file_extent_item); 4569 fi = (struct btrfs_file_extent_item *)( 4570 (unsigned long)fi - size_diff); 4571 4572 if (btrfs_file_extent_type(leaf, fi) == 4573 BTRFS_FILE_EXTENT_INLINE) { 4574 ptr = btrfs_item_ptr_offset(leaf, slot); 4575 memmove_extent_buffer(leaf, ptr, 4576 (unsigned long)fi, 4577 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4578 } 4579 } 4580 4581 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4582 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 4583 data_end, old_data_start - data_end); 4584 4585 offset = btrfs_disk_key_offset(&disk_key); 4586 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4587 btrfs_set_item_key(leaf, &disk_key, slot); 4588 if (slot == 0) 4589 fixup_low_keys(path, &disk_key, 1); 4590 } 4591 4592 item = btrfs_item_nr(slot); 4593 btrfs_set_item_size(leaf, item, new_size); 4594 btrfs_mark_buffer_dirty(leaf); 4595 4596 if (btrfs_leaf_free_space(fs_info, leaf) < 0) { 4597 btrfs_print_leaf(leaf); 4598 BUG(); 4599 } 4600 } 4601 4602 /* 4603 * make the item pointed to by the path bigger, data_size is the added size. 4604 */ 4605 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 4606 u32 data_size) 4607 { 4608 int slot; 4609 struct extent_buffer *leaf; 4610 struct btrfs_item *item; 4611 u32 nritems; 4612 unsigned int data_end; 4613 unsigned int old_data; 4614 unsigned int old_size; 4615 int i; 4616 struct btrfs_map_token token; 4617 4618 btrfs_init_map_token(&token); 4619 4620 leaf = path->nodes[0]; 4621 4622 nritems = btrfs_header_nritems(leaf); 4623 data_end = leaf_data_end(fs_info, leaf); 4624 4625 if (btrfs_leaf_free_space(fs_info, leaf) < data_size) { 4626 btrfs_print_leaf(leaf); 4627 BUG(); 4628 } 4629 slot = path->slots[0]; 4630 old_data = btrfs_item_end_nr(leaf, slot); 4631 4632 BUG_ON(slot < 0); 4633 if (slot >= nritems) { 4634 btrfs_print_leaf(leaf); 4635 btrfs_crit(fs_info, "slot %d too large, nritems %d", 4636 slot, nritems); 4637 BUG_ON(1); 4638 } 4639 4640 /* 4641 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4642 */ 4643 /* first correct the data pointers */ 4644 for (i = slot; i < nritems; i++) { 4645 u32 ioff; 4646 item = btrfs_item_nr(i); 4647 4648 ioff = btrfs_token_item_offset(leaf, item, &token); 4649 btrfs_set_token_item_offset(leaf, item, 4650 ioff - data_size, &token); 4651 } 4652 4653 /* shift the data */ 4654 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4655 data_end - data_size, BTRFS_LEAF_DATA_OFFSET + 4656 data_end, old_data - data_end); 4657 4658 data_end = old_data; 4659 old_size = btrfs_item_size_nr(leaf, slot); 4660 item = btrfs_item_nr(slot); 4661 btrfs_set_item_size(leaf, item, old_size + data_size); 4662 btrfs_mark_buffer_dirty(leaf); 4663 4664 if (btrfs_leaf_free_space(fs_info, leaf) < 0) { 4665 btrfs_print_leaf(leaf); 4666 BUG(); 4667 } 4668 } 4669 4670 /* 4671 * this is a helper for btrfs_insert_empty_items, the main goal here is 4672 * to save stack depth by doing the bulk of the work in a function 4673 * that doesn't call btrfs_search_slot 4674 */ 4675 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 4676 const struct btrfs_key *cpu_key, u32 *data_size, 4677 u32 total_data, u32 total_size, int nr) 4678 { 4679 struct btrfs_fs_info *fs_info = root->fs_info; 4680 struct btrfs_item *item; 4681 int i; 4682 u32 nritems; 4683 unsigned int data_end; 4684 struct btrfs_disk_key disk_key; 4685 struct extent_buffer *leaf; 4686 int slot; 4687 struct btrfs_map_token token; 4688 4689 if (path->slots[0] == 0) { 4690 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 4691 fixup_low_keys(path, &disk_key, 1); 4692 } 4693 btrfs_unlock_up_safe(path, 1); 4694 4695 btrfs_init_map_token(&token); 4696 4697 leaf = path->nodes[0]; 4698 slot = path->slots[0]; 4699 4700 nritems = btrfs_header_nritems(leaf); 4701 data_end = leaf_data_end(fs_info, leaf); 4702 4703 if (btrfs_leaf_free_space(fs_info, leaf) < total_size) { 4704 btrfs_print_leaf(leaf); 4705 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4706 total_size, btrfs_leaf_free_space(fs_info, leaf)); 4707 BUG(); 4708 } 4709 4710 if (slot != nritems) { 4711 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 4712 4713 if (old_data < data_end) { 4714 btrfs_print_leaf(leaf); 4715 btrfs_crit(fs_info, "slot %d old_data %d data_end %d", 4716 slot, old_data, data_end); 4717 BUG_ON(1); 4718 } 4719 /* 4720 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4721 */ 4722 /* first correct the data pointers */ 4723 for (i = slot; i < nritems; i++) { 4724 u32 ioff; 4725 4726 item = btrfs_item_nr(i); 4727 ioff = btrfs_token_item_offset(leaf, item, &token); 4728 btrfs_set_token_item_offset(leaf, item, 4729 ioff - total_data, &token); 4730 } 4731 /* shift the items */ 4732 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 4733 btrfs_item_nr_offset(slot), 4734 (nritems - slot) * sizeof(struct btrfs_item)); 4735 4736 /* shift the data */ 4737 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4738 data_end - total_data, BTRFS_LEAF_DATA_OFFSET + 4739 data_end, old_data - data_end); 4740 data_end = old_data; 4741 } 4742 4743 /* setup the item for the new data */ 4744 for (i = 0; i < nr; i++) { 4745 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); 4746 btrfs_set_item_key(leaf, &disk_key, slot + i); 4747 item = btrfs_item_nr(slot + i); 4748 btrfs_set_token_item_offset(leaf, item, 4749 data_end - data_size[i], &token); 4750 data_end -= data_size[i]; 4751 btrfs_set_token_item_size(leaf, item, data_size[i], &token); 4752 } 4753 4754 btrfs_set_header_nritems(leaf, nritems + nr); 4755 btrfs_mark_buffer_dirty(leaf); 4756 4757 if (btrfs_leaf_free_space(fs_info, leaf) < 0) { 4758 btrfs_print_leaf(leaf); 4759 BUG(); 4760 } 4761 } 4762 4763 /* 4764 * Given a key and some data, insert items into the tree. 4765 * This does all the path init required, making room in the tree if needed. 4766 */ 4767 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4768 struct btrfs_root *root, 4769 struct btrfs_path *path, 4770 const struct btrfs_key *cpu_key, u32 *data_size, 4771 int nr) 4772 { 4773 int ret = 0; 4774 int slot; 4775 int i; 4776 u32 total_size = 0; 4777 u32 total_data = 0; 4778 4779 for (i = 0; i < nr; i++) 4780 total_data += data_size[i]; 4781 4782 total_size = total_data + (nr * sizeof(struct btrfs_item)); 4783 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); 4784 if (ret == 0) 4785 return -EEXIST; 4786 if (ret < 0) 4787 return ret; 4788 4789 slot = path->slots[0]; 4790 BUG_ON(slot < 0); 4791 4792 setup_items_for_insert(root, path, cpu_key, data_size, 4793 total_data, total_size, nr); 4794 return 0; 4795 } 4796 4797 /* 4798 * Given a key and some data, insert an item into the tree. 4799 * This does all the path init required, making room in the tree if needed. 4800 */ 4801 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4802 const struct btrfs_key *cpu_key, void *data, 4803 u32 data_size) 4804 { 4805 int ret = 0; 4806 struct btrfs_path *path; 4807 struct extent_buffer *leaf; 4808 unsigned long ptr; 4809 4810 path = btrfs_alloc_path(); 4811 if (!path) 4812 return -ENOMEM; 4813 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4814 if (!ret) { 4815 leaf = path->nodes[0]; 4816 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4817 write_extent_buffer(leaf, data, ptr, data_size); 4818 btrfs_mark_buffer_dirty(leaf); 4819 } 4820 btrfs_free_path(path); 4821 return ret; 4822 } 4823 4824 /* 4825 * delete the pointer from a given node. 4826 * 4827 * the tree should have been previously balanced so the deletion does not 4828 * empty a node. 
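 *
 * For illustration: deleting slot 2 from a node holding 5 pointers
 * shifts the key/blockptr pairs in slots 3 and 4 down one slot and
 * drops nritems to 4; deleting the last slot needs no move at all.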
4829 */ 4830 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 4831 int level, int slot) 4832 { 4833 struct extent_buffer *parent = path->nodes[level]; 4834 u32 nritems; 4835 int ret; 4836 4837 nritems = btrfs_header_nritems(parent); 4838 if (slot != nritems - 1) { 4839 if (level) { 4840 ret = tree_mod_log_insert_move(parent, slot, slot + 1, 4841 nritems - slot - 1); 4842 BUG_ON(ret < 0); 4843 } 4844 memmove_extent_buffer(parent, 4845 btrfs_node_key_ptr_offset(slot), 4846 btrfs_node_key_ptr_offset(slot + 1), 4847 sizeof(struct btrfs_key_ptr) * 4848 (nritems - slot - 1)); 4849 } else if (level) { 4850 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE, 4851 GFP_NOFS); 4852 BUG_ON(ret < 0); 4853 } 4854 4855 nritems--; 4856 btrfs_set_header_nritems(parent, nritems); 4857 if (nritems == 0 && parent == root->node) { 4858 BUG_ON(btrfs_header_level(root->node) != 1); 4859 /* just turn the root into a leaf and break */ 4860 btrfs_set_header_level(root->node, 0); 4861 } else if (slot == 0) { 4862 struct btrfs_disk_key disk_key; 4863 4864 btrfs_node_key(parent, &disk_key, 0); 4865 fixup_low_keys(path, &disk_key, level + 1); 4866 } 4867 btrfs_mark_buffer_dirty(parent); 4868 } 4869 4870 /* 4871 * a helper function to delete the leaf pointed to by path->slots[1] and 4872 * path->nodes[1]. 4873 * 4874 * This deletes the pointer in path->nodes[1] and frees the leaf 4875 * block extent. zero is returned if it all worked out, < 0 otherwise. 4876 * 4877 * The path must have already been setup for deleting the leaf, including 4878 * all the proper balancing. path->nodes[1] must be locked. 4879 */ 4880 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, 4881 struct btrfs_root *root, 4882 struct btrfs_path *path, 4883 struct extent_buffer *leaf) 4884 { 4885 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4886 del_ptr(root, path, 1, path->slots[1]); 4887 4888 /* 4889 * btrfs_free_extent is expensive, we want to make sure we 4890 * aren't holding any locks when we call it 4891 */ 4892 btrfs_unlock_up_safe(path, 0); 4893 4894 root_sub_used(root, leaf->len); 4895 4896 extent_buffer_get(leaf); 4897 btrfs_free_tree_block(trans, root, leaf, 0, 1); 4898 free_extent_buffer_stale(leaf); 4899 } 4900 /* 4901 * delete the item at the leaf level in path. 
If that empties 4902 * the leaf, remove it from the tree 4903 */ 4904 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4905 struct btrfs_path *path, int slot, int nr) 4906 { 4907 struct btrfs_fs_info *fs_info = root->fs_info; 4908 struct extent_buffer *leaf; 4909 struct btrfs_item *item; 4910 u32 last_off; 4911 u32 dsize = 0; 4912 int ret = 0; 4913 int wret; 4914 int i; 4915 u32 nritems; 4916 struct btrfs_map_token token; 4917 4918 btrfs_init_map_token(&token); 4919 4920 leaf = path->nodes[0]; 4921 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); 4922 4923 for (i = 0; i < nr; i++) 4924 dsize += btrfs_item_size_nr(leaf, slot + i); 4925 4926 nritems = btrfs_header_nritems(leaf); 4927 4928 if (slot + nr != nritems) { 4929 int data_end = leaf_data_end(fs_info, leaf); 4930 4931 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4932 data_end + dsize, 4933 BTRFS_LEAF_DATA_OFFSET + data_end, 4934 last_off - data_end); 4935 4936 for (i = slot + nr; i < nritems; i++) { 4937 u32 ioff; 4938 4939 item = btrfs_item_nr(i); 4940 ioff = btrfs_token_item_offset(leaf, item, &token); 4941 btrfs_set_token_item_offset(leaf, item, 4942 ioff + dsize, &token); 4943 } 4944 4945 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 4946 btrfs_item_nr_offset(slot + nr), 4947 sizeof(struct btrfs_item) * 4948 (nritems - slot - nr)); 4949 } 4950 btrfs_set_header_nritems(leaf, nritems - nr); 4951 nritems -= nr; 4952 4953 /* delete the leaf if we've emptied it */ 4954 if (nritems == 0) { 4955 if (leaf == root->node) { 4956 btrfs_set_header_level(leaf, 0); 4957 } else { 4958 btrfs_set_path_blocking(path); 4959 clean_tree_block(fs_info, leaf); 4960 btrfs_del_leaf(trans, root, path, leaf); 4961 } 4962 } else { 4963 int used = leaf_space_used(leaf, 0, nritems); 4964 if (slot == 0) { 4965 struct btrfs_disk_key disk_key; 4966 4967 btrfs_item_key(leaf, &disk_key, 0); 4968 fixup_low_keys(path, &disk_key, 1); 4969 } 4970 4971 /* delete the leaf if it is mostly empty */ 4972 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4973 /* push_leaf_left fixes the path. 4974 * make sure the path still points to our leaf 4975 * for possible call to del_ptr below 4976 */ 4977 slot = path->slots[1]; 4978 extent_buffer_get(leaf); 4979 4980 btrfs_set_path_blocking(path); 4981 wret = push_leaf_left(trans, root, path, 1, 1, 4982 1, (u32)-1); 4983 if (wret < 0 && wret != -ENOSPC) 4984 ret = wret; 4985 4986 if (path->nodes[0] == leaf && 4987 btrfs_header_nritems(leaf)) { 4988 wret = push_leaf_right(trans, root, path, 1, 4989 1, 1, 0); 4990 if (wret < 0 && wret != -ENOSPC) 4991 ret = wret; 4992 } 4993 4994 if (btrfs_header_nritems(leaf) == 0) { 4995 path->slots[1] = slot; 4996 btrfs_del_leaf(trans, root, path, leaf); 4997 free_extent_buffer(leaf); 4998 ret = 0; 4999 } else { 5000 /* if we're still in the path, make sure 5001 * we're dirty. Otherwise, one of the 5002 * push_leaf functions must have already 5003 * dirtied this buffer 5004 */ 5005 if (path->nodes[0] == leaf) 5006 btrfs_mark_buffer_dirty(leaf); 5007 free_extent_buffer(leaf); 5008 } 5009 } else { 5010 btrfs_mark_buffer_dirty(leaf); 5011 } 5012 } 5013 return ret; 5014 } 5015 5016 /* 5017 * search the tree again to find a leaf with lesser keys 5018 * returns 0 if it found something or 1 if there are no lesser leaves. 5019 * returns < 0 on io errors. 5020 * 5021 * This may release the path, and so you may lose any locks held at the 5022 * time you call it. 
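 *
 * A sketch of the key stepping done below, with made up key values:
 * starting from (256 EXTENT_DATA 4096) we search for
 * (256 EXTENT_DATA 4095); had the offset been 0 we would decrement the
 * type instead and set the offset to (u64)-1, and likewise fall back
 * to decrementing the objectid.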
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
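		 *
		 * E.g. with min_trans == 100 and child pointer generations of
		 * 88, 95 and 120 in slots 0-2, the scan below skips slots 0
		 * and 1 and stops at slot 2 (made up numbers, for
		 * illustration only).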
5130 */ 5131 while (slot < nritems) { 5132 u64 gen; 5133 5134 gen = btrfs_node_ptr_generation(cur, slot); 5135 if (gen < min_trans) { 5136 slot++; 5137 continue; 5138 } 5139 break; 5140 } 5141 find_next_key: 5142 /* 5143 * we didn't find a candidate key in this node, walk forward 5144 * and find another one 5145 */ 5146 if (slot >= nritems) { 5147 path->slots[level] = slot; 5148 btrfs_set_path_blocking(path); 5149 sret = btrfs_find_next_key(root, path, min_key, level, 5150 min_trans); 5151 if (sret == 0) { 5152 btrfs_release_path(path); 5153 goto again; 5154 } else { 5155 goto out; 5156 } 5157 } 5158 /* save our key for returning back */ 5159 btrfs_node_key_to_cpu(cur, &found_key, slot); 5160 path->slots[level] = slot; 5161 if (level == path->lowest_level) { 5162 ret = 0; 5163 goto out; 5164 } 5165 btrfs_set_path_blocking(path); 5166 cur = read_node_slot(fs_info, cur, slot); 5167 if (IS_ERR(cur)) { 5168 ret = PTR_ERR(cur); 5169 goto out; 5170 } 5171 5172 btrfs_tree_read_lock(cur); 5173 5174 path->locks[level - 1] = BTRFS_READ_LOCK; 5175 path->nodes[level - 1] = cur; 5176 unlock_up(path, level, 1, 0, NULL); 5177 } 5178 out: 5179 path->keep_locks = keep_locks; 5180 if (ret == 0) { 5181 btrfs_unlock_up_safe(path, path->lowest_level + 1); 5182 btrfs_set_path_blocking(path); 5183 memcpy(min_key, &found_key, sizeof(found_key)); 5184 } 5185 return ret; 5186 } 5187 5188 static int tree_move_down(struct btrfs_fs_info *fs_info, 5189 struct btrfs_path *path, 5190 int *level) 5191 { 5192 struct extent_buffer *eb; 5193 5194 BUG_ON(*level == 0); 5195 eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]); 5196 if (IS_ERR(eb)) 5197 return PTR_ERR(eb); 5198 5199 path->nodes[*level - 1] = eb; 5200 path->slots[*level - 1] = 0; 5201 (*level)--; 5202 return 0; 5203 } 5204 5205 static int tree_move_next_or_upnext(struct btrfs_path *path, 5206 int *level, int root_level) 5207 { 5208 int ret = 0; 5209 int nritems; 5210 nritems = btrfs_header_nritems(path->nodes[*level]); 5211 5212 path->slots[*level]++; 5213 5214 while (path->slots[*level] >= nritems) { 5215 if (*level == root_level) 5216 return -1; 5217 5218 /* move upnext */ 5219 path->slots[*level] = 0; 5220 free_extent_buffer(path->nodes[*level]); 5221 path->nodes[*level] = NULL; 5222 (*level)++; 5223 path->slots[*level]++; 5224 5225 nritems = btrfs_header_nritems(path->nodes[*level]); 5226 ret = 1; 5227 } 5228 return ret; 5229 } 5230 5231 /* 5232 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 5233 * or down. 
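 *
 * -1 means the walk ran past the root and nothing is left to visit;
 * any other negative value is a read error from moving down.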
5234 */ 5235 static int tree_advance(struct btrfs_fs_info *fs_info, 5236 struct btrfs_path *path, 5237 int *level, int root_level, 5238 int allow_down, 5239 struct btrfs_key *key) 5240 { 5241 int ret; 5242 5243 if (*level == 0 || !allow_down) { 5244 ret = tree_move_next_or_upnext(path, level, root_level); 5245 } else { 5246 ret = tree_move_down(fs_info, path, level); 5247 } 5248 if (ret >= 0) { 5249 if (*level == 0) 5250 btrfs_item_key_to_cpu(path->nodes[*level], key, 5251 path->slots[*level]); 5252 else 5253 btrfs_node_key_to_cpu(path->nodes[*level], key, 5254 path->slots[*level]); 5255 } 5256 return ret; 5257 } 5258 5259 static int tree_compare_item(struct btrfs_path *left_path, 5260 struct btrfs_path *right_path, 5261 char *tmp_buf) 5262 { 5263 int cmp; 5264 int len1, len2; 5265 unsigned long off1, off2; 5266 5267 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); 5268 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); 5269 if (len1 != len2) 5270 return 1; 5271 5272 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 5273 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 5274 right_path->slots[0]); 5275 5276 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 5277 5278 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 5279 if (cmp) 5280 return 1; 5281 return 0; 5282 } 5283 5284 #define ADVANCE 1 5285 #define ADVANCE_ONLY_NEXT -1 5286 5287 /* 5288 * This function compares two trees and calls the provided callback for 5289 * every changed/new/deleted item it finds. 5290 * If shared tree blocks are encountered, whole subtrees are skipped, making 5291 * the compare pretty fast on snapshotted subvolumes. 5292 * 5293 * This currently works on commit roots only. As commit roots are read only, 5294 * we don't do any locking. The commit roots are protected with transactions. 5295 * Transactions are ended and rejoined when a commit is tried in between. 5296 * 5297 * This function checks for modifications done to the trees while comparing. 5298 * If it detects a change, it aborts immediately. 5299 */ 5300 int btrfs_compare_trees(struct btrfs_root *left_root, 5301 struct btrfs_root *right_root, 5302 btrfs_changed_cb_t changed_cb, void *ctx) 5303 { 5304 struct btrfs_fs_info *fs_info = left_root->fs_info; 5305 int ret; 5306 int cmp; 5307 struct btrfs_path *left_path = NULL; 5308 struct btrfs_path *right_path = NULL; 5309 struct btrfs_key left_key; 5310 struct btrfs_key right_key; 5311 char *tmp_buf = NULL; 5312 int left_root_level; 5313 int right_root_level; 5314 int left_level; 5315 int right_level; 5316 int left_end_reached; 5317 int right_end_reached; 5318 int advance_left; 5319 int advance_right; 5320 u64 left_blockptr; 5321 u64 right_blockptr; 5322 u64 left_gen; 5323 u64 right_gen; 5324 5325 left_path = btrfs_alloc_path(); 5326 if (!left_path) { 5327 ret = -ENOMEM; 5328 goto out; 5329 } 5330 right_path = btrfs_alloc_path(); 5331 if (!right_path) { 5332 ret = -ENOMEM; 5333 goto out; 5334 } 5335 5336 tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); 5337 if (!tmp_buf) { 5338 ret = -ENOMEM; 5339 goto out; 5340 } 5341 5342 left_path->search_commit_root = 1; 5343 left_path->skip_locking = 1; 5344 right_path->search_commit_root = 1; 5345 right_path->skip_locking = 1; 5346 5347 /* 5348 * Strategy: Go to the first items of both trees. 
Then do 5349 * 5350 * If both trees are at level 0 5351 * Compare keys of current items 5352 * If left < right treat left item as new, advance left tree 5353 * and repeat 5354 * If left > right treat right item as deleted, advance right tree 5355 * and repeat 5356 * If left == right do deep compare of items, treat as changed if 5357 * needed, advance both trees and repeat 5358 * If both trees are at the same level but not at level 0 5359 * Compare keys of current nodes/leafs 5360 * If left < right advance left tree and repeat 5361 * If left > right advance right tree and repeat 5362 * If left == right compare blockptrs of the next nodes/leafs 5363 * If they match advance both trees but stay at the same level 5364 * and repeat 5365 * If they don't match advance both trees while allowing to go 5366 * deeper and repeat 5367 * If tree levels are different 5368 * Advance the tree that needs it and repeat 5369 * 5370 * Advancing a tree means: 5371 * If we are at level 0, try to go to the next slot. If that's not 5372 * possible, go one level up and repeat. Stop when we found a level 5373 * where we could go to the next slot. We may at this point be on a 5374 * node or a leaf. 5375 * 5376 * If we are not at level 0 and not on shared tree blocks, go one 5377 * level deeper. 5378 * 5379 * If we are not at level 0 and on shared tree blocks, go one slot to 5380 * the right if possible or go up and right. 5381 */ 5382 5383 down_read(&fs_info->commit_root_sem); 5384 left_level = btrfs_header_level(left_root->commit_root); 5385 left_root_level = left_level; 5386 left_path->nodes[left_level] = 5387 btrfs_clone_extent_buffer(left_root->commit_root); 5388 if (!left_path->nodes[left_level]) { 5389 up_read(&fs_info->commit_root_sem); 5390 ret = -ENOMEM; 5391 goto out; 5392 } 5393 extent_buffer_get(left_path->nodes[left_level]); 5394 5395 right_level = btrfs_header_level(right_root->commit_root); 5396 right_root_level = right_level; 5397 right_path->nodes[right_level] = 5398 btrfs_clone_extent_buffer(right_root->commit_root); 5399 if (!right_path->nodes[right_level]) { 5400 up_read(&fs_info->commit_root_sem); 5401 ret = -ENOMEM; 5402 goto out; 5403 } 5404 extent_buffer_get(right_path->nodes[right_level]); 5405 up_read(&fs_info->commit_root_sem); 5406 5407 if (left_level == 0) 5408 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5409 &left_key, left_path->slots[left_level]); 5410 else 5411 btrfs_node_key_to_cpu(left_path->nodes[left_level], 5412 &left_key, left_path->slots[left_level]); 5413 if (right_level == 0) 5414 btrfs_item_key_to_cpu(right_path->nodes[right_level], 5415 &right_key, right_path->slots[right_level]); 5416 else 5417 btrfs_node_key_to_cpu(right_path->nodes[right_level], 5418 &right_key, right_path->slots[right_level]); 5419 5420 left_end_reached = right_end_reached = 0; 5421 advance_left = advance_right = 0; 5422 5423 while (1) { 5424 if (advance_left && !left_end_reached) { 5425 ret = tree_advance(fs_info, left_path, &left_level, 5426 left_root_level, 5427 advance_left != ADVANCE_ONLY_NEXT, 5428 &left_key); 5429 if (ret == -1) 5430 left_end_reached = ADVANCE; 5431 else if (ret < 0) 5432 goto out; 5433 advance_left = 0; 5434 } 5435 if (advance_right && !right_end_reached) { 5436 ret = tree_advance(fs_info, right_path, &right_level, 5437 right_root_level, 5438 advance_right != ADVANCE_ONLY_NEXT, 5439 &right_key); 5440 if (ret == -1) 5441 right_end_reached = ADVANCE; 5442 else if (ret < 0) 5443 goto out; 5444 advance_right = 0; 5445 } 5446 5447 if (left_end_reached && right_end_reached) { 
5448 ret = 0; 5449 goto out; 5450 } else if (left_end_reached) { 5451 if (right_level == 0) { 5452 ret = changed_cb(left_path, right_path, 5453 &right_key, 5454 BTRFS_COMPARE_TREE_DELETED, 5455 ctx); 5456 if (ret < 0) 5457 goto out; 5458 } 5459 advance_right = ADVANCE; 5460 continue; 5461 } else if (right_end_reached) { 5462 if (left_level == 0) { 5463 ret = changed_cb(left_path, right_path, 5464 &left_key, 5465 BTRFS_COMPARE_TREE_NEW, 5466 ctx); 5467 if (ret < 0) 5468 goto out; 5469 } 5470 advance_left = ADVANCE; 5471 continue; 5472 } 5473 5474 if (left_level == 0 && right_level == 0) { 5475 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5476 if (cmp < 0) { 5477 ret = changed_cb(left_path, right_path, 5478 &left_key, 5479 BTRFS_COMPARE_TREE_NEW, 5480 ctx); 5481 if (ret < 0) 5482 goto out; 5483 advance_left = ADVANCE; 5484 } else if (cmp > 0) { 5485 ret = changed_cb(left_path, right_path, 5486 &right_key, 5487 BTRFS_COMPARE_TREE_DELETED, 5488 ctx); 5489 if (ret < 0) 5490 goto out; 5491 advance_right = ADVANCE; 5492 } else { 5493 enum btrfs_compare_tree_result result; 5494 5495 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5496 ret = tree_compare_item(left_path, right_path, 5497 tmp_buf); 5498 if (ret) 5499 result = BTRFS_COMPARE_TREE_CHANGED; 5500 else 5501 result = BTRFS_COMPARE_TREE_SAME; 5502 ret = changed_cb(left_path, right_path, 5503 &left_key, result, ctx); 5504 if (ret < 0) 5505 goto out; 5506 advance_left = ADVANCE; 5507 advance_right = ADVANCE; 5508 } 5509 } else if (left_level == right_level) { 5510 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5511 if (cmp < 0) { 5512 advance_left = ADVANCE; 5513 } else if (cmp > 0) { 5514 advance_right = ADVANCE; 5515 } else { 5516 left_blockptr = btrfs_node_blockptr( 5517 left_path->nodes[left_level], 5518 left_path->slots[left_level]); 5519 right_blockptr = btrfs_node_blockptr( 5520 right_path->nodes[right_level], 5521 right_path->slots[right_level]); 5522 left_gen = btrfs_node_ptr_generation( 5523 left_path->nodes[left_level], 5524 left_path->slots[left_level]); 5525 right_gen = btrfs_node_ptr_generation( 5526 right_path->nodes[right_level], 5527 right_path->slots[right_level]); 5528 if (left_blockptr == right_blockptr && 5529 left_gen == right_gen) { 5530 /* 5531 * As we're on a shared block, don't 5532 * allow to go deeper. 5533 */ 5534 advance_left = ADVANCE_ONLY_NEXT; 5535 advance_right = ADVANCE_ONLY_NEXT; 5536 } else { 5537 advance_left = ADVANCE; 5538 advance_right = ADVANCE; 5539 } 5540 } 5541 } else if (left_level < right_level) { 5542 advance_right = ADVANCE; 5543 } else { 5544 advance_left = ADVANCE; 5545 } 5546 } 5547 5548 out: 5549 btrfs_free_path(left_path); 5550 btrfs_free_path(right_path); 5551 kvfree(tmp_buf); 5552 return ret; 5553 } 5554 5555 /* 5556 * this is similar to btrfs_next_leaf, but does not try to preserve 5557 * and fixup the path. It looks for and returns the next key in the 5558 * tree based on the current path and the min_trans parameters. 5559 * 5560 * 0 is returned if another key is found, < 0 if there are any errors 5561 * and 1 is returned if there are no higher keys in the tree 5562 * 5563 * path->keep_locks should be set to 1 on the search made before 5564 * calling this function. 
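 *
 * A minimal usage sketch (illustrative only, error handling omitted):
 *
 *	path->keep_locks = 1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret >= 0)
 *		ret = btrfs_find_next_key(root, path, &key, 0, min_trans);
 *	path->keep_locks = 0;
 *
 * where a final return value of 0 means 'key' now holds the next key.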
5565 */ 5566 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 5567 struct btrfs_key *key, int level, u64 min_trans) 5568 { 5569 int slot; 5570 struct extent_buffer *c; 5571 5572 WARN_ON(!path->keep_locks); 5573 while (level < BTRFS_MAX_LEVEL) { 5574 if (!path->nodes[level]) 5575 return 1; 5576 5577 slot = path->slots[level] + 1; 5578 c = path->nodes[level]; 5579 next: 5580 if (slot >= btrfs_header_nritems(c)) { 5581 int ret; 5582 int orig_lowest; 5583 struct btrfs_key cur_key; 5584 if (level + 1 >= BTRFS_MAX_LEVEL || 5585 !path->nodes[level + 1]) 5586 return 1; 5587 5588 if (path->locks[level + 1]) { 5589 level++; 5590 continue; 5591 } 5592 5593 slot = btrfs_header_nritems(c) - 1; 5594 if (level == 0) 5595 btrfs_item_key_to_cpu(c, &cur_key, slot); 5596 else 5597 btrfs_node_key_to_cpu(c, &cur_key, slot); 5598 5599 orig_lowest = path->lowest_level; 5600 btrfs_release_path(path); 5601 path->lowest_level = level; 5602 ret = btrfs_search_slot(NULL, root, &cur_key, path, 5603 0, 0); 5604 path->lowest_level = orig_lowest; 5605 if (ret < 0) 5606 return ret; 5607 5608 c = path->nodes[level]; 5609 slot = path->slots[level]; 5610 if (ret == 0) 5611 slot++; 5612 goto next; 5613 } 5614 5615 if (level == 0) 5616 btrfs_item_key_to_cpu(c, key, slot); 5617 else { 5618 u64 gen = btrfs_node_ptr_generation(c, slot); 5619 5620 if (gen < min_trans) { 5621 slot++; 5622 goto next; 5623 } 5624 btrfs_node_key_to_cpu(c, key, slot); 5625 } 5626 return 0; 5627 } 5628 return 1; 5629 } 5630 5631 /* 5632 * search the tree again to find a leaf with greater keys 5633 * returns 0 if it found something or 1 if there are no greater leaves. 5634 * returns < 0 on io errors. 5635 */ 5636 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 5637 { 5638 return btrfs_next_old_leaf(root, path, 0); 5639 } 5640 5641 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5642 u64 time_seq) 5643 { 5644 int slot; 5645 int level; 5646 struct extent_buffer *c; 5647 struct extent_buffer *next; 5648 struct btrfs_key key; 5649 u32 nritems; 5650 int ret; 5651 int old_spinning = path->leave_spinning; 5652 int next_rw_lock = 0; 5653 5654 nritems = btrfs_header_nritems(path->nodes[0]); 5655 if (nritems == 0) 5656 return 1; 5657 5658 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5659 again: 5660 level = 1; 5661 next = NULL; 5662 next_rw_lock = 0; 5663 btrfs_release_path(path); 5664 5665 path->keep_locks = 1; 5666 path->leave_spinning = 1; 5667 5668 if (time_seq) 5669 ret = btrfs_search_old_slot(root, &key, path, time_seq); 5670 else 5671 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5672 path->keep_locks = 0; 5673 5674 if (ret < 0) 5675 return ret; 5676 5677 nritems = btrfs_header_nritems(path->nodes[0]); 5678 /* 5679 * by releasing the path above we dropped all our locks. A balance 5680 * could have added more items next to the key that used to be 5681 * at the very end of the block. So, check again here and 5682 * advance the path if there are now more items available. 5683 */ 5684 if (nritems > 0 && path->slots[0] < nritems - 1) { 5685 if (ret == 0) 5686 path->slots[0]++; 5687 ret = 0; 5688 goto done; 5689 } 5690 /* 5691 * So the above check misses one case: 5692 * - after releasing the path above, someone has removed the item that 5693 * used to be at the very end of the block, and balance between leafs 5694 * gets another one with bigger key.offset to replace it. 5695 * 5696 * This one should be returned as well, or we can get leaf corruption 5697 * later(esp. 
in __btrfs_drop_extents()). 5698 * 5699 * And a bit more explanation about this check, 5700 * with ret > 0, the key isn't found, the path points to the slot 5701 * where it should be inserted, so the path->slots[0] item must be the 5702 * bigger one. 5703 */ 5704 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { 5705 ret = 0; 5706 goto done; 5707 } 5708 5709 while (level < BTRFS_MAX_LEVEL) { 5710 if (!path->nodes[level]) { 5711 ret = 1; 5712 goto done; 5713 } 5714 5715 slot = path->slots[level] + 1; 5716 c = path->nodes[level]; 5717 if (slot >= btrfs_header_nritems(c)) { 5718 level++; 5719 if (level == BTRFS_MAX_LEVEL) { 5720 ret = 1; 5721 goto done; 5722 } 5723 continue; 5724 } 5725 5726 if (next) { 5727 btrfs_tree_unlock_rw(next, next_rw_lock); 5728 free_extent_buffer(next); 5729 } 5730 5731 next = c; 5732 next_rw_lock = path->locks[level]; 5733 ret = read_block_for_search(root, path, &next, level, 5734 slot, &key); 5735 if (ret == -EAGAIN) 5736 goto again; 5737 5738 if (ret < 0) { 5739 btrfs_release_path(path); 5740 goto done; 5741 } 5742 5743 if (!path->skip_locking) { 5744 ret = btrfs_try_tree_read_lock(next); 5745 if (!ret && time_seq) { 5746 /* 5747 * If we don't get the lock, we may be racing 5748 * with push_leaf_left, holding that lock while 5749 * itself waiting for the leaf we've currently 5750 * locked. To solve this situation, we give up 5751 * on our lock and cycle. 5752 */ 5753 free_extent_buffer(next); 5754 btrfs_release_path(path); 5755 cond_resched(); 5756 goto again; 5757 } 5758 if (!ret) { 5759 btrfs_set_path_blocking(path); 5760 btrfs_tree_read_lock(next); 5761 } 5762 next_rw_lock = BTRFS_READ_LOCK; 5763 } 5764 break; 5765 } 5766 path->slots[level] = slot; 5767 while (1) { 5768 level--; 5769 c = path->nodes[level]; 5770 if (path->locks[level]) 5771 btrfs_tree_unlock_rw(c, path->locks[level]); 5772 5773 free_extent_buffer(c); 5774 path->nodes[level] = next; 5775 path->slots[level] = 0; 5776 if (!path->skip_locking) 5777 path->locks[level] = next_rw_lock; 5778 if (!level) 5779 break; 5780 5781 ret = read_block_for_search(root, path, &next, level, 5782 0, &key); 5783 if (ret == -EAGAIN) 5784 goto again; 5785 5786 if (ret < 0) { 5787 btrfs_release_path(path); 5788 goto done; 5789 } 5790 5791 if (!path->skip_locking) { 5792 ret = btrfs_try_tree_read_lock(next); 5793 if (!ret) { 5794 btrfs_set_path_blocking(path); 5795 btrfs_tree_read_lock(next); 5796 } 5797 next_rw_lock = BTRFS_READ_LOCK; 5798 } 5799 } 5800 ret = 0; 5801 done: 5802 unlock_up(path, 0, 1, 0, NULL); 5803 path->leave_spinning = old_spinning; 5804 if (!old_spinning) 5805 btrfs_set_path_blocking(path); 5806 5807 return ret; 5808 } 5809 5810 /* 5811 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 5812 * searching until it gets past min_objectid or finds an item of 'type' 5813 * 5814 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5815 */ 5816 int btrfs_previous_item(struct btrfs_root *root, 5817 struct btrfs_path *path, u64 min_objectid, 5818 int type) 5819 { 5820 struct btrfs_key found_key; 5821 struct extent_buffer *leaf; 5822 u32 nritems; 5823 int ret; 5824 5825 while (1) { 5826 if (path->slots[0] == 0) { 5827 btrfs_set_path_blocking(path); 5828 ret = btrfs_prev_leaf(root, path); 5829 if (ret != 0) 5830 return ret; 5831 } else { 5832 path->slots[0]--; 5833 } 5834 leaf = path->nodes[0]; 5835 nritems = btrfs_header_nritems(leaf); 5836 if (nritems == 0) 5837 return 1; 5838 if (path->slots[0] == nritems) 5839 path->slots[0]--; 5840 5841 
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}

/*
 * search in the extent tree to find a previous Metadata/Data extent item
 * with min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
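
/*
 * A usage sketch for btrfs_previous_extent_item(), illustrative only
 * ('extent_root', 'path', 'key' and 'bytenr' are assumed to be set up
 * by the caller): position the path just past a block number and walk
 * backwards to the extent item covering it, roughly:
 *
 *	key.objectid = bytenr;
 *	key.type = BTRFS_EXTENT_ITEM_KEY;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_extent_item(extent_root, path, bytenr);
 */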