// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}
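/*
 * Illustrative sketch (added by the editor, not part of the original file):
 * the usual path lifecycle as seen from a caller.  btrfs_search_slot() is
 * assumed here only as the typical consumer of a path:
 *
 *	struct btrfs_path *path;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	// ... use path->nodes[0] / path->slots[0] ...
 *	btrfs_free_path(path);	// drops locks and refs via btrfs_release_path()
 */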
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cow-only roots (everything that is not a reference counted cow
 * subvolume) just get put onto a simple dirty list.  transaction.c
 * walks this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set.  So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
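/*
 * Illustrative sketch (added by the editor, not in the original source):
 * how a reader blocks tree mod log pruning while it examines an old tree
 * state.  The elem must start out with seq == 0; putting the seq allows
 * pruning of older log entries again:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *	u64 time_seq;
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... walk an old version of the tree using time_seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */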
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted.  Returns 1 if it can.  Otherwise, it
 * returns zero with the tree_mod_log_lock acquired.  The caller must hold the
 * lock until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}
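/*
 * Sketch of the caller pattern used below (added by the editor; it simply
 * mirrors tree_mod_log_insert_key() and friends): check
 * tree_mod_need_log() locklessly, allocate the log elements, then re-check
 * under the write lock with tree_mod_dont_log() before inserting:
 *
 *	if (!tree_mod_need_log(fs_info, eb))
 *		return 0;
 *	tm = alloc_tree_mod_elem(eb, slot, op, flags);
 *	if (!tm)
 *		return -ENOMEM;
 *	if (tree_mod_dont_log(fs_info, eb)) {	// lock held iff 0 returned
 *		kfree(tm);
 *		return 0;
 *	}
 *	ret = __tree_mod_log_insert(fs_info, tm);
 *	write_unlock(&fs_info->tree_mod_log_lock);
 */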
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item).  any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item).  any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared.  If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}
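/*
 * Illustrative note (added by the editor, not in the original source):
 * for a subvolume whose root_item.last_snapshot is 100, a block written
 * in generation 90 may still be referenced by the snapshot and is treated
 * as potentially shared above, while a block written in generation 101
 * (and not flagged BTRFS_HEADER_FLAG_RELOC) can only be referenced by
 * this tree and is known to be unshared.
 */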
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed on blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
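/*
 * Summary sketch (added by the editor for clarity, not in the original
 * source) of what update_ref_for_cow() above does with the old buffer
 * @buf and the new copy @cow:
 *
 *	refs > 1 (block shared with another tree):
 *		keep @buf alive, add a reference for @cow (full or normal
 *		backref per the rules above), and possibly flip @buf to
 *		FULL_BACKREF on disk;
 *	refs == 1 (we hold the last reference):
 *		move the references over to @cow, clean @buf and report
 *		*last_ref = 1 so the caller frees it.
 */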
static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups.  We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk.  Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all).  this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it.  this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root.  normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb.  then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification.  as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special.  for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root.  in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no longer a root.  we simply ignore that
			 * operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
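/*
 * Added note (by the editor, not in the original source): the rewind loop
 * above applies the inverse of each logged operation, roughly:
 *
 *	MOD_LOG_KEY_REMOVE*	->	put the key/blockptr back, n++
 *	MOD_LOG_KEY_REPLACE	->	restore the old key/blockptr
 *	MOD_LOG_KEY_ADD		->	drop the item again, n--
 *	MOD_LOG_MOVE_KEYS	->	move the key range back
 *	MOD_LOG_ROOT_REPLACE	->	ignored here; roots are handled
 *					explicitly by get_old_root()
 */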
/*
 * Called with eb read locked.  If the buffer cannot be rewound, the same buffer
 * is returned.  If rewind operations happen, a fresh buffer is returned.  The
 * returned buffer is always read-locked.  If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
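/*
 * Illustrative sketch (added by the editor, not in the original source):
 * the rewind machinery here is driven by time-travel searches such as
 * btrfs_search_old_slot().  A typical consumer (e.g. backref walking)
 * looks roughly like this:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	// ... path now reflects the tree as of time_seq ...
 *	btrfs_release_path(path);
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */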
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value.  If there are no changes, the current root->root_node is returned.  If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done.  In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
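/*
 * Worked example (added by the editor for clarity, not in the original
 * source): keys order lexicographically by (objectid, type, offset), so
 *
 *	{ .objectid = 256, .type = BTRFS_INODE_ITEM_KEY, .offset = 0 }
 *
 * sorts before
 *
 *	{ .objectid = 256, .type = BTRFS_INODE_REF_KEY, .offset = 256 }
 *
 * because BTRFS_INODE_ITEM_KEY (1) < BTRFS_INODE_REF_KEY (12);
 * btrfs_comp_cpu_keys() returns -1 for that pair.
 */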
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1, &first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}
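/*
 * Usage note (added by the editor for clarity, not in the original
 * source): btrfs_bin_search() returns 0 with *slot set to the matching
 * index when the key exists, or 1 with *slot set to the insertion
 * position when it does not.  For example, searching a leaf whose keys
 * are {A, C} for B returns 1 with *slot pointing at C's index, which is
 * where B would be inserted.
 */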
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
static noinline struct extent_buffer *
read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
	       int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(fs_info, mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, fs_info, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, fs_info, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			clean_tree_block(fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, fs_info, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, fs_info, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
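/*
 * Added note (by the editor, not in the original source), summarizing the
 * outcomes of balance_level() above: after pushing items between the
 * left, middle and right siblings, an emptied node is deleted from the
 * parent and freed, a lone-pointer root is collapsed by promoting its
 * only child, and the path is re-pointed at whichever buffer now holds
 * the original slot.
 */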
btrfs_tree_unlock(right); 2024 free_extent_buffer(right); 2025 } 2026 if (left) { 2027 if (path->nodes[level] != left) 2028 btrfs_tree_unlock(left); 2029 free_extent_buffer(left); 2030 } 2031 return ret; 2032 } 2033 2034 /* Node balancing for insertion. Here we only split or push nodes around 2035 * when they are completely full. This is also done top down, so we 2036 * have to be pessimistic. 2037 */ 2038 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, 2039 struct btrfs_root *root, 2040 struct btrfs_path *path, int level) 2041 { 2042 struct btrfs_fs_info *fs_info = root->fs_info; 2043 struct extent_buffer *right = NULL; 2044 struct extent_buffer *mid; 2045 struct extent_buffer *left = NULL; 2046 struct extent_buffer *parent = NULL; 2047 int ret = 0; 2048 int wret; 2049 int pslot; 2050 int orig_slot = path->slots[level]; 2051 2052 if (level == 0) 2053 return 1; 2054 2055 mid = path->nodes[level]; 2056 WARN_ON(btrfs_header_generation(mid) != trans->transid); 2057 2058 if (level < BTRFS_MAX_LEVEL - 1) { 2059 parent = path->nodes[level + 1]; 2060 pslot = path->slots[level + 1]; 2061 } 2062 2063 if (!parent) 2064 return 1; 2065 2066 left = read_node_slot(fs_info, parent, pslot - 1); 2067 if (IS_ERR(left)) 2068 left = NULL; 2069 2070 /* first, try to make some room in the middle buffer */ 2071 if (left) { 2072 u32 left_nr; 2073 2074 btrfs_tree_lock(left); 2075 btrfs_set_lock_blocking(left); 2076 2077 left_nr = btrfs_header_nritems(left); 2078 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 2079 wret = 1; 2080 } else { 2081 ret = btrfs_cow_block(trans, root, left, parent, 2082 pslot - 1, &left); 2083 if (ret) 2084 wret = 1; 2085 else { 2086 wret = push_node_left(trans, fs_info, 2087 left, mid, 0); 2088 } 2089 } 2090 if (wret < 0) 2091 ret = wret; 2092 if (wret == 0) { 2093 struct btrfs_disk_key disk_key; 2094 orig_slot += left_nr; 2095 btrfs_node_key(mid, &disk_key, 0); 2096 ret = tree_mod_log_insert_key(parent, pslot, 2097 MOD_LOG_KEY_REPLACE, GFP_NOFS); 2098 BUG_ON(ret < 0); 2099 btrfs_set_node_key(parent, &disk_key, pslot); 2100 btrfs_mark_buffer_dirty(parent); 2101 if (btrfs_header_nritems(left) > orig_slot) { 2102 path->nodes[level] = left; 2103 path->slots[level + 1] -= 1; 2104 path->slots[level] = orig_slot; 2105 btrfs_tree_unlock(mid); 2106 free_extent_buffer(mid); 2107 } else { 2108 orig_slot -= 2109 btrfs_header_nritems(left); 2110 path->slots[level] = orig_slot; 2111 btrfs_tree_unlock(left); 2112 free_extent_buffer(left); 2113 } 2114 return 0; 2115 } 2116 btrfs_tree_unlock(left); 2117 free_extent_buffer(left); 2118 } 2119 right = read_node_slot(fs_info, parent, pslot + 1); 2120 if (IS_ERR(right)) 2121 right = NULL; 2122 2123 /* 2124 * then try to empty the right most buffer into the middle 2125 */ 2126 if (right) { 2127 u32 right_nr; 2128 2129 btrfs_tree_lock(right); 2130 btrfs_set_lock_blocking(right); 2131 2132 right_nr = btrfs_header_nritems(right); 2133 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { 2134 wret = 1; 2135 } else { 2136 ret = btrfs_cow_block(trans, root, right, 2137 parent, pslot + 1, 2138 &right); 2139 if (ret) 2140 wret = 1; 2141 else { 2142 wret = balance_node_right(trans, fs_info, 2143 right, mid); 2144 } 2145 } 2146 if (wret < 0) 2147 ret = wret; 2148 if (wret == 0) { 2149 struct btrfs_disk_key disk_key; 2150 2151 btrfs_node_key(right, &disk_key, 0); 2152 ret = tree_mod_log_insert_key(parent, pslot + 1, 2153 MOD_LOG_KEY_REPLACE, GFP_NOFS); 2154 BUG_ON(ret < 0); 2155 btrfs_set_node_key(parent, &disk_key, pslot + 1); 
2156 btrfs_mark_buffer_dirty(parent); 2157 2158 if (btrfs_header_nritems(mid) <= orig_slot) { 2159 path->nodes[level] = right; 2160 path->slots[level + 1] += 1; 2161 path->slots[level] = orig_slot - 2162 btrfs_header_nritems(mid); 2163 btrfs_tree_unlock(mid); 2164 free_extent_buffer(mid); 2165 } else { 2166 btrfs_tree_unlock(right); 2167 free_extent_buffer(right); 2168 } 2169 return 0; 2170 } 2171 btrfs_tree_unlock(right); 2172 free_extent_buffer(right); 2173 } 2174 return 1; 2175 } 2176 2177 /* 2178 * readahead one full node of leaves, finding things that are close 2179 * to the block in 'slot', and triggering ra on them. 2180 */ 2181 static void reada_for_search(struct btrfs_fs_info *fs_info, 2182 struct btrfs_path *path, 2183 int level, int slot, u64 objectid) 2184 { 2185 struct extent_buffer *node; 2186 struct btrfs_disk_key disk_key; 2187 u32 nritems; 2188 u64 search; 2189 u64 target; 2190 u64 nread = 0; 2191 struct extent_buffer *eb; 2192 u32 nr; 2193 u32 blocksize; 2194 u32 nscan = 0; 2195 2196 if (level != 1) 2197 return; 2198 2199 if (!path->nodes[level]) 2200 return; 2201 2202 node = path->nodes[level]; 2203 2204 search = btrfs_node_blockptr(node, slot); 2205 blocksize = fs_info->nodesize; 2206 eb = find_extent_buffer(fs_info, search); 2207 if (eb) { 2208 free_extent_buffer(eb); 2209 return; 2210 } 2211 2212 target = search; 2213 2214 nritems = btrfs_header_nritems(node); 2215 nr = slot; 2216 2217 while (1) { 2218 if (path->reada == READA_BACK) { 2219 if (nr == 0) 2220 break; 2221 nr--; 2222 } else if (path->reada == READA_FORWARD) { 2223 nr++; 2224 if (nr >= nritems) 2225 break; 2226 } 2227 if (path->reada == READA_BACK && objectid) { 2228 btrfs_node_key(node, &disk_key, nr); 2229 if (btrfs_disk_key_objectid(&disk_key) != objectid) 2230 break; 2231 } 2232 search = btrfs_node_blockptr(node, nr); 2233 if ((search <= target && target - search <= 65536) || 2234 (search > target && search - target <= 65536)) { 2235 readahead_tree_block(fs_info, search); 2236 nread += blocksize; 2237 } 2238 nscan++; 2239 if ((nread > 65536 || nscan > 32)) 2240 break; 2241 } 2242 } 2243 2244 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info, 2245 struct btrfs_path *path, int level) 2246 { 2247 int slot; 2248 int nritems; 2249 struct extent_buffer *parent; 2250 struct extent_buffer *eb; 2251 u64 gen; 2252 u64 block1 = 0; 2253 u64 block2 = 0; 2254 2255 parent = path->nodes[level + 1]; 2256 if (!parent) 2257 return; 2258 2259 nritems = btrfs_header_nritems(parent); 2260 slot = path->slots[level + 1]; 2261 2262 if (slot > 0) { 2263 block1 = btrfs_node_blockptr(parent, slot - 1); 2264 gen = btrfs_node_ptr_generation(parent, slot - 1); 2265 eb = find_extent_buffer(fs_info, block1); 2266 /* 2267 * if we get -eagain from btrfs_buffer_uptodate, we 2268 * don't want to return eagain here. That will loop 2269 * forever 2270 */ 2271 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2272 block1 = 0; 2273 free_extent_buffer(eb); 2274 } 2275 if (slot + 1 < nritems) { 2276 block2 = btrfs_node_blockptr(parent, slot + 1); 2277 gen = btrfs_node_ptr_generation(parent, slot + 1); 2278 eb = find_extent_buffer(fs_info, block2); 2279 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2280 block2 = 0; 2281 free_extent_buffer(eb); 2282 } 2283 2284 if (block1) 2285 readahead_tree_block(fs_info, block1); 2286 if (block2) 2287 readahead_tree_block(fs_info, block2); 2288 } 2289 2290 2291 /* 2292 * when we walk down the tree, it is usually safe to unlock the higher layers 2293 * in the tree. 
The exceptions are when our path goes through slot 0, because 2294 * operations on the tree might require changing key pointers higher up in the 2295 * tree. 2296 * 2297 * callers might also have set path->keep_locks, which tells this code to keep 2298 * the lock if the path points to the last slot in the block. This is part of 2299 * walking through the tree, and selecting the next slot in the higher block. 2300 * 2301 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 2302 * if lowest_unlock is 1, level 0 won't be unlocked 2303 */ 2304 static noinline void unlock_up(struct btrfs_path *path, int level, 2305 int lowest_unlock, int min_write_lock_level, 2306 int *write_lock_level) 2307 { 2308 int i; 2309 int skip_level = level; 2310 int no_skips = 0; 2311 struct extent_buffer *t; 2312 2313 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2314 if (!path->nodes[i]) 2315 break; 2316 if (!path->locks[i]) 2317 break; 2318 if (!no_skips && path->slots[i] == 0) { 2319 skip_level = i + 1; 2320 continue; 2321 } 2322 if (!no_skips && path->keep_locks) { 2323 u32 nritems; 2324 t = path->nodes[i]; 2325 nritems = btrfs_header_nritems(t); 2326 if (nritems < 1 || path->slots[i] >= nritems - 1) { 2327 skip_level = i + 1; 2328 continue; 2329 } 2330 } 2331 if (skip_level < i && i >= lowest_unlock) 2332 no_skips = 1; 2333 2334 t = path->nodes[i]; 2335 if (i >= lowest_unlock && i > skip_level) { 2336 btrfs_tree_unlock_rw(t, path->locks[i]); 2337 path->locks[i] = 0; 2338 if (write_lock_level && 2339 i > min_write_lock_level && 2340 i <= *write_lock_level) { 2341 *write_lock_level = i - 1; 2342 } 2343 } 2344 } 2345 } 2346 2347 /* 2348 * This releases any locks held in the path starting at level and 2349 * going all the way up to the root. 2350 * 2351 * btrfs_search_slot will keep the lock held on higher nodes in a few 2352 * corner cases, such as COW of the block at slot zero in the node. This 2353 * ignores those rules, and it should only be called when there are no 2354 * more updates to be done higher up in the tree. 2355 */ 2356 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) 2357 { 2358 int i; 2359 2360 if (path->keep_locks) 2361 return; 2362 2363 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2364 if (!path->nodes[i]) 2365 continue; 2366 if (!path->locks[i]) 2367 continue; 2368 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 2369 path->locks[i] = 0; 2370 } 2371 } 2372 2373 /* 2374 * helper function for btrfs_search_slot. The goal is to find a block 2375 * in cache without setting the path to blocking. If we find the block 2376 * we return zero and the path is unchanged. 2377 * 2378 * If we can't find the block, we set the path blocking and do some 2379 * reada. -EAGAIN is returned and the search must be repeated. 
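 *
 * Callers must treat -EAGAIN as "the path was released, retry from the
 * top". btrfs_search_slot below does exactly that, in sketch form:
 *
 *	err = read_block_for_search(root, p, &b, level, slot, key);
 *	if (err == -EAGAIN)
 *		goto again;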
2380 */ 2381 static int 2382 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, 2383 struct extent_buffer **eb_ret, int level, int slot, 2384 const struct btrfs_key *key) 2385 { 2386 struct btrfs_fs_info *fs_info = root->fs_info; 2387 u64 blocknr; 2388 u64 gen; 2389 struct extent_buffer *b = *eb_ret; 2390 struct extent_buffer *tmp; 2391 struct btrfs_key first_key; 2392 int ret; 2393 int parent_level; 2394 2395 blocknr = btrfs_node_blockptr(b, slot); 2396 gen = btrfs_node_ptr_generation(b, slot); 2397 parent_level = btrfs_header_level(b); 2398 btrfs_node_key_to_cpu(b, &first_key, slot); 2399 2400 tmp = find_extent_buffer(fs_info, blocknr); 2401 if (tmp) { 2402 /* first we do an atomic uptodate check */ 2403 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2404 *eb_ret = tmp; 2405 return 0; 2406 } 2407 2408 /* the pages were up to date, but we failed 2409 * the generation number check. Do a full 2410 * read for the generation number that is correct. 2411 * We must do this without dropping locks so 2412 * we can trust our generation number 2413 */ 2414 btrfs_set_path_blocking(p); 2415 2416 /* now we're allowed to do a blocking uptodate check */ 2417 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); 2418 if (!ret) { 2419 *eb_ret = tmp; 2420 return 0; 2421 } 2422 free_extent_buffer(tmp); 2423 btrfs_release_path(p); 2424 return -EIO; 2425 } 2426 2427 /* 2428 * reduce lock contention at high levels 2429 * of the btree by dropping locks before 2430 * we read. Don't release the lock on the current 2431 * level because we need to walk this node to figure 2432 * out which blocks to read. 2433 */ 2434 btrfs_unlock_up_safe(p, level + 1); 2435 btrfs_set_path_blocking(p); 2436 2437 if (p->reada != READA_NONE) 2438 reada_for_search(fs_info, p, level, slot, key->objectid); 2439 2440 ret = -EAGAIN; 2441 tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1, 2442 &first_key); 2443 if (!IS_ERR(tmp)) { 2444 /* 2445 * If the read above didn't mark this buffer up to date, 2446 * it will never end up being up to date. Set ret to EIO now 2447 * and give up so that our caller doesn't loop forever 2448 * on our EAGAINs. 2449 */ 2450 if (!extent_buffer_uptodate(tmp)) 2451 ret = -EIO; 2452 free_extent_buffer(tmp); 2453 } else { 2454 ret = PTR_ERR(tmp); 2455 } 2456 2457 btrfs_release_path(p); 2458 return ret; 2459 } 2460 2461 /* 2462 * helper function for btrfs_search_slot. This does all of the checks 2463 * for node-level blocks and does any balancing required based on 2464 * the ins_len. 2465 * 2466 * If no extra work was required, zero is returned. 
If we had to 2467 * drop the path, -EAGAIN is returned and btrfs_search_slot must 2468 * start over 2469 */ 2470 static int 2471 setup_nodes_for_search(struct btrfs_trans_handle *trans, 2472 struct btrfs_root *root, struct btrfs_path *p, 2473 struct extent_buffer *b, int level, int ins_len, 2474 int *write_lock_level) 2475 { 2476 struct btrfs_fs_info *fs_info = root->fs_info; 2477 int ret; 2478 2479 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 2480 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { 2481 int sret; 2482 2483 if (*write_lock_level < level + 1) { 2484 *write_lock_level = level + 1; 2485 btrfs_release_path(p); 2486 goto again; 2487 } 2488 2489 btrfs_set_path_blocking(p); 2490 reada_for_balance(fs_info, p, level); 2491 sret = split_node(trans, root, p, level); 2492 2493 BUG_ON(sret > 0); 2494 if (sret) { 2495 ret = sret; 2496 goto done; 2497 } 2498 b = p->nodes[level]; 2499 } else if (ins_len < 0 && btrfs_header_nritems(b) < 2500 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { 2501 int sret; 2502 2503 if (*write_lock_level < level + 1) { 2504 *write_lock_level = level + 1; 2505 btrfs_release_path(p); 2506 goto again; 2507 } 2508 2509 btrfs_set_path_blocking(p); 2510 reada_for_balance(fs_info, p, level); 2511 sret = balance_level(trans, root, p, level); 2512 2513 if (sret) { 2514 ret = sret; 2515 goto done; 2516 } 2517 b = p->nodes[level]; 2518 if (!b) { 2519 btrfs_release_path(p); 2520 goto again; 2521 } 2522 BUG_ON(btrfs_header_nritems(b) == 1); 2523 } 2524 return 0; 2525 2526 again: 2527 ret = -EAGAIN; 2528 done: 2529 return ret; 2530 } 2531 2532 static void key_search_validate(struct extent_buffer *b, 2533 const struct btrfs_key *key, 2534 int level) 2535 { 2536 #ifdef CONFIG_BTRFS_ASSERT 2537 struct btrfs_disk_key disk_key; 2538 2539 btrfs_cpu_key_to_disk(&disk_key, key); 2540 2541 if (level == 0) 2542 ASSERT(!memcmp_extent_buffer(b, &disk_key, 2543 offsetof(struct btrfs_leaf, items[0].key), 2544 sizeof(disk_key))); 2545 else 2546 ASSERT(!memcmp_extent_buffer(b, &disk_key, 2547 offsetof(struct btrfs_node, ptrs[0].key), 2548 sizeof(disk_key))); 2549 #endif 2550 } 2551 2552 static int key_search(struct extent_buffer *b, const struct btrfs_key *key, 2553 int level, int *prev_cmp, int *slot) 2554 { 2555 if (*prev_cmp != 0) { 2556 *prev_cmp = btrfs_bin_search(b, key, level, slot); 2557 return *prev_cmp; 2558 } 2559 2560 key_search_validate(b, key, level); 2561 *slot = 0; 2562 2563 return 0; 2564 } 2565 2566 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, 2567 u64 iobjectid, u64 ioff, u8 key_type, 2568 struct btrfs_key *found_key) 2569 { 2570 int ret; 2571 struct btrfs_key key; 2572 struct extent_buffer *eb; 2573 2574 ASSERT(path); 2575 ASSERT(found_key); 2576 2577 key.type = key_type; 2578 key.objectid = iobjectid; 2579 key.offset = ioff; 2580 2581 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 2582 if (ret < 0) 2583 return ret; 2584 2585 eb = path->nodes[0]; 2586 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { 2587 ret = btrfs_next_leaf(fs_root, path); 2588 if (ret) 2589 return ret; 2590 eb = path->nodes[0]; 2591 } 2592 2593 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); 2594 if (found_key->type != key.type || 2595 found_key->objectid != key.objectid) 2596 return 1; 2597 2598 return 0; 2599 } 2600 2601 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, 2602 struct btrfs_path *p, 2603 int write_lock_level) 2604 { 2605 struct btrfs_fs_info *fs_info = root->fs_info; 2606 struct extent_buffer 
*b; 2607 int root_lock; 2608 int level = 0; 2609 2610 /* We try very hard to do read locks on the root */ 2611 root_lock = BTRFS_READ_LOCK; 2612 2613 if (p->search_commit_root) { 2614 /* 2615 * The commit roots are read only so we always do read locks, 2616 * and we always must hold the commit_root_sem when doing 2617 * searches on them, the only exception is send where we don't 2618 * want to block transaction commits for a long time, so 2619 * we need to clone the commit root in order to avoid races 2620 * with transaction commits that create a snapshot of one of 2621 * the roots used by a send operation. 2622 */ 2623 if (p->need_commit_sem) { 2624 down_read(&fs_info->commit_root_sem); 2625 b = btrfs_clone_extent_buffer(root->commit_root); 2626 up_read(&fs_info->commit_root_sem); 2627 if (!b) 2628 return ERR_PTR(-ENOMEM); 2629 2630 } else { 2631 b = root->commit_root; 2632 extent_buffer_get(b); 2633 } 2634 level = btrfs_header_level(b); 2635 /* 2636 * Ensure that all callers have set skip_locking when 2637 * p->search_commit_root = 1. 2638 */ 2639 ASSERT(p->skip_locking == 1); 2640 2641 goto out; 2642 } 2643 2644 if (p->skip_locking) { 2645 b = btrfs_root_node(root); 2646 level = btrfs_header_level(b); 2647 goto out; 2648 } 2649 2650 /* 2651 * If the level is set to maximum, we can skip trying to get the read 2652 * lock. 2653 */ 2654 if (write_lock_level < BTRFS_MAX_LEVEL) { 2655 /* 2656 * We don't know the level of the root node until we actually 2657 * have it read locked 2658 */ 2659 b = btrfs_read_lock_root_node(root); 2660 level = btrfs_header_level(b); 2661 if (level > write_lock_level) 2662 goto out; 2663 2664 /* Whoops, must trade for write lock */ 2665 btrfs_tree_read_unlock(b); 2666 free_extent_buffer(b); 2667 } 2668 2669 b = btrfs_lock_root_node(root); 2670 root_lock = BTRFS_WRITE_LOCK; 2671 2672 /* The level might have changed, check again */ 2673 level = btrfs_header_level(b); 2674 2675 out: 2676 p->nodes[level] = b; 2677 if (!p->skip_locking) 2678 p->locks[level] = root_lock; 2679 /* 2680 * Callers are responsible for dropping b's references. 2681 */ 2682 return b; 2683 } 2684 2685 2686 /* 2687 * btrfs_search_slot - look for a key in a tree and perform necessary 2688 * modifications to preserve tree invariants. 2689 * 2690 * @trans: Handle of transaction, used when modifying the tree 2691 * @p: Holds all btree nodes along the search path 2692 * @root: The root node of the tree 2693 * @key: The key we are looking for 2694 * @ins_len: Indicates purpose of search, for inserts it is 1, for 2695 * deletions it's -1. 0 for plain searches 2696 * @cow: boolean should CoW operations be performed. Must always be 1 2697 * when modifying the tree. 2698 * 2699 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree. 
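 * (for an insertion, callers typically pass sizeof(struct btrfs_item)
 * plus the size of the item data as @ins_len, and -1 for a deletion)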
2700  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2701  *
2702  * If @key is found, 0 is returned and you can find the item in the leaf level
2703  * of the path (level 0)
2704  *
2705  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2706  * points to the slot where it should be inserted
2707  *
2708  * If an error is encountered while searching the tree a negative error number
2709  * is returned
2710  */
2711 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2712 		      const struct btrfs_key *key, struct btrfs_path *p,
2713 		      int ins_len, int cow)
2714 {
2715 	struct btrfs_fs_info *fs_info = root->fs_info;
2716 	struct extent_buffer *b;
2717 	int slot;
2718 	int ret;
2719 	int err;
2720 	int level;
2721 	int lowest_unlock = 1;
2722 	/* everything at write_lock_level or lower must be write locked */
2723 	int write_lock_level = 0;
2724 	u8 lowest_level = 0;
2725 	int min_write_lock_level;
2726 	int prev_cmp;
2727
2728 	lowest_level = p->lowest_level;
2729 	WARN_ON(lowest_level && ins_len > 0);
2730 	WARN_ON(p->nodes[0] != NULL);
2731 	BUG_ON(!cow && ins_len);
2732
2733 	if (ins_len < 0) {
2734 		lowest_unlock = 2;
2735
2736 		/* when we are removing items, we might have to go up to level
2737 		 * two as we update tree pointers. Make sure we keep write
2738 		 * locks on those levels as well
2739 		 */
2740 		write_lock_level = 2;
2741 	} else if (ins_len > 0) {
2742 		/*
2743 		 * for inserting items, make sure we have a write lock on
2744 		 * level 1 so we can update keys
2745 		 */
2746 		write_lock_level = 1;
2747 	}
2748
2749 	if (!cow)
2750 		write_lock_level = -1;
2751
2752 	if (cow && (p->keep_locks || p->lowest_level))
2753 		write_lock_level = BTRFS_MAX_LEVEL;
2754
2755 	min_write_lock_level = write_lock_level;
2756
2757 again:
2758 	prev_cmp = -1;
2759 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
2760 	if (IS_ERR(b)) {
2761 		ret = PTR_ERR(b);
2762 		goto done;
2763 	}
2764
2765 	while (b) {
2766 		level = btrfs_header_level(b);
2767
2768 		/*
2769 		 * setup the path here so we can release it under lock
2770 		 * contention with the cow code
2771 		 */
2772 		if (cow) {
2773 			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2774
2775 			/*
2776 			 * if we don't really need to cow this block
2777 			 * then we don't want to set the path blocking,
2778 			 * so we test it here
2779 			 */
2780 			if (!should_cow_block(trans, root, b)) {
2781 				trans->dirty = true;
2782 				goto cow_done;
2783 			}
2784
2785 			/*
2786 			 * must have write locks on this node and the
2787 			 * parent
2788 			 */
2789 			if (level > write_lock_level ||
2790 			    (level + 1 > write_lock_level &&
2791 			     level + 1 < BTRFS_MAX_LEVEL &&
2792 			     p->nodes[level + 1])) {
2793 				write_lock_level = level + 1;
2794 				btrfs_release_path(p);
2795 				goto again;
2796 			}
2797
2798 			btrfs_set_path_blocking(p);
2799 			if (last_level)
2800 				err = btrfs_cow_block(trans, root, b, NULL, 0,
2801 						      &b);
2802 			else
2803 				err = btrfs_cow_block(trans, root, b,
2804 						      p->nodes[level + 1],
2805 						      p->slots[level + 1], &b);
2806 			if (err) {
2807 				ret = err;
2808 				goto done;
2809 			}
2810 		}
2811 cow_done:
2812 		p->nodes[level] = b;
2813 		/*
2814 		 * Leave the path with blocking locks to avoid a massive
2815 		 * number of lock context switches; this is intentional.
2816 		 */
2817
2818 		/*
2819 		 * we have a lock on b and as long as we aren't changing
2820 		 * the tree, there is no way for the items in b to change.
2821 		 * It is safe to drop the lock on our parent before we
2822 		 * go through the expensive btree search on b.
2823 * 2824 * If we're inserting or deleting (ins_len != 0), then we might 2825 * be changing slot zero, which may require changing the parent. 2826 * So, we can't drop the lock until after we know which slot 2827 * we're operating on. 2828 */ 2829 if (!ins_len && !p->keep_locks) { 2830 int u = level + 1; 2831 2832 if (u < BTRFS_MAX_LEVEL && p->locks[u]) { 2833 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); 2834 p->locks[u] = 0; 2835 } 2836 } 2837 2838 ret = key_search(b, key, level, &prev_cmp, &slot); 2839 if (ret < 0) 2840 goto done; 2841 2842 if (level != 0) { 2843 int dec = 0; 2844 if (ret && slot > 0) { 2845 dec = 1; 2846 slot -= 1; 2847 } 2848 p->slots[level] = slot; 2849 err = setup_nodes_for_search(trans, root, p, b, level, 2850 ins_len, &write_lock_level); 2851 if (err == -EAGAIN) 2852 goto again; 2853 if (err) { 2854 ret = err; 2855 goto done; 2856 } 2857 b = p->nodes[level]; 2858 slot = p->slots[level]; 2859 2860 /* 2861 * slot 0 is special, if we change the key 2862 * we have to update the parent pointer 2863 * which means we must have a write lock 2864 * on the parent 2865 */ 2866 if (slot == 0 && ins_len && 2867 write_lock_level < level + 1) { 2868 write_lock_level = level + 1; 2869 btrfs_release_path(p); 2870 goto again; 2871 } 2872 2873 unlock_up(p, level, lowest_unlock, 2874 min_write_lock_level, &write_lock_level); 2875 2876 if (level == lowest_level) { 2877 if (dec) 2878 p->slots[level]++; 2879 goto done; 2880 } 2881 2882 err = read_block_for_search(root, p, &b, level, 2883 slot, key); 2884 if (err == -EAGAIN) 2885 goto again; 2886 if (err) { 2887 ret = err; 2888 goto done; 2889 } 2890 2891 if (!p->skip_locking) { 2892 level = btrfs_header_level(b); 2893 if (level <= write_lock_level) { 2894 err = btrfs_try_tree_write_lock(b); 2895 if (!err) { 2896 btrfs_set_path_blocking(p); 2897 btrfs_tree_lock(b); 2898 } 2899 p->locks[level] = BTRFS_WRITE_LOCK; 2900 } else { 2901 err = btrfs_tree_read_lock_atomic(b); 2902 if (!err) { 2903 btrfs_set_path_blocking(p); 2904 btrfs_tree_read_lock(b); 2905 } 2906 p->locks[level] = BTRFS_READ_LOCK; 2907 } 2908 p->nodes[level] = b; 2909 } 2910 } else { 2911 p->slots[level] = slot; 2912 if (ins_len > 0 && 2913 btrfs_leaf_free_space(fs_info, b) < ins_len) { 2914 if (write_lock_level < 1) { 2915 write_lock_level = 1; 2916 btrfs_release_path(p); 2917 goto again; 2918 } 2919 2920 btrfs_set_path_blocking(p); 2921 err = split_leaf(trans, root, key, 2922 p, ins_len, ret == 0); 2923 2924 BUG_ON(err > 0); 2925 if (err) { 2926 ret = err; 2927 goto done; 2928 } 2929 } 2930 if (!p->search_for_split) 2931 unlock_up(p, level, lowest_unlock, 2932 min_write_lock_level, NULL); 2933 goto done; 2934 } 2935 } 2936 ret = 1; 2937 done: 2938 /* 2939 * we don't really know what they plan on doing with the path 2940 * from here on, so for now just mark it as blocking 2941 */ 2942 if (!p->leave_spinning) 2943 btrfs_set_path_blocking(p); 2944 if (ret < 0 && !p->skip_release_on_error) 2945 btrfs_release_path(p); 2946 return ret; 2947 } 2948 2949 /* 2950 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2951 * current state of the tree together with the operations recorded in the tree 2952 * modification log to search for the key in a previous version of this tree, as 2953 * denoted by the time_seq parameter. 2954 * 2955 * Naturally, there is no support for insert, delete or cow operations. 
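 *
 * For illustration, a read-only lookup at a past point in time (with a
 * sequence number obtained earlier from btrfs_get_tree_mod_seq) is simply:
 *
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);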
2956  *
2957  * The resulting path and return value will be set up as if we called
2958  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2959  */
2960 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2961 			  struct btrfs_path *p, u64 time_seq)
2962 {
2963 	struct btrfs_fs_info *fs_info = root->fs_info;
2964 	struct extent_buffer *b;
2965 	int slot;
2966 	int ret;
2967 	int err;
2968 	int level;
2969 	int lowest_unlock = 1;
2970 	u8 lowest_level = 0;
2971 	int prev_cmp = -1;
2972
2973 	lowest_level = p->lowest_level;
2974 	WARN_ON(p->nodes[0] != NULL);
2975
2976 	if (p->search_commit_root) {
2977 		BUG_ON(time_seq);
2978 		return btrfs_search_slot(NULL, root, key, p, 0, 0);
2979 	}
2980
2981 again:
2982 	b = get_old_root(root, time_seq);
2983 	if (!b) {
2984 		ret = -EIO;
2985 		goto done;
2986 	}
2987 	level = btrfs_header_level(b);
2988 	p->locks[level] = BTRFS_READ_LOCK;
2989
2990 	while (b) {
2991 		level = btrfs_header_level(b);
2992 		p->nodes[level] = b;
2993
2994 		/*
2995 		 * we have a lock on b and as long as we aren't changing
2996 		 * the tree, there is no way for the items in b to change.
2997 		 * It is safe to drop the lock on our parent before we
2998 		 * go through the expensive btree search on b.
2999 		 */
3000 		btrfs_unlock_up_safe(p, level + 1);
3001
3002 		/*
3003 		 * Since we can unwind extent buffers we want to do a real
3004 		 * search every time.
3005 		 */
3006 		prev_cmp = -1;
3007 		ret = key_search(b, key, level, &prev_cmp, &slot);
3008
3009 		if (level != 0) {
3010 			int dec = 0;
3011 			if (ret && slot > 0) {
3012 				dec = 1;
3013 				slot -= 1;
3014 			}
3015 			p->slots[level] = slot;
3016 			unlock_up(p, level, lowest_unlock, 0, NULL);
3017
3018 			if (level == lowest_level) {
3019 				if (dec)
3020 					p->slots[level]++;
3021 				goto done;
3022 			}
3023
3024 			err = read_block_for_search(root, p, &b, level,
3025 						    slot, key);
3026 			if (err == -EAGAIN)
3027 				goto again;
3028 			if (err) {
3029 				ret = err;
3030 				goto done;
3031 			}
3032
3033 			level = btrfs_header_level(b);
3034 			err = btrfs_tree_read_lock_atomic(b);
3035 			if (!err) {
3036 				btrfs_set_path_blocking(p);
3037 				btrfs_tree_read_lock(b);
3038 			}
3039 			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3040 			if (!b) {
3041 				ret = -ENOMEM;
3042 				goto done;
3043 			}
3044 			p->locks[level] = BTRFS_READ_LOCK;
3045 			p->nodes[level] = b;
3046 		} else {
3047 			p->slots[level] = slot;
3048 			unlock_up(p, level, lowest_unlock, 0, NULL);
3049 			goto done;
3050 		}
3051 	}
3052 	ret = 1;
3053 done:
3054 	if (!p->leave_spinning)
3055 		btrfs_set_path_blocking(p);
3056 	if (ret < 0)
3057 		btrfs_release_path(p);
3058
3059 	return ret;
3060 }
3061
3062 /*
3063  * helper to use instead of search slot if no exact match is needed but
3064  * instead the next or previous item should be returned.
3065  * When find_higher is true, the next higher item is returned, the next lower
3066  * otherwise.
3067  * When return_any and find_higher are both true, and no higher item is found,
3068  * return the next lower instead.
3069  * When return_any is true and find_higher is false, and no lower item is found,
3070  * return the next higher instead.
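 *
 * Summarized, when no item exists in the requested direction (sketch):
 *
 *	find_higher	return_any	result on miss
 *	    1		    0		return 1
 *	    1		    1		retry, return next lower item
 *	    0		    0		return 1
 *	    0		    1		retry, return next higher item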
3071  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3072  * < 0 on error
3073  */
3074 int btrfs_search_slot_for_read(struct btrfs_root *root,
3075 			       const struct btrfs_key *key,
3076 			       struct btrfs_path *p, int find_higher,
3077 			       int return_any)
3078 {
3079 	int ret;
3080 	struct extent_buffer *leaf;
3081
3082 again:
3083 	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3084 	if (ret <= 0)
3085 		return ret;
3086 	/*
3087 	 * a return value of 1 means the path is at the position where the
3088 	 * item should be inserted. Normally this is the next bigger item,
3089 	 * but in case the previous item is the last in a leaf, path points
3090 	 * to the first free slot in the previous leaf, i.e. at an invalid
3091 	 * item.
3092 	 */
3093 	leaf = p->nodes[0];
3094
3095 	if (find_higher) {
3096 		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3097 			ret = btrfs_next_leaf(root, p);
3098 			if (ret <= 0)
3099 				return ret;
3100 			if (!return_any)
3101 				return 1;
3102 			/*
3103 			 * no higher item found, return the next
3104 			 * lower instead
3105 			 */
3106 			return_any = 0;
3107 			find_higher = 0;
3108 			btrfs_release_path(p);
3109 			goto again;
3110 		}
3111 	} else {
3112 		if (p->slots[0] == 0) {
3113 			ret = btrfs_prev_leaf(root, p);
3114 			if (ret < 0)
3115 				return ret;
3116 			if (!ret) {
3117 				leaf = p->nodes[0];
3118 				if (p->slots[0] == btrfs_header_nritems(leaf))
3119 					p->slots[0]--;
3120 				return 0;
3121 			}
3122 			if (!return_any)
3123 				return 1;
3124 			/*
3125 			 * no lower item found, return the next
3126 			 * higher instead
3127 			 */
3128 			return_any = 0;
3129 			find_higher = 1;
3130 			btrfs_release_path(p);
3131 			goto again;
3132 		} else {
3133 			--p->slots[0];
3134 		}
3135 	}
3136 	return 0;
3137 }
3138
3139 /*
3140  * adjust the pointers going up the tree, starting at 'level',
3141  * making sure the right key of each node points to 'key'.
3142  * This is used after shifting pointers to the left, so it stops
3143  * fixing up pointers when a given leaf/node is not in slot 0 of the
3144  * higher levels
3145  *
3146  */
3147 static void fixup_low_keys(struct btrfs_path *path,
3148 			   struct btrfs_disk_key *key, int level)
3149 {
3150 	int i;
3151 	struct extent_buffer *t;
3152 	int ret;
3153
3154 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3155 		int tslot = path->slots[i];
3156
3157 		if (!path->nodes[i])
3158 			break;
3159 		t = path->nodes[i];
3160 		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3161 				GFP_ATOMIC);
3162 		BUG_ON(ret < 0);
3163 		btrfs_set_node_key(t, key, tslot);
3164 		btrfs_mark_buffer_dirty(path->nodes[i]);
3165 		if (tslot != 0)
3166 			break;
3167 	}
3168 }
3169
3170 /*
3171  * update item key.
3172  *
3173  * This function isn't completely safe. It's the caller's responsibility to
3174  * ensure that the new key won't break the key ordering; both neighbouring
3175  * keys are BUG_ON-checked below.
3176  */
3177 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3178 			     struct btrfs_path *path,
3179 			     const struct btrfs_key *new_key)
3180 {
3181 	struct btrfs_disk_key disk_key;
3182 	struct extent_buffer *eb;
3183 	int slot;
3184
3185 	eb = path->nodes[0];
3186 	slot = path->slots[0];
3187 	if (slot > 0) {
3188 		btrfs_item_key(eb, &disk_key, slot - 1);
3189 		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3190 	}
3191 	if (slot < btrfs_header_nritems(eb) - 1) {
3192 		btrfs_item_key(eb, &disk_key, slot + 1);
3193 		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3194 	}
3195
3196 	btrfs_cpu_key_to_disk(&disk_key, new_key);
3197 	btrfs_set_item_key(eb, &disk_key, slot);
3198 	btrfs_mark_buffer_dirty(eb);
3199 	if (slot == 0)
3200 		fixup_low_keys(path, &disk_key, 1);
3201 }
3202
3203 /*
3204  * try to push data from one node into the next node left in the
3205  * tree.
3206  *
3207  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3208  * error, and > 0 if there was no room in the left hand block.
3209  */
3210 static int push_node_left(struct btrfs_trans_handle *trans,
3211 			  struct btrfs_fs_info *fs_info,
3212 			  struct extent_buffer *dst,
3213 			  struct extent_buffer *src, int empty)
3214 {
3215 	int push_items = 0;
3216 	int src_nritems;
3217 	int dst_nritems;
3218 	int ret = 0;
3219
3220 	src_nritems = btrfs_header_nritems(src);
3221 	dst_nritems = btrfs_header_nritems(dst);
3222 	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3223 	WARN_ON(btrfs_header_generation(src) != trans->transid);
3224 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3225
3226 	if (!empty && src_nritems <= 8)
3227 		return 1;
3228
3229 	if (push_items <= 0)
3230 		return 1;
3231
3232 	if (empty) {
3233 		push_items = min(src_nritems, push_items);
3234 		if (push_items < src_nritems) {
3235 			/* leave at least 8 pointers in the node if
3236 			 * we aren't going to empty it
3237 			 */
3238 			if (src_nritems - push_items < 8) {
3239 				if (push_items <= 8)
3240 					return 1;
3241 				push_items -= 8;
3242 			}
3243 		}
3244 	} else
3245 		push_items = min(src_nritems - 8, push_items);
3246
3247 	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3248 				   push_items);
3249 	if (ret) {
3250 		btrfs_abort_transaction(trans, ret);
3251 		return ret;
3252 	}
3253 	copy_extent_buffer(dst, src,
3254 			   btrfs_node_key_ptr_offset(dst_nritems),
3255 			   btrfs_node_key_ptr_offset(0),
3256 			   push_items * sizeof(struct btrfs_key_ptr));
3257
3258 	if (push_items < src_nritems) {
3259 		/*
3260 		 * Don't call tree_mod_log_insert_move here, key removal was
3261 		 * already fully logged by tree_mod_log_eb_copy above.
3262 		 */
3263 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3264 				      btrfs_node_key_ptr_offset(push_items),
3265 				      (src_nritems - push_items) *
3266 				      sizeof(struct btrfs_key_ptr));
3267 	}
3268 	btrfs_set_header_nritems(src, src_nritems - push_items);
3269 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3270 	btrfs_mark_buffer_dirty(src);
3271 	btrfs_mark_buffer_dirty(dst);
3272
3273 	return ret;
3274 }
3275
3276 /*
3277  * try to push data from one node into the next node right in the
3278  * tree.
3279  *
3280  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3281  * error, and > 0 if there was no room in the right hand block.
3281 * 3282 * this will only push up to 1/2 the contents of the left node over 3283 */ 3284 static int balance_node_right(struct btrfs_trans_handle *trans, 3285 struct btrfs_fs_info *fs_info, 3286 struct extent_buffer *dst, 3287 struct extent_buffer *src) 3288 { 3289 int push_items = 0; 3290 int max_push; 3291 int src_nritems; 3292 int dst_nritems; 3293 int ret = 0; 3294 3295 WARN_ON(btrfs_header_generation(src) != trans->transid); 3296 WARN_ON(btrfs_header_generation(dst) != trans->transid); 3297 3298 src_nritems = btrfs_header_nritems(src); 3299 dst_nritems = btrfs_header_nritems(dst); 3300 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; 3301 if (push_items <= 0) 3302 return 1; 3303 3304 if (src_nritems < 4) 3305 return 1; 3306 3307 max_push = src_nritems / 2 + 1; 3308 /* don't try to empty the node */ 3309 if (max_push >= src_nritems) 3310 return 1; 3311 3312 if (max_push < push_items) 3313 push_items = max_push; 3314 3315 ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); 3316 BUG_ON(ret < 0); 3317 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 3318 btrfs_node_key_ptr_offset(0), 3319 (dst_nritems) * 3320 sizeof(struct btrfs_key_ptr)); 3321 3322 ret = tree_mod_log_eb_copy(fs_info, dst, src, 0, 3323 src_nritems - push_items, push_items); 3324 if (ret) { 3325 btrfs_abort_transaction(trans, ret); 3326 return ret; 3327 } 3328 copy_extent_buffer(dst, src, 3329 btrfs_node_key_ptr_offset(0), 3330 btrfs_node_key_ptr_offset(src_nritems - push_items), 3331 push_items * sizeof(struct btrfs_key_ptr)); 3332 3333 btrfs_set_header_nritems(src, src_nritems - push_items); 3334 btrfs_set_header_nritems(dst, dst_nritems + push_items); 3335 3336 btrfs_mark_buffer_dirty(src); 3337 btrfs_mark_buffer_dirty(dst); 3338 3339 return ret; 3340 } 3341 3342 /* 3343 * helper function to insert a new root level in the tree. 3344 * A new node is allocated, and a single item is inserted to 3345 * point to the existing root 3346 * 3347 * returns zero on success or < 0 on failure. 
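 *
 * Schematically, growing the tree by one level (sketch):
 *
 *	before:  root->node ---> [ lower ... ]         (level N)
 *	after:   root->node ---> [ key0(lower) ]       (level N + 1)
 *	                               |
 *	                               v
 *	                         [ lower ... ]         (level N)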
3348 */ 3349 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 3350 struct btrfs_root *root, 3351 struct btrfs_path *path, int level) 3352 { 3353 struct btrfs_fs_info *fs_info = root->fs_info; 3354 u64 lower_gen; 3355 struct extent_buffer *lower; 3356 struct extent_buffer *c; 3357 struct extent_buffer *old; 3358 struct btrfs_disk_key lower_key; 3359 int ret; 3360 3361 BUG_ON(path->nodes[level]); 3362 BUG_ON(path->nodes[level-1] != root->node); 3363 3364 lower = path->nodes[level-1]; 3365 if (level == 1) 3366 btrfs_item_key(lower, &lower_key, 0); 3367 else 3368 btrfs_node_key(lower, &lower_key, 0); 3369 3370 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, 3371 root->node->start, 0); 3372 if (IS_ERR(c)) 3373 return PTR_ERR(c); 3374 3375 root_add_used(root, fs_info->nodesize); 3376 3377 btrfs_set_header_nritems(c, 1); 3378 btrfs_set_node_key(c, &lower_key, 0); 3379 btrfs_set_node_blockptr(c, 0, lower->start); 3380 lower_gen = btrfs_header_generation(lower); 3381 WARN_ON(lower_gen != trans->transid); 3382 3383 btrfs_set_node_ptr_generation(c, 0, lower_gen); 3384 3385 btrfs_mark_buffer_dirty(c); 3386 3387 old = root->node; 3388 ret = tree_mod_log_insert_root(root->node, c, 0); 3389 BUG_ON(ret < 0); 3390 rcu_assign_pointer(root->node, c); 3391 3392 /* the super has an extra ref to root->node */ 3393 free_extent_buffer(old); 3394 3395 add_root_to_dirty_list(root); 3396 extent_buffer_get(c); 3397 path->nodes[level] = c; 3398 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 3399 path->slots[level] = 0; 3400 return 0; 3401 } 3402 3403 /* 3404 * worker function to insert a single pointer in a node. 3405 * the node should have enough room for the pointer already 3406 * 3407 * slot and level indicate where you want the key to go, and 3408 * blocknr is the block the key points to. 3409 */ 3410 static void insert_ptr(struct btrfs_trans_handle *trans, 3411 struct btrfs_fs_info *fs_info, struct btrfs_path *path, 3412 struct btrfs_disk_key *key, u64 bytenr, 3413 int slot, int level) 3414 { 3415 struct extent_buffer *lower; 3416 int nritems; 3417 int ret; 3418 3419 BUG_ON(!path->nodes[level]); 3420 btrfs_assert_tree_locked(path->nodes[level]); 3421 lower = path->nodes[level]; 3422 nritems = btrfs_header_nritems(lower); 3423 BUG_ON(slot > nritems); 3424 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 3425 if (slot != nritems) { 3426 if (level) { 3427 ret = tree_mod_log_insert_move(lower, slot + 1, slot, 3428 nritems - slot); 3429 BUG_ON(ret < 0); 3430 } 3431 memmove_extent_buffer(lower, 3432 btrfs_node_key_ptr_offset(slot + 1), 3433 btrfs_node_key_ptr_offset(slot), 3434 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 3435 } 3436 if (level) { 3437 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD, 3438 GFP_NOFS); 3439 BUG_ON(ret < 0); 3440 } 3441 btrfs_set_node_key(lower, key, slot); 3442 btrfs_set_node_blockptr(lower, slot, bytenr); 3443 WARN_ON(trans->transid == 0); 3444 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 3445 btrfs_set_header_nritems(lower, nritems + 1); 3446 btrfs_mark_buffer_dirty(lower); 3447 } 3448 3449 /* 3450 * split the node at the specified level in path in two. 3451 * The path is corrected to point to the appropriate node after the split 3452 * 3453 * Before splitting this tries to make some room in the node by pushing 3454 * left and right, if either one works, it returns right away. 
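 *
 * When it does split, the upper half of the pointers moves to a newly
 * allocated sibling; with mid = (c_nritems + 1) / 2, as computed below:
 *
 *	c:     keeps pointers [0, mid)
 *	split: takes pointers [mid, c_nritems)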
3455  *
3456  * returns 0 on success and < 0 on failure
3457  */
3458 static noinline int split_node(struct btrfs_trans_handle *trans,
3459 			       struct btrfs_root *root,
3460 			       struct btrfs_path *path, int level)
3461 {
3462 	struct btrfs_fs_info *fs_info = root->fs_info;
3463 	struct extent_buffer *c;
3464 	struct extent_buffer *split;
3465 	struct btrfs_disk_key disk_key;
3466 	int mid;
3467 	int ret;
3468 	u32 c_nritems;
3469
3470 	c = path->nodes[level];
3471 	WARN_ON(btrfs_header_generation(c) != trans->transid);
3472 	if (c == root->node) {
3473 		/*
3474 		 * trying to split the root, let's make a new one
3475 		 *
3476 		 * tree mod log: We don't log the removal of the old root in
3477 		 * insert_new_root, because that root buffer will be kept as a
3478 		 * normal node. We are going to log removal of half of the
3479 		 * elements below with tree_mod_log_eb_copy. We're holding a
3480 		 * tree lock on the buffer, which is why we cannot race with
3481 		 * other tree_mod_log users.
3482 		 */
3483 		ret = insert_new_root(trans, root, path, level + 1);
3484 		if (ret)
3485 			return ret;
3486 	} else {
3487 		ret = push_nodes_for_insert(trans, root, path, level);
3488 		c = path->nodes[level];
3489 		if (!ret && btrfs_header_nritems(c) <
3490 		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3491 			return 0;
3492 		if (ret < 0)
3493 			return ret;
3494 	}
3495
3496 	c_nritems = btrfs_header_nritems(c);
3497 	mid = (c_nritems + 1) / 2;
3498 	btrfs_node_key(c, &disk_key, mid);
3499
3500 	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3501 					     c->start, 0);
3502 	if (IS_ERR(split))
3503 		return PTR_ERR(split);
3504
3505 	root_add_used(root, fs_info->nodesize);
3506 	ASSERT(btrfs_header_level(c) == level);
3507
3508 	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3509 	if (ret) {
3510 		btrfs_abort_transaction(trans, ret);
3511 		return ret;
3512 	}
3513 	copy_extent_buffer(split, c,
3514 			   btrfs_node_key_ptr_offset(0),
3515 			   btrfs_node_key_ptr_offset(mid),
3516 			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3517 	btrfs_set_header_nritems(split, c_nritems - mid);
3518 	btrfs_set_header_nritems(c, mid);
3519 	ret = 0;
3520
3521 	btrfs_mark_buffer_dirty(c);
3522 	btrfs_mark_buffer_dirty(split);
3523
3524 	insert_ptr(trans, fs_info, path, &disk_key, split->start,
3525 		   path->slots[level + 1] + 1, level + 1);
3526
3527 	if (path->slots[level] >= mid) {
3528 		path->slots[level] -= mid;
3529 		btrfs_tree_unlock(c);
3530 		free_extent_buffer(c);
3531 		path->nodes[level] = split;
3532 		path->slots[level + 1] += 1;
3533 	} else {
3534 		btrfs_tree_unlock(split);
3535 		free_extent_buffer(split);
3536 	}
3537 	return ret;
3538 }
3539
3540 /*
3541  * how many bytes are required to store the items in a leaf. start
3542  * and nr indicate which items in the leaf to check.
This totals up the 3543 * space used both by the item structs and the item data 3544 */ 3545 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3546 { 3547 struct btrfs_item *start_item; 3548 struct btrfs_item *end_item; 3549 struct btrfs_map_token token; 3550 int data_len; 3551 int nritems = btrfs_header_nritems(l); 3552 int end = min(nritems, start + nr) - 1; 3553 3554 if (!nr) 3555 return 0; 3556 btrfs_init_map_token(&token); 3557 start_item = btrfs_item_nr(start); 3558 end_item = btrfs_item_nr(end); 3559 data_len = btrfs_token_item_offset(l, start_item, &token) + 3560 btrfs_token_item_size(l, start_item, &token); 3561 data_len = data_len - btrfs_token_item_offset(l, end_item, &token); 3562 data_len += sizeof(struct btrfs_item) * nr; 3563 WARN_ON(data_len < 0); 3564 return data_len; 3565 } 3566 3567 /* 3568 * The space between the end of the leaf items and 3569 * the start of the leaf data. IOW, how much room 3570 * the leaf has left for both items and data 3571 */ 3572 noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, 3573 struct extent_buffer *leaf) 3574 { 3575 int nritems = btrfs_header_nritems(leaf); 3576 int ret; 3577 3578 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); 3579 if (ret < 0) { 3580 btrfs_crit(fs_info, 3581 "leaf free space ret %d, leaf data size %lu, used %d nritems %d", 3582 ret, 3583 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), 3584 leaf_space_used(leaf, 0, nritems), nritems); 3585 } 3586 return ret; 3587 } 3588 3589 /* 3590 * min slot controls the lowest index we're willing to push to the 3591 * right. We'll push up to and including min_slot, but no lower 3592 */ 3593 static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, 3594 struct btrfs_path *path, 3595 int data_size, int empty, 3596 struct extent_buffer *right, 3597 int free_space, u32 left_nritems, 3598 u32 min_slot) 3599 { 3600 struct extent_buffer *left = path->nodes[0]; 3601 struct extent_buffer *upper = path->nodes[1]; 3602 struct btrfs_map_token token; 3603 struct btrfs_disk_key disk_key; 3604 int slot; 3605 u32 i; 3606 int push_space = 0; 3607 int push_items = 0; 3608 struct btrfs_item *item; 3609 u32 nr; 3610 u32 right_nritems; 3611 u32 data_end; 3612 u32 this_item_size; 3613 3614 btrfs_init_map_token(&token); 3615 3616 if (empty) 3617 nr = 0; 3618 else 3619 nr = max_t(u32, 1, min_slot); 3620 3621 if (path->slots[0] >= left_nritems) 3622 push_space += data_size; 3623 3624 slot = path->slots[1]; 3625 i = left_nritems - 1; 3626 while (i >= nr) { 3627 item = btrfs_item_nr(i); 3628 3629 if (!empty && push_items > 0) { 3630 if (path->slots[0] > i) 3631 break; 3632 if (path->slots[0] == i) { 3633 int space = btrfs_leaf_free_space(fs_info, left); 3634 if (space + push_space * 2 > free_space) 3635 break; 3636 } 3637 } 3638 3639 if (path->slots[0] == i) 3640 push_space += data_size; 3641 3642 this_item_size = btrfs_item_size(left, item); 3643 if (this_item_size + sizeof(*item) + push_space > free_space) 3644 break; 3645 3646 push_items++; 3647 push_space += this_item_size + sizeof(*item); 3648 if (i == 0) 3649 break; 3650 i--; 3651 } 3652 3653 if (push_items == 0) 3654 goto out_unlock; 3655 3656 WARN_ON(!empty && push_items == left_nritems); 3657 3658 /* push left to right */ 3659 right_nritems = btrfs_header_nritems(right); 3660 3661 push_space = btrfs_item_end_nr(left, left_nritems - push_items); 3662 push_space -= leaf_data_end(fs_info, left); 3663 3664 /* make room in the right data area */ 3665 data_end = leaf_data_end(fs_info, 
right); 3666 memmove_extent_buffer(right, 3667 BTRFS_LEAF_DATA_OFFSET + data_end - push_space, 3668 BTRFS_LEAF_DATA_OFFSET + data_end, 3669 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); 3670 3671 /* copy from the left data area */ 3672 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + 3673 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3674 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left), 3675 push_space); 3676 3677 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 3678 btrfs_item_nr_offset(0), 3679 right_nritems * sizeof(struct btrfs_item)); 3680 3681 /* copy the items from left to right */ 3682 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 3683 btrfs_item_nr_offset(left_nritems - push_items), 3684 push_items * sizeof(struct btrfs_item)); 3685 3686 /* update the item pointers */ 3687 right_nritems += push_items; 3688 btrfs_set_header_nritems(right, right_nritems); 3689 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3690 for (i = 0; i < right_nritems; i++) { 3691 item = btrfs_item_nr(i); 3692 push_space -= btrfs_token_item_size(right, item, &token); 3693 btrfs_set_token_item_offset(right, item, push_space, &token); 3694 } 3695 3696 left_nritems -= push_items; 3697 btrfs_set_header_nritems(left, left_nritems); 3698 3699 if (left_nritems) 3700 btrfs_mark_buffer_dirty(left); 3701 else 3702 clean_tree_block(fs_info, left); 3703 3704 btrfs_mark_buffer_dirty(right); 3705 3706 btrfs_item_key(right, &disk_key, 0); 3707 btrfs_set_node_key(upper, &disk_key, slot + 1); 3708 btrfs_mark_buffer_dirty(upper); 3709 3710 /* then fixup the leaf pointer in the path */ 3711 if (path->slots[0] >= left_nritems) { 3712 path->slots[0] -= left_nritems; 3713 if (btrfs_header_nritems(path->nodes[0]) == 0) 3714 clean_tree_block(fs_info, path->nodes[0]); 3715 btrfs_tree_unlock(path->nodes[0]); 3716 free_extent_buffer(path->nodes[0]); 3717 path->nodes[0] = right; 3718 path->slots[1] += 1; 3719 } else { 3720 btrfs_tree_unlock(right); 3721 free_extent_buffer(right); 3722 } 3723 return 0; 3724 3725 out_unlock: 3726 btrfs_tree_unlock(right); 3727 free_extent_buffer(right); 3728 return 1; 3729 } 3730 3731 /* 3732 * push some data in the path leaf to the right, trying to free up at 3733 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3734 * 3735 * returns 1 if the push failed because the other node didn't have enough 3736 * room, 0 if everything worked out and < 0 if there were major errors. 3737 * 3738 * this will push starting from min_slot to the end of the leaf. It won't 3739 * push any slot lower than min_slot 3740 */ 3741 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3742 *root, struct btrfs_path *path, 3743 int min_data_size, int data_size, 3744 int empty, u32 min_slot) 3745 { 3746 struct btrfs_fs_info *fs_info = root->fs_info; 3747 struct extent_buffer *left = path->nodes[0]; 3748 struct extent_buffer *right; 3749 struct extent_buffer *upper; 3750 int slot; 3751 int free_space; 3752 u32 left_nritems; 3753 int ret; 3754 3755 if (!path->nodes[1]) 3756 return 1; 3757 3758 slot = path->slots[1]; 3759 upper = path->nodes[1]; 3760 if (slot >= btrfs_header_nritems(upper) - 1) 3761 return 1; 3762 3763 btrfs_assert_tree_locked(path->nodes[1]); 3764 3765 right = read_node_slot(fs_info, upper, slot + 1); 3766 /* 3767 * slot + 1 is not valid or we fail to read the right node, 3768 * no big deal, just return. 
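 * (the ERR_PTR from read_node_slot is deliberately not propagated:
 * pushing is only an optimization, and the caller falls back to
 * splitting the leaf instead)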
3769 */ 3770 if (IS_ERR(right)) 3771 return 1; 3772 3773 btrfs_tree_lock(right); 3774 btrfs_set_lock_blocking(right); 3775 3776 free_space = btrfs_leaf_free_space(fs_info, right); 3777 if (free_space < data_size) 3778 goto out_unlock; 3779 3780 /* cow and double check */ 3781 ret = btrfs_cow_block(trans, root, right, upper, 3782 slot + 1, &right); 3783 if (ret) 3784 goto out_unlock; 3785 3786 free_space = btrfs_leaf_free_space(fs_info, right); 3787 if (free_space < data_size) 3788 goto out_unlock; 3789 3790 left_nritems = btrfs_header_nritems(left); 3791 if (left_nritems == 0) 3792 goto out_unlock; 3793 3794 if (path->slots[0] == left_nritems && !empty) { 3795 /* Key greater than all keys in the leaf, right neighbor has 3796 * enough room for it and we're not emptying our leaf to delete 3797 * it, therefore use right neighbor to insert the new item and 3798 * no need to touch/dirty our left leaf. */ 3799 btrfs_tree_unlock(left); 3800 free_extent_buffer(left); 3801 path->nodes[0] = right; 3802 path->slots[0] = 0; 3803 path->slots[1]++; 3804 return 0; 3805 } 3806 3807 return __push_leaf_right(fs_info, path, min_data_size, empty, 3808 right, free_space, left_nritems, min_slot); 3809 out_unlock: 3810 btrfs_tree_unlock(right); 3811 free_extent_buffer(right); 3812 return 1; 3813 } 3814 3815 /* 3816 * push some data in the path leaf to the left, trying to free up at 3817 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3818 * 3819 * max_slot can put a limit on how far into the leaf we'll push items. The 3820 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the 3821 * items 3822 */ 3823 static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, 3824 struct btrfs_path *path, int data_size, 3825 int empty, struct extent_buffer *left, 3826 int free_space, u32 right_nritems, 3827 u32 max_slot) 3828 { 3829 struct btrfs_disk_key disk_key; 3830 struct extent_buffer *right = path->nodes[0]; 3831 int i; 3832 int push_space = 0; 3833 int push_items = 0; 3834 struct btrfs_item *item; 3835 u32 old_left_nritems; 3836 u32 nr; 3837 int ret = 0; 3838 u32 this_item_size; 3839 u32 old_left_item_size; 3840 struct btrfs_map_token token; 3841 3842 btrfs_init_map_token(&token); 3843 3844 if (empty) 3845 nr = min(right_nritems, max_slot); 3846 else 3847 nr = min(right_nritems - 1, max_slot); 3848 3849 for (i = 0; i < nr; i++) { 3850 item = btrfs_item_nr(i); 3851 3852 if (!empty && push_items > 0) { 3853 if (path->slots[0] < i) 3854 break; 3855 if (path->slots[0] == i) { 3856 int space = btrfs_leaf_free_space(fs_info, right); 3857 if (space + push_space * 2 > free_space) 3858 break; 3859 } 3860 } 3861 3862 if (path->slots[0] == i) 3863 push_space += data_size; 3864 3865 this_item_size = btrfs_item_size(right, item); 3866 if (this_item_size + sizeof(*item) + push_space > free_space) 3867 break; 3868 3869 push_items++; 3870 push_space += this_item_size + sizeof(*item); 3871 } 3872 3873 if (push_items == 0) { 3874 ret = 1; 3875 goto out; 3876 } 3877 WARN_ON(!empty && push_items == btrfs_header_nritems(right)); 3878 3879 /* push data from right to left */ 3880 copy_extent_buffer(left, right, 3881 btrfs_item_nr_offset(btrfs_header_nritems(left)), 3882 btrfs_item_nr_offset(0), 3883 push_items * sizeof(struct btrfs_item)); 3884 3885 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - 3886 btrfs_item_offset_nr(right, push_items - 1); 3887 3888 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + 3889 leaf_data_end(fs_info, left) - push_space, 3890 BTRFS_LEAF_DATA_OFFSET + 3891 
btrfs_item_offset_nr(right, push_items - 1), 3892 push_space); 3893 old_left_nritems = btrfs_header_nritems(left); 3894 BUG_ON(old_left_nritems <= 0); 3895 3896 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); 3897 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3898 u32 ioff; 3899 3900 item = btrfs_item_nr(i); 3901 3902 ioff = btrfs_token_item_offset(left, item, &token); 3903 btrfs_set_token_item_offset(left, item, 3904 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size), 3905 &token); 3906 } 3907 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3908 3909 /* fixup right node */ 3910 if (push_items > right_nritems) 3911 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3912 right_nritems); 3913 3914 if (push_items < right_nritems) { 3915 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3916 leaf_data_end(fs_info, right); 3917 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + 3918 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, 3919 BTRFS_LEAF_DATA_OFFSET + 3920 leaf_data_end(fs_info, right), push_space); 3921 3922 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 3923 btrfs_item_nr_offset(push_items), 3924 (btrfs_header_nritems(right) - push_items) * 3925 sizeof(struct btrfs_item)); 3926 } 3927 right_nritems -= push_items; 3928 btrfs_set_header_nritems(right, right_nritems); 3929 push_space = BTRFS_LEAF_DATA_SIZE(fs_info); 3930 for (i = 0; i < right_nritems; i++) { 3931 item = btrfs_item_nr(i); 3932 3933 push_space = push_space - btrfs_token_item_size(right, 3934 item, &token); 3935 btrfs_set_token_item_offset(right, item, push_space, &token); 3936 } 3937 3938 btrfs_mark_buffer_dirty(left); 3939 if (right_nritems) 3940 btrfs_mark_buffer_dirty(right); 3941 else 3942 clean_tree_block(fs_info, right); 3943 3944 btrfs_item_key(right, &disk_key, 0); 3945 fixup_low_keys(path, &disk_key, 1); 3946 3947 /* then fixup the leaf pointer in the path */ 3948 if (path->slots[0] < push_items) { 3949 path->slots[0] += old_left_nritems; 3950 btrfs_tree_unlock(path->nodes[0]); 3951 free_extent_buffer(path->nodes[0]); 3952 path->nodes[0] = left; 3953 path->slots[1] -= 1; 3954 } else { 3955 btrfs_tree_unlock(left); 3956 free_extent_buffer(left); 3957 path->slots[0] -= push_items; 3958 } 3959 BUG_ON(path->slots[0] < 0); 3960 return ret; 3961 out: 3962 btrfs_tree_unlock(left); 3963 free_extent_buffer(left); 3964 return ret; 3965 } 3966 3967 /* 3968 * push some data in the path leaf to the left, trying to free up at 3969 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3970 * 3971 * max_slot can put a limit on how far into the leaf we'll push items. The 3972 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us push all the 3973 * items 3974 */ 3975 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3976 *root, struct btrfs_path *path, int min_data_size, 3977 int data_size, int empty, u32 max_slot) 3978 { 3979 struct btrfs_fs_info *fs_info = root->fs_info; 3980 struct extent_buffer *right = path->nodes[0]; 3981 struct extent_buffer *left; 3982 int slot; 3983 int free_space; 3984 u32 right_nritems; 3985 int ret = 0; 3986 3987 slot = path->slots[1]; 3988 if (slot == 0) 3989 return 1; 3990 if (!path->nodes[1]) 3991 return 1; 3992 3993 right_nritems = btrfs_header_nritems(right); 3994 if (right_nritems == 0) 3995 return 1; 3996 3997 btrfs_assert_tree_locked(path->nodes[1]); 3998 3999 left = read_node_slot(fs_info, path->nodes[1], slot - 1); 4000 /* 4001 * slot - 1 is not valid or we fail to read the left node, 4002 * no big deal, just return. 4003 */ 4004 if (IS_ERR(left)) 4005 return 1; 4006 4007 btrfs_tree_lock(left); 4008 btrfs_set_lock_blocking(left); 4009 4010 free_space = btrfs_leaf_free_space(fs_info, left); 4011 if (free_space < data_size) { 4012 ret = 1; 4013 goto out; 4014 } 4015 4016 /* cow and double check */ 4017 ret = btrfs_cow_block(trans, root, left, 4018 path->nodes[1], slot - 1, &left); 4019 if (ret) { 4020 /* we hit -ENOSPC, but it isn't fatal here */ 4021 if (ret == -ENOSPC) 4022 ret = 1; 4023 goto out; 4024 } 4025 4026 free_space = btrfs_leaf_free_space(fs_info, left); 4027 if (free_space < data_size) { 4028 ret = 1; 4029 goto out; 4030 } 4031 4032 return __push_leaf_left(fs_info, path, min_data_size, 4033 empty, left, free_space, right_nritems, 4034 max_slot); 4035 out: 4036 btrfs_tree_unlock(left); 4037 free_extent_buffer(left); 4038 return ret; 4039 } 4040 4041 /* 4042 * split the path's leaf in two, making sure there is at least data_size 4043 * available for the resulting leaf level of the path. 
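 *
 * For an ordinary insertion, the data_size handed down from
 * btrfs_search_slot typically already includes the item header, i.e.
 * sizeof(struct btrfs_item) plus the payload size; when extending an
 * existing item (extend != 0) it is just the additional payload.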
4044 */ 4045 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 4046 struct btrfs_fs_info *fs_info, 4047 struct btrfs_path *path, 4048 struct extent_buffer *l, 4049 struct extent_buffer *right, 4050 int slot, int mid, int nritems) 4051 { 4052 int data_copy_size; 4053 int rt_data_off; 4054 int i; 4055 struct btrfs_disk_key disk_key; 4056 struct btrfs_map_token token; 4057 4058 btrfs_init_map_token(&token); 4059 4060 nritems = nritems - mid; 4061 btrfs_set_header_nritems(right, nritems); 4062 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l); 4063 4064 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 4065 btrfs_item_nr_offset(mid), 4066 nritems * sizeof(struct btrfs_item)); 4067 4068 copy_extent_buffer(right, l, 4069 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - 4070 data_copy_size, BTRFS_LEAF_DATA_OFFSET + 4071 leaf_data_end(fs_info, l), data_copy_size); 4072 4073 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); 4074 4075 for (i = 0; i < nritems; i++) { 4076 struct btrfs_item *item = btrfs_item_nr(i); 4077 u32 ioff; 4078 4079 ioff = btrfs_token_item_offset(right, item, &token); 4080 btrfs_set_token_item_offset(right, item, 4081 ioff + rt_data_off, &token); 4082 } 4083 4084 btrfs_set_header_nritems(l, mid); 4085 btrfs_item_key(right, &disk_key, 0); 4086 insert_ptr(trans, fs_info, path, &disk_key, right->start, 4087 path->slots[1] + 1, 1); 4088 4089 btrfs_mark_buffer_dirty(right); 4090 btrfs_mark_buffer_dirty(l); 4091 BUG_ON(path->slots[0] != slot); 4092 4093 if (mid <= slot) { 4094 btrfs_tree_unlock(path->nodes[0]); 4095 free_extent_buffer(path->nodes[0]); 4096 path->nodes[0] = right; 4097 path->slots[0] -= mid; 4098 path->slots[1] += 1; 4099 } else { 4100 btrfs_tree_unlock(right); 4101 free_extent_buffer(right); 4102 } 4103 4104 BUG_ON(path->slots[0] < 0); 4105 } 4106 4107 /* 4108 * double splits happen when we need to insert a big item in the middle 4109 * of a leaf. A double split can leave us with 3 mostly empty leaves: 4110 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 4111 * A B C 4112 * 4113 * We avoid this by trying to push the items on either side of our target 4114 * into the adjacent leaves. If all goes well we can avoid the double split 4115 * completely. 4116 */ 4117 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 4118 struct btrfs_root *root, 4119 struct btrfs_path *path, 4120 int data_size) 4121 { 4122 struct btrfs_fs_info *fs_info = root->fs_info; 4123 int ret; 4124 int progress = 0; 4125 int slot; 4126 u32 nritems; 4127 int space_needed = data_size; 4128 4129 slot = path->slots[0]; 4130 if (slot < btrfs_header_nritems(path->nodes[0])) 4131 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); 4132 4133 /* 4134 * try to push all the items after our slot into the 4135 * right leaf 4136 */ 4137 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot); 4138 if (ret < 0) 4139 return ret; 4140 4141 if (ret == 0) 4142 progress++; 4143 4144 nritems = btrfs_header_nritems(path->nodes[0]); 4145 /* 4146 * our goal is to get our slot at the start or end of a leaf. 
If 4147 * we've done so we're done 4148 */ 4149 if (path->slots[0] == 0 || path->slots[0] == nritems) 4150 return 0; 4151 4152 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) 4153 return 0; 4154 4155 /* try to push all the items before our slot into the next leaf */ 4156 slot = path->slots[0]; 4157 space_needed = data_size; 4158 if (slot > 0) 4159 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); 4160 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); 4161 if (ret < 0) 4162 return ret; 4163 4164 if (ret == 0) 4165 progress++; 4166 4167 if (progress) 4168 return 0; 4169 return 1; 4170 } 4171 4172 /* 4173 * split the path's leaf in two, making sure there is at least data_size 4174 * available for the resulting leaf level of the path. 4175 * 4176 * returns 0 if all went well and < 0 on failure. 4177 */ 4178 static noinline int split_leaf(struct btrfs_trans_handle *trans, 4179 struct btrfs_root *root, 4180 const struct btrfs_key *ins_key, 4181 struct btrfs_path *path, int data_size, 4182 int extend) 4183 { 4184 struct btrfs_disk_key disk_key; 4185 struct extent_buffer *l; 4186 u32 nritems; 4187 int mid; 4188 int slot; 4189 struct extent_buffer *right; 4190 struct btrfs_fs_info *fs_info = root->fs_info; 4191 int ret = 0; 4192 int wret; 4193 int split; 4194 int num_doubles = 0; 4195 int tried_avoid_double = 0; 4196 4197 l = path->nodes[0]; 4198 slot = path->slots[0]; 4199 if (extend && data_size + btrfs_item_size_nr(l, slot) + 4200 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) 4201 return -EOVERFLOW; 4202 4203 /* first try to make some room by pushing left and right */ 4204 if (data_size && path->nodes[1]) { 4205 int space_needed = data_size; 4206 4207 if (slot < btrfs_header_nritems(l)) 4208 space_needed -= btrfs_leaf_free_space(fs_info, l); 4209 4210 wret = push_leaf_right(trans, root, path, space_needed, 4211 space_needed, 0, 0); 4212 if (wret < 0) 4213 return wret; 4214 if (wret) { 4215 space_needed = data_size; 4216 if (slot > 0) 4217 space_needed -= btrfs_leaf_free_space(fs_info, 4218 l); 4219 wret = push_leaf_left(trans, root, path, space_needed, 4220 space_needed, 0, (u32)-1); 4221 if (wret < 0) 4222 return wret; 4223 } 4224 l = path->nodes[0]; 4225 4226 /* did the pushes work? 
*/ 4227 if (btrfs_leaf_free_space(fs_info, l) >= data_size) 4228 return 0; 4229 } 4230 4231 if (!path->nodes[1]) { 4232 ret = insert_new_root(trans, root, path, 1); 4233 if (ret) 4234 return ret; 4235 } 4236 again: 4237 split = 1; 4238 l = path->nodes[0]; 4239 slot = path->slots[0]; 4240 nritems = btrfs_header_nritems(l); 4241 mid = (nritems + 1) / 2; 4242 4243 if (mid <= slot) { 4244 if (nritems == 1 || 4245 leaf_space_used(l, mid, nritems - mid) + data_size > 4246 BTRFS_LEAF_DATA_SIZE(fs_info)) { 4247 if (slot >= nritems) { 4248 split = 0; 4249 } else { 4250 mid = slot; 4251 if (mid != nritems && 4252 leaf_space_used(l, mid, nritems - mid) + 4253 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 4254 if (data_size && !tried_avoid_double) 4255 goto push_for_double; 4256 split = 2; 4257 } 4258 } 4259 } 4260 } else { 4261 if (leaf_space_used(l, 0, mid) + data_size > 4262 BTRFS_LEAF_DATA_SIZE(fs_info)) { 4263 if (!extend && data_size && slot == 0) { 4264 split = 0; 4265 } else if ((extend || !data_size) && slot == 0) { 4266 mid = 1; 4267 } else { 4268 mid = slot; 4269 if (mid != nritems && 4270 leaf_space_used(l, mid, nritems - mid) + 4271 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { 4272 if (data_size && !tried_avoid_double) 4273 goto push_for_double; 4274 split = 2; 4275 } 4276 } 4277 } 4278 } 4279 4280 if (split == 0) 4281 btrfs_cpu_key_to_disk(&disk_key, ins_key); 4282 else 4283 btrfs_item_key(l, &disk_key, mid); 4284 4285 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, 4286 l->start, 0); 4287 if (IS_ERR(right)) 4288 return PTR_ERR(right); 4289 4290 root_add_used(root, fs_info->nodesize); 4291 4292 if (split == 0) { 4293 if (mid <= slot) { 4294 btrfs_set_header_nritems(right, 0); 4295 insert_ptr(trans, fs_info, path, &disk_key, 4296 right->start, path->slots[1] + 1, 1); 4297 btrfs_tree_unlock(path->nodes[0]); 4298 free_extent_buffer(path->nodes[0]); 4299 path->nodes[0] = right; 4300 path->slots[0] = 0; 4301 path->slots[1] += 1; 4302 } else { 4303 btrfs_set_header_nritems(right, 0); 4304 insert_ptr(trans, fs_info, path, &disk_key, 4305 right->start, path->slots[1], 1); 4306 btrfs_tree_unlock(path->nodes[0]); 4307 free_extent_buffer(path->nodes[0]); 4308 path->nodes[0] = right; 4309 path->slots[0] = 0; 4310 if (path->slots[1] == 0) 4311 fixup_low_keys(path, &disk_key, 1); 4312 } 4313 /* 4314 * We create a new leaf 'right' for the required ins_len and 4315 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying 4316 * the content of ins_len to 'right'. 
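 * Until then 'right' stays empty (its nritems is set to 0 above), and
 * no existing items are moved out of the original leaf at all.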
4317 */ 4318 return ret; 4319 } 4320 4321 copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems); 4322 4323 if (split == 2) { 4324 BUG_ON(num_doubles != 0); 4325 num_doubles++; 4326 goto again; 4327 } 4328 4329 return 0; 4330 4331 push_for_double: 4332 push_for_double_split(trans, root, path, data_size); 4333 tried_avoid_double = 1; 4334 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) 4335 return 0; 4336 goto again; 4337 } 4338 4339 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 4340 struct btrfs_root *root, 4341 struct btrfs_path *path, int ins_len) 4342 { 4343 struct btrfs_fs_info *fs_info = root->fs_info; 4344 struct btrfs_key key; 4345 struct extent_buffer *leaf; 4346 struct btrfs_file_extent_item *fi; 4347 u64 extent_len = 0; 4348 u32 item_size; 4349 int ret; 4350 4351 leaf = path->nodes[0]; 4352 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4353 4354 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 4355 key.type != BTRFS_EXTENT_CSUM_KEY); 4356 4357 if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len) 4358 return 0; 4359 4360 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 4361 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4362 fi = btrfs_item_ptr(leaf, path->slots[0], 4363 struct btrfs_file_extent_item); 4364 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 4365 } 4366 btrfs_release_path(path); 4367 4368 path->keep_locks = 1; 4369 path->search_for_split = 1; 4370 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4371 path->search_for_split = 0; 4372 if (ret > 0) 4373 ret = -EAGAIN; 4374 if (ret < 0) 4375 goto err; 4376 4377 ret = -EAGAIN; 4378 leaf = path->nodes[0]; 4379 /* if our item isn't there, return now */ 4380 if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) 4381 goto err; 4382 4383 /* the leaf has changed, it now has room. 
return now */
4384 	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4385 		goto err;
4386 
4387 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
4388 		fi = btrfs_item_ptr(leaf, path->slots[0],
4389 				    struct btrfs_file_extent_item);
4390 		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4391 			goto err;
4392 	}
4393 
4394 	btrfs_set_path_blocking(path);
4395 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
4396 	if (ret)
4397 		goto err;
4398 
4399 	path->keep_locks = 0;
4400 	btrfs_unlock_up_safe(path, 1);
4401 	return 0;
4402 err:
4403 	path->keep_locks = 0;
4404 	return ret;
4405 }
4406 
4407 static noinline int split_item(struct btrfs_fs_info *fs_info,
4408 			       struct btrfs_path *path,
4409 			       const struct btrfs_key *new_key,
4410 			       unsigned long split_offset)
4411 {
4412 	struct extent_buffer *leaf;
4413 	struct btrfs_item *item;
4414 	struct btrfs_item *new_item;
4415 	int slot;
4416 	char *buf;
4417 	u32 nritems;
4418 	u32 item_size;
4419 	u32 orig_offset;
4420 	struct btrfs_disk_key disk_key;
4421 
4422 	leaf = path->nodes[0];
4423 	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4424 
4425 	btrfs_set_path_blocking(path);
4426 
4427 	item = btrfs_item_nr(path->slots[0]);
4428 	orig_offset = btrfs_item_offset(leaf, item);
4429 	item_size = btrfs_item_size(leaf, item);
4430 
4431 	buf = kmalloc(item_size, GFP_NOFS);
4432 	if (!buf)
4433 		return -ENOMEM;
4434 
4435 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4436 			   path->slots[0]), item_size);
4437 
4438 	slot = path->slots[0] + 1;
4439 	nritems = btrfs_header_nritems(leaf);
4440 	if (slot != nritems) {
4441 		/* shift the items */
4442 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4443 				btrfs_item_nr_offset(slot),
4444 				(nritems - slot) * sizeof(struct btrfs_item));
4445 	}
4446 
4447 	btrfs_cpu_key_to_disk(&disk_key, new_key);
4448 	btrfs_set_item_key(leaf, &disk_key, slot);
4449 
4450 	new_item = btrfs_item_nr(slot);
4451 
4452 	btrfs_set_item_offset(leaf, new_item, orig_offset);
4453 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4454 
4455 	btrfs_set_item_offset(leaf, item,
4456 			      orig_offset + item_size - split_offset);
4457 	btrfs_set_item_size(leaf, item, split_offset);
4458 
4459 	btrfs_set_header_nritems(leaf, nritems + 1);
4460 
4461 	/* write the data for the start of the original item */
4462 	write_extent_buffer(leaf, buf,
4463 			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4464 			    split_offset);
4465 
4466 	/* write the data for the new item */
4467 	write_extent_buffer(leaf, buf + split_offset,
4468 			    btrfs_item_ptr_offset(leaf, slot),
4469 			    item_size - split_offset);
4470 	btrfs_mark_buffer_dirty(leaf);
4471 
4472 	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4473 	kfree(buf);
4474 	return 0;
4475 }
4476 
4477 /*
4478  * This function splits a single item into two items,
4479  * giving 'new_key' to the new item and splitting the
4480  * old one at split_offset (from the start of the item).
4481  *
4482  * The path may be released by this operation. After
4483  * the split, the path is pointing to the old item. The
4484  * new item is going to be in the same node as the old one.
4485  *
4486  * Note: the item being split must be small enough to live alone on
4487  * a tree block with room for one extra struct btrfs_item.
4488  *
4489  * This allows us to split the item in place, keeping a lock on the
4490  * leaf the entire time.
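 *
 * A minimal sketch of a caller (hypothetical, for illustration only;
 * how the key for the tail half is derived depends on the item type,
 * and 'tail_offset' is a stand-in chosen by the caller):
 *
 *	struct btrfs_key new_key;
 *
 *	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
 *	new_key.offset = tail_offset;
 *	ret = btrfs_split_item(trans, root, path, &new_key, split_offset);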
4491  */
4492 int btrfs_split_item(struct btrfs_trans_handle *trans,
4493 		     struct btrfs_root *root,
4494 		     struct btrfs_path *path,
4495 		     const struct btrfs_key *new_key,
4496 		     unsigned long split_offset)
4497 {
4498 	int ret;
4499 	ret = setup_leaf_for_split(trans, root, path,
4500 				   sizeof(struct btrfs_item));
4501 	if (ret)
4502 		return ret;
4503 
4504 	ret = split_item(root->fs_info, path, new_key, split_offset);
4505 	return ret;
4506 }
4507 
4508 /*
4509  * This function duplicates an item, giving 'new_key' to the new item.
4510  * It guarantees both items live in the same tree leaf and the new item
4511  * is contiguous with the original item.
4512  *
4513  * This allows us to split a file extent in place, keeping a lock on the
4514  * leaf the entire time.
4515  */
4516 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4517 			 struct btrfs_root *root,
4518 			 struct btrfs_path *path,
4519 			 const struct btrfs_key *new_key)
4520 {
4521 	struct extent_buffer *leaf;
4522 	int ret;
4523 	u32 item_size;
4524 
4525 	leaf = path->nodes[0];
4526 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4527 	ret = setup_leaf_for_split(trans, root, path,
4528 				   item_size + sizeof(struct btrfs_item));
4529 	if (ret)
4530 		return ret;
4531 
4532 	path->slots[0]++;
4533 	setup_items_for_insert(root, path, new_key, &item_size,
4534 			       item_size, item_size +
4535 			       sizeof(struct btrfs_item), 1);
4536 	leaf = path->nodes[0];
4537 	memcpy_extent_buffer(leaf,
4538 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
4539 			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4540 			     item_size);
4541 	return 0;
4542 }
4543 
4544 /*
4545  * make the item pointed to by the path smaller. new_size indicates
4546  * how small to make it, and from_end tells us if we just chop bytes
4547  * off the end of the item or if we shift the item to chop bytes off
4548  * the front.
4549  */
4550 void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4551 			 struct btrfs_path *path, u32 new_size, int from_end)
4552 {
4553 	int slot;
4554 	struct extent_buffer *leaf;
4555 	struct btrfs_item *item;
4556 	u32 nritems;
4557 	unsigned int data_end;
4558 	unsigned int old_data_start;
4559 	unsigned int old_size;
4560 	unsigned int size_diff;
4561 	int i;
4562 	struct btrfs_map_token token;
4563 
4564 	btrfs_init_map_token(&token);
4565 
4566 	leaf = path->nodes[0];
4567 	slot = path->slots[0];
4568 
4569 	old_size = btrfs_item_size_nr(leaf, slot);
4570 	if (old_size == new_size)
4571 		return;
4572 
4573 	nritems = btrfs_header_nritems(leaf);
4574 	data_end = leaf_data_end(fs_info, leaf);
4575 
4576 	old_data_start = btrfs_item_offset_nr(leaf, slot);
4577 
4578 	size_diff = old_size - new_size;
4579 
4580 	BUG_ON(slot < 0);
4581 	BUG_ON(slot >= nritems);
4582 
4583 	/*
4584 	 * item0..itemN ... dataN.offset..dataN.size ..
data0.size 4585 */ 4586 /* first correct the data pointers */ 4587 for (i = slot; i < nritems; i++) { 4588 u32 ioff; 4589 item = btrfs_item_nr(i); 4590 4591 ioff = btrfs_token_item_offset(leaf, item, &token); 4592 btrfs_set_token_item_offset(leaf, item, 4593 ioff + size_diff, &token); 4594 } 4595 4596 /* shift the data */ 4597 if (from_end) { 4598 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4599 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 4600 data_end, old_data_start + new_size - data_end); 4601 } else { 4602 struct btrfs_disk_key disk_key; 4603 u64 offset; 4604 4605 btrfs_item_key(leaf, &disk_key, slot); 4606 4607 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4608 unsigned long ptr; 4609 struct btrfs_file_extent_item *fi; 4610 4611 fi = btrfs_item_ptr(leaf, slot, 4612 struct btrfs_file_extent_item); 4613 fi = (struct btrfs_file_extent_item *)( 4614 (unsigned long)fi - size_diff); 4615 4616 if (btrfs_file_extent_type(leaf, fi) == 4617 BTRFS_FILE_EXTENT_INLINE) { 4618 ptr = btrfs_item_ptr_offset(leaf, slot); 4619 memmove_extent_buffer(leaf, ptr, 4620 (unsigned long)fi, 4621 BTRFS_FILE_EXTENT_INLINE_DATA_START); 4622 } 4623 } 4624 4625 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4626 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + 4627 data_end, old_data_start - data_end); 4628 4629 offset = btrfs_disk_key_offset(&disk_key); 4630 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4631 btrfs_set_item_key(leaf, &disk_key, slot); 4632 if (slot == 0) 4633 fixup_low_keys(path, &disk_key, 1); 4634 } 4635 4636 item = btrfs_item_nr(slot); 4637 btrfs_set_item_size(leaf, item, new_size); 4638 btrfs_mark_buffer_dirty(leaf); 4639 4640 if (btrfs_leaf_free_space(fs_info, leaf) < 0) { 4641 btrfs_print_leaf(leaf); 4642 BUG(); 4643 } 4644 } 4645 4646 /* 4647 * make the item pointed to by the path bigger, data_size is the added size. 4648 */ 4649 void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 4650 u32 data_size) 4651 { 4652 int slot; 4653 struct extent_buffer *leaf; 4654 struct btrfs_item *item; 4655 u32 nritems; 4656 unsigned int data_end; 4657 unsigned int old_data; 4658 unsigned int old_size; 4659 int i; 4660 struct btrfs_map_token token; 4661 4662 btrfs_init_map_token(&token); 4663 4664 leaf = path->nodes[0]; 4665 4666 nritems = btrfs_header_nritems(leaf); 4667 data_end = leaf_data_end(fs_info, leaf); 4668 4669 if (btrfs_leaf_free_space(fs_info, leaf) < data_size) { 4670 btrfs_print_leaf(leaf); 4671 BUG(); 4672 } 4673 slot = path->slots[0]; 4674 old_data = btrfs_item_end_nr(leaf, slot); 4675 4676 BUG_ON(slot < 0); 4677 if (slot >= nritems) { 4678 btrfs_print_leaf(leaf); 4679 btrfs_crit(fs_info, "slot %d too large, nritems %d", 4680 slot, nritems); 4681 BUG_ON(1); 4682 } 4683 4684 /* 4685 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4686 */ 4687 /* first correct the data pointers */ 4688 for (i = slot; i < nritems; i++) { 4689 u32 ioff; 4690 item = btrfs_item_nr(i); 4691 4692 ioff = btrfs_token_item_offset(leaf, item, &token); 4693 btrfs_set_token_item_offset(leaf, item, 4694 ioff - data_size, &token); 4695 } 4696 4697 /* shift the data */ 4698 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4699 data_end - data_size, BTRFS_LEAF_DATA_OFFSET + 4700 data_end, old_data - data_end); 4701 4702 data_end = old_data; 4703 old_size = btrfs_item_size_nr(leaf, slot); 4704 item = btrfs_item_nr(slot); 4705 btrfs_set_item_size(leaf, item, old_size + data_size); 4706 btrfs_mark_buffer_dirty(leaf); 4707 4708 if (btrfs_leaf_free_space(fs_info, leaf) < 0) { 4709 btrfs_print_leaf(leaf); 4710 BUG(); 4711 } 4712 } 4713 4714 /* 4715 * this is a helper for btrfs_insert_empty_items, the main goal here is 4716 * to save stack depth by doing the bulk of the work in a function 4717 * that doesn't call btrfs_search_slot 4718 */ 4719 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 4720 const struct btrfs_key *cpu_key, u32 *data_size, 4721 u32 total_data, u32 total_size, int nr) 4722 { 4723 struct btrfs_fs_info *fs_info = root->fs_info; 4724 struct btrfs_item *item; 4725 int i; 4726 u32 nritems; 4727 unsigned int data_end; 4728 struct btrfs_disk_key disk_key; 4729 struct extent_buffer *leaf; 4730 int slot; 4731 struct btrfs_map_token token; 4732 4733 if (path->slots[0] == 0) { 4734 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 4735 fixup_low_keys(path, &disk_key, 1); 4736 } 4737 btrfs_unlock_up_safe(path, 1); 4738 4739 btrfs_init_map_token(&token); 4740 4741 leaf = path->nodes[0]; 4742 slot = path->slots[0]; 4743 4744 nritems = btrfs_header_nritems(leaf); 4745 data_end = leaf_data_end(fs_info, leaf); 4746 4747 if (btrfs_leaf_free_space(fs_info, leaf) < total_size) { 4748 btrfs_print_leaf(leaf); 4749 btrfs_crit(fs_info, "not enough freespace need %u have %d", 4750 total_size, btrfs_leaf_free_space(fs_info, leaf)); 4751 BUG(); 4752 } 4753 4754 if (slot != nritems) { 4755 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 4756 4757 if (old_data < data_end) { 4758 btrfs_print_leaf(leaf); 4759 btrfs_crit(fs_info, "slot %d old_data %d data_end %d", 4760 slot, old_data, data_end); 4761 BUG_ON(1); 4762 } 4763 /* 4764 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4765 */ 4766 /* first correct the data pointers */ 4767 for (i = slot; i < nritems; i++) { 4768 u32 ioff; 4769 4770 item = btrfs_item_nr(i); 4771 ioff = btrfs_token_item_offset(leaf, item, &token); 4772 btrfs_set_token_item_offset(leaf, item, 4773 ioff - total_data, &token); 4774 } 4775 /* shift the items */ 4776 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 4777 btrfs_item_nr_offset(slot), 4778 (nritems - slot) * sizeof(struct btrfs_item)); 4779 4780 /* shift the data */ 4781 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4782 data_end - total_data, BTRFS_LEAF_DATA_OFFSET + 4783 data_end, old_data - data_end); 4784 data_end = old_data; 4785 } 4786 4787 /* setup the item for the new data */ 4788 for (i = 0; i < nr; i++) { 4789 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); 4790 btrfs_set_item_key(leaf, &disk_key, slot + i); 4791 item = btrfs_item_nr(slot + i); 4792 btrfs_set_token_item_offset(leaf, item, 4793 data_end - data_size[i], &token); 4794 data_end -= data_size[i]; 4795 btrfs_set_token_item_size(leaf, item, data_size[i], &token); 4796 } 4797 4798 btrfs_set_header_nritems(leaf, nritems + nr); 4799 btrfs_mark_buffer_dirty(leaf); 4800 4801 if (btrfs_leaf_free_space(fs_info, leaf) < 0) { 4802 btrfs_print_leaf(leaf); 4803 BUG(); 4804 } 4805 } 4806 4807 /* 4808 * Given a key and some data, insert items into the tree. 4809 * This does all the path init required, making room in the tree if needed. 4810 */ 4811 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4812 struct btrfs_root *root, 4813 struct btrfs_path *path, 4814 const struct btrfs_key *cpu_key, u32 *data_size, 4815 int nr) 4816 { 4817 int ret = 0; 4818 int slot; 4819 int i; 4820 u32 total_size = 0; 4821 u32 total_data = 0; 4822 4823 for (i = 0; i < nr; i++) 4824 total_data += data_size[i]; 4825 4826 total_size = total_data + (nr * sizeof(struct btrfs_item)); 4827 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); 4828 if (ret == 0) 4829 return -EEXIST; 4830 if (ret < 0) 4831 return ret; 4832 4833 slot = path->slots[0]; 4834 BUG_ON(slot < 0); 4835 4836 setup_items_for_insert(root, path, cpu_key, data_size, 4837 total_data, total_size, nr); 4838 return 0; 4839 } 4840 4841 /* 4842 * Given a key and some data, insert an item into the tree. 4843 * This does all the path init required, making room in the tree if needed. 4844 */ 4845 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4846 const struct btrfs_key *cpu_key, void *data, 4847 u32 data_size) 4848 { 4849 int ret = 0; 4850 struct btrfs_path *path; 4851 struct extent_buffer *leaf; 4852 unsigned long ptr; 4853 4854 path = btrfs_alloc_path(); 4855 if (!path) 4856 return -ENOMEM; 4857 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4858 if (!ret) { 4859 leaf = path->nodes[0]; 4860 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4861 write_extent_buffer(leaf, data, ptr, data_size); 4862 btrfs_mark_buffer_dirty(leaf); 4863 } 4864 btrfs_free_path(path); 4865 return ret; 4866 } 4867 4868 /* 4869 * delete the pointer from a given node. 4870 * 4871 * the tree should have been previously balanced so the deletion does not 4872 * empty a node. 
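 *
 * One exception is handled here directly: if the deletion empties the
 * root node itself (which must then be at level 1), the root is
 * collapsed into a leaf by dropping its level to zero.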
4873 */ 4874 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 4875 int level, int slot) 4876 { 4877 struct extent_buffer *parent = path->nodes[level]; 4878 u32 nritems; 4879 int ret; 4880 4881 nritems = btrfs_header_nritems(parent); 4882 if (slot != nritems - 1) { 4883 if (level) { 4884 ret = tree_mod_log_insert_move(parent, slot, slot + 1, 4885 nritems - slot - 1); 4886 BUG_ON(ret < 0); 4887 } 4888 memmove_extent_buffer(parent, 4889 btrfs_node_key_ptr_offset(slot), 4890 btrfs_node_key_ptr_offset(slot + 1), 4891 sizeof(struct btrfs_key_ptr) * 4892 (nritems - slot - 1)); 4893 } else if (level) { 4894 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE, 4895 GFP_NOFS); 4896 BUG_ON(ret < 0); 4897 } 4898 4899 nritems--; 4900 btrfs_set_header_nritems(parent, nritems); 4901 if (nritems == 0 && parent == root->node) { 4902 BUG_ON(btrfs_header_level(root->node) != 1); 4903 /* just turn the root into a leaf and break */ 4904 btrfs_set_header_level(root->node, 0); 4905 } else if (slot == 0) { 4906 struct btrfs_disk_key disk_key; 4907 4908 btrfs_node_key(parent, &disk_key, 0); 4909 fixup_low_keys(path, &disk_key, level + 1); 4910 } 4911 btrfs_mark_buffer_dirty(parent); 4912 } 4913 4914 /* 4915 * a helper function to delete the leaf pointed to by path->slots[1] and 4916 * path->nodes[1]. 4917 * 4918 * This deletes the pointer in path->nodes[1] and frees the leaf 4919 * block extent. zero is returned if it all worked out, < 0 otherwise. 4920 * 4921 * The path must have already been setup for deleting the leaf, including 4922 * all the proper balancing. path->nodes[1] must be locked. 4923 */ 4924 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, 4925 struct btrfs_root *root, 4926 struct btrfs_path *path, 4927 struct extent_buffer *leaf) 4928 { 4929 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4930 del_ptr(root, path, 1, path->slots[1]); 4931 4932 /* 4933 * btrfs_free_extent is expensive, we want to make sure we 4934 * aren't holding any locks when we call it 4935 */ 4936 btrfs_unlock_up_safe(path, 0); 4937 4938 root_sub_used(root, leaf->len); 4939 4940 extent_buffer_get(leaf); 4941 btrfs_free_tree_block(trans, root, leaf, 0, 1); 4942 free_extent_buffer_stale(leaf); 4943 } 4944 /* 4945 * delete the item at the leaf level in path. 
If that empties 4946 * the leaf, remove it from the tree 4947 */ 4948 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4949 struct btrfs_path *path, int slot, int nr) 4950 { 4951 struct btrfs_fs_info *fs_info = root->fs_info; 4952 struct extent_buffer *leaf; 4953 struct btrfs_item *item; 4954 u32 last_off; 4955 u32 dsize = 0; 4956 int ret = 0; 4957 int wret; 4958 int i; 4959 u32 nritems; 4960 struct btrfs_map_token token; 4961 4962 btrfs_init_map_token(&token); 4963 4964 leaf = path->nodes[0]; 4965 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); 4966 4967 for (i = 0; i < nr; i++) 4968 dsize += btrfs_item_size_nr(leaf, slot + i); 4969 4970 nritems = btrfs_header_nritems(leaf); 4971 4972 if (slot + nr != nritems) { 4973 int data_end = leaf_data_end(fs_info, leaf); 4974 4975 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4976 data_end + dsize, 4977 BTRFS_LEAF_DATA_OFFSET + data_end, 4978 last_off - data_end); 4979 4980 for (i = slot + nr; i < nritems; i++) { 4981 u32 ioff; 4982 4983 item = btrfs_item_nr(i); 4984 ioff = btrfs_token_item_offset(leaf, item, &token); 4985 btrfs_set_token_item_offset(leaf, item, 4986 ioff + dsize, &token); 4987 } 4988 4989 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 4990 btrfs_item_nr_offset(slot + nr), 4991 sizeof(struct btrfs_item) * 4992 (nritems - slot - nr)); 4993 } 4994 btrfs_set_header_nritems(leaf, nritems - nr); 4995 nritems -= nr; 4996 4997 /* delete the leaf if we've emptied it */ 4998 if (nritems == 0) { 4999 if (leaf == root->node) { 5000 btrfs_set_header_level(leaf, 0); 5001 } else { 5002 btrfs_set_path_blocking(path); 5003 clean_tree_block(fs_info, leaf); 5004 btrfs_del_leaf(trans, root, path, leaf); 5005 } 5006 } else { 5007 int used = leaf_space_used(leaf, 0, nritems); 5008 if (slot == 0) { 5009 struct btrfs_disk_key disk_key; 5010 5011 btrfs_item_key(leaf, &disk_key, 0); 5012 fixup_low_keys(path, &disk_key, 1); 5013 } 5014 5015 /* delete the leaf if it is mostly empty */ 5016 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 5017 /* push_leaf_left fixes the path. 5018 * make sure the path still points to our leaf 5019 * for possible call to del_ptr below 5020 */ 5021 slot = path->slots[1]; 5022 extent_buffer_get(leaf); 5023 5024 btrfs_set_path_blocking(path); 5025 wret = push_leaf_left(trans, root, path, 1, 1, 5026 1, (u32)-1); 5027 if (wret < 0 && wret != -ENOSPC) 5028 ret = wret; 5029 5030 if (path->nodes[0] == leaf && 5031 btrfs_header_nritems(leaf)) { 5032 wret = push_leaf_right(trans, root, path, 1, 5033 1, 1, 0); 5034 if (wret < 0 && wret != -ENOSPC) 5035 ret = wret; 5036 } 5037 5038 if (btrfs_header_nritems(leaf) == 0) { 5039 path->slots[1] = slot; 5040 btrfs_del_leaf(trans, root, path, leaf); 5041 free_extent_buffer(leaf); 5042 ret = 0; 5043 } else { 5044 /* if we're still in the path, make sure 5045 * we're dirty. Otherwise, one of the 5046 * push_leaf functions must have already 5047 * dirtied this buffer 5048 */ 5049 if (path->nodes[0] == leaf) 5050 btrfs_mark_buffer_dirty(leaf); 5051 free_extent_buffer(leaf); 5052 } 5053 } else { 5054 btrfs_mark_buffer_dirty(leaf); 5055 } 5056 } 5057 return ret; 5058 } 5059 5060 /* 5061 * search the tree again to find a leaf with lesser keys 5062 * returns 0 if it found something or 1 if there are no lesser leaves. 5063 * returns < 0 on io errors. 5064 * 5065 * This may release the path, and so you may lose any locks held at the 5066 * time you call it. 
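 *
 * A sketch of the usual pattern (btrfs_previous_item() below is the
 * real thing; 'key' and 'min_objectid' are the caller's):
 *
 *	while (btrfs_prev_leaf(root, path) == 0) {
 *		btrfs_item_key_to_cpu(path->nodes[0], &key,
 *				      path->slots[0]);
 *		if (key.objectid < min_objectid)
 *			break;
 *	}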
5067  */
5068 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5069 {
5070 	struct btrfs_key key;
5071 	struct btrfs_disk_key found_key;
5072 	int ret;
5073 
5074 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5075 
5076 	if (key.offset > 0) {
5077 		key.offset--;
5078 	} else if (key.type > 0) {
5079 		key.type--;
5080 		key.offset = (u64)-1;
5081 	} else if (key.objectid > 0) {
5082 		key.objectid--;
5083 		key.type = (u8)-1;
5084 		key.offset = (u64)-1;
5085 	} else {
5086 		return 1;
5087 	}
5088 
5089 	btrfs_release_path(path);
5090 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5091 	if (ret < 0)
5092 		return ret;
5093 	btrfs_item_key(path->nodes[0], &found_key, 0);
5094 	ret = comp_keys(&found_key, &key);
5095 	/*
5096 	 * We might have had an item with the previous key in the tree right
5097 	 * before we released our path. And after we released our path, that
5098 	 * item might have been pushed to the first slot (0) of the leaf we
5099 	 * were holding due to a tree balance. Alternatively, an item with the
5100 	 * previous key can exist as the only element of a leaf (big fat item).
5101 	 * Therefore account for these 2 cases, so that our callers (like
5102 	 * btrfs_previous_item) don't miss an existing item with a key matching
5103 	 * the previous key we computed above.
5104 	 */
5105 	if (ret <= 0)
5106 		return 0;
5107 	return 1;
5108 }
5109 
5110 /*
5111  * A helper function to walk down the tree starting at min_key, and looking
5112  * for nodes or leaves that have a minimum transaction id.
5113  * This is used by the btree defrag code, and tree logging.
5114  *
5115  * This does not cow, but it does stuff the starting key it finds back
5116  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5117  * key and get a writable path.
5118  *
5119  * This honors path->lowest_level to prevent descent past a given level
5120  * of the tree.
5121  *
5122  * min_trans indicates the oldest transaction that you are interested
5123  * in walking through. Any nodes or leaves older than min_trans are
5124  * skipped over (without reading them).
5125  *
5126  * returns zero if something useful was found, < 0 on error and 1 if there
5127  * was nothing in the tree that matched the search criteria.
5128  */
5129 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5130 			 struct btrfs_path *path,
5131 			 u64 min_trans)
5132 {
5133 	struct btrfs_fs_info *fs_info = root->fs_info;
5134 	struct extent_buffer *cur;
5135 	struct btrfs_key found_key;
5136 	int slot;
5137 	int sret;
5138 	u32 nritems;
5139 	int level;
5140 	int ret = 1;
5141 	int keep_locks = path->keep_locks;
5142 
5143 	path->keep_locks = 1;
5144 again:
5145 	cur = btrfs_read_lock_root_node(root);
5146 	level = btrfs_header_level(cur);
5147 	WARN_ON(path->nodes[level]);
5148 	path->nodes[level] = cur;
5149 	path->locks[level] = BTRFS_READ_LOCK;
5150 
5151 	if (btrfs_header_generation(cur) < min_trans) {
5152 		ret = 1;
5153 		goto out;
5154 	}
5155 	while (1) {
5156 		nritems = btrfs_header_nritems(cur);
5157 		level = btrfs_header_level(cur);
5158 		sret = btrfs_bin_search(cur, min_key, level, &slot);
5159 
5160 		/* at the lowest level, we're done, setup the path and exit */
5161 		if (level == path->lowest_level) {
5162 			if (slot >= nritems)
5163 				goto find_next_key;
5164 			ret = 0;
5165 			path->slots[level] = slot;
5166 			btrfs_item_key_to_cpu(cur, &found_key, slot);
5167 			goto out;
5168 		}
5169 		if (sret && slot > 0)
5170 			slot--;
5171 		/*
5172 		 * check this node pointer against the min_trans parameters.
5173 		 * If it is too old, skip to the next one.
5174 */ 5175 while (slot < nritems) { 5176 u64 gen; 5177 5178 gen = btrfs_node_ptr_generation(cur, slot); 5179 if (gen < min_trans) { 5180 slot++; 5181 continue; 5182 } 5183 break; 5184 } 5185 find_next_key: 5186 /* 5187 * we didn't find a candidate key in this node, walk forward 5188 * and find another one 5189 */ 5190 if (slot >= nritems) { 5191 path->slots[level] = slot; 5192 btrfs_set_path_blocking(path); 5193 sret = btrfs_find_next_key(root, path, min_key, level, 5194 min_trans); 5195 if (sret == 0) { 5196 btrfs_release_path(path); 5197 goto again; 5198 } else { 5199 goto out; 5200 } 5201 } 5202 /* save our key for returning back */ 5203 btrfs_node_key_to_cpu(cur, &found_key, slot); 5204 path->slots[level] = slot; 5205 if (level == path->lowest_level) { 5206 ret = 0; 5207 goto out; 5208 } 5209 btrfs_set_path_blocking(path); 5210 cur = read_node_slot(fs_info, cur, slot); 5211 if (IS_ERR(cur)) { 5212 ret = PTR_ERR(cur); 5213 goto out; 5214 } 5215 5216 btrfs_tree_read_lock(cur); 5217 5218 path->locks[level - 1] = BTRFS_READ_LOCK; 5219 path->nodes[level - 1] = cur; 5220 unlock_up(path, level, 1, 0, NULL); 5221 } 5222 out: 5223 path->keep_locks = keep_locks; 5224 if (ret == 0) { 5225 btrfs_unlock_up_safe(path, path->lowest_level + 1); 5226 btrfs_set_path_blocking(path); 5227 memcpy(min_key, &found_key, sizeof(found_key)); 5228 } 5229 return ret; 5230 } 5231 5232 static int tree_move_down(struct btrfs_fs_info *fs_info, 5233 struct btrfs_path *path, 5234 int *level) 5235 { 5236 struct extent_buffer *eb; 5237 5238 BUG_ON(*level == 0); 5239 eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]); 5240 if (IS_ERR(eb)) 5241 return PTR_ERR(eb); 5242 5243 path->nodes[*level - 1] = eb; 5244 path->slots[*level - 1] = 0; 5245 (*level)--; 5246 return 0; 5247 } 5248 5249 static int tree_move_next_or_upnext(struct btrfs_path *path, 5250 int *level, int root_level) 5251 { 5252 int ret = 0; 5253 int nritems; 5254 nritems = btrfs_header_nritems(path->nodes[*level]); 5255 5256 path->slots[*level]++; 5257 5258 while (path->slots[*level] >= nritems) { 5259 if (*level == root_level) 5260 return -1; 5261 5262 /* move upnext */ 5263 path->slots[*level] = 0; 5264 free_extent_buffer(path->nodes[*level]); 5265 path->nodes[*level] = NULL; 5266 (*level)++; 5267 path->slots[*level]++; 5268 5269 nritems = btrfs_header_nritems(path->nodes[*level]); 5270 ret = 1; 5271 } 5272 return ret; 5273 } 5274 5275 /* 5276 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 5277 * or down. 
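 * -1 means the walk ran off the top of the tree (btrfs_compare_trees()
 * below treats that as end-of-tree); any other negative return is an
 * error from reading a tree block.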
5278 */ 5279 static int tree_advance(struct btrfs_fs_info *fs_info, 5280 struct btrfs_path *path, 5281 int *level, int root_level, 5282 int allow_down, 5283 struct btrfs_key *key) 5284 { 5285 int ret; 5286 5287 if (*level == 0 || !allow_down) { 5288 ret = tree_move_next_or_upnext(path, level, root_level); 5289 } else { 5290 ret = tree_move_down(fs_info, path, level); 5291 } 5292 if (ret >= 0) { 5293 if (*level == 0) 5294 btrfs_item_key_to_cpu(path->nodes[*level], key, 5295 path->slots[*level]); 5296 else 5297 btrfs_node_key_to_cpu(path->nodes[*level], key, 5298 path->slots[*level]); 5299 } 5300 return ret; 5301 } 5302 5303 static int tree_compare_item(struct btrfs_path *left_path, 5304 struct btrfs_path *right_path, 5305 char *tmp_buf) 5306 { 5307 int cmp; 5308 int len1, len2; 5309 unsigned long off1, off2; 5310 5311 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); 5312 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); 5313 if (len1 != len2) 5314 return 1; 5315 5316 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 5317 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 5318 right_path->slots[0]); 5319 5320 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 5321 5322 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 5323 if (cmp) 5324 return 1; 5325 return 0; 5326 } 5327 5328 #define ADVANCE 1 5329 #define ADVANCE_ONLY_NEXT -1 5330 5331 /* 5332 * This function compares two trees and calls the provided callback for 5333 * every changed/new/deleted item it finds. 5334 * If shared tree blocks are encountered, whole subtrees are skipped, making 5335 * the compare pretty fast on snapshotted subvolumes. 5336 * 5337 * This currently works on commit roots only. As commit roots are read only, 5338 * we don't do any locking. The commit roots are protected with transactions. 5339 * Transactions are ended and rejoined when a commit is tried in between. 5340 * 5341 * This function checks for modifications done to the trees while comparing. 5342 * If it detects a change, it aborts immediately. 5343 */ 5344 int btrfs_compare_trees(struct btrfs_root *left_root, 5345 struct btrfs_root *right_root, 5346 btrfs_changed_cb_t changed_cb, void *ctx) 5347 { 5348 struct btrfs_fs_info *fs_info = left_root->fs_info; 5349 int ret; 5350 int cmp; 5351 struct btrfs_path *left_path = NULL; 5352 struct btrfs_path *right_path = NULL; 5353 struct btrfs_key left_key; 5354 struct btrfs_key right_key; 5355 char *tmp_buf = NULL; 5356 int left_root_level; 5357 int right_root_level; 5358 int left_level; 5359 int right_level; 5360 int left_end_reached; 5361 int right_end_reached; 5362 int advance_left; 5363 int advance_right; 5364 u64 left_blockptr; 5365 u64 right_blockptr; 5366 u64 left_gen; 5367 u64 right_gen; 5368 5369 left_path = btrfs_alloc_path(); 5370 if (!left_path) { 5371 ret = -ENOMEM; 5372 goto out; 5373 } 5374 right_path = btrfs_alloc_path(); 5375 if (!right_path) { 5376 ret = -ENOMEM; 5377 goto out; 5378 } 5379 5380 tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); 5381 if (!tmp_buf) { 5382 ret = -ENOMEM; 5383 goto out; 5384 } 5385 5386 left_path->search_commit_root = 1; 5387 left_path->skip_locking = 1; 5388 right_path->search_commit_root = 1; 5389 right_path->skip_locking = 1; 5390 5391 /* 5392 * Strategy: Go to the first items of both trees. 
Then do 5393 * 5394 * If both trees are at level 0 5395 * Compare keys of current items 5396 * If left < right treat left item as new, advance left tree 5397 * and repeat 5398 * If left > right treat right item as deleted, advance right tree 5399 * and repeat 5400 * If left == right do deep compare of items, treat as changed if 5401 * needed, advance both trees and repeat 5402 * If both trees are at the same level but not at level 0 5403 * Compare keys of current nodes/leafs 5404 * If left < right advance left tree and repeat 5405 * If left > right advance right tree and repeat 5406 * If left == right compare blockptrs of the next nodes/leafs 5407 * If they match advance both trees but stay at the same level 5408 * and repeat 5409 * If they don't match advance both trees while allowing to go 5410 * deeper and repeat 5411 * If tree levels are different 5412 * Advance the tree that needs it and repeat 5413 * 5414 * Advancing a tree means: 5415 * If we are at level 0, try to go to the next slot. If that's not 5416 * possible, go one level up and repeat. Stop when we found a level 5417 * where we could go to the next slot. We may at this point be on a 5418 * node or a leaf. 5419 * 5420 * If we are not at level 0 and not on shared tree blocks, go one 5421 * level deeper. 5422 * 5423 * If we are not at level 0 and on shared tree blocks, go one slot to 5424 * the right if possible or go up and right. 5425 */ 5426 5427 down_read(&fs_info->commit_root_sem); 5428 left_level = btrfs_header_level(left_root->commit_root); 5429 left_root_level = left_level; 5430 left_path->nodes[left_level] = 5431 btrfs_clone_extent_buffer(left_root->commit_root); 5432 if (!left_path->nodes[left_level]) { 5433 up_read(&fs_info->commit_root_sem); 5434 ret = -ENOMEM; 5435 goto out; 5436 } 5437 5438 right_level = btrfs_header_level(right_root->commit_root); 5439 right_root_level = right_level; 5440 right_path->nodes[right_level] = 5441 btrfs_clone_extent_buffer(right_root->commit_root); 5442 if (!right_path->nodes[right_level]) { 5443 up_read(&fs_info->commit_root_sem); 5444 ret = -ENOMEM; 5445 goto out; 5446 } 5447 up_read(&fs_info->commit_root_sem); 5448 5449 if (left_level == 0) 5450 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5451 &left_key, left_path->slots[left_level]); 5452 else 5453 btrfs_node_key_to_cpu(left_path->nodes[left_level], 5454 &left_key, left_path->slots[left_level]); 5455 if (right_level == 0) 5456 btrfs_item_key_to_cpu(right_path->nodes[right_level], 5457 &right_key, right_path->slots[right_level]); 5458 else 5459 btrfs_node_key_to_cpu(right_path->nodes[right_level], 5460 &right_key, right_path->slots[right_level]); 5461 5462 left_end_reached = right_end_reached = 0; 5463 advance_left = advance_right = 0; 5464 5465 while (1) { 5466 if (advance_left && !left_end_reached) { 5467 ret = tree_advance(fs_info, left_path, &left_level, 5468 left_root_level, 5469 advance_left != ADVANCE_ONLY_NEXT, 5470 &left_key); 5471 if (ret == -1) 5472 left_end_reached = ADVANCE; 5473 else if (ret < 0) 5474 goto out; 5475 advance_left = 0; 5476 } 5477 if (advance_right && !right_end_reached) { 5478 ret = tree_advance(fs_info, right_path, &right_level, 5479 right_root_level, 5480 advance_right != ADVANCE_ONLY_NEXT, 5481 &right_key); 5482 if (ret == -1) 5483 right_end_reached = ADVANCE; 5484 else if (ret < 0) 5485 goto out; 5486 advance_right = 0; 5487 } 5488 5489 if (left_end_reached && right_end_reached) { 5490 ret = 0; 5491 goto out; 5492 } else if (left_end_reached) { 5493 if (right_level == 0) { 5494 ret = 
changed_cb(left_path, right_path, 5495 &right_key, 5496 BTRFS_COMPARE_TREE_DELETED, 5497 ctx); 5498 if (ret < 0) 5499 goto out; 5500 } 5501 advance_right = ADVANCE; 5502 continue; 5503 } else if (right_end_reached) { 5504 if (left_level == 0) { 5505 ret = changed_cb(left_path, right_path, 5506 &left_key, 5507 BTRFS_COMPARE_TREE_NEW, 5508 ctx); 5509 if (ret < 0) 5510 goto out; 5511 } 5512 advance_left = ADVANCE; 5513 continue; 5514 } 5515 5516 if (left_level == 0 && right_level == 0) { 5517 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5518 if (cmp < 0) { 5519 ret = changed_cb(left_path, right_path, 5520 &left_key, 5521 BTRFS_COMPARE_TREE_NEW, 5522 ctx); 5523 if (ret < 0) 5524 goto out; 5525 advance_left = ADVANCE; 5526 } else if (cmp > 0) { 5527 ret = changed_cb(left_path, right_path, 5528 &right_key, 5529 BTRFS_COMPARE_TREE_DELETED, 5530 ctx); 5531 if (ret < 0) 5532 goto out; 5533 advance_right = ADVANCE; 5534 } else { 5535 enum btrfs_compare_tree_result result; 5536 5537 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5538 ret = tree_compare_item(left_path, right_path, 5539 tmp_buf); 5540 if (ret) 5541 result = BTRFS_COMPARE_TREE_CHANGED; 5542 else 5543 result = BTRFS_COMPARE_TREE_SAME; 5544 ret = changed_cb(left_path, right_path, 5545 &left_key, result, ctx); 5546 if (ret < 0) 5547 goto out; 5548 advance_left = ADVANCE; 5549 advance_right = ADVANCE; 5550 } 5551 } else if (left_level == right_level) { 5552 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5553 if (cmp < 0) { 5554 advance_left = ADVANCE; 5555 } else if (cmp > 0) { 5556 advance_right = ADVANCE; 5557 } else { 5558 left_blockptr = btrfs_node_blockptr( 5559 left_path->nodes[left_level], 5560 left_path->slots[left_level]); 5561 right_blockptr = btrfs_node_blockptr( 5562 right_path->nodes[right_level], 5563 right_path->slots[right_level]); 5564 left_gen = btrfs_node_ptr_generation( 5565 left_path->nodes[left_level], 5566 left_path->slots[left_level]); 5567 right_gen = btrfs_node_ptr_generation( 5568 right_path->nodes[right_level], 5569 right_path->slots[right_level]); 5570 if (left_blockptr == right_blockptr && 5571 left_gen == right_gen) { 5572 /* 5573 * As we're on a shared block, don't 5574 * allow to go deeper. 5575 */ 5576 advance_left = ADVANCE_ONLY_NEXT; 5577 advance_right = ADVANCE_ONLY_NEXT; 5578 } else { 5579 advance_left = ADVANCE; 5580 advance_right = ADVANCE; 5581 } 5582 } 5583 } else if (left_level < right_level) { 5584 advance_right = ADVANCE; 5585 } else { 5586 advance_left = ADVANCE; 5587 } 5588 } 5589 5590 out: 5591 btrfs_free_path(left_path); 5592 btrfs_free_path(right_path); 5593 kvfree(tmp_buf); 5594 return ret; 5595 } 5596 5597 /* 5598 * this is similar to btrfs_next_leaf, but does not try to preserve 5599 * and fixup the path. It looks for and returns the next key in the 5600 * tree based on the current path and the min_trans parameters. 5601 * 5602 * 0 is returned if another key is found, < 0 if there are any errors 5603 * and 1 is returned if there are no higher keys in the tree 5604 * 5605 * path->keep_locks should be set to 1 on the search made before 5606 * calling this function. 
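 *
 * btrfs_search_forward() above is the canonical caller: it searches with
 * keep_locks set and, when a node runs out of usable slots, does
 *
 *	sret = btrfs_find_next_key(root, path, min_key, level, min_trans);
 *
 * to pick the next starting key before releasing and re-searching.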
5607 */ 5608 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 5609 struct btrfs_key *key, int level, u64 min_trans) 5610 { 5611 int slot; 5612 struct extent_buffer *c; 5613 5614 WARN_ON(!path->keep_locks); 5615 while (level < BTRFS_MAX_LEVEL) { 5616 if (!path->nodes[level]) 5617 return 1; 5618 5619 slot = path->slots[level] + 1; 5620 c = path->nodes[level]; 5621 next: 5622 if (slot >= btrfs_header_nritems(c)) { 5623 int ret; 5624 int orig_lowest; 5625 struct btrfs_key cur_key; 5626 if (level + 1 >= BTRFS_MAX_LEVEL || 5627 !path->nodes[level + 1]) 5628 return 1; 5629 5630 if (path->locks[level + 1]) { 5631 level++; 5632 continue; 5633 } 5634 5635 slot = btrfs_header_nritems(c) - 1; 5636 if (level == 0) 5637 btrfs_item_key_to_cpu(c, &cur_key, slot); 5638 else 5639 btrfs_node_key_to_cpu(c, &cur_key, slot); 5640 5641 orig_lowest = path->lowest_level; 5642 btrfs_release_path(path); 5643 path->lowest_level = level; 5644 ret = btrfs_search_slot(NULL, root, &cur_key, path, 5645 0, 0); 5646 path->lowest_level = orig_lowest; 5647 if (ret < 0) 5648 return ret; 5649 5650 c = path->nodes[level]; 5651 slot = path->slots[level]; 5652 if (ret == 0) 5653 slot++; 5654 goto next; 5655 } 5656 5657 if (level == 0) 5658 btrfs_item_key_to_cpu(c, key, slot); 5659 else { 5660 u64 gen = btrfs_node_ptr_generation(c, slot); 5661 5662 if (gen < min_trans) { 5663 slot++; 5664 goto next; 5665 } 5666 btrfs_node_key_to_cpu(c, key, slot); 5667 } 5668 return 0; 5669 } 5670 return 1; 5671 } 5672 5673 /* 5674 * search the tree again to find a leaf with greater keys 5675 * returns 0 if it found something or 1 if there are no greater leaves. 5676 * returns < 0 on io errors. 5677 */ 5678 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 5679 { 5680 return btrfs_next_old_leaf(root, path, 0); 5681 } 5682 5683 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5684 u64 time_seq) 5685 { 5686 int slot; 5687 int level; 5688 struct extent_buffer *c; 5689 struct extent_buffer *next; 5690 struct btrfs_key key; 5691 u32 nritems; 5692 int ret; 5693 int old_spinning = path->leave_spinning; 5694 int next_rw_lock = 0; 5695 5696 nritems = btrfs_header_nritems(path->nodes[0]); 5697 if (nritems == 0) 5698 return 1; 5699 5700 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5701 again: 5702 level = 1; 5703 next = NULL; 5704 next_rw_lock = 0; 5705 btrfs_release_path(path); 5706 5707 path->keep_locks = 1; 5708 path->leave_spinning = 1; 5709 5710 if (time_seq) 5711 ret = btrfs_search_old_slot(root, &key, path, time_seq); 5712 else 5713 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5714 path->keep_locks = 0; 5715 5716 if (ret < 0) 5717 return ret; 5718 5719 nritems = btrfs_header_nritems(path->nodes[0]); 5720 /* 5721 * by releasing the path above we dropped all our locks. A balance 5722 * could have added more items next to the key that used to be 5723 * at the very end of the block. So, check again here and 5724 * advance the path if there are now more items available. 5725 */ 5726 if (nritems > 0 && path->slots[0] < nritems - 1) { 5727 if (ret == 0) 5728 path->slots[0]++; 5729 ret = 0; 5730 goto done; 5731 } 5732 /* 5733 * So the above check misses one case: 5734 * - after releasing the path above, someone has removed the item that 5735 * used to be at the very end of the block, and balance between leafs 5736 * gets another one with bigger key.offset to replace it. 5737 * 5738 * This one should be returned as well, or we can get leaf corruption 5739 * later(esp. 
in __btrfs_drop_extents()). 5740 * 5741 * And a bit more explanation about this check, 5742 * with ret > 0, the key isn't found, the path points to the slot 5743 * where it should be inserted, so the path->slots[0] item must be the 5744 * bigger one. 5745 */ 5746 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { 5747 ret = 0; 5748 goto done; 5749 } 5750 5751 while (level < BTRFS_MAX_LEVEL) { 5752 if (!path->nodes[level]) { 5753 ret = 1; 5754 goto done; 5755 } 5756 5757 slot = path->slots[level] + 1; 5758 c = path->nodes[level]; 5759 if (slot >= btrfs_header_nritems(c)) { 5760 level++; 5761 if (level == BTRFS_MAX_LEVEL) { 5762 ret = 1; 5763 goto done; 5764 } 5765 continue; 5766 } 5767 5768 if (next) { 5769 btrfs_tree_unlock_rw(next, next_rw_lock); 5770 free_extent_buffer(next); 5771 } 5772 5773 next = c; 5774 next_rw_lock = path->locks[level]; 5775 ret = read_block_for_search(root, path, &next, level, 5776 slot, &key); 5777 if (ret == -EAGAIN) 5778 goto again; 5779 5780 if (ret < 0) { 5781 btrfs_release_path(path); 5782 goto done; 5783 } 5784 5785 if (!path->skip_locking) { 5786 ret = btrfs_try_tree_read_lock(next); 5787 if (!ret && time_seq) { 5788 /* 5789 * If we don't get the lock, we may be racing 5790 * with push_leaf_left, holding that lock while 5791 * itself waiting for the leaf we've currently 5792 * locked. To solve this situation, we give up 5793 * on our lock and cycle. 5794 */ 5795 free_extent_buffer(next); 5796 btrfs_release_path(path); 5797 cond_resched(); 5798 goto again; 5799 } 5800 if (!ret) { 5801 btrfs_set_path_blocking(path); 5802 btrfs_tree_read_lock(next); 5803 } 5804 next_rw_lock = BTRFS_READ_LOCK; 5805 } 5806 break; 5807 } 5808 path->slots[level] = slot; 5809 while (1) { 5810 level--; 5811 c = path->nodes[level]; 5812 if (path->locks[level]) 5813 btrfs_tree_unlock_rw(c, path->locks[level]); 5814 5815 free_extent_buffer(c); 5816 path->nodes[level] = next; 5817 path->slots[level] = 0; 5818 if (!path->skip_locking) 5819 path->locks[level] = next_rw_lock; 5820 if (!level) 5821 break; 5822 5823 ret = read_block_for_search(root, path, &next, level, 5824 0, &key); 5825 if (ret == -EAGAIN) 5826 goto again; 5827 5828 if (ret < 0) { 5829 btrfs_release_path(path); 5830 goto done; 5831 } 5832 5833 if (!path->skip_locking) { 5834 ret = btrfs_try_tree_read_lock(next); 5835 if (!ret) { 5836 btrfs_set_path_blocking(path); 5837 btrfs_tree_read_lock(next); 5838 } 5839 next_rw_lock = BTRFS_READ_LOCK; 5840 } 5841 } 5842 ret = 0; 5843 done: 5844 unlock_up(path, 0, 1, 0, NULL); 5845 path->leave_spinning = old_spinning; 5846 if (!old_spinning) 5847 btrfs_set_path_blocking(path); 5848 5849 return ret; 5850 } 5851 5852 /* 5853 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 5854 * searching until it gets past min_objectid or finds an item of 'type' 5855 * 5856 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5857 */ 5858 int btrfs_previous_item(struct btrfs_root *root, 5859 struct btrfs_path *path, u64 min_objectid, 5860 int type) 5861 { 5862 struct btrfs_key found_key; 5863 struct extent_buffer *leaf; 5864 u32 nritems; 5865 int ret; 5866 5867 while (1) { 5868 if (path->slots[0] == 0) { 5869 btrfs_set_path_blocking(path); 5870 ret = btrfs_prev_leaf(root, path); 5871 if (ret != 0) 5872 return ret; 5873 } else { 5874 path->slots[0]--; 5875 } 5876 leaf = path->nodes[0]; 5877 nritems = btrfs_header_nritems(leaf); 5878 if (nritems == 0) 5879 return 1; 5880 if (path->slots[0] == nritems) 5881 path->slots[0]--; 5882 5883 
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5884 		if (found_key.objectid < min_objectid)
5885 			break;
5886 		if (found_key.type == type)
5887 			return 0;
5888 		if (found_key.objectid == min_objectid &&
5889 		    found_key.type < type)
5890 			break;
5891 	}
5892 	return 1;
5893 }
5894 
5895 /*
5896  * search in extent tree to find a previous Metadata/Data extent item with
5897  * min objectid.
5898  *
5899  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5900  */
5901 int btrfs_previous_extent_item(struct btrfs_root *root,
5902 			struct btrfs_path *path, u64 min_objectid)
5903 {
5904 	struct btrfs_key found_key;
5905 	struct extent_buffer *leaf;
5906 	u32 nritems;
5907 	int ret;
5908 
5909 	while (1) {
5910 		if (path->slots[0] == 0) {
5911 			btrfs_set_path_blocking(path);
5912 			ret = btrfs_prev_leaf(root, path);
5913 			if (ret != 0)
5914 				return ret;
5915 		} else {
5916 			path->slots[0]--;
5917 		}
5918 		leaf = path->nodes[0];
5919 		nritems = btrfs_header_nritems(leaf);
5920 		if (nritems == 0)
5921 			return 1;
5922 		if (path->slots[0] == nritems)
5923 			path->slots[0]--;
5924 
5925 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5926 		if (found_key.objectid < min_objectid)
5927 			break;
5928 		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5929 		    found_key.type == BTRFS_METADATA_ITEM_KEY)
5930 			return 0;
5931 		if (found_key.objectid == min_objectid &&
5932 		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5933 			break;
5934 	}
5935 	return 1;
5936 }
5937
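
/*
 * Example usage (an illustrative sketch, not code from this file): a
 * caller scanning backwards through the extent tree from wherever
 * 'path' currently points, looking for the closest preceding
 * Metadata/Data extent item, might do the following, where
 * 'extent_root' stands for the caller's extent tree root:
 *
 *	struct btrfs_key key;
 *	int ret;
 *
 *	ret = btrfs_previous_extent_item(extent_root, path, 0);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	else if (ret == 1)
 *		(nothing smaller exists in the tree)
 *	else
 *		(ret is an I/O error propagated from btrfs_prev_leaf())
 */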