/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	return path;
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
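/*
 * Illustrative sketch of the protocol described above; the helper below is
 * hypothetical and not called anywhere.  Every lock in the path is made
 * blocking before anything that might schedule, then dropped back to
 * spinning locks via btrfs_clear_path_blocking() (declared in ctree.h and
 * defined just below).
 */
static void __maybe_unused example_path_blocking_protocol(struct btrfs_path *p)
{
	btrfs_set_path_blocking(p);
	cond_resched();			/* anything that might sleep */
	btrfs_clear_path_blocking(p, NULL, 0);
}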
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}
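/*
 * Hypothetical usage sketch for the path helpers above (not called from
 * anywhere): allocate a path, search with it, then release and free it.
 * btrfs_search_slot() is defined later in this file.
 */
static int __maybe_unused example_path_lifecycle(struct btrfs_root *root,
						 struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* read-only search: no transaction, no insertion length, no cow */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* ... inspect path->nodes[0] and path->slots[0] here ... */

	btrfs_release_path(path);	/* drop locks and references early */
out:
	btrfs_free_path(path);		/* this also releases the path */
	return ret;
}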
/* loop around taking references on and read locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cow-only roots (everything that is not a reference counted cow subvolume)
 * just get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
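/*
 * Hypothetical sketch of a btrfs_copy_root() caller, modelled on snapshot
 * creation (not called anywhere): lock the current root node, COW it into
 * the running transaction, copy it for the new tree, then drop our lock
 * and reference again.
 */
static int __maybe_unused example_copy_root_for_snapshot(
					struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					u64 new_objectid,
					struct extent_buffer **new_node)
{
	struct extent_buffer *old;
	int ret;

	old = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
	if (ret) {
		btrfs_tree_unlock(old);
		free_extent_buffer(old);
		return ret;
	}

	ret = btrfs_copy_root(trans, root, old, new_node, new_objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);
	return ret;
}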
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Increment the upper half of tree_mod_seq, set lower half zero.
 *
 * Must be called with fs_info->tree_mod_seq_lock held.
 */
static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
{
	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
	seq &= 0xffffffff00000000ull;
	seq += 1ull << 32;
	atomic64_set(&fs_info->tree_mod_seq, seq);
	return seq;
}

/*
 * Increment the lower half of tree_mod_seq.
 *
 * Must be called with fs_info->tree_mod_seq_lock held.  The way major
 * numbers are generated should not technically require a spin lock here.
 * (Rationale: a minor increment that races with a major increment, landing
 * between its atomic64_read and atomic64_set calls, doesn't duplicate
 * sequence numbers, it just returns a unique sequence number as usual.)
 * We have decided to leave that requirement in here and rethink it once
 * we notice it really imposes a problem on some workload.
 */
static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * return the last minor in the previous major tree_mod_seq number
 */
u64 btrfs_tree_mod_seq_prev(u64 seq)
{
	return (seq & 0xffffffff00000000ull) - 1ull;
}
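/*
 * Worked example of the sequence number layout used above, with the major
 * number in the upper 32 bits and the minor number in the lower 32 bits
 * (the values are illustrative):
 *
 *   seq = (2ull << 32) + 5		   major 2, minor 5
 *   btrfs_tree_mod_seq_prev(seq)	== (2ull << 32) - 1,
 *					   the last minor of major 1
 */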
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set.  So when a caller
 * expects to record tree modifications, it should ensure that elem->seq is
 * set to zero before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if
 * no new blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	u64 seq;

	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
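/*
 * Hypothetical sketch (not called anywhere) of a tree mod log blocker as
 * described above: elem->seq must be zero before btrfs_get_tree_mod_seq(),
 * and btrfs_put_tree_mod_seq() allows the log to be pruned again.
 */
static u64 __maybe_unused example_tree_mod_seq_blocker(
					struct btrfs_fs_info *fs_info,
					struct seq_list *elem)
{
	u64 time_seq;

	elem->seq = 0;
	INIT_LIST_HEAD(&elem->list);

	time_seq = btrfs_get_tree_mod_seq(fs_info, elem);

	/* ... walk past tree states using time_seq here ... */

	btrfs_put_tree_mod_seq(fs_info, elem);
	return time_seq;
}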
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;
	int ret = 0;

	BUG_ON(!tm);

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		/*
		 * Ok we no longer care about logging modifications, free up tm
		 * and return 0.  Any callers shouldn't be using tm after
		 * calling tree_mod_log_insert, but if they do we can just
		 * change this to return a special error code to let the
		 * callers do their own thing.
		 */
		kfree(tm);
		return 0;
	}

	spin_lock(&fs_info->tree_mod_seq_lock);
	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else {
			ret = -EEXIST;
			kfree(tm);
			goto out;
		}
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
out:
	tree_mod_log_write_unlock(fs_info);
	return ret;
}

/*
 * Determines if logging can be omitted.  Returns 1 if it can, i.e. if no
 * one is blocking the tree mod log (the tree_mod_seq_list is empty) or if
 * @eb is a leaf; modifications to leaves are never logged.  Returns 0 if
 * the modification must be logged.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;
	return 0;
}

static inline int
__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot,
			  enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return -ENOMEM;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);

	return __tree_mod_log_insert(fs_info, tm);
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
}
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
				MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return -ENOMEM;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	return __tree_mod_log_insert(fs_info, tm);
}

static inline void
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	int i;
	u32 nritems;
	int ret;

	if (btrfs_header_level(eb) == 0)
		return;

	nritems = btrfs_header_nritems(eb);
	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert_key(fs_info, eb, i,
				MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		BUG_ON(ret < 0);
	}
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm;

	if (tree_mod_dont_log(fs_info, NULL))
		return 0;

	if (log_removal)
		__tree_mod_log_free_eb(fs_info, old_root);

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return -ENOMEM;

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	return __tree_mod_log_insert(fs_info, tm);
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item).  any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item).  any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, NULL))
		return;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert_key(fs_info, src,
						i + src_offset,
						MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
		ret = __tree_mod_log_insert_key(fs_info, dst,
						i + dst_offset,
						MOD_LOG_KEY_ADD,
						GFP_NOFS);
		BUG_ON(ret < 0);
	}
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = __tree_mod_log_insert_key(fs_info, eb, slot,
					MOD_LOG_KEY_REPLACE,
					atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	if (tree_mod_dont_log(fs_info, eb))
		return;
	__tree_mod_log_free_eb(fs_info, eb);
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared.  If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
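/*
 * Worked example for the sharing test above (illustrative numbers): if the
 * last snapshot of a reference counted root was taken in generation 100, a
 * non-root block with header generation 90 predates the snapshot and may
 * be shared with it, so the function returns 1.  A block with generation
 * 110 that was not allocated by relocation cannot be shared and the
 * function returns 0.
 */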
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in the tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in the tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is
 * marked dirty and returned locked.  If you modify the block it needs to
 * be marked dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size
 * in bytes the allocator should try to find free next to the block it
 * returns.  This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref)
			tree_mod_log_free_eb(root->fs_info, buf);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all).  this has the
	 * index of the *new* root, making it the very first operation
	 * that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it.  this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root.  normally, we'll
		 * find a MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb.  then, all
 * previous operations will be rewound (until we reach something older
 * than time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used
		 * for the modification.  as we're going backwards, we do
		 * the opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special.  for roots, this must
			 * be handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the
			 * node was a root: root A -> child B; then A gets
			 * empty and B is promoted to the new root.  in the
			 * mod log, we'll have a root-replace operation for
			 * B, a tree block that is not a root.  we simply
			 * ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		if (tm->index != first_tm->index)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}
/*
 * Called with eb read locked.  If the buffer cannot be rewound, the same
 * buffer is returned.  If rewind operations happen, a fresh buffer is
 * returned.  The returned buffer is always read-locked.  If the returned
 * buffer is not the input buffer, the lock on the input buffer is released
 * and the input buffer is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}
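/*
 * Hypothetical sketch (not called anywhere) of how the rewind machinery is
 * used for time travel reads: take a blocker, search the past tree state
 * with btrfs_search_old_slot() (defined later in this file), then drop the
 * blocker so the log can be pruned again.
 */
static int __maybe_unused example_search_old_state(struct btrfs_root *root,
						   struct btrfs_key *key,
						   struct btrfs_path *path,
						   struct seq_list *elem)
{
	u64 time_seq;
	int ret;

	elem->seq = 0;
	INIT_LIST_HEAD(&elem->list);
	time_seq = btrfs_get_tree_mod_seq(root->fs_info, elem);

	ret = btrfs_search_old_slot(root, key, path, time_seq);

	btrfs_release_path(path);
	btrfs_put_tree_mod_seq(root->fs_info, elem);
	return ret;
}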
/*
 * get_old_root() rewinds the state of @root's root node to the given
 * @time_seq value.  If there are no changes, the current root->root_node
 * is returned.  If anything changed in between, there's a fresh buffer
 * allocated on which the rewind operations are done.  In any case, the
 * returned buffer is read locked.  Returns NULL on error (with no locks
 * held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	u32 blocksize;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		blocksize = btrfs_level_size(root, old_root->level);
		old = read_tree_block(root, logical, blocksize, 0);
		if (!old || !extent_buffer_uptodate(old)) {
			free_extent_buffer(old);
			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
				logical);
			WARN_ON(1);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !root->force_cow)
		return 0;
	return 1;
}
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     (unsigned long long)trans->transid,
		     (unsigned long long)
		     root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     (unsigned long long)trans->transid,
		     (unsigned long long)root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
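/*
 * Worked example of the key ordering implemented above (illustrative
 * values): (256, BTRFS_INODE_ITEM_KEY, 0) sorts before
 * (256, BTRFS_INODE_REF_KEY, 256) because the objectids are equal and
 * BTRFS_INODE_ITEM_KEY < BTRFS_INODE_REF_KEY; the offset only breaks ties
 * when both objectid and type are equal.
 */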
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
				if (!cur || !extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}
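/*
 * Illustrative layout for the helper above, with hypothetical numbers: in
 * a leaf with 4000 bytes of data area and two items with 100 and 50 bytes
 * of data, item 0's data occupies [3900, 4000) and item 1's data occupies
 * [3850, 3900), so leaf_data_end() returns 3850, the start of the last
 * item's data and the current bottom of the data stack.
 */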
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}
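/*
 * Hypothetical sketch (not called anywhere) of the btrfs_bin_search()
 * semantics described above: a return of 0 means the key was found at
 * *slot, a return of 1 means it was not found and *slot is the position
 * where it would be inserted (possibly equal to the item count).
 */
static int __maybe_unused example_key_lookup(struct extent_buffer *eb,
					     struct btrfs_key *key, int *slot)
{
	int ret;

	ret = btrfs_bin_search(eb, key, btrfs_header_level(eb), slot);
	return ret == 0;	/* 1 if the exact key exists in eb */
}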
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
				   struct extent_buffer *parent, int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;

	if (slot < 0)
		return NULL;
	if (slot >= btrfs_header_nritems(parent))
		return NULL;

	BUG_ON(level == 0);

	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
			     btrfs_level_size(root, level - 1),
			     btrfs_node_ptr_generation(parent, slot));
	if (eb && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = NULL;
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
This is also done top down, so we 1943 * have to be pessimistic. 1944 */ 1945 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, 1946 struct btrfs_root *root, 1947 struct btrfs_path *path, int level) 1948 { 1949 struct extent_buffer *right = NULL; 1950 struct extent_buffer *mid; 1951 struct extent_buffer *left = NULL; 1952 struct extent_buffer *parent = NULL; 1953 int ret = 0; 1954 int wret; 1955 int pslot; 1956 int orig_slot = path->slots[level]; 1957 1958 if (level == 0) 1959 return 1; 1960 1961 mid = path->nodes[level]; 1962 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1963 1964 if (level < BTRFS_MAX_LEVEL - 1) { 1965 parent = path->nodes[level + 1]; 1966 pslot = path->slots[level + 1]; 1967 } 1968 1969 if (!parent) 1970 return 1; 1971 1972 left = read_node_slot(root, parent, pslot - 1); 1973 1974 /* first, try to make some room in the middle buffer */ 1975 if (left) { 1976 u32 left_nr; 1977 1978 btrfs_tree_lock(left); 1979 btrfs_set_lock_blocking(left); 1980 1981 left_nr = btrfs_header_nritems(left); 1982 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1983 wret = 1; 1984 } else { 1985 ret = btrfs_cow_block(trans, root, left, parent, 1986 pslot - 1, &left); 1987 if (ret) 1988 wret = 1; 1989 else { 1990 wret = push_node_left(trans, root, 1991 left, mid, 0); 1992 } 1993 } 1994 if (wret < 0) 1995 ret = wret; 1996 if (wret == 0) { 1997 struct btrfs_disk_key disk_key; 1998 orig_slot += left_nr; 1999 btrfs_node_key(mid, &disk_key, 0); 2000 tree_mod_log_set_node_key(root->fs_info, parent, 2001 pslot, 0); 2002 btrfs_set_node_key(parent, &disk_key, pslot); 2003 btrfs_mark_buffer_dirty(parent); 2004 if (btrfs_header_nritems(left) > orig_slot) { 2005 path->nodes[level] = left; 2006 path->slots[level + 1] -= 1; 2007 path->slots[level] = orig_slot; 2008 btrfs_tree_unlock(mid); 2009 free_extent_buffer(mid); 2010 } else { 2011 orig_slot -= 2012 btrfs_header_nritems(left); 2013 path->slots[level] = orig_slot; 2014 btrfs_tree_unlock(left); 2015 free_extent_buffer(left); 2016 } 2017 return 0; 2018 } 2019 btrfs_tree_unlock(left); 2020 free_extent_buffer(left); 2021 } 2022 right = read_node_slot(root, parent, pslot + 1); 2023 2024 /* 2025 * then try to empty the right most buffer into the middle 2026 */ 2027 if (right) { 2028 u32 right_nr; 2029 2030 btrfs_tree_lock(right); 2031 btrfs_set_lock_blocking(right); 2032 2033 right_nr = btrfs_header_nritems(right); 2034 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 2035 wret = 1; 2036 } else { 2037 ret = btrfs_cow_block(trans, root, right, 2038 parent, pslot + 1, 2039 &right); 2040 if (ret) 2041 wret = 1; 2042 else { 2043 wret = balance_node_right(trans, root, 2044 right, mid); 2045 } 2046 } 2047 if (wret < 0) 2048 ret = wret; 2049 if (wret == 0) { 2050 struct btrfs_disk_key disk_key; 2051 2052 btrfs_node_key(right, &disk_key, 0); 2053 tree_mod_log_set_node_key(root->fs_info, parent, 2054 pslot + 1, 0); 2055 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2056 btrfs_mark_buffer_dirty(parent); 2057 2058 if (btrfs_header_nritems(mid) <= orig_slot) { 2059 path->nodes[level] = right; 2060 path->slots[level + 1] += 1; 2061 path->slots[level] = orig_slot - 2062 btrfs_header_nritems(mid); 2063 btrfs_tree_unlock(mid); 2064 free_extent_buffer(mid); 2065 } else { 2066 btrfs_tree_unlock(right); 2067 free_extent_buffer(right); 2068 } 2069 return 0; 2070 } 2071 btrfs_tree_unlock(right); 2072 free_extent_buffer(right); 2073 } 2074 return 1; 2075 } 2076 2077 /* 2078 * readahead one full node of leaves, finding things that are 
close 2079 * to the block in 'slot', and triggering ra on them. 2080 */ 2081 static void reada_for_search(struct btrfs_root *root, 2082 struct btrfs_path *path, 2083 int level, int slot, u64 objectid) 2084 { 2085 struct extent_buffer *node; 2086 struct btrfs_disk_key disk_key; 2087 u32 nritems; 2088 u64 search; 2089 u64 target; 2090 u64 nread = 0; 2091 u64 gen; 2092 int direction = path->reada; 2093 struct extent_buffer *eb; 2094 u32 nr; 2095 u32 blocksize; 2096 u32 nscan = 0; 2097 2098 if (level != 1) 2099 return; 2100 2101 if (!path->nodes[level]) 2102 return; 2103 2104 node = path->nodes[level]; 2105 2106 search = btrfs_node_blockptr(node, slot); 2107 blocksize = btrfs_level_size(root, level - 1); 2108 eb = btrfs_find_tree_block(root, search, blocksize); 2109 if (eb) { 2110 free_extent_buffer(eb); 2111 return; 2112 } 2113 2114 target = search; 2115 2116 nritems = btrfs_header_nritems(node); 2117 nr = slot; 2118 2119 while (1) { 2120 if (direction < 0) { 2121 if (nr == 0) 2122 break; 2123 nr--; 2124 } else if (direction > 0) { 2125 nr++; 2126 if (nr >= nritems) 2127 break; 2128 } 2129 if (path->reada < 0 && objectid) { 2130 btrfs_node_key(node, &disk_key, nr); 2131 if (btrfs_disk_key_objectid(&disk_key) != objectid) 2132 break; 2133 } 2134 search = btrfs_node_blockptr(node, nr); 2135 if ((search <= target && target - search <= 65536) || 2136 (search > target && search - target <= 65536)) { 2137 gen = btrfs_node_ptr_generation(node, nr); 2138 readahead_tree_block(root, search, blocksize, gen); 2139 nread += blocksize; 2140 } 2141 nscan++; 2142 if ((nread > 65536 || nscan > 32)) 2143 break; 2144 } 2145 } 2146 2147 static noinline void reada_for_balance(struct btrfs_root *root, 2148 struct btrfs_path *path, int level) 2149 { 2150 int slot; 2151 int nritems; 2152 struct extent_buffer *parent; 2153 struct extent_buffer *eb; 2154 u64 gen; 2155 u64 block1 = 0; 2156 u64 block2 = 0; 2157 int blocksize; 2158 2159 parent = path->nodes[level + 1]; 2160 if (!parent) 2161 return; 2162 2163 nritems = btrfs_header_nritems(parent); 2164 slot = path->slots[level + 1]; 2165 blocksize = btrfs_level_size(root, level); 2166 2167 if (slot > 0) { 2168 block1 = btrfs_node_blockptr(parent, slot - 1); 2169 gen = btrfs_node_ptr_generation(parent, slot - 1); 2170 eb = btrfs_find_tree_block(root, block1, blocksize); 2171 /* 2172 * if we get -eagain from btrfs_buffer_uptodate, we 2173 * don't want to return eagain here. That will loop 2174 * forever 2175 */ 2176 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2177 block1 = 0; 2178 free_extent_buffer(eb); 2179 } 2180 if (slot + 1 < nritems) { 2181 block2 = btrfs_node_blockptr(parent, slot + 1); 2182 gen = btrfs_node_ptr_generation(parent, slot + 1); 2183 eb = btrfs_find_tree_block(root, block2, blocksize); 2184 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2185 block2 = 0; 2186 free_extent_buffer(eb); 2187 } 2188 2189 if (block1) 2190 readahead_tree_block(root, block1, blocksize, 0); 2191 if (block2) 2192 readahead_tree_block(root, block2, blocksize, 0); 2193 } 2194 2195 2196 /* 2197 * when we walk down the tree, it is usually safe to unlock the higher layers 2198 * in the tree. The exceptions are when our path goes through slot 0, because 2199 * operations on the tree might require changing key pointers higher up in the 2200 * tree. 2201 * 2202 * callers might also have set path->keep_locks, which tells this code to keep 2203 * the lock if the path points to the last slot in the block. 
This is part of 2204 * walking through the tree, and selecting the next slot in the higher block. 2205 * 2206 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 2207 * if lowest_unlock is 1, level 0 won't be unlocked 2208 */ 2209 static noinline void unlock_up(struct btrfs_path *path, int level, 2210 int lowest_unlock, int min_write_lock_level, 2211 int *write_lock_level) 2212 { 2213 int i; 2214 int skip_level = level; 2215 int no_skips = 0; 2216 struct extent_buffer *t; 2217 2218 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2219 if (!path->nodes[i]) 2220 break; 2221 if (!path->locks[i]) 2222 break; 2223 if (!no_skips && path->slots[i] == 0) { 2224 skip_level = i + 1; 2225 continue; 2226 } 2227 if (!no_skips && path->keep_locks) { 2228 u32 nritems; 2229 t = path->nodes[i]; 2230 nritems = btrfs_header_nritems(t); 2231 if (nritems < 1 || path->slots[i] >= nritems - 1) { 2232 skip_level = i + 1; 2233 continue; 2234 } 2235 } 2236 if (skip_level < i && i >= lowest_unlock) 2237 no_skips = 1; 2238 2239 t = path->nodes[i]; 2240 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 2241 btrfs_tree_unlock_rw(t, path->locks[i]); 2242 path->locks[i] = 0; 2243 if (write_lock_level && 2244 i > min_write_lock_level && 2245 i <= *write_lock_level) { 2246 *write_lock_level = i - 1; 2247 } 2248 } 2249 } 2250 } 2251 2252 /* 2253 * This releases any locks held in the path starting at level and 2254 * going all the way up to the root. 2255 * 2256 * btrfs_search_slot will keep the lock held on higher nodes in a few 2257 * corner cases, such as COW of the block at slot zero in the node. This 2258 * ignores those rules, and it should only be called when there are no 2259 * more updates to be done higher up in the tree. 2260 */ 2261 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) 2262 { 2263 int i; 2264 2265 if (path->keep_locks) 2266 return; 2267 2268 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2269 if (!path->nodes[i]) 2270 continue; 2271 if (!path->locks[i]) 2272 continue; 2273 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 2274 path->locks[i] = 0; 2275 } 2276 } 2277 2278 /* 2279 * helper function for btrfs_search_slot. The goal is to find a block 2280 * in cache without setting the path to blocking. If we find the block 2281 * we return zero and the path is unchanged. 2282 * 2283 * If we can't find the block, we set the path blocking and do some 2284 * reada. -EAGAIN is returned and the search must be repeated. 2285 */ 2286 static int 2287 read_block_for_search(struct btrfs_trans_handle *trans, 2288 struct btrfs_root *root, struct btrfs_path *p, 2289 struct extent_buffer **eb_ret, int level, int slot, 2290 struct btrfs_key *key, u64 time_seq) 2291 { 2292 u64 blocknr; 2293 u64 gen; 2294 u32 blocksize; 2295 struct extent_buffer *b = *eb_ret; 2296 struct extent_buffer *tmp; 2297 int ret; 2298 2299 blocknr = btrfs_node_blockptr(b, slot); 2300 gen = btrfs_node_ptr_generation(b, slot); 2301 blocksize = btrfs_level_size(root, level - 1); 2302 2303 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2304 if (tmp) { 2305 /* first we do an atomic uptodate check */ 2306 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2307 *eb_ret = tmp; 2308 return 0; 2309 } 2310 2311 /* the pages were up to date, but we failed 2312 * the generation number check. Do a full 2313 * read for the generation number that is correct. 
2314 * We must do this without dropping locks so 2315 * we can trust our generation number 2316 */ 2317 btrfs_set_path_blocking(p); 2318 2319 /* now we're allowed to do a blocking uptodate check */ 2320 ret = btrfs_read_buffer(tmp, gen); 2321 if (!ret) { 2322 *eb_ret = tmp; 2323 return 0; 2324 } 2325 free_extent_buffer(tmp); 2326 btrfs_release_path(p); 2327 return -EIO; 2328 } 2329 2330 /* 2331 * reduce lock contention at high levels 2332 * of the btree by dropping locks before 2333 * we read. Don't release the lock on the current 2334 * level because we need to walk this node to figure 2335 * out which blocks to read. 2336 */ 2337 btrfs_unlock_up_safe(p, level + 1); 2338 btrfs_set_path_blocking(p); 2339 2340 free_extent_buffer(tmp); 2341 if (p->reada) 2342 reada_for_search(root, p, level, slot, key->objectid); 2343 2344 btrfs_release_path(p); 2345 2346 ret = -EAGAIN; 2347 tmp = read_tree_block(root, blocknr, blocksize, 0); 2348 if (tmp) { 2349 /* 2350 * If the read above didn't mark this buffer up to date, 2351 * it will never end up being up to date. Set ret to EIO now 2352 * and give up so that our caller doesn't loop forever 2353 * on our EAGAINs. 2354 */ 2355 if (!btrfs_buffer_uptodate(tmp, 0, 0)) 2356 ret = -EIO; 2357 free_extent_buffer(tmp); 2358 } 2359 return ret; 2360 } 2361 2362 /* 2363 * helper function for btrfs_search_slot. This does all of the checks 2364 * for node-level blocks and does any balancing required based on 2365 * the ins_len. 2366 * 2367 * If no extra work was required, zero is returned. If we had to 2368 * drop the path, -EAGAIN is returned and btrfs_search_slot must 2369 * start over 2370 */ 2371 static int 2372 setup_nodes_for_search(struct btrfs_trans_handle *trans, 2373 struct btrfs_root *root, struct btrfs_path *p, 2374 struct extent_buffer *b, int level, int ins_len, 2375 int *write_lock_level) 2376 { 2377 int ret; 2378 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 2379 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 2380 int sret; 2381 2382 if (*write_lock_level < level + 1) { 2383 *write_lock_level = level + 1; 2384 btrfs_release_path(p); 2385 goto again; 2386 } 2387 2388 btrfs_set_path_blocking(p); 2389 reada_for_balance(root, p, level); 2390 sret = split_node(trans, root, p, level); 2391 btrfs_clear_path_blocking(p, NULL, 0); 2392 2393 BUG_ON(sret > 0); 2394 if (sret) { 2395 ret = sret; 2396 goto done; 2397 } 2398 b = p->nodes[level]; 2399 } else if (ins_len < 0 && btrfs_header_nritems(b) < 2400 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 2401 int sret; 2402 2403 if (*write_lock_level < level + 1) { 2404 *write_lock_level = level + 1; 2405 btrfs_release_path(p); 2406 goto again; 2407 } 2408 2409 btrfs_set_path_blocking(p); 2410 reada_for_balance(root, p, level); 2411 sret = balance_level(trans, root, p, level); 2412 btrfs_clear_path_blocking(p, NULL, 0); 2413 2414 if (sret) { 2415 ret = sret; 2416 goto done; 2417 } 2418 b = p->nodes[level]; 2419 if (!b) { 2420 btrfs_release_path(p); 2421 goto again; 2422 } 2423 BUG_ON(btrfs_header_nritems(b) == 1); 2424 } 2425 return 0; 2426 2427 again: 2428 ret = -EAGAIN; 2429 done: 2430 return ret; 2431 } 2432 2433 /* 2434 * look for key in the tree. path is filled in with nodes along the way 2435 * if key is found, we return zero and you can find the item in the leaf 2436 * level of the path (level 0) 2437 * 2438 * If the key isn't found, the path points to the slot where it should 2439 * be inserted, and 1 is returned. If there are other errors during the 2440 * search a negative error number is returned. 
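 *
 * For illustration only (not part of the original source): a minimal
 * read-only lookup sketch. The trans handle may be NULL for searches
 * that neither cow nor balance; "objectid" here stands in for a
 * hypothetical caller-supplied value, and error handling is abbreviated.
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key;
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	key.objectid = objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	btrfs_free_path(path);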
2441 *
2442 * if ins_len > 0, nodes and leaves will be split as we walk down the
2443 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
2444 * possible)
2445 */
2446 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2447 *root, struct btrfs_key *key, struct btrfs_path *p, int
2448 ins_len, int cow)
2449 {
2450 struct extent_buffer *b;
2451 int slot;
2452 int ret;
2453 int err;
2454 int level;
2455 int lowest_unlock = 1;
2456 int root_lock;
2457 /* everything at write_lock_level or lower must be write locked */
2458 int write_lock_level = 0;
2459 u8 lowest_level = 0;
2460 int min_write_lock_level;
2461
2462 lowest_level = p->lowest_level;
2463 WARN_ON(lowest_level && ins_len > 0);
2464 WARN_ON(p->nodes[0] != NULL);
2465
2466 if (ins_len < 0) {
2467 lowest_unlock = 2;
2468
2469 /* when we are removing items, we might have to go up to level
2470 * two as we update tree pointers. Make sure we keep write
2471 * locks for those levels as well
2472 */
2473 write_lock_level = 2;
2474 } else if (ins_len > 0) {
2475 /*
2476 * for inserting items, make sure we have a write lock on
2477 * level 1 so we can update keys
2478 */
2479 write_lock_level = 1;
2480 }
2481
2482 if (!cow)
2483 write_lock_level = -1;
2484
2485 if (cow && (p->keep_locks || p->lowest_level))
2486 write_lock_level = BTRFS_MAX_LEVEL;
2487
2488 min_write_lock_level = write_lock_level;
2489
2490 again:
2491 /*
2492 * we try very hard to do read locks on the root
2493 */
2494 root_lock = BTRFS_READ_LOCK;
2495 level = 0;
2496 if (p->search_commit_root) {
2497 /*
2498 * the commit roots are read only
2499 * so we always do read locks
2500 */
2501 b = root->commit_root;
2502 extent_buffer_get(b);
2503 level = btrfs_header_level(b);
2504 if (!p->skip_locking)
2505 btrfs_tree_read_lock(b);
2506 } else {
2507 if (p->skip_locking) {
2508 b = btrfs_root_node(root);
2509 level = btrfs_header_level(b);
2510 } else {
2511 /* we don't know the level of the root node
2512 * until we actually have it read locked
2513 */
2514 b = btrfs_read_lock_root_node(root);
2515 level = btrfs_header_level(b);
2516 if (level <= write_lock_level) {
2517 /* whoops, must trade for write lock */
2518 btrfs_tree_read_unlock(b);
2519 free_extent_buffer(b);
2520 b = btrfs_lock_root_node(root);
2521 root_lock = BTRFS_WRITE_LOCK;
2522
2523 /* the level might have changed, check again */
2524 level = btrfs_header_level(b);
2525 }
2526 }
2527 }
2528 p->nodes[level] = b;
2529 if (!p->skip_locking)
2530 p->locks[level] = root_lock;
2531
2532 while (b) {
2533 level = btrfs_header_level(b);
2534
2535 /*
2536 * setup the path here so we can release it under lock
2537 * contention with the cow code
2538 */
2539 if (cow) {
2540 /*
2541 * if we don't really need to cow this block
2542 * then we don't want to set the path blocking,
2543 * so we test it here
2544 */
2545 if (!should_cow_block(trans, root, b))
2546 goto cow_done;
2547
2548 btrfs_set_path_blocking(p);
2549
2550 /*
2551 * must have write locks on this node and the
2552 * parent
2553 */
2554 if (level > write_lock_level ||
2555 (level + 1 > write_lock_level &&
2556 level + 1 < BTRFS_MAX_LEVEL &&
2557 p->nodes[level + 1])) {
2558 write_lock_level = level + 1;
2559 btrfs_release_path(p);
2560 goto again;
2561 }
2562
2563 err = btrfs_cow_block(trans, root, b,
2564 p->nodes[level + 1],
2565 p->slots[level + 1], &b);
2566 if (err) {
2567 ret = err;
2568 goto done;
2569 }
2570 }
2571 cow_done:
2572 BUG_ON(!cow && ins_len);
2573
2574 p->nodes[level] = b;
2575 btrfs_clear_path_blocking(p,
NULL, 0);
2576
2577 /*
2578 * we have a lock on b and as long as we aren't changing
2579 * the tree, there is no way for the items in b to change.
2580 * It is safe to drop the lock on our parent before we
2581 * go through the expensive btree search on b.
2582 *
2583 * If cow is true, then we might be changing slot zero,
2584 * which may require changing the parent. So, we can't
2585 * drop the lock until after we know which slot we're
2586 * operating on.
2587 */
2588 if (!cow)
2589 btrfs_unlock_up_safe(p, level + 1);
2590
2591 ret = bin_search(b, key, level, &slot);
2592
2593 if (level != 0) {
2594 int dec = 0;
2595 if (ret && slot > 0) {
2596 dec = 1;
2597 slot -= 1;
2598 }
2599 p->slots[level] = slot;
2600 err = setup_nodes_for_search(trans, root, p, b, level,
2601 ins_len, &write_lock_level);
2602 if (err == -EAGAIN)
2603 goto again;
2604 if (err) {
2605 ret = err;
2606 goto done;
2607 }
2608 b = p->nodes[level];
2609 slot = p->slots[level];
2610
2611 /*
2612 * slot 0 is special, if we change the key
2613 * we have to update the parent pointer
2614 * which means we must have a write lock
2615 * on the parent
2616 */
2617 if (slot == 0 && cow &&
2618 write_lock_level < level + 1) {
2619 write_lock_level = level + 1;
2620 btrfs_release_path(p);
2621 goto again;
2622 }
2623
2624 unlock_up(p, level, lowest_unlock,
2625 min_write_lock_level, &write_lock_level);
2626
2627 if (level == lowest_level) {
2628 if (dec)
2629 p->slots[level]++;
2630 goto done;
2631 }
2632
2633 err = read_block_for_search(trans, root, p,
2634 &b, level, slot, key, 0);
2635 if (err == -EAGAIN)
2636 goto again;
2637 if (err) {
2638 ret = err;
2639 goto done;
2640 }
2641
2642 if (!p->skip_locking) {
2643 level = btrfs_header_level(b);
2644 if (level <= write_lock_level) {
2645 err = btrfs_try_tree_write_lock(b);
2646 if (!err) {
2647 btrfs_set_path_blocking(p);
2648 btrfs_tree_lock(b);
2649 btrfs_clear_path_blocking(p, b,
2650 BTRFS_WRITE_LOCK);
2651 }
2652 p->locks[level] = BTRFS_WRITE_LOCK;
2653 } else {
2654 err = btrfs_try_tree_read_lock(b);
2655 if (!err) {
2656 btrfs_set_path_blocking(p);
2657 btrfs_tree_read_lock(b);
2658 btrfs_clear_path_blocking(p, b,
2659 BTRFS_READ_LOCK);
2660 }
2661 p->locks[level] = BTRFS_READ_LOCK;
2662 }
2663 p->nodes[level] = b;
2664 }
2665 } else {
2666 p->slots[level] = slot;
2667 if (ins_len > 0 &&
2668 btrfs_leaf_free_space(root, b) < ins_len) {
2669 if (write_lock_level < 1) {
2670 write_lock_level = 1;
2671 btrfs_release_path(p);
2672 goto again;
2673 }
2674
2675 btrfs_set_path_blocking(p);
2676 err = split_leaf(trans, root, key,
2677 p, ins_len, ret == 0);
2678 btrfs_clear_path_blocking(p, NULL, 0);
2679
2680 BUG_ON(err > 0);
2681 if (err) {
2682 ret = err;
2683 goto done;
2684 }
2685 }
2686 if (!p->search_for_split)
2687 unlock_up(p, level, lowest_unlock,
2688 min_write_lock_level, &write_lock_level);
2689 goto done;
2690 }
2691 }
2692 ret = 1;
2693 done:
2694 /*
2695 * we don't really know what they plan on doing with the path
2696 * from here on, so for now just mark it as blocking
2697 */
2698 if (!p->leave_spinning)
2699 btrfs_set_path_blocking(p);
2700 if (ret < 0)
2701 btrfs_release_path(p);
2702 return ret;
2703 }
2704
2705 /*
2706 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2707 * current state of the tree together with the operations recorded in the tree
2708 * modification log to search for the key in a previous version of this tree, as
2709 * denoted by the time_seq parameter.
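 *
 * Illustrative sketch only (not from the original source): replaying a
 * lookup against an older tree state. "key" and "time_seq" are assumed
 * to be supplied by the caller (a tree mod sequence number obtained
 * elsewhere); error handling is abbreviated.
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	btrfs_free_path(path);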
2710 *
2711 * Naturally, there is no support for insert, delete or cow operations.
2712 *
2713 * The resulting path and return value will be set up as if we called
2714 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2715 */
2716 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2717 struct btrfs_path *p, u64 time_seq)
2718 {
2719 struct extent_buffer *b;
2720 int slot;
2721 int ret;
2722 int err;
2723 int level;
2724 int lowest_unlock = 1;
2725 u8 lowest_level = 0;
2726
2727 lowest_level = p->lowest_level;
2728 WARN_ON(p->nodes[0] != NULL);
2729
2730 if (p->search_commit_root) {
2731 BUG_ON(time_seq);
2732 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2733 }
2734
2735 again:
2736 b = get_old_root(root, time_seq);
2737 level = btrfs_header_level(b);
2738 p->locks[level] = BTRFS_READ_LOCK;
2739
2740 while (b) {
2741 level = btrfs_header_level(b);
2742 p->nodes[level] = b;
2743 btrfs_clear_path_blocking(p, NULL, 0);
2744
2745 /*
2746 * we have a lock on b and as long as we aren't changing
2747 * the tree, there is no way for the items in b to change.
2748 * It is safe to drop the lock on our parent before we
2749 * go through the expensive btree search on b.
2750 */
2751 btrfs_unlock_up_safe(p, level + 1);
2752
2753 ret = bin_search(b, key, level, &slot);
2754
2755 if (level != 0) {
2756 int dec = 0;
2757 if (ret && slot > 0) {
2758 dec = 1;
2759 slot -= 1;
2760 }
2761 p->slots[level] = slot;
2762 unlock_up(p, level, lowest_unlock, 0, NULL);
2763
2764 if (level == lowest_level) {
2765 if (dec)
2766 p->slots[level]++;
2767 goto done;
2768 }
2769
2770 err = read_block_for_search(NULL, root, p, &b, level,
2771 slot, key, time_seq);
2772 if (err == -EAGAIN)
2773 goto again;
2774 if (err) {
2775 ret = err;
2776 goto done;
2777 }
2778
2779 level = btrfs_header_level(b);
2780 err = btrfs_try_tree_read_lock(b);
2781 if (!err) {
2782 btrfs_set_path_blocking(p);
2783 btrfs_tree_read_lock(b);
2784 btrfs_clear_path_blocking(p, b,
2785 BTRFS_READ_LOCK);
2786 }
2787 b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
2788 if (!b) {
2789 ret = -ENOMEM;
2790 goto done;
2791 }
2792 p->locks[level] = BTRFS_READ_LOCK;
2793 p->nodes[level] = b;
2794 } else {
2795 p->slots[level] = slot;
2796 unlock_up(p, level, lowest_unlock, 0, NULL);
2797 goto done;
2798 }
2799 }
2800 ret = 1;
2801 done:
2802 if (!p->leave_spinning)
2803 btrfs_set_path_blocking(p);
2804 if (ret < 0)
2805 btrfs_release_path(p);
2806
2807 return ret;
2808 }
2809
2810 /*
2811 * helper to use instead of search slot if no exact match is needed but
2812 * instead the next or previous item should be returned.
2813 * When find_higher is true, the next higher item is returned, the next lower
2814 * otherwise.
2815 * When return_any and find_higher are both true, and no higher item is found,
2816 * return the next lower instead.
2817 * When return_any is true and find_higher is false, and no lower item is found,
2818 * return the next higher instead.
2819 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2820 * < 0 on error
2821 */
2822 int btrfs_search_slot_for_read(struct btrfs_root *root,
2823 struct btrfs_key *key, struct btrfs_path *p,
2824 int find_higher, int return_any)
2825 {
2826 int ret;
2827 struct extent_buffer *leaf;
2828
2829 again:
2830 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2831 if (ret <= 0)
2832 return ret;
2833 /*
2834 * a return value of 1 means the path is at the position where the
2835 * item should be inserted.
Normally this is the next bigger item,
2836 * but in case the previous item is the last in a leaf, path points
2837 * to the first free slot in the previous leaf, i.e. at an invalid
2838 * item.
2839 */
2840 leaf = p->nodes[0];
2841
2842 if (find_higher) {
2843 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2844 ret = btrfs_next_leaf(root, p);
2845 if (ret <= 0)
2846 return ret;
2847 if (!return_any)
2848 return 1;
2849 /*
2850 * no higher item found, return the next
2851 * lower instead
2852 */
2853 return_any = 0;
2854 find_higher = 0;
2855 btrfs_release_path(p);
2856 goto again;
2857 }
2858 } else {
2859 if (p->slots[0] == 0) {
2860 ret = btrfs_prev_leaf(root, p);
2861 if (ret < 0)
2862 return ret;
2863 if (!ret) {
2864 p->slots[0] = btrfs_header_nritems(leaf) - 1;
2865 return 0;
2866 }
2867 if (!return_any)
2868 return 1;
2869 /*
2870 * no lower item found, return the next
2871 * higher instead
2872 */
2873 return_any = 0;
2874 find_higher = 1;
2875 btrfs_release_path(p);
2876 goto again;
2877 } else {
2878 --p->slots[0];
2879 }
2880 }
2881 return 0;
2882 }
2883
2884 /*
2885 * adjust the pointers going up the tree, starting at level
2886 * making sure the right key of each node points to 'key'.
2887 * This is used after shifting pointers to the left, so it stops
2888 * fixing up pointers when a given leaf/node is not in slot 0 of the
2889 * higher levels
2890 *
2891 */
2892 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
2893 struct btrfs_disk_key *key, int level)
2894 {
2895 int i;
2896 struct extent_buffer *t;
2897
2898 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2899 int tslot = path->slots[i];
2900 if (!path->nodes[i])
2901 break;
2902 t = path->nodes[i];
2903 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2904 btrfs_set_node_key(t, key, tslot);
2905 btrfs_mark_buffer_dirty(path->nodes[i]);
2906 if (tslot != 0)
2907 break;
2908 }
2909 }
2910
2911 /*
2912 * update item key.
2913 *
2914 * This function isn't completely safe. It's the caller's responsibility
2915 * that the new key won't break the order
2916 */
2917 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
2918 struct btrfs_key *new_key)
2919 {
2920 struct btrfs_disk_key disk_key;
2921 struct extent_buffer *eb;
2922 int slot;
2923
2924 eb = path->nodes[0];
2925 slot = path->slots[0];
2926 if (slot > 0) {
2927 btrfs_item_key(eb, &disk_key, slot - 1);
2928 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2929 }
2930 if (slot < btrfs_header_nritems(eb) - 1) {
2931 btrfs_item_key(eb, &disk_key, slot + 1);
2932 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2933 }
2934
2935 btrfs_cpu_key_to_disk(&disk_key, new_key);
2936 btrfs_set_item_key(eb, &disk_key, slot);
2937 btrfs_mark_buffer_dirty(eb);
2938 if (slot == 0)
2939 fixup_low_keys(root, path, &disk_key, 1);
2940 }
2941
2942 /*
2943 * try to push data from one node into the next node left in the
2944 * tree.
2945 *
2946 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2947 * error, and > 0 if there was no room in the left hand block.
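 *
 * Worked example (illustrative numbers only, not from the original
 * source): if dst holds 110 of a possible 121 pointers and src holds
 * 20, push_items starts at 121 - 110 = 11. With empty == 0 the source
 * must keep at least 8 pointers, so at most 20 - 8 = 12 could move;
 * 11 are pushed, leaving src with 9 and dst full at 121.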
2948 */ 2949 static int push_node_left(struct btrfs_trans_handle *trans, 2950 struct btrfs_root *root, struct extent_buffer *dst, 2951 struct extent_buffer *src, int empty) 2952 { 2953 int push_items = 0; 2954 int src_nritems; 2955 int dst_nritems; 2956 int ret = 0; 2957 2958 src_nritems = btrfs_header_nritems(src); 2959 dst_nritems = btrfs_header_nritems(dst); 2960 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; 2961 WARN_ON(btrfs_header_generation(src) != trans->transid); 2962 WARN_ON(btrfs_header_generation(dst) != trans->transid); 2963 2964 if (!empty && src_nritems <= 8) 2965 return 1; 2966 2967 if (push_items <= 0) 2968 return 1; 2969 2970 if (empty) { 2971 push_items = min(src_nritems, push_items); 2972 if (push_items < src_nritems) { 2973 /* leave at least 8 pointers in the node if 2974 * we aren't going to empty it 2975 */ 2976 if (src_nritems - push_items < 8) { 2977 if (push_items <= 8) 2978 return 1; 2979 push_items -= 8; 2980 } 2981 } 2982 } else 2983 push_items = min(src_nritems - 8, push_items); 2984 2985 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, 2986 push_items); 2987 copy_extent_buffer(dst, src, 2988 btrfs_node_key_ptr_offset(dst_nritems), 2989 btrfs_node_key_ptr_offset(0), 2990 push_items * sizeof(struct btrfs_key_ptr)); 2991 2992 if (push_items < src_nritems) { 2993 /* 2994 * don't call tree_mod_log_eb_move here, key removal was already 2995 * fully logged by tree_mod_log_eb_copy above. 2996 */ 2997 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2998 btrfs_node_key_ptr_offset(push_items), 2999 (src_nritems - push_items) * 3000 sizeof(struct btrfs_key_ptr)); 3001 } 3002 btrfs_set_header_nritems(src, src_nritems - push_items); 3003 btrfs_set_header_nritems(dst, dst_nritems + push_items); 3004 btrfs_mark_buffer_dirty(src); 3005 btrfs_mark_buffer_dirty(dst); 3006 3007 return ret; 3008 } 3009 3010 /* 3011 * try to push data from one node into the next node right in the 3012 * tree. 3013 * 3014 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 3015 * error, and > 0 if there was no room in the right hand block. 
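 *
 * Worked example (illustrative numbers only, not from the original
 * source): with src holding 20 pointers and dst holding 100 of a
 * possible 121, push_items starts at 121 - 100 = 21, but
 * max_push = 20 / 2 + 1 = 11 caps it, so 11 pointers move and src
 * keeps 9.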
3016 * 3017 * this will only push up to 1/2 the contents of the left node over 3018 */ 3019 static int balance_node_right(struct btrfs_trans_handle *trans, 3020 struct btrfs_root *root, 3021 struct extent_buffer *dst, 3022 struct extent_buffer *src) 3023 { 3024 int push_items = 0; 3025 int max_push; 3026 int src_nritems; 3027 int dst_nritems; 3028 int ret = 0; 3029 3030 WARN_ON(btrfs_header_generation(src) != trans->transid); 3031 WARN_ON(btrfs_header_generation(dst) != trans->transid); 3032 3033 src_nritems = btrfs_header_nritems(src); 3034 dst_nritems = btrfs_header_nritems(dst); 3035 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; 3036 if (push_items <= 0) 3037 return 1; 3038 3039 if (src_nritems < 4) 3040 return 1; 3041 3042 max_push = src_nritems / 2 + 1; 3043 /* don't try to empty the node */ 3044 if (max_push >= src_nritems) 3045 return 1; 3046 3047 if (max_push < push_items) 3048 push_items = max_push; 3049 3050 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); 3051 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 3052 btrfs_node_key_ptr_offset(0), 3053 (dst_nritems) * 3054 sizeof(struct btrfs_key_ptr)); 3055 3056 tree_mod_log_eb_copy(root->fs_info, dst, src, 0, 3057 src_nritems - push_items, push_items); 3058 copy_extent_buffer(dst, src, 3059 btrfs_node_key_ptr_offset(0), 3060 btrfs_node_key_ptr_offset(src_nritems - push_items), 3061 push_items * sizeof(struct btrfs_key_ptr)); 3062 3063 btrfs_set_header_nritems(src, src_nritems - push_items); 3064 btrfs_set_header_nritems(dst, dst_nritems + push_items); 3065 3066 btrfs_mark_buffer_dirty(src); 3067 btrfs_mark_buffer_dirty(dst); 3068 3069 return ret; 3070 } 3071 3072 /* 3073 * helper function to insert a new root level in the tree. 3074 * A new node is allocated, and a single item is inserted to 3075 * point to the existing root 3076 * 3077 * returns zero on success or < 0 on failure. 
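 *
 * Schematically (illustration only, not from the original source): a
 * tree whose root N carries first key K grows one level; the new root
 * C holds a single pointer back to N:
 *
 *	before:  root -> N [K, ...]
 *	after:   root -> C [K] -> N [K, ...]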
3078 */ 3079 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 3080 struct btrfs_root *root, 3081 struct btrfs_path *path, int level) 3082 { 3083 u64 lower_gen; 3084 struct extent_buffer *lower; 3085 struct extent_buffer *c; 3086 struct extent_buffer *old; 3087 struct btrfs_disk_key lower_key; 3088 3089 BUG_ON(path->nodes[level]); 3090 BUG_ON(path->nodes[level-1] != root->node); 3091 3092 lower = path->nodes[level-1]; 3093 if (level == 1) 3094 btrfs_item_key(lower, &lower_key, 0); 3095 else 3096 btrfs_node_key(lower, &lower_key, 0); 3097 3098 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3099 root->root_key.objectid, &lower_key, 3100 level, root->node->start, 0); 3101 if (IS_ERR(c)) 3102 return PTR_ERR(c); 3103 3104 root_add_used(root, root->nodesize); 3105 3106 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 3107 btrfs_set_header_nritems(c, 1); 3108 btrfs_set_header_level(c, level); 3109 btrfs_set_header_bytenr(c, c->start); 3110 btrfs_set_header_generation(c, trans->transid); 3111 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); 3112 btrfs_set_header_owner(c, root->root_key.objectid); 3113 3114 write_extent_buffer(c, root->fs_info->fsid, 3115 (unsigned long)btrfs_header_fsid(c), 3116 BTRFS_FSID_SIZE); 3117 3118 write_extent_buffer(c, root->fs_info->chunk_tree_uuid, 3119 (unsigned long)btrfs_header_chunk_tree_uuid(c), 3120 BTRFS_UUID_SIZE); 3121 3122 btrfs_set_node_key(c, &lower_key, 0); 3123 btrfs_set_node_blockptr(c, 0, lower->start); 3124 lower_gen = btrfs_header_generation(lower); 3125 WARN_ON(lower_gen != trans->transid); 3126 3127 btrfs_set_node_ptr_generation(c, 0, lower_gen); 3128 3129 btrfs_mark_buffer_dirty(c); 3130 3131 old = root->node; 3132 tree_mod_log_set_root_pointer(root, c, 0); 3133 rcu_assign_pointer(root->node, c); 3134 3135 /* the super has an extra ref to root->node */ 3136 free_extent_buffer(old); 3137 3138 add_root_to_dirty_list(root); 3139 extent_buffer_get(c); 3140 path->nodes[level] = c; 3141 path->locks[level] = BTRFS_WRITE_LOCK; 3142 path->slots[level] = 0; 3143 return 0; 3144 } 3145 3146 /* 3147 * worker function to insert a single pointer in a node. 3148 * the node should have enough room for the pointer already 3149 * 3150 * slot and level indicate where you want the key to go, and 3151 * blocknr is the block the key points to. 
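 *
 * Worked example (illustrative numbers only, not from the original
 * source): inserting at slot 2 of a node with nritems == 5 first
 * shifts the pointers in slots 2..4 up to slots 3..5, then writes the
 * new key and blockptr into slot 2 and bumps nritems to 6.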
3152 */
3153 static void insert_ptr(struct btrfs_trans_handle *trans,
3154 struct btrfs_root *root, struct btrfs_path *path,
3155 struct btrfs_disk_key *key, u64 bytenr,
3156 int slot, int level)
3157 {
3158 struct extent_buffer *lower;
3159 int nritems;
3160 int ret;
3161
3162 BUG_ON(!path->nodes[level]);
3163 btrfs_assert_tree_locked(path->nodes[level]);
3164 lower = path->nodes[level];
3165 nritems = btrfs_header_nritems(lower);
3166 BUG_ON(slot > nritems);
3167 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3168 if (slot != nritems) {
3169 if (level)
3170 tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3171 slot, nritems - slot);
3172 memmove_extent_buffer(lower,
3173 btrfs_node_key_ptr_offset(slot + 1),
3174 btrfs_node_key_ptr_offset(slot),
3175 (nritems - slot) * sizeof(struct btrfs_key_ptr));
3176 }
3177 if (level) {
3178 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3179 MOD_LOG_KEY_ADD, GFP_NOFS);
3180 BUG_ON(ret < 0);
3181 }
3182 btrfs_set_node_key(lower, key, slot);
3183 btrfs_set_node_blockptr(lower, slot, bytenr);
3184 WARN_ON(trans->transid == 0);
3185 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3186 btrfs_set_header_nritems(lower, nritems + 1);
3187 btrfs_mark_buffer_dirty(lower);
3188 }
3189
3190 /*
3191 * split the node at the specified level in path in two.
3192 * The path is corrected to point to the appropriate node after the split
3193 *
3194 * Before splitting this tries to make some room in the node by pushing
3195 * left and right, if either one works, it returns right away.
3196 *
3197 * returns 0 on success and < 0 on failure
3198 */
3199 static noinline int split_node(struct btrfs_trans_handle *trans,
3200 struct btrfs_root *root,
3201 struct btrfs_path *path, int level)
3202 {
3203 struct extent_buffer *c;
3204 struct extent_buffer *split;
3205 struct btrfs_disk_key disk_key;
3206 int mid;
3207 int ret;
3208 u32 c_nritems;
3209
3210 c = path->nodes[level];
3211 WARN_ON(btrfs_header_generation(c) != trans->transid);
3212 if (c == root->node) {
3213 /*
3214 * trying to split the root, let's make a new one
3215 *
3216 * tree mod log: we don't log the removal of the old root in
3217 * insert_new_root, because that root buffer will be kept as a
3218 * normal node. We are going to log removal of half of the
3219 * elements below with tree_mod_log_eb_copy. We're holding a
3220 * tree lock on the buffer, which is why we cannot race with
3221 * other tree_mod_log users.
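 *
 * Illustrative numbers only (not from the original source): with
 * c_nritems == 10, mid becomes (10 + 1) / 2 = 5, so pointers 5..9
 * are copied into the new 'split' buffer below and c keeps
 * slots 0..4.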
3222 */ 3223 ret = insert_new_root(trans, root, path, level + 1); 3224 if (ret) 3225 return ret; 3226 } else { 3227 ret = push_nodes_for_insert(trans, root, path, level); 3228 c = path->nodes[level]; 3229 if (!ret && btrfs_header_nritems(c) < 3230 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) 3231 return 0; 3232 if (ret < 0) 3233 return ret; 3234 } 3235 3236 c_nritems = btrfs_header_nritems(c); 3237 mid = (c_nritems + 1) / 2; 3238 btrfs_node_key(c, &disk_key, mid); 3239 3240 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3241 root->root_key.objectid, 3242 &disk_key, level, c->start, 0); 3243 if (IS_ERR(split)) 3244 return PTR_ERR(split); 3245 3246 root_add_used(root, root->nodesize); 3247 3248 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 3249 btrfs_set_header_level(split, btrfs_header_level(c)); 3250 btrfs_set_header_bytenr(split, split->start); 3251 btrfs_set_header_generation(split, trans->transid); 3252 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); 3253 btrfs_set_header_owner(split, root->root_key.objectid); 3254 write_extent_buffer(split, root->fs_info->fsid, 3255 (unsigned long)btrfs_header_fsid(split), 3256 BTRFS_FSID_SIZE); 3257 write_extent_buffer(split, root->fs_info->chunk_tree_uuid, 3258 (unsigned long)btrfs_header_chunk_tree_uuid(split), 3259 BTRFS_UUID_SIZE); 3260 3261 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); 3262 copy_extent_buffer(split, c, 3263 btrfs_node_key_ptr_offset(0), 3264 btrfs_node_key_ptr_offset(mid), 3265 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3266 btrfs_set_header_nritems(split, c_nritems - mid); 3267 btrfs_set_header_nritems(c, mid); 3268 ret = 0; 3269 3270 btrfs_mark_buffer_dirty(c); 3271 btrfs_mark_buffer_dirty(split); 3272 3273 insert_ptr(trans, root, path, &disk_key, split->start, 3274 path->slots[level + 1] + 1, level + 1); 3275 3276 if (path->slots[level] >= mid) { 3277 path->slots[level] -= mid; 3278 btrfs_tree_unlock(c); 3279 free_extent_buffer(c); 3280 path->nodes[level] = split; 3281 path->slots[level + 1] += 1; 3282 } else { 3283 btrfs_tree_unlock(split); 3284 free_extent_buffer(split); 3285 } 3286 return ret; 3287 } 3288 3289 /* 3290 * how many bytes are required to store the items in a leaf. start 3291 * and nr indicate which items in the leaf to check. This totals up the 3292 * space used both by the item structs and the item data 3293 */ 3294 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3295 { 3296 struct btrfs_item *start_item; 3297 struct btrfs_item *end_item; 3298 struct btrfs_map_token token; 3299 int data_len; 3300 int nritems = btrfs_header_nritems(l); 3301 int end = min(nritems, start + nr) - 1; 3302 3303 if (!nr) 3304 return 0; 3305 btrfs_init_map_token(&token); 3306 start_item = btrfs_item_nr(l, start); 3307 end_item = btrfs_item_nr(l, end); 3308 data_len = btrfs_token_item_offset(l, start_item, &token) + 3309 btrfs_token_item_size(l, start_item, &token); 3310 data_len = data_len - btrfs_token_item_offset(l, end_item, &token); 3311 data_len += sizeof(struct btrfs_item) * nr; 3312 WARN_ON(data_len < 0); 3313 return data_len; 3314 } 3315 3316 /* 3317 * The space between the end of the leaf items and 3318 * the start of the leaf data. 
IOW, how much room 3319 * the leaf has left for both items and data 3320 */ 3321 noinline int btrfs_leaf_free_space(struct btrfs_root *root, 3322 struct extent_buffer *leaf) 3323 { 3324 int nritems = btrfs_header_nritems(leaf); 3325 int ret; 3326 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); 3327 if (ret < 0) { 3328 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " 3329 "used %d nritems %d\n", 3330 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), 3331 leaf_space_used(leaf, 0, nritems), nritems); 3332 } 3333 return ret; 3334 } 3335 3336 /* 3337 * min slot controls the lowest index we're willing to push to the 3338 * right. We'll push up to and including min_slot, but no lower 3339 */ 3340 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3341 struct btrfs_root *root, 3342 struct btrfs_path *path, 3343 int data_size, int empty, 3344 struct extent_buffer *right, 3345 int free_space, u32 left_nritems, 3346 u32 min_slot) 3347 { 3348 struct extent_buffer *left = path->nodes[0]; 3349 struct extent_buffer *upper = path->nodes[1]; 3350 struct btrfs_map_token token; 3351 struct btrfs_disk_key disk_key; 3352 int slot; 3353 u32 i; 3354 int push_space = 0; 3355 int push_items = 0; 3356 struct btrfs_item *item; 3357 u32 nr; 3358 u32 right_nritems; 3359 u32 data_end; 3360 u32 this_item_size; 3361 3362 btrfs_init_map_token(&token); 3363 3364 if (empty) 3365 nr = 0; 3366 else 3367 nr = max_t(u32, 1, min_slot); 3368 3369 if (path->slots[0] >= left_nritems) 3370 push_space += data_size; 3371 3372 slot = path->slots[1]; 3373 i = left_nritems - 1; 3374 while (i >= nr) { 3375 item = btrfs_item_nr(left, i); 3376 3377 if (!empty && push_items > 0) { 3378 if (path->slots[0] > i) 3379 break; 3380 if (path->slots[0] == i) { 3381 int space = btrfs_leaf_free_space(root, left); 3382 if (space + push_space * 2 > free_space) 3383 break; 3384 } 3385 } 3386 3387 if (path->slots[0] == i) 3388 push_space += data_size; 3389 3390 this_item_size = btrfs_item_size(left, item); 3391 if (this_item_size + sizeof(*item) + push_space > free_space) 3392 break; 3393 3394 push_items++; 3395 push_space += this_item_size + sizeof(*item); 3396 if (i == 0) 3397 break; 3398 i--; 3399 } 3400 3401 if (push_items == 0) 3402 goto out_unlock; 3403 3404 WARN_ON(!empty && push_items == left_nritems); 3405 3406 /* push left to right */ 3407 right_nritems = btrfs_header_nritems(right); 3408 3409 push_space = btrfs_item_end_nr(left, left_nritems - push_items); 3410 push_space -= leaf_data_end(root, left); 3411 3412 /* make room in the right data area */ 3413 data_end = leaf_data_end(root, right); 3414 memmove_extent_buffer(right, 3415 btrfs_leaf_data(right) + data_end - push_space, 3416 btrfs_leaf_data(right) + data_end, 3417 BTRFS_LEAF_DATA_SIZE(root) - data_end); 3418 3419 /* copy from the left data area */ 3420 copy_extent_buffer(right, left, btrfs_leaf_data(right) + 3421 BTRFS_LEAF_DATA_SIZE(root) - push_space, 3422 btrfs_leaf_data(left) + leaf_data_end(root, left), 3423 push_space); 3424 3425 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 3426 btrfs_item_nr_offset(0), 3427 right_nritems * sizeof(struct btrfs_item)); 3428 3429 /* copy the items from left to right */ 3430 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 3431 btrfs_item_nr_offset(left_nritems - push_items), 3432 push_items * sizeof(struct btrfs_item)); 3433 3434 /* update the item pointers */ 3435 right_nritems += push_items; 3436 btrfs_set_header_nritems(right, right_nritems); 3437 push_space = 
BTRFS_LEAF_DATA_SIZE(root); 3438 for (i = 0; i < right_nritems; i++) { 3439 item = btrfs_item_nr(right, i); 3440 push_space -= btrfs_token_item_size(right, item, &token); 3441 btrfs_set_token_item_offset(right, item, push_space, &token); 3442 } 3443 3444 left_nritems -= push_items; 3445 btrfs_set_header_nritems(left, left_nritems); 3446 3447 if (left_nritems) 3448 btrfs_mark_buffer_dirty(left); 3449 else 3450 clean_tree_block(trans, root, left); 3451 3452 btrfs_mark_buffer_dirty(right); 3453 3454 btrfs_item_key(right, &disk_key, 0); 3455 btrfs_set_node_key(upper, &disk_key, slot + 1); 3456 btrfs_mark_buffer_dirty(upper); 3457 3458 /* then fixup the leaf pointer in the path */ 3459 if (path->slots[0] >= left_nritems) { 3460 path->slots[0] -= left_nritems; 3461 if (btrfs_header_nritems(path->nodes[0]) == 0) 3462 clean_tree_block(trans, root, path->nodes[0]); 3463 btrfs_tree_unlock(path->nodes[0]); 3464 free_extent_buffer(path->nodes[0]); 3465 path->nodes[0] = right; 3466 path->slots[1] += 1; 3467 } else { 3468 btrfs_tree_unlock(right); 3469 free_extent_buffer(right); 3470 } 3471 return 0; 3472 3473 out_unlock: 3474 btrfs_tree_unlock(right); 3475 free_extent_buffer(right); 3476 return 1; 3477 } 3478 3479 /* 3480 * push some data in the path leaf to the right, trying to free up at 3481 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3482 * 3483 * returns 1 if the push failed because the other node didn't have enough 3484 * room, 0 if everything worked out and < 0 if there were major errors. 3485 * 3486 * this will push starting from min_slot to the end of the leaf. It won't 3487 * push any slot lower than min_slot 3488 */ 3489 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3490 *root, struct btrfs_path *path, 3491 int min_data_size, int data_size, 3492 int empty, u32 min_slot) 3493 { 3494 struct extent_buffer *left = path->nodes[0]; 3495 struct extent_buffer *right; 3496 struct extent_buffer *upper; 3497 int slot; 3498 int free_space; 3499 u32 left_nritems; 3500 int ret; 3501 3502 if (!path->nodes[1]) 3503 return 1; 3504 3505 slot = path->slots[1]; 3506 upper = path->nodes[1]; 3507 if (slot >= btrfs_header_nritems(upper) - 1) 3508 return 1; 3509 3510 btrfs_assert_tree_locked(path->nodes[1]); 3511 3512 right = read_node_slot(root, upper, slot + 1); 3513 if (right == NULL) 3514 return 1; 3515 3516 btrfs_tree_lock(right); 3517 btrfs_set_lock_blocking(right); 3518 3519 free_space = btrfs_leaf_free_space(root, right); 3520 if (free_space < data_size) 3521 goto out_unlock; 3522 3523 /* cow and double check */ 3524 ret = btrfs_cow_block(trans, root, right, upper, 3525 slot + 1, &right); 3526 if (ret) 3527 goto out_unlock; 3528 3529 free_space = btrfs_leaf_free_space(root, right); 3530 if (free_space < data_size) 3531 goto out_unlock; 3532 3533 left_nritems = btrfs_header_nritems(left); 3534 if (left_nritems == 0) 3535 goto out_unlock; 3536 3537 return __push_leaf_right(trans, root, path, min_data_size, empty, 3538 right, free_space, left_nritems, min_slot); 3539 out_unlock: 3540 btrfs_tree_unlock(right); 3541 free_extent_buffer(right); 3542 return 1; 3543 } 3544 3545 /* 3546 * push some data in the path leaf to the left, trying to free up at 3547 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3548 * 3549 * max_slot can put a limit on how far into the leaf we'll push items. The 3550 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the 3551 * items 3552 */ 3553 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3554 struct btrfs_root *root, 3555 struct btrfs_path *path, int data_size, 3556 int empty, struct extent_buffer *left, 3557 int free_space, u32 right_nritems, 3558 u32 max_slot) 3559 { 3560 struct btrfs_disk_key disk_key; 3561 struct extent_buffer *right = path->nodes[0]; 3562 int i; 3563 int push_space = 0; 3564 int push_items = 0; 3565 struct btrfs_item *item; 3566 u32 old_left_nritems; 3567 u32 nr; 3568 int ret = 0; 3569 u32 this_item_size; 3570 u32 old_left_item_size; 3571 struct btrfs_map_token token; 3572 3573 btrfs_init_map_token(&token); 3574 3575 if (empty) 3576 nr = min(right_nritems, max_slot); 3577 else 3578 nr = min(right_nritems - 1, max_slot); 3579 3580 for (i = 0; i < nr; i++) { 3581 item = btrfs_item_nr(right, i); 3582 3583 if (!empty && push_items > 0) { 3584 if (path->slots[0] < i) 3585 break; 3586 if (path->slots[0] == i) { 3587 int space = btrfs_leaf_free_space(root, right); 3588 if (space + push_space * 2 > free_space) 3589 break; 3590 } 3591 } 3592 3593 if (path->slots[0] == i) 3594 push_space += data_size; 3595 3596 this_item_size = btrfs_item_size(right, item); 3597 if (this_item_size + sizeof(*item) + push_space > free_space) 3598 break; 3599 3600 push_items++; 3601 push_space += this_item_size + sizeof(*item); 3602 } 3603 3604 if (push_items == 0) { 3605 ret = 1; 3606 goto out; 3607 } 3608 if (!empty && push_items == btrfs_header_nritems(right)) 3609 WARN_ON(1); 3610 3611 /* push data from right to left */ 3612 copy_extent_buffer(left, right, 3613 btrfs_item_nr_offset(btrfs_header_nritems(left)), 3614 btrfs_item_nr_offset(0), 3615 push_items * sizeof(struct btrfs_item)); 3616 3617 push_space = BTRFS_LEAF_DATA_SIZE(root) - 3618 btrfs_item_offset_nr(right, push_items - 1); 3619 3620 copy_extent_buffer(left, right, btrfs_leaf_data(left) + 3621 leaf_data_end(root, left) - push_space, 3622 btrfs_leaf_data(right) + 3623 btrfs_item_offset_nr(right, push_items - 1), 3624 push_space); 3625 old_left_nritems = btrfs_header_nritems(left); 3626 BUG_ON(old_left_nritems <= 0); 3627 3628 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); 3629 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3630 u32 ioff; 3631 3632 item = btrfs_item_nr(left, i); 3633 3634 ioff = btrfs_token_item_offset(left, item, &token); 3635 btrfs_set_token_item_offset(left, item, 3636 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), 3637 &token); 3638 } 3639 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3640 3641 /* fixup right node */ 3642 if (push_items > right_nritems) 3643 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3644 right_nritems); 3645 3646 if (push_items < right_nritems) { 3647 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3648 leaf_data_end(root, right); 3649 memmove_extent_buffer(right, btrfs_leaf_data(right) + 3650 BTRFS_LEAF_DATA_SIZE(root) - push_space, 3651 btrfs_leaf_data(right) + 3652 leaf_data_end(root, right), push_space); 3653 3654 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 3655 btrfs_item_nr_offset(push_items), 3656 (btrfs_header_nritems(right) - push_items) * 3657 sizeof(struct btrfs_item)); 3658 } 3659 right_nritems -= push_items; 3660 btrfs_set_header_nritems(right, right_nritems); 3661 push_space = BTRFS_LEAF_DATA_SIZE(root); 3662 for (i = 0; i < right_nritems; i++) { 3663 item = btrfs_item_nr(right, i); 3664 3665 push_space = push_space - 
btrfs_token_item_size(right, 3666 item, &token); 3667 btrfs_set_token_item_offset(right, item, push_space, &token); 3668 } 3669 3670 btrfs_mark_buffer_dirty(left); 3671 if (right_nritems) 3672 btrfs_mark_buffer_dirty(right); 3673 else 3674 clean_tree_block(trans, root, right); 3675 3676 btrfs_item_key(right, &disk_key, 0); 3677 fixup_low_keys(root, path, &disk_key, 1); 3678 3679 /* then fixup the leaf pointer in the path */ 3680 if (path->slots[0] < push_items) { 3681 path->slots[0] += old_left_nritems; 3682 btrfs_tree_unlock(path->nodes[0]); 3683 free_extent_buffer(path->nodes[0]); 3684 path->nodes[0] = left; 3685 path->slots[1] -= 1; 3686 } else { 3687 btrfs_tree_unlock(left); 3688 free_extent_buffer(left); 3689 path->slots[0] -= push_items; 3690 } 3691 BUG_ON(path->slots[0] < 0); 3692 return ret; 3693 out: 3694 btrfs_tree_unlock(left); 3695 free_extent_buffer(left); 3696 return ret; 3697 } 3698 3699 /* 3700 * push some data in the path leaf to the left, trying to free up at 3701 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3702 * 3703 * max_slot can put a limit on how far into the leaf we'll push items. The 3704 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3705 * items 3706 */ 3707 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3708 *root, struct btrfs_path *path, int min_data_size, 3709 int data_size, int empty, u32 max_slot) 3710 { 3711 struct extent_buffer *right = path->nodes[0]; 3712 struct extent_buffer *left; 3713 int slot; 3714 int free_space; 3715 u32 right_nritems; 3716 int ret = 0; 3717 3718 slot = path->slots[1]; 3719 if (slot == 0) 3720 return 1; 3721 if (!path->nodes[1]) 3722 return 1; 3723 3724 right_nritems = btrfs_header_nritems(right); 3725 if (right_nritems == 0) 3726 return 1; 3727 3728 btrfs_assert_tree_locked(path->nodes[1]); 3729 3730 left = read_node_slot(root, path->nodes[1], slot - 1); 3731 if (left == NULL) 3732 return 1; 3733 3734 btrfs_tree_lock(left); 3735 btrfs_set_lock_blocking(left); 3736 3737 free_space = btrfs_leaf_free_space(root, left); 3738 if (free_space < data_size) { 3739 ret = 1; 3740 goto out; 3741 } 3742 3743 /* cow and double check */ 3744 ret = btrfs_cow_block(trans, root, left, 3745 path->nodes[1], slot - 1, &left); 3746 if (ret) { 3747 /* we hit -ENOSPC, but it isn't fatal here */ 3748 if (ret == -ENOSPC) 3749 ret = 1; 3750 goto out; 3751 } 3752 3753 free_space = btrfs_leaf_free_space(root, left); 3754 if (free_space < data_size) { 3755 ret = 1; 3756 goto out; 3757 } 3758 3759 return __push_leaf_left(trans, root, path, min_data_size, 3760 empty, left, free_space, right_nritems, 3761 max_slot); 3762 out: 3763 btrfs_tree_unlock(left); 3764 free_extent_buffer(left); 3765 return ret; 3766 } 3767 3768 /* 3769 * split the path's leaf in two, making sure there is at least data_size 3770 * available for the resulting leaf level of the path. 
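 *
 * Worked example (illustrative numbers only, not from the original
 * source): splitting a leaf with nritems == 10 at mid == 5 moves items
 * 5..9, together with their data, into 'right'; if the path pointed at
 * slot 7 it ends up at slot 7 - 5 = 2 of the new leaf.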
3771 */ 3772 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3773 struct btrfs_root *root, 3774 struct btrfs_path *path, 3775 struct extent_buffer *l, 3776 struct extent_buffer *right, 3777 int slot, int mid, int nritems) 3778 { 3779 int data_copy_size; 3780 int rt_data_off; 3781 int i; 3782 struct btrfs_disk_key disk_key; 3783 struct btrfs_map_token token; 3784 3785 btrfs_init_map_token(&token); 3786 3787 nritems = nritems - mid; 3788 btrfs_set_header_nritems(right, nritems); 3789 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); 3790 3791 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3792 btrfs_item_nr_offset(mid), 3793 nritems * sizeof(struct btrfs_item)); 3794 3795 copy_extent_buffer(right, l, 3796 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - 3797 data_copy_size, btrfs_leaf_data(l) + 3798 leaf_data_end(root, l), data_copy_size); 3799 3800 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - 3801 btrfs_item_end_nr(l, mid); 3802 3803 for (i = 0; i < nritems; i++) { 3804 struct btrfs_item *item = btrfs_item_nr(right, i); 3805 u32 ioff; 3806 3807 ioff = btrfs_token_item_offset(right, item, &token); 3808 btrfs_set_token_item_offset(right, item, 3809 ioff + rt_data_off, &token); 3810 } 3811 3812 btrfs_set_header_nritems(l, mid); 3813 btrfs_item_key(right, &disk_key, 0); 3814 insert_ptr(trans, root, path, &disk_key, right->start, 3815 path->slots[1] + 1, 1); 3816 3817 btrfs_mark_buffer_dirty(right); 3818 btrfs_mark_buffer_dirty(l); 3819 BUG_ON(path->slots[0] != slot); 3820 3821 if (mid <= slot) { 3822 btrfs_tree_unlock(path->nodes[0]); 3823 free_extent_buffer(path->nodes[0]); 3824 path->nodes[0] = right; 3825 path->slots[0] -= mid; 3826 path->slots[1] += 1; 3827 } else { 3828 btrfs_tree_unlock(right); 3829 free_extent_buffer(right); 3830 } 3831 3832 BUG_ON(path->slots[0] < 0); 3833 } 3834 3835 /* 3836 * double splits happen when we need to insert a big item in the middle 3837 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3838 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3839 * A B C 3840 * 3841 * We avoid this by trying to push the items on either side of our target 3842 * into the adjacent leaves. If all goes well we can avoid the double split 3843 * completely. 3844 */ 3845 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3846 struct btrfs_root *root, 3847 struct btrfs_path *path, 3848 int data_size) 3849 { 3850 int ret; 3851 int progress = 0; 3852 int slot; 3853 u32 nritems; 3854 3855 slot = path->slots[0]; 3856 3857 /* 3858 * try to push all the items after our slot into the 3859 * right leaf 3860 */ 3861 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); 3862 if (ret < 0) 3863 return ret; 3864 3865 if (ret == 0) 3866 progress++; 3867 3868 nritems = btrfs_header_nritems(path->nodes[0]); 3869 /* 3870 * our goal is to get our slot at the start or end of a leaf. 
If 3871 * we've done so we're done 3872 */ 3873 if (path->slots[0] == 0 || path->slots[0] == nritems) 3874 return 0; 3875 3876 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) 3877 return 0; 3878 3879 /* try to push all the items before our slot into the next leaf */ 3880 slot = path->slots[0]; 3881 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); 3882 if (ret < 0) 3883 return ret; 3884 3885 if (ret == 0) 3886 progress++; 3887 3888 if (progress) 3889 return 0; 3890 return 1; 3891 } 3892 3893 /* 3894 * split the path's leaf in two, making sure there is at least data_size 3895 * available for the resulting leaf level of the path. 3896 * 3897 * returns 0 if all went well and < 0 on failure. 3898 */ 3899 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3900 struct btrfs_root *root, 3901 struct btrfs_key *ins_key, 3902 struct btrfs_path *path, int data_size, 3903 int extend) 3904 { 3905 struct btrfs_disk_key disk_key; 3906 struct extent_buffer *l; 3907 u32 nritems; 3908 int mid; 3909 int slot; 3910 struct extent_buffer *right; 3911 int ret = 0; 3912 int wret; 3913 int split; 3914 int num_doubles = 0; 3915 int tried_avoid_double = 0; 3916 3917 l = path->nodes[0]; 3918 slot = path->slots[0]; 3919 if (extend && data_size + btrfs_item_size_nr(l, slot) + 3920 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) 3921 return -EOVERFLOW; 3922 3923 /* first try to make some room by pushing left and right */ 3924 if (data_size && path->nodes[1]) { 3925 wret = push_leaf_right(trans, root, path, data_size, 3926 data_size, 0, 0); 3927 if (wret < 0) 3928 return wret; 3929 if (wret) { 3930 wret = push_leaf_left(trans, root, path, data_size, 3931 data_size, 0, (u32)-1); 3932 if (wret < 0) 3933 return wret; 3934 } 3935 l = path->nodes[0]; 3936 3937 /* did the pushes work? 
*/ 3938 if (btrfs_leaf_free_space(root, l) >= data_size) 3939 return 0; 3940 } 3941 3942 if (!path->nodes[1]) { 3943 ret = insert_new_root(trans, root, path, 1); 3944 if (ret) 3945 return ret; 3946 } 3947 again: 3948 split = 1; 3949 l = path->nodes[0]; 3950 slot = path->slots[0]; 3951 nritems = btrfs_header_nritems(l); 3952 mid = (nritems + 1) / 2; 3953 3954 if (mid <= slot) { 3955 if (nritems == 1 || 3956 leaf_space_used(l, mid, nritems - mid) + data_size > 3957 BTRFS_LEAF_DATA_SIZE(root)) { 3958 if (slot >= nritems) { 3959 split = 0; 3960 } else { 3961 mid = slot; 3962 if (mid != nritems && 3963 leaf_space_used(l, mid, nritems - mid) + 3964 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3965 if (data_size && !tried_avoid_double) 3966 goto push_for_double; 3967 split = 2; 3968 } 3969 } 3970 } 3971 } else { 3972 if (leaf_space_used(l, 0, mid) + data_size > 3973 BTRFS_LEAF_DATA_SIZE(root)) { 3974 if (!extend && data_size && slot == 0) { 3975 split = 0; 3976 } else if ((extend || !data_size) && slot == 0) { 3977 mid = 1; 3978 } else { 3979 mid = slot; 3980 if (mid != nritems && 3981 leaf_space_used(l, mid, nritems - mid) + 3982 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3983 if (data_size && !tried_avoid_double) 3984 goto push_for_double; 3985 split = 2 ; 3986 } 3987 } 3988 } 3989 } 3990 3991 if (split == 0) 3992 btrfs_cpu_key_to_disk(&disk_key, ins_key); 3993 else 3994 btrfs_item_key(l, &disk_key, mid); 3995 3996 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3997 root->root_key.objectid, 3998 &disk_key, 0, l->start, 0); 3999 if (IS_ERR(right)) 4000 return PTR_ERR(right); 4001 4002 root_add_used(root, root->leafsize); 4003 4004 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 4005 btrfs_set_header_bytenr(right, right->start); 4006 btrfs_set_header_generation(right, trans->transid); 4007 btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); 4008 btrfs_set_header_owner(right, root->root_key.objectid); 4009 btrfs_set_header_level(right, 0); 4010 write_extent_buffer(right, root->fs_info->fsid, 4011 (unsigned long)btrfs_header_fsid(right), 4012 BTRFS_FSID_SIZE); 4013 4014 write_extent_buffer(right, root->fs_info->chunk_tree_uuid, 4015 (unsigned long)btrfs_header_chunk_tree_uuid(right), 4016 BTRFS_UUID_SIZE); 4017 4018 if (split == 0) { 4019 if (mid <= slot) { 4020 btrfs_set_header_nritems(right, 0); 4021 insert_ptr(trans, root, path, &disk_key, right->start, 4022 path->slots[1] + 1, 1); 4023 btrfs_tree_unlock(path->nodes[0]); 4024 free_extent_buffer(path->nodes[0]); 4025 path->nodes[0] = right; 4026 path->slots[0] = 0; 4027 path->slots[1] += 1; 4028 } else { 4029 btrfs_set_header_nritems(right, 0); 4030 insert_ptr(trans, root, path, &disk_key, right->start, 4031 path->slots[1], 1); 4032 btrfs_tree_unlock(path->nodes[0]); 4033 free_extent_buffer(path->nodes[0]); 4034 path->nodes[0] = right; 4035 path->slots[0] = 0; 4036 if (path->slots[1] == 0) 4037 fixup_low_keys(root, path, &disk_key, 1); 4038 } 4039 btrfs_mark_buffer_dirty(right); 4040 return ret; 4041 } 4042 4043 copy_for_split(trans, root, path, l, right, slot, mid, nritems); 4044 4045 if (split == 2) { 4046 BUG_ON(num_doubles != 0); 4047 num_doubles++; 4048 goto again; 4049 } 4050 4051 return 0; 4052 4053 push_for_double: 4054 push_for_double_split(trans, root, path, data_size); 4055 tried_avoid_double = 1; 4056 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) 4057 return 0; 4058 goto again; 4059 } 4060 4061 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 4062 struct 
btrfs_root *root, 4063 struct btrfs_path *path, int ins_len) 4064 { 4065 struct btrfs_key key; 4066 struct extent_buffer *leaf; 4067 struct btrfs_file_extent_item *fi; 4068 u64 extent_len = 0; 4069 u32 item_size; 4070 int ret; 4071 4072 leaf = path->nodes[0]; 4073 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4074 4075 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 4076 key.type != BTRFS_EXTENT_CSUM_KEY); 4077 4078 if (btrfs_leaf_free_space(root, leaf) >= ins_len) 4079 return 0; 4080 4081 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 4082 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4083 fi = btrfs_item_ptr(leaf, path->slots[0], 4084 struct btrfs_file_extent_item); 4085 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 4086 } 4087 btrfs_release_path(path); 4088 4089 path->keep_locks = 1; 4090 path->search_for_split = 1; 4091 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4092 path->search_for_split = 0; 4093 if (ret < 0) 4094 goto err; 4095 4096 ret = -EAGAIN; 4097 leaf = path->nodes[0]; 4098 /* if our item isn't there or got smaller, return now */ 4099 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 4100 goto err; 4101 4102 /* the leaf has changed, it now has room. return now */ 4103 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) 4104 goto err; 4105 4106 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4107 fi = btrfs_item_ptr(leaf, path->slots[0], 4108 struct btrfs_file_extent_item); 4109 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 4110 goto err; 4111 } 4112 4113 btrfs_set_path_blocking(path); 4114 ret = split_leaf(trans, root, &key, path, ins_len, 1); 4115 if (ret) 4116 goto err; 4117 4118 path->keep_locks = 0; 4119 btrfs_unlock_up_safe(path, 1); 4120 return 0; 4121 err: 4122 path->keep_locks = 0; 4123 return ret; 4124 } 4125 4126 static noinline int split_item(struct btrfs_trans_handle *trans, 4127 struct btrfs_root *root, 4128 struct btrfs_path *path, 4129 struct btrfs_key *new_key, 4130 unsigned long split_offset) 4131 { 4132 struct extent_buffer *leaf; 4133 struct btrfs_item *item; 4134 struct btrfs_item *new_item; 4135 int slot; 4136 char *buf; 4137 u32 nritems; 4138 u32 item_size; 4139 u32 orig_offset; 4140 struct btrfs_disk_key disk_key; 4141 4142 leaf = path->nodes[0]; 4143 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 4144 4145 btrfs_set_path_blocking(path); 4146 4147 item = btrfs_item_nr(leaf, path->slots[0]); 4148 orig_offset = btrfs_item_offset(leaf, item); 4149 item_size = btrfs_item_size(leaf, item); 4150 4151 buf = kmalloc(item_size, GFP_NOFS); 4152 if (!buf) 4153 return -ENOMEM; 4154 4155 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 4156 path->slots[0]), item_size); 4157 4158 slot = path->slots[0] + 1; 4159 nritems = btrfs_header_nritems(leaf); 4160 if (slot != nritems) { 4161 /* shift the items */ 4162 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 4163 btrfs_item_nr_offset(slot), 4164 (nritems - slot) * sizeof(struct btrfs_item)); 4165 } 4166 4167 btrfs_cpu_key_to_disk(&disk_key, new_key); 4168 btrfs_set_item_key(leaf, &disk_key, slot); 4169 4170 new_item = btrfs_item_nr(leaf, slot); 4171 4172 btrfs_set_item_offset(leaf, new_item, orig_offset); 4173 btrfs_set_item_size(leaf, new_item, item_size - split_offset); 4174 4175 btrfs_set_item_offset(leaf, item, 4176 orig_offset + item_size - split_offset); 4177 btrfs_set_item_size(leaf, item, split_offset); 4178 4179 btrfs_set_header_nritems(leaf, nritems + 1); 4180 4181 /* write the data for the start of the original 
item */ 4182 write_extent_buffer(leaf, buf, 4183 btrfs_item_ptr_offset(leaf, path->slots[0]), 4184 split_offset); 4185 4186 /* write the data for the new item */ 4187 write_extent_buffer(leaf, buf + split_offset, 4188 btrfs_item_ptr_offset(leaf, slot), 4189 item_size - split_offset); 4190 btrfs_mark_buffer_dirty(leaf); 4191 4192 BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); 4193 kfree(buf); 4194 return 0; 4195 } 4196 4197 /* 4198 * This function splits a single item into two items, 4199 * giving 'new_key' to the new item and splitting the 4200 * old one at split_offset (from the start of the item). 4201 * 4202 * The path may be released by this operation. After 4203 * the split, the path is pointing to the old item. The 4204 * new item is going to be in the same node as the old one. 4205 * 4206 * Note: the item being split must be small enough to live alone on 4207 * a tree block with room for one extra struct btrfs_item 4208 * 4209 * This allows us to split the item in place, keeping a lock on the 4210 * leaf the entire time. 4211 */ 4212 int btrfs_split_item(struct btrfs_trans_handle *trans, 4213 struct btrfs_root *root, 4214 struct btrfs_path *path, 4215 struct btrfs_key *new_key, 4216 unsigned long split_offset) 4217 { 4218 int ret; 4219 ret = setup_leaf_for_split(trans, root, path, 4220 sizeof(struct btrfs_item)); 4221 if (ret) 4222 return ret; 4223 4224 ret = split_item(trans, root, path, new_key, split_offset); 4225 return ret; 4226 } 4227 4228 /* 4229 * This function duplicates an item, giving 'new_key' to the new item. 4230 * It guarantees both items live in the same tree leaf and the new item 4231 * is contiguous with the original item. 4232 * 4233 * This allows us to split a file extent in place, keeping a lock on the 4234 * leaf the entire time. 4235 */ 4236 int btrfs_duplicate_item(struct btrfs_trans_handle *trans, 4237 struct btrfs_root *root, 4238 struct btrfs_path *path, 4239 struct btrfs_key *new_key) 4240 { 4241 struct extent_buffer *leaf; 4242 int ret; 4243 u32 item_size; 4244 4245 leaf = path->nodes[0]; 4246 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 4247 ret = setup_leaf_for_split(trans, root, path, 4248 item_size + sizeof(struct btrfs_item)); 4249 if (ret) 4250 return ret; 4251 4252 path->slots[0]++; 4253 setup_items_for_insert(root, path, new_key, &item_size, 4254 item_size, item_size + 4255 sizeof(struct btrfs_item), 1); 4256 leaf = path->nodes[0]; 4257 memcpy_extent_buffer(leaf, 4258 btrfs_item_ptr_offset(leaf, path->slots[0]), 4259 btrfs_item_ptr_offset(leaf, path->slots[0] - 1), 4260 item_size); 4261 return 0; 4262 } 4263 4264 /* 4265 * make the item pointed to by the path smaller. new_size indicates 4266 * how small to make it, and from_end tells us if we just chop bytes 4267 * off the end of the item or if we shift the item to chop bytes off 4268 * the front.
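 *
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * dropping 'tail' bytes from the end of the item the path points to:
 *
 *	u32 size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 *	btrfs_truncate_item(root, path, size - tail, 1);
 *
 * With from_end == 0 the same bytes come off the front instead, and the
 * item's key offset is moved up by the size difference.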
4269 */ 4270 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, 4271 u32 new_size, int from_end) 4272 { 4273 int slot; 4274 struct extent_buffer *leaf; 4275 struct btrfs_item *item; 4276 u32 nritems; 4277 unsigned int data_end; 4278 unsigned int old_data_start; 4279 unsigned int old_size; 4280 unsigned int size_diff; 4281 int i; 4282 struct btrfs_map_token token; 4283 4284 btrfs_init_map_token(&token); 4285 4286 leaf = path->nodes[0]; 4287 slot = path->slots[0]; 4288 4289 old_size = btrfs_item_size_nr(leaf, slot); 4290 if (old_size == new_size) 4291 return; 4292 4293 nritems = btrfs_header_nritems(leaf); 4294 data_end = leaf_data_end(root, leaf); 4295 4296 old_data_start = btrfs_item_offset_nr(leaf, slot); 4297 4298 size_diff = old_size - new_size; 4299 4300 BUG_ON(slot < 0); 4301 BUG_ON(slot >= nritems); 4302 4303 /* 4304 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4305 */ 4306 /* first correct the data pointers */ 4307 for (i = slot; i < nritems; i++) { 4308 u32 ioff; 4309 item = btrfs_item_nr(leaf, i); 4310 4311 ioff = btrfs_token_item_offset(leaf, item, &token); 4312 btrfs_set_token_item_offset(leaf, item, 4313 ioff + size_diff, &token); 4314 } 4315 4316 /* shift the data */ 4317 if (from_end) { 4318 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4319 data_end + size_diff, btrfs_leaf_data(leaf) + 4320 data_end, old_data_start + new_size - data_end); 4321 } else { 4322 struct btrfs_disk_key disk_key; 4323 u64 offset; 4324 4325 btrfs_item_key(leaf, &disk_key, slot); 4326 4327 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { 4328 unsigned long ptr; 4329 struct btrfs_file_extent_item *fi; 4330 4331 fi = btrfs_item_ptr(leaf, slot, 4332 struct btrfs_file_extent_item); 4333 fi = (struct btrfs_file_extent_item *)( 4334 (unsigned long)fi - size_diff); 4335 4336 if (btrfs_file_extent_type(leaf, fi) == 4337 BTRFS_FILE_EXTENT_INLINE) { 4338 ptr = btrfs_item_ptr_offset(leaf, slot); 4339 memmove_extent_buffer(leaf, ptr, 4340 (unsigned long)fi, 4341 offsetof(struct btrfs_file_extent_item, 4342 disk_bytenr)); 4343 } 4344 } 4345 4346 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4347 data_end + size_diff, btrfs_leaf_data(leaf) + 4348 data_end, old_data_start - data_end); 4349 4350 offset = btrfs_disk_key_offset(&disk_key); 4351 btrfs_set_disk_key_offset(&disk_key, offset + size_diff); 4352 btrfs_set_item_key(leaf, &disk_key, slot); 4353 if (slot == 0) 4354 fixup_low_keys(root, path, &disk_key, 1); 4355 } 4356 4357 item = btrfs_item_nr(leaf, slot); 4358 btrfs_set_item_size(leaf, item, new_size); 4359 btrfs_mark_buffer_dirty(leaf); 4360 4361 if (btrfs_leaf_free_space(root, leaf) < 0) { 4362 btrfs_print_leaf(root, leaf); 4363 BUG(); 4364 } 4365 } 4366 4367 /* 4368 * make the item pointed to by the path bigger, data_size is the added size. 
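 *
 * Illustrative sketch (hypothetical caller and variables, not taken from
 * this file): grow the current item by 8 bytes, then write into the new
 * tail:
 *
 *	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);
 *	btrfs_extend_item(root, path, 8);
 *	write_extent_buffer(leaf, &val,
 *			    btrfs_item_ptr_offset(leaf, path->slots[0]) +
 *			    old_size, 8);
 *
 * The caller must know the leaf has at least 8 bytes of free space;
 * btrfs_extend_item() BUGs when it does not.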
4369 */ 4370 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, 4371 u32 data_size) 4372 { 4373 int slot; 4374 struct extent_buffer *leaf; 4375 struct btrfs_item *item; 4376 u32 nritems; 4377 unsigned int data_end; 4378 unsigned int old_data; 4379 unsigned int old_size; 4380 int i; 4381 struct btrfs_map_token token; 4382 4383 btrfs_init_map_token(&token); 4384 4385 leaf = path->nodes[0]; 4386 4387 nritems = btrfs_header_nritems(leaf); 4388 data_end = leaf_data_end(root, leaf); 4389 4390 if (btrfs_leaf_free_space(root, leaf) < data_size) { 4391 btrfs_print_leaf(root, leaf); 4392 BUG(); 4393 } 4394 slot = path->slots[0]; 4395 old_data = btrfs_item_end_nr(leaf, slot); 4396 4397 BUG_ON(slot < 0); 4398 if (slot >= nritems) { 4399 btrfs_print_leaf(root, leaf); 4400 printk(KERN_CRIT "slot %d too large, nritems %d\n", 4401 slot, nritems); 4402 BUG_ON(1); 4403 } 4404 4405 /* 4406 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4407 */ 4408 /* first correct the data pointers */ 4409 for (i = slot; i < nritems; i++) { 4410 u32 ioff; 4411 item = btrfs_item_nr(leaf, i); 4412 4413 ioff = btrfs_token_item_offset(leaf, item, &token); 4414 btrfs_set_token_item_offset(leaf, item, 4415 ioff - data_size, &token); 4416 } 4417 4418 /* shift the data */ 4419 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4420 data_end - data_size, btrfs_leaf_data(leaf) + 4421 data_end, old_data - data_end); 4422 4423 data_end = old_data; 4424 old_size = btrfs_item_size_nr(leaf, slot); 4425 item = btrfs_item_nr(leaf, slot); 4426 btrfs_set_item_size(leaf, item, old_size + data_size); 4427 btrfs_mark_buffer_dirty(leaf); 4428 4429 if (btrfs_leaf_free_space(root, leaf) < 0) { 4430 btrfs_print_leaf(root, leaf); 4431 BUG(); 4432 } 4433 } 4434 4435 /* 4436 * this is a helper for btrfs_insert_empty_items, the main goal here is 4437 * to save stack depth by doing the bulk of the work in a function 4438 * that doesn't call btrfs_search_slot 4439 */ 4440 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 4441 struct btrfs_key *cpu_key, u32 *data_size, 4442 u32 total_data, u32 total_size, int nr) 4443 { 4444 struct btrfs_item *item; 4445 int i; 4446 u32 nritems; 4447 unsigned int data_end; 4448 struct btrfs_disk_key disk_key; 4449 struct extent_buffer *leaf; 4450 int slot; 4451 struct btrfs_map_token token; 4452 4453 btrfs_init_map_token(&token); 4454 4455 leaf = path->nodes[0]; 4456 slot = path->slots[0]; 4457 4458 nritems = btrfs_header_nritems(leaf); 4459 data_end = leaf_data_end(root, leaf); 4460 4461 if (btrfs_leaf_free_space(root, leaf) < total_size) { 4462 btrfs_print_leaf(root, leaf); 4463 printk(KERN_CRIT "not enough freespace need %u have %d\n", 4464 total_size, btrfs_leaf_free_space(root, leaf)); 4465 BUG(); 4466 } 4467 4468 if (slot != nritems) { 4469 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 4470 4471 if (old_data < data_end) { 4472 btrfs_print_leaf(root, leaf); 4473 printk(KERN_CRIT "slot %d old_data %d data_end %d\n", 4474 slot, old_data, data_end); 4475 BUG_ON(1); 4476 } 4477 /* 4478 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4479 */ 4480 /* first correct the data pointers */ 4481 for (i = slot; i < nritems; i++) { 4482 u32 ioff; 4483 4484 item = btrfs_item_nr(leaf, i); 4485 ioff = btrfs_token_item_offset(leaf, item, &token); 4486 btrfs_set_token_item_offset(leaf, item, 4487 ioff - total_data, &token); 4488 } 4489 /* shift the items */ 4490 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 4491 btrfs_item_nr_offset(slot), 4492 (nritems - slot) * sizeof(struct btrfs_item)); 4493 4494 /* shift the data */ 4495 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4496 data_end - total_data, btrfs_leaf_data(leaf) + 4497 data_end, old_data - data_end); 4498 data_end = old_data; 4499 } 4500 4501 /* setup the item for the new data */ 4502 for (i = 0; i < nr; i++) { 4503 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); 4504 btrfs_set_item_key(leaf, &disk_key, slot + i); 4505 item = btrfs_item_nr(leaf, slot + i); 4506 btrfs_set_token_item_offset(leaf, item, 4507 data_end - data_size[i], &token); 4508 data_end -= data_size[i]; 4509 btrfs_set_token_item_size(leaf, item, data_size[i], &token); 4510 } 4511 4512 btrfs_set_header_nritems(leaf, nritems + nr); 4513 4514 if (slot == 0) { 4515 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 4516 fixup_low_keys(root, path, &disk_key, 1); 4517 } 4518 btrfs_unlock_up_safe(path, 1); 4519 btrfs_mark_buffer_dirty(leaf); 4520 4521 if (btrfs_leaf_free_space(root, leaf) < 0) { 4522 btrfs_print_leaf(root, leaf); 4523 BUG(); 4524 } 4525 } 4526 4527 /* 4528 * Given a key and some data, insert items into the tree. 4529 * This does all the path init required, making room in the tree if needed. 4530 */ 4531 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4532 struct btrfs_root *root, 4533 struct btrfs_path *path, 4534 struct btrfs_key *cpu_key, u32 *data_size, 4535 int nr) 4536 { 4537 int ret = 0; 4538 int slot; 4539 int i; 4540 u32 total_size = 0; 4541 u32 total_data = 0; 4542 4543 for (i = 0; i < nr; i++) 4544 total_data += data_size[i]; 4545 4546 total_size = total_data + (nr * sizeof(struct btrfs_item)); 4547 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); 4548 if (ret == 0) 4549 return -EEXIST; 4550 if (ret < 0) 4551 return ret; 4552 4553 slot = path->slots[0]; 4554 BUG_ON(slot < 0); 4555 4556 setup_items_for_insert(root, path, cpu_key, data_size, 4557 total_data, total_size, nr); 4558 return 0; 4559 } 4560 4561 /* 4562 * Given a key and some data, insert an item into the tree. 4563 * This does all the path init required, making room in the tree if needed. 4564 */ 4565 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root 4566 *root, struct btrfs_key *cpu_key, void *data, u32 4567 data_size) 4568 { 4569 int ret = 0; 4570 struct btrfs_path *path; 4571 struct extent_buffer *leaf; 4572 unsigned long ptr; 4573 4574 path = btrfs_alloc_path(); 4575 if (!path) 4576 return -ENOMEM; 4577 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4578 if (!ret) { 4579 leaf = path->nodes[0]; 4580 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4581 write_extent_buffer(leaf, data, ptr, data_size); 4582 btrfs_mark_buffer_dirty(leaf); 4583 } 4584 btrfs_free_path(path); 4585 return ret; 4586 } 4587 4588 /* 4589 * delete the pointer from a given node. 4590 * 4591 * the tree should have been previously balanced so the deletion does not 4592 * empty a node. 
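 *
 * For example, deleting slot 1 of a node holding 4 key/pointer pairs
 * memmoves pairs 2-3 down one slot and sets nritems to 3; deleting
 * slot 0 also rewrites this node's key in the parent via
 * fixup_low_keys().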
4593 */ 4594 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 4595 int level, int slot) 4596 { 4597 struct extent_buffer *parent = path->nodes[level]; 4598 u32 nritems; 4599 int ret; 4600 4601 nritems = btrfs_header_nritems(parent); 4602 if (slot != nritems - 1) { 4603 if (level) 4604 tree_mod_log_eb_move(root->fs_info, parent, slot, 4605 slot + 1, nritems - slot - 1); 4606 memmove_extent_buffer(parent, 4607 btrfs_node_key_ptr_offset(slot), 4608 btrfs_node_key_ptr_offset(slot + 1), 4609 sizeof(struct btrfs_key_ptr) * 4610 (nritems - slot - 1)); 4611 } else if (level) { 4612 ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 4613 MOD_LOG_KEY_REMOVE, GFP_NOFS); 4614 BUG_ON(ret < 0); 4615 } 4616 4617 nritems--; 4618 btrfs_set_header_nritems(parent, nritems); 4619 if (nritems == 0 && parent == root->node) { 4620 BUG_ON(btrfs_header_level(root->node) != 1); 4621 /* just turn the root into a leaf and break */ 4622 btrfs_set_header_level(root->node, 0); 4623 } else if (slot == 0) { 4624 struct btrfs_disk_key disk_key; 4625 4626 btrfs_node_key(parent, &disk_key, 0); 4627 fixup_low_keys(root, path, &disk_key, level + 1); 4628 } 4629 btrfs_mark_buffer_dirty(parent); 4630 } 4631 4632 /* 4633 * a helper function to delete the leaf pointed to by path->slots[1] and 4634 * path->nodes[1]. 4635 * 4636 * This deletes the pointer in path->nodes[1] and frees the leaf 4637 * block extent. It returns nothing; the function cannot fail. 4638 * 4639 * The path must have already been set up for deleting the leaf, including 4640 * all the proper balancing. path->nodes[1] must be locked. 4641 */ 4642 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, 4643 struct btrfs_root *root, 4644 struct btrfs_path *path, 4645 struct extent_buffer *leaf) 4646 { 4647 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4648 del_ptr(root, path, 1, path->slots[1]); 4649 4650 /* 4651 * btrfs_free_extent is expensive, we want to make sure we 4652 * aren't holding any locks when we call it 4653 */ 4654 btrfs_unlock_up_safe(path, 0); 4655 4656 root_sub_used(root, leaf->len); 4657 4658 extent_buffer_get(leaf); 4659 btrfs_free_tree_block(trans, root, leaf, 0, 1); 4660 free_extent_buffer_stale(leaf); 4661 } 4662 /* 4663 * delete the item at the leaf level in path.
If that empties 4664 * the leaf, remove it from the tree 4665 */ 4666 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4667 struct btrfs_path *path, int slot, int nr) 4668 { 4669 struct extent_buffer *leaf; 4670 struct btrfs_item *item; 4671 int last_off; 4672 int dsize = 0; 4673 int ret = 0; 4674 int wret; 4675 int i; 4676 u32 nritems; 4677 struct btrfs_map_token token; 4678 4679 btrfs_init_map_token(&token); 4680 4681 leaf = path->nodes[0]; 4682 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); 4683 4684 for (i = 0; i < nr; i++) 4685 dsize += btrfs_item_size_nr(leaf, slot + i); 4686 4687 nritems = btrfs_header_nritems(leaf); 4688 4689 if (slot + nr != nritems) { 4690 int data_end = leaf_data_end(root, leaf); 4691 4692 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4693 data_end + dsize, 4694 btrfs_leaf_data(leaf) + data_end, 4695 last_off - data_end); 4696 4697 for (i = slot + nr; i < nritems; i++) { 4698 u32 ioff; 4699 4700 item = btrfs_item_nr(leaf, i); 4701 ioff = btrfs_token_item_offset(leaf, item, &token); 4702 btrfs_set_token_item_offset(leaf, item, 4703 ioff + dsize, &token); 4704 } 4705 4706 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 4707 btrfs_item_nr_offset(slot + nr), 4708 sizeof(struct btrfs_item) * 4709 (nritems - slot - nr)); 4710 } 4711 btrfs_set_header_nritems(leaf, nritems - nr); 4712 nritems -= nr; 4713 4714 /* delete the leaf if we've emptied it */ 4715 if (nritems == 0) { 4716 if (leaf == root->node) { 4717 btrfs_set_header_level(leaf, 0); 4718 } else { 4719 btrfs_set_path_blocking(path); 4720 clean_tree_block(trans, root, leaf); 4721 btrfs_del_leaf(trans, root, path, leaf); 4722 } 4723 } else { 4724 int used = leaf_space_used(leaf, 0, nritems); 4725 if (slot == 0) { 4726 struct btrfs_disk_key disk_key; 4727 4728 btrfs_item_key(leaf, &disk_key, 0); 4729 fixup_low_keys(root, path, &disk_key, 1); 4730 } 4731 4732 /* delete the leaf if it is mostly empty */ 4733 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { 4734 /* push_leaf_left fixes the path. 4735 * make sure the path still points to our leaf 4736 * for possible call to del_ptr below 4737 */ 4738 slot = path->slots[1]; 4739 extent_buffer_get(leaf); 4740 4741 btrfs_set_path_blocking(path); 4742 wret = push_leaf_left(trans, root, path, 1, 1, 4743 1, (u32)-1); 4744 if (wret < 0 && wret != -ENOSPC) 4745 ret = wret; 4746 4747 if (path->nodes[0] == leaf && 4748 btrfs_header_nritems(leaf)) { 4749 wret = push_leaf_right(trans, root, path, 1, 4750 1, 1, 0); 4751 if (wret < 0 && wret != -ENOSPC) 4752 ret = wret; 4753 } 4754 4755 if (btrfs_header_nritems(leaf) == 0) { 4756 path->slots[1] = slot; 4757 btrfs_del_leaf(trans, root, path, leaf); 4758 free_extent_buffer(leaf); 4759 ret = 0; 4760 } else { 4761 /* if we're still in the path, make sure 4762 * we're dirty. Otherwise, one of the 4763 * push_leaf functions must have already 4764 * dirtied this buffer 4765 */ 4766 if (path->nodes[0] == leaf) 4767 btrfs_mark_buffer_dirty(leaf); 4768 free_extent_buffer(leaf); 4769 } 4770 } else { 4771 btrfs_mark_buffer_dirty(leaf); 4772 } 4773 } 4774 return ret; 4775 } 4776 4777 /* 4778 * search the tree again to find a leaf with lesser keys 4779 * returns 0 if it found something or 1 if there are no lesser leaves. 4780 * returns < 0 on io errors. 4781 * 4782 * This may release the path, and so you may lose any locks held at the 4783 * time you call it. 
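 *
 * The first key of the current leaf is stepped down by one (e.g. from
 * (256 EXTENT_DATA 4096) to (256 EXTENT_DATA 4095)) and a fresh search
 * then walks down to the leaf that would contain that smaller key.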
4784 */ 4785 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 4786 { 4787 struct btrfs_key key; 4788 struct btrfs_disk_key found_key; 4789 int ret; 4790 4791 btrfs_item_key_to_cpu(path->nodes[0], &key, 0); 4792 4793 if (key.offset > 0) { 4794 key.offset--; 4795 } else if (key.type > 0) { 4796 key.type--; key.offset = (u64)-1; /* largest key of the previous type */ 4797 } else if (key.objectid > 0) { 4798 key.objectid--; key.type = (u8)-1; key.offset = (u64)-1; 4799 } else 4800 return 1; 4801 4802 btrfs_release_path(path); 4803 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4804 if (ret < 0) 4805 return ret; 4806 btrfs_item_key(path->nodes[0], &found_key, 0); 4807 ret = comp_keys(&found_key, &key); 4808 if (ret < 0) 4809 return 0; 4810 return 1; 4811 } 4812 4813 /* 4814 * A helper function to walk down the tree starting at min_key, and looking 4815 * for nodes or leaves that have a minimum transaction id. 4816 * This is used by the btree defrag code, and tree logging. 4817 * 4818 * This does not cow, but it does stuff the starting key it finds back 4819 * into min_key, so you can call btrfs_search_slot with cow=1 on the 4820 * key and get a writable path. 4821 * 4822 * This does lock as it descends, and path->keep_locks should be set 4823 * to 1 by the caller. 4824 * 4825 * This honors path->lowest_level to prevent descent past a given level 4826 * of the tree. 4827 * 4828 * min_trans indicates the oldest transaction that you are interested 4829 * in walking through. Any nodes or leaves older than min_trans are 4830 * skipped over (without reading them). 4831 * 4832 * returns zero if something useful was found, < 0 on error and 1 if there 4833 * was nothing in the tree that matched the search criteria. 4834 */ 4835 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, 4836 struct btrfs_key *max_key, 4837 struct btrfs_path *path, 4838 u64 min_trans) 4839 { 4840 struct extent_buffer *cur; 4841 struct btrfs_key found_key; 4842 int slot; 4843 int sret; 4844 u32 nritems; 4845 int level; 4846 int ret = 1; 4847 4848 WARN_ON(!path->keep_locks); 4849 again: 4850 cur = btrfs_read_lock_root_node(root); 4851 level = btrfs_header_level(cur); 4852 WARN_ON(path->nodes[level]); 4853 path->nodes[level] = cur; 4854 path->locks[level] = BTRFS_READ_LOCK; 4855 4856 if (btrfs_header_generation(cur) < min_trans) { 4857 ret = 1; 4858 goto out; 4859 } 4860 while (1) { 4861 nritems = btrfs_header_nritems(cur); 4862 level = btrfs_header_level(cur); 4863 sret = bin_search(cur, min_key, level, &slot); 4864 4865 /* at the lowest level, we're done, setup the path and exit */ 4866 if (level == path->lowest_level) { 4867 if (slot >= nritems) 4868 goto find_next_key; 4869 ret = 0; 4870 path->slots[level] = slot; 4871 btrfs_item_key_to_cpu(cur, &found_key, slot); 4872 goto out; 4873 } 4874 if (sret && slot > 0) 4875 slot--; 4876 /* 4877 * check this node pointer against the min_trans parameters. 4878 * If it is too old, skip to the next one.
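 * Only the pointer generations are checked here; the items in a
 * leaf are visited only when the pointer leading to that leaf is
 * new enough.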
4879 */ 4880 while (slot < nritems) { 4881 u64 blockptr; 4882 u64 gen; 4883 4884 blockptr = btrfs_node_blockptr(cur, slot); 4885 gen = btrfs_node_ptr_generation(cur, slot); 4886 if (gen < min_trans) { 4887 slot++; 4888 continue; 4889 } 4890 break; 4891 } 4892 find_next_key: 4893 /* 4894 * we didn't find a candidate key in this node, walk forward 4895 * and find another one 4896 */ 4897 if (slot >= nritems) { 4898 path->slots[level] = slot; 4899 btrfs_set_path_blocking(path); 4900 sret = btrfs_find_next_key(root, path, min_key, level, 4901 min_trans); 4902 if (sret == 0) { 4903 btrfs_release_path(path); 4904 goto again; 4905 } else { 4906 goto out; 4907 } 4908 } 4909 /* save our key for returning back */ 4910 btrfs_node_key_to_cpu(cur, &found_key, slot); 4911 path->slots[level] = slot; 4912 if (level == path->lowest_level) { 4913 ret = 0; 4914 unlock_up(path, level, 1, 0, NULL); 4915 goto out; 4916 } 4917 btrfs_set_path_blocking(path); 4918 cur = read_node_slot(root, cur, slot); 4919 BUG_ON(!cur); /* -ENOMEM */ 4920 4921 btrfs_tree_read_lock(cur); 4922 4923 path->locks[level - 1] = BTRFS_READ_LOCK; 4924 path->nodes[level - 1] = cur; 4925 unlock_up(path, level, 1, 0, NULL); 4926 btrfs_clear_path_blocking(path, NULL, 0); 4927 } 4928 out: 4929 if (ret == 0) 4930 memcpy(min_key, &found_key, sizeof(found_key)); 4931 btrfs_set_path_blocking(path); 4932 return ret; 4933 } 4934 4935 static void tree_move_down(struct btrfs_root *root, 4936 struct btrfs_path *path, 4937 int *level, int root_level) 4938 { 4939 BUG_ON(*level == 0); 4940 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 4941 path->slots[*level]); 4942 path->slots[*level - 1] = 0; 4943 (*level)--; 4944 } 4945 4946 static int tree_move_next_or_upnext(struct btrfs_root *root, 4947 struct btrfs_path *path, 4948 int *level, int root_level) 4949 { 4950 int ret = 0; 4951 int nritems; 4952 nritems = btrfs_header_nritems(path->nodes[*level]); 4953 4954 path->slots[*level]++; 4955 4956 while (path->slots[*level] >= nritems) { 4957 if (*level == root_level) 4958 return -1; 4959 4960 /* move upnext */ 4961 path->slots[*level] = 0; 4962 free_extent_buffer(path->nodes[*level]); 4963 path->nodes[*level] = NULL; 4964 (*level)++; 4965 path->slots[*level]++; 4966 4967 nritems = btrfs_header_nritems(path->nodes[*level]); 4968 ret = 1; 4969 } 4970 return ret; 4971 } 4972 4973 /* 4974 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 4975 * or down. 
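 *
 * A negative return (from tree_move_next_or_upnext) means the walk ran
 * off the end of the root node; btrfs_compare_trees() treats that as
 * "this tree is exhausted".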
4976 */ 4977 static int tree_advance(struct btrfs_root *root, 4978 struct btrfs_path *path, 4979 int *level, int root_level, 4980 int allow_down, 4981 struct btrfs_key *key) 4982 { 4983 int ret; 4984 4985 if (*level == 0 || !allow_down) { 4986 ret = tree_move_next_or_upnext(root, path, level, root_level); 4987 } else { 4988 tree_move_down(root, path, level, root_level); 4989 ret = 0; 4990 } 4991 if (ret >= 0) { 4992 if (*level == 0) 4993 btrfs_item_key_to_cpu(path->nodes[*level], key, 4994 path->slots[*level]); 4995 else 4996 btrfs_node_key_to_cpu(path->nodes[*level], key, 4997 path->slots[*level]); 4998 } 4999 return ret; 5000 } 5001 5002 static int tree_compare_item(struct btrfs_root *left_root, 5003 struct btrfs_path *left_path, 5004 struct btrfs_path *right_path, 5005 char *tmp_buf) 5006 { 5007 int cmp; 5008 int len1, len2; 5009 unsigned long off1, off2; 5010 5011 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); 5012 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); 5013 if (len1 != len2) 5014 return 1; 5015 5016 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 5017 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 5018 right_path->slots[0]); 5019 5020 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 5021 5022 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 5023 if (cmp) 5024 return 1; 5025 return 0; 5026 } 5027 5028 #define ADVANCE 1 5029 #define ADVANCE_ONLY_NEXT -1 5030 5031 /* 5032 * This function compares two trees and calls the provided callback for 5033 * every changed/new/deleted item it finds. 5034 * If shared tree blocks are encountered, whole subtrees are skipped, making 5035 * the compare pretty fast on snapshotted subvolumes. 5036 * 5037 * This currently works on commit roots only. As commit roots are read only, 5038 * we don't do any locking. The commit roots are protected with transactions. 5039 * Transactions are ended and rejoined when a commit is tried in between. 5040 * 5041 * This function checks for modifications done to the trees while comparing. 5042 * If it detects a change, it aborts immediately. 
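 *
 * Illustrative sketch of a caller (hypothetical callback, not from this
 * file; the send code is the in-tree user):
 *
 *	static int my_changed_cb(struct btrfs_root *left_root,
 *				 struct btrfs_root *right_root,
 *				 struct btrfs_path *left_path,
 *				 struct btrfs_path *right_path,
 *				 struct btrfs_key *key,
 *				 enum btrfs_compare_tree_result result,
 *				 void *ctx)
 *	{
 *		return 0;	(a negative return aborts the compare)
 *	}
 *
 *	ret = btrfs_compare_trees(snap1_root, snap2_root,
 *				  my_changed_cb, NULL);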
5043 */ 5044 int btrfs_compare_trees(struct btrfs_root *left_root, 5045 struct btrfs_root *right_root, 5046 btrfs_changed_cb_t changed_cb, void *ctx) 5047 { 5048 int ret; 5049 int cmp; 5050 struct btrfs_trans_handle *trans = NULL; 5051 struct btrfs_path *left_path = NULL; 5052 struct btrfs_path *right_path = NULL; 5053 struct btrfs_key left_key; 5054 struct btrfs_key right_key; 5055 char *tmp_buf = NULL; 5056 int left_root_level; 5057 int right_root_level; 5058 int left_level; 5059 int right_level; 5060 int left_end_reached; 5061 int right_end_reached; 5062 int advance_left; 5063 int advance_right; 5064 u64 left_blockptr; 5065 u64 right_blockptr; 5066 u64 left_start_ctransid; 5067 u64 right_start_ctransid; 5068 u64 ctransid; 5069 5070 left_path = btrfs_alloc_path(); 5071 if (!left_path) { 5072 ret = -ENOMEM; 5073 goto out; 5074 } 5075 right_path = btrfs_alloc_path(); 5076 if (!right_path) { 5077 ret = -ENOMEM; 5078 goto out; 5079 } 5080 5081 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); 5082 if (!tmp_buf) { 5083 ret = -ENOMEM; 5084 goto out; 5085 } 5086 5087 left_path->search_commit_root = 1; 5088 left_path->skip_locking = 1; 5089 right_path->search_commit_root = 1; 5090 right_path->skip_locking = 1; 5091 5092 spin_lock(&left_root->root_item_lock); 5093 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5094 spin_unlock(&left_root->root_item_lock); 5095 5096 spin_lock(&right_root->root_item_lock); 5097 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5098 spin_unlock(&right_root->root_item_lock); 5099 5100 trans = btrfs_join_transaction(left_root); 5101 if (IS_ERR(trans)) { 5102 ret = PTR_ERR(trans); 5103 trans = NULL; 5104 goto out; 5105 } 5106 5107 /* 5108 * Strategy: Go to the first items of both trees. Then do 5109 * 5110 * If both trees are at level 0 5111 * Compare keys of current items 5112 * If left < right treat left item as new, advance left tree 5113 * and repeat 5114 * If left > right treat right item as deleted, advance right tree 5115 * and repeat 5116 * If left == right do deep compare of items, treat as changed if 5117 * needed, advance both trees and repeat 5118 * If both trees are at the same level but not at level 0 5119 * Compare keys of current nodes/leafs 5120 * If left < right advance left tree and repeat 5121 * If left > right advance right tree and repeat 5122 * If left == right compare blockptrs of the next nodes/leafs 5123 * If they match advance both trees but stay at the same level 5124 * and repeat 5125 * If they don't match advance both trees while allowing to go 5126 * deeper and repeat 5127 * If tree levels are different 5128 * Advance the tree that needs it and repeat 5129 * 5130 * Advancing a tree means: 5131 * If we are at level 0, try to go to the next slot. If that's not 5132 * possible, go one level up and repeat. Stop when we found a level 5133 * where we could go to the next slot. We may at this point be on a 5134 * node or a leaf. 5135 * 5136 * If we are not at level 0 and not on shared tree blocks, go one 5137 * level deeper. 5138 * 5139 * If we are not at level 0 and on shared tree blocks, go one slot to 5140 * the right if possible or go up and right. 
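 *
 * The payoff of the blockptr checks: when comparing two snapshots that
 * still share most of their blocks, matching blockptrs keep the walk at
 * the node level, so only the CoWed subtrees are ever read and compared.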
5141 */ 5142 5143 left_level = btrfs_header_level(left_root->commit_root); 5144 left_root_level = left_level; 5145 left_path->nodes[left_level] = left_root->commit_root; 5146 extent_buffer_get(left_path->nodes[left_level]); 5147 5148 right_level = btrfs_header_level(right_root->commit_root); 5149 right_root_level = right_level; 5150 right_path->nodes[right_level] = right_root->commit_root; 5151 extent_buffer_get(right_path->nodes[right_level]); 5152 5153 if (left_level == 0) 5154 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5155 &left_key, left_path->slots[left_level]); 5156 else 5157 btrfs_node_key_to_cpu(left_path->nodes[left_level], 5158 &left_key, left_path->slots[left_level]); 5159 if (right_level == 0) 5160 btrfs_item_key_to_cpu(right_path->nodes[right_level], 5161 &right_key, right_path->slots[right_level]); 5162 else 5163 btrfs_node_key_to_cpu(right_path->nodes[right_level], 5164 &right_key, right_path->slots[right_level]); 5165 5166 left_end_reached = right_end_reached = 0; 5167 advance_left = advance_right = 0; 5168 5169 while (1) { 5170 /* 5171 * We need to make sure the transaction does not get committed 5172 * while we do anything on commit roots. This means, we need to 5173 * join and leave transactions for every item that we process. 5174 */ 5175 if (trans && btrfs_should_end_transaction(trans, left_root)) { 5176 btrfs_release_path(left_path); 5177 btrfs_release_path(right_path); 5178 5179 ret = btrfs_end_transaction(trans, left_root); 5180 trans = NULL; 5181 if (ret < 0) 5182 goto out; 5183 } 5184 /* now rejoin the transaction */ 5185 if (!trans) { 5186 trans = btrfs_join_transaction(left_root); 5187 if (IS_ERR(trans)) { 5188 ret = PTR_ERR(trans); 5189 trans = NULL; 5190 goto out; 5191 } 5192 5193 spin_lock(&left_root->root_item_lock); 5194 ctransid = btrfs_root_ctransid(&left_root->root_item); 5195 spin_unlock(&left_root->root_item_lock); 5196 if (ctransid != left_start_ctransid) 5197 left_start_ctransid = 0; 5198 5199 spin_lock(&right_root->root_item_lock); 5200 ctransid = btrfs_root_ctransid(&right_root->root_item); 5201 spin_unlock(&right_root->root_item_lock); 5202 if (ctransid != right_start_ctransid) 5203 right_start_ctransid = 0; 5204 5205 if (!left_start_ctransid || !right_start_ctransid) { 5206 WARN(1, KERN_WARNING 5207 "btrfs: btrfs_compare_tree detected " 5208 "a change in one of the trees while " 5209 "iterating. 
This is probably a " 5210 "bug.\n"); 5211 ret = -EIO; 5212 goto out; 5213 } 5214 5215 /* 5216 * the commit root may have changed, so start again 5217 * where we stopped 5218 */ 5219 left_path->lowest_level = left_level; 5220 right_path->lowest_level = right_level; 5221 ret = btrfs_search_slot(NULL, left_root, 5222 &left_key, left_path, 0, 0); 5223 if (ret < 0) 5224 goto out; 5225 ret = btrfs_search_slot(NULL, right_root, 5226 &right_key, right_path, 0, 0); 5227 if (ret < 0) 5228 goto out; 5229 } 5230 5231 if (advance_left && !left_end_reached) { 5232 ret = tree_advance(left_root, left_path, &left_level, 5233 left_root_level, 5234 advance_left != ADVANCE_ONLY_NEXT, 5235 &left_key); 5236 if (ret < 0) 5237 left_end_reached = ADVANCE; 5238 advance_left = 0; 5239 } 5240 if (advance_right && !right_end_reached) { 5241 ret = tree_advance(right_root, right_path, &right_level, 5242 right_root_level, 5243 advance_right != ADVANCE_ONLY_NEXT, 5244 &right_key); 5245 if (ret < 0) 5246 right_end_reached = ADVANCE; 5247 advance_right = 0; 5248 } 5249 5250 if (left_end_reached && right_end_reached) { 5251 ret = 0; 5252 goto out; 5253 } else if (left_end_reached) { 5254 if (right_level == 0) { 5255 ret = changed_cb(left_root, right_root, 5256 left_path, right_path, 5257 &right_key, 5258 BTRFS_COMPARE_TREE_DELETED, 5259 ctx); 5260 if (ret < 0) 5261 goto out; 5262 } 5263 advance_right = ADVANCE; 5264 continue; 5265 } else if (right_end_reached) { 5266 if (left_level == 0) { 5267 ret = changed_cb(left_root, right_root, 5268 left_path, right_path, 5269 &left_key, 5270 BTRFS_COMPARE_TREE_NEW, 5271 ctx); 5272 if (ret < 0) 5273 goto out; 5274 } 5275 advance_left = ADVANCE; 5276 continue; 5277 } 5278 5279 if (left_level == 0 && right_level == 0) { 5280 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5281 if (cmp < 0) { 5282 ret = changed_cb(left_root, right_root, 5283 left_path, right_path, 5284 &left_key, 5285 BTRFS_COMPARE_TREE_NEW, 5286 ctx); 5287 if (ret < 0) 5288 goto out; 5289 advance_left = ADVANCE; 5290 } else if (cmp > 0) { 5291 ret = changed_cb(left_root, right_root, 5292 left_path, right_path, 5293 &right_key, 5294 BTRFS_COMPARE_TREE_DELETED, 5295 ctx); 5296 if (ret < 0) 5297 goto out; 5298 advance_right = ADVANCE; 5299 } else { 5300 enum btrfs_compare_tree_result cmp; 5301 5302 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5303 ret = tree_compare_item(left_root, left_path, 5304 right_path, tmp_buf); 5305 if (ret) 5306 cmp = BTRFS_COMPARE_TREE_CHANGED; 5307 else 5308 cmp = BTRFS_COMPARE_TREE_SAME; 5309 ret = changed_cb(left_root, right_root, 5310 left_path, right_path, 5311 &left_key, cmp, ctx); 5312 if (ret < 0) 5313 goto out; 5314 advance_left = ADVANCE; 5315 advance_right = ADVANCE; 5316 } 5317 } else if (left_level == right_level) { 5318 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5319 if (cmp < 0) { 5320 advance_left = ADVANCE; 5321 } else if (cmp > 0) { 5322 advance_right = ADVANCE; 5323 } else { 5324 left_blockptr = btrfs_node_blockptr( 5325 left_path->nodes[left_level], 5326 left_path->slots[left_level]); 5327 right_blockptr = btrfs_node_blockptr( 5328 right_path->nodes[right_level], 5329 right_path->slots[right_level]); 5330 if (left_blockptr == right_blockptr) { 5331 /* 5332 * As we're on a shared block, don't 5333 * allow to go deeper. 
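 * Both sides step to their next slot at this level instead
 * (ADVANCE_ONLY_NEXT), which skips the whole shared subtree
 * in one move.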
5334 */ 5335 advance_left = ADVANCE_ONLY_NEXT; 5336 advance_right = ADVANCE_ONLY_NEXT; 5337 } else { 5338 advance_left = ADVANCE; 5339 advance_right = ADVANCE; 5340 } 5341 } 5342 } else if (left_level < right_level) { 5343 advance_right = ADVANCE; 5344 } else { 5345 advance_left = ADVANCE; 5346 } 5347 } 5348 5349 out: 5350 btrfs_free_path(left_path); 5351 btrfs_free_path(right_path); 5352 kfree(tmp_buf); 5353 5354 if (trans) { 5355 if (!ret) 5356 ret = btrfs_end_transaction(trans, left_root); 5357 else 5358 btrfs_end_transaction(trans, left_root); 5359 } 5360 5361 return ret; 5362 } 5363 5364 /* 5365 * this is similar to btrfs_next_leaf, but does not try to preserve 5366 * and fixup the path. It looks for and returns the next key in the 5367 * tree based on the current path and the min_trans parameters. 5368 * 5369 * 0 is returned if another key is found, < 0 if there are any errors 5370 * and 1 is returned if there are no higher keys in the tree 5371 * 5372 * path->keep_locks should be set to 1 on the search made before 5373 * calling this function. 5374 */ 5375 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 5376 struct btrfs_key *key, int level, u64 min_trans) 5377 { 5378 int slot; 5379 struct extent_buffer *c; 5380 5381 WARN_ON(!path->keep_locks); 5382 while (level < BTRFS_MAX_LEVEL) { 5383 if (!path->nodes[level]) 5384 return 1; 5385 5386 slot = path->slots[level] + 1; 5387 c = path->nodes[level]; 5388 next: 5389 if (slot >= btrfs_header_nritems(c)) { 5390 int ret; 5391 int orig_lowest; 5392 struct btrfs_key cur_key; 5393 if (level + 1 >= BTRFS_MAX_LEVEL || 5394 !path->nodes[level + 1]) 5395 return 1; 5396 5397 if (path->locks[level + 1]) { 5398 level++; 5399 continue; 5400 } 5401 5402 slot = btrfs_header_nritems(c) - 1; 5403 if (level == 0) 5404 btrfs_item_key_to_cpu(c, &cur_key, slot); 5405 else 5406 btrfs_node_key_to_cpu(c, &cur_key, slot); 5407 5408 orig_lowest = path->lowest_level; 5409 btrfs_release_path(path); 5410 path->lowest_level = level; 5411 ret = btrfs_search_slot(NULL, root, &cur_key, path, 5412 0, 0); 5413 path->lowest_level = orig_lowest; 5414 if (ret < 0) 5415 return ret; 5416 5417 c = path->nodes[level]; 5418 slot = path->slots[level]; 5419 if (ret == 0) 5420 slot++; 5421 goto next; 5422 } 5423 5424 if (level == 0) 5425 btrfs_item_key_to_cpu(c, key, slot); 5426 else { 5427 u64 gen = btrfs_node_ptr_generation(c, slot); 5428 5429 if (gen < min_trans) { 5430 slot++; 5431 goto next; 5432 } 5433 btrfs_node_key_to_cpu(c, key, slot); 5434 } 5435 return 0; 5436 } 5437 return 1; 5438 } 5439 5440 /* 5441 * search the tree again to find a leaf with greater keys 5442 * returns 0 if it found something or 1 if there are no greater leaves. 5443 * returns < 0 on io errors. 
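 *
 * Typical iteration sketch (hypothetical caller, not from this file):
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		goto out;
 *	while (1) {
 *		leaf = path->nodes[0];
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)
 *				break;	(1 means no greater leaves)
 *			continue;
 *		}
 *		(process the item at path->slots[0], then)
 *		path->slots[0]++;
 *	}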
5444 */ 5445 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 5446 { 5447 return btrfs_next_old_leaf(root, path, 0); 5448 } 5449 5450 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5451 u64 time_seq) 5452 { 5453 int slot; 5454 int level; 5455 struct extent_buffer *c; 5456 struct extent_buffer *next; 5457 struct btrfs_key key; 5458 u32 nritems; 5459 int ret; 5460 int old_spinning = path->leave_spinning; 5461 int next_rw_lock = 0; 5462 5463 nritems = btrfs_header_nritems(path->nodes[0]); 5464 if (nritems == 0) 5465 return 1; 5466 5467 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5468 again: 5469 level = 1; 5470 next = NULL; 5471 next_rw_lock = 0; 5472 btrfs_release_path(path); 5473 5474 path->keep_locks = 1; 5475 path->leave_spinning = 1; 5476 5477 if (time_seq) 5478 ret = btrfs_search_old_slot(root, &key, path, time_seq); 5479 else 5480 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5481 path->keep_locks = 0; 5482 5483 if (ret < 0) 5484 return ret; 5485 5486 nritems = btrfs_header_nritems(path->nodes[0]); 5487 /* 5488 * by releasing the path above we dropped all our locks. A balance 5489 * could have added more items next to the key that used to be 5490 * at the very end of the block. So, check again here and 5491 * advance the path if there are now more items available. 5492 */ 5493 if (nritems > 0 && path->slots[0] < nritems - 1) { 5494 if (ret == 0) 5495 path->slots[0]++; 5496 ret = 0; 5497 goto done; 5498 } 5499 5500 while (level < BTRFS_MAX_LEVEL) { 5501 if (!path->nodes[level]) { 5502 ret = 1; 5503 goto done; 5504 } 5505 5506 slot = path->slots[level] + 1; 5507 c = path->nodes[level]; 5508 if (slot >= btrfs_header_nritems(c)) { 5509 level++; 5510 if (level == BTRFS_MAX_LEVEL) { 5511 ret = 1; 5512 goto done; 5513 } 5514 continue; 5515 } 5516 5517 if (next) { 5518 btrfs_tree_unlock_rw(next, next_rw_lock); 5519 free_extent_buffer(next); 5520 } 5521 5522 next = c; 5523 next_rw_lock = path->locks[level]; 5524 ret = read_block_for_search(NULL, root, path, &next, level, 5525 slot, &key, 0); 5526 if (ret == -EAGAIN) 5527 goto again; 5528 5529 if (ret < 0) { 5530 btrfs_release_path(path); 5531 goto done; 5532 } 5533 5534 if (!path->skip_locking) { 5535 ret = btrfs_try_tree_read_lock(next); 5536 if (!ret && time_seq) { 5537 /* 5538 * If we don't get the lock, we may be racing 5539 * with push_leaf_left, holding that lock while 5540 * itself waiting for the leaf we've currently 5541 * locked. To solve this situation, we give up 5542 * on our lock and cycle. 
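 * The cond_resched() below gives the other task a chance to
 * finish its push before we retry the search from the top.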
5543 */ 5544 free_extent_buffer(next); 5545 btrfs_release_path(path); 5546 cond_resched(); 5547 goto again; 5548 } 5549 if (!ret) { 5550 btrfs_set_path_blocking(path); 5551 btrfs_tree_read_lock(next); 5552 btrfs_clear_path_blocking(path, next, 5553 BTRFS_READ_LOCK); 5554 } 5555 next_rw_lock = BTRFS_READ_LOCK; 5556 } 5557 break; 5558 } 5559 path->slots[level] = slot; 5560 while (1) { 5561 level--; 5562 c = path->nodes[level]; 5563 if (path->locks[level]) 5564 btrfs_tree_unlock_rw(c, path->locks[level]); 5565 5566 free_extent_buffer(c); 5567 path->nodes[level] = next; 5568 path->slots[level] = 0; 5569 if (!path->skip_locking) 5570 path->locks[level] = next_rw_lock; 5571 if (!level) 5572 break; 5573 5574 ret = read_block_for_search(NULL, root, path, &next, level, 5575 0, &key, 0); 5576 if (ret == -EAGAIN) 5577 goto again; 5578 5579 if (ret < 0) { 5580 btrfs_release_path(path); 5581 goto done; 5582 } 5583 5584 if (!path->skip_locking) { 5585 ret = btrfs_try_tree_read_lock(next); 5586 if (!ret) { 5587 btrfs_set_path_blocking(path); 5588 btrfs_tree_read_lock(next); 5589 btrfs_clear_path_blocking(path, next, 5590 BTRFS_READ_LOCK); 5591 } 5592 next_rw_lock = BTRFS_READ_LOCK; 5593 } 5594 } 5595 ret = 0; 5596 done: 5597 unlock_up(path, 0, 1, 0, NULL); 5598 path->leave_spinning = old_spinning; 5599 if (!old_spinning) 5600 btrfs_set_path_blocking(path); 5601 5602 return ret; 5603 } 5604 5605 /* 5606 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 5607 * searching until it gets past min_objectid or finds an item of 'type' 5608 * 5609 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5610 */ 5611 int btrfs_previous_item(struct btrfs_root *root, 5612 struct btrfs_path *path, u64 min_objectid, 5613 int type) 5614 { 5615 struct btrfs_key found_key; 5616 struct extent_buffer *leaf; 5617 u32 nritems; 5618 int ret; 5619 5620 while (1) { 5621 if (path->slots[0] == 0) { 5622 btrfs_set_path_blocking(path); 5623 ret = btrfs_prev_leaf(root, path); 5624 if (ret != 0) 5625 return ret; 5626 } else { 5627 path->slots[0]--; 5628 } 5629 leaf = path->nodes[0]; 5630 nritems = btrfs_header_nritems(leaf); 5631 if (nritems == 0) 5632 return 1; 5633 if (path->slots[0] == nritems) 5634 path->slots[0]--; 5635 5636 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5637 if (found_key.objectid < min_objectid) 5638 break; 5639 if (found_key.type == type) 5640 return 0; 5641 if (found_key.objectid == min_objectid && 5642 found_key.type < type) 5643 break; 5644 } 5645 return 1; 5646 } 5647
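/*
 * Illustrative use of btrfs_previous_item() (hypothetical caller, not
 * from this file): find the inode item for 'objectid' by seeking past
 * it and stepping backwards:
 *
 *	key.objectid = objectid;
 *	key.type = (u8)-1;
 *	key.offset = (u64)-1;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret > 0)
 *		ret = btrfs_previous_item(root, path, objectid,
 *					  BTRFS_INODE_ITEM_KEY);
 *	if (ret == 0)
 *		(the path now points at the BTRFS_INODE_ITEM_KEY item)
 */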