/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);

struct btrfs_path *btrfs_alloc_path(void)
{
	struct btrfs_path *path;
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
	return path;
}

/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy: when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely pass NULL
 * for held.
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
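
/*
 * A minimal usage sketch (hypothetical caller, not part of this file's
 * API): before doing anything that may schedule while tree locks are
 * held, switch the path's spinning locks to blocking ones, then switch
 * them back afterwards:
 *
 *	btrfs_set_path_blocking(path);
 *	ret = something_that_may_sleep();
 *	btrfs_clear_path_blocking(path, NULL, 0);
 */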

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths for which no locks or extent buffers
 * are held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a read lock on the root.  A read
 * locked buffer is returned, with a reference held.
 */
static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cowonly roots (everything not a reference counted cow subvolume) just
 * get put onto a simple dirty list.  transaction.c walks this to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Increment the upper half of tree_mod_seq, set lower half zero.
 *
 * Must be called with fs_info->tree_mod_seq_lock held.
 */
static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
{
	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
	seq &= 0xffffffff00000000ull;
	seq += 1ull << 32;
	atomic64_set(&fs_info->tree_mod_seq, seq);
	return seq;
}

/*
 * Increment the lower half of tree_mod_seq.
 *
 * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
 * are generated should not technically require a spin lock here. (Rationale:
 * incrementing the minor while an increment of the major number sits between
 * its atomic64_read and atomic64_set calls doesn't duplicate sequence numbers;
 * it just returns a unique sequence number as usual.) We have decided to keep
 * that requirement for now and rethink it once we notice it really imposes a
 * problem on some workload.
 */
static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * return the last minor in the previous major tree_mod_seq number
 */
u64 btrfs_tree_mod_seq_prev(u64 seq)
{
	return (seq & 0xffffffff00000000ull) - 1ull;
}
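
/*
 * Worked example of the sequence number layout (illustrative): the
 * major number occupies the upper 32 bits, the minor the lower 32.
 * Starting from tree_mod_seq == 0:
 *
 *	inc_minor -> 0x0000000000000001
 *	inc_major -> 0x0000000100000000	(lower half reset to zero)
 *	inc_minor -> 0x0000000100000001
 *
 * btrfs_tree_mod_seq_prev(0x0000000100000001) masks off the minor half
 * and subtracts one, yielding 0x00000000ffffffff, the last minor of the
 * previous major number.
 */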

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no
 * new blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	u64 seq;

	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
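
/*
 * A minimal usage sketch of the blocker interface (hypothetical
 * caller): elem->seq must be zero on entry, the returned sequence
 * number can then be used for time-travel reads, and the put call
 * allows old log entries to be garbage collected again:
 *
 *	struct seq_list elem = {};
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... look at the tree as it was at 'seq' ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */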

/*
 * key order of the log:
 *  index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm || !tm->seq);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else {
			kfree(tm);
			return -EEXIST;
		}
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/*
		 * someone emptied the list while we were waiting for the lock.
		 * we must not add to the list when no blocker exists.
		 */
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}

/*
 * This allocates memory and gets a tree modification sequence number.
 *
 * Returns <0 on error.
 * Returns >0 (the added sequence number) on success.
 */
static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
				 struct tree_mod_elem **tm_ret)
{
	struct tree_mod_elem *tm;

	/*
	 * once we switch from spin locks to something different, we should
	 * honor the flags parameter here.
	 */
	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
	if (!tm)
		return -ENOMEM;

	spin_lock(&fs_info->tree_mod_seq_lock);
	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);

	return tm->seq;
}

static inline int
__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot,
			  enum mod_log_op op, gfp_t flags)
{
	int ret;
	struct tree_mod_elem *tm;

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		return ret;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);

	return __tree_mod_log_insert(fs_info, tm);
}

static noinline int
tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
			     struct extent_buffer *eb, int slot,
			     enum mod_log_op op, gfp_t flags)
{
	int ret;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);

	tree_mod_log_write_unlock(fs_info);
	return ret;
}

static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
			int slot, enum mod_log_op op)
{
	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
}

static noinline int
tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
			       struct extent_buffer *eb, int slot,
			       enum mod_log_op op)
{
	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
}

static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, eb))
		return 0;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
				MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		BUG_ON(ret < 0);
	}

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	tree_mod_log_write_unlock(fs_info);
	return ret;
}
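
/*
 * Example for the overwrite logging above (illustrative): moving
 * nr_items = 3 keys from src_slot = 4 to dst_slot = 2 overwrites the
 * keys in slots 2 and 3, so MOD_LOG_KEY_REMOVE_WHILE_MOVING is logged
 * for exactly those two slots (i + dst_slot < src_slot) before the
 * move operation itself is recorded.
 */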

static inline void
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	int i;
	u32 nritems;
	int ret;

	if (btrfs_header_level(eb) == 0)
		return;

	nritems = btrfs_header_nritems(eb);
	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
				MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		BUG_ON(ret < 0);
	}
}

static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm;
	int ret;

	if (tree_mod_dont_log(fs_info, NULL))
		return 0;

	if (log_removal)
		__tree_mod_log_free_eb(fs_info, old_root);

	ret = tree_mod_alloc(fs_info, flags, &tm);
	if (ret < 0)
		goto out;

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	ret = __tree_mod_log_insert(fs_info, tm);
out:
	tree_mod_log_write_unlock(fs_info);
	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline void
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret;
	int i;

	if (tree_mod_dont_log(fs_info, NULL))
		return;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
		tree_mod_log_write_unlock(fs_info);
		return;
	}

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert_key_locked(fs_info, src,
						     i + src_offset,
						     MOD_LOG_KEY_REMOVE);
		BUG_ON(ret < 0);
		ret = tree_mod_log_insert_key_locked(fs_info, dst,
						     i + dst_offset,
						     MOD_LOG_KEY_ADD);
		BUG_ON(ret < 0);
	}

	tree_mod_log_write_unlock(fs_info);
}

static inline void
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     int dst_offset, int src_offset, int nr_items)
{
	int ret;
	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
				       nr_items, GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
			  struct extent_buffer *eb, int slot, int atomic)
{
	int ret;

	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
					   MOD_LOG_KEY_REPLACE,
					   atomic ? GFP_ATOMIC : GFP_NOFS);
	BUG_ON(ret < 0);
}

static noinline void
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	if (tree_mod_dont_log(fs_info, eb))
		return;

	__tree_mod_log_free_eb(fs_info, eb);

	tree_mod_log_write_unlock(fs_info);
}

static noinline void
tree_mod_log_set_root_pointer(struct btrfs_root *root,
			      struct extent_buffer *new_root_node,
			      int log_removal)
{
	int ret;
	ret = tree_mod_log_insert_root(root->fs_info, root->node,
				       new_root_node, GFP_NOFS, log_removal);
	BUG_ON(ret < 0);
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		tree_mod_log_free_eb(root->fs_info, buf);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical
 * address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return 0;

	/*
	 * the very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). this has the index of the *new*
	 * root, making it the very first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return 0;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
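
/*
 * Example (illustrative): assume root A was replaced by B, and B later
 * by C.  The log then holds a ROOT_REPLACE element indexed by B (with
 * old_root = A) and one indexed by C (with old_root = B).  Starting
 * from C, the loop above finds C's ROOT_REPLACE, hops to B, finds B's
 * ROOT_REPLACE, hops to A, finds nothing logged for A and returns the
 * last replacement found, which records A as the oldest root.
 */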

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
		      struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		if (tm->index != first_tm->index)
			break;
	}
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		    u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		BUG_ON(!eb_rewin);
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		BUG_ON(!eb_rewin);
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	u32 blocksize;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		blocksize = btrfs_level_size(root, old_root->level);
		old = read_tree_block(root, logical, blocksize, 0);
		if (!old || !extent_buffer_uptodate(old)) {
			free_extent_buffer(old);
			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
				logical);
			WARN_ON(1);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	/* ensure we can see the force_cow */
	smp_rmb();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot while committing the transaction,
	 *    after we've finished copying the src root, we must COW the
	 *    shared block to ensure metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !root->force_cow)
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     (unsigned long long)trans->transid,
		     (unsigned long long)
		     root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		     (unsigned long long)trans->transid,
		     (unsigned long long)root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
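
/*
 * A minimal usage sketch (hypothetical caller): COW a block before
 * modifying it and then operate on the returned copy, which comes back
 * locked and marked dirty:
 *
 *	ret = btrfs_cow_block(trans, root, buf, parent, slot, &buf);
 *	if (ret)
 *		return ret;
 *	... modify 'buf', now safe to change in this transaction ...
 */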

/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
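
/*
 * Example (illustrative): keys order first by objectid, then by type,
 * then by offset.  So (256, BTRFS_INODE_ITEM_KEY, 0) sorts before
 * (256, BTRFS_INODE_REF_KEY, 0) because the type decides once the
 * objectids are equal, and the offset is never consulted.
 */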

/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != root->fs_info->running_transaction);
	WARN_ON(trans->transid != root->fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(root, blocknr,
						      blocksize, gen);
				if (!cur || !extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
static inline unsigned int leaf_data_end(struct btrfs_root *root,
					 struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);
	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(root);
	return btrfs_item_offset_nr(leaf, nr - 1);
}
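
/*
 * Illustrative leaf layout: item headers grow forward from the start
 * of the leaf while item data grows backward from the end:
 *
 *	[header][item 0][item 1]..[free space]..[data 1][data 0]
 *
 * The last item (nritems - 1) therefore owns the lowest data offset,
 * which is exactly what leaf_data_end() reports; an empty leaf reports
 * BTRFS_LEAF_DATA_SIZE(root).
 */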

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
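
/*
 * A minimal usage sketch (hypothetical caller):
 *
 *	ret = btrfs_bin_search(eb, &key, level, &slot);
 *	if (ret == 0)
 *		'slot' holds an exact match;
 *	else
 *		'slot' is the insertion point; on interior nodes callers
 *		typically step back one slot so the pointer at 'slot'
 *		covers the search key.
 */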

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
				   struct extent_buffer *parent, int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;

	if (slot < 0)
		return NULL;
	if (slot >= btrfs_header_nritems(parent))
		return NULL;

	BUG_ON(level == 0);

	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
			     btrfs_level_size(root, level - 1),
			     btrfs_node_ptr_generation(parent, slot));
	if (eb && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = NULL;
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
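
/*
 * Example (illustrative, assuming a 16K node size and 33-byte key
 * pointers): BTRFS_NODEPTRS_PER_BLOCK() comes to roughly 493 pointers,
 * so balance_level() leaves a node alone until it drops below about a
 * quarter of that, i.e. ~123 pointers.
 */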
This is also done top down, so we 1977 * have to be pessimistic. 1978 */ 1979 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, 1980 struct btrfs_root *root, 1981 struct btrfs_path *path, int level) 1982 { 1983 struct extent_buffer *right = NULL; 1984 struct extent_buffer *mid; 1985 struct extent_buffer *left = NULL; 1986 struct extent_buffer *parent = NULL; 1987 int ret = 0; 1988 int wret; 1989 int pslot; 1990 int orig_slot = path->slots[level]; 1991 1992 if (level == 0) 1993 return 1; 1994 1995 mid = path->nodes[level]; 1996 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1997 1998 if (level < BTRFS_MAX_LEVEL - 1) { 1999 parent = path->nodes[level + 1]; 2000 pslot = path->slots[level + 1]; 2001 } 2002 2003 if (!parent) 2004 return 1; 2005 2006 left = read_node_slot(root, parent, pslot - 1); 2007 2008 /* first, try to make some room in the middle buffer */ 2009 if (left) { 2010 u32 left_nr; 2011 2012 btrfs_tree_lock(left); 2013 btrfs_set_lock_blocking(left); 2014 2015 left_nr = btrfs_header_nritems(left); 2016 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 2017 wret = 1; 2018 } else { 2019 ret = btrfs_cow_block(trans, root, left, parent, 2020 pslot - 1, &left); 2021 if (ret) 2022 wret = 1; 2023 else { 2024 wret = push_node_left(trans, root, 2025 left, mid, 0); 2026 } 2027 } 2028 if (wret < 0) 2029 ret = wret; 2030 if (wret == 0) { 2031 struct btrfs_disk_key disk_key; 2032 orig_slot += left_nr; 2033 btrfs_node_key(mid, &disk_key, 0); 2034 tree_mod_log_set_node_key(root->fs_info, parent, 2035 pslot, 0); 2036 btrfs_set_node_key(parent, &disk_key, pslot); 2037 btrfs_mark_buffer_dirty(parent); 2038 if (btrfs_header_nritems(left) > orig_slot) { 2039 path->nodes[level] = left; 2040 path->slots[level + 1] -= 1; 2041 path->slots[level] = orig_slot; 2042 btrfs_tree_unlock(mid); 2043 free_extent_buffer(mid); 2044 } else { 2045 orig_slot -= 2046 btrfs_header_nritems(left); 2047 path->slots[level] = orig_slot; 2048 btrfs_tree_unlock(left); 2049 free_extent_buffer(left); 2050 } 2051 return 0; 2052 } 2053 btrfs_tree_unlock(left); 2054 free_extent_buffer(left); 2055 } 2056 right = read_node_slot(root, parent, pslot + 1); 2057 2058 /* 2059 * then try to empty the right most buffer into the middle 2060 */ 2061 if (right) { 2062 u32 right_nr; 2063 2064 btrfs_tree_lock(right); 2065 btrfs_set_lock_blocking(right); 2066 2067 right_nr = btrfs_header_nritems(right); 2068 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 2069 wret = 1; 2070 } else { 2071 ret = btrfs_cow_block(trans, root, right, 2072 parent, pslot + 1, 2073 &right); 2074 if (ret) 2075 wret = 1; 2076 else { 2077 wret = balance_node_right(trans, root, 2078 right, mid); 2079 } 2080 } 2081 if (wret < 0) 2082 ret = wret; 2083 if (wret == 0) { 2084 struct btrfs_disk_key disk_key; 2085 2086 btrfs_node_key(right, &disk_key, 0); 2087 tree_mod_log_set_node_key(root->fs_info, parent, 2088 pslot + 1, 0); 2089 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2090 btrfs_mark_buffer_dirty(parent); 2091 2092 if (btrfs_header_nritems(mid) <= orig_slot) { 2093 path->nodes[level] = right; 2094 path->slots[level + 1] += 1; 2095 path->slots[level] = orig_slot - 2096 btrfs_header_nritems(mid); 2097 btrfs_tree_unlock(mid); 2098 free_extent_buffer(mid); 2099 } else { 2100 btrfs_tree_unlock(right); 2101 free_extent_buffer(right); 2102 } 2103 return 0; 2104 } 2105 btrfs_tree_unlock(right); 2106 free_extent_buffer(right); 2107 } 2108 return 1; 2109 } 2110 2111 /* 2112 * readahead one full node of leaves, finding things that are 
close 2113 * to the block in 'slot', and triggering readahead on them. 2114 */ 2115 static void reada_for_search(struct btrfs_root *root, 2116 struct btrfs_path *path, 2117 int level, int slot, u64 objectid) 2118 { 2119 struct extent_buffer *node; 2120 struct btrfs_disk_key disk_key; 2121 u32 nritems; 2122 u64 search; 2123 u64 target; 2124 u64 nread = 0; 2125 u64 gen; 2126 int direction = path->reada; 2127 struct extent_buffer *eb; 2128 u32 nr; 2129 u32 blocksize; 2130 u32 nscan = 0; 2131 2132 if (level != 1) 2133 return; 2134 2135 if (!path->nodes[level]) 2136 return; 2137 2138 node = path->nodes[level]; 2139 2140 search = btrfs_node_blockptr(node, slot); 2141 blocksize = btrfs_level_size(root, level - 1); 2142 eb = btrfs_find_tree_block(root, search, blocksize); 2143 if (eb) { 2144 free_extent_buffer(eb); 2145 return; 2146 } 2147 2148 target = search; 2149 2150 nritems = btrfs_header_nritems(node); 2151 nr = slot; 2152 2153 while (1) { 2154 if (direction < 0) { 2155 if (nr == 0) 2156 break; 2157 nr--; 2158 } else if (direction > 0) { 2159 nr++; 2160 if (nr >= nritems) 2161 break; 2162 } 2163 if (path->reada < 0 && objectid) { 2164 btrfs_node_key(node, &disk_key, nr); 2165 if (btrfs_disk_key_objectid(&disk_key) != objectid) 2166 break; 2167 } 2168 search = btrfs_node_blockptr(node, nr); 2169 if ((search <= target && target - search <= 65536) || 2170 (search > target && search - target <= 65536)) { 2171 gen = btrfs_node_ptr_generation(node, nr); 2172 readahead_tree_block(root, search, blocksize, gen); 2173 nread += blocksize; 2174 } 2175 nscan++; 2176 if ((nread > 65536 || nscan > 32)) 2177 break; 2178 } 2179 } 2180 2181 /* 2182 * returns -EAGAIN if it had to drop the path, or zero if everything was in 2183 * cache 2184 */ 2185 static noinline int reada_for_balance(struct btrfs_root *root, 2186 struct btrfs_path *path, int level) 2187 { 2188 int slot; 2189 int nritems; 2190 struct extent_buffer *parent; 2191 struct extent_buffer *eb; 2192 u64 gen; 2193 u64 block1 = 0; 2194 u64 block2 = 0; 2195 int ret = 0; 2196 int blocksize; 2197 2198 parent = path->nodes[level + 1]; 2199 if (!parent) 2200 return 0; 2201 2202 nritems = btrfs_header_nritems(parent); 2203 slot = path->slots[level + 1]; 2204 blocksize = btrfs_level_size(root, level); 2205 2206 if (slot > 0) { 2207 block1 = btrfs_node_blockptr(parent, slot - 1); 2208 gen = btrfs_node_ptr_generation(parent, slot - 1); 2209 eb = btrfs_find_tree_block(root, block1, blocksize); 2210 /* 2211 * if we get -EAGAIN from btrfs_buffer_uptodate, we 2212 * don't want to return -EAGAIN here.
That will loop 2213 * forever 2214 */ 2215 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2216 block1 = 0; 2217 free_extent_buffer(eb); 2218 } 2219 if (slot + 1 < nritems) { 2220 block2 = btrfs_node_blockptr(parent, slot + 1); 2221 gen = btrfs_node_ptr_generation(parent, slot + 1); 2222 eb = btrfs_find_tree_block(root, block2, blocksize); 2223 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2224 block2 = 0; 2225 free_extent_buffer(eb); 2226 } 2227 if (block1 || block2) { 2228 ret = -EAGAIN; 2229 2230 /* release the whole path */ 2231 btrfs_release_path(path); 2232 2233 /* read the blocks */ 2234 if (block1) 2235 readahead_tree_block(root, block1, blocksize, 0); 2236 if (block2) 2237 readahead_tree_block(root, block2, blocksize, 0); 2238 2239 if (block1) { 2240 eb = read_tree_block(root, block1, blocksize, 0); 2241 free_extent_buffer(eb); 2242 } 2243 if (block2) { 2244 eb = read_tree_block(root, block2, blocksize, 0); 2245 free_extent_buffer(eb); 2246 } 2247 } 2248 return ret; 2249 } 2250 2251 2252 /* 2253 * when we walk down the tree, it is usually safe to unlock the higher layers 2254 * in the tree. The exceptions are when our path goes through slot 0, because 2255 * operations on the tree might require changing key pointers higher up in the 2256 * tree. 2257 * 2258 * callers might also have set path->keep_locks, which tells this code to keep 2259 * the lock if the path points to the last slot in the block. This is part of 2260 * walking through the tree, and selecting the next slot in the higher block. 2261 * 2262 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so 2263 * if lowest_unlock is 1, level 0 won't be unlocked 2264 */ 2265 static noinline void unlock_up(struct btrfs_path *path, int level, 2266 int lowest_unlock, int min_write_lock_level, 2267 int *write_lock_level) 2268 { 2269 int i; 2270 int skip_level = level; 2271 int no_skips = 0; 2272 struct extent_buffer *t; 2273 2274 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2275 if (!path->nodes[i]) 2276 break; 2277 if (!path->locks[i]) 2278 break; 2279 if (!no_skips && path->slots[i] == 0) { 2280 skip_level = i + 1; 2281 continue; 2282 } 2283 if (!no_skips && path->keep_locks) { 2284 u32 nritems; 2285 t = path->nodes[i]; 2286 nritems = btrfs_header_nritems(t); 2287 if (nritems < 1 || path->slots[i] >= nritems - 1) { 2288 skip_level = i + 1; 2289 continue; 2290 } 2291 } 2292 if (skip_level < i && i >= lowest_unlock) 2293 no_skips = 1; 2294 2295 t = path->nodes[i]; 2296 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 2297 btrfs_tree_unlock_rw(t, path->locks[i]); 2298 path->locks[i] = 0; 2299 if (write_lock_level && 2300 i > min_write_lock_level && 2301 i <= *write_lock_level) { 2302 *write_lock_level = i - 1; 2303 } 2304 } 2305 } 2306 } 2307 2308 /* 2309 * This releases any locks held in the path starting at level and 2310 * going all the way up to the root. 2311 * 2312 * btrfs_search_slot will keep the lock held on higher nodes in a few 2313 * corner cases, such as COW of the block at slot zero in the node. This 2314 * ignores those rules, and it should only be called when there are no 2315 * more updates to be done higher up in the tree. 
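 *
 * As an illustrative sketch (not part of the original source), a caller
 * that has finished updating everything above the leaf and only needs
 * the leaf itself could drop the higher locks with:
 *
 *	btrfs_unlock_up_safe(path, 1);
 *
 * leaving only path->locks[0], if it was held, in place.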
2316 */ 2317 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) 2318 { 2319 int i; 2320 2321 if (path->keep_locks) 2322 return; 2323 2324 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2325 if (!path->nodes[i]) 2326 continue; 2327 if (!path->locks[i]) 2328 continue; 2329 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); 2330 path->locks[i] = 0; 2331 } 2332 } 2333 2334 /* 2335 * helper function for btrfs_search_slot. The goal is to find a block 2336 * in cache without setting the path to blocking. If we find the block 2337 * we return zero and the path is unchanged. 2338 * 2339 * If we can't find the block, we set the path blocking and do some 2340 * reada. -EAGAIN is returned and the search must be repeated. 2341 */ 2342 static int 2343 read_block_for_search(struct btrfs_trans_handle *trans, 2344 struct btrfs_root *root, struct btrfs_path *p, 2345 struct extent_buffer **eb_ret, int level, int slot, 2346 struct btrfs_key *key, u64 time_seq) 2347 { 2348 u64 blocknr; 2349 u64 gen; 2350 u32 blocksize; 2351 struct extent_buffer *b = *eb_ret; 2352 struct extent_buffer *tmp; 2353 int ret; 2354 2355 blocknr = btrfs_node_blockptr(b, slot); 2356 gen = btrfs_node_ptr_generation(b, slot); 2357 blocksize = btrfs_level_size(root, level - 1); 2358 2359 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2360 if (tmp) { 2361 /* first we do an atomic uptodate check */ 2362 if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { 2363 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2364 /* 2365 * we found an up to date block without 2366 * sleeping, return 2367 * right away 2368 */ 2369 *eb_ret = tmp; 2370 return 0; 2371 } 2372 /* the pages were up to date, but we failed 2373 * the generation number check. Do a full 2374 * read for the generation number that is correct. 2375 * We must do this without dropping locks so 2376 * we can trust our generation number 2377 */ 2378 free_extent_buffer(tmp); 2379 btrfs_set_path_blocking(p); 2380 2381 /* now we're allowed to do a blocking uptodate check */ 2382 tmp = read_tree_block(root, blocknr, blocksize, gen); 2383 if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { 2384 *eb_ret = tmp; 2385 return 0; 2386 } 2387 free_extent_buffer(tmp); 2388 btrfs_release_path(p); 2389 return -EIO; 2390 } 2391 } 2392 2393 /* 2394 * reduce lock contention at high levels 2395 * of the btree by dropping locks before 2396 * we read. Don't release the lock on the current 2397 * level because we need to walk this node to figure 2398 * out which blocks to read. 2399 */ 2400 btrfs_unlock_up_safe(p, level + 1); 2401 btrfs_set_path_blocking(p); 2402 2403 free_extent_buffer(tmp); 2404 if (p->reada) 2405 reada_for_search(root, p, level, slot, key->objectid); 2406 2407 btrfs_release_path(p); 2408 2409 ret = -EAGAIN; 2410 tmp = read_tree_block(root, blocknr, blocksize, 0); 2411 if (tmp) { 2412 /* 2413 * If the read above didn't mark this buffer up to date, 2414 * it will never end up being up to date. Set ret to EIO now 2415 * and give up so that our caller doesn't loop forever 2416 * on our EAGAINs. 2417 */ 2418 if (!btrfs_buffer_uptodate(tmp, 0, 0)) 2419 ret = -EIO; 2420 free_extent_buffer(tmp); 2421 } 2422 return ret; 2423 } 2424 2425 /* 2426 * helper function for btrfs_search_slot. This does all of the checks 2427 * for node-level blocks and does any balancing required based on 2428 * the ins_len. 2429 * 2430 * If no extra work was required, zero is returned. 
If we had to 2431 * drop the path, -EAGAIN is returned and btrfs_search_slot must 2432 * start over 2433 */ 2434 static int 2435 setup_nodes_for_search(struct btrfs_trans_handle *trans, 2436 struct btrfs_root *root, struct btrfs_path *p, 2437 struct extent_buffer *b, int level, int ins_len, 2438 int *write_lock_level) 2439 { 2440 int ret; 2441 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 2442 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 2443 int sret; 2444 2445 if (*write_lock_level < level + 1) { 2446 *write_lock_level = level + 1; 2447 btrfs_release_path(p); 2448 goto again; 2449 } 2450 2451 sret = reada_for_balance(root, p, level); 2452 if (sret) 2453 goto again; 2454 2455 btrfs_set_path_blocking(p); 2456 sret = split_node(trans, root, p, level); 2457 btrfs_clear_path_blocking(p, NULL, 0); 2458 2459 BUG_ON(sret > 0); 2460 if (sret) { 2461 ret = sret; 2462 goto done; 2463 } 2464 b = p->nodes[level]; 2465 } else if (ins_len < 0 && btrfs_header_nritems(b) < 2466 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 2467 int sret; 2468 2469 if (*write_lock_level < level + 1) { 2470 *write_lock_level = level + 1; 2471 btrfs_release_path(p); 2472 goto again; 2473 } 2474 2475 sret = reada_for_balance(root, p, level); 2476 if (sret) 2477 goto again; 2478 2479 btrfs_set_path_blocking(p); 2480 sret = balance_level(trans, root, p, level); 2481 btrfs_clear_path_blocking(p, NULL, 0); 2482 2483 if (sret) { 2484 ret = sret; 2485 goto done; 2486 } 2487 b = p->nodes[level]; 2488 if (!b) { 2489 btrfs_release_path(p); 2490 goto again; 2491 } 2492 BUG_ON(btrfs_header_nritems(b) == 1); 2493 } 2494 return 0; 2495 2496 again: 2497 ret = -EAGAIN; 2498 done: 2499 return ret; 2500 } 2501 2502 /* 2503 * look for key in the tree. path is filled in with nodes along the way. 2504 * If key is found, we return zero and you can find the item in the leaf 2505 * level of the path (level 0) 2506 * 2507 * If the key isn't found, the path points to the slot where it should 2508 * be inserted, and 1 is returned. If there are other errors during the 2509 * search a negative error number is returned. 2510 * 2511 * if ins_len > 0, nodes and leaves will be split as we walk down the 2512 * tree.
if ins_len < 0, nodes will be merged as we walk down the tree (if 2513 * possible) 2514 */ 2515 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2516 *root, struct btrfs_key *key, struct btrfs_path *p, int 2517 ins_len, int cow) 2518 { 2519 struct extent_buffer *b; 2520 int slot; 2521 int ret; 2522 int err; 2523 int level; 2524 int lowest_unlock = 1; 2525 int root_lock; 2526 /* everything at write_lock_level or lower must be write locked */ 2527 int write_lock_level = 0; 2528 u8 lowest_level = 0; 2529 int min_write_lock_level; 2530 2531 lowest_level = p->lowest_level; 2532 WARN_ON(lowest_level && ins_len > 0); 2533 WARN_ON(p->nodes[0] != NULL); 2534 2535 if (ins_len < 0) { 2536 lowest_unlock = 2; 2537 2538 /* when we are removing items, we might have to go up to level 2539 * two as we update tree pointers. Make sure we keep write 2540 * locks on those levels as well 2541 */ 2542 write_lock_level = 2; 2543 } else if (ins_len > 0) { 2544 /* 2545 * for inserting items, make sure we have a write lock on 2546 * level 1 so we can update keys 2547 */ 2548 write_lock_level = 1; 2549 } 2550 2551 if (!cow) 2552 write_lock_level = -1; 2553 2554 if (cow && (p->keep_locks || p->lowest_level)) 2555 write_lock_level = BTRFS_MAX_LEVEL; 2556 2557 min_write_lock_level = write_lock_level; 2558 2559 again: 2560 /* 2561 * we try very hard to do read locks on the root 2562 */ 2563 root_lock = BTRFS_READ_LOCK; 2564 level = 0; 2565 if (p->search_commit_root) { 2566 /* 2567 * the commit roots are read only 2568 * so we always do read locks 2569 */ 2570 b = root->commit_root; 2571 extent_buffer_get(b); 2572 level = btrfs_header_level(b); 2573 if (!p->skip_locking) 2574 btrfs_tree_read_lock(b); 2575 } else { 2576 if (p->skip_locking) { 2577 b = btrfs_root_node(root); 2578 level = btrfs_header_level(b); 2579 } else { 2580 /* we don't know the level of the root node 2581 * until we actually have it read locked 2582 */ 2583 b = btrfs_read_lock_root_node(root); 2584 level = btrfs_header_level(b); 2585 if (level <= write_lock_level) { 2586 /* whoops, must trade for write lock */ 2587 btrfs_tree_read_unlock(b); 2588 free_extent_buffer(b); 2589 b = btrfs_lock_root_node(root); 2590 root_lock = BTRFS_WRITE_LOCK; 2591 2592 /* the level might have changed, check again */ 2593 level = btrfs_header_level(b); 2594 } 2595 } 2596 } 2597 p->nodes[level] = b; 2598 if (!p->skip_locking) 2599 p->locks[level] = root_lock; 2600 2601 while (b) { 2602 level = btrfs_header_level(b); 2603 2604 /* 2605 * setup the path here so we can release it under lock 2606 * contention with the cow code 2607 */ 2608 if (cow) { 2609 /* 2610 * if we don't really need to cow this block 2611 * then we don't want to set the path blocking, 2612 * so we test it here 2613 */ 2614 if (!should_cow_block(trans, root, b)) 2615 goto cow_done; 2616 2617 btrfs_set_path_blocking(p); 2618 2619 /* 2620 * must have write locks on this node and the 2621 * parent 2622 */ 2623 if (level > write_lock_level || 2624 (level + 1 > write_lock_level && 2625 level + 1 < BTRFS_MAX_LEVEL && 2626 p->nodes[level + 1])) { 2627 write_lock_level = level + 1; 2628 btrfs_release_path(p); 2629 goto again; 2630 } 2631 2632 err = btrfs_cow_block(trans, root, b, 2633 p->nodes[level + 1], 2634 p->slots[level + 1], &b); 2635 if (err) { 2636 ret = err; 2637 goto done; 2638 } 2639 } 2640 cow_done: 2641 BUG_ON(!cow && ins_len); 2642 2643 p->nodes[level] = b; 2644 btrfs_clear_path_blocking(p, NULL, 0); 2645 2646 /* 2647 * we have a lock on b and as long as we aren't changing 2648 *
the tree, there is no way for the items in b to change. 2649 * It is safe to drop the lock on our parent before we 2650 * go through the expensive btree search on b. 2651 * 2652 * If cow is true, then we might be changing slot zero, 2653 * which may require changing the parent. So, we can't 2654 * drop the lock until after we know which slot we're 2655 * operating on. 2656 */ 2657 if (!cow) 2658 btrfs_unlock_up_safe(p, level + 1); 2659 2660 ret = bin_search(b, key, level, &slot); 2661 2662 if (level != 0) { 2663 int dec = 0; 2664 if (ret && slot > 0) { 2665 dec = 1; 2666 slot -= 1; 2667 } 2668 p->slots[level] = slot; 2669 err = setup_nodes_for_search(trans, root, p, b, level, 2670 ins_len, &write_lock_level); 2671 if (err == -EAGAIN) 2672 goto again; 2673 if (err) { 2674 ret = err; 2675 goto done; 2676 } 2677 b = p->nodes[level]; 2678 slot = p->slots[level]; 2679 2680 /* 2681 * slot 0 is special, if we change the key 2682 * we have to update the parent pointer 2683 * which means we must have a write lock 2684 * on the parent 2685 */ 2686 if (slot == 0 && cow && 2687 write_lock_level < level + 1) { 2688 write_lock_level = level + 1; 2689 btrfs_release_path(p); 2690 goto again; 2691 } 2692 2693 unlock_up(p, level, lowest_unlock, 2694 min_write_lock_level, &write_lock_level); 2695 2696 if (level == lowest_level) { 2697 if (dec) 2698 p->slots[level]++; 2699 goto done; 2700 } 2701 2702 err = read_block_for_search(trans, root, p, 2703 &b, level, slot, key, 0); 2704 if (err == -EAGAIN) 2705 goto again; 2706 if (err) { 2707 ret = err; 2708 goto done; 2709 } 2710 2711 if (!p->skip_locking) { 2712 level = btrfs_header_level(b); 2713 if (level <= write_lock_level) { 2714 err = btrfs_try_tree_write_lock(b); 2715 if (!err) { 2716 btrfs_set_path_blocking(p); 2717 btrfs_tree_lock(b); 2718 btrfs_clear_path_blocking(p, b, 2719 BTRFS_WRITE_LOCK); 2720 } 2721 p->locks[level] = BTRFS_WRITE_LOCK; 2722 } else { 2723 err = btrfs_try_tree_read_lock(b); 2724 if (!err) { 2725 btrfs_set_path_blocking(p); 2726 btrfs_tree_read_lock(b); 2727 btrfs_clear_path_blocking(p, b, 2728 BTRFS_READ_LOCK); 2729 } 2730 p->locks[level] = BTRFS_READ_LOCK; 2731 } 2732 p->nodes[level] = b; 2733 } 2734 } else { 2735 p->slots[level] = slot; 2736 if (ins_len > 0 && 2737 btrfs_leaf_free_space(root, b) < ins_len) { 2738 if (write_lock_level < 1) { 2739 write_lock_level = 1; 2740 btrfs_release_path(p); 2741 goto again; 2742 } 2743 2744 btrfs_set_path_blocking(p); 2745 err = split_leaf(trans, root, key, 2746 p, ins_len, ret == 0); 2747 btrfs_clear_path_blocking(p, NULL, 0); 2748 2749 BUG_ON(err > 0); 2750 if (err) { 2751 ret = err; 2752 goto done; 2753 } 2754 } 2755 if (!p->search_for_split) 2756 unlock_up(p, level, lowest_unlock, 2757 min_write_lock_level, &write_lock_level); 2758 goto done; 2759 } 2760 } 2761 ret = 1; 2762 done: 2763 /* 2764 * we don't really know what they plan on doing with the path 2765 * from here on, so for now just mark it as blocking 2766 */ 2767 if (!p->leave_spinning) 2768 btrfs_set_path_blocking(p); 2769 if (ret < 0) 2770 btrfs_release_path(p); 2771 return ret; 2772 } 2773 2774 /* 2775 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 2776 * current state of the tree together with the operations recorded in the tree 2777 * modification log to search for the key in a previous version of this tree, as 2778 * denoted by the time_seq parameter. 2779 * 2780 * Naturally, there is no support for insert, delete or cow operations.
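 *
 * A minimal usage sketch (illustrative, not part of the original
 * source), looking up 'key' as the tree stood at 'time_seq':
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_item *item;
 *	int ret;
 *
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	if (ret == 0)
 *		item = btrfs_item_nr(path->nodes[0], path->slots[0]);
 *	btrfs_free_path(path);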
2781 * 2782 * The resulting path and return value will be set up as if we called 2783 * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 2784 */ 2785 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, 2786 struct btrfs_path *p, u64 time_seq) 2787 { 2788 struct extent_buffer *b; 2789 int slot; 2790 int ret; 2791 int err; 2792 int level; 2793 int lowest_unlock = 1; 2794 u8 lowest_level = 0; 2795 2796 lowest_level = p->lowest_level; 2797 WARN_ON(p->nodes[0] != NULL); 2798 2799 if (p->search_commit_root) { 2800 BUG_ON(time_seq); 2801 return btrfs_search_slot(NULL, root, key, p, 0, 0); 2802 } 2803 2804 again: 2805 b = get_old_root(root, time_seq); 2806 level = btrfs_header_level(b); 2807 p->locks[level] = BTRFS_READ_LOCK; 2808 2809 while (b) { 2810 level = btrfs_header_level(b); 2811 p->nodes[level] = b; 2812 btrfs_clear_path_blocking(p, NULL, 0); 2813 2814 /* 2815 * we have a lock on b and as long as we aren't changing 2816 * the tree, there is no way for the items in b to change. 2817 * It is safe to drop the lock on our parent before we 2818 * go through the expensive btree search on b. 2819 */ 2820 btrfs_unlock_up_safe(p, level + 1); 2821 2822 ret = bin_search(b, key, level, &slot); 2823 2824 if (level != 0) { 2825 int dec = 0; 2826 if (ret && slot > 0) { 2827 dec = 1; 2828 slot -= 1; 2829 } 2830 p->slots[level] = slot; 2831 unlock_up(p, level, lowest_unlock, 0, NULL); 2832 2833 if (level == lowest_level) { 2834 if (dec) 2835 p->slots[level]++; 2836 goto done; 2837 } 2838 2839 err = read_block_for_search(NULL, root, p, &b, level, 2840 slot, key, time_seq); 2841 if (err == -EAGAIN) 2842 goto again; 2843 if (err) { 2844 ret = err; 2845 goto done; 2846 } 2847 2848 level = btrfs_header_level(b); 2849 err = btrfs_try_tree_read_lock(b); 2850 if (!err) { 2851 btrfs_set_path_blocking(p); 2852 btrfs_tree_read_lock(b); 2853 btrfs_clear_path_blocking(p, b, 2854 BTRFS_READ_LOCK); 2855 } 2856 b = tree_mod_log_rewind(root->fs_info, b, time_seq); 2857 p->locks[level] = BTRFS_READ_LOCK; 2858 p->nodes[level] = b; 2859 } else { 2860 p->slots[level] = slot; 2861 unlock_up(p, level, lowest_unlock, 0, NULL); 2862 goto done; 2863 } 2864 } 2865 ret = 1; 2866 done: 2867 if (!p->leave_spinning) 2868 btrfs_set_path_blocking(p); 2869 if (ret < 0) 2870 btrfs_release_path(p); 2871 2872 return ret; 2873 } 2874 2875 /* 2876 * helper to use instead of search slot if no exact match is needed but 2877 * instead the next or previous item should be returned. 2878 * When find_higher is true, the next higher item is returned, the next lower 2879 * otherwise. 2880 * When return_any and find_higher are both true, and no higher item is found, 2881 * return the next lower instead. 2882 * When return_any is true and find_higher is false, and no lower item is found, 2883 * return the next higher instead. 2884 * It returns 0 if any item is found, 1 if none is found (tree empty), and 2885 * < 0 on error 2886 */ 2887 int btrfs_search_slot_for_read(struct btrfs_root *root, 2888 struct btrfs_key *key, struct btrfs_path *p, 2889 int find_higher, int return_any) 2890 { 2891 int ret; 2892 struct extent_buffer *leaf; 2893 2894 again: 2895 ret = btrfs_search_slot(NULL, root, key, p, 0, 0); 2896 if (ret <= 0) 2897 return ret; 2898 /* 2899 * a return value of 1 means the path is at the position where the 2900 * item should be inserted.
Normally this is the next bigger item, 2901 * but in case the previous item is the last in a leaf, path points 2902 * to the first free slot in the previous leaf, i.e. at an invalid 2903 * item. 2904 */ 2905 leaf = p->nodes[0]; 2906 2907 if (find_higher) { 2908 if (p->slots[0] >= btrfs_header_nritems(leaf)) { 2909 ret = btrfs_next_leaf(root, p); 2910 if (ret <= 0) 2911 return ret; 2912 if (!return_any) 2913 return 1; 2914 /* 2915 * no higher item found, return the next 2916 * lower instead 2917 */ 2918 return_any = 0; 2919 find_higher = 0; 2920 btrfs_release_path(p); 2921 goto again; 2922 } 2923 } else { 2924 if (p->slots[0] == 0) { 2925 ret = btrfs_prev_leaf(root, p); 2926 if (ret < 0) 2927 return ret; 2928 if (!ret) { 2929 p->slots[0] = btrfs_header_nritems(leaf) - 1; 2930 return 0; 2931 } 2932 if (!return_any) 2933 return 1; 2934 /* 2935 * no lower item found, return the next 2936 * higher instead 2937 */ 2938 return_any = 0; 2939 find_higher = 1; 2940 btrfs_release_path(p); 2941 goto again; 2942 } else { 2943 --p->slots[0]; 2944 } 2945 } 2946 return 0; 2947 } 2948 2949 /* 2950 * adjust the pointers going up the tree, starting at level 2951 * making sure the right key of each node points to 'key'. 2952 * This is used after shifting pointers to the left, so it stops 2953 * fixing up pointers when a given leaf/node is not in slot 0 of the 2954 * higher levels 2955 * 2956 */ 2957 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, 2958 struct btrfs_disk_key *key, int level) 2959 { 2960 int i; 2961 struct extent_buffer *t; 2962 2963 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2964 int tslot = path->slots[i]; 2965 if (!path->nodes[i]) 2966 break; 2967 t = path->nodes[i]; 2968 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); 2969 btrfs_set_node_key(t, key, tslot); 2970 btrfs_mark_buffer_dirty(path->nodes[i]); 2971 if (tslot != 0) 2972 break; 2973 } 2974 } 2975 2976 /* 2977 * update item key. 2978 * 2979 * This function isn't completely safe. It's the caller's responsibility to ensure 2980 * that the new key won't break the order 2981 */ 2982 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, 2983 struct btrfs_key *new_key) 2984 { 2985 struct btrfs_disk_key disk_key; 2986 struct extent_buffer *eb; 2987 int slot; 2988 2989 eb = path->nodes[0]; 2990 slot = path->slots[0]; 2991 if (slot > 0) { 2992 btrfs_item_key(eb, &disk_key, slot - 1); 2993 BUG_ON(comp_keys(&disk_key, new_key) >= 0); 2994 } 2995 if (slot < btrfs_header_nritems(eb) - 1) { 2996 btrfs_item_key(eb, &disk_key, slot + 1); 2997 BUG_ON(comp_keys(&disk_key, new_key) <= 0); 2998 } 2999 3000 btrfs_cpu_key_to_disk(&disk_key, new_key); 3001 btrfs_set_item_key(eb, &disk_key, slot); 3002 btrfs_mark_buffer_dirty(eb); 3003 if (slot == 0) 3004 fixup_low_keys(root, path, &disk_key, 1); 3005 } 3006 3007 /* 3008 * try to push data from one node into the next node left in the 3009 * tree. 3010 * 3011 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible 3012 * error, and > 0 if there was no room in the left hand block.
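 *
 * Illustrative sketch (not part of the original source) of how a caller
 * such as balance_level above reacts to the three outcomes:
 *
 *	wret = push_node_left(trans, root, mid, right, 1);
 *	if (wret < 0 && wret != -ENOSPC)
 *		ret = wret;
 *
 * i.e. a negative value is propagated as a hard error (with -ENOSPC
 * sometimes tolerated), while a positive value simply means nothing
 * could be moved.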
3013 */ 3014 static int push_node_left(struct btrfs_trans_handle *trans, 3015 struct btrfs_root *root, struct extent_buffer *dst, 3016 struct extent_buffer *src, int empty) 3017 { 3018 int push_items = 0; 3019 int src_nritems; 3020 int dst_nritems; 3021 int ret = 0; 3022 3023 src_nritems = btrfs_header_nritems(src); 3024 dst_nritems = btrfs_header_nritems(dst); 3025 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; 3026 WARN_ON(btrfs_header_generation(src) != trans->transid); 3027 WARN_ON(btrfs_header_generation(dst) != trans->transid); 3028 3029 if (!empty && src_nritems <= 8) 3030 return 1; 3031 3032 if (push_items <= 0) 3033 return 1; 3034 3035 if (empty) { 3036 push_items = min(src_nritems, push_items); 3037 if (push_items < src_nritems) { 3038 /* leave at least 8 pointers in the node if 3039 * we aren't going to empty it 3040 */ 3041 if (src_nritems - push_items < 8) { 3042 if (push_items <= 8) 3043 return 1; 3044 push_items -= 8; 3045 } 3046 } 3047 } else 3048 push_items = min(src_nritems - 8, push_items); 3049 3050 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, 3051 push_items); 3052 copy_extent_buffer(dst, src, 3053 btrfs_node_key_ptr_offset(dst_nritems), 3054 btrfs_node_key_ptr_offset(0), 3055 push_items * sizeof(struct btrfs_key_ptr)); 3056 3057 if (push_items < src_nritems) { 3058 /* 3059 * don't call tree_mod_log_eb_move here, key removal was already 3060 * fully logged by tree_mod_log_eb_copy above. 3061 */ 3062 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 3063 btrfs_node_key_ptr_offset(push_items), 3064 (src_nritems - push_items) * 3065 sizeof(struct btrfs_key_ptr)); 3066 } 3067 btrfs_set_header_nritems(src, src_nritems - push_items); 3068 btrfs_set_header_nritems(dst, dst_nritems + push_items); 3069 btrfs_mark_buffer_dirty(src); 3070 btrfs_mark_buffer_dirty(dst); 3071 3072 return ret; 3073 } 3074 3075 /* 3076 * try to push data from one node into the next node right in the 3077 * tree. 3078 * 3079 * returns 0 if some ptrs were pushed, < 0 if there was some horrible 3080 * error, and > 0 if there was no room in the right hand block. 
3081 * 3082 * this will only push up to 1/2 the contents of the left node over 3083 */ 3084 static int balance_node_right(struct btrfs_trans_handle *trans, 3085 struct btrfs_root *root, 3086 struct extent_buffer *dst, 3087 struct extent_buffer *src) 3088 { 3089 int push_items = 0; 3090 int max_push; 3091 int src_nritems; 3092 int dst_nritems; 3093 int ret = 0; 3094 3095 WARN_ON(btrfs_header_generation(src) != trans->transid); 3096 WARN_ON(btrfs_header_generation(dst) != trans->transid); 3097 3098 src_nritems = btrfs_header_nritems(src); 3099 dst_nritems = btrfs_header_nritems(dst); 3100 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; 3101 if (push_items <= 0) 3102 return 1; 3103 3104 if (src_nritems < 4) 3105 return 1; 3106 3107 max_push = src_nritems / 2 + 1; 3108 /* don't try to empty the node */ 3109 if (max_push >= src_nritems) 3110 return 1; 3111 3112 if (max_push < push_items) 3113 push_items = max_push; 3114 3115 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); 3116 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 3117 btrfs_node_key_ptr_offset(0), 3118 (dst_nritems) * 3119 sizeof(struct btrfs_key_ptr)); 3120 3121 tree_mod_log_eb_copy(root->fs_info, dst, src, 0, 3122 src_nritems - push_items, push_items); 3123 copy_extent_buffer(dst, src, 3124 btrfs_node_key_ptr_offset(0), 3125 btrfs_node_key_ptr_offset(src_nritems - push_items), 3126 push_items * sizeof(struct btrfs_key_ptr)); 3127 3128 btrfs_set_header_nritems(src, src_nritems - push_items); 3129 btrfs_set_header_nritems(dst, dst_nritems + push_items); 3130 3131 btrfs_mark_buffer_dirty(src); 3132 btrfs_mark_buffer_dirty(dst); 3133 3134 return ret; 3135 } 3136 3137 /* 3138 * helper function to insert a new root level in the tree. 3139 * A new node is allocated, and a single item is inserted to 3140 * point to the existing root 3141 * 3142 * returns zero on success or < 0 on failure. 
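 *
 * Illustrative before/after sketch (not part of the original source)
 * for a call with level == 1 while the root is still the leaf L:
 *
 *	before:	root->node -> L
 *	after:	root->node -> c -> L
 *
 * where c is the newly allocated node holding a single pointer to L.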
3143 */ 3144 static noinline int insert_new_root(struct btrfs_trans_handle *trans, 3145 struct btrfs_root *root, 3146 struct btrfs_path *path, int level, int log_removal) 3147 { 3148 u64 lower_gen; 3149 struct extent_buffer *lower; 3150 struct extent_buffer *c; 3151 struct extent_buffer *old; 3152 struct btrfs_disk_key lower_key; 3153 3154 BUG_ON(path->nodes[level]); 3155 BUG_ON(path->nodes[level-1] != root->node); 3156 3157 lower = path->nodes[level-1]; 3158 if (level == 1) 3159 btrfs_item_key(lower, &lower_key, 0); 3160 else 3161 btrfs_node_key(lower, &lower_key, 0); 3162 3163 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3164 root->root_key.objectid, &lower_key, 3165 level, root->node->start, 0); 3166 if (IS_ERR(c)) 3167 return PTR_ERR(c); 3168 3169 root_add_used(root, root->nodesize); 3170 3171 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 3172 btrfs_set_header_nritems(c, 1); 3173 btrfs_set_header_level(c, level); 3174 btrfs_set_header_bytenr(c, c->start); 3175 btrfs_set_header_generation(c, trans->transid); 3176 btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); 3177 btrfs_set_header_owner(c, root->root_key.objectid); 3178 3179 write_extent_buffer(c, root->fs_info->fsid, 3180 (unsigned long)btrfs_header_fsid(c), 3181 BTRFS_FSID_SIZE); 3182 3183 write_extent_buffer(c, root->fs_info->chunk_tree_uuid, 3184 (unsigned long)btrfs_header_chunk_tree_uuid(c), 3185 BTRFS_UUID_SIZE); 3186 3187 btrfs_set_node_key(c, &lower_key, 0); 3188 btrfs_set_node_blockptr(c, 0, lower->start); 3189 lower_gen = btrfs_header_generation(lower); 3190 WARN_ON(lower_gen != trans->transid); 3191 3192 btrfs_set_node_ptr_generation(c, 0, lower_gen); 3193 3194 btrfs_mark_buffer_dirty(c); 3195 3196 old = root->node; 3197 tree_mod_log_set_root_pointer(root, c, log_removal); 3198 rcu_assign_pointer(root->node, c); 3199 3200 /* the super has an extra ref to root->node */ 3201 free_extent_buffer(old); 3202 3203 add_root_to_dirty_list(root); 3204 extent_buffer_get(c); 3205 path->nodes[level] = c; 3206 path->locks[level] = BTRFS_WRITE_LOCK; 3207 path->slots[level] = 0; 3208 return 0; 3209 } 3210 3211 /* 3212 * worker function to insert a single pointer in a node. 3213 * the node should have enough room for the pointer already 3214 * 3215 * slot and level indicate where you want the key to go, and 3216 * blocknr is the block the key points to. 
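 *
 * For example, split_node below uses it to link a freshly allocated
 * sibling into the parent, one slot to the right of the node it split:
 *
 *	insert_ptr(trans, root, path, &disk_key, split->start,
 *		   path->slots[level + 1] + 1, level + 1);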
3217 */ 3218 static void insert_ptr(struct btrfs_trans_handle *trans, 3219 struct btrfs_root *root, struct btrfs_path *path, 3220 struct btrfs_disk_key *key, u64 bytenr, 3221 int slot, int level) 3222 { 3223 struct extent_buffer *lower; 3224 int nritems; 3225 int ret; 3226 3227 BUG_ON(!path->nodes[level]); 3228 btrfs_assert_tree_locked(path->nodes[level]); 3229 lower = path->nodes[level]; 3230 nritems = btrfs_header_nritems(lower); 3231 BUG_ON(slot > nritems); 3232 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); 3233 if (slot != nritems) { 3234 if (level) 3235 tree_mod_log_eb_move(root->fs_info, lower, slot + 1, 3236 slot, nritems - slot); 3237 memmove_extent_buffer(lower, 3238 btrfs_node_key_ptr_offset(slot + 1), 3239 btrfs_node_key_ptr_offset(slot), 3240 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 3241 } 3242 if (level) { 3243 ret = tree_mod_log_insert_key(root->fs_info, lower, slot, 3244 MOD_LOG_KEY_ADD); 3245 BUG_ON(ret < 0); 3246 } 3247 btrfs_set_node_key(lower, key, slot); 3248 btrfs_set_node_blockptr(lower, slot, bytenr); 3249 WARN_ON(trans->transid == 0); 3250 btrfs_set_node_ptr_generation(lower, slot, trans->transid); 3251 btrfs_set_header_nritems(lower, nritems + 1); 3252 btrfs_mark_buffer_dirty(lower); 3253 } 3254 3255 /* 3256 * split the node at the specified level in path in two. 3257 * The path is corrected to point to the appropriate node after the split 3258 * 3259 * Before splitting this tries to make some room in the node by pushing 3260 * left and right, if either one works, it returns right away. 3261 * 3262 * returns 0 on success and < 0 on failure 3263 */ 3264 static noinline int split_node(struct btrfs_trans_handle *trans, 3265 struct btrfs_root *root, 3266 struct btrfs_path *path, int level) 3267 { 3268 struct extent_buffer *c; 3269 struct extent_buffer *split; 3270 struct btrfs_disk_key disk_key; 3271 int mid; 3272 int ret; 3273 u32 c_nritems; 3274 3275 c = path->nodes[level]; 3276 WARN_ON(btrfs_header_generation(c) != trans->transid); 3277 if (c == root->node) { 3278 /* 3279 * trying to split the root, let's make a new one 3280 * 3281 * tree mod log: We pass 0 as log_removal parameter to 3282 * insert_new_root, because that root buffer will be kept as a 3283 * normal node. We are going to log removal of half of the 3284 * elements below with tree_mod_log_eb_copy. We're holding a 3285 * tree lock on the buffer, which is why we cannot race with 3286 * other tree_mod_log users.
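 *
 * As an illustrative aside (not part of the original source), the
 * shape this branch sets up, with mid = (c_nritems + 1) / 2, is:
 *
 *	before:	root -> c[0 .. n-1]
 *	after:	root -> new root from insert_new_root
 *		         /                \
 *	        c[0 .. mid-1]      split[mid .. n-1]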
3287 */ 3288 ret = insert_new_root(trans, root, path, level + 1, 0); 3289 if (ret) 3290 return ret; 3291 } else { 3292 ret = push_nodes_for_insert(trans, root, path, level); 3293 c = path->nodes[level]; 3294 if (!ret && btrfs_header_nritems(c) < 3295 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) 3296 return 0; 3297 if (ret < 0) 3298 return ret; 3299 } 3300 3301 c_nritems = btrfs_header_nritems(c); 3302 mid = (c_nritems + 1) / 2; 3303 btrfs_node_key(c, &disk_key, mid); 3304 3305 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3306 root->root_key.objectid, 3307 &disk_key, level, c->start, 0); 3308 if (IS_ERR(split)) 3309 return PTR_ERR(split); 3310 3311 root_add_used(root, root->nodesize); 3312 3313 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 3314 btrfs_set_header_level(split, btrfs_header_level(c)); 3315 btrfs_set_header_bytenr(split, split->start); 3316 btrfs_set_header_generation(split, trans->transid); 3317 btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); 3318 btrfs_set_header_owner(split, root->root_key.objectid); 3319 write_extent_buffer(split, root->fs_info->fsid, 3320 (unsigned long)btrfs_header_fsid(split), 3321 BTRFS_FSID_SIZE); 3322 write_extent_buffer(split, root->fs_info->chunk_tree_uuid, 3323 (unsigned long)btrfs_header_chunk_tree_uuid(split), 3324 BTRFS_UUID_SIZE); 3325 3326 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); 3327 copy_extent_buffer(split, c, 3328 btrfs_node_key_ptr_offset(0), 3329 btrfs_node_key_ptr_offset(mid), 3330 (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); 3331 btrfs_set_header_nritems(split, c_nritems - mid); 3332 btrfs_set_header_nritems(c, mid); 3333 ret = 0; 3334 3335 btrfs_mark_buffer_dirty(c); 3336 btrfs_mark_buffer_dirty(split); 3337 3338 insert_ptr(trans, root, path, &disk_key, split->start, 3339 path->slots[level + 1] + 1, level + 1); 3340 3341 if (path->slots[level] >= mid) { 3342 path->slots[level] -= mid; 3343 btrfs_tree_unlock(c); 3344 free_extent_buffer(c); 3345 path->nodes[level] = split; 3346 path->slots[level + 1] += 1; 3347 } else { 3348 btrfs_tree_unlock(split); 3349 free_extent_buffer(split); 3350 } 3351 return ret; 3352 } 3353 3354 /* 3355 * how many bytes are required to store the items in a leaf. start 3356 * and nr indicate which items in the leaf to check. This totals up the 3357 * space used both by the item structs and the item data 3358 */ 3359 static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3360 { 3361 struct btrfs_item *start_item; 3362 struct btrfs_item *end_item; 3363 struct btrfs_map_token token; 3364 int data_len; 3365 int nritems = btrfs_header_nritems(l); 3366 int end = min(nritems, start + nr) - 1; 3367 3368 if (!nr) 3369 return 0; 3370 btrfs_init_map_token(&token); 3371 start_item = btrfs_item_nr(l, start); 3372 end_item = btrfs_item_nr(l, end); 3373 data_len = btrfs_token_item_offset(l, start_item, &token) + 3374 btrfs_token_item_size(l, start_item, &token); 3375 data_len = data_len - btrfs_token_item_offset(l, end_item, &token); 3376 data_len += sizeof(struct btrfs_item) * nr; 3377 WARN_ON(data_len < 0); 3378 return data_len; 3379 } 3380 3381 /* 3382 * The space between the end of the leaf items and 3383 * the start of the leaf data. 
IOW, how much room 3384 * the leaf has left for both items and data 3385 */ 3386 noinline int btrfs_leaf_free_space(struct btrfs_root *root, 3387 struct extent_buffer *leaf) 3388 { 3389 int nritems = btrfs_header_nritems(leaf); 3390 int ret; 3391 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); 3392 if (ret < 0) { 3393 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " 3394 "used %d nritems %d\n", 3395 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), 3396 leaf_space_used(leaf, 0, nritems), nritems); 3397 } 3398 return ret; 3399 } 3400 3401 /* 3402 * min slot controls the lowest index we're willing to push to the 3403 * right. We'll push up to and including min_slot, but no lower 3404 */ 3405 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 3406 struct btrfs_root *root, 3407 struct btrfs_path *path, 3408 int data_size, int empty, 3409 struct extent_buffer *right, 3410 int free_space, u32 left_nritems, 3411 u32 min_slot) 3412 { 3413 struct extent_buffer *left = path->nodes[0]; 3414 struct extent_buffer *upper = path->nodes[1]; 3415 struct btrfs_map_token token; 3416 struct btrfs_disk_key disk_key; 3417 int slot; 3418 u32 i; 3419 int push_space = 0; 3420 int push_items = 0; 3421 struct btrfs_item *item; 3422 u32 nr; 3423 u32 right_nritems; 3424 u32 data_end; 3425 u32 this_item_size; 3426 3427 btrfs_init_map_token(&token); 3428 3429 if (empty) 3430 nr = 0; 3431 else 3432 nr = max_t(u32, 1, min_slot); 3433 3434 if (path->slots[0] >= left_nritems) 3435 push_space += data_size; 3436 3437 slot = path->slots[1]; 3438 i = left_nritems - 1; 3439 while (i >= nr) { 3440 item = btrfs_item_nr(left, i); 3441 3442 if (!empty && push_items > 0) { 3443 if (path->slots[0] > i) 3444 break; 3445 if (path->slots[0] == i) { 3446 int space = btrfs_leaf_free_space(root, left); 3447 if (space + push_space * 2 > free_space) 3448 break; 3449 } 3450 } 3451 3452 if (path->slots[0] == i) 3453 push_space += data_size; 3454 3455 this_item_size = btrfs_item_size(left, item); 3456 if (this_item_size + sizeof(*item) + push_space > free_space) 3457 break; 3458 3459 push_items++; 3460 push_space += this_item_size + sizeof(*item); 3461 if (i == 0) 3462 break; 3463 i--; 3464 } 3465 3466 if (push_items == 0) 3467 goto out_unlock; 3468 3469 WARN_ON(!empty && push_items == left_nritems); 3470 3471 /* push left to right */ 3472 right_nritems = btrfs_header_nritems(right); 3473 3474 push_space = btrfs_item_end_nr(left, left_nritems - push_items); 3475 push_space -= leaf_data_end(root, left); 3476 3477 /* make room in the right data area */ 3478 data_end = leaf_data_end(root, right); 3479 memmove_extent_buffer(right, 3480 btrfs_leaf_data(right) + data_end - push_space, 3481 btrfs_leaf_data(right) + data_end, 3482 BTRFS_LEAF_DATA_SIZE(root) - data_end); 3483 3484 /* copy from the left data area */ 3485 copy_extent_buffer(right, left, btrfs_leaf_data(right) + 3486 BTRFS_LEAF_DATA_SIZE(root) - push_space, 3487 btrfs_leaf_data(left) + leaf_data_end(root, left), 3488 push_space); 3489 3490 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), 3491 btrfs_item_nr_offset(0), 3492 right_nritems * sizeof(struct btrfs_item)); 3493 3494 /* copy the items from left to right */ 3495 copy_extent_buffer(right, left, btrfs_item_nr_offset(0), 3496 btrfs_item_nr_offset(left_nritems - push_items), 3497 push_items * sizeof(struct btrfs_item)); 3498 3499 /* update the item pointers */ 3500 right_nritems += push_items; 3501 btrfs_set_header_nritems(right, right_nritems); 3502 push_space = 
BTRFS_LEAF_DATA_SIZE(root); 3503 for (i = 0; i < right_nritems; i++) { 3504 item = btrfs_item_nr(right, i); 3505 push_space -= btrfs_token_item_size(right, item, &token); 3506 btrfs_set_token_item_offset(right, item, push_space, &token); 3507 } 3508 3509 left_nritems -= push_items; 3510 btrfs_set_header_nritems(left, left_nritems); 3511 3512 if (left_nritems) 3513 btrfs_mark_buffer_dirty(left); 3514 else 3515 clean_tree_block(trans, root, left); 3516 3517 btrfs_mark_buffer_dirty(right); 3518 3519 btrfs_item_key(right, &disk_key, 0); 3520 btrfs_set_node_key(upper, &disk_key, slot + 1); 3521 btrfs_mark_buffer_dirty(upper); 3522 3523 /* then fixup the leaf pointer in the path */ 3524 if (path->slots[0] >= left_nritems) { 3525 path->slots[0] -= left_nritems; 3526 if (btrfs_header_nritems(path->nodes[0]) == 0) 3527 clean_tree_block(trans, root, path->nodes[0]); 3528 btrfs_tree_unlock(path->nodes[0]); 3529 free_extent_buffer(path->nodes[0]); 3530 path->nodes[0] = right; 3531 path->slots[1] += 1; 3532 } else { 3533 btrfs_tree_unlock(right); 3534 free_extent_buffer(right); 3535 } 3536 return 0; 3537 3538 out_unlock: 3539 btrfs_tree_unlock(right); 3540 free_extent_buffer(right); 3541 return 1; 3542 } 3543 3544 /* 3545 * push some data in the path leaf to the right, trying to free up at 3546 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3547 * 3548 * returns 1 if the push failed because the other node didn't have enough 3549 * room, 0 if everything worked out and < 0 if there were major errors. 3550 * 3551 * this will push starting from min_slot to the end of the leaf. It won't 3552 * push any slot lower than min_slot 3553 */ 3554 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 3555 *root, struct btrfs_path *path, 3556 int min_data_size, int data_size, 3557 int empty, u32 min_slot) 3558 { 3559 struct extent_buffer *left = path->nodes[0]; 3560 struct extent_buffer *right; 3561 struct extent_buffer *upper; 3562 int slot; 3563 int free_space; 3564 u32 left_nritems; 3565 int ret; 3566 3567 if (!path->nodes[1]) 3568 return 1; 3569 3570 slot = path->slots[1]; 3571 upper = path->nodes[1]; 3572 if (slot >= btrfs_header_nritems(upper) - 1) 3573 return 1; 3574 3575 btrfs_assert_tree_locked(path->nodes[1]); 3576 3577 right = read_node_slot(root, upper, slot + 1); 3578 if (right == NULL) 3579 return 1; 3580 3581 btrfs_tree_lock(right); 3582 btrfs_set_lock_blocking(right); 3583 3584 free_space = btrfs_leaf_free_space(root, right); 3585 if (free_space < data_size) 3586 goto out_unlock; 3587 3588 /* cow and double check */ 3589 ret = btrfs_cow_block(trans, root, right, upper, 3590 slot + 1, &right); 3591 if (ret) 3592 goto out_unlock; 3593 3594 free_space = btrfs_leaf_free_space(root, right); 3595 if (free_space < data_size) 3596 goto out_unlock; 3597 3598 left_nritems = btrfs_header_nritems(left); 3599 if (left_nritems == 0) 3600 goto out_unlock; 3601 3602 return __push_leaf_right(trans, root, path, min_data_size, empty, 3603 right, free_space, left_nritems, min_slot); 3604 out_unlock: 3605 btrfs_tree_unlock(right); 3606 free_extent_buffer(right); 3607 return 1; 3608 } 3609 3610 /* 3611 * push some data in the path leaf to the left, trying to free up at 3612 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3613 * 3614 * max_slot can put a limit on how far into the leaf we'll push items. The 3615 * item at 'max_slot' won't be touched. 
Use (u32)-1 to make us do all the 3616 * items 3617 */ 3618 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 3619 struct btrfs_root *root, 3620 struct btrfs_path *path, int data_size, 3621 int empty, struct extent_buffer *left, 3622 int free_space, u32 right_nritems, 3623 u32 max_slot) 3624 { 3625 struct btrfs_disk_key disk_key; 3626 struct extent_buffer *right = path->nodes[0]; 3627 int i; 3628 int push_space = 0; 3629 int push_items = 0; 3630 struct btrfs_item *item; 3631 u32 old_left_nritems; 3632 u32 nr; 3633 int ret = 0; 3634 u32 this_item_size; 3635 u32 old_left_item_size; 3636 struct btrfs_map_token token; 3637 3638 btrfs_init_map_token(&token); 3639 3640 if (empty) 3641 nr = min(right_nritems, max_slot); 3642 else 3643 nr = min(right_nritems - 1, max_slot); 3644 3645 for (i = 0; i < nr; i++) { 3646 item = btrfs_item_nr(right, i); 3647 3648 if (!empty && push_items > 0) { 3649 if (path->slots[0] < i) 3650 break; 3651 if (path->slots[0] == i) { 3652 int space = btrfs_leaf_free_space(root, right); 3653 if (space + push_space * 2 > free_space) 3654 break; 3655 } 3656 } 3657 3658 if (path->slots[0] == i) 3659 push_space += data_size; 3660 3661 this_item_size = btrfs_item_size(right, item); 3662 if (this_item_size + sizeof(*item) + push_space > free_space) 3663 break; 3664 3665 push_items++; 3666 push_space += this_item_size + sizeof(*item); 3667 } 3668 3669 if (push_items == 0) { 3670 ret = 1; 3671 goto out; 3672 } 3673 if (!empty && push_items == btrfs_header_nritems(right)) 3674 WARN_ON(1); 3675 3676 /* push data from right to left */ 3677 copy_extent_buffer(left, right, 3678 btrfs_item_nr_offset(btrfs_header_nritems(left)), 3679 btrfs_item_nr_offset(0), 3680 push_items * sizeof(struct btrfs_item)); 3681 3682 push_space = BTRFS_LEAF_DATA_SIZE(root) - 3683 btrfs_item_offset_nr(right, push_items - 1); 3684 3685 copy_extent_buffer(left, right, btrfs_leaf_data(left) + 3686 leaf_data_end(root, left) - push_space, 3687 btrfs_leaf_data(right) + 3688 btrfs_item_offset_nr(right, push_items - 1), 3689 push_space); 3690 old_left_nritems = btrfs_header_nritems(left); 3691 BUG_ON(old_left_nritems <= 0); 3692 3693 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); 3694 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { 3695 u32 ioff; 3696 3697 item = btrfs_item_nr(left, i); 3698 3699 ioff = btrfs_token_item_offset(left, item, &token); 3700 btrfs_set_token_item_offset(left, item, 3701 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), 3702 &token); 3703 } 3704 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3705 3706 /* fixup right node */ 3707 if (push_items > right_nritems) 3708 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, 3709 right_nritems); 3710 3711 if (push_items < right_nritems) { 3712 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3713 leaf_data_end(root, right); 3714 memmove_extent_buffer(right, btrfs_leaf_data(right) + 3715 BTRFS_LEAF_DATA_SIZE(root) - push_space, 3716 btrfs_leaf_data(right) + 3717 leaf_data_end(root, right), push_space); 3718 3719 memmove_extent_buffer(right, btrfs_item_nr_offset(0), 3720 btrfs_item_nr_offset(push_items), 3721 (btrfs_header_nritems(right) - push_items) * 3722 sizeof(struct btrfs_item)); 3723 } 3724 right_nritems -= push_items; 3725 btrfs_set_header_nritems(right, right_nritems); 3726 push_space = BTRFS_LEAF_DATA_SIZE(root); 3727 for (i = 0; i < right_nritems; i++) { 3728 item = btrfs_item_nr(right, i); 3729 3730 push_space = push_space - 
btrfs_token_item_size(right, 3731 item, &token); 3732 btrfs_set_token_item_offset(right, item, push_space, &token); 3733 } 3734 3735 btrfs_mark_buffer_dirty(left); 3736 if (right_nritems) 3737 btrfs_mark_buffer_dirty(right); 3738 else 3739 clean_tree_block(trans, root, right); 3740 3741 btrfs_item_key(right, &disk_key, 0); 3742 fixup_low_keys(root, path, &disk_key, 1); 3743 3744 /* then fixup the leaf pointer in the path */ 3745 if (path->slots[0] < push_items) { 3746 path->slots[0] += old_left_nritems; 3747 btrfs_tree_unlock(path->nodes[0]); 3748 free_extent_buffer(path->nodes[0]); 3749 path->nodes[0] = left; 3750 path->slots[1] -= 1; 3751 } else { 3752 btrfs_tree_unlock(left); 3753 free_extent_buffer(left); 3754 path->slots[0] -= push_items; 3755 } 3756 BUG_ON(path->slots[0] < 0); 3757 return ret; 3758 out: 3759 btrfs_tree_unlock(left); 3760 free_extent_buffer(left); 3761 return ret; 3762 } 3763 3764 /* 3765 * push some data in the path leaf to the left, trying to free up at 3766 * least data_size bytes. returns zero if the push worked, nonzero otherwise 3767 * 3768 * max_slot can put a limit on how far into the leaf we'll push items. The 3769 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the 3770 * items 3771 */ 3772 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 3773 *root, struct btrfs_path *path, int min_data_size, 3774 int data_size, int empty, u32 max_slot) 3775 { 3776 struct extent_buffer *right = path->nodes[0]; 3777 struct extent_buffer *left; 3778 int slot; 3779 int free_space; 3780 u32 right_nritems; 3781 int ret = 0; 3782 3783 slot = path->slots[1]; 3784 if (slot == 0) 3785 return 1; 3786 if (!path->nodes[1]) 3787 return 1; 3788 3789 right_nritems = btrfs_header_nritems(right); 3790 if (right_nritems == 0) 3791 return 1; 3792 3793 btrfs_assert_tree_locked(path->nodes[1]); 3794 3795 left = read_node_slot(root, path->nodes[1], slot - 1); 3796 if (left == NULL) 3797 return 1; 3798 3799 btrfs_tree_lock(left); 3800 btrfs_set_lock_blocking(left); 3801 3802 free_space = btrfs_leaf_free_space(root, left); 3803 if (free_space < data_size) { 3804 ret = 1; 3805 goto out; 3806 } 3807 3808 /* cow and double check */ 3809 ret = btrfs_cow_block(trans, root, left, 3810 path->nodes[1], slot - 1, &left); 3811 if (ret) { 3812 /* we hit -ENOSPC, but it isn't fatal here */ 3813 if (ret == -ENOSPC) 3814 ret = 1; 3815 goto out; 3816 } 3817 3818 free_space = btrfs_leaf_free_space(root, left); 3819 if (free_space < data_size) { 3820 ret = 1; 3821 goto out; 3822 } 3823 3824 return __push_leaf_left(trans, root, path, min_data_size, 3825 empty, left, free_space, right_nritems, 3826 max_slot); 3827 out: 3828 btrfs_tree_unlock(left); 3829 free_extent_buffer(left); 3830 return ret; 3831 } 3832 3833 /* 3834 * split the path's leaf in two, making sure there is at least data_size 3835 * available for the resulting leaf level of the path. 
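 *
 * Illustrative sketch (not part of the original source): copy_for_split
 * moves items [mid .. nritems - 1] of leaf 'l' into the new leaf
 * 'right', and because leaf data is packed against the end of the
 * block, each moved item's data offset is shifted by
 *
 *	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - btrfs_item_end_nr(l, mid);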
3836 */ 3837 static noinline void copy_for_split(struct btrfs_trans_handle *trans, 3838 struct btrfs_root *root, 3839 struct btrfs_path *path, 3840 struct extent_buffer *l, 3841 struct extent_buffer *right, 3842 int slot, int mid, int nritems) 3843 { 3844 int data_copy_size; 3845 int rt_data_off; 3846 int i; 3847 struct btrfs_disk_key disk_key; 3848 struct btrfs_map_token token; 3849 3850 btrfs_init_map_token(&token); 3851 3852 nritems = nritems - mid; 3853 btrfs_set_header_nritems(right, nritems); 3854 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); 3855 3856 copy_extent_buffer(right, l, btrfs_item_nr_offset(0), 3857 btrfs_item_nr_offset(mid), 3858 nritems * sizeof(struct btrfs_item)); 3859 3860 copy_extent_buffer(right, l, 3861 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - 3862 data_copy_size, btrfs_leaf_data(l) + 3863 leaf_data_end(root, l), data_copy_size); 3864 3865 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - 3866 btrfs_item_end_nr(l, mid); 3867 3868 for (i = 0; i < nritems; i++) { 3869 struct btrfs_item *item = btrfs_item_nr(right, i); 3870 u32 ioff; 3871 3872 ioff = btrfs_token_item_offset(right, item, &token); 3873 btrfs_set_token_item_offset(right, item, 3874 ioff + rt_data_off, &token); 3875 } 3876 3877 btrfs_set_header_nritems(l, mid); 3878 btrfs_item_key(right, &disk_key, 0); 3879 insert_ptr(trans, root, path, &disk_key, right->start, 3880 path->slots[1] + 1, 1); 3881 3882 btrfs_mark_buffer_dirty(right); 3883 btrfs_mark_buffer_dirty(l); 3884 BUG_ON(path->slots[0] != slot); 3885 3886 if (mid <= slot) { 3887 btrfs_tree_unlock(path->nodes[0]); 3888 free_extent_buffer(path->nodes[0]); 3889 path->nodes[0] = right; 3890 path->slots[0] -= mid; 3891 path->slots[1] += 1; 3892 } else { 3893 btrfs_tree_unlock(right); 3894 free_extent_buffer(right); 3895 } 3896 3897 BUG_ON(path->slots[0] < 0); 3898 } 3899 3900 /* 3901 * double splits happen when we need to insert a big item in the middle 3902 * of a leaf. A double split can leave us with 3 mostly empty leaves: 3903 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] 3904 * A B C 3905 * 3906 * We avoid this by trying to push the items on either side of our target 3907 * into the adjacent leaves. If all goes well we can avoid the double split 3908 * completely. 3909 */ 3910 static noinline int push_for_double_split(struct btrfs_trans_handle *trans, 3911 struct btrfs_root *root, 3912 struct btrfs_path *path, 3913 int data_size) 3914 { 3915 int ret; 3916 int progress = 0; 3917 int slot; 3918 u32 nritems; 3919 3920 slot = path->slots[0]; 3921 3922 /* 3923 * try to push all the items after our slot into the 3924 * right leaf 3925 */ 3926 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); 3927 if (ret < 0) 3928 return ret; 3929 3930 if (ret == 0) 3931 progress++; 3932 3933 nritems = btrfs_header_nritems(path->nodes[0]); 3934 /* 3935 * our goal is to get our slot at the start or end of a leaf. 
If 3936 * we've done so we're done 3937 */ 3938 if (path->slots[0] == 0 || path->slots[0] == nritems) 3939 return 0; 3940 3941 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) 3942 return 0; 3943 3944 /* try to push all the items before our slot into the next leaf */ 3945 slot = path->slots[0]; 3946 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); 3947 if (ret < 0) 3948 return ret; 3949 3950 if (ret == 0) 3951 progress++; 3952 3953 if (progress) 3954 return 0; 3955 return 1; 3956 } 3957 3958 /* 3959 * split the path's leaf in two, making sure there is at least data_size 3960 * available for the resulting leaf level of the path. 3961 * 3962 * returns 0 if all went well and < 0 on failure. 3963 */ 3964 static noinline int split_leaf(struct btrfs_trans_handle *trans, 3965 struct btrfs_root *root, 3966 struct btrfs_key *ins_key, 3967 struct btrfs_path *path, int data_size, 3968 int extend) 3969 { 3970 struct btrfs_disk_key disk_key; 3971 struct extent_buffer *l; 3972 u32 nritems; 3973 int mid; 3974 int slot; 3975 struct extent_buffer *right; 3976 int ret = 0; 3977 int wret; 3978 int split; 3979 int num_doubles = 0; 3980 int tried_avoid_double = 0; 3981 3982 l = path->nodes[0]; 3983 slot = path->slots[0]; 3984 if (extend && data_size + btrfs_item_size_nr(l, slot) + 3985 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) 3986 return -EOVERFLOW; 3987 3988 /* first try to make some room by pushing left and right */ 3989 if (data_size) { 3990 wret = push_leaf_right(trans, root, path, data_size, 3991 data_size, 0, 0); 3992 if (wret < 0) 3993 return wret; 3994 if (wret) { 3995 wret = push_leaf_left(trans, root, path, data_size, 3996 data_size, 0, (u32)-1); 3997 if (wret < 0) 3998 return wret; 3999 } 4000 l = path->nodes[0]; 4001 4002 /* did the pushes work? 
 */
		if (btrfs_leaf_free_space(root, l) >= data_size)
			return 0;
	}

	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
					root->root_key.objectid,
					&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, root->leafsize);

	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);

	if (split == 0) {
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
				   path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
				   path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(root, path, &disk_key, 1);
		}
		btrfs_mark_buffer_dirty(right);
		return ret;
	}

	copy_for_split(trans, root, path, l, right, slot, mid, nritems);

	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}

static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
struct btrfs_root *root, 4128 struct btrfs_path *path, int ins_len) 4129 { 4130 struct btrfs_key key; 4131 struct extent_buffer *leaf; 4132 struct btrfs_file_extent_item *fi; 4133 u64 extent_len = 0; 4134 u32 item_size; 4135 int ret; 4136 4137 leaf = path->nodes[0]; 4138 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4139 4140 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && 4141 key.type != BTRFS_EXTENT_CSUM_KEY); 4142 4143 if (btrfs_leaf_free_space(root, leaf) >= ins_len) 4144 return 0; 4145 4146 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 4147 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4148 fi = btrfs_item_ptr(leaf, path->slots[0], 4149 struct btrfs_file_extent_item); 4150 extent_len = btrfs_file_extent_num_bytes(leaf, fi); 4151 } 4152 btrfs_release_path(path); 4153 4154 path->keep_locks = 1; 4155 path->search_for_split = 1; 4156 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4157 path->search_for_split = 0; 4158 if (ret < 0) 4159 goto err; 4160 4161 ret = -EAGAIN; 4162 leaf = path->nodes[0]; 4163 /* if our item isn't there or got smaller, return now */ 4164 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 4165 goto err; 4166 4167 /* the leaf has changed, it now has room. return now */ 4168 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) 4169 goto err; 4170 4171 if (key.type == BTRFS_EXTENT_DATA_KEY) { 4172 fi = btrfs_item_ptr(leaf, path->slots[0], 4173 struct btrfs_file_extent_item); 4174 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) 4175 goto err; 4176 } 4177 4178 btrfs_set_path_blocking(path); 4179 ret = split_leaf(trans, root, &key, path, ins_len, 1); 4180 if (ret) 4181 goto err; 4182 4183 path->keep_locks = 0; 4184 btrfs_unlock_up_safe(path, 1); 4185 return 0; 4186 err: 4187 path->keep_locks = 0; 4188 return ret; 4189 } 4190 4191 static noinline int split_item(struct btrfs_trans_handle *trans, 4192 struct btrfs_root *root, 4193 struct btrfs_path *path, 4194 struct btrfs_key *new_key, 4195 unsigned long split_offset) 4196 { 4197 struct extent_buffer *leaf; 4198 struct btrfs_item *item; 4199 struct btrfs_item *new_item; 4200 int slot; 4201 char *buf; 4202 u32 nritems; 4203 u32 item_size; 4204 u32 orig_offset; 4205 struct btrfs_disk_key disk_key; 4206 4207 leaf = path->nodes[0]; 4208 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 4209 4210 btrfs_set_path_blocking(path); 4211 4212 item = btrfs_item_nr(leaf, path->slots[0]); 4213 orig_offset = btrfs_item_offset(leaf, item); 4214 item_size = btrfs_item_size(leaf, item); 4215 4216 buf = kmalloc(item_size, GFP_NOFS); 4217 if (!buf) 4218 return -ENOMEM; 4219 4220 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 4221 path->slots[0]), item_size); 4222 4223 slot = path->slots[0] + 1; 4224 nritems = btrfs_header_nritems(leaf); 4225 if (slot != nritems) { 4226 /* shift the items */ 4227 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 4228 btrfs_item_nr_offset(slot), 4229 (nritems - slot) * sizeof(struct btrfs_item)); 4230 } 4231 4232 btrfs_cpu_key_to_disk(&disk_key, new_key); 4233 btrfs_set_item_key(leaf, &disk_key, slot); 4234 4235 new_item = btrfs_item_nr(leaf, slot); 4236 4237 btrfs_set_item_offset(leaf, new_item, orig_offset); 4238 btrfs_set_item_size(leaf, new_item, item_size - split_offset); 4239 4240 btrfs_set_item_offset(leaf, item, 4241 orig_offset + item_size - split_offset); 4242 btrfs_set_item_size(leaf, item, split_offset); 4243 4244 btrfs_set_header_nritems(leaf, nritems + 1); 4245 4246 /* write the data for the start of the 
original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation. After
 * the split, the path is pointing to the old item. The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(trans, root, path, new_key, split_offset);
	return ret;
}

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item
 * is contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	setup_items_for_insert(root, path, new_key, &item_size,
			       item_size, item_size +
			       sizeof(struct btrfs_item), 1);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}

/*
 * make the item pointed to by the path smaller. new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
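 *
 * Example (hypothetical caller, error handling trimmed): shrinking the
 * item at path->slots[0] to 64 bytes by chopping from the end:
 *
 *	btrfs_truncate_item(root, path, 64, 1);
 *
 * With from_end == 0 the tail of the data is kept instead, and the key
 * offset of the item is bumped by the number of bytes removed.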
 */
void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
			 u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(leaf, i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
					       disk_bytenr));
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(root, path, &disk_key, 1);
	}

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}

/*
 * make the item pointed to by the path bigger. data_size is the number
 * of bytes to add to the item's current size.
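 *
 * Example (hypothetical caller): growing the current item by 16 bytes,
 * assuming the caller already knows the leaf has room:
 *
 *	btrfs_extend_item(root, path, 16);
 *
 * The new bytes appear at the end of the item and are not zeroed; the
 * caller is expected to initialize them afterwards.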
4434 */ 4435 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, 4436 u32 data_size) 4437 { 4438 int slot; 4439 struct extent_buffer *leaf; 4440 struct btrfs_item *item; 4441 u32 nritems; 4442 unsigned int data_end; 4443 unsigned int old_data; 4444 unsigned int old_size; 4445 int i; 4446 struct btrfs_map_token token; 4447 4448 btrfs_init_map_token(&token); 4449 4450 leaf = path->nodes[0]; 4451 4452 nritems = btrfs_header_nritems(leaf); 4453 data_end = leaf_data_end(root, leaf); 4454 4455 if (btrfs_leaf_free_space(root, leaf) < data_size) { 4456 btrfs_print_leaf(root, leaf); 4457 BUG(); 4458 } 4459 slot = path->slots[0]; 4460 old_data = btrfs_item_end_nr(leaf, slot); 4461 4462 BUG_ON(slot < 0); 4463 if (slot >= nritems) { 4464 btrfs_print_leaf(root, leaf); 4465 printk(KERN_CRIT "slot %d too large, nritems %d\n", 4466 slot, nritems); 4467 BUG_ON(1); 4468 } 4469 4470 /* 4471 * item0..itemN ... dataN.offset..dataN.size .. data0.size 4472 */ 4473 /* first correct the data pointers */ 4474 for (i = slot; i < nritems; i++) { 4475 u32 ioff; 4476 item = btrfs_item_nr(leaf, i); 4477 4478 ioff = btrfs_token_item_offset(leaf, item, &token); 4479 btrfs_set_token_item_offset(leaf, item, 4480 ioff - data_size, &token); 4481 } 4482 4483 /* shift the data */ 4484 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4485 data_end - data_size, btrfs_leaf_data(leaf) + 4486 data_end, old_data - data_end); 4487 4488 data_end = old_data; 4489 old_size = btrfs_item_size_nr(leaf, slot); 4490 item = btrfs_item_nr(leaf, slot); 4491 btrfs_set_item_size(leaf, item, old_size + data_size); 4492 btrfs_mark_buffer_dirty(leaf); 4493 4494 if (btrfs_leaf_free_space(root, leaf) < 0) { 4495 btrfs_print_leaf(root, leaf); 4496 BUG(); 4497 } 4498 } 4499 4500 /* 4501 * this is a helper for btrfs_insert_empty_items, the main goal here is 4502 * to save stack depth by doing the bulk of the work in a function 4503 * that doesn't call btrfs_search_slot 4504 */ 4505 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, 4506 struct btrfs_key *cpu_key, u32 *data_size, 4507 u32 total_data, u32 total_size, int nr) 4508 { 4509 struct btrfs_item *item; 4510 int i; 4511 u32 nritems; 4512 unsigned int data_end; 4513 struct btrfs_disk_key disk_key; 4514 struct extent_buffer *leaf; 4515 int slot; 4516 struct btrfs_map_token token; 4517 4518 btrfs_init_map_token(&token); 4519 4520 leaf = path->nodes[0]; 4521 slot = path->slots[0]; 4522 4523 nritems = btrfs_header_nritems(leaf); 4524 data_end = leaf_data_end(root, leaf); 4525 4526 if (btrfs_leaf_free_space(root, leaf) < total_size) { 4527 btrfs_print_leaf(root, leaf); 4528 printk(KERN_CRIT "not enough freespace need %u have %d\n", 4529 total_size, btrfs_leaf_free_space(root, leaf)); 4530 BUG(); 4531 } 4532 4533 if (slot != nritems) { 4534 unsigned int old_data = btrfs_item_end_nr(leaf, slot); 4535 4536 if (old_data < data_end) { 4537 btrfs_print_leaf(root, leaf); 4538 printk(KERN_CRIT "slot %d old_data %d data_end %d\n", 4539 slot, old_data, data_end); 4540 BUG_ON(1); 4541 } 4542 /* 4543 * item0..itemN ... dataN.offset..dataN.size .. 
data0.size 4544 */ 4545 /* first correct the data pointers */ 4546 for (i = slot; i < nritems; i++) { 4547 u32 ioff; 4548 4549 item = btrfs_item_nr(leaf, i); 4550 ioff = btrfs_token_item_offset(leaf, item, &token); 4551 btrfs_set_token_item_offset(leaf, item, 4552 ioff - total_data, &token); 4553 } 4554 /* shift the items */ 4555 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 4556 btrfs_item_nr_offset(slot), 4557 (nritems - slot) * sizeof(struct btrfs_item)); 4558 4559 /* shift the data */ 4560 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4561 data_end - total_data, btrfs_leaf_data(leaf) + 4562 data_end, old_data - data_end); 4563 data_end = old_data; 4564 } 4565 4566 /* setup the item for the new data */ 4567 for (i = 0; i < nr; i++) { 4568 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); 4569 btrfs_set_item_key(leaf, &disk_key, slot + i); 4570 item = btrfs_item_nr(leaf, slot + i); 4571 btrfs_set_token_item_offset(leaf, item, 4572 data_end - data_size[i], &token); 4573 data_end -= data_size[i]; 4574 btrfs_set_token_item_size(leaf, item, data_size[i], &token); 4575 } 4576 4577 btrfs_set_header_nritems(leaf, nritems + nr); 4578 4579 if (slot == 0) { 4580 btrfs_cpu_key_to_disk(&disk_key, cpu_key); 4581 fixup_low_keys(root, path, &disk_key, 1); 4582 } 4583 btrfs_unlock_up_safe(path, 1); 4584 btrfs_mark_buffer_dirty(leaf); 4585 4586 if (btrfs_leaf_free_space(root, leaf) < 0) { 4587 btrfs_print_leaf(root, leaf); 4588 BUG(); 4589 } 4590 } 4591 4592 /* 4593 * Given a key and some data, insert items into the tree. 4594 * This does all the path init required, making room in the tree if needed. 4595 */ 4596 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 4597 struct btrfs_root *root, 4598 struct btrfs_path *path, 4599 struct btrfs_key *cpu_key, u32 *data_size, 4600 int nr) 4601 { 4602 int ret = 0; 4603 int slot; 4604 int i; 4605 u32 total_size = 0; 4606 u32 total_data = 0; 4607 4608 for (i = 0; i < nr; i++) 4609 total_data += data_size[i]; 4610 4611 total_size = total_data + (nr * sizeof(struct btrfs_item)); 4612 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); 4613 if (ret == 0) 4614 return -EEXIST; 4615 if (ret < 0) 4616 return ret; 4617 4618 slot = path->slots[0]; 4619 BUG_ON(slot < 0); 4620 4621 setup_items_for_insert(root, path, cpu_key, data_size, 4622 total_data, total_size, nr); 4623 return 0; 4624 } 4625 4626 /* 4627 * Given a key and some data, insert an item into the tree. 4628 * This does all the path init required, making room in the tree if needed. 4629 */ 4630 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root 4631 *root, struct btrfs_key *cpu_key, void *data, u32 4632 data_size) 4633 { 4634 int ret = 0; 4635 struct btrfs_path *path; 4636 struct extent_buffer *leaf; 4637 unsigned long ptr; 4638 4639 path = btrfs_alloc_path(); 4640 if (!path) 4641 return -ENOMEM; 4642 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 4643 if (!ret) { 4644 leaf = path->nodes[0]; 4645 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 4646 write_extent_buffer(leaf, data, ptr, data_size); 4647 btrfs_mark_buffer_dirty(leaf); 4648 } 4649 btrfs_free_path(path); 4650 return ret; 4651 } 4652 4653 /* 4654 * delete the pointer from a given node. 4655 * 4656 * the tree should have been previously balanced so the deletion does not 4657 * empty a node. 
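 *
 * For illustration, with a node holding four key/blockptr pairs and
 * slot == 1:
 *
 *	before: [ p0 p1 p2 p3 ]
 *	after:  [ p0 p2 p3 ]
 *
 * the pairs after the slot are shifted down one position and nritems
 * drops from four to three.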
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level)
			tree_mod_log_eb_move(root->fs_info, parent, slot,
					     slot + 1, nritems - slot - 1);
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
					      MOD_LOG_KEY_REMOVE);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(root, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}

/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing. path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path.
If that empties 4729 * the leaf, remove it from the tree 4730 */ 4731 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4732 struct btrfs_path *path, int slot, int nr) 4733 { 4734 struct extent_buffer *leaf; 4735 struct btrfs_item *item; 4736 int last_off; 4737 int dsize = 0; 4738 int ret = 0; 4739 int wret; 4740 int i; 4741 u32 nritems; 4742 struct btrfs_map_token token; 4743 4744 btrfs_init_map_token(&token); 4745 4746 leaf = path->nodes[0]; 4747 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); 4748 4749 for (i = 0; i < nr; i++) 4750 dsize += btrfs_item_size_nr(leaf, slot + i); 4751 4752 nritems = btrfs_header_nritems(leaf); 4753 4754 if (slot + nr != nritems) { 4755 int data_end = leaf_data_end(root, leaf); 4756 4757 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 4758 data_end + dsize, 4759 btrfs_leaf_data(leaf) + data_end, 4760 last_off - data_end); 4761 4762 for (i = slot + nr; i < nritems; i++) { 4763 u32 ioff; 4764 4765 item = btrfs_item_nr(leaf, i); 4766 ioff = btrfs_token_item_offset(leaf, item, &token); 4767 btrfs_set_token_item_offset(leaf, item, 4768 ioff + dsize, &token); 4769 } 4770 4771 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 4772 btrfs_item_nr_offset(slot + nr), 4773 sizeof(struct btrfs_item) * 4774 (nritems - slot - nr)); 4775 } 4776 btrfs_set_header_nritems(leaf, nritems - nr); 4777 nritems -= nr; 4778 4779 /* delete the leaf if we've emptied it */ 4780 if (nritems == 0) { 4781 if (leaf == root->node) { 4782 btrfs_set_header_level(leaf, 0); 4783 } else { 4784 btrfs_set_path_blocking(path); 4785 clean_tree_block(trans, root, leaf); 4786 btrfs_del_leaf(trans, root, path, leaf); 4787 } 4788 } else { 4789 int used = leaf_space_used(leaf, 0, nritems); 4790 if (slot == 0) { 4791 struct btrfs_disk_key disk_key; 4792 4793 btrfs_item_key(leaf, &disk_key, 0); 4794 fixup_low_keys(root, path, &disk_key, 1); 4795 } 4796 4797 /* delete the leaf if it is mostly empty */ 4798 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { 4799 /* push_leaf_left fixes the path. 4800 * make sure the path still points to our leaf 4801 * for possible call to del_ptr below 4802 */ 4803 slot = path->slots[1]; 4804 extent_buffer_get(leaf); 4805 4806 btrfs_set_path_blocking(path); 4807 wret = push_leaf_left(trans, root, path, 1, 1, 4808 1, (u32)-1); 4809 if (wret < 0 && wret != -ENOSPC) 4810 ret = wret; 4811 4812 if (path->nodes[0] == leaf && 4813 btrfs_header_nritems(leaf)) { 4814 wret = push_leaf_right(trans, root, path, 1, 4815 1, 1, 0); 4816 if (wret < 0 && wret != -ENOSPC) 4817 ret = wret; 4818 } 4819 4820 if (btrfs_header_nritems(leaf) == 0) { 4821 path->slots[1] = slot; 4822 btrfs_del_leaf(trans, root, path, leaf); 4823 free_extent_buffer(leaf); 4824 ret = 0; 4825 } else { 4826 /* if we're still in the path, make sure 4827 * we're dirty. Otherwise, one of the 4828 * push_leaf functions must have already 4829 * dirtied this buffer 4830 */ 4831 if (path->nodes[0] == leaf) 4832 btrfs_mark_buffer_dirty(leaf); 4833 free_extent_buffer(leaf); 4834 } 4835 } else { 4836 btrfs_mark_buffer_dirty(leaf); 4837 } 4838 } 4839 return ret; 4840 } 4841 4842 /* 4843 * search the tree again to find a leaf with lesser keys 4844 * returns 0 if it found something or 1 if there are no lesser leaves. 4845 * returns < 0 on io errors. 4846 * 4847 * This may release the path, and so you may lose any locks held at the 4848 * time you call it. 
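 *
 * Example (hypothetical caller) of walking leaves backwards from the
 * current position, where process_leaf() stands in for the caller's
 * per-leaf work:
 *
 *	while (1) {
 *		process_leaf(path->nodes[0]);
 *		ret = btrfs_prev_leaf(root, path);
 *		if (ret)
 *			break;
 *	}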
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	if (key.offset > 0)
		key.offset--;
	else if (key.type > 0)
		key.type--;
	else if (key.objectid > 0)
		key.objectid--;
	else
		return 1;

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	if (ret < 0)
		return 0;
	return 1;
}

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through. Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_key *max_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;

	WARN_ON(!path->keep_locks);
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
4944 */ 4945 while (slot < nritems) { 4946 u64 blockptr; 4947 u64 gen; 4948 4949 blockptr = btrfs_node_blockptr(cur, slot); 4950 gen = btrfs_node_ptr_generation(cur, slot); 4951 if (gen < min_trans) { 4952 slot++; 4953 continue; 4954 } 4955 break; 4956 } 4957 find_next_key: 4958 /* 4959 * we didn't find a candidate key in this node, walk forward 4960 * and find another one 4961 */ 4962 if (slot >= nritems) { 4963 path->slots[level] = slot; 4964 btrfs_set_path_blocking(path); 4965 sret = btrfs_find_next_key(root, path, min_key, level, 4966 min_trans); 4967 if (sret == 0) { 4968 btrfs_release_path(path); 4969 goto again; 4970 } else { 4971 goto out; 4972 } 4973 } 4974 /* save our key for returning back */ 4975 btrfs_node_key_to_cpu(cur, &found_key, slot); 4976 path->slots[level] = slot; 4977 if (level == path->lowest_level) { 4978 ret = 0; 4979 unlock_up(path, level, 1, 0, NULL); 4980 goto out; 4981 } 4982 btrfs_set_path_blocking(path); 4983 cur = read_node_slot(root, cur, slot); 4984 BUG_ON(!cur); /* -ENOMEM */ 4985 4986 btrfs_tree_read_lock(cur); 4987 4988 path->locks[level - 1] = BTRFS_READ_LOCK; 4989 path->nodes[level - 1] = cur; 4990 unlock_up(path, level, 1, 0, NULL); 4991 btrfs_clear_path_blocking(path, NULL, 0); 4992 } 4993 out: 4994 if (ret == 0) 4995 memcpy(min_key, &found_key, sizeof(found_key)); 4996 btrfs_set_path_blocking(path); 4997 return ret; 4998 } 4999 5000 static void tree_move_down(struct btrfs_root *root, 5001 struct btrfs_path *path, 5002 int *level, int root_level) 5003 { 5004 BUG_ON(*level == 0); 5005 path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], 5006 path->slots[*level]); 5007 path->slots[*level - 1] = 0; 5008 (*level)--; 5009 } 5010 5011 static int tree_move_next_or_upnext(struct btrfs_root *root, 5012 struct btrfs_path *path, 5013 int *level, int root_level) 5014 { 5015 int ret = 0; 5016 int nritems; 5017 nritems = btrfs_header_nritems(path->nodes[*level]); 5018 5019 path->slots[*level]++; 5020 5021 while (path->slots[*level] >= nritems) { 5022 if (*level == root_level) 5023 return -1; 5024 5025 /* move upnext */ 5026 path->slots[*level] = 0; 5027 free_extent_buffer(path->nodes[*level]); 5028 path->nodes[*level] = NULL; 5029 (*level)++; 5030 path->slots[*level]++; 5031 5032 nritems = btrfs_header_nritems(path->nodes[*level]); 5033 ret = 1; 5034 } 5035 return ret; 5036 } 5037 5038 /* 5039 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 5040 * or down. 
5041 */ 5042 static int tree_advance(struct btrfs_root *root, 5043 struct btrfs_path *path, 5044 int *level, int root_level, 5045 int allow_down, 5046 struct btrfs_key *key) 5047 { 5048 int ret; 5049 5050 if (*level == 0 || !allow_down) { 5051 ret = tree_move_next_or_upnext(root, path, level, root_level); 5052 } else { 5053 tree_move_down(root, path, level, root_level); 5054 ret = 0; 5055 } 5056 if (ret >= 0) { 5057 if (*level == 0) 5058 btrfs_item_key_to_cpu(path->nodes[*level], key, 5059 path->slots[*level]); 5060 else 5061 btrfs_node_key_to_cpu(path->nodes[*level], key, 5062 path->slots[*level]); 5063 } 5064 return ret; 5065 } 5066 5067 static int tree_compare_item(struct btrfs_root *left_root, 5068 struct btrfs_path *left_path, 5069 struct btrfs_path *right_path, 5070 char *tmp_buf) 5071 { 5072 int cmp; 5073 int len1, len2; 5074 unsigned long off1, off2; 5075 5076 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); 5077 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); 5078 if (len1 != len2) 5079 return 1; 5080 5081 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 5082 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 5083 right_path->slots[0]); 5084 5085 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 5086 5087 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 5088 if (cmp) 5089 return 1; 5090 return 0; 5091 } 5092 5093 #define ADVANCE 1 5094 #define ADVANCE_ONLY_NEXT -1 5095 5096 /* 5097 * This function compares two trees and calls the provided callback for 5098 * every changed/new/deleted item it finds. 5099 * If shared tree blocks are encountered, whole subtrees are skipped, making 5100 * the compare pretty fast on snapshotted subvolumes. 5101 * 5102 * This currently works on commit roots only. As commit roots are read only, 5103 * we don't do any locking. The commit roots are protected with transactions. 5104 * Transactions are ended and rejoined when a commit is tried in between. 5105 * 5106 * This function checks for modifications done to the trees while comparing. 5107 * If it detects a change, it aborts immediately. 
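 *
 * A minimal sketch of a caller (hypothetical; it assumes the
 * btrfs_changed_cb_t signature matched by the changed_cb calls below):
 *
 *	static int my_changed_cb(struct btrfs_root *left_root,
 *				 struct btrfs_root *right_root,
 *				 struct btrfs_path *left_path,
 *				 struct btrfs_path *right_path,
 *				 struct btrfs_key *key,
 *				 enum btrfs_compare_tree_result result,
 *				 void *ctx)
 *	{
 *		(record or act on the new/deleted/changed key here)
 *		return 0;
 *	}
 *
 *	ret = btrfs_compare_trees(left_root, right_root, my_changed_cb, ctx);
 *
 * A negative return from the callback aborts the whole compare.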
5108 */ 5109 int btrfs_compare_trees(struct btrfs_root *left_root, 5110 struct btrfs_root *right_root, 5111 btrfs_changed_cb_t changed_cb, void *ctx) 5112 { 5113 int ret; 5114 int cmp; 5115 struct btrfs_trans_handle *trans = NULL; 5116 struct btrfs_path *left_path = NULL; 5117 struct btrfs_path *right_path = NULL; 5118 struct btrfs_key left_key; 5119 struct btrfs_key right_key; 5120 char *tmp_buf = NULL; 5121 int left_root_level; 5122 int right_root_level; 5123 int left_level; 5124 int right_level; 5125 int left_end_reached; 5126 int right_end_reached; 5127 int advance_left; 5128 int advance_right; 5129 u64 left_blockptr; 5130 u64 right_blockptr; 5131 u64 left_start_ctransid; 5132 u64 right_start_ctransid; 5133 u64 ctransid; 5134 5135 left_path = btrfs_alloc_path(); 5136 if (!left_path) { 5137 ret = -ENOMEM; 5138 goto out; 5139 } 5140 right_path = btrfs_alloc_path(); 5141 if (!right_path) { 5142 ret = -ENOMEM; 5143 goto out; 5144 } 5145 5146 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); 5147 if (!tmp_buf) { 5148 ret = -ENOMEM; 5149 goto out; 5150 } 5151 5152 left_path->search_commit_root = 1; 5153 left_path->skip_locking = 1; 5154 right_path->search_commit_root = 1; 5155 right_path->skip_locking = 1; 5156 5157 spin_lock(&left_root->root_item_lock); 5158 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5159 spin_unlock(&left_root->root_item_lock); 5160 5161 spin_lock(&right_root->root_item_lock); 5162 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5163 spin_unlock(&right_root->root_item_lock); 5164 5165 trans = btrfs_join_transaction(left_root); 5166 if (IS_ERR(trans)) { 5167 ret = PTR_ERR(trans); 5168 trans = NULL; 5169 goto out; 5170 } 5171 5172 /* 5173 * Strategy: Go to the first items of both trees. Then do 5174 * 5175 * If both trees are at level 0 5176 * Compare keys of current items 5177 * If left < right treat left item as new, advance left tree 5178 * and repeat 5179 * If left > right treat right item as deleted, advance right tree 5180 * and repeat 5181 * If left == right do deep compare of items, treat as changed if 5182 * needed, advance both trees and repeat 5183 * If both trees are at the same level but not at level 0 5184 * Compare keys of current nodes/leafs 5185 * If left < right advance left tree and repeat 5186 * If left > right advance right tree and repeat 5187 * If left == right compare blockptrs of the next nodes/leafs 5188 * If they match advance both trees but stay at the same level 5189 * and repeat 5190 * If they don't match advance both trees while allowing to go 5191 * deeper and repeat 5192 * If tree levels are different 5193 * Advance the tree that needs it and repeat 5194 * 5195 * Advancing a tree means: 5196 * If we are at level 0, try to go to the next slot. If that's not 5197 * possible, go one level up and repeat. Stop when we found a level 5198 * where we could go to the next slot. We may at this point be on a 5199 * node or a leaf. 5200 * 5201 * If we are not at level 0 and not on shared tree blocks, go one 5202 * level deeper. 5203 * 5204 * If we are not at level 0 and on shared tree blocks, go one slot to 5205 * the right if possible or go up and right. 
5206 */ 5207 5208 left_level = btrfs_header_level(left_root->commit_root); 5209 left_root_level = left_level; 5210 left_path->nodes[left_level] = left_root->commit_root; 5211 extent_buffer_get(left_path->nodes[left_level]); 5212 5213 right_level = btrfs_header_level(right_root->commit_root); 5214 right_root_level = right_level; 5215 right_path->nodes[right_level] = right_root->commit_root; 5216 extent_buffer_get(right_path->nodes[right_level]); 5217 5218 if (left_level == 0) 5219 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5220 &left_key, left_path->slots[left_level]); 5221 else 5222 btrfs_node_key_to_cpu(left_path->nodes[left_level], 5223 &left_key, left_path->slots[left_level]); 5224 if (right_level == 0) 5225 btrfs_item_key_to_cpu(right_path->nodes[right_level], 5226 &right_key, right_path->slots[right_level]); 5227 else 5228 btrfs_node_key_to_cpu(right_path->nodes[right_level], 5229 &right_key, right_path->slots[right_level]); 5230 5231 left_end_reached = right_end_reached = 0; 5232 advance_left = advance_right = 0; 5233 5234 while (1) { 5235 /* 5236 * We need to make sure the transaction does not get committed 5237 * while we do anything on commit roots. This means, we need to 5238 * join and leave transactions for every item that we process. 5239 */ 5240 if (trans && btrfs_should_end_transaction(trans, left_root)) { 5241 btrfs_release_path(left_path); 5242 btrfs_release_path(right_path); 5243 5244 ret = btrfs_end_transaction(trans, left_root); 5245 trans = NULL; 5246 if (ret < 0) 5247 goto out; 5248 } 5249 /* now rejoin the transaction */ 5250 if (!trans) { 5251 trans = btrfs_join_transaction(left_root); 5252 if (IS_ERR(trans)) { 5253 ret = PTR_ERR(trans); 5254 trans = NULL; 5255 goto out; 5256 } 5257 5258 spin_lock(&left_root->root_item_lock); 5259 ctransid = btrfs_root_ctransid(&left_root->root_item); 5260 spin_unlock(&left_root->root_item_lock); 5261 if (ctransid != left_start_ctransid) 5262 left_start_ctransid = 0; 5263 5264 spin_lock(&right_root->root_item_lock); 5265 ctransid = btrfs_root_ctransid(&right_root->root_item); 5266 spin_unlock(&right_root->root_item_lock); 5267 if (ctransid != right_start_ctransid) 5268 right_start_ctransid = 0; 5269 5270 if (!left_start_ctransid || !right_start_ctransid) { 5271 WARN(1, KERN_WARNING 5272 "btrfs: btrfs_compare_tree detected " 5273 "a change in one of the trees while " 5274 "iterating. 
This is probably a " 5275 "bug.\n"); 5276 ret = -EIO; 5277 goto out; 5278 } 5279 5280 /* 5281 * the commit root may have changed, so start again 5282 * where we stopped 5283 */ 5284 left_path->lowest_level = left_level; 5285 right_path->lowest_level = right_level; 5286 ret = btrfs_search_slot(NULL, left_root, 5287 &left_key, left_path, 0, 0); 5288 if (ret < 0) 5289 goto out; 5290 ret = btrfs_search_slot(NULL, right_root, 5291 &right_key, right_path, 0, 0); 5292 if (ret < 0) 5293 goto out; 5294 } 5295 5296 if (advance_left && !left_end_reached) { 5297 ret = tree_advance(left_root, left_path, &left_level, 5298 left_root_level, 5299 advance_left != ADVANCE_ONLY_NEXT, 5300 &left_key); 5301 if (ret < 0) 5302 left_end_reached = ADVANCE; 5303 advance_left = 0; 5304 } 5305 if (advance_right && !right_end_reached) { 5306 ret = tree_advance(right_root, right_path, &right_level, 5307 right_root_level, 5308 advance_right != ADVANCE_ONLY_NEXT, 5309 &right_key); 5310 if (ret < 0) 5311 right_end_reached = ADVANCE; 5312 advance_right = 0; 5313 } 5314 5315 if (left_end_reached && right_end_reached) { 5316 ret = 0; 5317 goto out; 5318 } else if (left_end_reached) { 5319 if (right_level == 0) { 5320 ret = changed_cb(left_root, right_root, 5321 left_path, right_path, 5322 &right_key, 5323 BTRFS_COMPARE_TREE_DELETED, 5324 ctx); 5325 if (ret < 0) 5326 goto out; 5327 } 5328 advance_right = ADVANCE; 5329 continue; 5330 } else if (right_end_reached) { 5331 if (left_level == 0) { 5332 ret = changed_cb(left_root, right_root, 5333 left_path, right_path, 5334 &left_key, 5335 BTRFS_COMPARE_TREE_NEW, 5336 ctx); 5337 if (ret < 0) 5338 goto out; 5339 } 5340 advance_left = ADVANCE; 5341 continue; 5342 } 5343 5344 if (left_level == 0 && right_level == 0) { 5345 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5346 if (cmp < 0) { 5347 ret = changed_cb(left_root, right_root, 5348 left_path, right_path, 5349 &left_key, 5350 BTRFS_COMPARE_TREE_NEW, 5351 ctx); 5352 if (ret < 0) 5353 goto out; 5354 advance_left = ADVANCE; 5355 } else if (cmp > 0) { 5356 ret = changed_cb(left_root, right_root, 5357 left_path, right_path, 5358 &right_key, 5359 BTRFS_COMPARE_TREE_DELETED, 5360 ctx); 5361 if (ret < 0) 5362 goto out; 5363 advance_right = ADVANCE; 5364 } else { 5365 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5366 ret = tree_compare_item(left_root, left_path, 5367 right_path, tmp_buf); 5368 if (ret) { 5369 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5370 ret = changed_cb(left_root, right_root, 5371 left_path, right_path, 5372 &left_key, 5373 BTRFS_COMPARE_TREE_CHANGED, 5374 ctx); 5375 if (ret < 0) 5376 goto out; 5377 } 5378 advance_left = ADVANCE; 5379 advance_right = ADVANCE; 5380 } 5381 } else if (left_level == right_level) { 5382 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 5383 if (cmp < 0) { 5384 advance_left = ADVANCE; 5385 } else if (cmp > 0) { 5386 advance_right = ADVANCE; 5387 } else { 5388 left_blockptr = btrfs_node_blockptr( 5389 left_path->nodes[left_level], 5390 left_path->slots[left_level]); 5391 right_blockptr = btrfs_node_blockptr( 5392 right_path->nodes[right_level], 5393 right_path->slots[right_level]); 5394 if (left_blockptr == right_blockptr) { 5395 /* 5396 * As we're on a shared block, don't 5397 * allow to go deeper. 
5398 */ 5399 advance_left = ADVANCE_ONLY_NEXT; 5400 advance_right = ADVANCE_ONLY_NEXT; 5401 } else { 5402 advance_left = ADVANCE; 5403 advance_right = ADVANCE; 5404 } 5405 } 5406 } else if (left_level < right_level) { 5407 advance_right = ADVANCE; 5408 } else { 5409 advance_left = ADVANCE; 5410 } 5411 } 5412 5413 out: 5414 btrfs_free_path(left_path); 5415 btrfs_free_path(right_path); 5416 kfree(tmp_buf); 5417 5418 if (trans) { 5419 if (!ret) 5420 ret = btrfs_end_transaction(trans, left_root); 5421 else 5422 btrfs_end_transaction(trans, left_root); 5423 } 5424 5425 return ret; 5426 } 5427 5428 /* 5429 * this is similar to btrfs_next_leaf, but does not try to preserve 5430 * and fixup the path. It looks for and returns the next key in the 5431 * tree based on the current path and the min_trans parameters. 5432 * 5433 * 0 is returned if another key is found, < 0 if there are any errors 5434 * and 1 is returned if there are no higher keys in the tree 5435 * 5436 * path->keep_locks should be set to 1 on the search made before 5437 * calling this function. 5438 */ 5439 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 5440 struct btrfs_key *key, int level, u64 min_trans) 5441 { 5442 int slot; 5443 struct extent_buffer *c; 5444 5445 WARN_ON(!path->keep_locks); 5446 while (level < BTRFS_MAX_LEVEL) { 5447 if (!path->nodes[level]) 5448 return 1; 5449 5450 slot = path->slots[level] + 1; 5451 c = path->nodes[level]; 5452 next: 5453 if (slot >= btrfs_header_nritems(c)) { 5454 int ret; 5455 int orig_lowest; 5456 struct btrfs_key cur_key; 5457 if (level + 1 >= BTRFS_MAX_LEVEL || 5458 !path->nodes[level + 1]) 5459 return 1; 5460 5461 if (path->locks[level + 1]) { 5462 level++; 5463 continue; 5464 } 5465 5466 slot = btrfs_header_nritems(c) - 1; 5467 if (level == 0) 5468 btrfs_item_key_to_cpu(c, &cur_key, slot); 5469 else 5470 btrfs_node_key_to_cpu(c, &cur_key, slot); 5471 5472 orig_lowest = path->lowest_level; 5473 btrfs_release_path(path); 5474 path->lowest_level = level; 5475 ret = btrfs_search_slot(NULL, root, &cur_key, path, 5476 0, 0); 5477 path->lowest_level = orig_lowest; 5478 if (ret < 0) 5479 return ret; 5480 5481 c = path->nodes[level]; 5482 slot = path->slots[level]; 5483 if (ret == 0) 5484 slot++; 5485 goto next; 5486 } 5487 5488 if (level == 0) 5489 btrfs_item_key_to_cpu(c, key, slot); 5490 else { 5491 u64 gen = btrfs_node_ptr_generation(c, slot); 5492 5493 if (gen < min_trans) { 5494 slot++; 5495 goto next; 5496 } 5497 btrfs_node_key_to_cpu(c, key, slot); 5498 } 5499 return 0; 5500 } 5501 return 1; 5502 } 5503 5504 /* 5505 * search the tree again to find a leaf with greater keys 5506 * returns 0 if it found something or 1 if there are no greater leaves. 5507 * returns < 0 on io errors. 
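 *
 * Example (hypothetical caller, error handling trimmed): the usual
 * pattern for scanning items forward from a search position:
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret < 0)
 *		return ret;
 *	while (1) {
 *		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)
 *				break;
 *			continue;
 *		}
 *		(handle the item at path->slots[0], then advance)
 *		path->slots[0]++;
 *	}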
5508 */ 5509 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 5510 { 5511 return btrfs_next_old_leaf(root, path, 0); 5512 } 5513 5514 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5515 u64 time_seq) 5516 { 5517 int slot; 5518 int level; 5519 struct extent_buffer *c; 5520 struct extent_buffer *next; 5521 struct btrfs_key key; 5522 u32 nritems; 5523 int ret; 5524 int old_spinning = path->leave_spinning; 5525 int next_rw_lock = 0; 5526 5527 nritems = btrfs_header_nritems(path->nodes[0]); 5528 if (nritems == 0) 5529 return 1; 5530 5531 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 5532 again: 5533 level = 1; 5534 next = NULL; 5535 next_rw_lock = 0; 5536 btrfs_release_path(path); 5537 5538 path->keep_locks = 1; 5539 path->leave_spinning = 1; 5540 5541 if (time_seq) 5542 ret = btrfs_search_old_slot(root, &key, path, time_seq); 5543 else 5544 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5545 path->keep_locks = 0; 5546 5547 if (ret < 0) 5548 return ret; 5549 5550 nritems = btrfs_header_nritems(path->nodes[0]); 5551 /* 5552 * by releasing the path above we dropped all our locks. A balance 5553 * could have added more items next to the key that used to be 5554 * at the very end of the block. So, check again here and 5555 * advance the path if there are now more items available. 5556 */ 5557 if (nritems > 0 && path->slots[0] < nritems - 1) { 5558 if (ret == 0) 5559 path->slots[0]++; 5560 ret = 0; 5561 goto done; 5562 } 5563 5564 while (level < BTRFS_MAX_LEVEL) { 5565 if (!path->nodes[level]) { 5566 ret = 1; 5567 goto done; 5568 } 5569 5570 slot = path->slots[level] + 1; 5571 c = path->nodes[level]; 5572 if (slot >= btrfs_header_nritems(c)) { 5573 level++; 5574 if (level == BTRFS_MAX_LEVEL) { 5575 ret = 1; 5576 goto done; 5577 } 5578 continue; 5579 } 5580 5581 if (next) { 5582 btrfs_tree_unlock_rw(next, next_rw_lock); 5583 free_extent_buffer(next); 5584 } 5585 5586 next = c; 5587 next_rw_lock = path->locks[level]; 5588 ret = read_block_for_search(NULL, root, path, &next, level, 5589 slot, &key, 0); 5590 if (ret == -EAGAIN) 5591 goto again; 5592 5593 if (ret < 0) { 5594 btrfs_release_path(path); 5595 goto done; 5596 } 5597 5598 if (!path->skip_locking) { 5599 ret = btrfs_try_tree_read_lock(next); 5600 if (!ret && time_seq) { 5601 /* 5602 * If we don't get the lock, we may be racing 5603 * with push_leaf_left, holding that lock while 5604 * itself waiting for the leaf we've currently 5605 * locked. To solve this situation, we give up 5606 * on our lock and cycle. 
5607 */ 5608 free_extent_buffer(next); 5609 btrfs_release_path(path); 5610 cond_resched(); 5611 goto again; 5612 } 5613 if (!ret) { 5614 btrfs_set_path_blocking(path); 5615 btrfs_tree_read_lock(next); 5616 btrfs_clear_path_blocking(path, next, 5617 BTRFS_READ_LOCK); 5618 } 5619 next_rw_lock = BTRFS_READ_LOCK; 5620 } 5621 break; 5622 } 5623 path->slots[level] = slot; 5624 while (1) { 5625 level--; 5626 c = path->nodes[level]; 5627 if (path->locks[level]) 5628 btrfs_tree_unlock_rw(c, path->locks[level]); 5629 5630 free_extent_buffer(c); 5631 path->nodes[level] = next; 5632 path->slots[level] = 0; 5633 if (!path->skip_locking) 5634 path->locks[level] = next_rw_lock; 5635 if (!level) 5636 break; 5637 5638 ret = read_block_for_search(NULL, root, path, &next, level, 5639 0, &key, 0); 5640 if (ret == -EAGAIN) 5641 goto again; 5642 5643 if (ret < 0) { 5644 btrfs_release_path(path); 5645 goto done; 5646 } 5647 5648 if (!path->skip_locking) { 5649 ret = btrfs_try_tree_read_lock(next); 5650 if (!ret) { 5651 btrfs_set_path_blocking(path); 5652 btrfs_tree_read_lock(next); 5653 btrfs_clear_path_blocking(path, next, 5654 BTRFS_READ_LOCK); 5655 } 5656 next_rw_lock = BTRFS_READ_LOCK; 5657 } 5658 } 5659 ret = 0; 5660 done: 5661 unlock_up(path, 0, 1, 0, NULL); 5662 path->leave_spinning = old_spinning; 5663 if (!old_spinning) 5664 btrfs_set_path_blocking(path); 5665 5666 return ret; 5667 } 5668 5669 /* 5670 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps 5671 * searching until it gets past min_objectid or finds an item of 'type' 5672 * 5673 * returns 0 if something is found, 1 if nothing was found and < 0 on error 5674 */ 5675 int btrfs_previous_item(struct btrfs_root *root, 5676 struct btrfs_path *path, u64 min_objectid, 5677 int type) 5678 { 5679 struct btrfs_key found_key; 5680 struct extent_buffer *leaf; 5681 u32 nritems; 5682 int ret; 5683 5684 while (1) { 5685 if (path->slots[0] == 0) { 5686 btrfs_set_path_blocking(path); 5687 ret = btrfs_prev_leaf(root, path); 5688 if (ret != 0) 5689 return ret; 5690 } else { 5691 path->slots[0]--; 5692 } 5693 leaf = path->nodes[0]; 5694 nritems = btrfs_header_nritems(leaf); 5695 if (nritems == 0) 5696 return 1; 5697 if (path->slots[0] == nritems) 5698 path->slots[0]--; 5699 5700 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5701 if (found_key.objectid < min_objectid) 5702 break; 5703 if (found_key.type == type) 5704 return 0; 5705 if (found_key.objectid == min_objectid && 5706 found_key.type < type) 5707 break; 5708 } 5709 return 1; 5710 } 5711
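
/*
 * Example (hypothetical caller) for btrfs_previous_item above: step back
 * from the current position to the closest preceding extent item:
 *
 *	ret = btrfs_previous_item(root, path, min_objectid,
 *				  BTRFS_EXTENT_ITEM_KEY);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 *
 * A return of 1 means nothing suitable was found; < 0 means an error.
 */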