/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc,
			      int mark_free);
static int update_reserved_extents(struct btrfs_block_group_cache *cache,
				   u64 num_bytes, int reserve);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 alloc_bytes,
			  u64 flags, int force);
static int pin_down_bytes(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  u64 bytenr, u64 num_bytes,
			  int is_data, int reserved,
			  struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count))
		kfree(cache);
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct btrfs_block_group_cache *block_group) 103 { 104 struct rb_node **p; 105 struct rb_node *parent = NULL; 106 struct btrfs_block_group_cache *cache; 107 108 spin_lock(&info->block_group_cache_lock); 109 p = &info->block_group_cache_tree.rb_node; 110 111 while (*p) { 112 parent = *p; 113 cache = rb_entry(parent, struct btrfs_block_group_cache, 114 cache_node); 115 if (block_group->key.objectid < cache->key.objectid) { 116 p = &(*p)->rb_left; 117 } else if (block_group->key.objectid > cache->key.objectid) { 118 p = &(*p)->rb_right; 119 } else { 120 spin_unlock(&info->block_group_cache_lock); 121 return -EEXIST; 122 } 123 } 124 125 rb_link_node(&block_group->cache_node, parent, p); 126 rb_insert_color(&block_group->cache_node, 127 &info->block_group_cache_tree); 128 spin_unlock(&info->block_group_cache_lock); 129 130 return 0; 131 } 132 133 /* 134 * This will return the block group at or after bytenr if contains is 0, else 135 * it will return the block group that contains the bytenr 136 */ 137 static struct btrfs_block_group_cache * 138 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 139 int contains) 140 { 141 struct btrfs_block_group_cache *cache, *ret = NULL; 142 struct rb_node *n; 143 u64 end, start; 144 145 spin_lock(&info->block_group_cache_lock); 146 n = info->block_group_cache_tree.rb_node; 147 148 while (n) { 149 cache = rb_entry(n, struct btrfs_block_group_cache, 150 cache_node); 151 end = cache->key.objectid + cache->key.offset - 1; 152 start = cache->key.objectid; 153 154 if (bytenr < start) { 155 if (!contains && (!ret || start < ret->key.objectid)) 156 ret = cache; 157 n = n->rb_left; 158 } else if (bytenr > start) { 159 if (contains && bytenr <= end) { 160 ret = cache; 161 break; 162 } 163 n = n->rb_right; 164 } else { 165 ret = cache; 166 break; 167 } 168 } 169 if (ret) 170 btrfs_get_block_group(ret); 171 spin_unlock(&info->block_group_cache_lock); 172 173 return ret; 174 } 175 176 static int add_excluded_extent(struct btrfs_root *root, 177 u64 start, u64 num_bytes) 178 { 179 u64 end = start + num_bytes - 1; 180 set_extent_bits(&root->fs_info->freed_extents[0], 181 start, end, EXTENT_UPTODATE, GFP_NOFS); 182 set_extent_bits(&root->fs_info->freed_extents[1], 183 start, end, EXTENT_UPTODATE, GFP_NOFS); 184 return 0; 185 } 186 187 static void free_excluded_extents(struct btrfs_root *root, 188 struct btrfs_block_group_cache *cache) 189 { 190 u64 start, end; 191 192 start = cache->key.objectid; 193 end = start + cache->key.offset - 1; 194 195 clear_extent_bits(&root->fs_info->freed_extents[0], 196 start, end, EXTENT_UPTODATE, GFP_NOFS); 197 clear_extent_bits(&root->fs_info->freed_extents[1], 198 start, end, EXTENT_UPTODATE, GFP_NOFS); 199 } 200 201 static int exclude_super_stripes(struct btrfs_root *root, 202 struct btrfs_block_group_cache *cache) 203 { 204 u64 bytenr; 205 u64 *logical; 206 int stripe_len; 207 int i, nr, ret; 208 209 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 210 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 211 cache->bytes_super += stripe_len; 212 ret = add_excluded_extent(root, cache->key.objectid, 213 stripe_len); 214 BUG_ON(ret); 215 } 216 217 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 218 bytenr = btrfs_sb_offset(i); 219 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 220 cache->key.objectid, bytenr, 221 0, &logical, &nr, &stripe_len); 222 BUG_ON(ret); 223 224 while (nr--) { 225 cache->bytes_super += stripe_len; 226 ret = add_excluded_extent(root, logical[nr], 227 stripe_len); 228 BUG_ON(ret); 229 } 
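		/* btrfs_rmap_block() allocated logical[] for us; free it before the next mirror */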
230 231 kfree(logical); 232 } 233 return 0; 234 } 235 236 static struct btrfs_caching_control * 237 get_caching_control(struct btrfs_block_group_cache *cache) 238 { 239 struct btrfs_caching_control *ctl; 240 241 spin_lock(&cache->lock); 242 if (cache->cached != BTRFS_CACHE_STARTED) { 243 spin_unlock(&cache->lock); 244 return NULL; 245 } 246 247 ctl = cache->caching_ctl; 248 atomic_inc(&ctl->count); 249 spin_unlock(&cache->lock); 250 return ctl; 251 } 252 253 static void put_caching_control(struct btrfs_caching_control *ctl) 254 { 255 if (atomic_dec_and_test(&ctl->count)) 256 kfree(ctl); 257 } 258 259 /* 260 * this is only called by cache_block_group, since we could have freed extents 261 * we need to check the pinned_extents for any extents that can't be used yet 262 * since their free space will be released as soon as the transaction commits. 263 */ 264 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 265 struct btrfs_fs_info *info, u64 start, u64 end) 266 { 267 u64 extent_start, extent_end, size, total_added = 0; 268 int ret; 269 270 while (start < end) { 271 ret = find_first_extent_bit(info->pinned_extents, start, 272 &extent_start, &extent_end, 273 EXTENT_DIRTY | EXTENT_UPTODATE); 274 if (ret) 275 break; 276 277 if (extent_start <= start) { 278 start = extent_end + 1; 279 } else if (extent_start > start && extent_start < end) { 280 size = extent_start - start; 281 total_added += size; 282 ret = btrfs_add_free_space(block_group, start, 283 size); 284 BUG_ON(ret); 285 start = extent_end + 1; 286 } else { 287 break; 288 } 289 } 290 291 if (start < end) { 292 size = end - start; 293 total_added += size; 294 ret = btrfs_add_free_space(block_group, start, size); 295 BUG_ON(ret); 296 } 297 298 return total_added; 299 } 300 301 static int caching_kthread(void *data) 302 { 303 struct btrfs_block_group_cache *block_group = data; 304 struct btrfs_fs_info *fs_info = block_group->fs_info; 305 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; 306 struct btrfs_root *extent_root = fs_info->extent_root; 307 struct btrfs_path *path; 308 struct extent_buffer *leaf; 309 struct btrfs_key key; 310 u64 total_found = 0; 311 u64 last = 0; 312 u32 nritems; 313 int ret = 0; 314 315 path = btrfs_alloc_path(); 316 if (!path) 317 return -ENOMEM; 318 319 exclude_super_stripes(extent_root, block_group); 320 spin_lock(&block_group->space_info->lock); 321 block_group->space_info->bytes_super += block_group->bytes_super; 322 spin_unlock(&block_group->space_info->lock); 323 324 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 325 326 /* 327 * We don't want to deadlock with somebody trying to allocate a new 328 * extent for the extent root while also trying to search the extent 329 * root to add free space. 
So we skip locking and search the commit 330 * root, since its read-only 331 */ 332 path->skip_locking = 1; 333 path->search_commit_root = 1; 334 path->reada = 2; 335 336 key.objectid = last; 337 key.offset = 0; 338 key.type = BTRFS_EXTENT_ITEM_KEY; 339 again: 340 mutex_lock(&caching_ctl->mutex); 341 /* need to make sure the commit_root doesn't disappear */ 342 down_read(&fs_info->extent_commit_sem); 343 344 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 345 if (ret < 0) 346 goto err; 347 348 leaf = path->nodes[0]; 349 nritems = btrfs_header_nritems(leaf); 350 351 while (1) { 352 smp_mb(); 353 if (fs_info->closing > 1) { 354 last = (u64)-1; 355 break; 356 } 357 358 if (path->slots[0] < nritems) { 359 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 360 } else { 361 ret = find_next_key(path, 0, &key); 362 if (ret) 363 break; 364 365 caching_ctl->progress = last; 366 btrfs_release_path(extent_root, path); 367 up_read(&fs_info->extent_commit_sem); 368 mutex_unlock(&caching_ctl->mutex); 369 if (btrfs_transaction_in_commit(fs_info)) 370 schedule_timeout(1); 371 else 372 cond_resched(); 373 goto again; 374 } 375 376 if (key.objectid < block_group->key.objectid) { 377 path->slots[0]++; 378 continue; 379 } 380 381 if (key.objectid >= block_group->key.objectid + 382 block_group->key.offset) 383 break; 384 385 if (key.type == BTRFS_EXTENT_ITEM_KEY) { 386 total_found += add_new_free_space(block_group, 387 fs_info, last, 388 key.objectid); 389 last = key.objectid + key.offset; 390 391 if (total_found > (1024 * 1024 * 2)) { 392 total_found = 0; 393 wake_up(&caching_ctl->wait); 394 } 395 } 396 path->slots[0]++; 397 } 398 ret = 0; 399 400 total_found += add_new_free_space(block_group, fs_info, last, 401 block_group->key.objectid + 402 block_group->key.offset); 403 caching_ctl->progress = (u64)-1; 404 405 spin_lock(&block_group->lock); 406 block_group->caching_ctl = NULL; 407 block_group->cached = BTRFS_CACHE_FINISHED; 408 spin_unlock(&block_group->lock); 409 410 err: 411 btrfs_free_path(path); 412 up_read(&fs_info->extent_commit_sem); 413 414 free_excluded_extents(extent_root, block_group); 415 416 mutex_unlock(&caching_ctl->mutex); 417 wake_up(&caching_ctl->wait); 418 419 put_caching_control(caching_ctl); 420 atomic_dec(&block_group->space_info->caching_threads); 421 btrfs_put_block_group(block_group); 422 423 return 0; 424 } 425 426 static int cache_block_group(struct btrfs_block_group_cache *cache) 427 { 428 struct btrfs_fs_info *fs_info = cache->fs_info; 429 struct btrfs_caching_control *caching_ctl; 430 struct task_struct *tsk; 431 int ret = 0; 432 433 smp_mb(); 434 if (cache->cached != BTRFS_CACHE_NO) 435 return 0; 436 437 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 438 BUG_ON(!caching_ctl); 439 440 INIT_LIST_HEAD(&caching_ctl->list); 441 mutex_init(&caching_ctl->mutex); 442 init_waitqueue_head(&caching_ctl->wait); 443 caching_ctl->block_group = cache; 444 caching_ctl->progress = cache->key.objectid; 445 /* one for caching kthread, one for caching block group list */ 446 atomic_set(&caching_ctl->count, 2); 447 448 spin_lock(&cache->lock); 449 if (cache->cached != BTRFS_CACHE_NO) { 450 spin_unlock(&cache->lock); 451 kfree(caching_ctl); 452 return 0; 453 } 454 cache->caching_ctl = caching_ctl; 455 cache->cached = BTRFS_CACHE_STARTED; 456 spin_unlock(&cache->lock); 457 458 down_write(&fs_info->extent_commit_sem); 459 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 460 up_write(&fs_info->extent_commit_sem); 461 462 
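	/*
	 * the caching kthread started below is responsible for dropping
	 * the caching_threads count and the block group reference we take
	 * here once caching_kthread() finishes
	 */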
atomic_inc(&cache->space_info->caching_threads); 463 btrfs_get_block_group(cache); 464 465 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 466 cache->key.objectid); 467 if (IS_ERR(tsk)) { 468 ret = PTR_ERR(tsk); 469 printk(KERN_ERR "error running thread %d\n", ret); 470 BUG(); 471 } 472 473 return ret; 474 } 475 476 /* 477 * return the block group that starts at or after bytenr 478 */ 479 static struct btrfs_block_group_cache * 480 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 481 { 482 struct btrfs_block_group_cache *cache; 483 484 cache = block_group_cache_tree_search(info, bytenr, 0); 485 486 return cache; 487 } 488 489 /* 490 * return the block group that contains the given bytenr 491 */ 492 struct btrfs_block_group_cache *btrfs_lookup_block_group( 493 struct btrfs_fs_info *info, 494 u64 bytenr) 495 { 496 struct btrfs_block_group_cache *cache; 497 498 cache = block_group_cache_tree_search(info, bytenr, 1); 499 500 return cache; 501 } 502 503 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 504 u64 flags) 505 { 506 struct list_head *head = &info->space_info; 507 struct btrfs_space_info *found; 508 509 rcu_read_lock(); 510 list_for_each_entry_rcu(found, head, list) { 511 if (found->flags == flags) { 512 rcu_read_unlock(); 513 return found; 514 } 515 } 516 rcu_read_unlock(); 517 return NULL; 518 } 519 520 /* 521 * after adding space to the filesystem, we need to clear the full flags 522 * on all the space infos. 523 */ 524 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 525 { 526 struct list_head *head = &info->space_info; 527 struct btrfs_space_info *found; 528 529 rcu_read_lock(); 530 list_for_each_entry_rcu(found, head, list) 531 found->full = 0; 532 rcu_read_unlock(); 533 } 534 535 static u64 div_factor(u64 num, int factor) 536 { 537 if (factor == 10) 538 return num; 539 num *= factor; 540 do_div(num, 10); 541 return num; 542 } 543 544 u64 btrfs_find_block_group(struct btrfs_root *root, 545 u64 search_start, u64 search_hint, int owner) 546 { 547 struct btrfs_block_group_cache *cache; 548 u64 used; 549 u64 last = max(search_hint, search_start); 550 u64 group_start = 0; 551 int full_search = 0; 552 int factor = 9; 553 int wrapped = 0; 554 again: 555 while (1) { 556 cache = btrfs_lookup_first_block_group(root->fs_info, last); 557 if (!cache) 558 break; 559 560 spin_lock(&cache->lock); 561 last = cache->key.objectid + cache->key.offset; 562 used = btrfs_block_group_used(&cache->item); 563 564 if ((full_search || !cache->ro) && 565 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { 566 if (used + cache->pinned + cache->reserved < 567 div_factor(cache->key.offset, factor)) { 568 group_start = cache->key.objectid; 569 spin_unlock(&cache->lock); 570 btrfs_put_block_group(cache); 571 goto found; 572 } 573 } 574 spin_unlock(&cache->lock); 575 btrfs_put_block_group(cache); 576 cond_resched(); 577 } 578 if (!wrapped) { 579 last = search_start; 580 wrapped = 1; 581 goto again; 582 } 583 if (!full_search && factor < 10) { 584 last = search_start; 585 full_search = 1; 586 factor = 10; 587 goto again; 588 } 589 found: 590 return group_start; 591 } 592 593 /* simple helper to search for an existing extent at a given offset */ 594 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) 595 { 596 int ret; 597 struct btrfs_key key; 598 struct btrfs_path *path; 599 600 path = btrfs_alloc_path(); 601 BUG_ON(!path); 602 key.objectid = start; 603 key.offset = len; 604 btrfs_set_key_type(&key, 
BTRFS_EXTENT_ITEM_KEY);
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules. Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance. This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree search. Full back refs are for pointers in tree blocks that are
 * not referenced by their owner trees; the location of the tree block is
 * recorded in the back ref. Full back refs are actually generic and can
 * be used in every case where implicit back refs are used. Their major
 * shortcoming is overhead: every time a tree block gets COWed, we have
 * to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction,
 * the only way to drop a reference to it is to COW it. So we can detect
 * the moment a tree block loses its owner tree's reference and do the
 * back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for the
 * pointers in the block. Remove those full back refs and add implicit
 * back refs for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * the pointers in the block. Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts. The
 * original implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer
 * in the new block and increase the lower level extents' reference
 * counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent, and the
 * key type is used to differentiate between types of back refs. There
 * are different meanings of the key offset for different types
 * of back refs.
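 *
 * For example (see extent_ref_type() and the lookup/insert helpers
 * below), the possible back ref item keys for an extent starting at
 * byte EXT are:
 *
 *   (EXT, BTRFS_TREE_BLOCK_REF_KEY,   objectid of the block's owner tree)
 *   (EXT, BTRFS_SHARED_BLOCK_REF_KEY, bytenr of the parent tree block)
 *   (EXT, BTRFS_EXTENT_DATA_REF_KEY,  hash of (root, inode, file offset))
 *   (EXT, BTRFS_SHARED_DATA_REF_KEY,  bytenr of the leaf referencing it)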
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back ref is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back ref is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, implicit back refs are used and
 * the fields are filled in as:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back ref is
 * the objectid of the block's owner tree. The key offset for the full
 * back ref is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key
 * and the level of the tree block is required. This information is
 * stored in the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0);
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(root, path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret);

	ret = btrfs_extend_item(trans, root, path, new_size);
	BUG_ON(ret);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf,
path->slots[0], struct btrfs_extent_item); 781 btrfs_set_extent_refs(leaf, item, refs); 782 /* FIXME: get real generation */ 783 btrfs_set_extent_generation(leaf, item, 0); 784 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 785 btrfs_set_extent_flags(leaf, item, 786 BTRFS_EXTENT_FLAG_TREE_BLOCK | 787 BTRFS_BLOCK_FLAG_FULL_BACKREF); 788 bi = (struct btrfs_tree_block_info *)(item + 1); 789 /* FIXME: get first key of the block */ 790 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 791 btrfs_set_tree_block_level(leaf, bi, (int)owner); 792 } else { 793 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 794 } 795 btrfs_mark_buffer_dirty(leaf); 796 return 0; 797 } 798 #endif 799 800 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 801 { 802 u32 high_crc = ~(u32)0; 803 u32 low_crc = ~(u32)0; 804 __le64 lenum; 805 806 lenum = cpu_to_le64(root_objectid); 807 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 808 lenum = cpu_to_le64(owner); 809 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 810 lenum = cpu_to_le64(offset); 811 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 812 813 return ((u64)high_crc << 31) ^ (u64)low_crc; 814 } 815 816 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 817 struct btrfs_extent_data_ref *ref) 818 { 819 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 820 btrfs_extent_data_ref_objectid(leaf, ref), 821 btrfs_extent_data_ref_offset(leaf, ref)); 822 } 823 824 static int match_extent_data_ref(struct extent_buffer *leaf, 825 struct btrfs_extent_data_ref *ref, 826 u64 root_objectid, u64 owner, u64 offset) 827 { 828 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 829 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 830 btrfs_extent_data_ref_offset(leaf, ref) != offset) 831 return 0; 832 return 1; 833 } 834 835 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 836 struct btrfs_root *root, 837 struct btrfs_path *path, 838 u64 bytenr, u64 parent, 839 u64 root_objectid, 840 u64 owner, u64 offset) 841 { 842 struct btrfs_key key; 843 struct btrfs_extent_data_ref *ref; 844 struct extent_buffer *leaf; 845 u32 nritems; 846 int ret; 847 int recow; 848 int err = -ENOENT; 849 850 key.objectid = bytenr; 851 if (parent) { 852 key.type = BTRFS_SHARED_DATA_REF_KEY; 853 key.offset = parent; 854 } else { 855 key.type = BTRFS_EXTENT_DATA_REF_KEY; 856 key.offset = hash_extent_data_ref(root_objectid, 857 owner, offset); 858 } 859 again: 860 recow = 0; 861 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 862 if (ret < 0) { 863 err = ret; 864 goto fail; 865 } 866 867 if (parent) { 868 if (!ret) 869 return 0; 870 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 871 key.type = BTRFS_EXTENT_REF_V0_KEY; 872 btrfs_release_path(root, path); 873 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 874 if (ret < 0) { 875 err = ret; 876 goto fail; 877 } 878 if (!ret) 879 return 0; 880 #endif 881 goto fail; 882 } 883 884 leaf = path->nodes[0]; 885 nritems = btrfs_header_nritems(leaf); 886 while (1) { 887 if (path->slots[0] >= nritems) { 888 ret = btrfs_next_leaf(root, path); 889 if (ret < 0) 890 err = ret; 891 if (ret) 892 goto fail; 893 894 leaf = path->nodes[0]; 895 nritems = btrfs_header_nritems(leaf); 896 recow = 1; 897 } 898 899 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 900 if (key.objectid != bytenr || 901 key.type != BTRFS_EXTENT_DATA_REF_KEY) 902 goto fail; 903 904 ref = btrfs_item_ptr(leaf, path->slots[0], 905 struct btrfs_extent_data_ref); 906 907 if 
(match_extent_data_ref(leaf, ref, root_objectid, 908 owner, offset)) { 909 if (recow) { 910 btrfs_release_path(root, path); 911 goto again; 912 } 913 err = 0; 914 break; 915 } 916 path->slots[0]++; 917 } 918 fail: 919 return err; 920 } 921 922 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 923 struct btrfs_root *root, 924 struct btrfs_path *path, 925 u64 bytenr, u64 parent, 926 u64 root_objectid, u64 owner, 927 u64 offset, int refs_to_add) 928 { 929 struct btrfs_key key; 930 struct extent_buffer *leaf; 931 u32 size; 932 u32 num_refs; 933 int ret; 934 935 key.objectid = bytenr; 936 if (parent) { 937 key.type = BTRFS_SHARED_DATA_REF_KEY; 938 key.offset = parent; 939 size = sizeof(struct btrfs_shared_data_ref); 940 } else { 941 key.type = BTRFS_EXTENT_DATA_REF_KEY; 942 key.offset = hash_extent_data_ref(root_objectid, 943 owner, offset); 944 size = sizeof(struct btrfs_extent_data_ref); 945 } 946 947 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 948 if (ret && ret != -EEXIST) 949 goto fail; 950 951 leaf = path->nodes[0]; 952 if (parent) { 953 struct btrfs_shared_data_ref *ref; 954 ref = btrfs_item_ptr(leaf, path->slots[0], 955 struct btrfs_shared_data_ref); 956 if (ret == 0) { 957 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 958 } else { 959 num_refs = btrfs_shared_data_ref_count(leaf, ref); 960 num_refs += refs_to_add; 961 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 962 } 963 } else { 964 struct btrfs_extent_data_ref *ref; 965 while (ret == -EEXIST) { 966 ref = btrfs_item_ptr(leaf, path->slots[0], 967 struct btrfs_extent_data_ref); 968 if (match_extent_data_ref(leaf, ref, root_objectid, 969 owner, offset)) 970 break; 971 btrfs_release_path(root, path); 972 key.offset++; 973 ret = btrfs_insert_empty_item(trans, root, path, &key, 974 size); 975 if (ret && ret != -EEXIST) 976 goto fail; 977 978 leaf = path->nodes[0]; 979 } 980 ref = btrfs_item_ptr(leaf, path->slots[0], 981 struct btrfs_extent_data_ref); 982 if (ret == 0) { 983 btrfs_set_extent_data_ref_root(leaf, ref, 984 root_objectid); 985 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 986 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 987 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 988 } else { 989 num_refs = btrfs_extent_data_ref_count(leaf, ref); 990 num_refs += refs_to_add; 991 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 992 } 993 } 994 btrfs_mark_buffer_dirty(leaf); 995 ret = 0; 996 fail: 997 btrfs_release_path(root, path); 998 return ret; 999 } 1000 1001 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1002 struct btrfs_root *root, 1003 struct btrfs_path *path, 1004 int refs_to_drop) 1005 { 1006 struct btrfs_key key; 1007 struct btrfs_extent_data_ref *ref1 = NULL; 1008 struct btrfs_shared_data_ref *ref2 = NULL; 1009 struct extent_buffer *leaf; 1010 u32 num_refs = 0; 1011 int ret = 0; 1012 1013 leaf = path->nodes[0]; 1014 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1015 1016 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1017 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1018 struct btrfs_extent_data_ref); 1019 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1020 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1021 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1022 struct btrfs_shared_data_ref); 1023 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1024 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1025 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1026 struct btrfs_extent_ref_v0 *ref0; 1027 ref0 = 
btrfs_item_ptr(leaf, path->slots[0], 1028 struct btrfs_extent_ref_v0); 1029 num_refs = btrfs_ref_count_v0(leaf, ref0); 1030 #endif 1031 } else { 1032 BUG(); 1033 } 1034 1035 BUG_ON(num_refs < refs_to_drop); 1036 num_refs -= refs_to_drop; 1037 1038 if (num_refs == 0) { 1039 ret = btrfs_del_item(trans, root, path); 1040 } else { 1041 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1042 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1043 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1044 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1045 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1046 else { 1047 struct btrfs_extent_ref_v0 *ref0; 1048 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1049 struct btrfs_extent_ref_v0); 1050 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1051 } 1052 #endif 1053 btrfs_mark_buffer_dirty(leaf); 1054 } 1055 return ret; 1056 } 1057 1058 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1059 struct btrfs_path *path, 1060 struct btrfs_extent_inline_ref *iref) 1061 { 1062 struct btrfs_key key; 1063 struct extent_buffer *leaf; 1064 struct btrfs_extent_data_ref *ref1; 1065 struct btrfs_shared_data_ref *ref2; 1066 u32 num_refs = 0; 1067 1068 leaf = path->nodes[0]; 1069 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1070 if (iref) { 1071 if (btrfs_extent_inline_ref_type(leaf, iref) == 1072 BTRFS_EXTENT_DATA_REF_KEY) { 1073 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1074 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1075 } else { 1076 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1077 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1078 } 1079 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1080 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1081 struct btrfs_extent_data_ref); 1082 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1083 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1084 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1085 struct btrfs_shared_data_ref); 1086 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1087 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1088 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1089 struct btrfs_extent_ref_v0 *ref0; 1090 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1091 struct btrfs_extent_ref_v0); 1092 num_refs = btrfs_ref_count_v0(leaf, ref0); 1093 #endif 1094 } else { 1095 WARN_ON(1); 1096 } 1097 return num_refs; 1098 } 1099 1100 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1101 struct btrfs_root *root, 1102 struct btrfs_path *path, 1103 u64 bytenr, u64 parent, 1104 u64 root_objectid) 1105 { 1106 struct btrfs_key key; 1107 int ret; 1108 1109 key.objectid = bytenr; 1110 if (parent) { 1111 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1112 key.offset = parent; 1113 } else { 1114 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1115 key.offset = root_objectid; 1116 } 1117 1118 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1119 if (ret > 0) 1120 ret = -ENOENT; 1121 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1122 if (ret == -ENOENT && parent) { 1123 btrfs_release_path(root, path); 1124 key.type = BTRFS_EXTENT_REF_V0_KEY; 1125 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1126 if (ret > 0) 1127 ret = -ENOENT; 1128 } 1129 #endif 1130 return ret; 1131 } 1132 1133 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1134 struct btrfs_root *root, 1135 struct btrfs_path *path, 1136 u64 bytenr, u64 parent, 1137 u64 root_objectid) 1138 { 1139 struct btrfs_key key; 1140 int ret; 1141 1142 key.objectid = bytenr; 1143 if (parent) { 1144 key.type = 
BTRFS_SHARED_BLOCK_REF_KEY; 1145 key.offset = parent; 1146 } else { 1147 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1148 key.offset = root_objectid; 1149 } 1150 1151 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1152 btrfs_release_path(root, path); 1153 return ret; 1154 } 1155 1156 static inline int extent_ref_type(u64 parent, u64 owner) 1157 { 1158 int type; 1159 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1160 if (parent > 0) 1161 type = BTRFS_SHARED_BLOCK_REF_KEY; 1162 else 1163 type = BTRFS_TREE_BLOCK_REF_KEY; 1164 } else { 1165 if (parent > 0) 1166 type = BTRFS_SHARED_DATA_REF_KEY; 1167 else 1168 type = BTRFS_EXTENT_DATA_REF_KEY; 1169 } 1170 return type; 1171 } 1172 1173 static int find_next_key(struct btrfs_path *path, int level, 1174 struct btrfs_key *key) 1175 1176 { 1177 for (; level < BTRFS_MAX_LEVEL; level++) { 1178 if (!path->nodes[level]) 1179 break; 1180 if (path->slots[level] + 1 >= 1181 btrfs_header_nritems(path->nodes[level])) 1182 continue; 1183 if (level == 0) 1184 btrfs_item_key_to_cpu(path->nodes[level], key, 1185 path->slots[level] + 1); 1186 else 1187 btrfs_node_key_to_cpu(path->nodes[level], key, 1188 path->slots[level] + 1); 1189 return 0; 1190 } 1191 return 1; 1192 } 1193 1194 /* 1195 * look for inline back ref. if back ref is found, *ref_ret is set 1196 * to the address of inline back ref, and 0 is returned. 1197 * 1198 * if back ref isn't found, *ref_ret is set to the address where it 1199 * should be inserted, and -ENOENT is returned. 1200 * 1201 * if insert is true and there are too many inline back refs, the path 1202 * points to the extent item, and -EAGAIN is returned. 1203 * 1204 * NOTE: inline back refs are ordered in the same way that back ref 1205 * items in the tree are ordered. 1206 */ 1207 static noinline_for_stack 1208 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1209 struct btrfs_root *root, 1210 struct btrfs_path *path, 1211 struct btrfs_extent_inline_ref **ref_ret, 1212 u64 bytenr, u64 num_bytes, 1213 u64 parent, u64 root_objectid, 1214 u64 owner, u64 offset, int insert) 1215 { 1216 struct btrfs_key key; 1217 struct extent_buffer *leaf; 1218 struct btrfs_extent_item *ei; 1219 struct btrfs_extent_inline_ref *iref; 1220 u64 flags; 1221 u64 item_size; 1222 unsigned long ptr; 1223 unsigned long end; 1224 int extra_size; 1225 int type; 1226 int want; 1227 int ret; 1228 int err = 0; 1229 1230 key.objectid = bytenr; 1231 key.type = BTRFS_EXTENT_ITEM_KEY; 1232 key.offset = num_bytes; 1233 1234 want = extent_ref_type(parent, owner); 1235 if (insert) { 1236 extra_size = btrfs_extent_inline_ref_size(want); 1237 path->keep_locks = 1; 1238 } else 1239 extra_size = -1; 1240 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1241 if (ret < 0) { 1242 err = ret; 1243 goto out; 1244 } 1245 BUG_ON(ret); 1246 1247 leaf = path->nodes[0]; 1248 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1249 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1250 if (item_size < sizeof(*ei)) { 1251 if (!insert) { 1252 err = -ENOENT; 1253 goto out; 1254 } 1255 ret = convert_extent_item_v0(trans, root, path, owner, 1256 extra_size); 1257 if (ret < 0) { 1258 err = ret; 1259 goto out; 1260 } 1261 leaf = path->nodes[0]; 1262 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1263 } 1264 #endif 1265 BUG_ON(item_size < sizeof(*ei)); 1266 1267 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1268 flags = btrfs_extent_flags(leaf, ei); 1269 1270 ptr = (unsigned long)(ei + 1); 1271 end = (unsigned long)ei + item_size; 1272 1273 if (flags & 
BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1274 ptr += sizeof(struct btrfs_tree_block_info); 1275 BUG_ON(ptr > end); 1276 } else { 1277 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); 1278 } 1279 1280 err = -ENOENT; 1281 while (1) { 1282 if (ptr >= end) { 1283 WARN_ON(ptr > end); 1284 break; 1285 } 1286 iref = (struct btrfs_extent_inline_ref *)ptr; 1287 type = btrfs_extent_inline_ref_type(leaf, iref); 1288 if (want < type) 1289 break; 1290 if (want > type) { 1291 ptr += btrfs_extent_inline_ref_size(type); 1292 continue; 1293 } 1294 1295 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1296 struct btrfs_extent_data_ref *dref; 1297 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1298 if (match_extent_data_ref(leaf, dref, root_objectid, 1299 owner, offset)) { 1300 err = 0; 1301 break; 1302 } 1303 if (hash_extent_data_ref_item(leaf, dref) < 1304 hash_extent_data_ref(root_objectid, owner, offset)) 1305 break; 1306 } else { 1307 u64 ref_offset; 1308 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1309 if (parent > 0) { 1310 if (parent == ref_offset) { 1311 err = 0; 1312 break; 1313 } 1314 if (ref_offset < parent) 1315 break; 1316 } else { 1317 if (root_objectid == ref_offset) { 1318 err = 0; 1319 break; 1320 } 1321 if (ref_offset < root_objectid) 1322 break; 1323 } 1324 } 1325 ptr += btrfs_extent_inline_ref_size(type); 1326 } 1327 if (err == -ENOENT && insert) { 1328 if (item_size + extra_size >= 1329 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1330 err = -EAGAIN; 1331 goto out; 1332 } 1333 /* 1334 * To add new inline back ref, we have to make sure 1335 * there is no corresponding back ref item. 1336 * For simplicity, we just do not add new inline back 1337 * ref if there is any kind of item for this block 1338 */ 1339 if (find_next_key(path, 0, &key) == 0 && 1340 key.objectid == bytenr && 1341 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1342 err = -EAGAIN; 1343 goto out; 1344 } 1345 } 1346 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1347 out: 1348 if (insert) { 1349 path->keep_locks = 0; 1350 btrfs_unlock_up_safe(path, 1); 1351 } 1352 return err; 1353 } 1354 1355 /* 1356 * helper to add new inline back ref 1357 */ 1358 static noinline_for_stack 1359 int setup_inline_extent_backref(struct btrfs_trans_handle *trans, 1360 struct btrfs_root *root, 1361 struct btrfs_path *path, 1362 struct btrfs_extent_inline_ref *iref, 1363 u64 parent, u64 root_objectid, 1364 u64 owner, u64 offset, int refs_to_add, 1365 struct btrfs_delayed_extent_op *extent_op) 1366 { 1367 struct extent_buffer *leaf; 1368 struct btrfs_extent_item *ei; 1369 unsigned long ptr; 1370 unsigned long end; 1371 unsigned long item_offset; 1372 u64 refs; 1373 int size; 1374 int type; 1375 int ret; 1376 1377 leaf = path->nodes[0]; 1378 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1379 item_offset = (unsigned long)iref - (unsigned long)ei; 1380 1381 type = extent_ref_type(parent, owner); 1382 size = btrfs_extent_inline_ref_size(type); 1383 1384 ret = btrfs_extend_item(trans, root, path, size); 1385 BUG_ON(ret); 1386 1387 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1388 refs = btrfs_extent_refs(leaf, ei); 1389 refs += refs_to_add; 1390 btrfs_set_extent_refs(leaf, ei, refs); 1391 if (extent_op) 1392 __run_delayed_extent_op(extent_op, leaf, ei); 1393 1394 ptr = (unsigned long)ei + item_offset; 1395 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1396 if (ptr < end - size) 1397 memmove_extent_buffer(leaf, ptr + size, ptr, 1398 end - size - ptr); 1399 1400 iref = (struct 
btrfs_extent_inline_ref *)ptr; 1401 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1402 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1403 struct btrfs_extent_data_ref *dref; 1404 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1405 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1406 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1407 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1408 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1409 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1410 struct btrfs_shared_data_ref *sref; 1411 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1412 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1413 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1414 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1415 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1416 } else { 1417 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1418 } 1419 btrfs_mark_buffer_dirty(leaf); 1420 return 0; 1421 } 1422 1423 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1424 struct btrfs_root *root, 1425 struct btrfs_path *path, 1426 struct btrfs_extent_inline_ref **ref_ret, 1427 u64 bytenr, u64 num_bytes, u64 parent, 1428 u64 root_objectid, u64 owner, u64 offset) 1429 { 1430 int ret; 1431 1432 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1433 bytenr, num_bytes, parent, 1434 root_objectid, owner, offset, 0); 1435 if (ret != -ENOENT) 1436 return ret; 1437 1438 btrfs_release_path(root, path); 1439 *ref_ret = NULL; 1440 1441 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1442 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1443 root_objectid); 1444 } else { 1445 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, 1446 root_objectid, owner, offset); 1447 } 1448 return ret; 1449 } 1450 1451 /* 1452 * helper to update/remove inline back ref 1453 */ 1454 static noinline_for_stack 1455 int update_inline_extent_backref(struct btrfs_trans_handle *trans, 1456 struct btrfs_root *root, 1457 struct btrfs_path *path, 1458 struct btrfs_extent_inline_ref *iref, 1459 int refs_to_mod, 1460 struct btrfs_delayed_extent_op *extent_op) 1461 { 1462 struct extent_buffer *leaf; 1463 struct btrfs_extent_item *ei; 1464 struct btrfs_extent_data_ref *dref = NULL; 1465 struct btrfs_shared_data_ref *sref = NULL; 1466 unsigned long ptr; 1467 unsigned long end; 1468 u32 item_size; 1469 int size; 1470 int type; 1471 int ret; 1472 u64 refs; 1473 1474 leaf = path->nodes[0]; 1475 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1476 refs = btrfs_extent_refs(leaf, ei); 1477 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1478 refs += refs_to_mod; 1479 btrfs_set_extent_refs(leaf, ei, refs); 1480 if (extent_op) 1481 __run_delayed_extent_op(extent_op, leaf, ei); 1482 1483 type = btrfs_extent_inline_ref_type(leaf, iref); 1484 1485 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1486 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1487 refs = btrfs_extent_data_ref_count(leaf, dref); 1488 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1489 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1490 refs = btrfs_shared_data_ref_count(leaf, sref); 1491 } else { 1492 refs = 1; 1493 BUG_ON(refs_to_mod != -1); 1494 } 1495 1496 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1497 refs += refs_to_mod; 1498 1499 if (refs > 0) { 1500 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1501 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1502 else 1503 
btrfs_set_shared_data_ref_count(leaf, sref, refs); 1504 } else { 1505 size = btrfs_extent_inline_ref_size(type); 1506 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1507 ptr = (unsigned long)iref; 1508 end = (unsigned long)ei + item_size; 1509 if (ptr + size < end) 1510 memmove_extent_buffer(leaf, ptr, ptr + size, 1511 end - ptr - size); 1512 item_size -= size; 1513 ret = btrfs_truncate_item(trans, root, path, item_size, 1); 1514 BUG_ON(ret); 1515 } 1516 btrfs_mark_buffer_dirty(leaf); 1517 return 0; 1518 } 1519 1520 static noinline_for_stack 1521 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1522 struct btrfs_root *root, 1523 struct btrfs_path *path, 1524 u64 bytenr, u64 num_bytes, u64 parent, 1525 u64 root_objectid, u64 owner, 1526 u64 offset, int refs_to_add, 1527 struct btrfs_delayed_extent_op *extent_op) 1528 { 1529 struct btrfs_extent_inline_ref *iref; 1530 int ret; 1531 1532 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1533 bytenr, num_bytes, parent, 1534 root_objectid, owner, offset, 1); 1535 if (ret == 0) { 1536 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1537 ret = update_inline_extent_backref(trans, root, path, iref, 1538 refs_to_add, extent_op); 1539 } else if (ret == -ENOENT) { 1540 ret = setup_inline_extent_backref(trans, root, path, iref, 1541 parent, root_objectid, 1542 owner, offset, refs_to_add, 1543 extent_op); 1544 } 1545 return ret; 1546 } 1547 1548 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1549 struct btrfs_root *root, 1550 struct btrfs_path *path, 1551 u64 bytenr, u64 parent, u64 root_objectid, 1552 u64 owner, u64 offset, int refs_to_add) 1553 { 1554 int ret; 1555 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1556 BUG_ON(refs_to_add != 1); 1557 ret = insert_tree_block_ref(trans, root, path, bytenr, 1558 parent, root_objectid); 1559 } else { 1560 ret = insert_extent_data_ref(trans, root, path, bytenr, 1561 parent, root_objectid, 1562 owner, offset, refs_to_add); 1563 } 1564 return ret; 1565 } 1566 1567 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1568 struct btrfs_root *root, 1569 struct btrfs_path *path, 1570 struct btrfs_extent_inline_ref *iref, 1571 int refs_to_drop, int is_data) 1572 { 1573 int ret; 1574 1575 BUG_ON(!is_data && refs_to_drop != 1); 1576 if (iref) { 1577 ret = update_inline_extent_backref(trans, root, path, iref, 1578 -refs_to_drop, NULL); 1579 } else if (is_data) { 1580 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1581 } else { 1582 ret = btrfs_del_item(trans, root, path); 1583 } 1584 return ret; 1585 } 1586 1587 static void btrfs_issue_discard(struct block_device *bdev, 1588 u64 start, u64 len) 1589 { 1590 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1591 DISCARD_FL_BARRIER); 1592 } 1593 1594 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1595 u64 num_bytes) 1596 { 1597 int ret; 1598 u64 map_length = num_bytes; 1599 struct btrfs_multi_bio *multi = NULL; 1600 1601 if (!btrfs_test_opt(root, DISCARD)) 1602 return 0; 1603 1604 /* Tell the block device(s) that the sectors can be discarded */ 1605 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1606 bytenr, &map_length, &multi, 0); 1607 if (!ret) { 1608 struct btrfs_bio_stripe *stripe = multi->stripes; 1609 int i; 1610 1611 if (map_length > num_bytes) 1612 map_length = num_bytes; 1613 1614 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1615 btrfs_issue_discard(stripe->dev->bdev, 1616 stripe->physical, 1617 map_length); 1618 } 1619 kfree(multi); 1620 
} 1621 1622 return ret; 1623 } 1624 1625 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1626 struct btrfs_root *root, 1627 u64 bytenr, u64 num_bytes, u64 parent, 1628 u64 root_objectid, u64 owner, u64 offset) 1629 { 1630 int ret; 1631 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1632 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1633 1634 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1635 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1636 parent, root_objectid, (int)owner, 1637 BTRFS_ADD_DELAYED_REF, NULL); 1638 } else { 1639 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1640 parent, root_objectid, owner, offset, 1641 BTRFS_ADD_DELAYED_REF, NULL); 1642 } 1643 return ret; 1644 } 1645 1646 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1647 struct btrfs_root *root, 1648 u64 bytenr, u64 num_bytes, 1649 u64 parent, u64 root_objectid, 1650 u64 owner, u64 offset, int refs_to_add, 1651 struct btrfs_delayed_extent_op *extent_op) 1652 { 1653 struct btrfs_path *path; 1654 struct extent_buffer *leaf; 1655 struct btrfs_extent_item *item; 1656 u64 refs; 1657 int ret; 1658 int err = 0; 1659 1660 path = btrfs_alloc_path(); 1661 if (!path) 1662 return -ENOMEM; 1663 1664 path->reada = 1; 1665 path->leave_spinning = 1; 1666 /* this will setup the path even if it fails to insert the back ref */ 1667 ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, 1668 path, bytenr, num_bytes, parent, 1669 root_objectid, owner, offset, 1670 refs_to_add, extent_op); 1671 if (ret == 0) 1672 goto out; 1673 1674 if (ret != -EAGAIN) { 1675 err = ret; 1676 goto out; 1677 } 1678 1679 leaf = path->nodes[0]; 1680 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1681 refs = btrfs_extent_refs(leaf, item); 1682 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1683 if (extent_op) 1684 __run_delayed_extent_op(extent_op, leaf, item); 1685 1686 btrfs_mark_buffer_dirty(leaf); 1687 btrfs_release_path(root->fs_info->extent_root, path); 1688 1689 path->reada = 1; 1690 path->leave_spinning = 1; 1691 1692 /* now insert the actual backref */ 1693 ret = insert_extent_backref(trans, root->fs_info->extent_root, 1694 path, bytenr, parent, root_objectid, 1695 owner, offset, refs_to_add); 1696 BUG_ON(ret); 1697 out: 1698 btrfs_free_path(path); 1699 return err; 1700 } 1701 1702 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 1703 struct btrfs_root *root, 1704 struct btrfs_delayed_ref_node *node, 1705 struct btrfs_delayed_extent_op *extent_op, 1706 int insert_reserved) 1707 { 1708 int ret = 0; 1709 struct btrfs_delayed_data_ref *ref; 1710 struct btrfs_key ins; 1711 u64 parent = 0; 1712 u64 ref_root = 0; 1713 u64 flags = 0; 1714 1715 ins.objectid = node->bytenr; 1716 ins.offset = node->num_bytes; 1717 ins.type = BTRFS_EXTENT_ITEM_KEY; 1718 1719 ref = btrfs_delayed_node_to_data_ref(node); 1720 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 1721 parent = ref->parent; 1722 else 1723 ref_root = ref->root; 1724 1725 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1726 if (extent_op) { 1727 BUG_ON(extent_op->update_key); 1728 flags |= extent_op->flags_to_set; 1729 } 1730 ret = alloc_reserved_file_extent(trans, root, 1731 parent, ref_root, flags, 1732 ref->objectid, ref->offset, 1733 &ins, node->ref_mod); 1734 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1735 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1736 node->num_bytes, parent, 1737 ref_root, ref->objectid, 1738 ref->offset, node->ref_mod, 1739 extent_op); 1740 
} else if (node->action == BTRFS_DROP_DELAYED_REF) { 1741 ret = __btrfs_free_extent(trans, root, node->bytenr, 1742 node->num_bytes, parent, 1743 ref_root, ref->objectid, 1744 ref->offset, node->ref_mod, 1745 extent_op); 1746 } else { 1747 BUG(); 1748 } 1749 return ret; 1750 } 1751 1752 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 1753 struct extent_buffer *leaf, 1754 struct btrfs_extent_item *ei) 1755 { 1756 u64 flags = btrfs_extent_flags(leaf, ei); 1757 if (extent_op->update_flags) { 1758 flags |= extent_op->flags_to_set; 1759 btrfs_set_extent_flags(leaf, ei, flags); 1760 } 1761 1762 if (extent_op->update_key) { 1763 struct btrfs_tree_block_info *bi; 1764 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 1765 bi = (struct btrfs_tree_block_info *)(ei + 1); 1766 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 1767 } 1768 } 1769 1770 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 1771 struct btrfs_root *root, 1772 struct btrfs_delayed_ref_node *node, 1773 struct btrfs_delayed_extent_op *extent_op) 1774 { 1775 struct btrfs_key key; 1776 struct btrfs_path *path; 1777 struct btrfs_extent_item *ei; 1778 struct extent_buffer *leaf; 1779 u32 item_size; 1780 int ret; 1781 int err = 0; 1782 1783 path = btrfs_alloc_path(); 1784 if (!path) 1785 return -ENOMEM; 1786 1787 key.objectid = node->bytenr; 1788 key.type = BTRFS_EXTENT_ITEM_KEY; 1789 key.offset = node->num_bytes; 1790 1791 path->reada = 1; 1792 path->leave_spinning = 1; 1793 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 1794 path, 0, 1); 1795 if (ret < 0) { 1796 err = ret; 1797 goto out; 1798 } 1799 if (ret > 0) { 1800 err = -EIO; 1801 goto out; 1802 } 1803 1804 leaf = path->nodes[0]; 1805 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1806 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1807 if (item_size < sizeof(*ei)) { 1808 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 1809 path, (u64)-1, 0); 1810 if (ret < 0) { 1811 err = ret; 1812 goto out; 1813 } 1814 leaf = path->nodes[0]; 1815 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1816 } 1817 #endif 1818 BUG_ON(item_size < sizeof(*ei)); 1819 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1820 __run_delayed_extent_op(extent_op, leaf, ei); 1821 1822 btrfs_mark_buffer_dirty(leaf); 1823 out: 1824 btrfs_free_path(path); 1825 return err; 1826 } 1827 1828 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 1829 struct btrfs_root *root, 1830 struct btrfs_delayed_ref_node *node, 1831 struct btrfs_delayed_extent_op *extent_op, 1832 int insert_reserved) 1833 { 1834 int ret = 0; 1835 struct btrfs_delayed_tree_ref *ref; 1836 struct btrfs_key ins; 1837 u64 parent = 0; 1838 u64 ref_root = 0; 1839 1840 ins.objectid = node->bytenr; 1841 ins.offset = node->num_bytes; 1842 ins.type = BTRFS_EXTENT_ITEM_KEY; 1843 1844 ref = btrfs_delayed_node_to_tree_ref(node); 1845 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1846 parent = ref->parent; 1847 else 1848 ref_root = ref->root; 1849 1850 BUG_ON(node->ref_mod != 1); 1851 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1852 BUG_ON(!extent_op || !extent_op->update_flags || 1853 !extent_op->update_key); 1854 ret = alloc_reserved_tree_block(trans, root, 1855 parent, ref_root, 1856 extent_op->flags_to_set, 1857 &extent_op->key, 1858 ref->level, &ins); 1859 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1860 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1861 node->num_bytes, parent, ref_root, 1862 ref->level, 0, 
1, extent_op); 1863 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1864 ret = __btrfs_free_extent(trans, root, node->bytenr, 1865 node->num_bytes, parent, ref_root, 1866 ref->level, 0, 1, extent_op); 1867 } else { 1868 BUG(); 1869 } 1870 return ret; 1871 } 1872 1873 1874 /* helper function to actually process a single delayed ref entry */ 1875 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1876 struct btrfs_root *root, 1877 struct btrfs_delayed_ref_node *node, 1878 struct btrfs_delayed_extent_op *extent_op, 1879 int insert_reserved) 1880 { 1881 int ret; 1882 if (btrfs_delayed_ref_is_head(node)) { 1883 struct btrfs_delayed_ref_head *head; 1884 /* 1885 * we've hit the end of the chain and we were supposed 1886 * to insert this extent into the tree. But, it got 1887 * deleted before we ever needed to insert it, so all 1888 * we have to do is clean up the accounting 1889 */ 1890 BUG_ON(extent_op); 1891 head = btrfs_delayed_node_to_head(node); 1892 if (insert_reserved) { 1893 int mark_free = 0; 1894 struct extent_buffer *must_clean = NULL; 1895 1896 ret = pin_down_bytes(trans, root, NULL, 1897 node->bytenr, node->num_bytes, 1898 head->is_data, 1, &must_clean); 1899 if (ret > 0) 1900 mark_free = 1; 1901 1902 if (must_clean) { 1903 clean_tree_block(NULL, root, must_clean); 1904 btrfs_tree_unlock(must_clean); 1905 free_extent_buffer(must_clean); 1906 } 1907 if (head->is_data) { 1908 ret = btrfs_del_csums(trans, root, 1909 node->bytenr, 1910 node->num_bytes); 1911 BUG_ON(ret); 1912 } 1913 if (mark_free) { 1914 ret = btrfs_free_reserved_extent(root, 1915 node->bytenr, 1916 node->num_bytes); 1917 BUG_ON(ret); 1918 } 1919 } 1920 mutex_unlock(&head->mutex); 1921 return 0; 1922 } 1923 1924 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1925 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1926 ret = run_delayed_tree_ref(trans, root, node, extent_op, 1927 insert_reserved); 1928 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1929 node->type == BTRFS_SHARED_DATA_REF_KEY) 1930 ret = run_delayed_data_ref(trans, root, node, extent_op, 1931 insert_reserved); 1932 else 1933 BUG(); 1934 return ret; 1935 } 1936 1937 static noinline struct btrfs_delayed_ref_node * 1938 select_delayed_ref(struct btrfs_delayed_ref_head *head) 1939 { 1940 struct rb_node *node; 1941 struct btrfs_delayed_ref_node *ref; 1942 int action = BTRFS_ADD_DELAYED_REF; 1943 again: 1944 /* 1945 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 1946 * this prevents ref count from going down to zero when 1947 * there still are pending delayed ref. 
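	 * (all delayed refs for a given bytenr sort next to their head
	 * node in the rbtree, so we simply walk backwards with rb_prev()
	 * and stop once the bytenr changes)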
1948 */ 1949 node = rb_prev(&head->node.rb_node); 1950 while (1) { 1951 if (!node) 1952 break; 1953 ref = rb_entry(node, struct btrfs_delayed_ref_node, 1954 rb_node); 1955 if (ref->bytenr != head->node.bytenr) 1956 break; 1957 if (ref->action == action) 1958 return ref; 1959 node = rb_prev(node); 1960 } 1961 if (action == BTRFS_ADD_DELAYED_REF) { 1962 action = BTRFS_DROP_DELAYED_REF; 1963 goto again; 1964 } 1965 return NULL; 1966 } 1967 1968 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 1969 struct btrfs_root *root, 1970 struct list_head *cluster) 1971 { 1972 struct btrfs_delayed_ref_root *delayed_refs; 1973 struct btrfs_delayed_ref_node *ref; 1974 struct btrfs_delayed_ref_head *locked_ref = NULL; 1975 struct btrfs_delayed_extent_op *extent_op; 1976 int ret; 1977 int count = 0; 1978 int must_insert_reserved = 0; 1979 1980 delayed_refs = &trans->transaction->delayed_refs; 1981 while (1) { 1982 if (!locked_ref) { 1983 /* pick a new head ref from the cluster list */ 1984 if (list_empty(cluster)) 1985 break; 1986 1987 locked_ref = list_entry(cluster->next, 1988 struct btrfs_delayed_ref_head, cluster); 1989 1990 /* grab the lock that says we are going to process 1991 * all the refs for this head */ 1992 ret = btrfs_delayed_ref_lock(trans, locked_ref); 1993 1994 /* 1995 * we may have dropped the spin lock to get the head 1996 * mutex lock, and that might have given someone else 1997 * time to free the head. If that's true, it has been 1998 * removed from our list and we can move on. 1999 */ 2000 if (ret == -EAGAIN) { 2001 locked_ref = NULL; 2002 count++; 2003 continue; 2004 } 2005 } 2006 2007 /* 2008 * record the must insert reserved flag before we 2009 * drop the spin lock. 2010 */ 2011 must_insert_reserved = locked_ref->must_insert_reserved; 2012 locked_ref->must_insert_reserved = 0; 2013 2014 extent_op = locked_ref->extent_op; 2015 locked_ref->extent_op = NULL; 2016 2017 /* 2018 * locked_ref is the head node, so we have to go one 2019 * node back for any delayed ref updates 2020 */ 2021 ref = select_delayed_ref(locked_ref); 2022 if (!ref) { 2023 /* All delayed refs have been processed, Go ahead 2024 * and send the head node to run_one_delayed_ref, 2025 * so that any accounting fixes can happen 2026 */ 2027 ref = &locked_ref->node; 2028 2029 if (extent_op && must_insert_reserved) { 2030 kfree(extent_op); 2031 extent_op = NULL; 2032 } 2033 2034 if (extent_op) { 2035 spin_unlock(&delayed_refs->lock); 2036 2037 ret = run_delayed_extent_op(trans, root, 2038 ref, extent_op); 2039 BUG_ON(ret); 2040 kfree(extent_op); 2041 2042 cond_resched(); 2043 spin_lock(&delayed_refs->lock); 2044 continue; 2045 } 2046 2047 list_del_init(&locked_ref->cluster); 2048 locked_ref = NULL; 2049 } 2050 2051 ref->in_tree = 0; 2052 rb_erase(&ref->rb_node, &delayed_refs->root); 2053 delayed_refs->num_entries--; 2054 2055 spin_unlock(&delayed_refs->lock); 2056 2057 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2058 must_insert_reserved); 2059 BUG_ON(ret); 2060 2061 btrfs_put_delayed_ref(ref); 2062 kfree(extent_op); 2063 count++; 2064 2065 cond_resched(); 2066 spin_lock(&delayed_refs->lock); 2067 } 2068 return count; 2069 } 2070 2071 /* 2072 * this starts processing the delayed reference count updates and 2073 * extent insertions we have queued up so far. count can be 2074 * 0, which means to process everything in the tree at the start 2075 * of the run (but not newly added entries), or it can be some target 2076 * number you'd like to process. 
2077 */ 2078 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2079 struct btrfs_root *root, unsigned long count) 2080 { 2081 struct rb_node *node; 2082 struct btrfs_delayed_ref_root *delayed_refs; 2083 struct btrfs_delayed_ref_node *ref; 2084 struct list_head cluster; 2085 int ret; 2086 int run_all = count == (unsigned long)-1; 2087 int run_most = 0; 2088 2089 if (root == root->fs_info->extent_root) 2090 root = root->fs_info->tree_root; 2091 2092 delayed_refs = &trans->transaction->delayed_refs; 2093 INIT_LIST_HEAD(&cluster); 2094 again: 2095 spin_lock(&delayed_refs->lock); 2096 if (count == 0) { 2097 count = delayed_refs->num_entries * 2; 2098 run_most = 1; 2099 } 2100 while (1) { 2101 if (!(run_all || run_most) && 2102 delayed_refs->num_heads_ready < 64) 2103 break; 2104 2105 /* 2106 * go find something we can process in the rbtree. We start at 2107 * the beginning of the tree, and then build a cluster 2108 * of refs to process starting at the first one we are able to 2109 * lock 2110 */ 2111 ret = btrfs_find_ref_cluster(trans, &cluster, 2112 delayed_refs->run_delayed_start); 2113 if (ret) 2114 break; 2115 2116 ret = run_clustered_refs(trans, root, &cluster); 2117 BUG_ON(ret < 0); 2118 2119 count -= min_t(unsigned long, ret, count); 2120 2121 if (count == 0) 2122 break; 2123 } 2124 2125 if (run_all) { 2126 node = rb_first(&delayed_refs->root); 2127 if (!node) 2128 goto out; 2129 count = (unsigned long)-1; 2130 2131 while (node) { 2132 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2133 rb_node); 2134 if (btrfs_delayed_ref_is_head(ref)) { 2135 struct btrfs_delayed_ref_head *head; 2136 2137 head = btrfs_delayed_node_to_head(ref); 2138 atomic_inc(&ref->refs); 2139 2140 spin_unlock(&delayed_refs->lock); 2141 mutex_lock(&head->mutex); 2142 mutex_unlock(&head->mutex); 2143 2144 btrfs_put_delayed_ref(ref); 2145 cond_resched(); 2146 goto again; 2147 } 2148 node = rb_next(node); 2149 } 2150 spin_unlock(&delayed_refs->lock); 2151 schedule_timeout(1); 2152 goto again; 2153 } 2154 out: 2155 spin_unlock(&delayed_refs->lock); 2156 return 0; 2157 } 2158 2159 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2160 struct btrfs_root *root, 2161 u64 bytenr, u64 num_bytes, u64 flags, 2162 int is_data) 2163 { 2164 struct btrfs_delayed_extent_op *extent_op; 2165 int ret; 2166 2167 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2168 if (!extent_op) 2169 return -ENOMEM; 2170 2171 extent_op->flags_to_set = flags; 2172 extent_op->update_flags = 1; 2173 extent_op->update_key = 0; 2174 extent_op->is_data = is_data ? 
1 : 0; 2175 2176 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2177 if (ret) 2178 kfree(extent_op); 2179 return ret; 2180 } 2181 2182 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2183 struct btrfs_root *root, 2184 struct btrfs_path *path, 2185 u64 objectid, u64 offset, u64 bytenr) 2186 { 2187 struct btrfs_delayed_ref_head *head; 2188 struct btrfs_delayed_ref_node *ref; 2189 struct btrfs_delayed_data_ref *data_ref; 2190 struct btrfs_delayed_ref_root *delayed_refs; 2191 struct rb_node *node; 2192 int ret = 0; 2193 2194 ret = -ENOENT; 2195 delayed_refs = &trans->transaction->delayed_refs; 2196 spin_lock(&delayed_refs->lock); 2197 head = btrfs_find_delayed_ref_head(trans, bytenr); 2198 if (!head) 2199 goto out; 2200 2201 if (!mutex_trylock(&head->mutex)) { 2202 atomic_inc(&head->node.refs); 2203 spin_unlock(&delayed_refs->lock); 2204 2205 btrfs_release_path(root->fs_info->extent_root, path); 2206 2207 mutex_lock(&head->mutex); 2208 mutex_unlock(&head->mutex); 2209 btrfs_put_delayed_ref(&head->node); 2210 return -EAGAIN; 2211 } 2212 2213 node = rb_prev(&head->node.rb_node); 2214 if (!node) 2215 goto out_unlock; 2216 2217 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2218 2219 if (ref->bytenr != bytenr) 2220 goto out_unlock; 2221 2222 ret = 1; 2223 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2224 goto out_unlock; 2225 2226 data_ref = btrfs_delayed_node_to_data_ref(ref); 2227 2228 node = rb_prev(node); 2229 if (node) { 2230 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2231 if (ref->bytenr == bytenr) 2232 goto out_unlock; 2233 } 2234 2235 if (data_ref->root != root->root_key.objectid || 2236 data_ref->objectid != objectid || data_ref->offset != offset) 2237 goto out_unlock; 2238 2239 ret = 0; 2240 out_unlock: 2241 mutex_unlock(&head->mutex); 2242 out: 2243 spin_unlock(&delayed_refs->lock); 2244 return ret; 2245 } 2246 2247 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2248 struct btrfs_root *root, 2249 struct btrfs_path *path, 2250 u64 objectid, u64 offset, u64 bytenr) 2251 { 2252 struct btrfs_root *extent_root = root->fs_info->extent_root; 2253 struct extent_buffer *leaf; 2254 struct btrfs_extent_data_ref *ref; 2255 struct btrfs_extent_inline_ref *iref; 2256 struct btrfs_extent_item *ei; 2257 struct btrfs_key key; 2258 u32 item_size; 2259 int ret; 2260 2261 key.objectid = bytenr; 2262 key.offset = (u64)-1; 2263 key.type = BTRFS_EXTENT_ITEM_KEY; 2264 2265 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2266 if (ret < 0) 2267 goto out; 2268 BUG_ON(ret == 0); 2269 2270 ret = -ENOENT; 2271 if (path->slots[0] == 0) 2272 goto out; 2273 2274 path->slots[0]--; 2275 leaf = path->nodes[0]; 2276 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2277 2278 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2279 goto out; 2280 2281 ret = 1; 2282 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2283 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2284 if (item_size < sizeof(*ei)) { 2285 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2286 goto out; 2287 } 2288 #endif 2289 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2290 2291 if (item_size != sizeof(*ei) + 2292 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2293 goto out; 2294 2295 if (btrfs_extent_generation(leaf, ei) <= 2296 btrfs_root_last_snapshot(&root->root_item)) 2297 goto out; 2298 2299 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2300 if 
(btrfs_extent_inline_ref_type(leaf, iref) != 2301 BTRFS_EXTENT_DATA_REF_KEY) 2302 goto out; 2303 2304 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2305 if (btrfs_extent_refs(leaf, ei) != 2306 btrfs_extent_data_ref_count(leaf, ref) || 2307 btrfs_extent_data_ref_root(leaf, ref) != 2308 root->root_key.objectid || 2309 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2310 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2311 goto out; 2312 2313 ret = 0; 2314 out: 2315 return ret; 2316 } 2317 2318 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2319 struct btrfs_root *root, 2320 u64 objectid, u64 offset, u64 bytenr) 2321 { 2322 struct btrfs_path *path; 2323 int ret; 2324 int ret2; 2325 2326 path = btrfs_alloc_path(); 2327 if (!path) 2328 return -ENOENT; 2329 2330 do { 2331 ret = check_committed_ref(trans, root, path, objectid, 2332 offset, bytenr); 2333 if (ret && ret != -ENOENT) 2334 goto out; 2335 2336 ret2 = check_delayed_ref(trans, root, path, objectid, 2337 offset, bytenr); 2338 } while (ret2 == -EAGAIN); 2339 2340 if (ret2 && ret2 != -ENOENT) { 2341 ret = ret2; 2342 goto out; 2343 } 2344 2345 if (ret != -ENOENT || ret2 != -ENOENT) 2346 ret = 0; 2347 out: 2348 btrfs_free_path(path); 2349 return ret; 2350 } 2351 2352 #if 0 2353 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2354 struct extent_buffer *buf, u32 nr_extents) 2355 { 2356 struct btrfs_key key; 2357 struct btrfs_file_extent_item *fi; 2358 u64 root_gen; 2359 u32 nritems; 2360 int i; 2361 int level; 2362 int ret = 0; 2363 int shared = 0; 2364 2365 if (!root->ref_cows) 2366 return 0; 2367 2368 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 2369 shared = 0; 2370 root_gen = root->root_key.offset; 2371 } else { 2372 shared = 1; 2373 root_gen = trans->transid - 1; 2374 } 2375 2376 level = btrfs_header_level(buf); 2377 nritems = btrfs_header_nritems(buf); 2378 2379 if (level == 0) { 2380 struct btrfs_leaf_ref *ref; 2381 struct btrfs_extent_info *info; 2382 2383 ref = btrfs_alloc_leaf_ref(root, nr_extents); 2384 if (!ref) { 2385 ret = -ENOMEM; 2386 goto out; 2387 } 2388 2389 ref->root_gen = root_gen; 2390 ref->bytenr = buf->start; 2391 ref->owner = btrfs_header_owner(buf); 2392 ref->generation = btrfs_header_generation(buf); 2393 ref->nritems = nr_extents; 2394 info = ref->extents; 2395 2396 for (i = 0; nr_extents > 0 && i < nritems; i++) { 2397 u64 disk_bytenr; 2398 btrfs_item_key_to_cpu(buf, &key, i); 2399 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2400 continue; 2401 fi = btrfs_item_ptr(buf, i, 2402 struct btrfs_file_extent_item); 2403 if (btrfs_file_extent_type(buf, fi) == 2404 BTRFS_FILE_EXTENT_INLINE) 2405 continue; 2406 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2407 if (disk_bytenr == 0) 2408 continue; 2409 2410 info->bytenr = disk_bytenr; 2411 info->num_bytes = 2412 btrfs_file_extent_disk_num_bytes(buf, fi); 2413 info->objectid = key.objectid; 2414 info->offset = key.offset; 2415 info++; 2416 } 2417 2418 ret = btrfs_add_leaf_ref(root, ref, shared); 2419 if (ret == -EEXIST && shared) { 2420 struct btrfs_leaf_ref *old; 2421 old = btrfs_lookup_leaf_ref(root, ref->bytenr); 2422 BUG_ON(!old); 2423 btrfs_remove_leaf_ref(root, old); 2424 btrfs_free_leaf_ref(root, old); 2425 ret = btrfs_add_leaf_ref(root, ref, shared); 2426 } 2427 WARN_ON(ret); 2428 btrfs_free_leaf_ref(root, ref); 2429 } 2430 out: 2431 return ret; 2432 } 2433 2434 /* when a block goes through cow, we update the reference counts of 2435 * everything that block points to. 
The internal pointers of the block 2436 * can be in just about any order, and it is likely to have clusters of 2437 * things that are close together and clusters of things that are not. 2438 * 2439 * To help reduce the seeks that come with updating all of these reference 2440 * counts, sort them by byte number before actual updates are done. 2441 * 2442 * struct refsort is used to match byte number to slot in the btree block. 2443 * we sort based on the byte number and then use the slot to actually 2444 * find the item. 2445 * 2446 * struct refsort is smaller than struct btrfs_item and smaller than 2447 * struct btrfs_key_ptr. Since we're currently limited to the page size 2448 * for a btree block, there's no way for a kmalloc of refsorts for a 2449 * single node to be bigger than a page. 2450 */ 2451 struct refsort { 2452 u64 bytenr; 2453 u32 slot; 2454 }; 2455 2456 /* 2457 * for passing into sort() 2458 */ 2459 static int refsort_cmp(const void *a_void, const void *b_void) 2460 { 2461 const struct refsort *a = a_void; 2462 const struct refsort *b = b_void; 2463 2464 if (a->bytenr < b->bytenr) 2465 return -1; 2466 if (a->bytenr > b->bytenr) 2467 return 1; 2468 return 0; 2469 } 2470 #endif 2471 2472 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2473 struct btrfs_root *root, 2474 struct extent_buffer *buf, 2475 int full_backref, int inc) 2476 { 2477 u64 bytenr; 2478 u64 num_bytes; 2479 u64 parent; 2480 u64 ref_root; 2481 u32 nritems; 2482 struct btrfs_key key; 2483 struct btrfs_file_extent_item *fi; 2484 int i; 2485 int level; 2486 int ret = 0; 2487 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2488 u64, u64, u64, u64, u64, u64); 2489 2490 ref_root = btrfs_header_owner(buf); 2491 nritems = btrfs_header_nritems(buf); 2492 level = btrfs_header_level(buf); 2493 2494 if (!root->ref_cows && level == 0) 2495 return 0; 2496 2497 if (inc) 2498 process_func = btrfs_inc_extent_ref; 2499 else 2500 process_func = btrfs_free_extent; 2501 2502 if (full_backref) 2503 parent = buf->start; 2504 else 2505 parent = 0; 2506 2507 for (i = 0; i < nritems; i++) { 2508 if (level == 0) { 2509 btrfs_item_key_to_cpu(buf, &key, i); 2510 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2511 continue; 2512 fi = btrfs_item_ptr(buf, i, 2513 struct btrfs_file_extent_item); 2514 if (btrfs_file_extent_type(buf, fi) == 2515 BTRFS_FILE_EXTENT_INLINE) 2516 continue; 2517 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2518 if (bytenr == 0) 2519 continue; 2520 2521 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2522 key.offset -= btrfs_file_extent_offset(buf, fi); 2523 ret = process_func(trans, root, bytenr, num_bytes, 2524 parent, ref_root, key.objectid, 2525 key.offset); 2526 if (ret) 2527 goto fail; 2528 } else { 2529 bytenr = btrfs_node_blockptr(buf, i); 2530 num_bytes = btrfs_level_size(root, level - 1); 2531 ret = process_func(trans, root, bytenr, num_bytes, 2532 parent, ref_root, level - 1, 0); 2533 if (ret) 2534 goto fail; 2535 } 2536 } 2537 return 0; 2538 fail: 2539 BUG(); 2540 return ret; 2541 } 2542 2543 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2544 struct extent_buffer *buf, int full_backref) 2545 { 2546 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2547 } 2548 2549 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2550 struct extent_buffer *buf, int full_backref) 2551 { 2552 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2553 } 2554 2555 static int write_one_cache_group(struct
btrfs_trans_handle *trans, 2556 struct btrfs_root *root, 2557 struct btrfs_path *path, 2558 struct btrfs_block_group_cache *cache) 2559 { 2560 int ret; 2561 struct btrfs_root *extent_root = root->fs_info->extent_root; 2562 unsigned long bi; 2563 struct extent_buffer *leaf; 2564 2565 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 2566 if (ret < 0) 2567 goto fail; 2568 BUG_ON(ret); 2569 2570 leaf = path->nodes[0]; 2571 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2572 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 2573 btrfs_mark_buffer_dirty(leaf); 2574 btrfs_release_path(extent_root, path); 2575 fail: 2576 if (ret) 2577 return ret; 2578 return 0; 2579 2580 } 2581 2582 static struct btrfs_block_group_cache * 2583 next_block_group(struct btrfs_root *root, 2584 struct btrfs_block_group_cache *cache) 2585 { 2586 struct rb_node *node; 2587 spin_lock(&root->fs_info->block_group_cache_lock); 2588 node = rb_next(&cache->cache_node); 2589 btrfs_put_block_group(cache); 2590 if (node) { 2591 cache = rb_entry(node, struct btrfs_block_group_cache, 2592 cache_node); 2593 btrfs_get_block_group(cache); 2594 } else 2595 cache = NULL; 2596 spin_unlock(&root->fs_info->block_group_cache_lock); 2597 return cache; 2598 } 2599 2600 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2601 struct btrfs_root *root) 2602 { 2603 struct btrfs_block_group_cache *cache; 2604 int err = 0; 2605 struct btrfs_path *path; 2606 u64 last = 0; 2607 2608 path = btrfs_alloc_path(); 2609 if (!path) 2610 return -ENOMEM; 2611 2612 while (1) { 2613 if (last == 0) { 2614 err = btrfs_run_delayed_refs(trans, root, 2615 (unsigned long)-1); 2616 BUG_ON(err); 2617 } 2618 2619 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2620 while (cache) { 2621 if (cache->dirty) 2622 break; 2623 cache = next_block_group(root, cache); 2624 } 2625 if (!cache) { 2626 if (last == 0) 2627 break; 2628 last = 0; 2629 continue; 2630 } 2631 2632 cache->dirty = 0; 2633 last = cache->key.objectid + cache->key.offset; 2634 2635 err = write_one_cache_group(trans, root, path, cache); 2636 BUG_ON(err); 2637 btrfs_put_block_group(cache); 2638 } 2639 2640 btrfs_free_path(path); 2641 return 0; 2642 } 2643 2644 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2645 { 2646 struct btrfs_block_group_cache *block_group; 2647 int readonly = 0; 2648 2649 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 2650 if (!block_group || block_group->ro) 2651 readonly = 1; 2652 if (block_group) 2653 btrfs_put_block_group(block_group); 2654 return readonly; 2655 } 2656 2657 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 2658 u64 total_bytes, u64 bytes_used, 2659 struct btrfs_space_info **space_info) 2660 { 2661 struct btrfs_space_info *found; 2662 2663 found = __find_space_info(info, flags); 2664 if (found) { 2665 spin_lock(&found->lock); 2666 found->total_bytes += total_bytes; 2667 found->bytes_used += bytes_used; 2668 found->full = 0; 2669 spin_unlock(&found->lock); 2670 *space_info = found; 2671 return 0; 2672 } 2673 found = kzalloc(sizeof(*found), GFP_NOFS); 2674 if (!found) 2675 return -ENOMEM; 2676 2677 INIT_LIST_HEAD(&found->block_groups); 2678 init_rwsem(&found->groups_sem); 2679 spin_lock_init(&found->lock); 2680 found->flags = flags; 2681 found->total_bytes = total_bytes; 2682 found->bytes_used = bytes_used; 2683 found->bytes_pinned = 0; 2684 found->bytes_reserved = 0; 2685 found->bytes_readonly = 0; 2686 found->bytes_delalloc = 0; 2687 found->full = 0; 
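/*
 * The byte counters on this space_info are the same ones the reservation
 * paths further down (btrfs_reserve_metadata_space and the delalloc
 * variants) add together and compare against total_bytes to decide whether
 * a new reservation fits.  A minimal, self-contained sketch of that
 * accounting check, assuming plain 64-bit counters with the same names
 * (reservation_fits and space_counters are illustrative names only, not
 * part of the btrfs code):
 *
 *	typedef unsigned long long u64_demo;
 *
 *	struct space_counters {
 *		u64_demo total_bytes, bytes_used, bytes_reserved;
 *		u64_demo bytes_pinned, bytes_readonly, bytes_super;
 *		u64_demo bytes_root, bytes_may_use, bytes_delalloc;
 *	};
 *
 *	static int reservation_fits(const struct space_counters *s,
 *				    u64_demo want)
 *	{
 *		u64_demo used = s->bytes_used + s->bytes_reserved +
 *				s->bytes_pinned + s->bytes_readonly +
 *				s->bytes_super + s->bytes_root +
 *				s->bytes_may_use + s->bytes_delalloc;
 *		return used + want <= s->total_bytes;
 *	}
 *
 * The real code first adds the request into bytes_may_use (or
 * bytes_delalloc), recomputes the sum, and backs the addition out again
 * before returning -ENOSPC if the total would exceed total_bytes.
 */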
2688 found->force_alloc = 0; 2689 *space_info = found; 2690 list_add_rcu(&found->list, &info->space_info); 2691 atomic_set(&found->caching_threads, 0); 2692 return 0; 2693 } 2694 2695 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 2696 { 2697 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 2698 BTRFS_BLOCK_GROUP_RAID1 | 2699 BTRFS_BLOCK_GROUP_RAID10 | 2700 BTRFS_BLOCK_GROUP_DUP); 2701 if (extra_flags) { 2702 if (flags & BTRFS_BLOCK_GROUP_DATA) 2703 fs_info->avail_data_alloc_bits |= extra_flags; 2704 if (flags & BTRFS_BLOCK_GROUP_METADATA) 2705 fs_info->avail_metadata_alloc_bits |= extra_flags; 2706 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 2707 fs_info->avail_system_alloc_bits |= extra_flags; 2708 } 2709 } 2710 2711 static void set_block_group_readonly(struct btrfs_block_group_cache *cache) 2712 { 2713 spin_lock(&cache->space_info->lock); 2714 spin_lock(&cache->lock); 2715 if (!cache->ro) { 2716 cache->space_info->bytes_readonly += cache->key.offset - 2717 btrfs_block_group_used(&cache->item); 2718 cache->ro = 1; 2719 } 2720 spin_unlock(&cache->lock); 2721 spin_unlock(&cache->space_info->lock); 2722 } 2723 2724 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2725 { 2726 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2727 2728 if (num_devices == 1) 2729 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 2730 if (num_devices < 4) 2731 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 2732 2733 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 2734 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 2735 BTRFS_BLOCK_GROUP_RAID10))) { 2736 flags &= ~BTRFS_BLOCK_GROUP_DUP; 2737 } 2738 2739 if ((flags & BTRFS_BLOCK_GROUP_RAID1) && 2740 (flags & BTRFS_BLOCK_GROUP_RAID10)) { 2741 flags &= ~BTRFS_BLOCK_GROUP_RAID1; 2742 } 2743 2744 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 2745 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 2746 (flags & BTRFS_BLOCK_GROUP_RAID10) | 2747 (flags & BTRFS_BLOCK_GROUP_DUP))) 2748 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 2749 return flags; 2750 } 2751 2752 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2753 { 2754 struct btrfs_fs_info *info = root->fs_info; 2755 u64 alloc_profile; 2756 2757 if (data) { 2758 alloc_profile = info->avail_data_alloc_bits & 2759 info->data_alloc_profile; 2760 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; 2761 } else if (root == root->fs_info->chunk_root) { 2762 alloc_profile = info->avail_system_alloc_bits & 2763 info->system_alloc_profile; 2764 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; 2765 } else { 2766 alloc_profile = info->avail_metadata_alloc_bits & 2767 info->metadata_alloc_profile; 2768 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; 2769 } 2770 2771 return btrfs_reduce_alloc_profile(root, data); 2772 } 2773 2774 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) 2775 { 2776 u64 alloc_target; 2777 2778 alloc_target = btrfs_get_alloc_profile(root, 1); 2779 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, 2780 alloc_target); 2781 } 2782 2783 static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) 2784 { 2785 u64 num_bytes; 2786 int level; 2787 2788 level = BTRFS_MAX_LEVEL - 2; 2789 /* 2790 * NOTE: these calculations are absolutely the worst possible case. 2791 * This assumes that _every_ item we insert will require a new leaf, and 2792 * that the tree has grown to its maximum level size. 2793 */ 2794 2795 /* 2796 * for every item we insert we could insert both an extent item and a 2797 * extent ref item. 
Then for every item we insert, we will need to cow 2798 * both the original leaf, plus the leaf to the left and right of it. 2799 * 2800 * Unless we are talking about the extent root, then we just want the 2801 * number of items * 2, since we just need the extent item plus its ref. 2802 */ 2803 if (root == root->fs_info->extent_root) 2804 num_bytes = num_items * 2; 2805 else 2806 num_bytes = (num_items + (2 * num_items)) * 3; 2807 2808 /* 2809 * num_bytes is total number of leaves we could need times the leaf 2810 * size, and then for every leaf we could end up cow'ing 2 nodes per 2811 * level, down to the leaf level. 2812 */ 2813 num_bytes = (num_bytes * root->leafsize) + 2814 (num_bytes * (level * 2)) * root->nodesize; 2815 2816 return num_bytes; 2817 } 2818 2819 /* 2820 * Unreserve metadata space for delalloc. If we have less reserved credits than 2821 * we have extents, this function does nothing. 2822 */ 2823 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2824 struct inode *inode, int num_items) 2825 { 2826 struct btrfs_fs_info *info = root->fs_info; 2827 struct btrfs_space_info *meta_sinfo; 2828 u64 num_bytes; 2829 u64 alloc_target; 2830 bool bug = false; 2831 2832 /* get the space info for where the metadata will live */ 2833 alloc_target = btrfs_get_alloc_profile(root, 0); 2834 meta_sinfo = __find_space_info(info, alloc_target); 2835 2836 num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 2837 num_items); 2838 2839 spin_lock(&meta_sinfo->lock); 2840 spin_lock(&BTRFS_I(inode)->accounting_lock); 2841 if (BTRFS_I(inode)->reserved_extents <= 2842 BTRFS_I(inode)->outstanding_extents) { 2843 spin_unlock(&BTRFS_I(inode)->accounting_lock); 2844 spin_unlock(&meta_sinfo->lock); 2845 return 0; 2846 } 2847 spin_unlock(&BTRFS_I(inode)->accounting_lock); 2848 2849 BTRFS_I(inode)->reserved_extents--; 2850 BUG_ON(BTRFS_I(inode)->reserved_extents < 0); 2851 2852 if (meta_sinfo->bytes_delalloc < num_bytes) { 2853 bug = true; 2854 meta_sinfo->bytes_delalloc = 0; 2855 } else { 2856 meta_sinfo->bytes_delalloc -= num_bytes; 2857 } 2858 spin_unlock(&meta_sinfo->lock); 2859 2860 BUG_ON(bug); 2861 2862 return 0; 2863 } 2864 2865 static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) 2866 { 2867 u64 thresh; 2868 2869 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2870 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2871 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2872 meta_sinfo->bytes_may_use; 2873 2874 thresh = meta_sinfo->total_bytes - thresh; 2875 thresh *= 80; 2876 do_div(thresh, 100); 2877 if (thresh <= meta_sinfo->bytes_delalloc) 2878 meta_sinfo->force_delalloc = 1; 2879 else 2880 meta_sinfo->force_delalloc = 0; 2881 } 2882 2883 struct async_flush { 2884 struct btrfs_root *root; 2885 struct btrfs_space_info *info; 2886 struct btrfs_work work; 2887 }; 2888 2889 static noinline void flush_delalloc_async(struct btrfs_work *work) 2890 { 2891 struct async_flush *async; 2892 struct btrfs_root *root; 2893 struct btrfs_space_info *info; 2894 2895 async = container_of(work, struct async_flush, work); 2896 root = async->root; 2897 info = async->info; 2898 2899 btrfs_start_delalloc_inodes(root, 0); 2900 wake_up(&info->flush_wait); 2901 btrfs_wait_ordered_extents(root, 0, 0); 2902 2903 spin_lock(&info->lock); 2904 info->flushing = 0; 2905 spin_unlock(&info->lock); 2906 wake_up(&info->flush_wait); 2907 2908 kfree(async); 2909 } 2910 2911 static void wait_on_flush(struct btrfs_space_info *info) 2912 { 2913 DEFINE_WAIT(wait); 2914 u64
used; 2915 2916 while (1) { 2917 prepare_to_wait(&info->flush_wait, &wait, 2918 TASK_UNINTERRUPTIBLE); 2919 spin_lock(&info->lock); 2920 if (!info->flushing) { 2921 spin_unlock(&info->lock); 2922 break; 2923 } 2924 2925 used = info->bytes_used + info->bytes_reserved + 2926 info->bytes_pinned + info->bytes_readonly + 2927 info->bytes_super + info->bytes_root + 2928 info->bytes_may_use + info->bytes_delalloc; 2929 if (used < info->total_bytes) { 2930 spin_unlock(&info->lock); 2931 break; 2932 } 2933 spin_unlock(&info->lock); 2934 schedule(); 2935 } 2936 finish_wait(&info->flush_wait, &wait); 2937 } 2938 2939 static void flush_delalloc(struct btrfs_root *root, 2940 struct btrfs_space_info *info) 2941 { 2942 struct async_flush *async; 2943 bool wait = false; 2944 2945 spin_lock(&info->lock); 2946 2947 if (!info->flushing) { 2948 info->flushing = 1; 2949 init_waitqueue_head(&info->flush_wait); 2950 } else { 2951 wait = true; 2952 } 2953 2954 spin_unlock(&info->lock); 2955 2956 if (wait) { 2957 wait_on_flush(info); 2958 return; 2959 } 2960 2961 async = kzalloc(sizeof(*async), GFP_NOFS); 2962 if (!async) 2963 goto flush; 2964 2965 async->root = root; 2966 async->info = info; 2967 async->work.func = flush_delalloc_async; 2968 2969 btrfs_queue_worker(&root->fs_info->enospc_workers, 2970 &async->work); 2971 wait_on_flush(info); 2972 return; 2973 2974 flush: 2975 btrfs_start_delalloc_inodes(root, 0); 2976 btrfs_wait_ordered_extents(root, 0, 0); 2977 2978 spin_lock(&info->lock); 2979 info->flushing = 0; 2980 spin_unlock(&info->lock); 2981 wake_up(&info->flush_wait); 2982 } 2983 2984 static int maybe_allocate_chunk(struct btrfs_root *root, 2985 struct btrfs_space_info *info) 2986 { 2987 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 2988 struct btrfs_trans_handle *trans; 2989 bool wait = false; 2990 int ret = 0; 2991 u64 min_metadata; 2992 u64 free_space; 2993 2994 free_space = btrfs_super_total_bytes(disk_super); 2995 /* 2996 * we allow the metadata to grow to a max of either 10gb or 5% of the 2997 * space in the volume. 2998 */ 2999 min_metadata = min((u64)10 * 1024 * 1024 * 1024, 3000 div64_u64(free_space * 5, 100)); 3001 if (info->total_bytes >= min_metadata) { 3002 spin_unlock(&info->lock); 3003 return 0; 3004 } 3005 3006 if (info->full) { 3007 spin_unlock(&info->lock); 3008 return 0; 3009 } 3010 3011 if (!info->allocating_chunk) { 3012 info->force_alloc = 1; 3013 info->allocating_chunk = 1; 3014 init_waitqueue_head(&info->allocate_wait); 3015 } else { 3016 wait = true; 3017 } 3018 3019 spin_unlock(&info->lock); 3020 3021 if (wait) { 3022 wait_event(info->allocate_wait, 3023 !info->allocating_chunk); 3024 return 1; 3025 } 3026 3027 trans = btrfs_start_transaction(root, 1); 3028 if (!trans) { 3029 ret = -ENOMEM; 3030 goto out; 3031 } 3032 3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3034 4096 + 2 * 1024 * 1024, 3035 info->flags, 0); 3036 btrfs_end_transaction(trans, root); 3037 if (ret) 3038 goto out; 3039 out: 3040 spin_lock(&info->lock); 3041 info->allocating_chunk = 0; 3042 spin_unlock(&info->lock); 3043 wake_up(&info->allocate_wait); 3044 3045 if (ret) 3046 return 0; 3047 return 1; 3048 } 3049 3050 /* 3051 * Reserve metadata space for delalloc. 
3052 */ 3053 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 3054 struct inode *inode, int num_items) 3055 { 3056 struct btrfs_fs_info *info = root->fs_info; 3057 struct btrfs_space_info *meta_sinfo; 3058 u64 num_bytes; 3059 u64 used; 3060 u64 alloc_target; 3061 int flushed = 0; 3062 int force_delalloc; 3063 3064 /* get the space info for where the metadata will live */ 3065 alloc_target = btrfs_get_alloc_profile(root, 0); 3066 meta_sinfo = __find_space_info(info, alloc_target); 3067 3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 3069 num_items); 3070 again: 3071 spin_lock(&meta_sinfo->lock); 3072 3073 force_delalloc = meta_sinfo->force_delalloc; 3074 3075 if (unlikely(!meta_sinfo->bytes_root)) 3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 3077 3078 if (!flushed) 3079 meta_sinfo->bytes_delalloc += num_bytes; 3080 3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 3085 3086 if (used > meta_sinfo->total_bytes) { 3087 flushed++; 3088 3089 if (flushed == 1) { 3090 if (maybe_allocate_chunk(root, meta_sinfo)) 3091 goto again; 3092 flushed++; 3093 } else { 3094 spin_unlock(&meta_sinfo->lock); 3095 } 3096 3097 if (flushed == 2) { 3098 filemap_flush(inode->i_mapping); 3099 goto again; 3100 } else if (flushed == 3) { 3101 flush_delalloc(root, meta_sinfo); 3102 goto again; 3103 } 3104 spin_lock(&meta_sinfo->lock); 3105 meta_sinfo->bytes_delalloc -= num_bytes; 3106 spin_unlock(&meta_sinfo->lock); 3107 printk(KERN_ERR "enospc, has %d, reserved %d\n", 3108 BTRFS_I(inode)->outstanding_extents, 3109 BTRFS_I(inode)->reserved_extents); 3110 dump_space_info(meta_sinfo, 0, 0); 3111 return -ENOSPC; 3112 } 3113 3114 BTRFS_I(inode)->reserved_extents++; 3115 check_force_delalloc(meta_sinfo); 3116 spin_unlock(&meta_sinfo->lock); 3117 3118 if (!flushed && force_delalloc) 3119 filemap_flush(inode->i_mapping); 3120 3121 return 0; 3122 } 3123 3124 /* 3125 * unreserve num_items number of items worth of metadata space. This needs to 3126 * be paired with btrfs_reserve_metadata_space. 3127 * 3128 * NOTE: if you have the option, run this _AFTER_ you do a 3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref 3130 * operations which will result in more used metadata, so we want to make sure we 3131 * can do that without issue. 3132 */ 3133 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) 3134 { 3135 struct btrfs_fs_info *info = root->fs_info; 3136 struct btrfs_space_info *meta_sinfo; 3137 u64 num_bytes; 3138 u64 alloc_target; 3139 bool bug = false; 3140 3141 /* get the space info for where the metadata will live */ 3142 alloc_target = btrfs_get_alloc_profile(root, 0); 3143 meta_sinfo = __find_space_info(info, alloc_target); 3144 3145 num_bytes = calculate_bytes_needed(root, num_items); 3146 3147 spin_lock(&meta_sinfo->lock); 3148 if (meta_sinfo->bytes_may_use < num_bytes) { 3149 bug = true; 3150 meta_sinfo->bytes_may_use = 0; 3151 } else { 3152 meta_sinfo->bytes_may_use -= num_bytes; 3153 } 3154 spin_unlock(&meta_sinfo->lock); 3155 3156 BUG_ON(bug); 3157 3158 return 0; 3159 } 3160 3161 /* 3162 * Reserve some metadata space for use. We'll calculate the worst case number 3163 * of bytes that would be needed to modify num_items number of items. If we 3164 * have space, fantastic, if not, you get -ENOSPC.
Please call 3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of 3166 * items you reserved, since whatever metadata you needed should have already 3167 * been allocated. 3168 * 3169 * This will commit the transaction to make more space if we don't have enough 3170 * metadata space. The only time we don't do this is if we're reserving space 3171 * inside of a transaction, then we will just return -ENOSPC and it is the 3172 * caller's responsibility to handle it properly. 3173 */ 3174 int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) 3175 { 3176 struct btrfs_fs_info *info = root->fs_info; 3177 struct btrfs_space_info *meta_sinfo; 3178 u64 num_bytes; 3179 u64 used; 3180 u64 alloc_target; 3181 int retries = 0; 3182 3183 /* get the space info for where the metadata will live */ 3184 alloc_target = btrfs_get_alloc_profile(root, 0); 3185 meta_sinfo = __find_space_info(info, alloc_target); 3186 3187 num_bytes = calculate_bytes_needed(root, num_items); 3188 again: 3189 spin_lock(&meta_sinfo->lock); 3190 3191 if (unlikely(!meta_sinfo->bytes_root)) 3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 3193 3194 if (!retries) 3195 meta_sinfo->bytes_may_use += num_bytes; 3196 3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 3201 3202 if (used > meta_sinfo->total_bytes) { 3203 retries++; 3204 if (retries == 1) { 3205 if (maybe_allocate_chunk(root, meta_sinfo)) 3206 goto again; 3207 retries++; 3208 } else { 3209 spin_unlock(&meta_sinfo->lock); 3210 } 3211 3212 if (retries == 2) { 3213 flush_delalloc(root, meta_sinfo); 3214 goto again; 3215 } 3216 spin_lock(&meta_sinfo->lock); 3217 meta_sinfo->bytes_may_use -= num_bytes; 3218 spin_unlock(&meta_sinfo->lock); 3219 3220 dump_space_info(meta_sinfo, 0, 0); 3221 return -ENOSPC; 3222 } 3223 3224 check_force_delalloc(meta_sinfo); 3225 spin_unlock(&meta_sinfo->lock); 3226 3227 return 0; 3228 } 3229 3230 /* 3231 * This will check the space that the inode allocates from to make sure we have 3232 * enough space for bytes. 3233 */ 3234 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 3235 u64 bytes) 3236 { 3237 struct btrfs_space_info *data_sinfo; 3238 int ret = 0, committed = 0; 3239 3240 /* make sure bytes are sectorsize aligned */ 3241 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3242 3243 data_sinfo = BTRFS_I(inode)->space_info; 3244 if (!data_sinfo) 3245 goto alloc; 3246 3247 again: 3248 /* make sure we have enough space to handle the data first */ 3249 spin_lock(&data_sinfo->lock); 3250 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 3251 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 3252 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 3253 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { 3254 struct btrfs_trans_handle *trans; 3255 3256 /* 3257 * if we don't have enough free bytes in this space then we need 3258 * to alloc a new chunk.
3259 */ 3260 if (!data_sinfo->full) { 3261 u64 alloc_target; 3262 3263 data_sinfo->force_alloc = 1; 3264 spin_unlock(&data_sinfo->lock); 3265 alloc: 3266 alloc_target = btrfs_get_alloc_profile(root, 1); 3267 trans = btrfs_start_transaction(root, 1); 3268 if (!trans) 3269 return -ENOMEM; 3270 3271 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3272 bytes + 2 * 1024 * 1024, 3273 alloc_target, 0); 3274 btrfs_end_transaction(trans, root); 3275 if (ret) 3276 return ret; 3277 3278 if (!data_sinfo) { 3279 btrfs_set_inode_space_info(root, inode); 3280 data_sinfo = BTRFS_I(inode)->space_info; 3281 } 3282 goto again; 3283 } 3284 spin_unlock(&data_sinfo->lock); 3285 3286 /* commit the current transaction and try again */ 3287 if (!committed && !root->fs_info->open_ioctl_trans) { 3288 committed = 1; 3289 trans = btrfs_join_transaction(root, 1); 3290 if (!trans) 3291 return -ENOMEM; 3292 ret = btrfs_commit_transaction(trans, root); 3293 if (ret) 3294 return ret; 3295 goto again; 3296 } 3297 3298 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 3299 ", %llu bytes_used, %llu bytes_reserved, " 3300 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 3301 "%llu total\n", (unsigned long long)bytes, 3302 (unsigned long long)data_sinfo->bytes_delalloc, 3303 (unsigned long long)data_sinfo->bytes_used, 3304 (unsigned long long)data_sinfo->bytes_reserved, 3305 (unsigned long long)data_sinfo->bytes_pinned, 3306 (unsigned long long)data_sinfo->bytes_readonly, 3307 (unsigned long long)data_sinfo->bytes_may_use, 3308 (unsigned long long)data_sinfo->total_bytes); 3309 return -ENOSPC; 3310 } 3311 data_sinfo->bytes_may_use += bytes; 3312 BTRFS_I(inode)->reserved_bytes += bytes; 3313 spin_unlock(&data_sinfo->lock); 3314 3315 return 0; 3316 } 3317 3318 /* 3319 * if there was an error for whatever reason after calling 3320 * btrfs_check_data_free_space, call this so we can cleanup the counters. 3321 */ 3322 void btrfs_free_reserved_data_space(struct btrfs_root *root, 3323 struct inode *inode, u64 bytes) 3324 { 3325 struct btrfs_space_info *data_sinfo; 3326 3327 /* make sure bytes are sectorsize aligned */ 3328 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3329 3330 data_sinfo = BTRFS_I(inode)->space_info; 3331 spin_lock(&data_sinfo->lock); 3332 data_sinfo->bytes_may_use -= bytes; 3333 BTRFS_I(inode)->reserved_bytes -= bytes; 3334 spin_unlock(&data_sinfo->lock); 3335 } 3336 3337 /* called when we are adding a delalloc extent to the inode's io_tree */ 3338 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 3339 u64 bytes) 3340 { 3341 struct btrfs_space_info *data_sinfo; 3342 3343 /* get the space info for where this inode will be storing its data */ 3344 data_sinfo = BTRFS_I(inode)->space_info; 3345 3346 /* make sure we have enough space to handle the data first */ 3347 spin_lock(&data_sinfo->lock); 3348 data_sinfo->bytes_delalloc += bytes; 3349 3350 /* 3351 * we are adding a delalloc extent without calling 3352 * btrfs_check_data_free_space first. 
This happens on a weird 3353 * writepage condition, but shouldn't hurt our accounting 3354 */ 3355 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { 3356 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; 3357 BTRFS_I(inode)->reserved_bytes = 0; 3358 } else { 3359 data_sinfo->bytes_may_use -= bytes; 3360 BTRFS_I(inode)->reserved_bytes -= bytes; 3361 } 3362 3363 spin_unlock(&data_sinfo->lock); 3364 } 3365 3366 /* called when we are clearing a delalloc extent from the inode's io_tree */ 3367 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 3368 u64 bytes) 3369 { 3370 struct btrfs_space_info *info; 3371 3372 info = BTRFS_I(inode)->space_info; 3373 3374 spin_lock(&info->lock); 3375 info->bytes_delalloc -= bytes; 3376 spin_unlock(&info->lock); 3377 } 3378 3379 static void force_metadata_allocation(struct btrfs_fs_info *info) 3380 { 3381 struct list_head *head = &info->space_info; 3382 struct btrfs_space_info *found; 3383 3384 rcu_read_lock(); 3385 list_for_each_entry_rcu(found, head, list) { 3386 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3387 found->force_alloc = 1; 3388 } 3389 rcu_read_unlock(); 3390 } 3391 3392 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3393 struct btrfs_root *extent_root, u64 alloc_bytes, 3394 u64 flags, int force) 3395 { 3396 struct btrfs_space_info *space_info; 3397 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3398 u64 thresh; 3399 int ret = 0; 3400 3401 mutex_lock(&fs_info->chunk_mutex); 3402 3403 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3404 3405 space_info = __find_space_info(extent_root->fs_info, flags); 3406 if (!space_info) { 3407 ret = update_space_info(extent_root->fs_info, flags, 3408 0, 0, &space_info); 3409 BUG_ON(ret); 3410 } 3411 BUG_ON(!space_info); 3412 3413 spin_lock(&space_info->lock); 3414 if (space_info->force_alloc) 3415 force = 1; 3416 if (space_info->full) { 3417 spin_unlock(&space_info->lock); 3418 goto out; 3419 } 3420 3421 thresh = space_info->total_bytes - space_info->bytes_readonly; 3422 thresh = div_factor(thresh, 8); 3423 if (!force && 3424 (space_info->bytes_used + space_info->bytes_pinned + 3425 space_info->bytes_reserved + alloc_bytes) < thresh) { 3426 spin_unlock(&space_info->lock); 3427 goto out; 3428 } 3429 spin_unlock(&space_info->lock); 3430 3431 /* 3432 * if we're doing a data chunk, go ahead and make sure that 3433 * we keep a reasonable number of metadata chunks allocated in the 3434 * FS as well.
3435 */ 3436 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3437 fs_info->data_chunk_allocations++; 3438 if (!(fs_info->data_chunk_allocations % 3439 fs_info->metadata_ratio)) 3440 force_metadata_allocation(fs_info); 3441 } 3442 3443 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3444 spin_lock(&space_info->lock); 3445 if (ret) 3446 space_info->full = 1; 3447 space_info->force_alloc = 0; 3448 spin_unlock(&space_info->lock); 3449 out: 3450 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3451 return ret; 3452 } 3453 3454 static int update_block_group(struct btrfs_trans_handle *trans, 3455 struct btrfs_root *root, 3456 u64 bytenr, u64 num_bytes, int alloc, 3457 int mark_free) 3458 { 3459 struct btrfs_block_group_cache *cache; 3460 struct btrfs_fs_info *info = root->fs_info; 3461 u64 total = num_bytes; 3462 u64 old_val; 3463 u64 byte_in_group; 3464 3465 /* block accounting for super block */ 3466 spin_lock(&info->delalloc_lock); 3467 old_val = btrfs_super_bytes_used(&info->super_copy); 3468 if (alloc) 3469 old_val += num_bytes; 3470 else 3471 old_val -= num_bytes; 3472 btrfs_set_super_bytes_used(&info->super_copy, old_val); 3473 spin_unlock(&info->delalloc_lock); 3474 3475 while (total) { 3476 cache = btrfs_lookup_block_group(info, bytenr); 3477 if (!cache) 3478 return -1; 3479 byte_in_group = bytenr - cache->key.objectid; 3480 WARN_ON(byte_in_group > cache->key.offset); 3481 3482 spin_lock(&cache->space_info->lock); 3483 spin_lock(&cache->lock); 3484 cache->dirty = 1; 3485 old_val = btrfs_block_group_used(&cache->item); 3486 num_bytes = min(total, cache->key.offset - byte_in_group); 3487 if (alloc) { 3488 old_val += num_bytes; 3489 btrfs_set_block_group_used(&cache->item, old_val); 3490 cache->reserved -= num_bytes; 3491 cache->space_info->bytes_used += num_bytes; 3492 cache->space_info->bytes_reserved -= num_bytes; 3493 if (cache->ro) 3494 cache->space_info->bytes_readonly -= num_bytes; 3495 spin_unlock(&cache->lock); 3496 spin_unlock(&cache->space_info->lock); 3497 } else { 3498 old_val -= num_bytes; 3499 cache->space_info->bytes_used -= num_bytes; 3500 if (cache->ro) 3501 cache->space_info->bytes_readonly += num_bytes; 3502 btrfs_set_block_group_used(&cache->item, old_val); 3503 spin_unlock(&cache->lock); 3504 spin_unlock(&cache->space_info->lock); 3505 if (mark_free) { 3506 int ret; 3507 3508 ret = btrfs_discard_extent(root, bytenr, 3509 num_bytes); 3510 WARN_ON(ret); 3511 3512 ret = btrfs_add_free_space(cache, bytenr, 3513 num_bytes); 3514 WARN_ON(ret); 3515 } 3516 } 3517 btrfs_put_block_group(cache); 3518 total -= num_bytes; 3519 bytenr += num_bytes; 3520 } 3521 return 0; 3522 } 3523 3524 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 3525 { 3526 struct btrfs_block_group_cache *cache; 3527 u64 bytenr; 3528 3529 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 3530 if (!cache) 3531 return 0; 3532 3533 bytenr = cache->key.objectid; 3534 btrfs_put_block_group(cache); 3535 3536 return bytenr; 3537 } 3538 3539 /* 3540 * this function must be called within transaction 3541 */ 3542 int btrfs_pin_extent(struct btrfs_root *root, 3543 u64 bytenr, u64 num_bytes, int reserved) 3544 { 3545 struct btrfs_fs_info *fs_info = root->fs_info; 3546 struct btrfs_block_group_cache *cache; 3547 3548 cache = btrfs_lookup_block_group(fs_info, bytenr); 3549 BUG_ON(!cache); 3550 3551 spin_lock(&cache->space_info->lock); 3552 spin_lock(&cache->lock); 3553 cache->pinned += num_bytes; 3554 cache->space_info->bytes_pinned += num_bytes; 3555 if 
(reserved) { 3556 cache->reserved -= num_bytes; 3557 cache->space_info->bytes_reserved -= num_bytes; 3558 } 3559 spin_unlock(&cache->lock); 3560 spin_unlock(&cache->space_info->lock); 3561 3562 btrfs_put_block_group(cache); 3563 3564 set_extent_dirty(fs_info->pinned_extents, 3565 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3566 return 0; 3567 } 3568 3569 static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3570 u64 num_bytes, int reserve) 3571 { 3572 spin_lock(&cache->space_info->lock); 3573 spin_lock(&cache->lock); 3574 if (reserve) { 3575 cache->reserved += num_bytes; 3576 cache->space_info->bytes_reserved += num_bytes; 3577 } else { 3578 cache->reserved -= num_bytes; 3579 cache->space_info->bytes_reserved -= num_bytes; 3580 } 3581 spin_unlock(&cache->lock); 3582 spin_unlock(&cache->space_info->lock); 3583 return 0; 3584 } 3585 3586 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3587 struct btrfs_root *root) 3588 { 3589 struct btrfs_fs_info *fs_info = root->fs_info; 3590 struct btrfs_caching_control *next; 3591 struct btrfs_caching_control *caching_ctl; 3592 struct btrfs_block_group_cache *cache; 3593 3594 down_write(&fs_info->extent_commit_sem); 3595 3596 list_for_each_entry_safe(caching_ctl, next, 3597 &fs_info->caching_block_groups, list) { 3598 cache = caching_ctl->block_group; 3599 if (block_group_cache_done(cache)) { 3600 cache->last_byte_to_unpin = (u64)-1; 3601 list_del_init(&caching_ctl->list); 3602 put_caching_control(caching_ctl); 3603 } else { 3604 cache->last_byte_to_unpin = caching_ctl->progress; 3605 } 3606 } 3607 3608 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 3609 fs_info->pinned_extents = &fs_info->freed_extents[1]; 3610 else 3611 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3612 3613 up_write(&fs_info->extent_commit_sem); 3614 return 0; 3615 } 3616 3617 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 3618 { 3619 struct btrfs_fs_info *fs_info = root->fs_info; 3620 struct btrfs_block_group_cache *cache = NULL; 3621 u64 len; 3622 3623 while (start <= end) { 3624 if (!cache || 3625 start >= cache->key.objectid + cache->key.offset) { 3626 if (cache) 3627 btrfs_put_block_group(cache); 3628 cache = btrfs_lookup_block_group(fs_info, start); 3629 BUG_ON(!cache); 3630 } 3631 3632 len = cache->key.objectid + cache->key.offset - start; 3633 len = min(len, end + 1 - start); 3634 3635 if (start < cache->last_byte_to_unpin) { 3636 len = min(len, cache->last_byte_to_unpin - start); 3637 btrfs_add_free_space(cache, start, len); 3638 } 3639 3640 spin_lock(&cache->space_info->lock); 3641 spin_lock(&cache->lock); 3642 cache->pinned -= len; 3643 cache->space_info->bytes_pinned -= len; 3644 spin_unlock(&cache->lock); 3645 spin_unlock(&cache->space_info->lock); 3646 3647 start += len; 3648 } 3649 3650 if (cache) 3651 btrfs_put_block_group(cache); 3652 return 0; 3653 } 3654 3655 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 3656 struct btrfs_root *root) 3657 { 3658 struct btrfs_fs_info *fs_info = root->fs_info; 3659 struct extent_io_tree *unpin; 3660 u64 start; 3661 u64 end; 3662 int ret; 3663 3664 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 3665 unpin = &fs_info->freed_extents[1]; 3666 else 3667 unpin = &fs_info->freed_extents[0]; 3668 3669 while (1) { 3670 ret = find_first_extent_bit(unpin, 0, &start, &end, 3671 EXTENT_DIRTY); 3672 if (ret) 3673 break; 3674 3675 ret = btrfs_discard_extent(root, start, end + 1 - start); 3676 3677 clear_extent_dirty(unpin, start, end, 
GFP_NOFS); 3678 unpin_extent_range(root, start, end); 3679 cond_resched(); 3680 } 3681 3682 return ret; 3683 } 3684 3685 static int pin_down_bytes(struct btrfs_trans_handle *trans, 3686 struct btrfs_root *root, 3687 struct btrfs_path *path, 3688 u64 bytenr, u64 num_bytes, 3689 int is_data, int reserved, 3690 struct extent_buffer **must_clean) 3691 { 3692 int err = 0; 3693 struct extent_buffer *buf; 3694 3695 if (is_data) 3696 goto pinit; 3697 3698 /* 3699 * discard is sloooow, and so triggering discards on 3700 * individual btree blocks isn't a good plan. Just 3701 * pin everything in discard mode. 3702 */ 3703 if (btrfs_test_opt(root, DISCARD)) 3704 goto pinit; 3705 3706 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 3707 if (!buf) 3708 goto pinit; 3709 3710 /* we can reuse a block if it hasn't been written 3711 * and it is from this transaction. We can't 3712 * reuse anything from the tree log root because 3713 * it has tiny sub-transactions. 3714 */ 3715 if (btrfs_buffer_uptodate(buf, 0) && 3716 btrfs_try_tree_lock(buf)) { 3717 u64 header_owner = btrfs_header_owner(buf); 3718 u64 header_transid = btrfs_header_generation(buf); 3719 if (header_owner != BTRFS_TREE_LOG_OBJECTID && 3720 header_transid == trans->transid && 3721 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3722 *must_clean = buf; 3723 return 1; 3724 } 3725 btrfs_tree_unlock(buf); 3726 } 3727 free_extent_buffer(buf); 3728 pinit: 3729 if (path) 3730 btrfs_set_path_blocking(path); 3731 /* unlocks the pinned mutex */ 3732 btrfs_pin_extent(root, bytenr, num_bytes, reserved); 3733 3734 BUG_ON(err < 0); 3735 return 0; 3736 } 3737 3738 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 3739 struct btrfs_root *root, 3740 u64 bytenr, u64 num_bytes, u64 parent, 3741 u64 root_objectid, u64 owner_objectid, 3742 u64 owner_offset, int refs_to_drop, 3743 struct btrfs_delayed_extent_op *extent_op) 3744 { 3745 struct btrfs_key key; 3746 struct btrfs_path *path; 3747 struct btrfs_fs_info *info = root->fs_info; 3748 struct btrfs_root *extent_root = info->extent_root; 3749 struct extent_buffer *leaf; 3750 struct btrfs_extent_item *ei; 3751 struct btrfs_extent_inline_ref *iref; 3752 int ret; 3753 int is_data; 3754 int extent_slot = 0; 3755 int found_extent = 0; 3756 int num_to_del = 1; 3757 u32 item_size; 3758 u64 refs; 3759 3760 path = btrfs_alloc_path(); 3761 if (!path) 3762 return -ENOMEM; 3763 3764 path->reada = 1; 3765 path->leave_spinning = 1; 3766 3767 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 3768 BUG_ON(!is_data && refs_to_drop != 1); 3769 3770 ret = lookup_extent_backref(trans, extent_root, path, &iref, 3771 bytenr, num_bytes, parent, 3772 root_objectid, owner_objectid, 3773 owner_offset); 3774 if (ret == 0) { 3775 extent_slot = path->slots[0]; 3776 while (extent_slot >= 0) { 3777 btrfs_item_key_to_cpu(path->nodes[0], &key, 3778 extent_slot); 3779 if (key.objectid != bytenr) 3780 break; 3781 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3782 key.offset == num_bytes) { 3783 found_extent = 1; 3784 break; 3785 } 3786 if (path->slots[0] - extent_slot > 5) 3787 break; 3788 extent_slot--; 3789 } 3790 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3791 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 3792 if (found_extent && item_size < sizeof(*ei)) 3793 found_extent = 0; 3794 #endif 3795 if (!found_extent) { 3796 BUG_ON(iref); 3797 ret = remove_extent_backref(trans, extent_root, path, 3798 NULL, refs_to_drop, 3799 is_data); 3800 BUG_ON(ret); 3801 btrfs_release_path(extent_root, path); 3802 
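/*
 * The re-search below addresses the extent item with the key convention
 * used throughout this file: objectid is the first byte of the extent,
 * type is BTRFS_EXTENT_ITEM_KEY, and offset is the extent length in
 * bytes.  A small illustrative sketch of composing such a key, using a
 * simplified stand-in for struct btrfs_key (demo_key and demo_extent_key
 * are made-up names, not btrfs functions):
 *
 *	struct demo_key {
 *		unsigned long long objectid;
 *		unsigned char type;
 *		unsigned long long offset;
 *	};
 *
 *	static void demo_extent_key(struct demo_key *key,
 *				    unsigned long long bytenr,
 *				    unsigned long long num_bytes)
 *	{
 *		key->objectid = bytenr;		// where the extent starts
 *		key->type = BTRFS_EXTENT_ITEM_KEY;
 *		key->offset = num_bytes;	// how many bytes it covers
 *	}
 */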
path->leave_spinning = 1; 3803 3804 key.objectid = bytenr; 3805 key.type = BTRFS_EXTENT_ITEM_KEY; 3806 key.offset = num_bytes; 3807 3808 ret = btrfs_search_slot(trans, extent_root, 3809 &key, path, -1, 1); 3810 if (ret) { 3811 printk(KERN_ERR "umm, got %d back from search" 3812 ", was looking for %llu\n", ret, 3813 (unsigned long long)bytenr); 3814 btrfs_print_leaf(extent_root, path->nodes[0]); 3815 } 3816 BUG_ON(ret); 3817 extent_slot = path->slots[0]; 3818 } 3819 } else { 3820 btrfs_print_leaf(extent_root, path->nodes[0]); 3821 WARN_ON(1); 3822 printk(KERN_ERR "btrfs unable to find ref byte nr %llu " 3823 "parent %llu root %llu owner %llu offset %llu\n", 3824 (unsigned long long)bytenr, 3825 (unsigned long long)parent, 3826 (unsigned long long)root_objectid, 3827 (unsigned long long)owner_objectid, 3828 (unsigned long long)owner_offset); 3829 } 3830 3831 leaf = path->nodes[0]; 3832 item_size = btrfs_item_size_nr(leaf, extent_slot); 3833 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3834 if (item_size < sizeof(*ei)) { 3835 BUG_ON(found_extent || extent_slot != path->slots[0]); 3836 ret = convert_extent_item_v0(trans, extent_root, path, 3837 owner_objectid, 0); 3838 BUG_ON(ret < 0); 3839 3840 btrfs_release_path(extent_root, path); 3841 path->leave_spinning = 1; 3842 3843 key.objectid = bytenr; 3844 key.type = BTRFS_EXTENT_ITEM_KEY; 3845 key.offset = num_bytes; 3846 3847 ret = btrfs_search_slot(trans, extent_root, &key, path, 3848 -1, 1); 3849 if (ret) { 3850 printk(KERN_ERR "umm, got %d back from search" 3851 ", was looking for %llu\n", ret, 3852 (unsigned long long)bytenr); 3853 btrfs_print_leaf(extent_root, path->nodes[0]); 3854 } 3855 BUG_ON(ret); 3856 extent_slot = path->slots[0]; 3857 leaf = path->nodes[0]; 3858 item_size = btrfs_item_size_nr(leaf, extent_slot); 3859 } 3860 #endif 3861 BUG_ON(item_size < sizeof(*ei)); 3862 ei = btrfs_item_ptr(leaf, extent_slot, 3863 struct btrfs_extent_item); 3864 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3865 struct btrfs_tree_block_info *bi; 3866 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 3867 bi = (struct btrfs_tree_block_info *)(ei + 1); 3868 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 3869 } 3870 3871 refs = btrfs_extent_refs(leaf, ei); 3872 BUG_ON(refs < refs_to_drop); 3873 refs -= refs_to_drop; 3874 3875 if (refs > 0) { 3876 if (extent_op) 3877 __run_delayed_extent_op(extent_op, leaf, ei); 3878 /* 3879 * In the case of inline back ref, reference count will 3880 * be updated by remove_extent_backref 3881 */ 3882 if (iref) { 3883 BUG_ON(!found_extent); 3884 } else { 3885 btrfs_set_extent_refs(leaf, ei, refs); 3886 btrfs_mark_buffer_dirty(leaf); 3887 } 3888 if (found_extent) { 3889 ret = remove_extent_backref(trans, extent_root, path, 3890 iref, refs_to_drop, 3891 is_data); 3892 BUG_ON(ret); 3893 } 3894 } else { 3895 int mark_free = 0; 3896 struct extent_buffer *must_clean = NULL; 3897 3898 if (found_extent) { 3899 BUG_ON(is_data && refs_to_drop != 3900 extent_data_ref_count(root, path, iref)); 3901 if (iref) { 3902 BUG_ON(path->slots[0] != extent_slot); 3903 } else { 3904 BUG_ON(path->slots[0] != extent_slot + 1); 3905 path->slots[0] = extent_slot; 3906 num_to_del = 2; 3907 } 3908 } 3909 3910 ret = pin_down_bytes(trans, root, path, bytenr, 3911 num_bytes, is_data, 0, &must_clean); 3912 if (ret > 0) 3913 mark_free = 1; 3914 BUG_ON(ret < 0); 3915 /* 3916 * it is going to be very rare for someone to be waiting 3917 * on the block we're freeing. 
del_items might need to 3918 * schedule, so rather than get fancy, just force it 3919 * to blocking here 3920 */ 3921 if (must_clean) 3922 btrfs_set_lock_blocking(must_clean); 3923 3924 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 3925 num_to_del); 3926 BUG_ON(ret); 3927 btrfs_release_path(extent_root, path); 3928 3929 if (must_clean) { 3930 clean_tree_block(NULL, root, must_clean); 3931 btrfs_tree_unlock(must_clean); 3932 free_extent_buffer(must_clean); 3933 } 3934 3935 if (is_data) { 3936 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 3937 BUG_ON(ret); 3938 } else { 3939 invalidate_mapping_pages(info->btree_inode->i_mapping, 3940 bytenr >> PAGE_CACHE_SHIFT, 3941 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 3942 } 3943 3944 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 3945 mark_free); 3946 BUG_ON(ret); 3947 } 3948 btrfs_free_path(path); 3949 return ret; 3950 } 3951 3952 /* 3953 * when we free an extent, it is possible (and likely) that we free the last 3954 * delayed ref for that extent as well. This searches the delayed ref tree for 3955 * a given extent, and if there are no other delayed refs to be processed, it 3956 * removes it from the tree. 3957 */ 3958 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 3959 struct btrfs_root *root, u64 bytenr) 3960 { 3961 struct btrfs_delayed_ref_head *head; 3962 struct btrfs_delayed_ref_root *delayed_refs; 3963 struct btrfs_delayed_ref_node *ref; 3964 struct rb_node *node; 3965 int ret; 3966 3967 delayed_refs = &trans->transaction->delayed_refs; 3968 spin_lock(&delayed_refs->lock); 3969 head = btrfs_find_delayed_ref_head(trans, bytenr); 3970 if (!head) 3971 goto out; 3972 3973 node = rb_prev(&head->node.rb_node); 3974 if (!node) 3975 goto out; 3976 3977 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3978 3979 /* there are still entries for this ref, we can't drop it */ 3980 if (ref->bytenr == bytenr) 3981 goto out; 3982 3983 if (head->extent_op) { 3984 if (!head->must_insert_reserved) 3985 goto out; 3986 kfree(head->extent_op); 3987 head->extent_op = NULL; 3988 } 3989 3990 /* 3991 * waiting for the lock here would deadlock. If someone else has it 3992 * locked they are already in the process of dropping it anyway 3993 */ 3994 if (!mutex_trylock(&head->mutex)) 3995 goto out; 3996 3997 /* 3998 * at this point we have a head with no other entries. Go 3999 * ahead and process it. 4000 */ 4001 head->node.in_tree = 0; 4002 rb_erase(&head->node.rb_node, &delayed_refs->root); 4003 4004 delayed_refs->num_entries--; 4005 4006 /* 4007 * we don't take a ref on the node because we're removing it from the 4008 * tree, so we just steal the ref the tree was holding. 4009 */ 4010 delayed_refs->num_heads--; 4011 if (list_empty(&head->cluster)) 4012 delayed_refs->num_heads_ready--; 4013 4014 list_del_init(&head->cluster); 4015 spin_unlock(&delayed_refs->lock); 4016 4017 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4018 &head->node, head->extent_op, 4019 head->must_insert_reserved); 4020 BUG_ON(ret); 4021 btrfs_put_delayed_ref(&head->node); 4022 return 0; 4023 out: 4024 spin_unlock(&delayed_refs->lock); 4025 return 0; 4026 } 4027 4028 int btrfs_free_extent(struct btrfs_trans_handle *trans, 4029 struct btrfs_root *root, 4030 u64 bytenr, u64 num_bytes, u64 parent, 4031 u64 root_objectid, u64 owner, u64 offset) 4032 { 4033 int ret; 4034 4035 /* 4036 * tree log blocks never actually go into the extent allocation 4037 * tree, just update pinning info and exit early. 
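	 * (pinned extents are not handed back to the free space cache until
	 *  the running transaction commits, which is all the tree log needs.)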
4038 */ 4039 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 4040 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 4041 /* unlocks the pinned mutex */ 4042 btrfs_pin_extent(root, bytenr, num_bytes, 1); 4043 ret = 0; 4044 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 4045 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 4046 parent, root_objectid, (int)owner, 4047 BTRFS_DROP_DELAYED_REF, NULL); 4048 BUG_ON(ret); 4049 ret = check_ref_cleanup(trans, root, bytenr); 4050 BUG_ON(ret); 4051 } else { 4052 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4053 parent, root_objectid, owner, 4054 offset, BTRFS_DROP_DELAYED_REF, NULL); 4055 BUG_ON(ret); 4056 } 4057 return ret; 4058 } 4059 4060 int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 4061 struct btrfs_root *root, 4062 u64 bytenr, u32 blocksize, 4063 u64 parent, u64 root_objectid, int level) 4064 { 4065 u64 used; 4066 spin_lock(&root->node_lock); 4067 used = btrfs_root_used(&root->root_item) - blocksize; 4068 btrfs_set_root_used(&root->root_item, used); 4069 spin_unlock(&root->node_lock); 4070 4071 return btrfs_free_extent(trans, root, bytenr, blocksize, 4072 parent, root_objectid, level, 0); 4073 } 4074 4075 static u64 stripe_align(struct btrfs_root *root, u64 val) 4076 { 4077 u64 mask = ((u64)root->stripesize - 1); 4078 u64 ret = (val + mask) & ~mask; 4079 return ret; 4080 } 4081 4082 /* 4083 * when we wait for progress in the block group caching, its because 4084 * our allocation attempt failed at least once. So, we must sleep 4085 * and let some progress happen before we try again. 4086 * 4087 * This function will sleep at least once waiting for new free space to 4088 * show up, and then it will check the block group free space numbers 4089 * for our min num_bytes. Another option is to have it go ahead 4090 * and look in the rbtree for a free extent of a given size, but this 4091 * is a good start. 4092 */ 4093 static noinline int 4094 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 4095 u64 num_bytes) 4096 { 4097 struct btrfs_caching_control *caching_ctl; 4098 DEFINE_WAIT(wait); 4099 4100 caching_ctl = get_caching_control(cache); 4101 if (!caching_ctl) 4102 return 0; 4103 4104 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 4105 (cache->free_space >= num_bytes)); 4106 4107 put_caching_control(caching_ctl); 4108 return 0; 4109 } 4110 4111 static noinline int 4112 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 4113 { 4114 struct btrfs_caching_control *caching_ctl; 4115 DEFINE_WAIT(wait); 4116 4117 caching_ctl = get_caching_control(cache); 4118 if (!caching_ctl) 4119 return 0; 4120 4121 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 4122 4123 put_caching_control(caching_ctl); 4124 return 0; 4125 } 4126 4127 enum btrfs_loop_type { 4128 LOOP_FIND_IDEAL = 0, 4129 LOOP_CACHING_NOWAIT = 1, 4130 LOOP_CACHING_WAIT = 2, 4131 LOOP_ALLOC_CHUNK = 3, 4132 LOOP_NO_EMPTY_SIZE = 4, 4133 }; 4134 4135 /* 4136 * walks the btree of allocated extents and find a hole of a given size. 4137 * The key ins is changed to record the hole: 4138 * ins->objectid == block start 4139 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4140 * ins->offset == number of blocks 4141 * Any available blocks before search_start are skipped. 
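 *
 * For example (illustrative numbers only): a successful 16K search could
 * come back with ins->objectid == 135266304 and ins->offset == 16384,
 * meaning the byte range [135266304, 135282688) is now reserved for the
 * caller.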
 */
static noinline int find_free_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *orig_root,
				     u64 num_bytes, u64 empty_size,
				     u64 search_start, u64 search_end,
				     u64 hint_byte, struct btrfs_key *ins,
				     u64 exclude_start, u64 exclude_nr,
				     int data)
{
	int ret = 0;
	struct btrfs_root *root = orig_root->fs_info->extent_root;
	struct btrfs_free_cluster *last_ptr = NULL;
	struct btrfs_block_group_cache *block_group = NULL;
	int empty_cluster = 2 * 1024 * 1024;
	int allowed_chunk_alloc = 0;
	int done_chunk_alloc = 0;
	struct btrfs_space_info *space_info;
	int last_ptr_loop = 0;
	int loop = 0;
	bool found_uncached_bg = false;
	bool failed_cluster_refill = false;
	bool failed_alloc = false;
	u64 ideal_cache_percent = 0;
	u64 ideal_cache_offset = 0;

	WARN_ON(num_bytes < root->sectorsize);
	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
	ins->objectid = 0;
	ins->offset = 0;

	space_info = __find_space_info(root->fs_info, data);

	if (orig_root->ref_cows || empty_size)
		allowed_chunk_alloc = 1;

	if (data & BTRFS_BLOCK_GROUP_METADATA) {
		last_ptr = &root->fs_info->meta_alloc_cluster;
		if (!btrfs_test_opt(root, SSD))
			empty_cluster = 64 * 1024;
	}

	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
		last_ptr = &root->fs_info->data_alloc_cluster;
	}

	if (last_ptr) {
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
		spin_unlock(&last_ptr->lock);
	}

	search_start = max(search_start, first_logical_byte(root, 0));
	search_start = max(search_start, hint_byte);

	if (!last_ptr)
		empty_cluster = 0;

	if (search_start == hint_byte) {
ideal_cache:
		block_group = btrfs_lookup_block_group(root->fs_info,
						       search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
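		 *
		 * (ideal_cache_offset is zero on the first pass and is only
		 *  filled in by the LOOP_FIND_IDEAL scan below, so this
		 *  initially behaves as a plain hint_byte lookup.)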
4210 */ 4211 if (block_group && block_group_bits(block_group, data) && 4212 (block_group->cached != BTRFS_CACHE_NO || 4213 search_start == ideal_cache_offset)) { 4214 down_read(&space_info->groups_sem); 4215 if (list_empty(&block_group->list) || 4216 block_group->ro) { 4217 /* 4218 * someone is removing this block group, 4219 * we can't jump into the have_block_group 4220 * target because our list pointers are not 4221 * valid 4222 */ 4223 btrfs_put_block_group(block_group); 4224 up_read(&space_info->groups_sem); 4225 } else { 4226 goto have_block_group; 4227 } 4228 } else if (block_group) { 4229 btrfs_put_block_group(block_group); 4230 } 4231 } 4232 search: 4233 down_read(&space_info->groups_sem); 4234 list_for_each_entry(block_group, &space_info->block_groups, list) { 4235 u64 offset; 4236 int cached; 4237 4238 btrfs_get_block_group(block_group); 4239 search_start = block_group->key.objectid; 4240 4241 have_block_group: 4242 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4243 u64 free_percent; 4244 4245 free_percent = btrfs_block_group_used(&block_group->item); 4246 free_percent *= 100; 4247 free_percent = div64_u64(free_percent, 4248 block_group->key.offset); 4249 free_percent = 100 - free_percent; 4250 if (free_percent > ideal_cache_percent && 4251 likely(!block_group->ro)) { 4252 ideal_cache_offset = block_group->key.objectid; 4253 ideal_cache_percent = free_percent; 4254 } 4255 4256 /* 4257 * We only want to start kthread caching if we are at 4258 * the point where we will wait for caching to make 4259 * progress, or if our ideal search is over and we've 4260 * found somebody to start caching. 4261 */ 4262 if (loop > LOOP_CACHING_NOWAIT || 4263 (loop > LOOP_FIND_IDEAL && 4264 atomic_read(&space_info->caching_threads) < 2)) { 4265 ret = cache_block_group(block_group); 4266 BUG_ON(ret); 4267 } 4268 found_uncached_bg = true; 4269 4270 /* 4271 * If loop is set for cached only, try the next block 4272 * group. 4273 */ 4274 if (loop == LOOP_FIND_IDEAL) 4275 goto loop; 4276 } 4277 4278 cached = block_group_cache_done(block_group); 4279 if (unlikely(!cached)) 4280 found_uncached_bg = true; 4281 4282 if (unlikely(block_group->ro)) 4283 goto loop; 4284 4285 /* 4286 * Ok we want to try and use the cluster allocator, so lets look 4287 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 4288 * have tried the cluster allocator plenty of times at this 4289 * point and not have found anything, so we are likely way too 4290 * fragmented for the clustering stuff to find anything, so lets 4291 * just skip it and let the allocator find whatever block it can 4292 * find 4293 */ 4294 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 4295 /* 4296 * the refill lock keeps out other 4297 * people trying to start a new cluster 4298 */ 4299 spin_lock(&last_ptr->refill_lock); 4300 if (last_ptr->block_group && 4301 (last_ptr->block_group->ro || 4302 !block_group_bits(last_ptr->block_group, data))) { 4303 offset = 0; 4304 goto refill_cluster; 4305 } 4306 4307 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 4308 num_bytes, search_start); 4309 if (offset) { 4310 /* we have a block, we're done */ 4311 spin_unlock(&last_ptr->refill_lock); 4312 goto checks; 4313 } 4314 4315 spin_lock(&last_ptr->lock); 4316 /* 4317 * whoops, this cluster doesn't actually point to 4318 * this block group. 
Get a ref on the block 4319 * group is does point to and try again 4320 */ 4321 if (!last_ptr_loop && last_ptr->block_group && 4322 last_ptr->block_group != block_group) { 4323 4324 btrfs_put_block_group(block_group); 4325 block_group = last_ptr->block_group; 4326 btrfs_get_block_group(block_group); 4327 spin_unlock(&last_ptr->lock); 4328 spin_unlock(&last_ptr->refill_lock); 4329 4330 last_ptr_loop = 1; 4331 search_start = block_group->key.objectid; 4332 /* 4333 * we know this block group is properly 4334 * in the list because 4335 * btrfs_remove_block_group, drops the 4336 * cluster before it removes the block 4337 * group from the list 4338 */ 4339 goto have_block_group; 4340 } 4341 spin_unlock(&last_ptr->lock); 4342 refill_cluster: 4343 /* 4344 * this cluster didn't work out, free it and 4345 * start over 4346 */ 4347 btrfs_return_cluster_to_free_space(NULL, last_ptr); 4348 4349 last_ptr_loop = 0; 4350 4351 /* allocate a cluster in this block group */ 4352 ret = btrfs_find_space_cluster(trans, root, 4353 block_group, last_ptr, 4354 offset, num_bytes, 4355 empty_cluster + empty_size); 4356 if (ret == 0) { 4357 /* 4358 * now pull our allocation out of this 4359 * cluster 4360 */ 4361 offset = btrfs_alloc_from_cluster(block_group, 4362 last_ptr, num_bytes, 4363 search_start); 4364 if (offset) { 4365 /* we found one, proceed */ 4366 spin_unlock(&last_ptr->refill_lock); 4367 goto checks; 4368 } 4369 } else if (!cached && loop > LOOP_CACHING_NOWAIT 4370 && !failed_cluster_refill) { 4371 spin_unlock(&last_ptr->refill_lock); 4372 4373 failed_cluster_refill = true; 4374 wait_block_group_cache_progress(block_group, 4375 num_bytes + empty_cluster + empty_size); 4376 goto have_block_group; 4377 } 4378 4379 /* 4380 * at this point we either didn't find a cluster 4381 * or we weren't able to allocate a block from our 4382 * cluster. Free the cluster we've been trying 4383 * to use, and go to the next block group 4384 */ 4385 btrfs_return_cluster_to_free_space(NULL, last_ptr); 4386 spin_unlock(&last_ptr->refill_lock); 4387 goto loop; 4388 } 4389 4390 offset = btrfs_find_space_for_alloc(block_group, search_start, 4391 num_bytes, empty_size); 4392 /* 4393 * If we didn't find a chunk, and we haven't failed on this 4394 * block group before, and this block group is in the middle of 4395 * caching and we are ok with waiting, then go ahead and wait 4396 * for progress to be made, and set failed_alloc to true. 4397 * 4398 * If failed_alloc is true then we've already waited on this 4399 * block group once and should move on to the next block group. 
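		 *
		 * (wait_block_group_cache_progress() returns as soon as the
		 *  caching thread has found at least num_bytes + empty_size
		 *  of free space in this group, or caching has finished.)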
4400 */ 4401 if (!offset && !failed_alloc && !cached && 4402 loop > LOOP_CACHING_NOWAIT) { 4403 wait_block_group_cache_progress(block_group, 4404 num_bytes + empty_size); 4405 failed_alloc = true; 4406 goto have_block_group; 4407 } else if (!offset) { 4408 goto loop; 4409 } 4410 checks: 4411 search_start = stripe_align(root, offset); 4412 /* move on to the next group */ 4413 if (search_start + num_bytes >= search_end) { 4414 btrfs_add_free_space(block_group, offset, num_bytes); 4415 goto loop; 4416 } 4417 4418 /* move on to the next group */ 4419 if (search_start + num_bytes > 4420 block_group->key.objectid + block_group->key.offset) { 4421 btrfs_add_free_space(block_group, offset, num_bytes); 4422 goto loop; 4423 } 4424 4425 if (exclude_nr > 0 && 4426 (search_start + num_bytes > exclude_start && 4427 search_start < exclude_start + exclude_nr)) { 4428 search_start = exclude_start + exclude_nr; 4429 4430 btrfs_add_free_space(block_group, offset, num_bytes); 4431 /* 4432 * if search_start is still in this block group 4433 * then we just re-search this block group 4434 */ 4435 if (search_start >= block_group->key.objectid && 4436 search_start < (block_group->key.objectid + 4437 block_group->key.offset)) 4438 goto have_block_group; 4439 goto loop; 4440 } 4441 4442 ins->objectid = search_start; 4443 ins->offset = num_bytes; 4444 4445 if (offset < search_start) 4446 btrfs_add_free_space(block_group, offset, 4447 search_start - offset); 4448 BUG_ON(offset > search_start); 4449 4450 update_reserved_extents(block_group, num_bytes, 1); 4451 4452 /* we are all good, lets return */ 4453 break; 4454 loop: 4455 failed_cluster_refill = false; 4456 failed_alloc = false; 4457 btrfs_put_block_group(block_group); 4458 } 4459 up_read(&space_info->groups_sem); 4460 4461 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4462 * for them to make caching progress. Also 4463 * determine the best possible bg to cache 4464 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 4465 * caching kthreads as we move along 4466 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 4467 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 4468 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 4469 * again 4470 */ 4471 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4472 (found_uncached_bg || empty_size || empty_cluster || 4473 allowed_chunk_alloc)) { 4474 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4475 found_uncached_bg = false; 4476 loop++; 4477 if (!ideal_cache_percent && 4478 atomic_read(&space_info->caching_threads)) 4479 goto search; 4480 4481 /* 4482 * 1 of the following 2 things have happened so far 4483 * 4484 * 1) We found an ideal block group for caching that 4485 * is mostly full and will cache quickly, so we might 4486 * as well wait for it. 4487 * 4488 * 2) We searched for cached only and we didn't find 4489 * anything, and we didn't start any caching kthreads 4490 * either, so chances are we will loop through and 4491 * start a couple caching kthreads, and then come back 4492 * around and just wait for them. This will be slower 4493 * because we will have 2 caching kthreads reading at 4494 * the same time when we could have just started one 4495 * and waited for it to get far enough to give us an 4496 * allocation, so go ahead and go to the wait caching 4497 * loop. 
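			 *
			 * (either way the search is restarted at
			 *  ideal_cache_offset through the ideal_cache label
			 *  instead of walking the whole block group list from
			 *  the beginning.)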
4498 */ 4499 loop = LOOP_CACHING_WAIT; 4500 search_start = ideal_cache_offset; 4501 ideal_cache_percent = 0; 4502 goto ideal_cache; 4503 } else if (loop == LOOP_FIND_IDEAL) { 4504 /* 4505 * Didn't find a uncached bg, wait on anything we find 4506 * next. 4507 */ 4508 loop = LOOP_CACHING_WAIT; 4509 goto search; 4510 } 4511 4512 if (loop < LOOP_CACHING_WAIT) { 4513 loop++; 4514 goto search; 4515 } 4516 4517 if (loop == LOOP_ALLOC_CHUNK) { 4518 empty_size = 0; 4519 empty_cluster = 0; 4520 } 4521 4522 if (allowed_chunk_alloc) { 4523 ret = do_chunk_alloc(trans, root, num_bytes + 4524 2 * 1024 * 1024, data, 1); 4525 allowed_chunk_alloc = 0; 4526 done_chunk_alloc = 1; 4527 } else if (!done_chunk_alloc) { 4528 space_info->force_alloc = 1; 4529 } 4530 4531 if (loop < LOOP_NO_EMPTY_SIZE) { 4532 loop++; 4533 goto search; 4534 } 4535 ret = -ENOSPC; 4536 } else if (!ins->objectid) { 4537 ret = -ENOSPC; 4538 } 4539 4540 /* we found what we needed */ 4541 if (ins->objectid) { 4542 if (!(data & BTRFS_BLOCK_GROUP_DATA)) 4543 trans->block_group = block_group->key.objectid; 4544 4545 btrfs_put_block_group(block_group); 4546 ret = 0; 4547 } 4548 4549 return ret; 4550 } 4551 4552 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 4553 int dump_block_groups) 4554 { 4555 struct btrfs_block_group_cache *cache; 4556 4557 spin_lock(&info->lock); 4558 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4559 (unsigned long long)(info->total_bytes - info->bytes_used - 4560 info->bytes_pinned - info->bytes_reserved - 4561 info->bytes_super), 4562 (info->full) ? "" : "not "); 4563 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4564 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4565 "\n", 4566 (unsigned long long)info->total_bytes, 4567 (unsigned long long)info->bytes_pinned, 4568 (unsigned long long)info->bytes_delalloc, 4569 (unsigned long long)info->bytes_may_use, 4570 (unsigned long long)info->bytes_used, 4571 (unsigned long long)info->bytes_root, 4572 (unsigned long long)info->bytes_super, 4573 (unsigned long long)info->bytes_reserved); 4574 spin_unlock(&info->lock); 4575 4576 if (!dump_block_groups) 4577 return; 4578 4579 down_read(&info->groups_sem); 4580 list_for_each_entry(cache, &info->block_groups, list) { 4581 spin_lock(&cache->lock); 4582 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4583 "%llu pinned %llu reserved\n", 4584 (unsigned long long)cache->key.objectid, 4585 (unsigned long long)cache->key.offset, 4586 (unsigned long long)btrfs_block_group_used(&cache->item), 4587 (unsigned long long)cache->pinned, 4588 (unsigned long long)cache->reserved); 4589 btrfs_dump_free_space(cache, bytes); 4590 spin_unlock(&cache->lock); 4591 } 4592 up_read(&info->groups_sem); 4593 } 4594 4595 int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 4596 struct btrfs_root *root, 4597 u64 num_bytes, u64 min_alloc_size, 4598 u64 empty_size, u64 hint_byte, 4599 u64 search_end, struct btrfs_key *ins, 4600 u64 data) 4601 { 4602 int ret; 4603 u64 search_start = 0; 4604 4605 data = btrfs_get_alloc_profile(root, data); 4606 again: 4607 /* 4608 * the only place that sets empty_size is btrfs_realloc_node, which 4609 * is not called recursively on allocations 4610 */ 4611 if (empty_size || root->ref_cows) 4612 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4613 num_bytes + 2 * 1024 * 1024, data, 0); 4614 4615 WARN_ON(num_bytes < root->sectorsize); 4616 ret = find_free_extent(trans, root, num_bytes, empty_size, 4617 search_start, search_end, 
hint_byte, ins, 4618 trans->alloc_exclude_start, 4619 trans->alloc_exclude_nr, data); 4620 4621 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 4622 num_bytes = num_bytes >> 1; 4623 num_bytes = num_bytes & ~(root->sectorsize - 1); 4624 num_bytes = max(num_bytes, min_alloc_size); 4625 do_chunk_alloc(trans, root->fs_info->extent_root, 4626 num_bytes, data, 1); 4627 goto again; 4628 } 4629 if (ret == -ENOSPC) { 4630 struct btrfs_space_info *sinfo; 4631 4632 sinfo = __find_space_info(root->fs_info, data); 4633 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4634 "wanted %llu\n", (unsigned long long)data, 4635 (unsigned long long)num_bytes); 4636 dump_space_info(sinfo, num_bytes, 1); 4637 } 4638 4639 return ret; 4640 } 4641 4642 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 4643 { 4644 struct btrfs_block_group_cache *cache; 4645 int ret = 0; 4646 4647 cache = btrfs_lookup_block_group(root->fs_info, start); 4648 if (!cache) { 4649 printk(KERN_ERR "Unable to find block group for %llu\n", 4650 (unsigned long long)start); 4651 return -ENOSPC; 4652 } 4653 4654 ret = btrfs_discard_extent(root, start, len); 4655 4656 btrfs_add_free_space(cache, start, len); 4657 update_reserved_extents(cache, len, 0); 4658 btrfs_put_block_group(cache); 4659 4660 return ret; 4661 } 4662 4663 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4664 struct btrfs_root *root, 4665 u64 parent, u64 root_objectid, 4666 u64 flags, u64 owner, u64 offset, 4667 struct btrfs_key *ins, int ref_mod) 4668 { 4669 int ret; 4670 struct btrfs_fs_info *fs_info = root->fs_info; 4671 struct btrfs_extent_item *extent_item; 4672 struct btrfs_extent_inline_ref *iref; 4673 struct btrfs_path *path; 4674 struct extent_buffer *leaf; 4675 int type; 4676 u32 size; 4677 4678 if (parent > 0) 4679 type = BTRFS_SHARED_DATA_REF_KEY; 4680 else 4681 type = BTRFS_EXTENT_DATA_REF_KEY; 4682 4683 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 4684 4685 path = btrfs_alloc_path(); 4686 BUG_ON(!path); 4687 4688 path->leave_spinning = 1; 4689 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 4690 ins, size); 4691 BUG_ON(ret); 4692 4693 leaf = path->nodes[0]; 4694 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4695 struct btrfs_extent_item); 4696 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 4697 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4698 btrfs_set_extent_flags(leaf, extent_item, 4699 flags | BTRFS_EXTENT_FLAG_DATA); 4700 4701 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4702 btrfs_set_extent_inline_ref_type(leaf, iref, type); 4703 if (parent > 0) { 4704 struct btrfs_shared_data_ref *ref; 4705 ref = (struct btrfs_shared_data_ref *)(iref + 1); 4706 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4707 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 4708 } else { 4709 struct btrfs_extent_data_ref *ref; 4710 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 4711 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 4712 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 4713 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 4714 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 4715 } 4716 4717 btrfs_mark_buffer_dirty(path->nodes[0]); 4718 btrfs_free_path(path); 4719 4720 ret = update_block_group(trans, root, ins->objectid, ins->offset, 4721 1, 0); 4722 if (ret) { 4723 printk(KERN_ERR "btrfs update block group failed for %llu " 4724 "%llu\n", (unsigned long long)ins->objectid, 4725 (unsigned long 
long)ins->offset); 4726 BUG(); 4727 } 4728 return ret; 4729 } 4730 4731 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 4732 struct btrfs_root *root, 4733 u64 parent, u64 root_objectid, 4734 u64 flags, struct btrfs_disk_key *key, 4735 int level, struct btrfs_key *ins) 4736 { 4737 int ret; 4738 struct btrfs_fs_info *fs_info = root->fs_info; 4739 struct btrfs_extent_item *extent_item; 4740 struct btrfs_tree_block_info *block_info; 4741 struct btrfs_extent_inline_ref *iref; 4742 struct btrfs_path *path; 4743 struct extent_buffer *leaf; 4744 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); 4745 4746 path = btrfs_alloc_path(); 4747 BUG_ON(!path); 4748 4749 path->leave_spinning = 1; 4750 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 4751 ins, size); 4752 BUG_ON(ret); 4753 4754 leaf = path->nodes[0]; 4755 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4756 struct btrfs_extent_item); 4757 btrfs_set_extent_refs(leaf, extent_item, 1); 4758 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4759 btrfs_set_extent_flags(leaf, extent_item, 4760 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 4761 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 4762 4763 btrfs_set_tree_block_key(leaf, block_info, key); 4764 btrfs_set_tree_block_level(leaf, block_info, level); 4765 4766 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 4767 if (parent > 0) { 4768 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 4769 btrfs_set_extent_inline_ref_type(leaf, iref, 4770 BTRFS_SHARED_BLOCK_REF_KEY); 4771 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4772 } else { 4773 btrfs_set_extent_inline_ref_type(leaf, iref, 4774 BTRFS_TREE_BLOCK_REF_KEY); 4775 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 4776 } 4777 4778 btrfs_mark_buffer_dirty(leaf); 4779 btrfs_free_path(path); 4780 4781 ret = update_block_group(trans, root, ins->objectid, ins->offset, 4782 1, 0); 4783 if (ret) { 4784 printk(KERN_ERR "btrfs update block group failed for %llu " 4785 "%llu\n", (unsigned long long)ins->objectid, 4786 (unsigned long long)ins->offset); 4787 BUG(); 4788 } 4789 return ret; 4790 } 4791 4792 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4793 struct btrfs_root *root, 4794 u64 root_objectid, u64 owner, 4795 u64 offset, struct btrfs_key *ins) 4796 { 4797 int ret; 4798 4799 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 4800 4801 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 4802 0, root_objectid, owner, offset, 4803 BTRFS_ADD_DELAYED_EXTENT, NULL); 4804 return ret; 4805 } 4806 4807 /* 4808 * this is used by the tree logging recovery code. 
It records that 4809 * an extent has been allocated and makes sure to clear the free 4810 * space cache bits as well 4811 */ 4812 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 4813 struct btrfs_root *root, 4814 u64 root_objectid, u64 owner, u64 offset, 4815 struct btrfs_key *ins) 4816 { 4817 int ret; 4818 struct btrfs_block_group_cache *block_group; 4819 struct btrfs_caching_control *caching_ctl; 4820 u64 start = ins->objectid; 4821 u64 num_bytes = ins->offset; 4822 4823 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4824 cache_block_group(block_group); 4825 caching_ctl = get_caching_control(block_group); 4826 4827 if (!caching_ctl) { 4828 BUG_ON(!block_group_cache_done(block_group)); 4829 ret = btrfs_remove_free_space(block_group, start, num_bytes); 4830 BUG_ON(ret); 4831 } else { 4832 mutex_lock(&caching_ctl->mutex); 4833 4834 if (start >= caching_ctl->progress) { 4835 ret = add_excluded_extent(root, start, num_bytes); 4836 BUG_ON(ret); 4837 } else if (start + num_bytes <= caching_ctl->progress) { 4838 ret = btrfs_remove_free_space(block_group, 4839 start, num_bytes); 4840 BUG_ON(ret); 4841 } else { 4842 num_bytes = caching_ctl->progress - start; 4843 ret = btrfs_remove_free_space(block_group, 4844 start, num_bytes); 4845 BUG_ON(ret); 4846 4847 start = caching_ctl->progress; 4848 num_bytes = ins->objectid + ins->offset - 4849 caching_ctl->progress; 4850 ret = add_excluded_extent(root, start, num_bytes); 4851 BUG_ON(ret); 4852 } 4853 4854 mutex_unlock(&caching_ctl->mutex); 4855 put_caching_control(caching_ctl); 4856 } 4857 4858 update_reserved_extents(block_group, ins->offset, 1); 4859 btrfs_put_block_group(block_group); 4860 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 4861 0, owner, offset, ins, 1); 4862 return ret; 4863 } 4864 4865 /* 4866 * finds a free extent and does all the dirty work required for allocation 4867 * returns the key for the extent through ins, and a tree buffer for 4868 * the first block of the extent through buf. 4869 * 4870 * returns 0 if everything worked, non-zero otherwise. 
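 *
 * (the tree buffer itself is not touched here; btrfs_alloc_free_block()
 *  turns the reserved extent into a locked extent_buffer afterwards via
 *  btrfs_init_new_buffer().)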
4871 */ 4872 static int alloc_tree_block(struct btrfs_trans_handle *trans, 4873 struct btrfs_root *root, 4874 u64 num_bytes, u64 parent, u64 root_objectid, 4875 struct btrfs_disk_key *key, int level, 4876 u64 empty_size, u64 hint_byte, u64 search_end, 4877 struct btrfs_key *ins) 4878 { 4879 int ret; 4880 u64 flags = 0; 4881 4882 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4883 empty_size, hint_byte, search_end, 4884 ins, 0); 4885 if (ret) 4886 return ret; 4887 4888 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4889 if (parent == 0) 4890 parent = ins->objectid; 4891 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 4892 } else 4893 BUG_ON(parent > 0); 4894 4895 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4896 struct btrfs_delayed_extent_op *extent_op; 4897 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 4898 BUG_ON(!extent_op); 4899 if (key) 4900 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 4901 else 4902 memset(&extent_op->key, 0, sizeof(extent_op->key)); 4903 extent_op->flags_to_set = flags; 4904 extent_op->update_key = 1; 4905 extent_op->update_flags = 1; 4906 extent_op->is_data = 0; 4907 4908 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, 4909 ins->offset, parent, root_objectid, 4910 level, BTRFS_ADD_DELAYED_EXTENT, 4911 extent_op); 4912 BUG_ON(ret); 4913 } 4914 4915 if (root_objectid == root->root_key.objectid) { 4916 u64 used; 4917 spin_lock(&root->node_lock); 4918 used = btrfs_root_used(&root->root_item) + num_bytes; 4919 btrfs_set_root_used(&root->root_item, used); 4920 spin_unlock(&root->node_lock); 4921 } 4922 return ret; 4923 } 4924 4925 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 4926 struct btrfs_root *root, 4927 u64 bytenr, u32 blocksize, 4928 int level) 4929 { 4930 struct extent_buffer *buf; 4931 4932 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 4933 if (!buf) 4934 return ERR_PTR(-ENOMEM); 4935 btrfs_set_header_generation(buf, trans->transid); 4936 btrfs_set_buffer_lockdep_class(buf, level); 4937 btrfs_tree_lock(buf); 4938 clean_tree_block(trans, root, buf); 4939 4940 btrfs_set_lock_blocking(buf); 4941 btrfs_set_buffer_uptodate(buf); 4942 4943 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 4944 /* 4945 * we allow two log transactions at a time, use different 4946 * EXENT bit to differentiate dirty pages. 4947 */ 4948 if (root->log_transid % 2 == 0) 4949 set_extent_dirty(&root->dirty_log_pages, buf->start, 4950 buf->start + buf->len - 1, GFP_NOFS); 4951 else 4952 set_extent_new(&root->dirty_log_pages, buf->start, 4953 buf->start + buf->len - 1, GFP_NOFS); 4954 } else { 4955 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 4956 buf->start + buf->len - 1, GFP_NOFS); 4957 } 4958 trans->blocks_used++; 4959 /* this returns a buffer locked for blocking */ 4960 return buf; 4961 } 4962 4963 /* 4964 * helper function to allocate a block for a given tree 4965 * returns the tree buffer or NULL. 
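 *
 * (on failure this actually comes back as an ERR_PTR() rather than NULL,
 *  so the result should be checked with IS_ERR().)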
4966 */ 4967 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 4968 struct btrfs_root *root, u32 blocksize, 4969 u64 parent, u64 root_objectid, 4970 struct btrfs_disk_key *key, int level, 4971 u64 hint, u64 empty_size) 4972 { 4973 struct btrfs_key ins; 4974 int ret; 4975 struct extent_buffer *buf; 4976 4977 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 4978 key, level, empty_size, hint, (u64)-1, &ins); 4979 if (ret) { 4980 BUG_ON(ret > 0); 4981 return ERR_PTR(ret); 4982 } 4983 4984 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 4985 blocksize, level); 4986 return buf; 4987 } 4988 4989 struct walk_control { 4990 u64 refs[BTRFS_MAX_LEVEL]; 4991 u64 flags[BTRFS_MAX_LEVEL]; 4992 struct btrfs_key update_progress; 4993 int stage; 4994 int level; 4995 int shared_level; 4996 int update_ref; 4997 int keep_locks; 4998 int reada_slot; 4999 int reada_count; 5000 }; 5001 5002 #define DROP_REFERENCE 1 5003 #define UPDATE_BACKREF 2 5004 5005 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 5006 struct btrfs_root *root, 5007 struct walk_control *wc, 5008 struct btrfs_path *path) 5009 { 5010 u64 bytenr; 5011 u64 generation; 5012 u64 refs; 5013 u64 flags; 5014 u64 last = 0; 5015 u32 nritems; 5016 u32 blocksize; 5017 struct btrfs_key key; 5018 struct extent_buffer *eb; 5019 int ret; 5020 int slot; 5021 int nread = 0; 5022 5023 if (path->slots[wc->level] < wc->reada_slot) { 5024 wc->reada_count = wc->reada_count * 2 / 3; 5025 wc->reada_count = max(wc->reada_count, 2); 5026 } else { 5027 wc->reada_count = wc->reada_count * 3 / 2; 5028 wc->reada_count = min_t(int, wc->reada_count, 5029 BTRFS_NODEPTRS_PER_BLOCK(root)); 5030 } 5031 5032 eb = path->nodes[wc->level]; 5033 nritems = btrfs_header_nritems(eb); 5034 blocksize = btrfs_level_size(root, wc->level - 1); 5035 5036 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 5037 if (nread >= wc->reada_count) 5038 break; 5039 5040 cond_resched(); 5041 bytenr = btrfs_node_blockptr(eb, slot); 5042 generation = btrfs_node_ptr_generation(eb, slot); 5043 5044 if (slot == path->slots[wc->level]) 5045 goto reada; 5046 5047 if (wc->stage == UPDATE_BACKREF && 5048 generation <= root->root_key.offset) 5049 continue; 5050 5051 /* We don't lock the tree block, it's OK to be racy here */ 5052 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5053 &refs, &flags); 5054 BUG_ON(ret); 5055 BUG_ON(refs == 0); 5056 5057 if (wc->stage == DROP_REFERENCE) { 5058 if (refs == 1) 5059 goto reada; 5060 5061 if (wc->level == 1 && 5062 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5063 continue; 5064 if (!wc->update_ref || 5065 generation <= root->root_key.offset) 5066 continue; 5067 btrfs_node_key_to_cpu(eb, &key, slot); 5068 ret = btrfs_comp_cpu_keys(&key, 5069 &wc->update_progress); 5070 if (ret < 0) 5071 continue; 5072 } else { 5073 if (wc->level == 1 && 5074 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5075 continue; 5076 } 5077 reada: 5078 ret = readahead_tree_block(root, bytenr, blocksize, 5079 generation); 5080 if (ret) 5081 break; 5082 last = bytenr + blocksize; 5083 nread++; 5084 } 5085 wc->reada_slot = slot; 5086 } 5087 5088 /* 5089 * hepler to process tree block while walking down the tree. 5090 * 5091 * when wc->stage == UPDATE_BACKREF, this function updates 5092 * back refs for pointers in the block. 5093 * 5094 * NOTE: return value 1 means we should stop walking down. 
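 *
 * (DROP_REFERENCE is the normal stage; do_walk_down() switches to
 *  UPDATE_BACKREF when it hits a shared block whose backrefs need
 *  updating, and walk_up_proc() switches back to DROP_REFERENCE once
 *  that shared subtree has been processed.)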
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, root,
					       eb->start, eb->len,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret);
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock(eb);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret);
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret);
		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
						  eb->len, flag, 0);
		BUG_ON(ret);
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock(eb);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need to update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
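 *
 * (in the DROP_REFERENCE stage, blocks that are skipped are dropped
 *  through btrfs_free_extent() at the skip: label, passing the current
 *  node as parent only if the FULL_BACKREF flag is set.)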
5173 */ 5174 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5175 struct btrfs_root *root, 5176 struct btrfs_path *path, 5177 struct walk_control *wc, int *lookup_info) 5178 { 5179 u64 bytenr; 5180 u64 generation; 5181 u64 parent; 5182 u32 blocksize; 5183 struct btrfs_key key; 5184 struct extent_buffer *next; 5185 int level = wc->level; 5186 int reada = 0; 5187 int ret = 0; 5188 5189 generation = btrfs_node_ptr_generation(path->nodes[level], 5190 path->slots[level]); 5191 /* 5192 * if the lower level block was created before the snapshot 5193 * was created, we know there is no need to update back refs 5194 * for the subtree 5195 */ 5196 if (wc->stage == UPDATE_BACKREF && 5197 generation <= root->root_key.offset) { 5198 *lookup_info = 1; 5199 return 1; 5200 } 5201 5202 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5203 blocksize = btrfs_level_size(root, level - 1); 5204 5205 next = btrfs_find_tree_block(root, bytenr, blocksize); 5206 if (!next) { 5207 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5208 reada = 1; 5209 } 5210 btrfs_tree_lock(next); 5211 btrfs_set_lock_blocking(next); 5212 5213 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5214 &wc->refs[level - 1], 5215 &wc->flags[level - 1]); 5216 BUG_ON(ret); 5217 BUG_ON(wc->refs[level - 1] == 0); 5218 *lookup_info = 0; 5219 5220 if (wc->stage == DROP_REFERENCE) { 5221 if (wc->refs[level - 1] > 1) { 5222 if (level == 1 && 5223 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5224 goto skip; 5225 5226 if (!wc->update_ref || 5227 generation <= root->root_key.offset) 5228 goto skip; 5229 5230 btrfs_node_key_to_cpu(path->nodes[level], &key, 5231 path->slots[level]); 5232 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 5233 if (ret < 0) 5234 goto skip; 5235 5236 wc->stage = UPDATE_BACKREF; 5237 wc->shared_level = level - 1; 5238 } 5239 } else { 5240 if (level == 1 && 5241 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5242 goto skip; 5243 } 5244 5245 if (!btrfs_buffer_uptodate(next, generation)) { 5246 btrfs_tree_unlock(next); 5247 free_extent_buffer(next); 5248 next = NULL; 5249 *lookup_info = 1; 5250 } 5251 5252 if (!next) { 5253 if (reada && level == 1) 5254 reada_walk_down(trans, root, wc, path); 5255 next = read_tree_block(root, bytenr, blocksize, generation); 5256 btrfs_tree_lock(next); 5257 btrfs_set_lock_blocking(next); 5258 } 5259 5260 level--; 5261 BUG_ON(level != btrfs_header_level(next)); 5262 path->nodes[level] = next; 5263 path->slots[level] = 0; 5264 path->locks[level] = 1; 5265 wc->level = level; 5266 if (wc->level == 1) 5267 wc->reada_slot = 0; 5268 return 0; 5269 skip: 5270 wc->refs[level - 1] = 0; 5271 wc->flags[level - 1] = 0; 5272 if (wc->stage == DROP_REFERENCE) { 5273 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5274 parent = path->nodes[level]->start; 5275 } else { 5276 BUG_ON(root->root_key.objectid != 5277 btrfs_header_owner(path->nodes[level])); 5278 parent = 0; 5279 } 5280 5281 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 5282 root->root_key.objectid, level - 1, 0); 5283 BUG_ON(ret); 5284 } 5285 btrfs_tree_unlock(next); 5286 free_extent_buffer(next); 5287 *lookup_info = 1; 5288 return 1; 5289 } 5290 5291 /* 5292 * hepler to process tree block while walking up the tree. 5293 * 5294 * when wc->stage == DROP_REFERENCE, this function drops 5295 * reference count on the block. 
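 * (the free itself goes through btrfs_free_extent(), which queues a
 * delayed ref for the block rather than editing the extent tree in
 * place.)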
5296 * 5297 * when wc->stage == UPDATE_BACKREF, this function changes 5298 * wc->stage back to DROP_REFERENCE if we changed wc->stage 5299 * to UPDATE_BACKREF previously while processing the block. 5300 * 5301 * NOTE: return value 1 means we should stop walking up. 5302 */ 5303 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 5304 struct btrfs_root *root, 5305 struct btrfs_path *path, 5306 struct walk_control *wc) 5307 { 5308 int ret = 0; 5309 int level = wc->level; 5310 struct extent_buffer *eb = path->nodes[level]; 5311 u64 parent = 0; 5312 5313 if (wc->stage == UPDATE_BACKREF) { 5314 BUG_ON(wc->shared_level < level); 5315 if (level < wc->shared_level) 5316 goto out; 5317 5318 ret = find_next_key(path, level + 1, &wc->update_progress); 5319 if (ret > 0) 5320 wc->update_ref = 0; 5321 5322 wc->stage = DROP_REFERENCE; 5323 wc->shared_level = -1; 5324 path->slots[level] = 0; 5325 5326 /* 5327 * check reference count again if the block isn't locked. 5328 * we should start walking down the tree again if reference 5329 * count is one. 5330 */ 5331 if (!path->locks[level]) { 5332 BUG_ON(level == 0); 5333 btrfs_tree_lock(eb); 5334 btrfs_set_lock_blocking(eb); 5335 path->locks[level] = 1; 5336 5337 ret = btrfs_lookup_extent_info(trans, root, 5338 eb->start, eb->len, 5339 &wc->refs[level], 5340 &wc->flags[level]); 5341 BUG_ON(ret); 5342 BUG_ON(wc->refs[level] == 0); 5343 if (wc->refs[level] == 1) { 5344 btrfs_tree_unlock(eb); 5345 path->locks[level] = 0; 5346 return 1; 5347 } 5348 } 5349 } 5350 5351 /* wc->stage == DROP_REFERENCE */ 5352 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 5353 5354 if (wc->refs[level] == 1) { 5355 if (level == 0) { 5356 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5357 ret = btrfs_dec_ref(trans, root, eb, 1); 5358 else 5359 ret = btrfs_dec_ref(trans, root, eb, 0); 5360 BUG_ON(ret); 5361 } 5362 /* make block locked assertion in clean_tree_block happy */ 5363 if (!path->locks[level] && 5364 btrfs_header_generation(eb) == trans->transid) { 5365 btrfs_tree_lock(eb); 5366 btrfs_set_lock_blocking(eb); 5367 path->locks[level] = 1; 5368 } 5369 clean_tree_block(trans, root, eb); 5370 } 5371 5372 if (eb == root->node) { 5373 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5374 parent = eb->start; 5375 else 5376 BUG_ON(root->root_key.objectid != 5377 btrfs_header_owner(eb)); 5378 } else { 5379 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5380 parent = path->nodes[level + 1]->start; 5381 else 5382 BUG_ON(root->root_key.objectid != 5383 btrfs_header_owner(path->nodes[level + 1])); 5384 } 5385 5386 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5387 root->root_key.objectid, level, 0); 5388 BUG_ON(ret); 5389 out: 5390 wc->refs[level] = 0; 5391 wc->flags[level] = 0; 5392 return ret; 5393 } 5394 5395 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5396 struct btrfs_root *root, 5397 struct btrfs_path *path, 5398 struct walk_control *wc) 5399 { 5400 int level = wc->level; 5401 int lookup_info = 1; 5402 int ret; 5403 5404 while (level >= 0) { 5405 ret = walk_down_proc(trans, root, path, wc, lookup_info); 5406 if (ret > 0) 5407 break; 5408 5409 if (level == 0) 5410 break; 5411 5412 if (path->slots[level] >= 5413 btrfs_header_nritems(path->nodes[level])) 5414 break; 5415 5416 ret = do_walk_down(trans, root, path, wc, &lookup_info); 5417 if (ret > 0) { 5418 path->slots[level]++; 5419 continue; 5420 } 5421 level = wc->level; 5422 } 5423 return 0; 5424 } 5425 5426 static noinline int walk_up_tree(struct 
btrfs_trans_handle *trans, 5427 struct btrfs_root *root, 5428 struct btrfs_path *path, 5429 struct walk_control *wc, int max_level) 5430 { 5431 int level = wc->level; 5432 int ret; 5433 5434 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 5435 while (level < max_level && path->nodes[level]) { 5436 wc->level = level; 5437 if (path->slots[level] + 1 < 5438 btrfs_header_nritems(path->nodes[level])) { 5439 path->slots[level]++; 5440 return 0; 5441 } else { 5442 ret = walk_up_proc(trans, root, path, wc); 5443 if (ret > 0) 5444 return 0; 5445 5446 if (path->locks[level]) { 5447 btrfs_tree_unlock(path->nodes[level]); 5448 path->locks[level] = 0; 5449 } 5450 free_extent_buffer(path->nodes[level]); 5451 path->nodes[level] = NULL; 5452 level++; 5453 } 5454 } 5455 return 1; 5456 } 5457 5458 /* 5459 * drop a subvolume tree. 5460 * 5461 * this function traverses the tree freeing any blocks that only 5462 * referenced by the tree. 5463 * 5464 * when a shared tree block is found. this function decreases its 5465 * reference count by one. if update_ref is true, this function 5466 * also make sure backrefs for the shared block and all lower level 5467 * blocks are properly updated. 5468 */ 5469 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5470 { 5471 struct btrfs_path *path; 5472 struct btrfs_trans_handle *trans; 5473 struct btrfs_root *tree_root = root->fs_info->tree_root; 5474 struct btrfs_root_item *root_item = &root->root_item; 5475 struct walk_control *wc; 5476 struct btrfs_key key; 5477 int err = 0; 5478 int ret; 5479 int level; 5480 5481 path = btrfs_alloc_path(); 5482 BUG_ON(!path); 5483 5484 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5485 BUG_ON(!wc); 5486 5487 trans = btrfs_start_transaction(tree_root, 1); 5488 5489 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5490 level = btrfs_header_level(root->node); 5491 path->nodes[level] = btrfs_lock_root_node(root); 5492 btrfs_set_lock_blocking(path->nodes[level]); 5493 path->slots[level] = 0; 5494 path->locks[level] = 1; 5495 memset(&wc->update_progress, 0, 5496 sizeof(wc->update_progress)); 5497 } else { 5498 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 5499 memcpy(&wc->update_progress, &key, 5500 sizeof(wc->update_progress)); 5501 5502 level = root_item->drop_level; 5503 BUG_ON(level == 0); 5504 path->lowest_level = level; 5505 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5506 path->lowest_level = 0; 5507 if (ret < 0) { 5508 err = ret; 5509 goto out; 5510 } 5511 WARN_ON(ret > 0); 5512 5513 /* 5514 * unlock our path, this is safe because only this 5515 * function is allowed to delete this snapshot 5516 */ 5517 btrfs_unlock_up_safe(path, 0); 5518 5519 level = btrfs_header_level(root->node); 5520 while (1) { 5521 btrfs_tree_lock(path->nodes[level]); 5522 btrfs_set_lock_blocking(path->nodes[level]); 5523 5524 ret = btrfs_lookup_extent_info(trans, root, 5525 path->nodes[level]->start, 5526 path->nodes[level]->len, 5527 &wc->refs[level], 5528 &wc->flags[level]); 5529 BUG_ON(ret); 5530 BUG_ON(wc->refs[level] == 0); 5531 5532 if (level == root_item->drop_level) 5533 break; 5534 5535 btrfs_tree_unlock(path->nodes[level]); 5536 WARN_ON(wc->refs[level] != 1); 5537 level--; 5538 } 5539 } 5540 5541 wc->level = level; 5542 wc->shared_level = -1; 5543 wc->stage = DROP_REFERENCE; 5544 wc->update_ref = update_ref; 5545 wc->keep_locks = 0; 5546 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 5547 5548 while (1) { 5549 ret = walk_down_tree(trans, root, path, wc); 5550 if (ret < 0) { 5551 err = ret; 
5552 break; 5553 } 5554 5555 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 5556 if (ret < 0) { 5557 err = ret; 5558 break; 5559 } 5560 5561 if (ret > 0) { 5562 BUG_ON(wc->stage != DROP_REFERENCE); 5563 break; 5564 } 5565 5566 if (wc->stage == DROP_REFERENCE) { 5567 level = wc->level; 5568 btrfs_node_key(path->nodes[level], 5569 &root_item->drop_progress, 5570 path->slots[level]); 5571 root_item->drop_level = level; 5572 } 5573 5574 BUG_ON(wc->level == 0); 5575 if (trans->transaction->in_commit || 5576 trans->transaction->delayed_refs.flushing) { 5577 ret = btrfs_update_root(trans, tree_root, 5578 &root->root_key, 5579 root_item); 5580 BUG_ON(ret); 5581 5582 btrfs_end_transaction(trans, tree_root); 5583 trans = btrfs_start_transaction(tree_root, 1); 5584 } else { 5585 unsigned long update; 5586 update = trans->delayed_ref_updates; 5587 trans->delayed_ref_updates = 0; 5588 if (update) 5589 btrfs_run_delayed_refs(trans, tree_root, 5590 update); 5591 } 5592 } 5593 btrfs_release_path(root, path); 5594 BUG_ON(err); 5595 5596 ret = btrfs_del_root(trans, tree_root, &root->root_key); 5597 BUG_ON(ret); 5598 5599 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 5600 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 5601 NULL, NULL); 5602 BUG_ON(ret < 0); 5603 if (ret > 0) { 5604 ret = btrfs_del_orphan_item(trans, tree_root, 5605 root->root_key.objectid); 5606 BUG_ON(ret); 5607 } 5608 } 5609 5610 if (root->in_radix) { 5611 btrfs_free_fs_root(tree_root->fs_info, root); 5612 } else { 5613 free_extent_buffer(root->node); 5614 free_extent_buffer(root->commit_root); 5615 kfree(root); 5616 } 5617 out: 5618 btrfs_end_transaction(trans, tree_root); 5619 kfree(wc); 5620 btrfs_free_path(path); 5621 return err; 5622 } 5623 5624 /* 5625 * drop subtree rooted at tree block 'node'. 
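 * (only ever called on reloc trees, hence the BUG_ON on the root
 *  objectid at the top of the function.)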
5626 * 5627 * NOTE: this function will unlock and release tree block 'node' 5628 */ 5629 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 5630 struct btrfs_root *root, 5631 struct extent_buffer *node, 5632 struct extent_buffer *parent) 5633 { 5634 struct btrfs_path *path; 5635 struct walk_control *wc; 5636 int level; 5637 int parent_level; 5638 int ret = 0; 5639 int wret; 5640 5641 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 5642 5643 path = btrfs_alloc_path(); 5644 BUG_ON(!path); 5645 5646 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5647 BUG_ON(!wc); 5648 5649 btrfs_assert_tree_locked(parent); 5650 parent_level = btrfs_header_level(parent); 5651 extent_buffer_get(parent); 5652 path->nodes[parent_level] = parent; 5653 path->slots[parent_level] = btrfs_header_nritems(parent); 5654 5655 btrfs_assert_tree_locked(node); 5656 level = btrfs_header_level(node); 5657 path->nodes[level] = node; 5658 path->slots[level] = 0; 5659 path->locks[level] = 1; 5660 5661 wc->refs[parent_level] = 1; 5662 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5663 wc->level = level; 5664 wc->shared_level = -1; 5665 wc->stage = DROP_REFERENCE; 5666 wc->update_ref = 0; 5667 wc->keep_locks = 1; 5668 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 5669 5670 while (1) { 5671 wret = walk_down_tree(trans, root, path, wc); 5672 if (wret < 0) { 5673 ret = wret; 5674 break; 5675 } 5676 5677 wret = walk_up_tree(trans, root, path, wc, parent_level); 5678 if (wret < 0) 5679 ret = wret; 5680 if (wret != 0) 5681 break; 5682 } 5683 5684 kfree(wc); 5685 btrfs_free_path(path); 5686 return ret; 5687 } 5688 5689 #if 0 5690 static unsigned long calc_ra(unsigned long start, unsigned long last, 5691 unsigned long nr) 5692 { 5693 return min(last, start + nr - 1); 5694 } 5695 5696 static noinline int relocate_inode_pages(struct inode *inode, u64 start, 5697 u64 len) 5698 { 5699 u64 page_start; 5700 u64 page_end; 5701 unsigned long first_index; 5702 unsigned long last_index; 5703 unsigned long i; 5704 struct page *page; 5705 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5706 struct file_ra_state *ra; 5707 struct btrfs_ordered_extent *ordered; 5708 unsigned int total_read = 0; 5709 unsigned int total_dirty = 0; 5710 int ret = 0; 5711 5712 ra = kzalloc(sizeof(*ra), GFP_NOFS); 5713 5714 mutex_lock(&inode->i_mutex); 5715 first_index = start >> PAGE_CACHE_SHIFT; 5716 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; 5717 5718 /* make sure the dirty trick played by the caller work */ 5719 ret = invalidate_inode_pages2_range(inode->i_mapping, 5720 first_index, last_index); 5721 if (ret) 5722 goto out_unlock; 5723 5724 file_ra_state_init(ra, inode->i_mapping); 5725 5726 for (i = first_index ; i <= last_index; i++) { 5727 if (total_read % ra->ra_pages == 0) { 5728 btrfs_force_ra(inode->i_mapping, ra, NULL, i, 5729 calc_ra(i, last_index, ra->ra_pages)); 5730 } 5731 total_read++; 5732 again: 5733 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) 5734 BUG_ON(1); 5735 page = grab_cache_page(inode->i_mapping, i); 5736 if (!page) { 5737 ret = -ENOMEM; 5738 goto out_unlock; 5739 } 5740 if (!PageUptodate(page)) { 5741 btrfs_readpage(NULL, page); 5742 lock_page(page); 5743 if (!PageUptodate(page)) { 5744 unlock_page(page); 5745 page_cache_release(page); 5746 ret = -EIO; 5747 goto out_unlock; 5748 } 5749 } 5750 wait_on_page_writeback(page); 5751 5752 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 5753 page_end = page_start + PAGE_CACHE_SIZE - 1; 5754 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5755 
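		/*
		 * if ordered IO is still outstanding against this page, drop
		 * our locks, wait for the ordered extent to finish and retry
		 * the page so we don't stomp on in-flight writeback.
		 */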
5756 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5757 if (ordered) { 5758 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5759 unlock_page(page); 5760 page_cache_release(page); 5761 btrfs_start_ordered_extent(inode, ordered, 1); 5762 btrfs_put_ordered_extent(ordered); 5763 goto again; 5764 } 5765 set_page_extent_mapped(page); 5766 5767 if (i == first_index) 5768 set_extent_bits(io_tree, page_start, page_end, 5769 EXTENT_BOUNDARY, GFP_NOFS); 5770 btrfs_set_extent_delalloc(inode, page_start, page_end); 5771 5772 set_page_dirty(page); 5773 total_dirty++; 5774 5775 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5776 unlock_page(page); 5777 page_cache_release(page); 5778 } 5779 5780 out_unlock: 5781 kfree(ra); 5782 mutex_unlock(&inode->i_mutex); 5783 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); 5784 return ret; 5785 } 5786 5787 static noinline int relocate_data_extent(struct inode *reloc_inode, 5788 struct btrfs_key *extent_key, 5789 u64 offset) 5790 { 5791 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 5792 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; 5793 struct extent_map *em; 5794 u64 start = extent_key->objectid - offset; 5795 u64 end = start + extent_key->offset - 1; 5796 5797 em = alloc_extent_map(GFP_NOFS); 5798 BUG_ON(!em || IS_ERR(em)); 5799 5800 em->start = start; 5801 em->len = extent_key->offset; 5802 em->block_len = extent_key->offset; 5803 em->block_start = extent_key->objectid; 5804 em->bdev = root->fs_info->fs_devices->latest_bdev; 5805 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5806 5807 /* setup extent map to cheat btrfs_readpage */ 5808 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 5809 while (1) { 5810 int ret; 5811 write_lock(&em_tree->lock); 5812 ret = add_extent_mapping(em_tree, em); 5813 write_unlock(&em_tree->lock); 5814 if (ret != -EEXIST) { 5815 free_extent_map(em); 5816 break; 5817 } 5818 btrfs_drop_extent_cache(reloc_inode, start, end, 0); 5819 } 5820 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 5821 5822 return relocate_inode_pages(reloc_inode, start, extent_key->offset); 5823 } 5824 5825 struct btrfs_ref_path { 5826 u64 extent_start; 5827 u64 nodes[BTRFS_MAX_LEVEL]; 5828 u64 root_objectid; 5829 u64 root_generation; 5830 u64 owner_objectid; 5831 u32 num_refs; 5832 int lowest_level; 5833 int current_level; 5834 int shared_level; 5835 5836 struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; 5837 u64 new_nodes[BTRFS_MAX_LEVEL]; 5838 }; 5839 5840 struct disk_extent { 5841 u64 ram_bytes; 5842 u64 disk_bytenr; 5843 u64 disk_num_bytes; 5844 u64 offset; 5845 u64 num_bytes; 5846 u8 compression; 5847 u8 encryption; 5848 u16 other_encoding; 5849 }; 5850 5851 static int is_cowonly_root(u64 root_objectid) 5852 { 5853 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || 5854 root_objectid == BTRFS_EXTENT_TREE_OBJECTID || 5855 root_objectid == BTRFS_CHUNK_TREE_OBJECTID || 5856 root_objectid == BTRFS_DEV_TREE_OBJECTID || 5857 root_objectid == BTRFS_TREE_LOG_OBJECTID || 5858 root_objectid == BTRFS_CSUM_TREE_OBJECTID) 5859 return 1; 5860 return 0; 5861 } 5862 5863 static noinline int __next_ref_path(struct btrfs_trans_handle *trans, 5864 struct btrfs_root *extent_root, 5865 struct btrfs_ref_path *ref_path, 5866 int first_time) 5867 { 5868 struct extent_buffer *leaf; 5869 struct btrfs_path *path; 5870 struct btrfs_extent_ref *ref; 5871 struct btrfs_key key; 5872 struct btrfs_key found_key; 5873 u64 bytenr; 5874 u32 nritems; 5875 int level; 5876 int ret = 1; 5877 5878 
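	/*
	 * roughly: walk_up climbs from the extent toward a tree root one
	 * BTRFS_EXTENT_REF_KEY item at a time, and walk_down drops back to a
	 * lower level to pick up the next reference path once the current
	 * one is exhausted.
	 */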
path = btrfs_alloc_path(); 5879 if (!path) 5880 return -ENOMEM; 5881 5882 if (first_time) { 5883 ref_path->lowest_level = -1; 5884 ref_path->current_level = -1; 5885 ref_path->shared_level = -1; 5886 goto walk_up; 5887 } 5888 walk_down: 5889 level = ref_path->current_level - 1; 5890 while (level >= -1) { 5891 u64 parent; 5892 if (level < ref_path->lowest_level) 5893 break; 5894 5895 if (level >= 0) 5896 bytenr = ref_path->nodes[level]; 5897 else 5898 bytenr = ref_path->extent_start; 5899 BUG_ON(bytenr == 0); 5900 5901 parent = ref_path->nodes[level + 1]; 5902 ref_path->nodes[level + 1] = 0; 5903 ref_path->current_level = level; 5904 BUG_ON(parent == 0); 5905 5906 key.objectid = bytenr; 5907 key.offset = parent + 1; 5908 key.type = BTRFS_EXTENT_REF_KEY; 5909 5910 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 5911 if (ret < 0) 5912 goto out; 5913 BUG_ON(ret == 0); 5914 5915 leaf = path->nodes[0]; 5916 nritems = btrfs_header_nritems(leaf); 5917 if (path->slots[0] >= nritems) { 5918 ret = btrfs_next_leaf(extent_root, path); 5919 if (ret < 0) 5920 goto out; 5921 if (ret > 0) 5922 goto next; 5923 leaf = path->nodes[0]; 5924 } 5925 5926 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5927 if (found_key.objectid == bytenr && 5928 found_key.type == BTRFS_EXTENT_REF_KEY) { 5929 if (level < ref_path->shared_level) 5930 ref_path->shared_level = level; 5931 goto found; 5932 } 5933 next: 5934 level--; 5935 btrfs_release_path(extent_root, path); 5936 cond_resched(); 5937 } 5938 /* reached lowest level */ 5939 ret = 1; 5940 goto out; 5941 walk_up: 5942 level = ref_path->current_level; 5943 while (level < BTRFS_MAX_LEVEL - 1) { 5944 u64 ref_objectid; 5945 5946 if (level >= 0) 5947 bytenr = ref_path->nodes[level]; 5948 else 5949 bytenr = ref_path->extent_start; 5950 5951 BUG_ON(bytenr == 0); 5952 5953 key.objectid = bytenr; 5954 key.offset = 0; 5955 key.type = BTRFS_EXTENT_REF_KEY; 5956 5957 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); 5958 if (ret < 0) 5959 goto out; 5960 5961 leaf = path->nodes[0]; 5962 nritems = btrfs_header_nritems(leaf); 5963 if (path->slots[0] >= nritems) { 5964 ret = btrfs_next_leaf(extent_root, path); 5965 if (ret < 0) 5966 goto out; 5967 if (ret > 0) { 5968 /* the extent was freed by someone */ 5969 if (ref_path->lowest_level == level) 5970 goto out; 5971 btrfs_release_path(extent_root, path); 5972 goto walk_down; 5973 } 5974 leaf = path->nodes[0]; 5975 } 5976 5977 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5978 if (found_key.objectid != bytenr || 5979 found_key.type != BTRFS_EXTENT_REF_KEY) { 5980 /* the extent was freed by someone */ 5981 if (ref_path->lowest_level == level) { 5982 ret = 1; 5983 goto out; 5984 } 5985 btrfs_release_path(extent_root, path); 5986 goto walk_down; 5987 } 5988 found: 5989 ref = btrfs_item_ptr(leaf, path->slots[0], 5990 struct btrfs_extent_ref); 5991 ref_objectid = btrfs_ref_objectid(leaf, ref); 5992 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { 5993 if (first_time) { 5994 level = (int)ref_objectid; 5995 BUG_ON(level >= BTRFS_MAX_LEVEL); 5996 ref_path->lowest_level = level; 5997 ref_path->current_level = level; 5998 ref_path->nodes[level] = bytenr; 5999 } else { 6000 WARN_ON(ref_objectid != level); 6001 } 6002 } else { 6003 WARN_ON(level != -1); 6004 } 6005 first_time = 0; 6006 6007 if (ref_path->lowest_level == level) { 6008 ref_path->owner_objectid = ref_objectid; 6009 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); 6010 } 6011 6012 /* 6013 * the block is tree root or the block isn't in 
reference 6014 * counted tree. 6015 */ 6016 if (found_key.objectid == found_key.offset || 6017 is_cowonly_root(btrfs_ref_root(leaf, ref))) { 6018 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 6019 ref_path->root_generation = 6020 btrfs_ref_generation(leaf, ref); 6021 if (level < 0) { 6022 /* special reference from the tree log */ 6023 ref_path->nodes[0] = found_key.offset; 6024 ref_path->current_level = 0; 6025 } 6026 ret = 0; 6027 goto out; 6028 } 6029 6030 level++; 6031 BUG_ON(ref_path->nodes[level] != 0); 6032 ref_path->nodes[level] = found_key.offset; 6033 ref_path->current_level = level; 6034 6035 /* 6036 * the reference was created in the running transaction, 6037 * no need to continue walking up. 6038 */ 6039 if (btrfs_ref_generation(leaf, ref) == trans->transid) { 6040 ref_path->root_objectid = btrfs_ref_root(leaf, ref); 6041 ref_path->root_generation = 6042 btrfs_ref_generation(leaf, ref); 6043 ret = 0; 6044 goto out; 6045 } 6046 6047 btrfs_release_path(extent_root, path); 6048 cond_resched(); 6049 } 6050 /* reached max tree level, but no tree root found. */ 6051 BUG(); 6052 out: 6053 btrfs_free_path(path); 6054 return ret; 6055 } 6056 6057 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, 6058 struct btrfs_root *extent_root, 6059 struct btrfs_ref_path *ref_path, 6060 u64 extent_start) 6061 { 6062 memset(ref_path, 0, sizeof(*ref_path)); 6063 ref_path->extent_start = extent_start; 6064 6065 return __next_ref_path(trans, extent_root, ref_path, 1); 6066 } 6067 6068 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, 6069 struct btrfs_root *extent_root, 6070 struct btrfs_ref_path *ref_path) 6071 { 6072 return __next_ref_path(trans, extent_root, ref_path, 0); 6073 } 6074 6075 static noinline int get_new_locations(struct inode *reloc_inode, 6076 struct btrfs_key *extent_key, 6077 u64 offset, int no_fragment, 6078 struct disk_extent **extents, 6079 int *nr_extents) 6080 { 6081 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 6082 struct btrfs_path *path; 6083 struct btrfs_file_extent_item *fi; 6084 struct extent_buffer *leaf; 6085 struct disk_extent *exts = *extents; 6086 struct btrfs_key found_key; 6087 u64 cur_pos; 6088 u64 last_byte; 6089 u32 nritems; 6090 int nr = 0; 6091 int max = *nr_extents; 6092 int ret; 6093 6094 WARN_ON(!no_fragment && *extents); 6095 if (!exts) { 6096 max = 1; 6097 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); 6098 if (!exts) 6099 return -ENOMEM; 6100 } 6101 6102 path = btrfs_alloc_path(); 6103 BUG_ON(!path); 6104 6105 cur_pos = extent_key->objectid - offset; 6106 last_byte = extent_key->objectid + extent_key->offset; 6107 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, 6108 cur_pos, 0); 6109 if (ret < 0) 6110 goto out; 6111 if (ret > 0) { 6112 ret = -ENOENT; 6113 goto out; 6114 } 6115 6116 while (1) { 6117 leaf = path->nodes[0]; 6118 nritems = btrfs_header_nritems(leaf); 6119 if (path->slots[0] >= nritems) { 6120 ret = btrfs_next_leaf(root, path); 6121 if (ret < 0) 6122 goto out; 6123 if (ret > 0) 6124 break; 6125 leaf = path->nodes[0]; 6126 } 6127 6128 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6129 if (found_key.offset != cur_pos || 6130 found_key.type != BTRFS_EXTENT_DATA_KEY || 6131 found_key.objectid != reloc_inode->i_ino) 6132 break; 6133 6134 fi = btrfs_item_ptr(leaf, path->slots[0], 6135 struct btrfs_file_extent_item); 6136 if (btrfs_file_extent_type(leaf, fi) != 6137 BTRFS_FILE_EXTENT_REG || 6138 btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 6139 break; 6140 6141 if (nr == 
max) { 6142 struct disk_extent *old = exts; 6143 max *= 2; 6144 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 6145 memcpy(exts, old, sizeof(*exts) * nr); 6146 if (old != *extents) 6147 kfree(old); 6148 } 6149 6150 exts[nr].disk_bytenr = 6151 btrfs_file_extent_disk_bytenr(leaf, fi); 6152 exts[nr].disk_num_bytes = 6153 btrfs_file_extent_disk_num_bytes(leaf, fi); 6154 exts[nr].offset = btrfs_file_extent_offset(leaf, fi); 6155 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 6156 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6157 exts[nr].compression = btrfs_file_extent_compression(leaf, fi); 6158 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); 6159 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, 6160 fi); 6161 BUG_ON(exts[nr].offset > 0); 6162 BUG_ON(exts[nr].compression || exts[nr].encryption); 6163 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); 6164 6165 cur_pos += exts[nr].num_bytes; 6166 nr++; 6167 6168 if (cur_pos + offset >= last_byte) 6169 break; 6170 6171 if (no_fragment) { 6172 ret = 1; 6173 goto out; 6174 } 6175 path->slots[0]++; 6176 } 6177 6178 BUG_ON(cur_pos + offset > last_byte); 6179 if (cur_pos + offset < last_byte) { 6180 ret = -ENOENT; 6181 goto out; 6182 } 6183 ret = 0; 6184 out: 6185 btrfs_free_path(path); 6186 if (ret) { 6187 if (exts != *extents) 6188 kfree(exts); 6189 } else { 6190 *extents = exts; 6191 *nr_extents = nr; 6192 } 6193 return ret; 6194 } 6195 6196 static noinline int replace_one_extent(struct btrfs_trans_handle *trans, 6197 struct btrfs_root *root, 6198 struct btrfs_path *path, 6199 struct btrfs_key *extent_key, 6200 struct btrfs_key *leaf_key, 6201 struct btrfs_ref_path *ref_path, 6202 struct disk_extent *new_extents, 6203 int nr_extents) 6204 { 6205 struct extent_buffer *leaf; 6206 struct btrfs_file_extent_item *fi; 6207 struct inode *inode = NULL; 6208 struct btrfs_key key; 6209 u64 lock_start = 0; 6210 u64 lock_end = 0; 6211 u64 num_bytes; 6212 u64 ext_offset; 6213 u64 search_end = (u64)-1; 6214 u32 nritems; 6215 int nr_scaned = 0; 6216 int extent_locked = 0; 6217 int extent_type; 6218 int ret; 6219 6220 memcpy(&key, leaf_key, sizeof(key)); 6221 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 6222 if (key.objectid < ref_path->owner_objectid || 6223 (key.objectid == ref_path->owner_objectid && 6224 key.type < BTRFS_EXTENT_DATA_KEY)) { 6225 key.objectid = ref_path->owner_objectid; 6226 key.type = BTRFS_EXTENT_DATA_KEY; 6227 key.offset = 0; 6228 } 6229 } 6230 6231 while (1) { 6232 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 6233 if (ret < 0) 6234 goto out; 6235 6236 leaf = path->nodes[0]; 6237 nritems = btrfs_header_nritems(leaf); 6238 next: 6239 if (extent_locked && ret > 0) { 6240 /* 6241 * the file extent item was modified by someone 6242 * before the extent got locked. 
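 * Drop the extent lock and re-check the item in that case.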
6243 */ 6244 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6245 lock_end, GFP_NOFS); 6246 extent_locked = 0; 6247 } 6248 6249 if (path->slots[0] >= nritems) { 6250 if (++nr_scaned > 2) 6251 break; 6252 6253 BUG_ON(extent_locked); 6254 ret = btrfs_next_leaf(root, path); 6255 if (ret < 0) 6256 goto out; 6257 if (ret > 0) 6258 break; 6259 leaf = path->nodes[0]; 6260 nritems = btrfs_header_nritems(leaf); 6261 } 6262 6263 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 6264 6265 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 6266 if ((key.objectid > ref_path->owner_objectid) || 6267 (key.objectid == ref_path->owner_objectid && 6268 key.type > BTRFS_EXTENT_DATA_KEY) || 6269 key.offset >= search_end) 6270 break; 6271 } 6272 6273 if (inode && key.objectid != inode->i_ino) { 6274 BUG_ON(extent_locked); 6275 btrfs_release_path(root, path); 6276 mutex_unlock(&inode->i_mutex); 6277 iput(inode); 6278 inode = NULL; 6279 continue; 6280 } 6281 6282 if (key.type != BTRFS_EXTENT_DATA_KEY) { 6283 path->slots[0]++; 6284 ret = 1; 6285 goto next; 6286 } 6287 fi = btrfs_item_ptr(leaf, path->slots[0], 6288 struct btrfs_file_extent_item); 6289 extent_type = btrfs_file_extent_type(leaf, fi); 6290 if ((extent_type != BTRFS_FILE_EXTENT_REG && 6291 extent_type != BTRFS_FILE_EXTENT_PREALLOC) || 6292 (btrfs_file_extent_disk_bytenr(leaf, fi) != 6293 extent_key->objectid)) { 6294 path->slots[0]++; 6295 ret = 1; 6296 goto next; 6297 } 6298 6299 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 6300 ext_offset = btrfs_file_extent_offset(leaf, fi); 6301 6302 if (search_end == (u64)-1) { 6303 search_end = key.offset - ext_offset + 6304 btrfs_file_extent_ram_bytes(leaf, fi); 6305 } 6306 6307 if (!extent_locked) { 6308 lock_start = key.offset; 6309 lock_end = lock_start + num_bytes - 1; 6310 } else { 6311 if (lock_start > key.offset || 6312 lock_end + 1 < key.offset + num_bytes) { 6313 unlock_extent(&BTRFS_I(inode)->io_tree, 6314 lock_start, lock_end, GFP_NOFS); 6315 extent_locked = 0; 6316 } 6317 } 6318 6319 if (!inode) { 6320 btrfs_release_path(root, path); 6321 6322 inode = btrfs_iget_locked(root->fs_info->sb, 6323 key.objectid, root); 6324 if (inode->i_state & I_NEW) { 6325 BTRFS_I(inode)->root = root; 6326 BTRFS_I(inode)->location.objectid = 6327 key.objectid; 6328 BTRFS_I(inode)->location.type = 6329 BTRFS_INODE_ITEM_KEY; 6330 BTRFS_I(inode)->location.offset = 0; 6331 btrfs_read_locked_inode(inode); 6332 unlock_new_inode(inode); 6333 } 6334 /* 6335 * some code call btrfs_commit_transaction while 6336 * holding the i_mutex, so we can't use mutex_lock 6337 * here. 
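 * Use mutex_trylock() instead and skip this inode when the lock
 * is contended.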
6338 */ 6339 if (is_bad_inode(inode) || 6340 !mutex_trylock(&inode->i_mutex)) { 6341 iput(inode); 6342 inode = NULL; 6343 key.offset = (u64)-1; 6344 goto skip; 6345 } 6346 } 6347 6348 if (!extent_locked) { 6349 struct btrfs_ordered_extent *ordered; 6350 6351 btrfs_release_path(root, path); 6352 6353 lock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6354 lock_end, GFP_NOFS); 6355 ordered = btrfs_lookup_first_ordered_extent(inode, 6356 lock_end); 6357 if (ordered && 6358 ordered->file_offset <= lock_end && 6359 ordered->file_offset + ordered->len > lock_start) { 6360 unlock_extent(&BTRFS_I(inode)->io_tree, 6361 lock_start, lock_end, GFP_NOFS); 6362 btrfs_start_ordered_extent(inode, ordered, 1); 6363 btrfs_put_ordered_extent(ordered); 6364 key.offset += num_bytes; 6365 goto skip; 6366 } 6367 if (ordered) 6368 btrfs_put_ordered_extent(ordered); 6369 6370 extent_locked = 1; 6371 continue; 6372 } 6373 6374 if (nr_extents == 1) { 6375 /* update extent pointer in place */ 6376 btrfs_set_file_extent_disk_bytenr(leaf, fi, 6377 new_extents[0].disk_bytenr); 6378 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 6379 new_extents[0].disk_num_bytes); 6380 btrfs_mark_buffer_dirty(leaf); 6381 6382 btrfs_drop_extent_cache(inode, key.offset, 6383 key.offset + num_bytes - 1, 0); 6384 6385 ret = btrfs_inc_extent_ref(trans, root, 6386 new_extents[0].disk_bytenr, 6387 new_extents[0].disk_num_bytes, 6388 leaf->start, 6389 root->root_key.objectid, 6390 trans->transid, 6391 key.objectid); 6392 BUG_ON(ret); 6393 6394 ret = btrfs_free_extent(trans, root, 6395 extent_key->objectid, 6396 extent_key->offset, 6397 leaf->start, 6398 btrfs_header_owner(leaf), 6399 btrfs_header_generation(leaf), 6400 key.objectid, 0); 6401 BUG_ON(ret); 6402 6403 btrfs_release_path(root, path); 6404 key.offset += num_bytes; 6405 } else { 6406 BUG_ON(1); 6407 #if 0 6408 u64 alloc_hint; 6409 u64 extent_len; 6410 int i; 6411 /* 6412 * drop old extent pointer at first, then insert the 6413 * new pointers one bye one 6414 */ 6415 btrfs_release_path(root, path); 6416 ret = btrfs_drop_extents(trans, root, inode, key.offset, 6417 key.offset + num_bytes, 6418 key.offset, &alloc_hint); 6419 BUG_ON(ret); 6420 6421 for (i = 0; i < nr_extents; i++) { 6422 if (ext_offset >= new_extents[i].num_bytes) { 6423 ext_offset -= new_extents[i].num_bytes; 6424 continue; 6425 } 6426 extent_len = min(new_extents[i].num_bytes - 6427 ext_offset, num_bytes); 6428 6429 ret = btrfs_insert_empty_item(trans, root, 6430 path, &key, 6431 sizeof(*fi)); 6432 BUG_ON(ret); 6433 6434 leaf = path->nodes[0]; 6435 fi = btrfs_item_ptr(leaf, path->slots[0], 6436 struct btrfs_file_extent_item); 6437 btrfs_set_file_extent_generation(leaf, fi, 6438 trans->transid); 6439 btrfs_set_file_extent_type(leaf, fi, 6440 BTRFS_FILE_EXTENT_REG); 6441 btrfs_set_file_extent_disk_bytenr(leaf, fi, 6442 new_extents[i].disk_bytenr); 6443 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 6444 new_extents[i].disk_num_bytes); 6445 btrfs_set_file_extent_ram_bytes(leaf, fi, 6446 new_extents[i].ram_bytes); 6447 6448 btrfs_set_file_extent_compression(leaf, fi, 6449 new_extents[i].compression); 6450 btrfs_set_file_extent_encryption(leaf, fi, 6451 new_extents[i].encryption); 6452 btrfs_set_file_extent_other_encoding(leaf, fi, 6453 new_extents[i].other_encoding); 6454 6455 btrfs_set_file_extent_num_bytes(leaf, fi, 6456 extent_len); 6457 ext_offset += new_extents[i].offset; 6458 btrfs_set_file_extent_offset(leaf, fi, 6459 ext_offset); 6460 btrfs_mark_buffer_dirty(leaf); 6461 6462 btrfs_drop_extent_cache(inode, key.offset, 
6463 key.offset + extent_len - 1, 0); 6464 6465 ret = btrfs_inc_extent_ref(trans, root, 6466 new_extents[i].disk_bytenr, 6467 new_extents[i].disk_num_bytes, 6468 leaf->start, 6469 root->root_key.objectid, 6470 trans->transid, key.objectid); 6471 BUG_ON(ret); 6472 btrfs_release_path(root, path); 6473 6474 inode_add_bytes(inode, extent_len); 6475 6476 ext_offset = 0; 6477 num_bytes -= extent_len; 6478 key.offset += extent_len; 6479 6480 if (num_bytes == 0) 6481 break; 6482 } 6483 BUG_ON(i >= nr_extents); 6484 #endif 6485 } 6486 6487 if (extent_locked) { 6488 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6489 lock_end, GFP_NOFS); 6490 extent_locked = 0; 6491 } 6492 skip: 6493 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 6494 key.offset >= search_end) 6495 break; 6496 6497 cond_resched(); 6498 } 6499 ret = 0; 6500 out: 6501 btrfs_release_path(root, path); 6502 if (inode) { 6503 mutex_unlock(&inode->i_mutex); 6504 if (extent_locked) { 6505 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, 6506 lock_end, GFP_NOFS); 6507 } 6508 iput(inode); 6509 } 6510 return ret; 6511 } 6512 6513 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, 6514 struct btrfs_root *root, 6515 struct extent_buffer *buf, u64 orig_start) 6516 { 6517 int level; 6518 int ret; 6519 6520 BUG_ON(btrfs_header_generation(buf) != trans->transid); 6521 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6522 6523 level = btrfs_header_level(buf); 6524 if (level == 0) { 6525 struct btrfs_leaf_ref *ref; 6526 struct btrfs_leaf_ref *orig_ref; 6527 6528 orig_ref = btrfs_lookup_leaf_ref(root, orig_start); 6529 if (!orig_ref) 6530 return -ENOENT; 6531 6532 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); 6533 if (!ref) { 6534 btrfs_free_leaf_ref(root, orig_ref); 6535 return -ENOMEM; 6536 } 6537 6538 ref->nritems = orig_ref->nritems; 6539 memcpy(ref->extents, orig_ref->extents, 6540 sizeof(ref->extents[0]) * ref->nritems); 6541 6542 btrfs_free_leaf_ref(root, orig_ref); 6543 6544 ref->root_gen = trans->transid; 6545 ref->bytenr = buf->start; 6546 ref->owner = btrfs_header_owner(buf); 6547 ref->generation = btrfs_header_generation(buf); 6548 6549 ret = btrfs_add_leaf_ref(root, ref, 0); 6550 WARN_ON(ret); 6551 btrfs_free_leaf_ref(root, ref); 6552 } 6553 return 0; 6554 } 6555 6556 static noinline int invalidate_extent_cache(struct btrfs_root *root, 6557 struct extent_buffer *leaf, 6558 struct btrfs_block_group_cache *group, 6559 struct btrfs_root *target_root) 6560 { 6561 struct btrfs_key key; 6562 struct inode *inode = NULL; 6563 struct btrfs_file_extent_item *fi; 6564 u64 num_bytes; 6565 u64 skip_objectid = 0; 6566 u32 nritems; 6567 u32 i; 6568 6569 nritems = btrfs_header_nritems(leaf); 6570 for (i = 0; i < nritems; i++) { 6571 btrfs_item_key_to_cpu(leaf, &key, i); 6572 if (key.objectid == skip_objectid || 6573 key.type != BTRFS_EXTENT_DATA_KEY) 6574 continue; 6575 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 6576 if (btrfs_file_extent_type(leaf, fi) == 6577 BTRFS_FILE_EXTENT_INLINE) 6578 continue; 6579 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) 6580 continue; 6581 if (!inode || inode->i_ino != key.objectid) { 6582 iput(inode); 6583 inode = btrfs_ilookup(target_root->fs_info->sb, 6584 key.objectid, target_root, 1); 6585 } 6586 if (!inode) { 6587 skip_objectid = key.objectid; 6588 continue; 6589 } 6590 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 6591 6592 lock_extent(&BTRFS_I(inode)->io_tree, key.offset, 6593 key.offset + num_bytes - 1, GFP_NOFS); 6594 
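/*
 * drop the cached mapping so stale disk locations for this range
 * are not reused
 */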
btrfs_drop_extent_cache(inode, key.offset, 6595 key.offset + num_bytes - 1, 1); 6596 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, 6597 key.offset + num_bytes - 1, GFP_NOFS); 6598 cond_resched(); 6599 } 6600 iput(inode); 6601 return 0; 6602 } 6603 6604 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, 6605 struct btrfs_root *root, 6606 struct extent_buffer *leaf, 6607 struct btrfs_block_group_cache *group, 6608 struct inode *reloc_inode) 6609 { 6610 struct btrfs_key key; 6611 struct btrfs_key extent_key; 6612 struct btrfs_file_extent_item *fi; 6613 struct btrfs_leaf_ref *ref; 6614 struct disk_extent *new_extent; 6615 u64 bytenr; 6616 u64 num_bytes; 6617 u32 nritems; 6618 u32 i; 6619 int ext_index; 6620 int nr_extent; 6621 int ret; 6622 6623 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 6624 BUG_ON(!new_extent); 6625 6626 ref = btrfs_lookup_leaf_ref(root, leaf->start); 6627 BUG_ON(!ref); 6628 6629 ext_index = -1; 6630 nritems = btrfs_header_nritems(leaf); 6631 for (i = 0; i < nritems; i++) { 6632 btrfs_item_key_to_cpu(leaf, &key, i); 6633 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 6634 continue; 6635 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 6636 if (btrfs_file_extent_type(leaf, fi) == 6637 BTRFS_FILE_EXTENT_INLINE) 6638 continue; 6639 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6640 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 6641 if (bytenr == 0) 6642 continue; 6643 6644 ext_index++; 6645 if (bytenr >= group->key.objectid + group->key.offset || 6646 bytenr + num_bytes <= group->key.objectid) 6647 continue; 6648 6649 extent_key.objectid = bytenr; 6650 extent_key.offset = num_bytes; 6651 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 6652 nr_extent = 1; 6653 ret = get_new_locations(reloc_inode, &extent_key, 6654 group->key.objectid, 1, 6655 &new_extent, &nr_extent); 6656 if (ret > 0) 6657 continue; 6658 BUG_ON(ret < 0); 6659 6660 BUG_ON(ref->extents[ext_index].bytenr != bytenr); 6661 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); 6662 ref->extents[ext_index].bytenr = new_extent->disk_bytenr; 6663 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; 6664 6665 btrfs_set_file_extent_disk_bytenr(leaf, fi, 6666 new_extent->disk_bytenr); 6667 btrfs_set_file_extent_disk_num_bytes(leaf, fi, 6668 new_extent->disk_num_bytes); 6669 btrfs_mark_buffer_dirty(leaf); 6670 6671 ret = btrfs_inc_extent_ref(trans, root, 6672 new_extent->disk_bytenr, 6673 new_extent->disk_num_bytes, 6674 leaf->start, 6675 root->root_key.objectid, 6676 trans->transid, key.objectid); 6677 BUG_ON(ret); 6678 6679 ret = btrfs_free_extent(trans, root, 6680 bytenr, num_bytes, leaf->start, 6681 btrfs_header_owner(leaf), 6682 btrfs_header_generation(leaf), 6683 key.objectid, 0); 6684 BUG_ON(ret); 6685 cond_resched(); 6686 } 6687 kfree(new_extent); 6688 BUG_ON(ext_index + 1 != ref->nritems); 6689 btrfs_free_leaf_ref(root, ref); 6690 return 0; 6691 } 6692 6693 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, 6694 struct btrfs_root *root) 6695 { 6696 struct btrfs_root *reloc_root; 6697 int ret; 6698 6699 if (root->reloc_root) { 6700 reloc_root = root->reloc_root; 6701 root->reloc_root = NULL; 6702 list_add(&reloc_root->dead_list, 6703 &root->fs_info->dead_reloc_roots); 6704 6705 btrfs_set_root_bytenr(&reloc_root->root_item, 6706 reloc_root->node->start); 6707 btrfs_set_root_level(&root->root_item, 6708 btrfs_header_level(reloc_root->node)); 6709 memset(&reloc_root->root_item.drop_progress, 0, 6710 sizeof(struct btrfs_disk_key)); 
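/* reset drop progress so the dead reloc root is dropped from the beginning */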
6711 reloc_root->root_item.drop_level = 0; 6712 6713 ret = btrfs_update_root(trans, root->fs_info->tree_root, 6714 &reloc_root->root_key, 6715 &reloc_root->root_item); 6716 BUG_ON(ret); 6717 } 6718 return 0; 6719 } 6720 6721 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) 6722 { 6723 struct btrfs_trans_handle *trans; 6724 struct btrfs_root *reloc_root; 6725 struct btrfs_root *prev_root = NULL; 6726 struct list_head dead_roots; 6727 int ret; 6728 unsigned long nr; 6729 6730 INIT_LIST_HEAD(&dead_roots); 6731 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); 6732 6733 while (!list_empty(&dead_roots)) { 6734 reloc_root = list_entry(dead_roots.prev, 6735 struct btrfs_root, dead_list); 6736 list_del_init(&reloc_root->dead_list); 6737 6738 BUG_ON(reloc_root->commit_root != NULL); 6739 while (1) { 6740 trans = btrfs_join_transaction(root, 1); 6741 BUG_ON(!trans); 6742 6743 mutex_lock(&root->fs_info->drop_mutex); 6744 ret = btrfs_drop_snapshot(trans, reloc_root); 6745 if (ret != -EAGAIN) 6746 break; 6747 mutex_unlock(&root->fs_info->drop_mutex); 6748 6749 nr = trans->blocks_used; 6750 ret = btrfs_end_transaction(trans, root); 6751 BUG_ON(ret); 6752 btrfs_btree_balance_dirty(root, nr); 6753 } 6754 6755 free_extent_buffer(reloc_root->node); 6756 6757 ret = btrfs_del_root(trans, root->fs_info->tree_root, 6758 &reloc_root->root_key); 6759 BUG_ON(ret); 6760 mutex_unlock(&root->fs_info->drop_mutex); 6761 6762 nr = trans->blocks_used; 6763 ret = btrfs_end_transaction(trans, root); 6764 BUG_ON(ret); 6765 btrfs_btree_balance_dirty(root, nr); 6766 6767 kfree(prev_root); 6768 prev_root = reloc_root; 6769 } 6770 if (prev_root) { 6771 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); 6772 kfree(prev_root); 6773 } 6774 return 0; 6775 } 6776 6777 int btrfs_add_dead_reloc_root(struct btrfs_root *root) 6778 { 6779 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); 6780 return 0; 6781 } 6782 6783 int btrfs_cleanup_reloc_trees(struct btrfs_root *root) 6784 { 6785 struct btrfs_root *reloc_root; 6786 struct btrfs_trans_handle *trans; 6787 struct btrfs_key location; 6788 int found; 6789 int ret; 6790 6791 mutex_lock(&root->fs_info->tree_reloc_mutex); 6792 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); 6793 BUG_ON(ret); 6794 found = !list_empty(&root->fs_info->dead_reloc_roots); 6795 mutex_unlock(&root->fs_info->tree_reloc_mutex); 6796 6797 if (found) { 6798 trans = btrfs_start_transaction(root, 1); 6799 BUG_ON(!trans); 6800 ret = btrfs_commit_transaction(trans, root); 6801 BUG_ON(ret); 6802 } 6803 6804 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 6805 location.offset = (u64)-1; 6806 location.type = BTRFS_ROOT_ITEM_KEY; 6807 6808 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 6809 BUG_ON(!reloc_root); 6810 btrfs_orphan_cleanup(reloc_root); 6811 return 0; 6812 } 6813 6814 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, 6815 struct btrfs_root *root) 6816 { 6817 struct btrfs_root *reloc_root; 6818 struct extent_buffer *eb; 6819 struct btrfs_root_item *root_item; 6820 struct btrfs_key root_key; 6821 int ret; 6822 6823 BUG_ON(!root->ref_cows); 6824 if (root->reloc_root) 6825 return 0; 6826 6827 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 6828 BUG_ON(!root_item); 6829 6830 ret = btrfs_copy_root(trans, root, root->commit_root, 6831 &eb, BTRFS_TREE_RELOC_OBJECTID); 6832 BUG_ON(ret); 6833 6834 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 6835 root_key.offset = root->root_key.objectid; 6836 root_key.type = 
BTRFS_ROOT_ITEM_KEY; 6837 6838 memcpy(root_item, &root->root_item, sizeof(*root_item)); 6839 btrfs_set_root_refs(root_item, 0); 6840 btrfs_set_root_bytenr(root_item, eb->start); 6841 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 6842 btrfs_set_root_generation(root_item, trans->transid); 6843 6844 btrfs_tree_unlock(eb); 6845 free_extent_buffer(eb); 6846 6847 ret = btrfs_insert_root(trans, root->fs_info->tree_root, 6848 &root_key, root_item); 6849 BUG_ON(ret); 6850 kfree(root_item); 6851 6852 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 6853 &root_key); 6854 BUG_ON(!reloc_root); 6855 reloc_root->last_trans = trans->transid; 6856 reloc_root->commit_root = NULL; 6857 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; 6858 6859 root->reloc_root = reloc_root; 6860 return 0; 6861 } 6862 6863 /* 6864 * Core function of space balance. 6865 * 6866 * The idea is to use reloc trees to relocate tree blocks in reference 6867 * counted roots. There is one reloc tree for each subvol, and all 6868 * reloc trees share the same root key objectid. Reloc trees are snapshots 6869 * of the latest committed roots of subvols (root->commit_root). 6870 * 6871 * To relocate a tree block referenced by a subvol, there are two steps. 6872 * COW the block through the subvol's reloc tree, then update the block pointer 6873 * in the subvol to point to the new block. Since all reloc trees share 6874 * the same root key objectid, special handling for tree blocks owned 6875 * by them is easy. Once a tree block has been COWed in one reloc tree, 6876 * we can use the resulting new block directly when the same block is 6877 * required to COW again through other reloc trees. This way, relocated 6878 * tree blocks are shared between reloc trees, so they are also shared 6879 * between subvols.
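 *
 * relocate_one_path() below carries out those two steps for a single
 * reference path: it COWs the blocks along the path through the owning
 * subvol's reloc tree and then updates the subvol's tree to point at
 * the new blocks.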
6880 */ 6881 static noinline int relocate_one_path(struct btrfs_trans_handle *trans, 6882 struct btrfs_root *root, 6883 struct btrfs_path *path, 6884 struct btrfs_key *first_key, 6885 struct btrfs_ref_path *ref_path, 6886 struct btrfs_block_group_cache *group, 6887 struct inode *reloc_inode) 6888 { 6889 struct btrfs_root *reloc_root; 6890 struct extent_buffer *eb = NULL; 6891 struct btrfs_key *keys; 6892 u64 *nodes; 6893 int level; 6894 int shared_level; 6895 int lowest_level = 0; 6896 int ret; 6897 6898 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) 6899 lowest_level = ref_path->owner_objectid; 6900 6901 if (!root->ref_cows) { 6902 path->lowest_level = lowest_level; 6903 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); 6904 BUG_ON(ret < 0); 6905 path->lowest_level = 0; 6906 btrfs_release_path(root, path); 6907 return 0; 6908 } 6909 6910 mutex_lock(&root->fs_info->tree_reloc_mutex); 6911 ret = init_reloc_tree(trans, root); 6912 BUG_ON(ret); 6913 reloc_root = root->reloc_root; 6914 6915 shared_level = ref_path->shared_level; 6916 ref_path->shared_level = BTRFS_MAX_LEVEL - 1; 6917 6918 keys = ref_path->node_keys; 6919 nodes = ref_path->new_nodes; 6920 memset(&keys[shared_level + 1], 0, 6921 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); 6922 memset(&nodes[shared_level + 1], 0, 6923 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); 6924 6925 if (nodes[lowest_level] == 0) { 6926 path->lowest_level = lowest_level; 6927 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 6928 0, 1); 6929 BUG_ON(ret); 6930 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { 6931 eb = path->nodes[level]; 6932 if (!eb || eb == reloc_root->node) 6933 break; 6934 nodes[level] = eb->start; 6935 if (level == 0) 6936 btrfs_item_key_to_cpu(eb, &keys[level], 0); 6937 else 6938 btrfs_node_key_to_cpu(eb, &keys[level], 0); 6939 } 6940 if (nodes[0] && 6941 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 6942 eb = path->nodes[0]; 6943 ret = replace_extents_in_leaf(trans, reloc_root, eb, 6944 group, reloc_inode); 6945 BUG_ON(ret); 6946 } 6947 btrfs_release_path(reloc_root, path); 6948 } else { 6949 ret = btrfs_merge_path(trans, reloc_root, keys, nodes, 6950 lowest_level); 6951 BUG_ON(ret); 6952 } 6953 6954 /* 6955 * replace tree blocks in the fs tree with tree blocks in 6956 * the reloc tree. 
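 * btrfs_merge_path() swaps in the reloc tree blocks recorded in nodes[].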
6957 */ 6958 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); 6959 BUG_ON(ret < 0); 6960 6961 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 6962 ret = btrfs_search_slot(trans, reloc_root, first_key, path, 6963 0, 0); 6964 BUG_ON(ret); 6965 extent_buffer_get(path->nodes[0]); 6966 eb = path->nodes[0]; 6967 btrfs_release_path(reloc_root, path); 6968 ret = invalidate_extent_cache(reloc_root, eb, group, root); 6969 BUG_ON(ret); 6970 free_extent_buffer(eb); 6971 } 6972 6973 mutex_unlock(&root->fs_info->tree_reloc_mutex); 6974 path->lowest_level = 0; 6975 return 0; 6976 } 6977 6978 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, 6979 struct btrfs_root *root, 6980 struct btrfs_path *path, 6981 struct btrfs_key *first_key, 6982 struct btrfs_ref_path *ref_path) 6983 { 6984 int ret; 6985 6986 ret = relocate_one_path(trans, root, path, first_key, 6987 ref_path, NULL, NULL); 6988 BUG_ON(ret); 6989 6990 return 0; 6991 } 6992 6993 static noinline int del_extent_zero(struct btrfs_trans_handle *trans, 6994 struct btrfs_root *extent_root, 6995 struct btrfs_path *path, 6996 struct btrfs_key *extent_key) 6997 { 6998 int ret; 6999 7000 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); 7001 if (ret) 7002 goto out; 7003 ret = btrfs_del_item(trans, extent_root, path); 7004 out: 7005 btrfs_release_path(extent_root, path); 7006 return ret; 7007 } 7008 7009 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, 7010 struct btrfs_ref_path *ref_path) 7011 { 7012 struct btrfs_key root_key; 7013 7014 root_key.objectid = ref_path->root_objectid; 7015 root_key.type = BTRFS_ROOT_ITEM_KEY; 7016 if (is_cowonly_root(ref_path->root_objectid)) 7017 root_key.offset = 0; 7018 else 7019 root_key.offset = (u64)-1; 7020 7021 return btrfs_read_fs_root_no_name(fs_info, &root_key); 7022 } 7023 7024 static noinline int relocate_one_extent(struct btrfs_root *extent_root, 7025 struct btrfs_path *path, 7026 struct btrfs_key *extent_key, 7027 struct btrfs_block_group_cache *group, 7028 struct inode *reloc_inode, int pass) 7029 { 7030 struct btrfs_trans_handle *trans; 7031 struct btrfs_root *found_root; 7032 struct btrfs_ref_path *ref_path = NULL; 7033 struct disk_extent *new_extents = NULL; 7034 int nr_extents = 0; 7035 int loops; 7036 int ret; 7037 int level; 7038 struct btrfs_key first_key; 7039 u64 prev_block = 0; 7040 7041 7042 trans = btrfs_start_transaction(extent_root, 1); 7043 BUG_ON(!trans); 7044 7045 if (extent_key->objectid == 0) { 7046 ret = del_extent_zero(trans, extent_root, path, extent_key); 7047 goto out; 7048 } 7049 7050 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); 7051 if (!ref_path) { 7052 ret = -ENOMEM; 7053 goto out; 7054 } 7055 7056 for (loops = 0; ; loops++) { 7057 if (loops == 0) { 7058 ret = btrfs_first_ref_path(trans, extent_root, ref_path, 7059 extent_key->objectid); 7060 } else { 7061 ret = btrfs_next_ref_path(trans, extent_root, ref_path); 7062 } 7063 if (ret < 0) 7064 goto out; 7065 if (ret > 0) 7066 break; 7067 7068 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || 7069 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) 7070 continue; 7071 7072 found_root = read_ref_root(extent_root->fs_info, ref_path); 7073 BUG_ON(!found_root); 7074 /* 7075 * for reference counted tree, only process reference paths 7076 * rooted at the latest committed root. 
7077 */ 7078 if (found_root->ref_cows && 7079 ref_path->root_generation != found_root->root_key.offset) 7080 continue; 7081 7082 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 7083 if (pass == 0) { 7084 /* 7085 * copy data extents to new locations 7086 */ 7087 u64 group_start = group->key.objectid; 7088 ret = relocate_data_extent(reloc_inode, 7089 extent_key, 7090 group_start); 7091 if (ret < 0) 7092 goto out; 7093 break; 7094 } 7095 level = 0; 7096 } else { 7097 level = ref_path->owner_objectid; 7098 } 7099 7100 if (prev_block != ref_path->nodes[level]) { 7101 struct extent_buffer *eb; 7102 u64 block_start = ref_path->nodes[level]; 7103 u64 block_size = btrfs_level_size(found_root, level); 7104 7105 eb = read_tree_block(found_root, block_start, 7106 block_size, 0); 7107 btrfs_tree_lock(eb); 7108 BUG_ON(level != btrfs_header_level(eb)); 7109 7110 if (level == 0) 7111 btrfs_item_key_to_cpu(eb, &first_key, 0); 7112 else 7113 btrfs_node_key_to_cpu(eb, &first_key, 0); 7114 7115 btrfs_tree_unlock(eb); 7116 free_extent_buffer(eb); 7117 prev_block = block_start; 7118 } 7119 7120 mutex_lock(&extent_root->fs_info->trans_mutex); 7121 btrfs_record_root_in_trans(found_root); 7122 mutex_unlock(&extent_root->fs_info->trans_mutex); 7123 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 7124 /* 7125 * try to update data extent references while 7126 * keeping metadata shared between snapshots. 7127 */ 7128 if (pass == 1) { 7129 ret = relocate_one_path(trans, found_root, 7130 path, &first_key, ref_path, 7131 group, reloc_inode); 7132 if (ret < 0) 7133 goto out; 7134 continue; 7135 } 7136 /* 7137 * use fallback method to process the remaining 7138 * references. 7139 */ 7140 if (!new_extents) { 7141 u64 group_start = group->key.objectid; 7142 new_extents = kmalloc(sizeof(*new_extents), 7143 GFP_NOFS); 7144 nr_extents = 1; 7145 ret = get_new_locations(reloc_inode, 7146 extent_key, 7147 group_start, 1, 7148 &new_extents, 7149 &nr_extents); 7150 if (ret) 7151 goto out; 7152 } 7153 ret = replace_one_extent(trans, found_root, 7154 path, extent_key, 7155 &first_key, ref_path, 7156 new_extents, nr_extents); 7157 } else { 7158 ret = relocate_tree_block(trans, found_root, path, 7159 &first_key, ref_path); 7160 } 7161 if (ret < 0) 7162 goto out; 7163 } 7164 ret = 0; 7165 out: 7166 btrfs_end_transaction(trans, extent_root); 7167 kfree(new_extents); 7168 kfree(ref_path); 7169 return ret; 7170 } 7171 #endif 7172 7173 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 7174 { 7175 u64 num_devices; 7176 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7177 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7178 7179 num_devices = root->fs_info->fs_devices->rw_devices; 7180 if (num_devices == 1) { 7181 stripped |= BTRFS_BLOCK_GROUP_DUP; 7182 stripped = flags & ~stripped; 7183 7184 /* turn raid0 into single device chunks */ 7185 if (flags & BTRFS_BLOCK_GROUP_RAID0) 7186 return stripped; 7187 7188 /* turn mirroring into duplication */ 7189 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 7190 BTRFS_BLOCK_GROUP_RAID10)) 7191 return stripped | BTRFS_BLOCK_GROUP_DUP; 7192 return flags; 7193 } else { 7194 /* they already had raid on here, just return */ 7195 if (flags & stripped) 7196 return flags; 7197 7198 stripped |= BTRFS_BLOCK_GROUP_DUP; 7199 stripped = flags & ~stripped; 7200 7201 /* switch duplicated blocks with raid1 */ 7202 if (flags & BTRFS_BLOCK_GROUP_DUP) 7203 return stripped | BTRFS_BLOCK_GROUP_RAID1; 7204 7205 /* turn single device chunks into raid0 */ 7206 return stripped | 
BTRFS_BLOCK_GROUP_RAID0; 7207 } 7208 return flags; 7209 } 7210 7211 static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7212 struct btrfs_block_group_cache *shrink_block_group, 7213 int force) 7214 { 7215 struct btrfs_trans_handle *trans; 7216 u64 new_alloc_flags; 7217 u64 calc; 7218 7219 spin_lock(&shrink_block_group->lock); 7220 if (btrfs_block_group_used(&shrink_block_group->item) + 7221 shrink_block_group->reserved > 0) { 7222 spin_unlock(&shrink_block_group->lock); 7223 7224 trans = btrfs_start_transaction(root, 1); 7225 spin_lock(&shrink_block_group->lock); 7226 7227 new_alloc_flags = update_block_group_flags(root, 7228 shrink_block_group->flags); 7229 if (new_alloc_flags != shrink_block_group->flags) { 7230 calc = 7231 btrfs_block_group_used(&shrink_block_group->item); 7232 } else { 7233 calc = shrink_block_group->key.offset; 7234 } 7235 spin_unlock(&shrink_block_group->lock); 7236 7237 do_chunk_alloc(trans, root->fs_info->extent_root, 7238 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7239 7240 btrfs_end_transaction(trans, root); 7241 } else 7242 spin_unlock(&shrink_block_group->lock); 7243 return 0; 7244 } 7245 7246 7247 int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7248 struct btrfs_block_group_cache *group) 7249 7250 { 7251 __alloc_chunk_for_shrink(root, group, 1); 7252 set_block_group_readonly(group); 7253 return 0; 7254 } 7255 7256 /* 7257 * checks to see if its even possible to relocate this block group. 7258 * 7259 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 7260 * ok to go ahead and try. 7261 */ 7262 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 7263 { 7264 struct btrfs_block_group_cache *block_group; 7265 struct btrfs_space_info *space_info; 7266 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7267 struct btrfs_device *device; 7268 int full = 0; 7269 int ret = 0; 7270 7271 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 7272 7273 /* odd, couldn't find the block group, leave it alone */ 7274 if (!block_group) 7275 return -1; 7276 7277 /* no bytes used, we're good */ 7278 if (!btrfs_block_group_used(&block_group->item)) 7279 goto out; 7280 7281 space_info = block_group->space_info; 7282 spin_lock(&space_info->lock); 7283 7284 full = space_info->full; 7285 7286 /* 7287 * if this is the last block group we have in this space, we can't 7288 * relocate it unless we're able to allocate a new chunk below. 7289 * 7290 * Otherwise, we need to make sure we have room in the space to handle 7291 * all of the extents from this block group. If we can, we're good 7292 */ 7293 if ((space_info->total_bytes != block_group->key.offset) && 7294 (space_info->bytes_used + space_info->bytes_reserved + 7295 space_info->bytes_pinned + space_info->bytes_readonly + 7296 btrfs_block_group_used(&block_group->item) < 7297 space_info->total_bytes)) { 7298 spin_unlock(&space_info->lock); 7299 goto out; 7300 } 7301 spin_unlock(&space_info->lock); 7302 7303 /* 7304 * ok we don't have enough space, but maybe we have free space on our 7305 * devices to allocate new chunks for relocation, so loop through our 7306 * alloc devices and guess if we have enough space. However, if we 7307 * were marked as full, then we know there aren't enough chunks, and we 7308 * can just return. 
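 * ret stays -1 unless some device can provide a free extent at least
 * as large as the bytes used in this block group.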
7309 */ 7310 ret = -1; 7311 if (full) 7312 goto out; 7313 7314 mutex_lock(&root->fs_info->chunk_mutex); 7315 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 7316 u64 min_free = btrfs_block_group_used(&block_group->item); 7317 u64 dev_offset, max_avail; 7318 7319 /* 7320 * check to make sure we can actually find a chunk with enough 7321 * space to fit our block group in. 7322 */ 7323 if (device->total_bytes > device->bytes_used + min_free) { 7324 ret = find_free_dev_extent(NULL, device, min_free, 7325 &dev_offset, &max_avail); 7326 if (!ret) 7327 break; 7328 ret = -1; 7329 } 7330 } 7331 mutex_unlock(&root->fs_info->chunk_mutex); 7332 out: 7333 btrfs_put_block_group(block_group); 7334 return ret; 7335 } 7336 7337 static int find_first_block_group(struct btrfs_root *root, 7338 struct btrfs_path *path, struct btrfs_key *key) 7339 { 7340 int ret = 0; 7341 struct btrfs_key found_key; 7342 struct extent_buffer *leaf; 7343 int slot; 7344 7345 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 7346 if (ret < 0) 7347 goto out; 7348 7349 while (1) { 7350 slot = path->slots[0]; 7351 leaf = path->nodes[0]; 7352 if (slot >= btrfs_header_nritems(leaf)) { 7353 ret = btrfs_next_leaf(root, path); 7354 if (ret == 0) 7355 continue; 7356 if (ret < 0) 7357 goto out; 7358 break; 7359 } 7360 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7361 7362 if (found_key.objectid >= key->objectid && 7363 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 7364 ret = 0; 7365 goto out; 7366 } 7367 path->slots[0]++; 7368 } 7369 ret = -ENOENT; 7370 out: 7371 return ret; 7372 } 7373 7374 int btrfs_free_block_groups(struct btrfs_fs_info *info) 7375 { 7376 struct btrfs_block_group_cache *block_group; 7377 struct btrfs_space_info *space_info; 7378 struct btrfs_caching_control *caching_ctl; 7379 struct rb_node *n; 7380 7381 down_write(&info->extent_commit_sem); 7382 while (!list_empty(&info->caching_block_groups)) { 7383 caching_ctl = list_entry(info->caching_block_groups.next, 7384 struct btrfs_caching_control, list); 7385 list_del(&caching_ctl->list); 7386 put_caching_control(caching_ctl); 7387 } 7388 up_write(&info->extent_commit_sem); 7389 7390 spin_lock(&info->block_group_cache_lock); 7391 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 7392 block_group = rb_entry(n, struct btrfs_block_group_cache, 7393 cache_node); 7394 rb_erase(&block_group->cache_node, 7395 &info->block_group_cache_tree); 7396 spin_unlock(&info->block_group_cache_lock); 7397 7398 down_write(&block_group->space_info->groups_sem); 7399 list_del(&block_group->list); 7400 up_write(&block_group->space_info->groups_sem); 7401 7402 if (block_group->cached == BTRFS_CACHE_STARTED) 7403 wait_block_group_cache_done(block_group); 7404 7405 btrfs_remove_free_space_cache(block_group); 7406 btrfs_put_block_group(block_group); 7407 7408 spin_lock(&info->block_group_cache_lock); 7409 } 7410 spin_unlock(&info->block_group_cache_lock); 7411 7412 /* now that all the block groups are freed, go through and 7413 * free all the space_info structs. This is only called during 7414 * the final stages of unmount, and so we know nobody is 7415 * using them. We call synchronize_rcu() once before we start, 7416 * just to be on the safe side. 
7417 */ 7418 synchronize_rcu(); 7419 7420 while(!list_empty(&info->space_info)) { 7421 space_info = list_entry(info->space_info.next, 7422 struct btrfs_space_info, 7423 list); 7424 7425 list_del(&space_info->list); 7426 kfree(space_info); 7427 } 7428 return 0; 7429 } 7430 7431 int btrfs_read_block_groups(struct btrfs_root *root) 7432 { 7433 struct btrfs_path *path; 7434 int ret; 7435 struct btrfs_block_group_cache *cache; 7436 struct btrfs_fs_info *info = root->fs_info; 7437 struct btrfs_space_info *space_info; 7438 struct btrfs_key key; 7439 struct btrfs_key found_key; 7440 struct extent_buffer *leaf; 7441 7442 root = info->extent_root; 7443 key.objectid = 0; 7444 key.offset = 0; 7445 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 7446 path = btrfs_alloc_path(); 7447 if (!path) 7448 return -ENOMEM; 7449 7450 while (1) { 7451 ret = find_first_block_group(root, path, &key); 7452 if (ret > 0) { 7453 ret = 0; 7454 goto error; 7455 } 7456 if (ret != 0) 7457 goto error; 7458 7459 leaf = path->nodes[0]; 7460 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 7461 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7462 if (!cache) { 7463 ret = -ENOMEM; 7464 break; 7465 } 7466 7467 atomic_set(&cache->count, 1); 7468 spin_lock_init(&cache->lock); 7469 spin_lock_init(&cache->tree_lock); 7470 cache->fs_info = info; 7471 INIT_LIST_HEAD(&cache->list); 7472 INIT_LIST_HEAD(&cache->cluster_list); 7473 7474 /* 7475 * we only want to have 32k of ram per block group for keeping 7476 * track of free space, and if we pass 1/2 of that we want to 7477 * start converting things over to using bitmaps 7478 */ 7479 cache->extents_thresh = ((1024 * 32) / 2) / 7480 sizeof(struct btrfs_free_space); 7481 7482 read_extent_buffer(leaf, &cache->item, 7483 btrfs_item_ptr_offset(leaf, path->slots[0]), 7484 sizeof(cache->item)); 7485 memcpy(&cache->key, &found_key, sizeof(found_key)); 7486 7487 key.objectid = found_key.objectid + found_key.offset; 7488 btrfs_release_path(root, path); 7489 cache->flags = btrfs_block_group_flags(&cache->item); 7490 cache->sectorsize = root->sectorsize; 7491 7492 /* 7493 * check for two cases, either we are full, and therefore 7494 * don't need to bother with the caching work since we won't 7495 * find any space, or we are empty, and we can just add all 7496 * the space in and be done with it. This saves us _alot_ of 7497 * time, particularly in the full case. 
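 * Everything in between is cached lazily, when the allocator first
 * needs to look at the block group.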
7498 */ 7499 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 7500 exclude_super_stripes(root, cache); 7501 cache->last_byte_to_unpin = (u64)-1; 7502 cache->cached = BTRFS_CACHE_FINISHED; 7503 free_excluded_extents(root, cache); 7504 } else if (btrfs_block_group_used(&cache->item) == 0) { 7505 exclude_super_stripes(root, cache); 7506 cache->last_byte_to_unpin = (u64)-1; 7507 cache->cached = BTRFS_CACHE_FINISHED; 7508 add_new_free_space(cache, root->fs_info, 7509 found_key.objectid, 7510 found_key.objectid + 7511 found_key.offset); 7512 free_excluded_extents(root, cache); 7513 } 7514 7515 ret = update_space_info(info, cache->flags, found_key.offset, 7516 btrfs_block_group_used(&cache->item), 7517 &space_info); 7518 BUG_ON(ret); 7519 cache->space_info = space_info; 7520 spin_lock(&cache->space_info->lock); 7521 cache->space_info->bytes_super += cache->bytes_super; 7522 spin_unlock(&cache->space_info->lock); 7523 7524 down_write(&space_info->groups_sem); 7525 list_add_tail(&cache->list, &space_info->block_groups); 7526 up_write(&space_info->groups_sem); 7527 7528 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7529 BUG_ON(ret); 7530 7531 set_avail_alloc_bits(root->fs_info, cache->flags); 7532 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7533 set_block_group_readonly(cache); 7534 } 7535 ret = 0; 7536 error: 7537 btrfs_free_path(path); 7538 return ret; 7539 } 7540 7541 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 7542 struct btrfs_root *root, u64 bytes_used, 7543 u64 type, u64 chunk_objectid, u64 chunk_offset, 7544 u64 size) 7545 { 7546 int ret; 7547 struct btrfs_root *extent_root; 7548 struct btrfs_block_group_cache *cache; 7549 7550 extent_root = root->fs_info->extent_root; 7551 7552 root->fs_info->last_trans_log_full_commit = trans->transid; 7553 7554 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7555 if (!cache) 7556 return -ENOMEM; 7557 7558 cache->key.objectid = chunk_offset; 7559 cache->key.offset = size; 7560 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7561 cache->sectorsize = root->sectorsize; 7562 7563 /* 7564 * we only want to have 32k of ram per block group for keeping track 7565 * of free space, and if we pass 1/2 of that we want to start 7566 * converting things over to using bitmaps 7567 */ 7568 cache->extents_thresh = ((1024 * 32) / 2) / 7569 sizeof(struct btrfs_free_space); 7570 atomic_set(&cache->count, 1); 7571 spin_lock_init(&cache->lock); 7572 spin_lock_init(&cache->tree_lock); 7573 INIT_LIST_HEAD(&cache->list); 7574 INIT_LIST_HEAD(&cache->cluster_list); 7575 7576 btrfs_set_block_group_used(&cache->item, bytes_used); 7577 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 7578 cache->flags = type; 7579 btrfs_set_block_group_flags(&cache->item, type); 7580 7581 cache->last_byte_to_unpin = (u64)-1; 7582 cache->cached = BTRFS_CACHE_FINISHED; 7583 exclude_super_stripes(root, cache); 7584 7585 add_new_free_space(cache, root->fs_info, chunk_offset, 7586 chunk_offset + size); 7587 7588 free_excluded_extents(root, cache); 7589 7590 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7591 &cache->space_info); 7592 BUG_ON(ret); 7593 7594 spin_lock(&cache->space_info->lock); 7595 cache->space_info->bytes_super += cache->bytes_super; 7596 spin_unlock(&cache->space_info->lock); 7597 7598 down_write(&cache->space_info->groups_sem); 7599 list_add_tail(&cache->list, &cache->space_info->block_groups); 7600 up_write(&cache->space_info->groups_sem); 7601 7602 ret = btrfs_add_block_group_cache(root->fs_info, 
cache); 7603 BUG_ON(ret); 7604 7605 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, 7606 sizeof(cache->item)); 7607 BUG_ON(ret); 7608 7609 set_avail_alloc_bits(extent_root->fs_info, type); 7610 7611 return 0; 7612 } 7613 7614 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7615 struct btrfs_root *root, u64 group_start) 7616 { 7617 struct btrfs_path *path; 7618 struct btrfs_block_group_cache *block_group; 7619 struct btrfs_free_cluster *cluster; 7620 struct btrfs_key key; 7621 int ret; 7622 7623 root = root->fs_info->extent_root; 7624 7625 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 7626 BUG_ON(!block_group); 7627 BUG_ON(!block_group->ro); 7628 7629 memcpy(&key, &block_group->key, sizeof(key)); 7630 7631 /* make sure this block group isn't part of an allocation cluster */ 7632 cluster = &root->fs_info->data_alloc_cluster; 7633 spin_lock(&cluster->refill_lock); 7634 btrfs_return_cluster_to_free_space(block_group, cluster); 7635 spin_unlock(&cluster->refill_lock); 7636 7637 /* 7638 * make sure this block group isn't part of a metadata 7639 * allocation cluster 7640 */ 7641 cluster = &root->fs_info->meta_alloc_cluster; 7642 spin_lock(&cluster->refill_lock); 7643 btrfs_return_cluster_to_free_space(block_group, cluster); 7644 spin_unlock(&cluster->refill_lock); 7645 7646 path = btrfs_alloc_path(); 7647 BUG_ON(!path); 7648 7649 spin_lock(&root->fs_info->block_group_cache_lock); 7650 rb_erase(&block_group->cache_node, 7651 &root->fs_info->block_group_cache_tree); 7652 spin_unlock(&root->fs_info->block_group_cache_lock); 7653 7654 down_write(&block_group->space_info->groups_sem); 7655 /* 7656 * we must use list_del_init so people can check to see if they 7657 * are still on the list after taking the semaphore 7658 */ 7659 list_del_init(&block_group->list); 7660 up_write(&block_group->space_info->groups_sem); 7661 7662 if (block_group->cached == BTRFS_CACHE_STARTED) 7663 wait_block_group_cache_done(block_group); 7664 7665 btrfs_remove_free_space_cache(block_group); 7666 7667 spin_lock(&block_group->space_info->lock); 7668 block_group->space_info->total_bytes -= block_group->key.offset; 7669 block_group->space_info->bytes_readonly -= block_group->key.offset; 7670 spin_unlock(&block_group->space_info->lock); 7671 7672 btrfs_clear_space_info_full(root->fs_info); 7673 7674 btrfs_put_block_group(block_group); 7675 btrfs_put_block_group(block_group); 7676 7677 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 7678 if (ret > 0) 7679 ret = -EIO; 7680 if (ret < 0) 7681 goto out; 7682 7683 ret = btrfs_del_item(trans, root, path); 7684 out: 7685 btrfs_free_path(path); 7686 return ret; 7687 } 7688