/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *   ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *   bytes_may_use as the ENOSPC accounting is done elsewhere
 */
enum {
	RESERVE_FREE = 0,
	RESERVE_ALLOC = 1,
	RESERVE_ALLOC_NO_ACCOUNT = 2,
};

static int update_block_group(struct btrfs_root *root,
			      u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
				       u64 num_bytes, int reserve);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;
	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}
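/*
 * For example, with block groups starting at 0, 1GB and 2GB (each 1GB long),
 * looking up bytenr 1.5GB with contains set returns the group at 1GB, since
 * that group covers the byte; with contains clear it returns the group at
 * 2GB, the first group that starts at or after the byte.
 */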
static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE, GFP_NOFS);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(root, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_STARTED) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	/* We're loading it the fast way, so we don't have a caching_ctl. */
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * This is only called by cache_block_group; since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't
 * be used yet, because their free space will be released as soon as the
 * transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
			      struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}
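/*
 * Worked example for add_new_free_space() above: if a block group covers
 * the range [0, 100) and pinned_extents has the ranges [10, 19] and
 * [40, 59] marked, then [0, 10), [20, 40) and [60, 100) are added to the
 * free space cache and 70, the total number of bytes added, is returned.
 */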
static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret = 0;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	path = btrfs_alloc_path();
	if (!path)
		goto out;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = 1;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;
again:
	mutex_lock(&caching_ctl->mutex);
	/* need to make sure the commit_root doesn't disappear */
	down_read(&fs_info->extent_commit_sem);

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched()) {
				caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->extent_commit_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				goto again;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto err;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->tree_root->leafsize;
			else
				last = key.objectid + key.offset;

			if (total_found > (1024 * 1024 * 2)) {
				total_found = 0;
				wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

err:
	btrfs_free_path(path);
	up_read(&fs_info->extent_commit_sem);

	free_excluded_extents(extent_root, block_group);

	mutex_unlock(&caching_ctl->mutex);
out:
	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	caching_ctl->work.func = caching_thread;

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
			}
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We are not going to do the fast caching, set cached to the
		 * appropriate value and wake up any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->extent_commit_sem);
	atomic_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->extent_commit_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);

	return ret;
}
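/*
 * To summarize cache_block_group() above: with the space cache mount option,
 * a successful load of the on-disk free space cache moves the group straight
 * to BTRFS_CACHE_FINISHED.  If that fast load fails or is unavailable, a
 * load_cache_only caller leaves the group at BTRFS_CACHE_NO; otherwise the
 * group is marked BTRFS_CACHE_STARTED and caching_thread() is queued to
 * rebuild the free space information from the extent tree.
 */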
/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	if (ret > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == start &&
		    key.type == BTRFS_METADATA_ITEM_KEY)
			ret = 0;
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  the head node
 * may also store the extent flags to set.  This way you can check what
 * the reference count and extent flags will be once all of the delayed
 * refs have been processed, without actually running them.
 */
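/*
 * For example, if the extent item on disk records 3 references and the
 * delayed ref head for that bytenr carries ref_mod == -1 (one queued drop),
 * the refcount reported by btrfs_lookup_extent_info() below is 2.
 */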
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
		offset = root->leafsize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (metadata) {
		key.objectid = bytenr;
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = offset;
	} else {
		key.objectid = bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = offset;
	}

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}
again:
	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = root->leafsize;
		btrfs_release_path(path);
		goto again;
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			atomic_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and
			 * try again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto again;
		}
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COW'd through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */
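/*
 * Concretely, the back ref items handled by the helpers below use these key
 * layouts (objectid, type, offset):
 *
 *   (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash(root, inode, offset))
 *	implicit back ref for a file extent; the btrfs_extent_data_ref item
 *	holds the root, inode objectid, file offset and ref count
 *
 *   (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, parent leaf bytenr)
 *	full back ref for a file extent; the btrfs_shared_data_ref item holds
 *	only a ref count
 *
 *   (tree block bytenr, BTRFS_TREE_BLOCK_REF_KEY, owner root objectid)
 *   (tree block bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent block bytenr)
 *	implicit and full back refs for a tree block; these items carry no
 *	payload at all
 */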
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(root, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}
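/*
 * Note that the hash above is only used to place the EXTENT_DATA_REF item;
 * collisions are resolved by comparison.  lookup_extent_data_ref() below
 * walks forward from the hash position and matches the actual root, inode
 * and offset with match_extent_data_ref(), while insert_extent_data_ref()
 * bumps key.offset past an existing item that belongs to a different
 * reference whenever the insert returns -EEXIST.
 */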
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
					  struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 * items in the tree are ordered.
 */
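/*
 * The inline refs searched here live directly after the extent item in the
 * leaf.  Roughly:
 *
 *	struct btrfs_extent_item
 *	[ struct btrfs_tree_block_info	(tree blocks only, and only without
 *					 the skinny metadata format) ]
 *	struct btrfs_extent_inline_ref	(repeated, sorted the same way the
 *					 keyed back ref items are)
 *
 * Each inline ref is btrfs_extent_inline_ref_size(type) bytes, which is why
 * the loop below can step from one ref to the next by type alone.
 */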
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (ret) {
		err = -EIO;
		WARN_ON(1);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(root, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(root, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}
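/*
 * Both setup_inline_extent_backref() and the removal case above work by
 * shifting the tail of the extent item with memmove_extent_buffer(): the
 * item is first grown (or afterwards shrunk) by the size of one inline ref,
 * and the refs that sort after the insertion or deletion point are moved up
 * or down so the inline list stays contiguous.
 */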
static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(root, path, iref,
					     refs_to_add, extent_op);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(root, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, root, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, root, path, bytenr,
					     parent, root_objectid,
					     owner, offset, refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(root, path, iref,
					     -refs_to_drop, NULL);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
	} else {
		ret = btrfs_del_item(trans, root, path);
	}
	return ret;
}
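/*
 * blkdev_issue_discard() takes its start and length in 512 byte sectors,
 * hence the two shifts by 9 below.
 */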
static int btrfs_issue_discard(struct block_device *bdev,
			       u64 start, u64 len)
{
	return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
}

static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
				u64 num_bytes, u64 *actual_bytes)
{
	int ret;
	u64 discarded_bytes = 0;
	struct btrfs_bio *bbio = NULL;


	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
			      bytenr, &num_bytes, &bbio, 0);
	/* Error condition is -ENOMEM */
	if (!ret) {
		struct btrfs_bio_stripe *stripe = bbio->stripes;
		int i;


		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			if (!stripe->dev->can_discard)
				continue;

			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
						  stripe->length);
			if (!ret)
				discarded_bytes += stripe->length;
			else if (ret != -EOPNOTSUPP)
				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		kfree(bbio);
	}

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
{
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;

	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
	       root_objectid == BTRFS_TREE_LOG_OBJECTID);

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
					num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
	} else {
		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
					num_bytes,
					parent, root_objectid, owner, offset,
					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
	}
	return ret;
}

static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  u64 bytenr, u64 num_bytes,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	u64 refs;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 1;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
					   path, bytenr, num_bytes, parent,
					   root_objectid, owner, offset,
					   refs_to_add, extent_op);
	if (ret == 0)
		goto out;

	if (ret != -EAGAIN) {
		err = ret;
		goto out;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	path->reada = 1;
	path->leave_spinning = 1;

	/* now insert the actual backref */
	ret = insert_extent_backref(trans, root->fs_info->extent_root,
				    path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	if (ret)
		btrfs_abort_transaction(trans, root, ret);
out:
	btrfs_free_path(path);
	return err;
}
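/*
 * Note on __btrfs_inc_extent_ref() above: insert_inline_extent_backref()
 * returns -EAGAIN when the new ref does not fit inline, or when a keyed back
 * ref item already exists for this block.  In that case the ref count on the
 * extent item is bumped here and the back ref is inserted as a separate
 * keyed item instead.
 */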
ref->objectid, ref->offset, 2024 &ins, node->ref_mod); 2025 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2026 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2027 node->num_bytes, parent, 2028 ref_root, ref->objectid, 2029 ref->offset, node->ref_mod, 2030 extent_op); 2031 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2032 ret = __btrfs_free_extent(trans, root, node->bytenr, 2033 node->num_bytes, parent, 2034 ref_root, ref->objectid, 2035 ref->offset, node->ref_mod, 2036 extent_op); 2037 } else { 2038 BUG(); 2039 } 2040 return ret; 2041 } 2042 2043 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2044 struct extent_buffer *leaf, 2045 struct btrfs_extent_item *ei) 2046 { 2047 u64 flags = btrfs_extent_flags(leaf, ei); 2048 if (extent_op->update_flags) { 2049 flags |= extent_op->flags_to_set; 2050 btrfs_set_extent_flags(leaf, ei, flags); 2051 } 2052 2053 if (extent_op->update_key) { 2054 struct btrfs_tree_block_info *bi; 2055 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2056 bi = (struct btrfs_tree_block_info *)(ei + 1); 2057 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2058 } 2059 } 2060 2061 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2062 struct btrfs_root *root, 2063 struct btrfs_delayed_ref_node *node, 2064 struct btrfs_delayed_extent_op *extent_op) 2065 { 2066 struct btrfs_key key; 2067 struct btrfs_path *path; 2068 struct btrfs_extent_item *ei; 2069 struct extent_buffer *leaf; 2070 u32 item_size; 2071 int ret; 2072 int err = 0; 2073 int metadata = !extent_op->is_data; 2074 2075 if (trans->aborted) 2076 return 0; 2077 2078 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2079 metadata = 0; 2080 2081 path = btrfs_alloc_path(); 2082 if (!path) 2083 return -ENOMEM; 2084 2085 key.objectid = node->bytenr; 2086 2087 if (metadata) { 2088 key.type = BTRFS_METADATA_ITEM_KEY; 2089 key.offset = extent_op->level; 2090 } else { 2091 key.type = BTRFS_EXTENT_ITEM_KEY; 2092 key.offset = node->num_bytes; 2093 } 2094 2095 again: 2096 path->reada = 1; 2097 path->leave_spinning = 1; 2098 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2099 path, 0, 1); 2100 if (ret < 0) { 2101 err = ret; 2102 goto out; 2103 } 2104 if (ret > 0) { 2105 if (metadata) { 2106 btrfs_release_path(path); 2107 metadata = 0; 2108 2109 key.offset = node->num_bytes; 2110 key.type = BTRFS_EXTENT_ITEM_KEY; 2111 goto again; 2112 } 2113 err = -EIO; 2114 goto out; 2115 } 2116 2117 leaf = path->nodes[0]; 2118 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2119 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2120 if (item_size < sizeof(*ei)) { 2121 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2122 path, (u64)-1, 0); 2123 if (ret < 0) { 2124 err = ret; 2125 goto out; 2126 } 2127 leaf = path->nodes[0]; 2128 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2129 } 2130 #endif 2131 BUG_ON(item_size < sizeof(*ei)); 2132 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2133 __run_delayed_extent_op(extent_op, leaf, ei); 2134 2135 btrfs_mark_buffer_dirty(leaf); 2136 out: 2137 btrfs_free_path(path); 2138 return err; 2139 } 2140 2141 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2142 struct btrfs_root *root, 2143 struct btrfs_delayed_ref_node *node, 2144 struct btrfs_delayed_extent_op *extent_op, 2145 int insert_reserved) 2146 { 2147 int ret = 0; 2148 struct btrfs_delayed_tree_ref *ref; 2149 struct btrfs_key ins; 2150 u64 parent = 0; 2151 u64 ref_root = 0; 2152 bool 
skinny_metadata = btrfs_fs_incompat(root->fs_info, 2153 SKINNY_METADATA); 2154 2155 ref = btrfs_delayed_node_to_tree_ref(node); 2156 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2157 parent = ref->parent; 2158 else 2159 ref_root = ref->root; 2160 2161 ins.objectid = node->bytenr; 2162 if (skinny_metadata) { 2163 ins.offset = ref->level; 2164 ins.type = BTRFS_METADATA_ITEM_KEY; 2165 } else { 2166 ins.offset = node->num_bytes; 2167 ins.type = BTRFS_EXTENT_ITEM_KEY; 2168 } 2169 2170 BUG_ON(node->ref_mod != 1); 2171 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2172 BUG_ON(!extent_op || !extent_op->update_flags); 2173 ret = alloc_reserved_tree_block(trans, root, 2174 parent, ref_root, 2175 extent_op->flags_to_set, 2176 &extent_op->key, 2177 ref->level, &ins); 2178 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2179 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2180 node->num_bytes, parent, ref_root, 2181 ref->level, 0, 1, extent_op); 2182 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2183 ret = __btrfs_free_extent(trans, root, node->bytenr, 2184 node->num_bytes, parent, ref_root, 2185 ref->level, 0, 1, extent_op); 2186 } else { 2187 BUG(); 2188 } 2189 return ret; 2190 } 2191 2192 /* helper function to actually process a single delayed ref entry */ 2193 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2194 struct btrfs_root *root, 2195 struct btrfs_delayed_ref_node *node, 2196 struct btrfs_delayed_extent_op *extent_op, 2197 int insert_reserved) 2198 { 2199 int ret = 0; 2200 2201 if (trans->aborted) 2202 return 0; 2203 2204 if (btrfs_delayed_ref_is_head(node)) { 2205 struct btrfs_delayed_ref_head *head; 2206 /* 2207 * we've hit the end of the chain and we were supposed 2208 * to insert this extent into the tree. But, it got 2209 * deleted before we ever needed to insert it, so all 2210 * we have to do is clean up the accounting 2211 */ 2212 BUG_ON(extent_op); 2213 head = btrfs_delayed_node_to_head(node); 2214 if (insert_reserved) { 2215 btrfs_pin_extent(root, node->bytenr, 2216 node->num_bytes, 1); 2217 if (head->is_data) { 2218 ret = btrfs_del_csums(trans, root, 2219 node->bytenr, 2220 node->num_bytes); 2221 } 2222 } 2223 return ret; 2224 } 2225 2226 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2227 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2228 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2229 insert_reserved); 2230 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2231 node->type == BTRFS_SHARED_DATA_REF_KEY) 2232 ret = run_delayed_data_ref(trans, root, node, extent_op, 2233 insert_reserved); 2234 else 2235 BUG(); 2236 return ret; 2237 } 2238 2239 static noinline struct btrfs_delayed_ref_node * 2240 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2241 { 2242 struct rb_node *node; 2243 struct btrfs_delayed_ref_node *ref; 2244 int action = BTRFS_ADD_DELAYED_REF; 2245 again: 2246 /* 2247 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2248 * this prevents ref count from going down to zero when 2249 * there still are pending delayed ref. 
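* If no add ref is found on the first pass, the search is repeated
* for BTRFS_DROP_DELAYED_REF via the 'again' label above.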
2250 */ 2251 node = rb_prev(&head->node.rb_node); 2252 while (1) { 2253 if (!node) 2254 break; 2255 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2256 rb_node); 2257 if (ref->bytenr != head->node.bytenr) 2258 break; 2259 if (ref->action == action) 2260 return ref; 2261 node = rb_prev(node); 2262 } 2263 if (action == BTRFS_ADD_DELAYED_REF) { 2264 action = BTRFS_DROP_DELAYED_REF; 2265 goto again; 2266 } 2267 return NULL; 2268 } 2269 2270 /* 2271 * Returns 0 on success or if called with an already aborted transaction. 2272 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2273 */ 2274 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 2275 struct btrfs_root *root, 2276 struct list_head *cluster) 2277 { 2278 struct btrfs_delayed_ref_root *delayed_refs; 2279 struct btrfs_delayed_ref_node *ref; 2280 struct btrfs_delayed_ref_head *locked_ref = NULL; 2281 struct btrfs_delayed_extent_op *extent_op; 2282 struct btrfs_fs_info *fs_info = root->fs_info; 2283 int ret; 2284 int count = 0; 2285 int must_insert_reserved = 0; 2286 2287 delayed_refs = &trans->transaction->delayed_refs; 2288 while (1) { 2289 if (!locked_ref) { 2290 /* pick a new head ref from the cluster list */ 2291 if (list_empty(cluster)) 2292 break; 2293 2294 locked_ref = list_entry(cluster->next, 2295 struct btrfs_delayed_ref_head, cluster); 2296 2297 /* grab the lock that says we are going to process 2298 * all the refs for this head */ 2299 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2300 2301 /* 2302 * we may have dropped the spin lock to get the head 2303 * mutex lock, and that might have given someone else 2304 * time to free the head. If that's true, it has been 2305 * removed from our list and we can move on. 2306 */ 2307 if (ret == -EAGAIN) { 2308 locked_ref = NULL; 2309 count++; 2310 continue; 2311 } 2312 } 2313 2314 /* 2315 * We need to try and merge add/drops of the same ref since we 2316 * can run into issues with relocate dropping the implicit ref 2317 * and then it being added back again before the drop can 2318 * finish. If we merged anything we need to re-loop so we can 2319 * get a good ref. 2320 */ 2321 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2322 locked_ref); 2323 2324 /* 2325 * locked_ref is the head node, so we have to go one 2326 * node back for any delayed ref updates 2327 */ 2328 ref = select_delayed_ref(locked_ref); 2329 2330 if (ref && ref->seq && 2331 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2332 /* 2333 * there are still refs with lower seq numbers in the 2334 * process of being added. Don't run this ref yet. 2335 */ 2336 list_del_init(&locked_ref->cluster); 2337 btrfs_delayed_ref_unlock(locked_ref); 2338 locked_ref = NULL; 2339 delayed_refs->num_heads_ready++; 2340 spin_unlock(&delayed_refs->lock); 2341 cond_resched(); 2342 spin_lock(&delayed_refs->lock); 2343 continue; 2344 } 2345 2346 /* 2347 * record the must insert reserved flag before we 2348 * drop the spin lock. 
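* The flag is cleared on the head at the same time so that the
* reserved extent insertion is performed exactly once, by this task.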
2349 */ 2350 must_insert_reserved = locked_ref->must_insert_reserved; 2351 locked_ref->must_insert_reserved = 0; 2352 2353 extent_op = locked_ref->extent_op; 2354 locked_ref->extent_op = NULL; 2355 2356 if (!ref) { 2357 /* All delayed refs have been processed, Go ahead 2358 * and send the head node to run_one_delayed_ref, 2359 * so that any accounting fixes can happen 2360 */ 2361 ref = &locked_ref->node; 2362 2363 if (extent_op && must_insert_reserved) { 2364 btrfs_free_delayed_extent_op(extent_op); 2365 extent_op = NULL; 2366 } 2367 2368 if (extent_op) { 2369 spin_unlock(&delayed_refs->lock); 2370 2371 ret = run_delayed_extent_op(trans, root, 2372 ref, extent_op); 2373 btrfs_free_delayed_extent_op(extent_op); 2374 2375 if (ret) { 2376 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2377 spin_lock(&delayed_refs->lock); 2378 btrfs_delayed_ref_unlock(locked_ref); 2379 return ret; 2380 } 2381 2382 goto next; 2383 } 2384 } 2385 2386 ref->in_tree = 0; 2387 rb_erase(&ref->rb_node, &delayed_refs->root); 2388 delayed_refs->num_entries--; 2389 if (!btrfs_delayed_ref_is_head(ref)) { 2390 /* 2391 * when we play the delayed ref, also correct the 2392 * ref_mod on head 2393 */ 2394 switch (ref->action) { 2395 case BTRFS_ADD_DELAYED_REF: 2396 case BTRFS_ADD_DELAYED_EXTENT: 2397 locked_ref->node.ref_mod -= ref->ref_mod; 2398 break; 2399 case BTRFS_DROP_DELAYED_REF: 2400 locked_ref->node.ref_mod += ref->ref_mod; 2401 break; 2402 default: 2403 WARN_ON(1); 2404 } 2405 } 2406 spin_unlock(&delayed_refs->lock); 2407 2408 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2409 must_insert_reserved); 2410 2411 btrfs_free_delayed_extent_op(extent_op); 2412 if (ret) { 2413 btrfs_delayed_ref_unlock(locked_ref); 2414 btrfs_put_delayed_ref(ref); 2415 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2416 spin_lock(&delayed_refs->lock); 2417 return ret; 2418 } 2419 2420 /* 2421 * If this node is a head, that means all the refs in this head 2422 * have been dealt with, and we will pick the next head to deal 2423 * with, so we must unlock the head and drop it from the cluster 2424 * list before we release it. 2425 */ 2426 if (btrfs_delayed_ref_is_head(ref)) { 2427 list_del_init(&locked_ref->cluster); 2428 btrfs_delayed_ref_unlock(locked_ref); 2429 locked_ref = NULL; 2430 } 2431 btrfs_put_delayed_ref(ref); 2432 count++; 2433 next: 2434 cond_resched(); 2435 spin_lock(&delayed_refs->lock); 2436 } 2437 return count; 2438 } 2439 2440 #ifdef SCRAMBLE_DELAYED_REFS 2441 /* 2442 * Normally delayed refs get processed in ascending bytenr order. This 2443 * correlates in most cases to the order added. 
To expose dependencies on this 2444 * order, we start to process the tree in the middle instead of the beginning 2445 */ 2446 static u64 find_middle(struct rb_root *root) 2447 { 2448 struct rb_node *n = root->rb_node; 2449 struct btrfs_delayed_ref_node *entry; 2450 int alt = 1; 2451 u64 middle; 2452 u64 first = 0, last = 0; 2453 2454 n = rb_first(root); 2455 if (n) { 2456 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2457 first = entry->bytenr; 2458 } 2459 n = rb_last(root); 2460 if (n) { 2461 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2462 last = entry->bytenr; 2463 } 2464 n = root->rb_node; 2465 2466 while (n) { 2467 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2468 WARN_ON(!entry->in_tree); 2469 2470 middle = entry->bytenr; 2471 2472 if (alt) 2473 n = n->rb_left; 2474 else 2475 n = n->rb_right; 2476 2477 alt = 1 - alt; 2478 } 2479 return middle; 2480 } 2481 #endif 2482 2483 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 2484 struct btrfs_fs_info *fs_info) 2485 { 2486 struct qgroup_update *qgroup_update; 2487 int ret = 0; 2488 2489 if (list_empty(&trans->qgroup_ref_list) != 2490 !trans->delayed_ref_elem.seq) { 2491 /* list without seq or seq without list */ 2492 btrfs_err(fs_info, 2493 "qgroup accounting update error, list is%s empty, seq is %#x.%x", 2494 list_empty(&trans->qgroup_ref_list) ? "" : " not", 2495 (u32)(trans->delayed_ref_elem.seq >> 32), 2496 (u32)trans->delayed_ref_elem.seq); 2497 BUG(); 2498 } 2499 2500 if (!trans->delayed_ref_elem.seq) 2501 return 0; 2502 2503 while (!list_empty(&trans->qgroup_ref_list)) { 2504 qgroup_update = list_first_entry(&trans->qgroup_ref_list, 2505 struct qgroup_update, list); 2506 list_del(&qgroup_update->list); 2507 if (!ret) 2508 ret = btrfs_qgroup_account_ref( 2509 trans, fs_info, qgroup_update->node, 2510 qgroup_update->extent_op); 2511 kfree(qgroup_update); 2512 } 2513 2514 btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); 2515 2516 return ret; 2517 } 2518 2519 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, 2520 int count) 2521 { 2522 int val = atomic_read(&delayed_refs->ref_seq); 2523 2524 if (val < seq || val >= seq + count) 2525 return 1; 2526 return 0; 2527 } 2528 2529 /* 2530 * this starts processing the delayed reference count updates and 2531 * extent insertions we have queued up so far. count can be 2532 * 0, which means to process everything in the tree at the start 2533 * of the run (but not newly added entries), or it can be some target 2534 * number you'd like to process. 
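* For example (illustrative), btrfs_write_dirty_block_groups() below
* passes (unsigned long)-1 so that all currently queued delayed refs
* are processed before the block group items are written out.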
2535 * 2536 * Returns 0 on success or if called with an aborted transaction 2537 * Returns <0 on error and aborts the transaction 2538 */ 2539 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2540 struct btrfs_root *root, unsigned long count) 2541 { 2542 struct rb_node *node; 2543 struct btrfs_delayed_ref_root *delayed_refs; 2544 struct btrfs_delayed_ref_node *ref; 2545 struct list_head cluster; 2546 int ret; 2547 u64 delayed_start; 2548 int run_all = count == (unsigned long)-1; 2549 int run_most = 0; 2550 int loops; 2551 2552 /* We'll clean this up in btrfs_cleanup_transaction */ 2553 if (trans->aborted) 2554 return 0; 2555 2556 if (root == root->fs_info->extent_root) 2557 root = root->fs_info->tree_root; 2558 2559 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2560 2561 delayed_refs = &trans->transaction->delayed_refs; 2562 INIT_LIST_HEAD(&cluster); 2563 if (count == 0) { 2564 count = delayed_refs->num_entries * 2; 2565 run_most = 1; 2566 } 2567 2568 if (!run_all && !run_most) { 2569 int old; 2570 int seq = atomic_read(&delayed_refs->ref_seq); 2571 2572 progress: 2573 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2574 if (old) { 2575 DEFINE_WAIT(__wait); 2576 if (delayed_refs->num_entries < 16348) 2577 return 0; 2578 2579 prepare_to_wait(&delayed_refs->wait, &__wait, 2580 TASK_UNINTERRUPTIBLE); 2581 2582 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2583 if (old) { 2584 schedule(); 2585 finish_wait(&delayed_refs->wait, &__wait); 2586 2587 if (!refs_newer(delayed_refs, seq, 256)) 2588 goto progress; 2589 else 2590 return 0; 2591 } else { 2592 finish_wait(&delayed_refs->wait, &__wait); 2593 goto again; 2594 } 2595 } 2596 2597 } else { 2598 atomic_inc(&delayed_refs->procs_running_refs); 2599 } 2600 2601 again: 2602 loops = 0; 2603 spin_lock(&delayed_refs->lock); 2604 2605 #ifdef SCRAMBLE_DELAYED_REFS 2606 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2607 #endif 2608 2609 while (1) { 2610 if (!(run_all || run_most) && 2611 delayed_refs->num_heads_ready < 64) 2612 break; 2613 2614 /* 2615 * go find something we can process in the rbtree. We start at 2616 * the beginning of the tree, and then build a cluster 2617 * of refs to process starting at the first one we are able to 2618 * lock 2619 */ 2620 delayed_start = delayed_refs->run_delayed_start; 2621 ret = btrfs_find_ref_cluster(trans, &cluster, 2622 delayed_refs->run_delayed_start); 2623 if (ret) 2624 break; 2625 2626 ret = run_clustered_refs(trans, root, &cluster); 2627 if (ret < 0) { 2628 btrfs_release_ref_cluster(&cluster); 2629 spin_unlock(&delayed_refs->lock); 2630 btrfs_abort_transaction(trans, root, ret); 2631 atomic_dec(&delayed_refs->procs_running_refs); 2632 return ret; 2633 } 2634 2635 atomic_add(ret, &delayed_refs->ref_seq); 2636 2637 count -= min_t(unsigned long, ret, count); 2638 2639 if (count == 0) 2640 break; 2641 2642 if (delayed_start >= delayed_refs->run_delayed_start) { 2643 if (loops == 0) { 2644 /* 2645 * btrfs_find_ref_cluster looped. let's do one 2646 * more cycle. if we don't run any delayed ref 2647 * during that cycle (because we can't because 2648 * all of them are blocked), bail out. 
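* Progress is detected via the return value of run_clustered_refs();
* whenever refs were actually run, loops is reset to 0 further below.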
2649 */ 2650 loops = 1; 2651 } else { 2652 /* 2653 * no runnable refs left, stop trying 2654 */ 2655 BUG_ON(run_all); 2656 break; 2657 } 2658 } 2659 if (ret) { 2660 /* refs were run, let's reset staleness detection */ 2661 loops = 0; 2662 } 2663 } 2664 2665 if (run_all) { 2666 if (!list_empty(&trans->new_bgs)) { 2667 spin_unlock(&delayed_refs->lock); 2668 btrfs_create_pending_block_groups(trans, root); 2669 spin_lock(&delayed_refs->lock); 2670 } 2671 2672 node = rb_first(&delayed_refs->root); 2673 if (!node) 2674 goto out; 2675 count = (unsigned long)-1; 2676 2677 while (node) { 2678 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2679 rb_node); 2680 if (btrfs_delayed_ref_is_head(ref)) { 2681 struct btrfs_delayed_ref_head *head; 2682 2683 head = btrfs_delayed_node_to_head(ref); 2684 atomic_inc(&ref->refs); 2685 2686 spin_unlock(&delayed_refs->lock); 2687 /* 2688 * Mutex was contended, block until it's 2689 * released and try again 2690 */ 2691 mutex_lock(&head->mutex); 2692 mutex_unlock(&head->mutex); 2693 2694 btrfs_put_delayed_ref(ref); 2695 cond_resched(); 2696 goto again; 2697 } 2698 node = rb_next(node); 2699 } 2700 spin_unlock(&delayed_refs->lock); 2701 schedule_timeout(1); 2702 goto again; 2703 } 2704 out: 2705 atomic_dec(&delayed_refs->procs_running_refs); 2706 smp_mb(); 2707 if (waitqueue_active(&delayed_refs->wait)) 2708 wake_up(&delayed_refs->wait); 2709 2710 spin_unlock(&delayed_refs->lock); 2711 assert_qgroups_uptodate(trans); 2712 return 0; 2713 } 2714 2715 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2716 struct btrfs_root *root, 2717 u64 bytenr, u64 num_bytes, u64 flags, 2718 int level, int is_data) 2719 { 2720 struct btrfs_delayed_extent_op *extent_op; 2721 int ret; 2722 2723 extent_op = btrfs_alloc_delayed_extent_op(); 2724 if (!extent_op) 2725 return -ENOMEM; 2726 2727 extent_op->flags_to_set = flags; 2728 extent_op->update_flags = 1; 2729 extent_op->update_key = 0; 2730 extent_op->is_data = is_data ? 
1 : 0; 2731 extent_op->level = level; 2732 2733 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2734 num_bytes, extent_op); 2735 if (ret) 2736 btrfs_free_delayed_extent_op(extent_op); 2737 return ret; 2738 } 2739 2740 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2741 struct btrfs_root *root, 2742 struct btrfs_path *path, 2743 u64 objectid, u64 offset, u64 bytenr) 2744 { 2745 struct btrfs_delayed_ref_head *head; 2746 struct btrfs_delayed_ref_node *ref; 2747 struct btrfs_delayed_data_ref *data_ref; 2748 struct btrfs_delayed_ref_root *delayed_refs; 2749 struct rb_node *node; 2750 int ret = 0; 2751 2752 ret = -ENOENT; 2753 delayed_refs = &trans->transaction->delayed_refs; 2754 spin_lock(&delayed_refs->lock); 2755 head = btrfs_find_delayed_ref_head(trans, bytenr); 2756 if (!head) 2757 goto out; 2758 2759 if (!mutex_trylock(&head->mutex)) { 2760 atomic_inc(&head->node.refs); 2761 spin_unlock(&delayed_refs->lock); 2762 2763 btrfs_release_path(path); 2764 2765 /* 2766 * Mutex was contended, block until it's released and let 2767 * caller try again 2768 */ 2769 mutex_lock(&head->mutex); 2770 mutex_unlock(&head->mutex); 2771 btrfs_put_delayed_ref(&head->node); 2772 return -EAGAIN; 2773 } 2774 2775 node = rb_prev(&head->node.rb_node); 2776 if (!node) 2777 goto out_unlock; 2778 2779 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2780 2781 if (ref->bytenr != bytenr) 2782 goto out_unlock; 2783 2784 ret = 1; 2785 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2786 goto out_unlock; 2787 2788 data_ref = btrfs_delayed_node_to_data_ref(ref); 2789 2790 node = rb_prev(node); 2791 if (node) { 2792 int seq = ref->seq; 2793 2794 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2795 if (ref->bytenr == bytenr && ref->seq == seq) 2796 goto out_unlock; 2797 } 2798 2799 if (data_ref->root != root->root_key.objectid || 2800 data_ref->objectid != objectid || data_ref->offset != offset) 2801 goto out_unlock; 2802 2803 ret = 0; 2804 out_unlock: 2805 mutex_unlock(&head->mutex); 2806 out: 2807 spin_unlock(&delayed_refs->lock); 2808 return ret; 2809 } 2810 2811 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2812 struct btrfs_root *root, 2813 struct btrfs_path *path, 2814 u64 objectid, u64 offset, u64 bytenr) 2815 { 2816 struct btrfs_root *extent_root = root->fs_info->extent_root; 2817 struct extent_buffer *leaf; 2818 struct btrfs_extent_data_ref *ref; 2819 struct btrfs_extent_inline_ref *iref; 2820 struct btrfs_extent_item *ei; 2821 struct btrfs_key key; 2822 u32 item_size; 2823 int ret; 2824 2825 key.objectid = bytenr; 2826 key.offset = (u64)-1; 2827 key.type = BTRFS_EXTENT_ITEM_KEY; 2828 2829 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2830 if (ret < 0) 2831 goto out; 2832 BUG_ON(ret == 0); /* Corruption */ 2833 2834 ret = -ENOENT; 2835 if (path->slots[0] == 0) 2836 goto out; 2837 2838 path->slots[0]--; 2839 leaf = path->nodes[0]; 2840 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2841 2842 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2843 goto out; 2844 2845 ret = 1; 2846 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2847 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2848 if (item_size < sizeof(*ei)) { 2849 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2850 goto out; 2851 } 2852 #endif 2853 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2854 2855 if (item_size != sizeof(*ei) + 2856 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2857 goto out; 
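/*
 * At this point the extent item has room for exactly one inline ref.
 * The checks below treat the extent as not cross referenced (ret == 0)
 * only if it was created after the last snapshot of this root and that
 * single data backref accounts for every reference and matches this
 * root, objectid and offset exactly.
 */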
2858 2859 if (btrfs_extent_generation(leaf, ei) <= 2860 btrfs_root_last_snapshot(&root->root_item)) 2861 goto out; 2862 2863 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2864 if (btrfs_extent_inline_ref_type(leaf, iref) != 2865 BTRFS_EXTENT_DATA_REF_KEY) 2866 goto out; 2867 2868 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2869 if (btrfs_extent_refs(leaf, ei) != 2870 btrfs_extent_data_ref_count(leaf, ref) || 2871 btrfs_extent_data_ref_root(leaf, ref) != 2872 root->root_key.objectid || 2873 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2874 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2875 goto out; 2876 2877 ret = 0; 2878 out: 2879 return ret; 2880 } 2881 2882 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2883 struct btrfs_root *root, 2884 u64 objectid, u64 offset, u64 bytenr) 2885 { 2886 struct btrfs_path *path; 2887 int ret; 2888 int ret2; 2889 2890 path = btrfs_alloc_path(); 2891 if (!path) 2892 return -ENOENT; 2893 2894 do { 2895 ret = check_committed_ref(trans, root, path, objectid, 2896 offset, bytenr); 2897 if (ret && ret != -ENOENT) 2898 goto out; 2899 2900 ret2 = check_delayed_ref(trans, root, path, objectid, 2901 offset, bytenr); 2902 } while (ret2 == -EAGAIN); 2903 2904 if (ret2 && ret2 != -ENOENT) { 2905 ret = ret2; 2906 goto out; 2907 } 2908 2909 if (ret != -ENOENT || ret2 != -ENOENT) 2910 ret = 0; 2911 out: 2912 btrfs_free_path(path); 2913 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 2914 WARN_ON(ret > 0); 2915 return ret; 2916 } 2917 2918 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2919 struct btrfs_root *root, 2920 struct extent_buffer *buf, 2921 int full_backref, int inc, int for_cow) 2922 { 2923 u64 bytenr; 2924 u64 num_bytes; 2925 u64 parent; 2926 u64 ref_root; 2927 u32 nritems; 2928 struct btrfs_key key; 2929 struct btrfs_file_extent_item *fi; 2930 int i; 2931 int level; 2932 int ret = 0; 2933 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2934 u64, u64, u64, u64, u64, u64, int); 2935 2936 ref_root = btrfs_header_owner(buf); 2937 nritems = btrfs_header_nritems(buf); 2938 level = btrfs_header_level(buf); 2939 2940 if (!root->ref_cows && level == 0) 2941 return 0; 2942 2943 if (inc) 2944 process_func = btrfs_inc_extent_ref; 2945 else 2946 process_func = btrfs_free_extent; 2947 2948 if (full_backref) 2949 parent = buf->start; 2950 else 2951 parent = 0; 2952 2953 for (i = 0; i < nritems; i++) { 2954 if (level == 0) { 2955 btrfs_item_key_to_cpu(buf, &key, i); 2956 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 2957 continue; 2958 fi = btrfs_item_ptr(buf, i, 2959 struct btrfs_file_extent_item); 2960 if (btrfs_file_extent_type(buf, fi) == 2961 BTRFS_FILE_EXTENT_INLINE) 2962 continue; 2963 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2964 if (bytenr == 0) 2965 continue; 2966 2967 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2968 key.offset -= btrfs_file_extent_offset(buf, fi); 2969 ret = process_func(trans, root, bytenr, num_bytes, 2970 parent, ref_root, key.objectid, 2971 key.offset, for_cow); 2972 if (ret) 2973 goto fail; 2974 } else { 2975 bytenr = btrfs_node_blockptr(buf, i); 2976 num_bytes = btrfs_level_size(root, level - 1); 2977 ret = process_func(trans, root, bytenr, num_bytes, 2978 parent, ref_root, level - 1, 0, 2979 for_cow); 2980 if (ret) 2981 goto fail; 2982 } 2983 } 2984 return 0; 2985 fail: 2986 return ret; 2987 } 2988 2989 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2990 struct extent_buffer *buf, int 
full_backref, int for_cow) 2991 { 2992 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); 2993 } 2994 2995 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2996 struct extent_buffer *buf, int full_backref, int for_cow) 2997 { 2998 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); 2999 } 3000 3001 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3002 struct btrfs_root *root, 3003 struct btrfs_path *path, 3004 struct btrfs_block_group_cache *cache) 3005 { 3006 int ret; 3007 struct btrfs_root *extent_root = root->fs_info->extent_root; 3008 unsigned long bi; 3009 struct extent_buffer *leaf; 3010 3011 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3012 if (ret < 0) 3013 goto fail; 3014 BUG_ON(ret); /* Corruption */ 3015 3016 leaf = path->nodes[0]; 3017 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3018 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3019 btrfs_mark_buffer_dirty(leaf); 3020 btrfs_release_path(path); 3021 fail: 3022 if (ret) { 3023 btrfs_abort_transaction(trans, root, ret); 3024 return ret; 3025 } 3026 return 0; 3027 3028 } 3029 3030 static struct btrfs_block_group_cache * 3031 next_block_group(struct btrfs_root *root, 3032 struct btrfs_block_group_cache *cache) 3033 { 3034 struct rb_node *node; 3035 spin_lock(&root->fs_info->block_group_cache_lock); 3036 node = rb_next(&cache->cache_node); 3037 btrfs_put_block_group(cache); 3038 if (node) { 3039 cache = rb_entry(node, struct btrfs_block_group_cache, 3040 cache_node); 3041 btrfs_get_block_group(cache); 3042 } else 3043 cache = NULL; 3044 spin_unlock(&root->fs_info->block_group_cache_lock); 3045 return cache; 3046 } 3047 3048 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3049 struct btrfs_trans_handle *trans, 3050 struct btrfs_path *path) 3051 { 3052 struct btrfs_root *root = block_group->fs_info->tree_root; 3053 struct inode *inode = NULL; 3054 u64 alloc_hint = 0; 3055 int dcs = BTRFS_DC_ERROR; 3056 int num_pages = 0; 3057 int retries = 0; 3058 int ret = 0; 3059 3060 /* 3061 * If this block group is smaller than 100 megs don't bother caching the 3062 * block group. 3063 */ 3064 if (block_group->key.offset < (100 * 1024 * 1024)) { 3065 spin_lock(&block_group->lock); 3066 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3067 spin_unlock(&block_group->lock); 3068 return 0; 3069 } 3070 3071 again: 3072 inode = lookup_free_space_inode(root, block_group, path); 3073 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3074 ret = PTR_ERR(inode); 3075 btrfs_release_path(path); 3076 goto out; 3077 } 3078 3079 if (IS_ERR(inode)) { 3080 BUG_ON(retries); 3081 retries++; 3082 3083 if (block_group->ro) 3084 goto out_free; 3085 3086 ret = create_free_space_inode(root, trans, block_group, path); 3087 if (ret) 3088 goto out_free; 3089 goto again; 3090 } 3091 3092 /* We've already setup this transaction, go ahead and exit */ 3093 if (block_group->cache_generation == trans->transid && 3094 i_size_read(inode)) { 3095 dcs = BTRFS_DC_SETUP; 3096 goto out_put; 3097 } 3098 3099 /* 3100 * We want to set the generation to 0, that way if anything goes wrong 3101 * from here on out we know not to trust this cache when we load up next 3102 * time. 
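* (cache_generation is only set back to trans->transid at the end of
* this function if the setup succeeded, i.e. dcs == BTRFS_DC_SETUP.)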
3103 */ 3104 BTRFS_I(inode)->generation = 0; 3105 ret = btrfs_update_inode(trans, root, inode); 3106 WARN_ON(ret); 3107 3108 if (i_size_read(inode) > 0) { 3109 ret = btrfs_check_trunc_cache_free_space(root, 3110 &root->fs_info->global_block_rsv); 3111 if (ret) 3112 goto out_put; 3113 3114 ret = btrfs_truncate_free_space_cache(root, trans, path, 3115 inode); 3116 if (ret) 3117 goto out_put; 3118 } 3119 3120 spin_lock(&block_group->lock); 3121 if (block_group->cached != BTRFS_CACHE_FINISHED || 3122 !btrfs_test_opt(root, SPACE_CACHE)) { 3123 /* 3124 * don't bother trying to write stuff out _if_ 3125 * a) we're not cached, 3126 * b) we're with nospace_cache mount option. 3127 */ 3128 dcs = BTRFS_DC_WRITTEN; 3129 spin_unlock(&block_group->lock); 3130 goto out_put; 3131 } 3132 spin_unlock(&block_group->lock); 3133 3134 /* 3135 * Try to preallocate enough space based on how big the block group is. 3136 * Keep in mind this has to include any pinned space which could end up 3137 * taking up quite a bit since it's not folded into the other space 3138 * cache. 3139 */ 3140 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3141 if (!num_pages) 3142 num_pages = 1; 3143 3144 num_pages *= 16; 3145 num_pages *= PAGE_CACHE_SIZE; 3146 3147 ret = btrfs_check_data_free_space(inode, num_pages); 3148 if (ret) 3149 goto out_put; 3150 3151 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3152 num_pages, num_pages, 3153 &alloc_hint); 3154 if (!ret) 3155 dcs = BTRFS_DC_SETUP; 3156 btrfs_free_reserved_data_space(inode, num_pages); 3157 3158 out_put: 3159 iput(inode); 3160 out_free: 3161 btrfs_release_path(path); 3162 out: 3163 spin_lock(&block_group->lock); 3164 if (!ret && dcs == BTRFS_DC_SETUP) 3165 block_group->cache_generation = trans->transid; 3166 block_group->disk_cache_state = dcs; 3167 spin_unlock(&block_group->lock); 3168 3169 return ret; 3170 } 3171 3172 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3173 struct btrfs_root *root) 3174 { 3175 struct btrfs_block_group_cache *cache; 3176 int err = 0; 3177 struct btrfs_path *path; 3178 u64 last = 0; 3179 3180 path = btrfs_alloc_path(); 3181 if (!path) 3182 return -ENOMEM; 3183 3184 again: 3185 while (1) { 3186 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3187 while (cache) { 3188 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3189 break; 3190 cache = next_block_group(root, cache); 3191 } 3192 if (!cache) { 3193 if (last == 0) 3194 break; 3195 last = 0; 3196 continue; 3197 } 3198 err = cache_save_setup(cache, trans, path); 3199 last = cache->key.objectid + cache->key.offset; 3200 btrfs_put_block_group(cache); 3201 } 3202 3203 while (1) { 3204 if (last == 0) { 3205 err = btrfs_run_delayed_refs(trans, root, 3206 (unsigned long)-1); 3207 if (err) /* File system offline */ 3208 goto out; 3209 } 3210 3211 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3212 while (cache) { 3213 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 3214 btrfs_put_block_group(cache); 3215 goto again; 3216 } 3217 3218 if (cache->dirty) 3219 break; 3220 cache = next_block_group(root, cache); 3221 } 3222 if (!cache) { 3223 if (last == 0) 3224 break; 3225 last = 0; 3226 continue; 3227 } 3228 3229 if (cache->disk_cache_state == BTRFS_DC_SETUP) 3230 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 3231 cache->dirty = 0; 3232 last = cache->key.objectid + cache->key.offset; 3233 3234 err = write_one_cache_group(trans, root, path, cache); 3235 if (err) /* File system offline */ 3236 goto out; 3237 3238 
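/* drop the reference taken when this block group was looked up above */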
btrfs_put_block_group(cache); 3239 } 3240 3241 while (1) { 3242 /* 3243 * I don't think this is needed since we're just marking our 3244 * preallocated extent as written, but just in case it can't 3245 * hurt. 3246 */ 3247 if (last == 0) { 3248 err = btrfs_run_delayed_refs(trans, root, 3249 (unsigned long)-1); 3250 if (err) /* File system offline */ 3251 goto out; 3252 } 3253 3254 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3255 while (cache) { 3256 /* 3257 * Really this shouldn't happen, but it could if we 3258 * couldn't write the entire preallocated extent and 3259 * splitting the extent resulted in a new block. 3260 */ 3261 if (cache->dirty) { 3262 btrfs_put_block_group(cache); 3263 goto again; 3264 } 3265 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3266 break; 3267 cache = next_block_group(root, cache); 3268 } 3269 if (!cache) { 3270 if (last == 0) 3271 break; 3272 last = 0; 3273 continue; 3274 } 3275 3276 err = btrfs_write_out_cache(root, trans, cache, path); 3277 3278 /* 3279 * If we didn't have an error then the cache state is still 3280 * NEED_WRITE, so we can set it to WRITTEN. 3281 */ 3282 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3283 cache->disk_cache_state = BTRFS_DC_WRITTEN; 3284 last = cache->key.objectid + cache->key.offset; 3285 btrfs_put_block_group(cache); 3286 } 3287 out: 3288 3289 btrfs_free_path(path); 3290 return err; 3291 } 3292 3293 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3294 { 3295 struct btrfs_block_group_cache *block_group; 3296 int readonly = 0; 3297 3298 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3299 if (!block_group || block_group->ro) 3300 readonly = 1; 3301 if (block_group) 3302 btrfs_put_block_group(block_group); 3303 return readonly; 3304 } 3305 3306 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3307 u64 total_bytes, u64 bytes_used, 3308 struct btrfs_space_info **space_info) 3309 { 3310 struct btrfs_space_info *found; 3311 int i; 3312 int factor; 3313 3314 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3315 BTRFS_BLOCK_GROUP_RAID10)) 3316 factor = 2; 3317 else 3318 factor = 1; 3319 3320 found = __find_space_info(info, flags); 3321 if (found) { 3322 spin_lock(&found->lock); 3323 found->total_bytes += total_bytes; 3324 found->disk_total += total_bytes * factor; 3325 found->bytes_used += bytes_used; 3326 found->disk_used += bytes_used * factor; 3327 found->full = 0; 3328 spin_unlock(&found->lock); 3329 *space_info = found; 3330 return 0; 3331 } 3332 found = kzalloc(sizeof(*found), GFP_NOFS); 3333 if (!found) 3334 return -ENOMEM; 3335 3336 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3337 INIT_LIST_HEAD(&found->block_groups[i]); 3338 init_rwsem(&found->groups_sem); 3339 spin_lock_init(&found->lock); 3340 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3341 found->total_bytes = total_bytes; 3342 found->disk_total = total_bytes * factor; 3343 found->bytes_used = bytes_used; 3344 found->disk_used = bytes_used * factor; 3345 found->bytes_pinned = 0; 3346 found->bytes_reserved = 0; 3347 found->bytes_readonly = 0; 3348 found->bytes_may_use = 0; 3349 found->full = 0; 3350 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3351 found->chunk_alloc = 0; 3352 found->flush = 0; 3353 init_waitqueue_head(&found->wait); 3354 *space_info = found; 3355 list_add_rcu(&found->list, &info->space_info); 3356 if (flags & BTRFS_BLOCK_GROUP_DATA) 3357 info->data_sinfo = found; 3358 return 0; 3359 } 3360 3361 static void set_avail_alloc_bits(struct btrfs_fs_info 
*fs_info, u64 flags) 3362 { 3363 u64 extra_flags = chunk_to_extended(flags) & 3364 BTRFS_EXTENDED_PROFILE_MASK; 3365 3366 write_seqlock(&fs_info->profiles_lock); 3367 if (flags & BTRFS_BLOCK_GROUP_DATA) 3368 fs_info->avail_data_alloc_bits |= extra_flags; 3369 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3370 fs_info->avail_metadata_alloc_bits |= extra_flags; 3371 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3372 fs_info->avail_system_alloc_bits |= extra_flags; 3373 write_sequnlock(&fs_info->profiles_lock); 3374 } 3375 3376 /* 3377 * returns target flags in extended format or 0 if restripe for this 3378 * chunk_type is not in progress 3379 * 3380 * should be called with either volume_mutex or balance_lock held 3381 */ 3382 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3383 { 3384 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3385 u64 target = 0; 3386 3387 if (!bctl) 3388 return 0; 3389 3390 if (flags & BTRFS_BLOCK_GROUP_DATA && 3391 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3392 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3393 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3394 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3395 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3396 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3397 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3398 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3399 } 3400 3401 return target; 3402 } 3403 3404 /* 3405 * @flags: available profiles in extended format (see ctree.h) 3406 * 3407 * Returns reduced profile in chunk format. If profile changing is in 3408 * progress (either running or paused) picks the target profile (if it's 3409 * already available), otherwise falls back to plain reducing. 3410 */ 3411 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3412 { 3413 /* 3414 * we add in the count of missing devices because we want 3415 * to make sure that any RAID levels on a degraded FS 3416 * continue to be honored. 
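* (For example, a two device RAID1 filesystem with one device missing
* still counts as two devices here, so the RAID1 bit is not stripped
* out by the num_devices checks below.)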
3417 */ 3418 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3419 root->fs_info->fs_devices->missing_devices; 3420 u64 target; 3421 u64 tmp; 3422 3423 /* 3424 * see if restripe for this chunk_type is in progress, if so 3425 * try to reduce to the target profile 3426 */ 3427 spin_lock(&root->fs_info->balance_lock); 3428 target = get_restripe_target(root->fs_info, flags); 3429 if (target) { 3430 /* pick target profile only if it's already available */ 3431 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3432 spin_unlock(&root->fs_info->balance_lock); 3433 return extended_to_chunk(target); 3434 } 3435 } 3436 spin_unlock(&root->fs_info->balance_lock); 3437 3438 /* First, mask out the RAID levels which aren't possible */ 3439 if (num_devices == 1) 3440 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3441 BTRFS_BLOCK_GROUP_RAID5); 3442 if (num_devices < 3) 3443 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3444 if (num_devices < 4) 3445 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3446 3447 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3448 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3449 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3450 flags &= ~tmp; 3451 3452 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3453 tmp = BTRFS_BLOCK_GROUP_RAID6; 3454 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3455 tmp = BTRFS_BLOCK_GROUP_RAID5; 3456 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3457 tmp = BTRFS_BLOCK_GROUP_RAID10; 3458 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3459 tmp = BTRFS_BLOCK_GROUP_RAID1; 3460 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3461 tmp = BTRFS_BLOCK_GROUP_RAID0; 3462 3463 return extended_to_chunk(flags | tmp); 3464 } 3465 3466 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3467 { 3468 unsigned seq; 3469 3470 do { 3471 seq = read_seqbegin(&root->fs_info->profiles_lock); 3472 3473 if (flags & BTRFS_BLOCK_GROUP_DATA) 3474 flags |= root->fs_info->avail_data_alloc_bits; 3475 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3476 flags |= root->fs_info->avail_system_alloc_bits; 3477 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3478 flags |= root->fs_info->avail_metadata_alloc_bits; 3479 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3480 3481 return btrfs_reduce_alloc_profile(root, flags); 3482 } 3483 3484 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3485 { 3486 u64 flags; 3487 u64 ret; 3488 3489 if (data) 3490 flags = BTRFS_BLOCK_GROUP_DATA; 3491 else if (root == root->fs_info->chunk_root) 3492 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3493 else 3494 flags = BTRFS_BLOCK_GROUP_METADATA; 3495 3496 ret = get_alloc_profile(root, flags); 3497 return ret; 3498 } 3499 3500 /* 3501 * This will check the space that the inode allocates from to make sure we have 3502 * enough space for bytes. 
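* For example (illustrative), cache_save_setup() above reserves space
* with btrfs_check_data_free_space(inode, num_pages) before
* preallocating the free space cache file and releases it again with
* btrfs_free_reserved_data_space(inode, num_pages).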
3503 */ 3504 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3505 { 3506 struct btrfs_space_info *data_sinfo; 3507 struct btrfs_root *root = BTRFS_I(inode)->root; 3508 struct btrfs_fs_info *fs_info = root->fs_info; 3509 u64 used; 3510 int ret = 0, committed = 0, alloc_chunk = 1; 3511 3512 /* make sure bytes are sectorsize aligned */ 3513 bytes = ALIGN(bytes, root->sectorsize); 3514 3515 if (root == root->fs_info->tree_root || 3516 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3517 alloc_chunk = 0; 3518 committed = 1; 3519 } 3520 3521 data_sinfo = fs_info->data_sinfo; 3522 if (!data_sinfo) 3523 goto alloc; 3524 3525 again: 3526 /* make sure we have enough space to handle the data first */ 3527 spin_lock(&data_sinfo->lock); 3528 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3529 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3530 data_sinfo->bytes_may_use; 3531 3532 if (used + bytes > data_sinfo->total_bytes) { 3533 struct btrfs_trans_handle *trans; 3534 3535 /* 3536 * if we don't have enough free bytes in this space then we need 3537 * to alloc a new chunk. 3538 */ 3539 if (!data_sinfo->full && alloc_chunk) { 3540 u64 alloc_target; 3541 3542 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3543 spin_unlock(&data_sinfo->lock); 3544 alloc: 3545 alloc_target = btrfs_get_alloc_profile(root, 1); 3546 trans = btrfs_join_transaction(root); 3547 if (IS_ERR(trans)) 3548 return PTR_ERR(trans); 3549 3550 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3551 alloc_target, 3552 CHUNK_ALLOC_NO_FORCE); 3553 btrfs_end_transaction(trans, root); 3554 if (ret < 0) { 3555 if (ret != -ENOSPC) 3556 return ret; 3557 else 3558 goto commit_trans; 3559 } 3560 3561 if (!data_sinfo) 3562 data_sinfo = fs_info->data_sinfo; 3563 3564 goto again; 3565 } 3566 3567 /* 3568 * If we have less pinned bytes than we want to allocate then 3569 * don't bother committing the transaction, it won't help us. 3570 */ 3571 if (data_sinfo->bytes_pinned < bytes) 3572 committed = 1; 3573 spin_unlock(&data_sinfo->lock); 3574 3575 /* commit the current transaction and try again */ 3576 commit_trans: 3577 if (!committed && 3578 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3579 committed = 1; 3580 trans = btrfs_join_transaction(root); 3581 if (IS_ERR(trans)) 3582 return PTR_ERR(trans); 3583 ret = btrfs_commit_transaction(trans, root); 3584 if (ret) 3585 return ret; 3586 goto again; 3587 } 3588 3589 return -ENOSPC; 3590 } 3591 data_sinfo->bytes_may_use += bytes; 3592 trace_btrfs_space_reservation(root->fs_info, "space_info", 3593 data_sinfo->flags, bytes, 1); 3594 spin_unlock(&data_sinfo->lock); 3595 3596 return 0; 3597 } 3598 3599 /* 3600 * Called if we need to clear a data reservation for this inode. 
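* The byte count is aligned to the sector size in the same way as in
* btrfs_check_data_free_space() before being subtracted from
* bytes_may_use.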
3601 */ 3602 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3603 { 3604 struct btrfs_root *root = BTRFS_I(inode)->root; 3605 struct btrfs_space_info *data_sinfo; 3606 3607 /* make sure bytes are sectorsize aligned */ 3608 bytes = ALIGN(bytes, root->sectorsize); 3609 3610 data_sinfo = root->fs_info->data_sinfo; 3611 spin_lock(&data_sinfo->lock); 3612 data_sinfo->bytes_may_use -= bytes; 3613 trace_btrfs_space_reservation(root->fs_info, "space_info", 3614 data_sinfo->flags, bytes, 0); 3615 spin_unlock(&data_sinfo->lock); 3616 } 3617 3618 static void force_metadata_allocation(struct btrfs_fs_info *info) 3619 { 3620 struct list_head *head = &info->space_info; 3621 struct btrfs_space_info *found; 3622 3623 rcu_read_lock(); 3624 list_for_each_entry_rcu(found, head, list) { 3625 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3626 found->force_alloc = CHUNK_ALLOC_FORCE; 3627 } 3628 rcu_read_unlock(); 3629 } 3630 3631 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3632 { 3633 return (global->size << 1); 3634 } 3635 3636 static int should_alloc_chunk(struct btrfs_root *root, 3637 struct btrfs_space_info *sinfo, int force) 3638 { 3639 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3640 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3641 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3642 u64 thresh; 3643 3644 if (force == CHUNK_ALLOC_FORCE) 3645 return 1; 3646 3647 /* 3648 * We need to take into account the global rsv because for all intents 3649 * and purposes it's used space. Don't worry about locking the 3650 * global_rsv, it doesn't change except when the transaction commits. 3651 */ 3652 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3653 num_allocated += calc_global_rsv_need_space(global_rsv); 3654 3655 /* 3656 * in limited mode, we want to have some free space up to 3657 * about 1% of the FS size. 
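* (The threshold below works out to max(64MB, roughly 1% of the total
* bytes recorded in the super block).)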
3658 */ 3659 if (force == CHUNK_ALLOC_LIMITED) { 3660 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3661 thresh = max_t(u64, 64 * 1024 * 1024, 3662 div_factor_fine(thresh, 1)); 3663 3664 if (num_bytes - num_allocated < thresh) 3665 return 1; 3666 } 3667 3668 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3669 return 0; 3670 return 1; 3671 } 3672 3673 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3674 { 3675 u64 num_dev; 3676 3677 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3678 BTRFS_BLOCK_GROUP_RAID0 | 3679 BTRFS_BLOCK_GROUP_RAID5 | 3680 BTRFS_BLOCK_GROUP_RAID6)) 3681 num_dev = root->fs_info->fs_devices->rw_devices; 3682 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3683 num_dev = 2; 3684 else 3685 num_dev = 1; /* DUP or single */ 3686 3687 /* metadata for updaing devices and chunk tree */ 3688 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3689 } 3690 3691 static void check_system_chunk(struct btrfs_trans_handle *trans, 3692 struct btrfs_root *root, u64 type) 3693 { 3694 struct btrfs_space_info *info; 3695 u64 left; 3696 u64 thresh; 3697 3698 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3699 spin_lock(&info->lock); 3700 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3701 info->bytes_reserved - info->bytes_readonly; 3702 spin_unlock(&info->lock); 3703 3704 thresh = get_system_chunk_thresh(root, type); 3705 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3706 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3707 left, thresh, type); 3708 dump_space_info(info, 0, 0); 3709 } 3710 3711 if (left < thresh) { 3712 u64 flags; 3713 3714 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3715 btrfs_alloc_chunk(trans, root, flags); 3716 } 3717 } 3718 3719 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3720 struct btrfs_root *extent_root, u64 flags, int force) 3721 { 3722 struct btrfs_space_info *space_info; 3723 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3724 int wait_for_alloc = 0; 3725 int ret = 0; 3726 3727 /* Don't re-enter if we're already allocating a chunk */ 3728 if (trans->allocating_chunk) 3729 return -ENOSPC; 3730 3731 space_info = __find_space_info(extent_root->fs_info, flags); 3732 if (!space_info) { 3733 ret = update_space_info(extent_root->fs_info, flags, 3734 0, 0, &space_info); 3735 BUG_ON(ret); /* -ENOMEM */ 3736 } 3737 BUG_ON(!space_info); /* Logic error */ 3738 3739 again: 3740 spin_lock(&space_info->lock); 3741 if (force < space_info->force_alloc) 3742 force = space_info->force_alloc; 3743 if (space_info->full) { 3744 spin_unlock(&space_info->lock); 3745 return 0; 3746 } 3747 3748 if (!should_alloc_chunk(extent_root, space_info, force)) { 3749 spin_unlock(&space_info->lock); 3750 return 0; 3751 } else if (space_info->chunk_alloc) { 3752 wait_for_alloc = 1; 3753 } else { 3754 space_info->chunk_alloc = 1; 3755 } 3756 3757 spin_unlock(&space_info->lock); 3758 3759 mutex_lock(&fs_info->chunk_mutex); 3760 3761 /* 3762 * The chunk_mutex is held throughout the entirety of a chunk 3763 * allocation, so once we've acquired the chunk_mutex we know that the 3764 * other guy is done and we need to recheck and see if we should 3765 * allocate. 
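* That is what the wait_for_alloc path below does: it drops the mutex
* and jumps back to 'again' to re-evaluate should_alloc_chunk().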
3766 */ 3767 if (wait_for_alloc) { 3768 mutex_unlock(&fs_info->chunk_mutex); 3769 wait_for_alloc = 0; 3770 goto again; 3771 } 3772 3773 trans->allocating_chunk = true; 3774 3775 /* 3776 * If we have mixed data/metadata chunks we want to make sure we keep 3777 * allocating mixed chunks instead of individual chunks. 3778 */ 3779 if (btrfs_mixed_space_info(space_info)) 3780 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3781 3782 /* 3783 * if we're doing a data chunk, go ahead and make sure that 3784 * we keep a reasonable number of metadata chunks allocated in the 3785 * FS as well. 3786 */ 3787 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3788 fs_info->data_chunk_allocations++; 3789 if (!(fs_info->data_chunk_allocations % 3790 fs_info->metadata_ratio)) 3791 force_metadata_allocation(fs_info); 3792 } 3793 3794 /* 3795 * Check if we have enough space in SYSTEM chunk because we may need 3796 * to update devices. 3797 */ 3798 check_system_chunk(trans, extent_root, flags); 3799 3800 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3801 trans->allocating_chunk = false; 3802 3803 spin_lock(&space_info->lock); 3804 if (ret < 0 && ret != -ENOSPC) 3805 goto out; 3806 if (ret) 3807 space_info->full = 1; 3808 else 3809 ret = 1; 3810 3811 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3812 out: 3813 space_info->chunk_alloc = 0; 3814 spin_unlock(&space_info->lock); 3815 mutex_unlock(&fs_info->chunk_mutex); 3816 return ret; 3817 } 3818 3819 static int can_overcommit(struct btrfs_root *root, 3820 struct btrfs_space_info *space_info, u64 bytes, 3821 enum btrfs_reserve_flush_enum flush) 3822 { 3823 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3824 u64 profile = btrfs_get_alloc_profile(root, 0); 3825 u64 space_size; 3826 u64 avail; 3827 u64 used; 3828 u64 to_add; 3829 3830 used = space_info->bytes_used + space_info->bytes_reserved + 3831 space_info->bytes_pinned + space_info->bytes_readonly; 3832 3833 /* 3834 * We only want to allow over committing if we have lots of actual space 3835 * free, but if we don't have enough space to handle the global reserve 3836 * space then we could end up having a real enospc problem when trying 3837 * to allocate a chunk or some other such important allocation. 3838 */ 3839 spin_lock(&global_rsv->lock); 3840 space_size = calc_global_rsv_need_space(global_rsv); 3841 spin_unlock(&global_rsv->lock); 3842 if (used + space_size >= space_info->total_bytes) 3843 return 0; 3844 3845 used += space_info->bytes_may_use; 3846 3847 spin_lock(&root->fs_info->free_chunk_lock); 3848 avail = root->fs_info->free_chunk_space; 3849 spin_unlock(&root->fs_info->free_chunk_lock); 3850 3851 /* 3852 * If we have dup, raid1 or raid10 then only half of the free 3853 * space is actually useable. For raid56, the space info used 3854 * doesn't include the parity drive, so we don't have to 3855 * change the math 3856 */ 3857 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3858 BTRFS_BLOCK_GROUP_RAID1 | 3859 BTRFS_BLOCK_GROUP_RAID10)) 3860 avail >>= 1; 3861 3862 to_add = space_info->total_bytes; 3863 3864 /* 3865 * If we aren't flushing all things, let us overcommit up to 3866 * 1/2th of the space. If we can flush, don't let us overcommit 3867 * too much, let it overcommit up to 1/8 of the space. 3868 */ 3869 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3870 to_add >>= 3; 3871 else 3872 to_add >>= 1; 3873 3874 /* 3875 * Limit the overcommit to the amount of free space we could possibly 3876 * allocate for chunks. 
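* (Illustratively: with BTRFS_RESERVE_FLUSH_ALL the allowed overcommit
* is total_bytes/8, otherwise total_bytes/2, in both cases capped by
* 'avail', and the reservation is allowed when
* used + bytes < total_bytes + to_add.)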
3877 */ 3878 to_add = min(avail, to_add); 3879 3880 if (used + bytes < space_info->total_bytes + to_add) 3881 return 1; 3882 return 0; 3883 } 3884 3885 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 3886 unsigned long nr_pages) 3887 { 3888 struct super_block *sb = root->fs_info->sb; 3889 int started; 3890 3891 /* If we can not start writeback, just sync all the delalloc file. */ 3892 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3893 WB_REASON_FS_FREE_SPACE); 3894 if (!started) { 3895 /* 3896 * We needn't worry the filesystem going from r/w to r/o though 3897 * we don't acquire ->s_umount mutex, because the filesystem 3898 * should guarantee the delalloc inodes list be empty after 3899 * the filesystem is readonly(all dirty pages are written to 3900 * the disk). 3901 */ 3902 btrfs_start_delalloc_inodes(root, 0); 3903 if (!current->journal_info) 3904 btrfs_wait_ordered_extents(root, 0); 3905 } 3906 } 3907 3908 /* 3909 * shrink metadata reservation for delalloc 3910 */ 3911 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 3912 bool wait_ordered) 3913 { 3914 struct btrfs_block_rsv *block_rsv; 3915 struct btrfs_space_info *space_info; 3916 struct btrfs_trans_handle *trans; 3917 u64 delalloc_bytes; 3918 u64 max_reclaim; 3919 long time_left; 3920 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3921 int loops = 0; 3922 enum btrfs_reserve_flush_enum flush; 3923 3924 trans = (struct btrfs_trans_handle *)current->journal_info; 3925 block_rsv = &root->fs_info->delalloc_block_rsv; 3926 space_info = block_rsv->space_info; 3927 3928 smp_mb(); 3929 delalloc_bytes = percpu_counter_sum_positive( 3930 &root->fs_info->delalloc_bytes); 3931 if (delalloc_bytes == 0) { 3932 if (trans) 3933 return; 3934 btrfs_wait_ordered_extents(root, 0); 3935 return; 3936 } 3937 3938 while (delalloc_bytes && loops < 3) { 3939 max_reclaim = min(delalloc_bytes, to_reclaim); 3940 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3941 btrfs_writeback_inodes_sb_nr(root, nr_pages); 3942 /* 3943 * We need to wait for the async pages to actually start before 3944 * we do anything. 3945 */ 3946 wait_event(root->fs_info->async_submit_wait, 3947 !atomic_read(&root->fs_info->async_delalloc_pages)); 3948 3949 if (!trans) 3950 flush = BTRFS_RESERVE_FLUSH_ALL; 3951 else 3952 flush = BTRFS_RESERVE_NO_FLUSH; 3953 spin_lock(&space_info->lock); 3954 if (can_overcommit(root, space_info, orig, flush)) { 3955 spin_unlock(&space_info->lock); 3956 break; 3957 } 3958 spin_unlock(&space_info->lock); 3959 3960 loops++; 3961 if (wait_ordered && !trans) { 3962 btrfs_wait_ordered_extents(root, 0); 3963 } else { 3964 time_left = schedule_timeout_killable(1); 3965 if (time_left) 3966 break; 3967 } 3968 smp_mb(); 3969 delalloc_bytes = percpu_counter_sum_positive( 3970 &root->fs_info->delalloc_bytes); 3971 } 3972 } 3973 3974 /** 3975 * maybe_commit_transaction - possibly commit the transaction if its ok to 3976 * @root - the root we're allocating for 3977 * @bytes - the number of bytes we want to reserve 3978 * @force - force the commit 3979 * 3980 * This will check to make sure that committing the transaction will actually 3981 * get us somewhere and then commit the transaction if it does. Otherwise it 3982 * will return -ENOSPC. 
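* Concretely (see the body below): unless 'force' is set, committing is
* only attempted when enough pinned space, possibly together with the
* delayed insertion reservation for the same space_info, would be
* returned to cover the requested bytes.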
3983 */ 3984 static int may_commit_transaction(struct btrfs_root *root, 3985 struct btrfs_space_info *space_info, 3986 u64 bytes, int force) 3987 { 3988 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 3989 struct btrfs_trans_handle *trans; 3990 3991 trans = (struct btrfs_trans_handle *)current->journal_info; 3992 if (trans) 3993 return -EAGAIN; 3994 3995 if (force) 3996 goto commit; 3997 3998 /* See if there is enough pinned space to make this reservation */ 3999 spin_lock(&space_info->lock); 4000 if (space_info->bytes_pinned >= bytes) { 4001 spin_unlock(&space_info->lock); 4002 goto commit; 4003 } 4004 spin_unlock(&space_info->lock); 4005 4006 /* 4007 * See if there is some space in the delayed insertion reservation for 4008 * this reservation. 4009 */ 4010 if (space_info != delayed_rsv->space_info) 4011 return -ENOSPC; 4012 4013 spin_lock(&space_info->lock); 4014 spin_lock(&delayed_rsv->lock); 4015 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4016 spin_unlock(&delayed_rsv->lock); 4017 spin_unlock(&space_info->lock); 4018 return -ENOSPC; 4019 } 4020 spin_unlock(&delayed_rsv->lock); 4021 spin_unlock(&space_info->lock); 4022 4023 commit: 4024 trans = btrfs_join_transaction(root); 4025 if (IS_ERR(trans)) 4026 return -ENOSPC; 4027 4028 return btrfs_commit_transaction(trans, root); 4029 } 4030 4031 enum flush_state { 4032 FLUSH_DELAYED_ITEMS_NR = 1, 4033 FLUSH_DELAYED_ITEMS = 2, 4034 FLUSH_DELALLOC = 3, 4035 FLUSH_DELALLOC_WAIT = 4, 4036 ALLOC_CHUNK = 5, 4037 COMMIT_TRANS = 6, 4038 }; 4039 4040 static int flush_space(struct btrfs_root *root, 4041 struct btrfs_space_info *space_info, u64 num_bytes, 4042 u64 orig_bytes, int state) 4043 { 4044 struct btrfs_trans_handle *trans; 4045 int nr; 4046 int ret = 0; 4047 4048 switch (state) { 4049 case FLUSH_DELAYED_ITEMS_NR: 4050 case FLUSH_DELAYED_ITEMS: 4051 if (state == FLUSH_DELAYED_ITEMS_NR) { 4052 u64 bytes = btrfs_calc_trans_metadata_size(root, 1); 4053 4054 nr = (int)div64_u64(num_bytes, bytes); 4055 if (!nr) 4056 nr = 1; 4057 nr *= 2; 4058 } else { 4059 nr = -1; 4060 } 4061 trans = btrfs_join_transaction(root); 4062 if (IS_ERR(trans)) { 4063 ret = PTR_ERR(trans); 4064 break; 4065 } 4066 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4067 btrfs_end_transaction(trans, root); 4068 break; 4069 case FLUSH_DELALLOC: 4070 case FLUSH_DELALLOC_WAIT: 4071 shrink_delalloc(root, num_bytes, orig_bytes, 4072 state == FLUSH_DELALLOC_WAIT); 4073 break; 4074 case ALLOC_CHUNK: 4075 trans = btrfs_join_transaction(root); 4076 if (IS_ERR(trans)) { 4077 ret = PTR_ERR(trans); 4078 break; 4079 } 4080 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4081 btrfs_get_alloc_profile(root, 0), 4082 CHUNK_ALLOC_NO_FORCE); 4083 btrfs_end_transaction(trans, root); 4084 if (ret == -ENOSPC) 4085 ret = 0; 4086 break; 4087 case COMMIT_TRANS: 4088 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4089 break; 4090 default: 4091 ret = -ENOSPC; 4092 break; 4093 } 4094 4095 return ret; 4096 } 4097 /** 4098 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4099 * @root - the root we're allocating for 4100 * @block_rsv - the block_rsv we're allocating for 4101 * @orig_bytes - the number of bytes we want 4102 * @flush - whether or not we can flush to make our reservation 4103 * 4104 * This will reserve orgi_bytes number of bytes from the space info associated 4105 * with the block_rsv. If there is not enough space it will make an attempt to 4106 * flush out space to make room. 
It will do this by flushing delalloc if 4107 * possible or committing the transaction. If flush is 0 then no attempts to 4108 * regain reservations will be made and this will fail if there is not enough 4109 * space already. 4110 */ 4111 static int reserve_metadata_bytes(struct btrfs_root *root, 4112 struct btrfs_block_rsv *block_rsv, 4113 u64 orig_bytes, 4114 enum btrfs_reserve_flush_enum flush) 4115 { 4116 struct btrfs_space_info *space_info = block_rsv->space_info; 4117 u64 used; 4118 u64 num_bytes = orig_bytes; 4119 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4120 int ret = 0; 4121 bool flushing = false; 4122 4123 again: 4124 ret = 0; 4125 spin_lock(&space_info->lock); 4126 /* 4127 * We only want to wait if somebody other than us is flushing and we 4128 * are actually allowed to flush all things. 4129 */ 4130 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4131 space_info->flush) { 4132 spin_unlock(&space_info->lock); 4133 /* 4134 * If we have a trans handle we can't wait because the flusher 4135 * may have to commit the transaction, which would mean we would 4136 * deadlock since we are waiting for the flusher to finish, but 4137 * hold the current transaction open. 4138 */ 4139 if (current->journal_info) 4140 return -EAGAIN; 4141 ret = wait_event_killable(space_info->wait, !space_info->flush); 4142 /* Must have been killed, return */ 4143 if (ret) 4144 return -EINTR; 4145 4146 spin_lock(&space_info->lock); 4147 } 4148 4149 ret = -ENOSPC; 4150 used = space_info->bytes_used + space_info->bytes_reserved + 4151 space_info->bytes_pinned + space_info->bytes_readonly + 4152 space_info->bytes_may_use; 4153 4154 /* 4155 * The idea here is that if we haven't already over-reserved the space 4156 * info then we can go ahead and save our reservation first and then 4157 * start flushing if we need to. Otherwise, if we've already 4158 * overcommitted, let's start flushing stuff first and then come back 4159 * and try to make our reservation. 4160 */ 4161 if (used <= space_info->total_bytes) { 4162 if (used + orig_bytes <= space_info->total_bytes) { 4163 space_info->bytes_may_use += orig_bytes; 4164 trace_btrfs_space_reservation(root->fs_info, 4165 "space_info", space_info->flags, orig_bytes, 1); 4166 ret = 0; 4167 } else { 4168 /* 4169 * OK, set num_bytes to orig_bytes since we aren't 4170 * overcommitted; this way we only try to reclaim what 4171 * we need. 4172 */ 4173 num_bytes = orig_bytes; 4174 } 4175 } else { 4176 /* 4177 * OK, we're overcommitted, set num_bytes to the overcommitted 4178 * amount plus the amount of bytes that we need for this 4179 * reservation. 4180 */ 4181 num_bytes = used - space_info->total_bytes + 4182 (orig_bytes * 2); 4183 } 4184 4185 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4186 space_info->bytes_may_use += orig_bytes; 4187 trace_btrfs_space_reservation(root->fs_info, "space_info", 4188 space_info->flags, orig_bytes, 4189 1); 4190 ret = 0; 4191 } 4192 4193 /* 4194 * Couldn't make our reservation, save our place so while we're trying 4195 * to reclaim space we can actually use it instead of somebody else 4196 * stealing it from us. 4197 * 4198 * We make the other tasks wait for the flush only when we can flush 4199 * all things.
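 *
 * (With BTRFS_RESERVE_NO_FLUSH we skip straight to the out label below;
 * a flushing caller walks flush_state from FLUSH_DELAYED_ITEMS_NR towards
 * COMMIT_TRANS, retrying the reservation after each step.)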
4200 */ 4201 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4202 flushing = true; 4203 space_info->flush = 1; 4204 } 4205 4206 spin_unlock(&space_info->lock); 4207 4208 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4209 goto out; 4210 4211 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4212 flush_state); 4213 flush_state++; 4214 4215 /* 4216 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4217 * would happen. So skip delalloc flush. 4218 */ 4219 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4220 (flush_state == FLUSH_DELALLOC || 4221 flush_state == FLUSH_DELALLOC_WAIT)) 4222 flush_state = ALLOC_CHUNK; 4223 4224 if (!ret) 4225 goto again; 4226 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4227 flush_state < COMMIT_TRANS) 4228 goto again; 4229 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4230 flush_state <= COMMIT_TRANS) 4231 goto again; 4232 4233 out: 4234 if (ret == -ENOSPC && 4235 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4236 struct btrfs_block_rsv *global_rsv = 4237 &root->fs_info->global_block_rsv; 4238 4239 if (block_rsv != global_rsv && 4240 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4241 ret = 0; 4242 } 4243 if (flushing) { 4244 spin_lock(&space_info->lock); 4245 space_info->flush = 0; 4246 wake_up_all(&space_info->wait); 4247 spin_unlock(&space_info->lock); 4248 } 4249 return ret; 4250 } 4251 4252 static struct btrfs_block_rsv *get_block_rsv( 4253 const struct btrfs_trans_handle *trans, 4254 const struct btrfs_root *root) 4255 { 4256 struct btrfs_block_rsv *block_rsv = NULL; 4257 4258 if (root->ref_cows) 4259 block_rsv = trans->block_rsv; 4260 4261 if (root == root->fs_info->csum_root && trans->adding_csums) 4262 block_rsv = trans->block_rsv; 4263 4264 if (!block_rsv) 4265 block_rsv = root->block_rsv; 4266 4267 if (!block_rsv) 4268 block_rsv = &root->fs_info->empty_block_rsv; 4269 4270 return block_rsv; 4271 } 4272 4273 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4274 u64 num_bytes) 4275 { 4276 int ret = -ENOSPC; 4277 spin_lock(&block_rsv->lock); 4278 if (block_rsv->reserved >= num_bytes) { 4279 block_rsv->reserved -= num_bytes; 4280 if (block_rsv->reserved < block_rsv->size) 4281 block_rsv->full = 0; 4282 ret = 0; 4283 } 4284 spin_unlock(&block_rsv->lock); 4285 return ret; 4286 } 4287 4288 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4289 u64 num_bytes, int update_size) 4290 { 4291 spin_lock(&block_rsv->lock); 4292 block_rsv->reserved += num_bytes; 4293 if (update_size) 4294 block_rsv->size += num_bytes; 4295 else if (block_rsv->reserved >= block_rsv->size) 4296 block_rsv->full = 1; 4297 spin_unlock(&block_rsv->lock); 4298 } 4299 4300 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4301 struct btrfs_block_rsv *block_rsv, 4302 struct btrfs_block_rsv *dest, u64 num_bytes) 4303 { 4304 struct btrfs_space_info *space_info = block_rsv->space_info; 4305 4306 spin_lock(&block_rsv->lock); 4307 if (num_bytes == (u64)-1) 4308 num_bytes = block_rsv->size; 4309 block_rsv->size -= num_bytes; 4310 if (block_rsv->reserved >= block_rsv->size) { 4311 num_bytes = block_rsv->reserved - block_rsv->size; 4312 block_rsv->reserved = block_rsv->size; 4313 block_rsv->full = 1; 4314 } else { 4315 num_bytes = 0; 4316 } 4317 spin_unlock(&block_rsv->lock); 4318 4319 if (num_bytes > 0) { 4320 if (dest) { 4321 spin_lock(&dest->lock); 4322 if (!dest->full) { 4323 u64 bytes_to_add; 4324 4325 bytes_to_add = dest->size - dest->reserved; 4326 bytes_to_add = min(num_bytes, bytes_to_add); 4327 
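/*
 * Top up the destination rsv no further than its own size; anything
 * left over is released back to the space_info (bytes_may_use is
 * decremented below).
 */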
dest->reserved += bytes_to_add; 4328 if (dest->reserved >= dest->size) 4329 dest->full = 1; 4330 num_bytes -= bytes_to_add; 4331 } 4332 spin_unlock(&dest->lock); 4333 } 4334 if (num_bytes) { 4335 spin_lock(&space_info->lock); 4336 space_info->bytes_may_use -= num_bytes; 4337 trace_btrfs_space_reservation(fs_info, "space_info", 4338 space_info->flags, num_bytes, 0); 4339 space_info->reservation_progress++; 4340 spin_unlock(&space_info->lock); 4341 } 4342 } 4343 } 4344 4345 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4346 struct btrfs_block_rsv *dst, u64 num_bytes) 4347 { 4348 int ret; 4349 4350 ret = block_rsv_use_bytes(src, num_bytes); 4351 if (ret) 4352 return ret; 4353 4354 block_rsv_add_bytes(dst, num_bytes, 1); 4355 return 0; 4356 } 4357 4358 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4359 { 4360 memset(rsv, 0, sizeof(*rsv)); 4361 spin_lock_init(&rsv->lock); 4362 rsv->type = type; 4363 } 4364 4365 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4366 unsigned short type) 4367 { 4368 struct btrfs_block_rsv *block_rsv; 4369 struct btrfs_fs_info *fs_info = root->fs_info; 4370 4371 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4372 if (!block_rsv) 4373 return NULL; 4374 4375 btrfs_init_block_rsv(block_rsv, type); 4376 block_rsv->space_info = __find_space_info(fs_info, 4377 BTRFS_BLOCK_GROUP_METADATA); 4378 return block_rsv; 4379 } 4380 4381 void btrfs_free_block_rsv(struct btrfs_root *root, 4382 struct btrfs_block_rsv *rsv) 4383 { 4384 if (!rsv) 4385 return; 4386 btrfs_block_rsv_release(root, rsv, (u64)-1); 4387 kfree(rsv); 4388 } 4389 4390 int btrfs_block_rsv_add(struct btrfs_root *root, 4391 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4392 enum btrfs_reserve_flush_enum flush) 4393 { 4394 int ret; 4395 4396 if (num_bytes == 0) 4397 return 0; 4398 4399 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4400 if (!ret) { 4401 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4402 return 0; 4403 } 4404 4405 return ret; 4406 } 4407 4408 int btrfs_block_rsv_check(struct btrfs_root *root, 4409 struct btrfs_block_rsv *block_rsv, int min_factor) 4410 { 4411 u64 num_bytes = 0; 4412 int ret = -ENOSPC; 4413 4414 if (!block_rsv) 4415 return 0; 4416 4417 spin_lock(&block_rsv->lock); 4418 num_bytes = div_factor(block_rsv->size, min_factor); 4419 if (block_rsv->reserved >= num_bytes) 4420 ret = 0; 4421 spin_unlock(&block_rsv->lock); 4422 4423 return ret; 4424 } 4425 4426 int btrfs_block_rsv_refill(struct btrfs_root *root, 4427 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4428 enum btrfs_reserve_flush_enum flush) 4429 { 4430 u64 num_bytes = 0; 4431 int ret = -ENOSPC; 4432 4433 if (!block_rsv) 4434 return 0; 4435 4436 spin_lock(&block_rsv->lock); 4437 num_bytes = min_reserved; 4438 if (block_rsv->reserved >= num_bytes) 4439 ret = 0; 4440 else 4441 num_bytes -= block_rsv->reserved; 4442 spin_unlock(&block_rsv->lock); 4443 4444 if (!ret) 4445 return 0; 4446 4447 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4448 if (!ret) { 4449 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4450 return 0; 4451 } 4452 4453 return ret; 4454 } 4455 4456 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4457 struct btrfs_block_rsv *dst_rsv, 4458 u64 num_bytes) 4459 { 4460 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4461 } 4462 4463 void btrfs_block_rsv_release(struct btrfs_root *root, 4464 struct btrfs_block_rsv *block_rsv, 4465 u64 num_bytes) 4466 { 4467 struct btrfs_block_rsv 
*global_rsv = &root->fs_info->global_block_rsv; 4468 if (global_rsv->full || global_rsv == block_rsv || 4469 block_rsv->space_info != global_rsv->space_info) 4470 global_rsv = NULL; 4471 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4472 num_bytes); 4473 } 4474 4475 /* 4476 * helper to calculate size of global block reservation. 4477 * the desired value is sum of space used by extent tree, 4478 * checksum tree and root tree 4479 */ 4480 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4481 { 4482 struct btrfs_space_info *sinfo; 4483 u64 num_bytes; 4484 u64 meta_used; 4485 u64 data_used; 4486 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4487 4488 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4489 spin_lock(&sinfo->lock); 4490 data_used = sinfo->bytes_used; 4491 spin_unlock(&sinfo->lock); 4492 4493 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4494 spin_lock(&sinfo->lock); 4495 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4496 data_used = 0; 4497 meta_used = sinfo->bytes_used; 4498 spin_unlock(&sinfo->lock); 4499 4500 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4501 csum_size * 2; 4502 num_bytes += div64_u64(data_used + meta_used, 50); 4503 4504 if (num_bytes * 3 > meta_used) 4505 num_bytes = div64_u64(meta_used, 3); 4506 4507 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4508 } 4509 4510 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4511 { 4512 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4513 struct btrfs_space_info *sinfo = block_rsv->space_info; 4514 u64 num_bytes; 4515 4516 num_bytes = calc_global_metadata_size(fs_info); 4517 4518 spin_lock(&sinfo->lock); 4519 spin_lock(&block_rsv->lock); 4520 4521 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4522 4523 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4524 sinfo->bytes_reserved + sinfo->bytes_readonly + 4525 sinfo->bytes_may_use; 4526 4527 if (sinfo->total_bytes > num_bytes) { 4528 num_bytes = sinfo->total_bytes - num_bytes; 4529 block_rsv->reserved += num_bytes; 4530 sinfo->bytes_may_use += num_bytes; 4531 trace_btrfs_space_reservation(fs_info, "space_info", 4532 sinfo->flags, num_bytes, 1); 4533 } 4534 4535 if (block_rsv->reserved >= block_rsv->size) { 4536 num_bytes = block_rsv->reserved - block_rsv->size; 4537 sinfo->bytes_may_use -= num_bytes; 4538 trace_btrfs_space_reservation(fs_info, "space_info", 4539 sinfo->flags, num_bytes, 0); 4540 sinfo->reservation_progress++; 4541 block_rsv->reserved = block_rsv->size; 4542 block_rsv->full = 1; 4543 } 4544 4545 spin_unlock(&block_rsv->lock); 4546 spin_unlock(&sinfo->lock); 4547 } 4548 4549 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4550 { 4551 struct btrfs_space_info *space_info; 4552 4553 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4554 fs_info->chunk_block_rsv.space_info = space_info; 4555 4556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4557 fs_info->global_block_rsv.space_info = space_info; 4558 fs_info->delalloc_block_rsv.space_info = space_info; 4559 fs_info->trans_block_rsv.space_info = space_info; 4560 fs_info->empty_block_rsv.space_info = space_info; 4561 fs_info->delayed_block_rsv.space_info = space_info; 4562 4563 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4564 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4565 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4566 fs_info->tree_root->block_rsv = 
&fs_info->global_block_rsv; 4567 if (fs_info->quota_root) 4568 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 4569 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4570 4571 update_global_block_rsv(fs_info); 4572 } 4573 4574 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4575 { 4576 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4577 (u64)-1); 4578 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4579 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4580 WARN_ON(fs_info->trans_block_rsv.size > 0); 4581 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4582 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4583 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4584 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4585 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4586 } 4587 4588 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4589 struct btrfs_root *root) 4590 { 4591 if (!trans->block_rsv) 4592 return; 4593 4594 if (!trans->bytes_reserved) 4595 return; 4596 4597 trace_btrfs_space_reservation(root->fs_info, "transaction", 4598 trans->transid, trans->bytes_reserved, 0); 4599 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4600 trans->bytes_reserved = 0; 4601 } 4602 4603 /* Can only return 0 or -ENOSPC */ 4604 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4605 struct inode *inode) 4606 { 4607 struct btrfs_root *root = BTRFS_I(inode)->root; 4608 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4609 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4610 4611 /* 4612 * We need to hold space in order to delete our orphan item once we've 4613 * added it, so this takes the reservation so we can release it later 4614 * when we are truly done with the orphan item. 4615 */ 4616 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4617 trace_btrfs_space_reservation(root->fs_info, "orphan", 4618 btrfs_ino(inode), num_bytes, 1); 4619 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4620 } 4621 4622 void btrfs_orphan_release_metadata(struct inode *inode) 4623 { 4624 struct btrfs_root *root = BTRFS_I(inode)->root; 4625 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4626 trace_btrfs_space_reservation(root->fs_info, "orphan", 4627 btrfs_ino(inode), num_bytes, 0); 4628 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4629 } 4630 4631 /* 4632 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4633 * root: the root of the parent directory 4634 * rsv: block reservation 4635 * items: the number of items that we need to reserve for 4636 * qgroup_reserved: used to return the reserved size in qgroup 4637 * 4638 * This function is used to reserve the space for snapshot/subvolume 4639 * creation and deletion. Those operations are different from the 4640 * common file/directory operations: they change two fs/file trees 4641 * and the root tree, so the number of items that the qgroup reserves is 4642 * different from the free space reservation. So we cannot use 4643 * the space reservation mechanism in start_transaction().
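 *
 * (The caller passes @items as the number of tree items the operation will
 * touch; the qgroup side below reserves 3 * leafsize, one leaf for the
 * parent inode and two for the dir entries, independently of @items.)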
4644 */ 4645 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4646 struct btrfs_block_rsv *rsv, 4647 int items, 4648 u64 *qgroup_reserved) 4649 { 4650 u64 num_bytes; 4651 int ret; 4652 4653 if (root->fs_info->quota_enabled) { 4654 /* One for parent inode, two for dir entries */ 4655 num_bytes = 3 * root->leafsize; 4656 ret = btrfs_qgroup_reserve(root, num_bytes); 4657 if (ret) 4658 return ret; 4659 } else { 4660 num_bytes = 0; 4661 } 4662 4663 *qgroup_reserved = num_bytes; 4664 4665 num_bytes = btrfs_calc_trans_metadata_size(root, items); 4666 rsv->space_info = __find_space_info(root->fs_info, 4667 BTRFS_BLOCK_GROUP_METADATA); 4668 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 4669 BTRFS_RESERVE_FLUSH_ALL); 4670 if (ret) { 4671 if (*qgroup_reserved) 4672 btrfs_qgroup_free(root, *qgroup_reserved); 4673 } 4674 4675 return ret; 4676 } 4677 4678 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 4679 struct btrfs_block_rsv *rsv, 4680 u64 qgroup_reserved) 4681 { 4682 btrfs_block_rsv_release(root, rsv, (u64)-1); 4683 if (qgroup_reserved) 4684 btrfs_qgroup_free(root, qgroup_reserved); 4685 } 4686 4687 /** 4688 * drop_outstanding_extent - drop an outstanding extent 4689 * @inode: the inode we're dropping the extent for 4690 * 4691 * This is called when we are freeing up an outstanding extent, either 4692 * after an error or after an extent is written. This will return the number of 4693 * reserved extents that need to be freed. This must be called with 4694 * BTRFS_I(inode)->lock held. 4695 */ 4696 static unsigned drop_outstanding_extent(struct inode *inode) 4697 { 4698 unsigned drop_inode_space = 0; 4699 unsigned dropped_extents = 0; 4700 4701 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4702 BTRFS_I(inode)->outstanding_extents--; 4703 4704 if (BTRFS_I(inode)->outstanding_extents == 0 && 4705 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4706 &BTRFS_I(inode)->runtime_flags)) 4707 drop_inode_space = 1; 4708 4709 /* 4710 * If we have the same amount of outstanding extents as we have 4711 * reserved, or more, then we need to leave the reserved extents count alone. 4712 */ 4713 if (BTRFS_I(inode)->outstanding_extents >= 4714 BTRFS_I(inode)->reserved_extents) 4715 return drop_inode_space; 4716 4717 dropped_extents = BTRFS_I(inode)->reserved_extents - 4718 BTRFS_I(inode)->outstanding_extents; 4719 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4720 return dropped_extents + drop_inode_space; 4721 } 4722 4723 /** 4724 * calc_csum_metadata_size - return the amount of metadata space that must be 4725 * reserved/freed for the given bytes. 4726 * @inode: the inode we're manipulating 4727 * @num_bytes: the number of bytes in question 4728 * @reserve: 1 if we are reserving space, 0 if we are freeing space 4729 * 4730 * This adjusts the number of csum_bytes in the inode and then returns the 4731 * correct amount of metadata that must either be reserved or freed. We 4732 * calculate how many checksums we can fit into one leaf and then divide the 4733 * number of bytes that will need to be checksummed by this value to figure out 4734 * how many checksums will be required. If we are adding bytes then the number 4735 * may go up and we will return the number of additional bytes that must be 4736 * reserved. If it is going down we will return the number of bytes that must 4737 * be freed. 4738 * 4739 * This must be called with BTRFS_I(inode)->lock held.
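 *
 * In other words, csum_bytes is converted into a number of csum leaves both
 * before and after the adjustment; only when that leaf count changes do we
 * reserve or free btrfs_calc_trans_metadata_size(root, delta) worth of space.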
4740 */ 4741 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 4742 int reserve) 4743 { 4744 struct btrfs_root *root = BTRFS_I(inode)->root; 4745 u64 csum_size; 4746 int num_csums_per_leaf; 4747 int num_csums; 4748 int old_csums; 4749 4750 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 4751 BTRFS_I(inode)->csum_bytes == 0) 4752 return 0; 4753 4754 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4755 if (reserve) 4756 BTRFS_I(inode)->csum_bytes += num_bytes; 4757 else 4758 BTRFS_I(inode)->csum_bytes -= num_bytes; 4759 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 4760 num_csums_per_leaf = (int)div64_u64(csum_size, 4761 sizeof(struct btrfs_csum_item) + 4762 sizeof(struct btrfs_disk_key)); 4763 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 4764 num_csums = num_csums + num_csums_per_leaf - 1; 4765 num_csums = num_csums / num_csums_per_leaf; 4766 4767 old_csums = old_csums + num_csums_per_leaf - 1; 4768 old_csums = old_csums / num_csums_per_leaf; 4769 4770 /* No change, no need to reserve more */ 4771 if (old_csums == num_csums) 4772 return 0; 4773 4774 if (reserve) 4775 return btrfs_calc_trans_metadata_size(root, 4776 num_csums - old_csums); 4777 4778 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 4779 } 4780 4781 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4782 { 4783 struct btrfs_root *root = BTRFS_I(inode)->root; 4784 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4785 u64 to_reserve = 0; 4786 u64 csum_bytes; 4787 unsigned nr_extents = 0; 4788 int extra_reserve = 0; 4789 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4790 int ret = 0; 4791 bool delalloc_lock = true; 4792 u64 to_free = 0; 4793 unsigned dropped; 4794 4795 /* If we are a free space inode we need to not flush since we will be in 4796 * the middle of a transaction commit. We also don't need the delalloc 4797 * mutex since we won't race with anybody. We need this mostly to make 4798 * lockdep shut its filthy mouth. 4799 */ 4800 if (btrfs_is_free_space_inode(inode)) { 4801 flush = BTRFS_RESERVE_NO_FLUSH; 4802 delalloc_lock = false; 4803 } 4804 4805 if (flush != BTRFS_RESERVE_NO_FLUSH && 4806 btrfs_transaction_in_commit(root->fs_info)) 4807 schedule_timeout(1); 4808 4809 if (delalloc_lock) 4810 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4811 4812 num_bytes = ALIGN(num_bytes, root->sectorsize); 4813 4814 spin_lock(&BTRFS_I(inode)->lock); 4815 BTRFS_I(inode)->outstanding_extents++; 4816 4817 if (BTRFS_I(inode)->outstanding_extents > 4818 BTRFS_I(inode)->reserved_extents) 4819 nr_extents = BTRFS_I(inode)->outstanding_extents - 4820 BTRFS_I(inode)->reserved_extents; 4821 4822 /* 4823 * Add an item to reserve for updating the inode when we complete the 4824 * delalloc io. 
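 * The BTRFS_INODE_DELALLOC_META_RESERVED runtime flag records that this
 * extra item is already accounted for, so it is only added once per batch
 * of outstanding delalloc.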
4825 */ 4826 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4827 &BTRFS_I(inode)->runtime_flags)) { 4828 nr_extents++; 4829 extra_reserve = 1; 4830 } 4831 4832 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4833 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 4834 csum_bytes = BTRFS_I(inode)->csum_bytes; 4835 spin_unlock(&BTRFS_I(inode)->lock); 4836 4837 if (root->fs_info->quota_enabled) { 4838 ret = btrfs_qgroup_reserve(root, num_bytes + 4839 nr_extents * root->leafsize); 4840 if (ret) 4841 goto out_fail; 4842 } 4843 4844 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 4845 if (unlikely(ret)) { 4846 if (root->fs_info->quota_enabled) 4847 btrfs_qgroup_free(root, num_bytes + 4848 nr_extents * root->leafsize); 4849 goto out_fail; 4850 } 4851 4852 spin_lock(&BTRFS_I(inode)->lock); 4853 if (extra_reserve) { 4854 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4855 &BTRFS_I(inode)->runtime_flags); 4856 nr_extents--; 4857 } 4858 BTRFS_I(inode)->reserved_extents += nr_extents; 4859 spin_unlock(&BTRFS_I(inode)->lock); 4860 4861 if (delalloc_lock) 4862 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4863 4864 if (to_reserve) 4865 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4866 btrfs_ino(inode), to_reserve, 1); 4867 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4868 4869 return 0; 4870 4871 out_fail: 4872 spin_lock(&BTRFS_I(inode)->lock); 4873 dropped = drop_outstanding_extent(inode); 4874 /* 4875 * If the inode's csum_bytes is the same as the original 4876 * csum_bytes then we know we haven't raced with any free()ers, 4877 * so we can just reduce our inode's csum bytes and carry on. 4878 */ 4879 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 4880 calc_csum_metadata_size(inode, num_bytes, 0); 4881 } else { 4882 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 4883 u64 bytes; 4884 4885 /* 4886 * This is tricky, but first we need to figure out how much was 4887 * freed by any free-ers that ran during this 4888 * reservation, so we reset ->csum_bytes to the csum_bytes 4889 * before we dropped our lock, and then call the free for the 4890 * number of bytes that were freed while we were trying our 4891 * reservation. 4892 */ 4893 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 4894 BTRFS_I(inode)->csum_bytes = csum_bytes; 4895 to_free = calc_csum_metadata_size(inode, bytes, 0); 4896 4897 4898 /* 4899 * Now we need to see how much we would have freed had we not 4900 * been making this reservation and our ->csum_bytes were not 4901 * artificially inflated. 4902 */ 4903 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 4904 bytes = csum_bytes - orig_csum_bytes; 4905 bytes = calc_csum_metadata_size(inode, bytes, 0); 4906 4907 /* 4908 * Now reset ->csum_bytes to what it should be. If bytes is 4909 * more than to_free then we would have freed more space had we 4910 * not had an artificially high ->csum_bytes, so we need to free 4911 * the remainder. If bytes is the same or less then we don't 4912 * need to do anything, the other free-ers did the correct 4913 * thing.
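 *
 * Put differently: we work out what the concurrent free-ers actually
 * released while ->csum_bytes was inflated by our attempt, what they would
 * have released without it, and release any shortfall ourselves.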
4914 */ 4915 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 4916 if (bytes > to_free) 4917 to_free = bytes - to_free; 4918 else 4919 to_free = 0; 4920 } 4921 spin_unlock(&BTRFS_I(inode)->lock); 4922 if (dropped) 4923 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4924 4925 if (to_free) { 4926 btrfs_block_rsv_release(root, block_rsv, to_free); 4927 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4928 btrfs_ino(inode), to_free, 0); 4929 } 4930 if (delalloc_lock) 4931 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4932 return ret; 4933 } 4934 4935 /** 4936 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 4937 * @inode: the inode to release the reservation for 4938 * @num_bytes: the number of bytes we're releasing 4939 * 4940 * This will release the metadata reservation for an inode. This can be called 4941 * once we complete IO for a given set of bytes to release their metadata 4942 * reservations. 4943 */ 4944 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4945 { 4946 struct btrfs_root *root = BTRFS_I(inode)->root; 4947 u64 to_free = 0; 4948 unsigned dropped; 4949 4950 num_bytes = ALIGN(num_bytes, root->sectorsize); 4951 spin_lock(&BTRFS_I(inode)->lock); 4952 dropped = drop_outstanding_extent(inode); 4953 4954 if (num_bytes) 4955 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4956 spin_unlock(&BTRFS_I(inode)->lock); 4957 if (dropped > 0) 4958 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4959 4960 trace_btrfs_space_reservation(root->fs_info, "delalloc", 4961 btrfs_ino(inode), to_free, 0); 4962 if (root->fs_info->quota_enabled) { 4963 btrfs_qgroup_free(root, num_bytes + 4964 dropped * root->leafsize); 4965 } 4966 4967 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4968 to_free); 4969 } 4970 4971 /** 4972 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 4973 * @inode: inode we're writing to 4974 * @num_bytes: the number of bytes we want to allocate 4975 * 4976 * This will do the following things 4977 * 4978 * o reserve space in the data space info for num_bytes 4979 * o reserve space in the metadata space info based on number of outstanding 4980 * extents and how much csums will be needed 4981 * o add to the inodes ->delalloc_bytes 4982 * o add it to the fs_info's delalloc inodes list. 4983 * 4984 * This will return 0 for success and -ENOSPC if there is no space left. 4985 */ 4986 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4987 { 4988 int ret; 4989 4990 ret = btrfs_check_data_free_space(inode, num_bytes); 4991 if (ret) 4992 return ret; 4993 4994 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 4995 if (ret) { 4996 btrfs_free_reserved_data_space(inode, num_bytes); 4997 return ret; 4998 } 4999 5000 return 0; 5001 } 5002 5003 /** 5004 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5005 * @inode: inode we're releasing space for 5006 * @num_bytes: the number of bytes we want to free up 5007 * 5008 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5009 * called in the case that we don't need the metadata AND data reservations 5010 * anymore. So if there is an error or we insert an inline extent. 5011 * 5012 * This function will release the metadata space that was not used and will 5013 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5014 * list if there are no delalloc bytes left. 
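 *
 * (It is simply btrfs_delalloc_release_metadata() followed by
 * btrfs_free_reserved_data_space(), mirroring btrfs_delalloc_reserve_space().)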
5015 */ 5016 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5017 { 5018 btrfs_delalloc_release_metadata(inode, num_bytes); 5019 btrfs_free_reserved_data_space(inode, num_bytes); 5020 } 5021 5022 static int update_block_group(struct btrfs_root *root, 5023 u64 bytenr, u64 num_bytes, int alloc) 5024 { 5025 struct btrfs_block_group_cache *cache = NULL; 5026 struct btrfs_fs_info *info = root->fs_info; 5027 u64 total = num_bytes; 5028 u64 old_val; 5029 u64 byte_in_group; 5030 int factor; 5031 5032 /* block accounting for super block */ 5033 spin_lock(&info->delalloc_lock); 5034 old_val = btrfs_super_bytes_used(info->super_copy); 5035 if (alloc) 5036 old_val += num_bytes; 5037 else 5038 old_val -= num_bytes; 5039 btrfs_set_super_bytes_used(info->super_copy, old_val); 5040 spin_unlock(&info->delalloc_lock); 5041 5042 while (total) { 5043 cache = btrfs_lookup_block_group(info, bytenr); 5044 if (!cache) 5045 return -ENOENT; 5046 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5047 BTRFS_BLOCK_GROUP_RAID1 | 5048 BTRFS_BLOCK_GROUP_RAID10)) 5049 factor = 2; 5050 else 5051 factor = 1; 5052 /* 5053 * If this block group has free space cache written out, we 5054 * need to make sure to load it if we are removing space. This 5055 * is because we need the unpinning stage to actually add the 5056 * space back to the block group, otherwise we will leak space. 5057 */ 5058 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5059 cache_block_group(cache, 1); 5060 5061 byte_in_group = bytenr - cache->key.objectid; 5062 WARN_ON(byte_in_group > cache->key.offset); 5063 5064 spin_lock(&cache->space_info->lock); 5065 spin_lock(&cache->lock); 5066 5067 if (btrfs_test_opt(root, SPACE_CACHE) && 5068 cache->disk_cache_state < BTRFS_DC_CLEAR) 5069 cache->disk_cache_state = BTRFS_DC_CLEAR; 5070 5071 cache->dirty = 1; 5072 old_val = btrfs_block_group_used(&cache->item); 5073 num_bytes = min(total, cache->key.offset - byte_in_group); 5074 if (alloc) { 5075 old_val += num_bytes; 5076 btrfs_set_block_group_used(&cache->item, old_val); 5077 cache->reserved -= num_bytes; 5078 cache->space_info->bytes_reserved -= num_bytes; 5079 cache->space_info->bytes_used += num_bytes; 5080 cache->space_info->disk_used += num_bytes * factor; 5081 spin_unlock(&cache->lock); 5082 spin_unlock(&cache->space_info->lock); 5083 } else { 5084 old_val -= num_bytes; 5085 btrfs_set_block_group_used(&cache->item, old_val); 5086 cache->pinned += num_bytes; 5087 cache->space_info->bytes_pinned += num_bytes; 5088 cache->space_info->bytes_used -= num_bytes; 5089 cache->space_info->disk_used -= num_bytes * factor; 5090 spin_unlock(&cache->lock); 5091 spin_unlock(&cache->space_info->lock); 5092 5093 set_extent_dirty(info->pinned_extents, 5094 bytenr, bytenr + num_bytes - 1, 5095 GFP_NOFS | __GFP_NOFAIL); 5096 } 5097 btrfs_put_block_group(cache); 5098 total -= num_bytes; 5099 bytenr += num_bytes; 5100 } 5101 return 0; 5102 } 5103 5104 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5105 { 5106 struct btrfs_block_group_cache *cache; 5107 u64 bytenr; 5108 5109 spin_lock(&root->fs_info->block_group_cache_lock); 5110 bytenr = root->fs_info->first_logical_byte; 5111 spin_unlock(&root->fs_info->block_group_cache_lock); 5112 5113 if (bytenr < (u64)-1) 5114 return bytenr; 5115 5116 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5117 if (!cache) 5118 return 0; 5119 5120 bytenr = cache->key.objectid; 5121 btrfs_put_block_group(cache); 5122 5123 return bytenr; 5124 } 5125 5126 static int pin_down_extent(struct 
btrfs_root *root, 5127 struct btrfs_block_group_cache *cache, 5128 u64 bytenr, u64 num_bytes, int reserved) 5129 { 5130 spin_lock(&cache->space_info->lock); 5131 spin_lock(&cache->lock); 5132 cache->pinned += num_bytes; 5133 cache->space_info->bytes_pinned += num_bytes; 5134 if (reserved) { 5135 cache->reserved -= num_bytes; 5136 cache->space_info->bytes_reserved -= num_bytes; 5137 } 5138 spin_unlock(&cache->lock); 5139 spin_unlock(&cache->space_info->lock); 5140 5141 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5142 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5143 return 0; 5144 } 5145 5146 /* 5147 * this function must be called within transaction 5148 */ 5149 int btrfs_pin_extent(struct btrfs_root *root, 5150 u64 bytenr, u64 num_bytes, int reserved) 5151 { 5152 struct btrfs_block_group_cache *cache; 5153 5154 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5155 BUG_ON(!cache); /* Logic error */ 5156 5157 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5158 5159 btrfs_put_block_group(cache); 5160 return 0; 5161 } 5162 5163 /* 5164 * this function must be called within transaction 5165 */ 5166 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5167 u64 bytenr, u64 num_bytes) 5168 { 5169 struct btrfs_block_group_cache *cache; 5170 int ret; 5171 5172 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5173 if (!cache) 5174 return -EINVAL; 5175 5176 /* 5177 * pull in the free space cache (if any) so that our pin 5178 * removes the free space from the cache. We have load_only set 5179 * to one because the slow code to read in the free extents does check 5180 * the pinned extents. 5181 */ 5182 cache_block_group(cache, 1); 5183 5184 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5185 5186 /* remove us from the free space cache (if we're there at all) */ 5187 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5188 btrfs_put_block_group(cache); 5189 return ret; 5190 } 5191 5192 /** 5193 * btrfs_update_reserved_bytes - update the block_group and space info counters 5194 * @cache: The cache we are manipulating 5195 * @num_bytes: The number of bytes in question 5196 * @reserve: One of the reservation enums 5197 * 5198 * This is called by the allocator when it reserves space, or by somebody who is 5199 * freeing space that was never actually used on disk. For example if you 5200 * reserve some space for a new leaf in transaction A and before transaction A 5201 * commits you free that leaf, you call this with reserve set to 0 in order to 5202 * clear the reservation. 5203 * 5204 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5205 * ENOSPC accounting. For data we handle the reservation through clearing the 5206 * delalloc bits in the io_tree. We have to do this since we could end up 5207 * allocating less disk space for the amount of data we have reserved in the 5208 * case of compression. 5209 * 5210 * If this is a reservation and the block group has become read only we cannot 5211 * make the reservation and return -EAGAIN, otherwise this function always 5212 * succeeds. 
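 *
 * Roughly, RESERVE_ALLOC moves num_bytes from bytes_may_use into
 * bytes_reserved, RESERVE_ALLOC_NO_ACCOUNT only bumps bytes_reserved, and
 * RESERVE_FREE drops bytes_reserved again (and credits bytes_readonly if
 * the block group has gone read-only).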
5213 */ 5214 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5215 u64 num_bytes, int reserve) 5216 { 5217 struct btrfs_space_info *space_info = cache->space_info; 5218 int ret = 0; 5219 5220 spin_lock(&space_info->lock); 5221 spin_lock(&cache->lock); 5222 if (reserve != RESERVE_FREE) { 5223 if (cache->ro) { 5224 ret = -EAGAIN; 5225 } else { 5226 cache->reserved += num_bytes; 5227 space_info->bytes_reserved += num_bytes; 5228 if (reserve == RESERVE_ALLOC) { 5229 trace_btrfs_space_reservation(cache->fs_info, 5230 "space_info", space_info->flags, 5231 num_bytes, 0); 5232 space_info->bytes_may_use -= num_bytes; 5233 } 5234 } 5235 } else { 5236 if (cache->ro) 5237 space_info->bytes_readonly += num_bytes; 5238 cache->reserved -= num_bytes; 5239 space_info->bytes_reserved -= num_bytes; 5240 space_info->reservation_progress++; 5241 } 5242 spin_unlock(&cache->lock); 5243 spin_unlock(&space_info->lock); 5244 return ret; 5245 } 5246 5247 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5248 struct btrfs_root *root) 5249 { 5250 struct btrfs_fs_info *fs_info = root->fs_info; 5251 struct btrfs_caching_control *next; 5252 struct btrfs_caching_control *caching_ctl; 5253 struct btrfs_block_group_cache *cache; 5254 5255 down_write(&fs_info->extent_commit_sem); 5256 5257 list_for_each_entry_safe(caching_ctl, next, 5258 &fs_info->caching_block_groups, list) { 5259 cache = caching_ctl->block_group; 5260 if (block_group_cache_done(cache)) { 5261 cache->last_byte_to_unpin = (u64)-1; 5262 list_del_init(&caching_ctl->list); 5263 put_caching_control(caching_ctl); 5264 } else { 5265 cache->last_byte_to_unpin = caching_ctl->progress; 5266 } 5267 } 5268 5269 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5270 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5271 else 5272 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5273 5274 up_write(&fs_info->extent_commit_sem); 5275 5276 update_global_block_rsv(fs_info); 5277 } 5278 5279 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5280 { 5281 struct btrfs_fs_info *fs_info = root->fs_info; 5282 struct btrfs_block_group_cache *cache = NULL; 5283 struct btrfs_space_info *space_info; 5284 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5285 u64 len; 5286 bool readonly; 5287 5288 while (start <= end) { 5289 readonly = false; 5290 if (!cache || 5291 start >= cache->key.objectid + cache->key.offset) { 5292 if (cache) 5293 btrfs_put_block_group(cache); 5294 cache = btrfs_lookup_block_group(fs_info, start); 5295 BUG_ON(!cache); /* Logic error */ 5296 } 5297 5298 len = cache->key.objectid + cache->key.offset - start; 5299 len = min(len, end + 1 - start); 5300 5301 if (start < cache->last_byte_to_unpin) { 5302 len = min(len, cache->last_byte_to_unpin - start); 5303 btrfs_add_free_space(cache, start, len); 5304 } 5305 5306 start += len; 5307 space_info = cache->space_info; 5308 5309 spin_lock(&space_info->lock); 5310 spin_lock(&cache->lock); 5311 cache->pinned -= len; 5312 space_info->bytes_pinned -= len; 5313 if (cache->ro) { 5314 space_info->bytes_readonly += len; 5315 readonly = true; 5316 } 5317 spin_unlock(&cache->lock); 5318 if (!readonly && global_rsv->space_info == space_info) { 5319 spin_lock(&global_rsv->lock); 5320 if (!global_rsv->full) { 5321 len = min(len, global_rsv->size - 5322 global_rsv->reserved); 5323 global_rsv->reserved += len; 5324 space_info->bytes_may_use += len; 5325 if (global_rsv->reserved >= global_rsv->size) 5326 global_rsv->full = 1; 5327 } 
5328 spin_unlock(&global_rsv->lock); 5329 } 5330 spin_unlock(&space_info->lock); 5331 } 5332 5333 if (cache) 5334 btrfs_put_block_group(cache); 5335 return 0; 5336 } 5337 5338 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5339 struct btrfs_root *root) 5340 { 5341 struct btrfs_fs_info *fs_info = root->fs_info; 5342 struct extent_io_tree *unpin; 5343 u64 start; 5344 u64 end; 5345 int ret; 5346 5347 if (trans->aborted) 5348 return 0; 5349 5350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5351 unpin = &fs_info->freed_extents[1]; 5352 else 5353 unpin = &fs_info->freed_extents[0]; 5354 5355 while (1) { 5356 ret = find_first_extent_bit(unpin, 0, &start, &end, 5357 EXTENT_DIRTY, NULL); 5358 if (ret) 5359 break; 5360 5361 if (btrfs_test_opt(root, DISCARD)) 5362 ret = btrfs_discard_extent(root, start, 5363 end + 1 - start, NULL); 5364 5365 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5366 unpin_extent_range(root, start, end); 5367 cond_resched(); 5368 } 5369 5370 return 0; 5371 } 5372 5373 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5374 struct btrfs_root *root, 5375 u64 bytenr, u64 num_bytes, u64 parent, 5376 u64 root_objectid, u64 owner_objectid, 5377 u64 owner_offset, int refs_to_drop, 5378 struct btrfs_delayed_extent_op *extent_op) 5379 { 5380 struct btrfs_key key; 5381 struct btrfs_path *path; 5382 struct btrfs_fs_info *info = root->fs_info; 5383 struct btrfs_root *extent_root = info->extent_root; 5384 struct extent_buffer *leaf; 5385 struct btrfs_extent_item *ei; 5386 struct btrfs_extent_inline_ref *iref; 5387 int ret; 5388 int is_data; 5389 int extent_slot = 0; 5390 int found_extent = 0; 5391 int num_to_del = 1; 5392 u32 item_size; 5393 u64 refs; 5394 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5395 SKINNY_METADATA); 5396 5397 path = btrfs_alloc_path(); 5398 if (!path) 5399 return -ENOMEM; 5400 5401 path->reada = 1; 5402 path->leave_spinning = 1; 5403 5404 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 5405 BUG_ON(!is_data && refs_to_drop != 1); 5406 5407 if (is_data) 5408 skinny_metadata = 0; 5409 5410 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5411 bytenr, num_bytes, parent, 5412 root_objectid, owner_objectid, 5413 owner_offset); 5414 if (ret == 0) { 5415 extent_slot = path->slots[0]; 5416 while (extent_slot >= 0) { 5417 btrfs_item_key_to_cpu(path->nodes[0], &key, 5418 extent_slot); 5419 if (key.objectid != bytenr) 5420 break; 5421 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5422 key.offset == num_bytes) { 5423 found_extent = 1; 5424 break; 5425 } 5426 if (key.type == BTRFS_METADATA_ITEM_KEY && 5427 key.offset == owner_objectid) { 5428 found_extent = 1; 5429 break; 5430 } 5431 if (path->slots[0] - extent_slot > 5) 5432 break; 5433 extent_slot--; 5434 } 5435 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5436 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5437 if (found_extent && item_size < sizeof(*ei)) 5438 found_extent = 0; 5439 #endif 5440 if (!found_extent) { 5441 BUG_ON(iref); 5442 ret = remove_extent_backref(trans, extent_root, path, 5443 NULL, refs_to_drop, 5444 is_data); 5445 if (ret) { 5446 btrfs_abort_transaction(trans, extent_root, ret); 5447 goto out; 5448 } 5449 btrfs_release_path(path); 5450 path->leave_spinning = 1; 5451 5452 key.objectid = bytenr; 5453 key.type = BTRFS_EXTENT_ITEM_KEY; 5454 key.offset = num_bytes; 5455 5456 if (!is_data && skinny_metadata) { 5457 key.type = BTRFS_METADATA_ITEM_KEY; 5458 key.offset = owner_objectid; 5459 } 5460 5461 ret = btrfs_search_slot(trans, 
extent_root, 5462 &key, path, -1, 1); 5463 if (ret > 0 && skinny_metadata && path->slots[0]) { 5464 /* 5465 * Couldn't find our skinny metadata item, 5466 * see if we have ye olde extent item. 5467 */ 5468 path->slots[0]--; 5469 btrfs_item_key_to_cpu(path->nodes[0], &key, 5470 path->slots[0]); 5471 if (key.objectid == bytenr && 5472 key.type == BTRFS_EXTENT_ITEM_KEY && 5473 key.offset == num_bytes) 5474 ret = 0; 5475 } 5476 5477 if (ret > 0 && skinny_metadata) { 5478 skinny_metadata = false; 5479 key.type = BTRFS_EXTENT_ITEM_KEY; 5480 key.offset = num_bytes; 5481 btrfs_release_path(path); 5482 ret = btrfs_search_slot(trans, extent_root, 5483 &key, path, -1, 1); 5484 } 5485 5486 if (ret) { 5487 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5488 ret, (unsigned long long)bytenr); 5489 if (ret > 0) 5490 btrfs_print_leaf(extent_root, 5491 path->nodes[0]); 5492 } 5493 if (ret < 0) { 5494 btrfs_abort_transaction(trans, extent_root, ret); 5495 goto out; 5496 } 5497 extent_slot = path->slots[0]; 5498 } 5499 } else if (ret == -ENOENT) { 5500 btrfs_print_leaf(extent_root, path->nodes[0]); 5501 WARN_ON(1); 5502 btrfs_err(info, 5503 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5504 (unsigned long long)bytenr, 5505 (unsigned long long)parent, 5506 (unsigned long long)root_objectid, 5507 (unsigned long long)owner_objectid, 5508 (unsigned long long)owner_offset); 5509 } else { 5510 btrfs_abort_transaction(trans, extent_root, ret); 5511 goto out; 5512 } 5513 5514 leaf = path->nodes[0]; 5515 item_size = btrfs_item_size_nr(leaf, extent_slot); 5516 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5517 if (item_size < sizeof(*ei)) { 5518 BUG_ON(found_extent || extent_slot != path->slots[0]); 5519 ret = convert_extent_item_v0(trans, extent_root, path, 5520 owner_objectid, 0); 5521 if (ret < 0) { 5522 btrfs_abort_transaction(trans, extent_root, ret); 5523 goto out; 5524 } 5525 5526 btrfs_release_path(path); 5527 path->leave_spinning = 1; 5528 5529 key.objectid = bytenr; 5530 key.type = BTRFS_EXTENT_ITEM_KEY; 5531 key.offset = num_bytes; 5532 5533 ret = btrfs_search_slot(trans, extent_root, &key, path, 5534 -1, 1); 5535 if (ret) { 5536 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5537 ret, (unsigned long long)bytenr); 5538 btrfs_print_leaf(extent_root, path->nodes[0]); 5539 } 5540 if (ret < 0) { 5541 btrfs_abort_transaction(trans, extent_root, ret); 5542 goto out; 5543 } 5544 5545 extent_slot = path->slots[0]; 5546 leaf = path->nodes[0]; 5547 item_size = btrfs_item_size_nr(leaf, extent_slot); 5548 } 5549 #endif 5550 BUG_ON(item_size < sizeof(*ei)); 5551 ei = btrfs_item_ptr(leaf, extent_slot, 5552 struct btrfs_extent_item); 5553 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 5554 key.type == BTRFS_EXTENT_ITEM_KEY) { 5555 struct btrfs_tree_block_info *bi; 5556 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 5557 bi = (struct btrfs_tree_block_info *)(ei + 1); 5558 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 5559 } 5560 5561 refs = btrfs_extent_refs(leaf, ei); 5562 if (refs < refs_to_drop) { 5563 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 5564 "for bytenr %Lu\n", refs_to_drop, refs, bytenr); 5565 ret = -EINVAL; 5566 btrfs_abort_transaction(trans, extent_root, ret); 5567 goto out; 5568 } 5569 refs -= refs_to_drop; 5570 5571 if (refs > 0) { 5572 if (extent_op) 5573 __run_delayed_extent_op(extent_op, leaf, ei); 5574 /* 5575 * In the case of inline back ref, reference count will 5576 * be updated by 
remove_extent_backref 5577 */ 5578 if (iref) { 5579 BUG_ON(!found_extent); 5580 } else { 5581 btrfs_set_extent_refs(leaf, ei, refs); 5582 btrfs_mark_buffer_dirty(leaf); 5583 } 5584 if (found_extent) { 5585 ret = remove_extent_backref(trans, extent_root, path, 5586 iref, refs_to_drop, 5587 is_data); 5588 if (ret) { 5589 btrfs_abort_transaction(trans, extent_root, ret); 5590 goto out; 5591 } 5592 } 5593 } else { 5594 if (found_extent) { 5595 BUG_ON(is_data && refs_to_drop != 5596 extent_data_ref_count(root, path, iref)); 5597 if (iref) { 5598 BUG_ON(path->slots[0] != extent_slot); 5599 } else { 5600 BUG_ON(path->slots[0] != extent_slot + 1); 5601 path->slots[0] = extent_slot; 5602 num_to_del = 2; 5603 } 5604 } 5605 5606 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 5607 num_to_del); 5608 if (ret) { 5609 btrfs_abort_transaction(trans, extent_root, ret); 5610 goto out; 5611 } 5612 btrfs_release_path(path); 5613 5614 if (is_data) { 5615 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 5616 if (ret) { 5617 btrfs_abort_transaction(trans, extent_root, ret); 5618 goto out; 5619 } 5620 } 5621 5622 ret = update_block_group(root, bytenr, num_bytes, 0); 5623 if (ret) { 5624 btrfs_abort_transaction(trans, extent_root, ret); 5625 goto out; 5626 } 5627 } 5628 out: 5629 btrfs_free_path(path); 5630 return ret; 5631 } 5632 5633 /* 5634 * when we free an block, it is possible (and likely) that we free the last 5635 * delayed ref for that extent as well. This searches the delayed ref tree for 5636 * a given extent, and if there are no other delayed refs to be processed, it 5637 * removes it from the tree. 5638 */ 5639 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 5640 struct btrfs_root *root, u64 bytenr) 5641 { 5642 struct btrfs_delayed_ref_head *head; 5643 struct btrfs_delayed_ref_root *delayed_refs; 5644 struct btrfs_delayed_ref_node *ref; 5645 struct rb_node *node; 5646 int ret = 0; 5647 5648 delayed_refs = &trans->transaction->delayed_refs; 5649 spin_lock(&delayed_refs->lock); 5650 head = btrfs_find_delayed_ref_head(trans, bytenr); 5651 if (!head) 5652 goto out; 5653 5654 node = rb_prev(&head->node.rb_node); 5655 if (!node) 5656 goto out; 5657 5658 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 5659 5660 /* there are still entries for this ref, we can't drop it */ 5661 if (ref->bytenr == bytenr) 5662 goto out; 5663 5664 if (head->extent_op) { 5665 if (!head->must_insert_reserved) 5666 goto out; 5667 btrfs_free_delayed_extent_op(head->extent_op); 5668 head->extent_op = NULL; 5669 } 5670 5671 /* 5672 * waiting for the lock here would deadlock. If someone else has it 5673 * locked they are already in the process of dropping it anyway 5674 */ 5675 if (!mutex_trylock(&head->mutex)) 5676 goto out; 5677 5678 /* 5679 * at this point we have a head with no other entries. Go 5680 * ahead and process it. 5681 */ 5682 head->node.in_tree = 0; 5683 rb_erase(&head->node.rb_node, &delayed_refs->root); 5684 5685 delayed_refs->num_entries--; 5686 5687 /* 5688 * we don't take a ref on the node because we're removing it from the 5689 * tree, so we just steal the ref the tree was holding. 
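 *
 * A return of 1 tells btrfs_free_tree_block() below that we removed the final
 * delayed ref ourselves and must_insert_reserved was set, so the caller can
 * release the space directly (pin it if the buffer was written, otherwise
 * give it straight back to the free space cache).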
5690 */ 5691 delayed_refs->num_heads--; 5692 if (list_empty(&head->cluster)) 5693 delayed_refs->num_heads_ready--; 5694 5695 list_del_init(&head->cluster); 5696 spin_unlock(&delayed_refs->lock); 5697 5698 BUG_ON(head->extent_op); 5699 if (head->must_insert_reserved) 5700 ret = 1; 5701 5702 mutex_unlock(&head->mutex); 5703 btrfs_put_delayed_ref(&head->node); 5704 return ret; 5705 out: 5706 spin_unlock(&delayed_refs->lock); 5707 return 0; 5708 } 5709 5710 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5711 struct btrfs_root *root, 5712 struct extent_buffer *buf, 5713 u64 parent, int last_ref) 5714 { 5715 struct btrfs_block_group_cache *cache = NULL; 5716 int ret; 5717 5718 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5719 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 5720 buf->start, buf->len, 5721 parent, root->root_key.objectid, 5722 btrfs_header_level(buf), 5723 BTRFS_DROP_DELAYED_REF, NULL, 0); 5724 BUG_ON(ret); /* -ENOMEM */ 5725 } 5726 5727 if (!last_ref) 5728 return; 5729 5730 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 5731 5732 if (btrfs_header_generation(buf) == trans->transid) { 5733 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5734 ret = check_ref_cleanup(trans, root, buf->start); 5735 if (!ret) 5736 goto out; 5737 } 5738 5739 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 5740 pin_down_extent(root, cache, buf->start, buf->len, 1); 5741 goto out; 5742 } 5743 5744 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 5745 5746 btrfs_add_free_space(cache, buf->start, buf->len); 5747 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5748 } 5749 out: 5750 /* 5751 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5752 * anymore. 5753 */ 5754 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 5755 btrfs_put_block_group(cache); 5756 } 5757 5758 /* Can return -ENOMEM */ 5759 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 5760 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 5761 u64 owner, u64 offset, int for_cow) 5762 { 5763 int ret; 5764 struct btrfs_fs_info *fs_info = root->fs_info; 5765 5766 /* 5767 * tree log blocks never actually go into the extent allocation 5768 * tree, just update pinning info and exit early. 5769 */ 5770 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 5771 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 5772 /* unlocks the pinned mutex */ 5773 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5774 ret = 0; 5775 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5776 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 5777 num_bytes, 5778 parent, root_objectid, (int)owner, 5779 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5780 } else { 5781 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 5782 num_bytes, 5783 parent, root_objectid, owner, 5784 offset, BTRFS_DROP_DELAYED_REF, 5785 NULL, for_cow); 5786 } 5787 return ret; 5788 } 5789 5790 static u64 stripe_align(struct btrfs_root *root, 5791 struct btrfs_block_group_cache *cache, 5792 u64 val, u64 num_bytes) 5793 { 5794 u64 ret = ALIGN(val, root->stripesize); 5795 return ret; 5796 } 5797 5798 /* 5799 * when we wait for progress in the block group caching, its because 5800 * our allocation attempt failed at least once. So, we must sleep 5801 * and let some progress happen before we try again. 
5802 * 5803 * This function will sleep at least once waiting for new free space to 5804 * show up, and then it will check the block group free space numbers 5805 * for our min num_bytes. Another option is to have it go ahead 5806 * and look in the rbtree for a free extent of a given size, but this 5807 * is a good start. 5808 */ 5809 static noinline int 5810 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 5811 u64 num_bytes) 5812 { 5813 struct btrfs_caching_control *caching_ctl; 5814 5815 caching_ctl = get_caching_control(cache); 5816 if (!caching_ctl) 5817 return 0; 5818 5819 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 5820 (cache->free_space_ctl->free_space >= num_bytes)); 5821 5822 put_caching_control(caching_ctl); 5823 return 0; 5824 } 5825 5826 static noinline int 5827 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5828 { 5829 struct btrfs_caching_control *caching_ctl; 5830 5831 caching_ctl = get_caching_control(cache); 5832 if (!caching_ctl) 5833 return 0; 5834 5835 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 5836 5837 put_caching_control(caching_ctl); 5838 return 0; 5839 } 5840 5841 int __get_raid_index(u64 flags) 5842 { 5843 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5844 return BTRFS_RAID_RAID10; 5845 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5846 return BTRFS_RAID_RAID1; 5847 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5848 return BTRFS_RAID_DUP; 5849 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5850 return BTRFS_RAID_RAID0; 5851 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 5852 return BTRFS_RAID_RAID5; 5853 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 5854 return BTRFS_RAID_RAID6; 5855 5856 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 5857 } 5858 5859 static int get_block_group_index(struct btrfs_block_group_cache *cache) 5860 { 5861 return __get_raid_index(cache->flags); 5862 } 5863 5864 enum btrfs_loop_type { 5865 LOOP_CACHING_NOWAIT = 0, 5866 LOOP_CACHING_WAIT = 1, 5867 LOOP_ALLOC_CHUNK = 2, 5868 LOOP_NO_EMPTY_SIZE = 3, 5869 }; 5870 5871 /* 5872 * walks the btree of allocated extents and find a hole of a given size. 5873 * The key ins is changed to record the hole: 5874 * ins->objectid == block start 5875 * ins->flags = BTRFS_EXTENT_ITEM_KEY 5876 * ins->offset == number of blocks 5877 * Any available blocks before search_start are skipped. 5878 */ 5879 static noinline int find_free_extent(struct btrfs_trans_handle *trans, 5880 struct btrfs_root *orig_root, 5881 u64 num_bytes, u64 empty_size, 5882 u64 hint_byte, struct btrfs_key *ins, 5883 u64 flags) 5884 { 5885 int ret = 0; 5886 struct btrfs_root *root = orig_root->fs_info->extent_root; 5887 struct btrfs_free_cluster *last_ptr = NULL; 5888 struct btrfs_block_group_cache *block_group = NULL; 5889 struct btrfs_block_group_cache *used_block_group; 5890 u64 search_start = 0; 5891 int empty_cluster = 2 * 1024 * 1024; 5892 struct btrfs_space_info *space_info; 5893 int loop = 0; 5894 int index = __get_raid_index(flags); 5895 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 
5896 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 5897 bool found_uncached_bg = false; 5898 bool failed_cluster_refill = false; 5899 bool failed_alloc = false; 5900 bool use_cluster = true; 5901 bool have_caching_bg = false; 5902 5903 WARN_ON(num_bytes < root->sectorsize); 5904 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 5905 ins->objectid = 0; 5906 ins->offset = 0; 5907 5908 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 5909 5910 space_info = __find_space_info(root->fs_info, flags); 5911 if (!space_info) { 5912 btrfs_err(root->fs_info, "No space info for %llu", flags); 5913 return -ENOSPC; 5914 } 5915 5916 /* 5917 * If the space info is for both data and metadata it means we have a 5918 * small filesystem and we can't use the clustering stuff. 5919 */ 5920 if (btrfs_mixed_space_info(space_info)) 5921 use_cluster = false; 5922 5923 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 5924 last_ptr = &root->fs_info->meta_alloc_cluster; 5925 if (!btrfs_test_opt(root, SSD)) 5926 empty_cluster = 64 * 1024; 5927 } 5928 5929 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 5930 btrfs_test_opt(root, SSD)) { 5931 last_ptr = &root->fs_info->data_alloc_cluster; 5932 } 5933 5934 if (last_ptr) { 5935 spin_lock(&last_ptr->lock); 5936 if (last_ptr->block_group) 5937 hint_byte = last_ptr->window_start; 5938 spin_unlock(&last_ptr->lock); 5939 } 5940 5941 search_start = max(search_start, first_logical_byte(root, 0)); 5942 search_start = max(search_start, hint_byte); 5943 5944 if (!last_ptr) 5945 empty_cluster = 0; 5946 5947 if (search_start == hint_byte) { 5948 block_group = btrfs_lookup_block_group(root->fs_info, 5949 search_start); 5950 used_block_group = block_group; 5951 /* 5952 * we don't want to use the block group if it doesn't match our 5953 * allocation bits, or if its not cached. 5954 * 5955 * However if we are re-searching with an ideal block group 5956 * picked out then we don't care that the block group is cached. 5957 */ 5958 if (block_group && block_group_bits(block_group, flags) && 5959 block_group->cached != BTRFS_CACHE_NO) { 5960 down_read(&space_info->groups_sem); 5961 if (list_empty(&block_group->list) || 5962 block_group->ro) { 5963 /* 5964 * someone is removing this block group, 5965 * we can't jump into the have_block_group 5966 * target because our list pointers are not 5967 * valid 5968 */ 5969 btrfs_put_block_group(block_group); 5970 up_read(&space_info->groups_sem); 5971 } else { 5972 index = get_block_group_index(block_group); 5973 goto have_block_group; 5974 } 5975 } else if (block_group) { 5976 btrfs_put_block_group(block_group); 5977 } 5978 } 5979 search: 5980 have_caching_bg = false; 5981 down_read(&space_info->groups_sem); 5982 list_for_each_entry(block_group, &space_info->block_groups[index], 5983 list) { 5984 u64 offset; 5985 int cached; 5986 5987 used_block_group = block_group; 5988 btrfs_get_block_group(block_group); 5989 search_start = block_group->key.objectid; 5990 5991 /* 5992 * this can happen if we end up cycling through all the 5993 * raid types, but we want to make sure we only allocate 5994 * for the proper type. 5995 */ 5996 if (!block_group_bits(block_group, flags)) { 5997 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5998 BTRFS_BLOCK_GROUP_RAID1 | 5999 BTRFS_BLOCK_GROUP_RAID5 | 6000 BTRFS_BLOCK_GROUP_RAID6 | 6001 BTRFS_BLOCK_GROUP_RAID10; 6002 6003 /* 6004 * if they asked for extra copies and this block group 6005 * doesn't provide them, bail. This does allow us to 6006 * fill raid0 from raid1. 
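 *
 * A hypothetical walk-through of the test below: a RAID1 allocation
 * (flags contains BTRFS_BLOCK_GROUP_RAID1, one of the 'extra' bits)
 * hitting a plain RAID0 block group is skipped, while a RAID0 or SINGLE
 * allocation carries no 'extra' bits and may be satisfied from a
 * RAID1/RAID10/DUP group:
 *
 *	(flags & extra) && !(block_group->flags & extra)  ->  goto loop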
6007 */ 6008 if ((flags & extra) && !(block_group->flags & extra)) 6009 goto loop; 6010 } 6011 6012 have_block_group: 6013 cached = block_group_cache_done(block_group); 6014 if (unlikely(!cached)) { 6015 found_uncached_bg = true; 6016 ret = cache_block_group(block_group, 0); 6017 BUG_ON(ret < 0); 6018 ret = 0; 6019 } 6020 6021 if (unlikely(block_group->ro)) 6022 goto loop; 6023 6024 /* 6025 * Ok we want to try and use the cluster allocator, so 6026 * lets look there 6027 */ 6028 if (last_ptr) { 6029 unsigned long aligned_cluster; 6030 /* 6031 * the refill lock keeps out other 6032 * people trying to start a new cluster 6033 */ 6034 spin_lock(&last_ptr->refill_lock); 6035 used_block_group = last_ptr->block_group; 6036 if (used_block_group != block_group && 6037 (!used_block_group || 6038 used_block_group->ro || 6039 !block_group_bits(used_block_group, flags))) { 6040 used_block_group = block_group; 6041 goto refill_cluster; 6042 } 6043 6044 if (used_block_group != block_group) 6045 btrfs_get_block_group(used_block_group); 6046 6047 offset = btrfs_alloc_from_cluster(used_block_group, 6048 last_ptr, num_bytes, used_block_group->key.objectid); 6049 if (offset) { 6050 /* we have a block, we're done */ 6051 spin_unlock(&last_ptr->refill_lock); 6052 trace_btrfs_reserve_extent_cluster(root, 6053 block_group, search_start, num_bytes); 6054 goto checks; 6055 } 6056 6057 WARN_ON(last_ptr->block_group != used_block_group); 6058 if (used_block_group != block_group) { 6059 btrfs_put_block_group(used_block_group); 6060 used_block_group = block_group; 6061 } 6062 refill_cluster: 6063 BUG_ON(used_block_group != block_group); 6064 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6065 * set up a new clusters, so lets just skip it 6066 * and let the allocator find whatever block 6067 * it can find. If we reach this point, we 6068 * will have tried the cluster allocator 6069 * plenty of times and not have found 6070 * anything, so we are likely way too 6071 * fragmented for the clustering stuff to find 6072 * anything. 6073 * 6074 * However, if the cluster is taken from the 6075 * current block group, release the cluster 6076 * first, so that we stand a better chance of 6077 * succeeding in the unclustered 6078 * allocation. 
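 *
 * A sketch of the branches that follow:
 *
 *	loop >= LOOP_NO_EMPTY_SIZE, cluster owned by another group
 *		-> go straight to unclustered_alloc
 *	loop >= LOOP_NO_EMPTY_SIZE, cluster owned by this group
 *		-> return the cluster to the free space cache first,
 *		   then fall back to unclustered_alloc
 *	otherwise
 *		-> return the old cluster and try to refill it from
 *		   this block group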
*/ 6079 if (loop >= LOOP_NO_EMPTY_SIZE && 6080 last_ptr->block_group != block_group) { 6081 spin_unlock(&last_ptr->refill_lock); 6082 goto unclustered_alloc; 6083 } 6084 6085 /* 6086 * this cluster didn't work out, free it and 6087 * start over 6088 */ 6089 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6090 6091 if (loop >= LOOP_NO_EMPTY_SIZE) { 6092 spin_unlock(&last_ptr->refill_lock); 6093 goto unclustered_alloc; 6094 } 6095 6096 aligned_cluster = max_t(unsigned long, 6097 empty_cluster + empty_size, 6098 block_group->full_stripe_len); 6099 6100 /* allocate a cluster in this block group */ 6101 ret = btrfs_find_space_cluster(trans, root, 6102 block_group, last_ptr, 6103 search_start, num_bytes, 6104 aligned_cluster); 6105 if (ret == 0) { 6106 /* 6107 * now pull our allocation out of this 6108 * cluster 6109 */ 6110 offset = btrfs_alloc_from_cluster(block_group, 6111 last_ptr, num_bytes, 6112 search_start); 6113 if (offset) { 6114 /* we found one, proceed */ 6115 spin_unlock(&last_ptr->refill_lock); 6116 trace_btrfs_reserve_extent_cluster(root, 6117 block_group, search_start, 6118 num_bytes); 6119 goto checks; 6120 } 6121 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6122 && !failed_cluster_refill) { 6123 spin_unlock(&last_ptr->refill_lock); 6124 6125 failed_cluster_refill = true; 6126 wait_block_group_cache_progress(block_group, 6127 num_bytes + empty_cluster + empty_size); 6128 goto have_block_group; 6129 } 6130 6131 /* 6132 * at this point we either didn't find a cluster 6133 * or we weren't able to allocate a block from our 6134 * cluster. Free the cluster we've been trying 6135 * to use, and go to the next block group 6136 */ 6137 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6138 spin_unlock(&last_ptr->refill_lock); 6139 goto loop; 6140 } 6141 6142 unclustered_alloc: 6143 spin_lock(&block_group->free_space_ctl->tree_lock); 6144 if (cached && 6145 block_group->free_space_ctl->free_space < 6146 num_bytes + empty_cluster + empty_size) { 6147 spin_unlock(&block_group->free_space_ctl->tree_lock); 6148 goto loop; 6149 } 6150 spin_unlock(&block_group->free_space_ctl->tree_lock); 6151 6152 offset = btrfs_find_space_for_alloc(block_group, search_start, 6153 num_bytes, empty_size); 6154 /* 6155 * If we didn't find a chunk, and we haven't failed on this 6156 * block group before, and this block group is in the middle of 6157 * caching and we are ok with waiting, then go ahead and wait 6158 * for progress to be made, and set failed_alloc to true. 6159 * 6160 * If failed_alloc is true then we've already waited on this 6161 * block group once and should move on to the next block group. 
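 *
 * Also note, as a sketch of the wider flow: when we give up on an
 * uncached group here without waiting, have_caching_bg is set so the
 * outer search loop can come back to it once loop reaches
 * LOOP_CACHING_WAIT.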
6162 */ 6163 if (!offset && !failed_alloc && !cached && 6164 loop > LOOP_CACHING_NOWAIT) { 6165 wait_block_group_cache_progress(block_group, 6166 num_bytes + empty_size); 6167 failed_alloc = true; 6168 goto have_block_group; 6169 } else if (!offset) { 6170 if (!cached) 6171 have_caching_bg = true; 6172 goto loop; 6173 } 6174 checks: 6175 search_start = stripe_align(root, used_block_group, 6176 offset, num_bytes); 6177 6178 /* move on to the next group */ 6179 if (search_start + num_bytes > 6180 used_block_group->key.objectid + used_block_group->key.offset) { 6181 btrfs_add_free_space(used_block_group, offset, num_bytes); 6182 goto loop; 6183 } 6184 6185 if (offset < search_start) 6186 btrfs_add_free_space(used_block_group, offset, 6187 search_start - offset); 6188 BUG_ON(offset > search_start); 6189 6190 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, 6191 alloc_type); 6192 if (ret == -EAGAIN) { 6193 btrfs_add_free_space(used_block_group, offset, num_bytes); 6194 goto loop; 6195 } 6196 6197 /* we are all good, lets return */ 6198 ins->objectid = search_start; 6199 ins->offset = num_bytes; 6200 6201 trace_btrfs_reserve_extent(orig_root, block_group, 6202 search_start, num_bytes); 6203 if (used_block_group != block_group) 6204 btrfs_put_block_group(used_block_group); 6205 btrfs_put_block_group(block_group); 6206 break; 6207 loop: 6208 failed_cluster_refill = false; 6209 failed_alloc = false; 6210 BUG_ON(index != get_block_group_index(block_group)); 6211 if (used_block_group != block_group) 6212 btrfs_put_block_group(used_block_group); 6213 btrfs_put_block_group(block_group); 6214 } 6215 up_read(&space_info->groups_sem); 6216 6217 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6218 goto search; 6219 6220 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6221 goto search; 6222 6223 /* 6224 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6225 * caching kthreads as we move along 6226 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6227 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6228 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6229 * again 6230 */ 6231 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6232 index = 0; 6233 loop++; 6234 if (loop == LOOP_ALLOC_CHUNK) { 6235 ret = do_chunk_alloc(trans, root, flags, 6236 CHUNK_ALLOC_FORCE); 6237 /* 6238 * Do not bail out on ENOSPC since we 6239 * can do more things. 6240 */ 6241 if (ret < 0 && ret != -ENOSPC) { 6242 btrfs_abort_transaction(trans, 6243 root, ret); 6244 goto out; 6245 } 6246 } 6247 6248 if (loop == LOOP_NO_EMPTY_SIZE) { 6249 empty_size = 0; 6250 empty_cluster = 0; 6251 } 6252 6253 goto search; 6254 } else if (!ins->objectid) { 6255 ret = -ENOSPC; 6256 } else if (ins->objectid) { 6257 ret = 0; 6258 } 6259 out: 6260 6261 return ret; 6262 } 6263 6264 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6265 int dump_block_groups) 6266 { 6267 struct btrfs_block_group_cache *cache; 6268 int index = 0; 6269 6270 spin_lock(&info->lock); 6271 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", 6272 (unsigned long long)info->flags, 6273 (unsigned long long)(info->total_bytes - info->bytes_used - 6274 info->bytes_pinned - info->bytes_reserved - 6275 info->bytes_readonly), 6276 (info->full) ? 
"" : "not "); 6277 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " 6278 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6279 (unsigned long long)info->total_bytes, 6280 (unsigned long long)info->bytes_used, 6281 (unsigned long long)info->bytes_pinned, 6282 (unsigned long long)info->bytes_reserved, 6283 (unsigned long long)info->bytes_may_use, 6284 (unsigned long long)info->bytes_readonly); 6285 spin_unlock(&info->lock); 6286 6287 if (!dump_block_groups) 6288 return; 6289 6290 down_read(&info->groups_sem); 6291 again: 6292 list_for_each_entry(cache, &info->block_groups[index], list) { 6293 spin_lock(&cache->lock); 6294 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", 6295 (unsigned long long)cache->key.objectid, 6296 (unsigned long long)cache->key.offset, 6297 (unsigned long long)btrfs_block_group_used(&cache->item), 6298 (unsigned long long)cache->pinned, 6299 (unsigned long long)cache->reserved, 6300 cache->ro ? "[readonly]" : ""); 6301 btrfs_dump_free_space(cache, bytes); 6302 spin_unlock(&cache->lock); 6303 } 6304 if (++index < BTRFS_NR_RAID_TYPES) 6305 goto again; 6306 up_read(&info->groups_sem); 6307 } 6308 6309 int btrfs_reserve_extent(struct btrfs_trans_handle *trans, 6310 struct btrfs_root *root, 6311 u64 num_bytes, u64 min_alloc_size, 6312 u64 empty_size, u64 hint_byte, 6313 struct btrfs_key *ins, int is_data) 6314 { 6315 bool final_tried = false; 6316 u64 flags; 6317 int ret; 6318 6319 flags = btrfs_get_alloc_profile(root, is_data); 6320 again: 6321 WARN_ON(num_bytes < root->sectorsize); 6322 ret = find_free_extent(trans, root, num_bytes, empty_size, 6323 hint_byte, ins, flags); 6324 6325 if (ret == -ENOSPC) { 6326 if (!final_tried) { 6327 num_bytes = num_bytes >> 1; 6328 num_bytes = round_down(num_bytes, root->sectorsize); 6329 num_bytes = max(num_bytes, min_alloc_size); 6330 if (num_bytes == min_alloc_size) 6331 final_tried = true; 6332 goto again; 6333 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6334 struct btrfs_space_info *sinfo; 6335 6336 sinfo = __find_space_info(root->fs_info, flags); 6337 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6338 (unsigned long long)flags, 6339 (unsigned long long)num_bytes); 6340 if (sinfo) 6341 dump_space_info(sinfo, num_bytes, 1); 6342 } 6343 } 6344 6345 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 6346 6347 return ret; 6348 } 6349 6350 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6351 u64 start, u64 len, int pin) 6352 { 6353 struct btrfs_block_group_cache *cache; 6354 int ret = 0; 6355 6356 cache = btrfs_lookup_block_group(root->fs_info, start); 6357 if (!cache) { 6358 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6359 (unsigned long long)start); 6360 return -ENOSPC; 6361 } 6362 6363 if (btrfs_test_opt(root, DISCARD)) 6364 ret = btrfs_discard_extent(root, start, len, NULL); 6365 6366 if (pin) 6367 pin_down_extent(root, cache, start, len, 1); 6368 else { 6369 btrfs_add_free_space(cache, start, len); 6370 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); 6371 } 6372 btrfs_put_block_group(cache); 6373 6374 trace_btrfs_reserved_extent_free(root, start, len); 6375 6376 return ret; 6377 } 6378 6379 int btrfs_free_reserved_extent(struct btrfs_root *root, 6380 u64 start, u64 len) 6381 { 6382 return __btrfs_free_reserved_extent(root, start, len, 0); 6383 } 6384 6385 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6386 u64 start, u64 len) 6387 { 6388 return 
__btrfs_free_reserved_extent(root, start, len, 1); 6389 } 6390 6391 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6392 struct btrfs_root *root, 6393 u64 parent, u64 root_objectid, 6394 u64 flags, u64 owner, u64 offset, 6395 struct btrfs_key *ins, int ref_mod) 6396 { 6397 int ret; 6398 struct btrfs_fs_info *fs_info = root->fs_info; 6399 struct btrfs_extent_item *extent_item; 6400 struct btrfs_extent_inline_ref *iref; 6401 struct btrfs_path *path; 6402 struct extent_buffer *leaf; 6403 int type; 6404 u32 size; 6405 6406 if (parent > 0) 6407 type = BTRFS_SHARED_DATA_REF_KEY; 6408 else 6409 type = BTRFS_EXTENT_DATA_REF_KEY; 6410 6411 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 6412 6413 path = btrfs_alloc_path(); 6414 if (!path) 6415 return -ENOMEM; 6416 6417 path->leave_spinning = 1; 6418 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6419 ins, size); 6420 if (ret) { 6421 btrfs_free_path(path); 6422 return ret; 6423 } 6424 6425 leaf = path->nodes[0]; 6426 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6427 struct btrfs_extent_item); 6428 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 6429 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6430 btrfs_set_extent_flags(leaf, extent_item, 6431 flags | BTRFS_EXTENT_FLAG_DATA); 6432 6433 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6434 btrfs_set_extent_inline_ref_type(leaf, iref, type); 6435 if (parent > 0) { 6436 struct btrfs_shared_data_ref *ref; 6437 ref = (struct btrfs_shared_data_ref *)(iref + 1); 6438 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6439 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 6440 } else { 6441 struct btrfs_extent_data_ref *ref; 6442 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 6443 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 6444 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 6445 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 6446 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 6447 } 6448 6449 btrfs_mark_buffer_dirty(path->nodes[0]); 6450 btrfs_free_path(path); 6451 6452 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6453 if (ret) { /* -ENOENT, logic error */ 6454 btrfs_err(fs_info, "update block group failed for %llu %llu", 6455 (unsigned long long)ins->objectid, 6456 (unsigned long long)ins->offset); 6457 BUG(); 6458 } 6459 return ret; 6460 } 6461 6462 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 6463 struct btrfs_root *root, 6464 u64 parent, u64 root_objectid, 6465 u64 flags, struct btrfs_disk_key *key, 6466 int level, struct btrfs_key *ins) 6467 { 6468 int ret; 6469 struct btrfs_fs_info *fs_info = root->fs_info; 6470 struct btrfs_extent_item *extent_item; 6471 struct btrfs_tree_block_info *block_info; 6472 struct btrfs_extent_inline_ref *iref; 6473 struct btrfs_path *path; 6474 struct extent_buffer *leaf; 6475 u32 size = sizeof(*extent_item) + sizeof(*iref); 6476 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6477 SKINNY_METADATA); 6478 6479 if (!skinny_metadata) 6480 size += sizeof(*block_info); 6481 6482 path = btrfs_alloc_path(); 6483 if (!path) 6484 return -ENOMEM; 6485 6486 path->leave_spinning = 1; 6487 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6488 ins, size); 6489 if (ret) { 6490 btrfs_free_path(path); 6491 return ret; 6492 } 6493 6494 leaf = path->nodes[0]; 6495 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6496 struct btrfs_extent_item); 6497 btrfs_set_extent_refs(leaf, 
extent_item, 1); 6498 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6499 btrfs_set_extent_flags(leaf, extent_item, 6500 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 6501 6502 if (skinny_metadata) { 6503 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6504 } else { 6505 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 6506 btrfs_set_tree_block_key(leaf, block_info, key); 6507 btrfs_set_tree_block_level(leaf, block_info, level); 6508 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 6509 } 6510 6511 if (parent > 0) { 6512 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 6513 btrfs_set_extent_inline_ref_type(leaf, iref, 6514 BTRFS_SHARED_BLOCK_REF_KEY); 6515 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6516 } else { 6517 btrfs_set_extent_inline_ref_type(leaf, iref, 6518 BTRFS_TREE_BLOCK_REF_KEY); 6519 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 6520 } 6521 6522 btrfs_mark_buffer_dirty(leaf); 6523 btrfs_free_path(path); 6524 6525 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 6526 if (ret) { /* -ENOENT, logic error */ 6527 btrfs_err(fs_info, "update block group failed for %llu %llu", 6528 (unsigned long long)ins->objectid, 6529 (unsigned long long)ins->offset); 6530 BUG(); 6531 } 6532 return ret; 6533 } 6534 6535 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6536 struct btrfs_root *root, 6537 u64 root_objectid, u64 owner, 6538 u64 offset, struct btrfs_key *ins) 6539 { 6540 int ret; 6541 6542 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6543 6544 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 6545 ins->offset, 0, 6546 root_objectid, owner, offset, 6547 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 6548 return ret; 6549 } 6550 6551 /* 6552 * this is used by the tree logging recovery code. 
It records that 6553 * an extent has been allocated and makes sure to clear the free 6554 * space cache bits as well 6555 */ 6556 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 6557 struct btrfs_root *root, 6558 u64 root_objectid, u64 owner, u64 offset, 6559 struct btrfs_key *ins) 6560 { 6561 int ret; 6562 struct btrfs_block_group_cache *block_group; 6563 struct btrfs_caching_control *caching_ctl; 6564 u64 start = ins->objectid; 6565 u64 num_bytes = ins->offset; 6566 6567 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6568 cache_block_group(block_group, 0); 6569 caching_ctl = get_caching_control(block_group); 6570 6571 if (!caching_ctl) { 6572 BUG_ON(!block_group_cache_done(block_group)); 6573 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6574 if (ret) 6575 goto out; 6576 } else { 6577 mutex_lock(&caching_ctl->mutex); 6578 6579 if (start >= caching_ctl->progress) { 6580 ret = add_excluded_extent(root, start, num_bytes); 6581 } else if (start + num_bytes <= caching_ctl->progress) { 6582 ret = btrfs_remove_free_space(block_group, 6583 start, num_bytes); 6584 } else { 6585 num_bytes = caching_ctl->progress - start; 6586 ret = btrfs_remove_free_space(block_group, 6587 start, num_bytes); 6588 if (ret) 6589 goto out_lock; 6590 6591 start = caching_ctl->progress; 6592 num_bytes = ins->objectid + ins->offset - 6593 caching_ctl->progress; 6594 ret = add_excluded_extent(root, start, num_bytes); 6595 } 6596 out_lock: 6597 mutex_unlock(&caching_ctl->mutex); 6598 put_caching_control(caching_ctl); 6599 if (ret) 6600 goto out; 6601 } 6602 6603 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6604 RESERVE_ALLOC_NO_ACCOUNT); 6605 BUG_ON(ret); /* logic error */ 6606 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6607 0, owner, offset, ins, 1); 6608 out: 6609 btrfs_put_block_group(block_group); 6610 return ret; 6611 } 6612 6613 static struct extent_buffer * 6614 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6615 u64 bytenr, u32 blocksize, int level) 6616 { 6617 struct extent_buffer *buf; 6618 6619 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 6620 if (!buf) 6621 return ERR_PTR(-ENOMEM); 6622 btrfs_set_header_generation(buf, trans->transid); 6623 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 6624 btrfs_tree_lock(buf); 6625 clean_tree_block(trans, root, buf); 6626 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 6627 6628 btrfs_set_lock_blocking(buf); 6629 btrfs_set_buffer_uptodate(buf); 6630 6631 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 6632 /* 6633 * we allow two log transactions at a time, use different 6634 * EXENT bit to differentiate dirty pages. 
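 *
 * Concretely, in the code below an even log_transid marks the range with
 * set_extent_dirty() and an odd one with set_extent_new(), so the two
 * in-flight log transactions never mix their dirty pages.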
6635 */ 6636 if (root->log_transid % 2 == 0) 6637 set_extent_dirty(&root->dirty_log_pages, buf->start, 6638 buf->start + buf->len - 1, GFP_NOFS); 6639 else 6640 set_extent_new(&root->dirty_log_pages, buf->start, 6641 buf->start + buf->len - 1, GFP_NOFS); 6642 } else { 6643 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 6644 buf->start + buf->len - 1, GFP_NOFS); 6645 } 6646 trans->blocks_used++; 6647 /* this returns a buffer locked for blocking */ 6648 return buf; 6649 } 6650 6651 static struct btrfs_block_rsv * 6652 use_block_rsv(struct btrfs_trans_handle *trans, 6653 struct btrfs_root *root, u32 blocksize) 6654 { 6655 struct btrfs_block_rsv *block_rsv; 6656 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 6657 int ret; 6658 bool global_updated = false; 6659 6660 block_rsv = get_block_rsv(trans, root); 6661 6662 if (unlikely(block_rsv->size == 0)) 6663 goto try_reserve; 6664 again: 6665 ret = block_rsv_use_bytes(block_rsv, blocksize); 6666 if (!ret) 6667 return block_rsv; 6668 6669 if (block_rsv->failfast) 6670 return ERR_PTR(ret); 6671 6672 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 6673 global_updated = true; 6674 update_global_block_rsv(root->fs_info); 6675 goto again; 6676 } 6677 6678 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6679 static DEFINE_RATELIMIT_STATE(_rs, 6680 DEFAULT_RATELIMIT_INTERVAL * 10, 6681 /*DEFAULT_RATELIMIT_BURST*/ 1); 6682 if (__ratelimit(&_rs)) 6683 WARN(1, KERN_DEBUG 6684 "btrfs: block rsv returned %d\n", ret); 6685 } 6686 try_reserve: 6687 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6688 BTRFS_RESERVE_NO_FLUSH); 6689 if (!ret) 6690 return block_rsv; 6691 /* 6692 * If we couldn't reserve metadata bytes try and use some from 6693 * the global reserve if its space type is the same as the global 6694 * reservation. 6695 */ 6696 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 6697 block_rsv->space_info == global_rsv->space_info) { 6698 ret = block_rsv_use_bytes(global_rsv, blocksize); 6699 if (!ret) 6700 return global_rsv; 6701 } 6702 return ERR_PTR(ret); 6703 } 6704 6705 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 6706 struct btrfs_block_rsv *block_rsv, u32 blocksize) 6707 { 6708 block_rsv_add_bytes(block_rsv, blocksize, 0); 6709 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 6710 } 6711 6712 /* 6713 * finds a free extent and does all the dirty work required for allocation 6714 * returns the key for the extent through ins, and a tree buffer for 6715 * the first block of the extent through buf. 6716 * 6717 * returns the tree buffer or NULL. 
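 *
 * Note that in the code below failures actually come back as an ERR_PTR()
 * rather than NULL, so a caller is expected to do something along the
 * lines of (illustrative sketch):
 *
 *	buf = btrfs_alloc_free_block(trans, root, blocksize, parent,
 *				     root_objectid, key, level, hint, 0);
 *	if (IS_ERR(buf))
 *		return PTR_ERR(buf);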
6718 */ 6719 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 6720 struct btrfs_root *root, u32 blocksize, 6721 u64 parent, u64 root_objectid, 6722 struct btrfs_disk_key *key, int level, 6723 u64 hint, u64 empty_size) 6724 { 6725 struct btrfs_key ins; 6726 struct btrfs_block_rsv *block_rsv; 6727 struct extent_buffer *buf; 6728 u64 flags = 0; 6729 int ret; 6730 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6731 SKINNY_METADATA); 6732 6733 block_rsv = use_block_rsv(trans, root, blocksize); 6734 if (IS_ERR(block_rsv)) 6735 return ERR_CAST(block_rsv); 6736 6737 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6738 empty_size, hint, &ins, 0); 6739 if (ret) { 6740 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 6741 return ERR_PTR(ret); 6742 } 6743 6744 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 6745 blocksize, level); 6746 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 6747 6748 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 6749 if (parent == 0) 6750 parent = ins.objectid; 6751 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 6752 } else 6753 BUG_ON(parent > 0); 6754 6755 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6756 struct btrfs_delayed_extent_op *extent_op; 6757 extent_op = btrfs_alloc_delayed_extent_op(); 6758 BUG_ON(!extent_op); /* -ENOMEM */ 6759 if (key) 6760 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6761 else 6762 memset(&extent_op->key, 0, sizeof(extent_op->key)); 6763 extent_op->flags_to_set = flags; 6764 if (skinny_metadata) 6765 extent_op->update_key = 0; 6766 else 6767 extent_op->update_key = 1; 6768 extent_op->update_flags = 1; 6769 extent_op->is_data = 0; 6770 extent_op->level = level; 6771 6772 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6773 ins.objectid, 6774 ins.offset, parent, root_objectid, 6775 level, BTRFS_ADD_DELAYED_EXTENT, 6776 extent_op, 0); 6777 BUG_ON(ret); /* -ENOMEM */ 6778 } 6779 return buf; 6780 } 6781 6782 struct walk_control { 6783 u64 refs[BTRFS_MAX_LEVEL]; 6784 u64 flags[BTRFS_MAX_LEVEL]; 6785 struct btrfs_key update_progress; 6786 int stage; 6787 int level; 6788 int shared_level; 6789 int update_ref; 6790 int keep_locks; 6791 int reada_slot; 6792 int reada_count; 6793 int for_reloc; 6794 }; 6795 6796 #define DROP_REFERENCE 1 6797 #define UPDATE_BACKREF 2 6798 6799 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 6800 struct btrfs_root *root, 6801 struct walk_control *wc, 6802 struct btrfs_path *path) 6803 { 6804 u64 bytenr; 6805 u64 generation; 6806 u64 refs; 6807 u64 flags; 6808 u32 nritems; 6809 u32 blocksize; 6810 struct btrfs_key key; 6811 struct extent_buffer *eb; 6812 int ret; 6813 int slot; 6814 int nread = 0; 6815 6816 if (path->slots[wc->level] < wc->reada_slot) { 6817 wc->reada_count = wc->reada_count * 2 / 3; 6818 wc->reada_count = max(wc->reada_count, 2); 6819 } else { 6820 wc->reada_count = wc->reada_count * 3 / 2; 6821 wc->reada_count = min_t(int, wc->reada_count, 6822 BTRFS_NODEPTRS_PER_BLOCK(root)); 6823 } 6824 6825 eb = path->nodes[wc->level]; 6826 nritems = btrfs_header_nritems(eb); 6827 blocksize = btrfs_level_size(root, wc->level - 1); 6828 6829 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 6830 if (nread >= wc->reada_count) 6831 break; 6832 6833 cond_resched(); 6834 bytenr = btrfs_node_blockptr(eb, slot); 6835 generation = btrfs_node_ptr_generation(eb, slot); 6836 6837 if (slot == path->slots[wc->level]) 6838 goto reada; 6839 6840 if (wc->stage == UPDATE_BACKREF && 6841 generation <= root->root_key.offset) 6842 
continue; 6843 6844 /* We don't lock the tree block, it's OK to be racy here */ 6845 ret = btrfs_lookup_extent_info(trans, root, bytenr, 6846 wc->level - 1, 1, &refs, 6847 &flags); 6848 /* We don't care about errors in readahead. */ 6849 if (ret < 0) 6850 continue; 6851 BUG_ON(refs == 0); 6852 6853 if (wc->stage == DROP_REFERENCE) { 6854 if (refs == 1) 6855 goto reada; 6856 6857 if (wc->level == 1 && 6858 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6859 continue; 6860 if (!wc->update_ref || 6861 generation <= root->root_key.offset) 6862 continue; 6863 btrfs_node_key_to_cpu(eb, &key, slot); 6864 ret = btrfs_comp_cpu_keys(&key, 6865 &wc->update_progress); 6866 if (ret < 0) 6867 continue; 6868 } else { 6869 if (wc->level == 1 && 6870 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6871 continue; 6872 } 6873 reada: 6874 ret = readahead_tree_block(root, bytenr, blocksize, 6875 generation); 6876 if (ret) 6877 break; 6878 nread++; 6879 } 6880 wc->reada_slot = slot; 6881 } 6882 6883 /* 6884 * helper to process tree block while walking down the tree. 6885 * 6886 * when wc->stage == UPDATE_BACKREF, this function updates 6887 * back refs for pointers in the block. 6888 * 6889 * NOTE: return value 1 means we should stop walking down. 6890 */ 6891 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 6892 struct btrfs_root *root, 6893 struct btrfs_path *path, 6894 struct walk_control *wc, int lookup_info) 6895 { 6896 int level = wc->level; 6897 struct extent_buffer *eb = path->nodes[level]; 6898 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6899 int ret; 6900 6901 if (wc->stage == UPDATE_BACKREF && 6902 btrfs_header_owner(eb) != root->root_key.objectid) 6903 return 1; 6904 6905 /* 6906 * when reference count of tree block is 1, it won't increase 6907 * again. once full backref flag is set, we never clear it. 6908 */ 6909 if (lookup_info && 6910 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 6911 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 6912 BUG_ON(!path->locks[level]); 6913 ret = btrfs_lookup_extent_info(trans, root, 6914 eb->start, level, 1, 6915 &wc->refs[level], 6916 &wc->flags[level]); 6917 BUG_ON(ret == -ENOMEM); 6918 if (ret) 6919 return ret; 6920 BUG_ON(wc->refs[level] == 0); 6921 } 6922 6923 if (wc->stage == DROP_REFERENCE) { 6924 if (wc->refs[level] > 1) 6925 return 1; 6926 6927 if (path->locks[level] && !wc->keep_locks) { 6928 btrfs_tree_unlock_rw(eb, path->locks[level]); 6929 path->locks[level] = 0; 6930 } 6931 return 0; 6932 } 6933 6934 /* wc->stage == UPDATE_BACKREF */ 6935 if (!(wc->flags[level] & flag)) { 6936 BUG_ON(!path->locks[level]); 6937 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 6938 BUG_ON(ret); /* -ENOMEM */ 6939 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 6940 BUG_ON(ret); /* -ENOMEM */ 6941 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6942 eb->len, flag, 6943 btrfs_header_level(eb), 0); 6944 BUG_ON(ret); /* -ENOMEM */ 6945 wc->flags[level] |= flag; 6946 } 6947 6948 /* 6949 * the block is shared by multiple trees, so it's not good to 6950 * keep the tree lock 6951 */ 6952 if (path->locks[level] && level > 0) { 6953 btrfs_tree_unlock_rw(eb, path->locks[level]); 6954 path->locks[level] = 0; 6955 } 6956 return 0; 6957 } 6958 6959 /* 6960 * helper to process tree block pointer. 6961 * 6962 * when wc->stage == DROP_REFERENCE, this function checks 6963 * reference count of the block pointed to. 
if the block 6964 * is shared and we need update back refs for the subtree 6965 * rooted at the block, this function changes wc->stage to 6966 * UPDATE_BACKREF. if the block is shared and there is no 6967 * need to update back, this function drops the reference 6968 * to the block. 6969 * 6970 * NOTE: return value 1 means we should stop walking down. 6971 */ 6972 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 6973 struct btrfs_root *root, 6974 struct btrfs_path *path, 6975 struct walk_control *wc, int *lookup_info) 6976 { 6977 u64 bytenr; 6978 u64 generation; 6979 u64 parent; 6980 u32 blocksize; 6981 struct btrfs_key key; 6982 struct extent_buffer *next; 6983 int level = wc->level; 6984 int reada = 0; 6985 int ret = 0; 6986 6987 generation = btrfs_node_ptr_generation(path->nodes[level], 6988 path->slots[level]); 6989 /* 6990 * if the lower level block was created before the snapshot 6991 * was created, we know there is no need to update back refs 6992 * for the subtree 6993 */ 6994 if (wc->stage == UPDATE_BACKREF && 6995 generation <= root->root_key.offset) { 6996 *lookup_info = 1; 6997 return 1; 6998 } 6999 7000 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7001 blocksize = btrfs_level_size(root, level - 1); 7002 7003 next = btrfs_find_tree_block(root, bytenr, blocksize); 7004 if (!next) { 7005 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7006 if (!next) 7007 return -ENOMEM; 7008 reada = 1; 7009 } 7010 btrfs_tree_lock(next); 7011 btrfs_set_lock_blocking(next); 7012 7013 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7014 &wc->refs[level - 1], 7015 &wc->flags[level - 1]); 7016 if (ret < 0) { 7017 btrfs_tree_unlock(next); 7018 return ret; 7019 } 7020 7021 if (unlikely(wc->refs[level - 1] == 0)) { 7022 btrfs_err(root->fs_info, "Missing references."); 7023 BUG(); 7024 } 7025 *lookup_info = 0; 7026 7027 if (wc->stage == DROP_REFERENCE) { 7028 if (wc->refs[level - 1] > 1) { 7029 if (level == 1 && 7030 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7031 goto skip; 7032 7033 if (!wc->update_ref || 7034 generation <= root->root_key.offset) 7035 goto skip; 7036 7037 btrfs_node_key_to_cpu(path->nodes[level], &key, 7038 path->slots[level]); 7039 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7040 if (ret < 0) 7041 goto skip; 7042 7043 wc->stage = UPDATE_BACKREF; 7044 wc->shared_level = level - 1; 7045 } 7046 } else { 7047 if (level == 1 && 7048 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7049 goto skip; 7050 } 7051 7052 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7053 btrfs_tree_unlock(next); 7054 free_extent_buffer(next); 7055 next = NULL; 7056 *lookup_info = 1; 7057 } 7058 7059 if (!next) { 7060 if (reada && level == 1) 7061 reada_walk_down(trans, root, wc, path); 7062 next = read_tree_block(root, bytenr, blocksize, generation); 7063 if (!next || !extent_buffer_uptodate(next)) { 7064 free_extent_buffer(next); 7065 return -EIO; 7066 } 7067 btrfs_tree_lock(next); 7068 btrfs_set_lock_blocking(next); 7069 } 7070 7071 level--; 7072 BUG_ON(level != btrfs_header_level(next)); 7073 path->nodes[level] = next; 7074 path->slots[level] = 0; 7075 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7076 wc->level = level; 7077 if (wc->level == 1) 7078 wc->reada_slot = 0; 7079 return 0; 7080 skip: 7081 wc->refs[level - 1] = 0; 7082 wc->flags[level - 1] = 0; 7083 if (wc->stage == DROP_REFERENCE) { 7084 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7085 parent = path->nodes[level]->start; 7086 } else { 7087 
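			/*
			 * No FULL_BACKREF flag on the parent: the reference
			 * being dropped is keyed by the owning root rather
			 * than by the parent bytenr, so parent stays 0 and
			 * the block must be owned by this root (hence the
			 * BUG_ON below).
			 */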
BUG_ON(root->root_key.objectid != 7088 btrfs_header_owner(path->nodes[level])); 7089 parent = 0; 7090 } 7091 7092 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7093 root->root_key.objectid, level - 1, 0, 0); 7094 BUG_ON(ret); /* -ENOMEM */ 7095 } 7096 btrfs_tree_unlock(next); 7097 free_extent_buffer(next); 7098 *lookup_info = 1; 7099 return 1; 7100 } 7101 7102 /* 7103 * helper to process tree block while walking up the tree. 7104 * 7105 * when wc->stage == DROP_REFERENCE, this function drops 7106 * reference count on the block. 7107 * 7108 * when wc->stage == UPDATE_BACKREF, this function changes 7109 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7110 * to UPDATE_BACKREF previously while processing the block. 7111 * 7112 * NOTE: return value 1 means we should stop walking up. 7113 */ 7114 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7115 struct btrfs_root *root, 7116 struct btrfs_path *path, 7117 struct walk_control *wc) 7118 { 7119 int ret; 7120 int level = wc->level; 7121 struct extent_buffer *eb = path->nodes[level]; 7122 u64 parent = 0; 7123 7124 if (wc->stage == UPDATE_BACKREF) { 7125 BUG_ON(wc->shared_level < level); 7126 if (level < wc->shared_level) 7127 goto out; 7128 7129 ret = find_next_key(path, level + 1, &wc->update_progress); 7130 if (ret > 0) 7131 wc->update_ref = 0; 7132 7133 wc->stage = DROP_REFERENCE; 7134 wc->shared_level = -1; 7135 path->slots[level] = 0; 7136 7137 /* 7138 * check reference count again if the block isn't locked. 7139 * we should start walking down the tree again if reference 7140 * count is one. 7141 */ 7142 if (!path->locks[level]) { 7143 BUG_ON(level == 0); 7144 btrfs_tree_lock(eb); 7145 btrfs_set_lock_blocking(eb); 7146 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7147 7148 ret = btrfs_lookup_extent_info(trans, root, 7149 eb->start, level, 1, 7150 &wc->refs[level], 7151 &wc->flags[level]); 7152 if (ret < 0) { 7153 btrfs_tree_unlock_rw(eb, path->locks[level]); 7154 path->locks[level] = 0; 7155 return ret; 7156 } 7157 BUG_ON(wc->refs[level] == 0); 7158 if (wc->refs[level] == 1) { 7159 btrfs_tree_unlock_rw(eb, path->locks[level]); 7160 path->locks[level] = 0; 7161 return 1; 7162 } 7163 } 7164 } 7165 7166 /* wc->stage == DROP_REFERENCE */ 7167 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7168 7169 if (wc->refs[level] == 1) { 7170 if (level == 0) { 7171 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7172 ret = btrfs_dec_ref(trans, root, eb, 1, 7173 wc->for_reloc); 7174 else 7175 ret = btrfs_dec_ref(trans, root, eb, 0, 7176 wc->for_reloc); 7177 BUG_ON(ret); /* -ENOMEM */ 7178 } 7179 /* make block locked assertion in clean_tree_block happy */ 7180 if (!path->locks[level] && 7181 btrfs_header_generation(eb) == trans->transid) { 7182 btrfs_tree_lock(eb); 7183 btrfs_set_lock_blocking(eb); 7184 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7185 } 7186 clean_tree_block(trans, root, eb); 7187 } 7188 7189 if (eb == root->node) { 7190 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7191 parent = eb->start; 7192 else 7193 BUG_ON(root->root_key.objectid != 7194 btrfs_header_owner(eb)); 7195 } else { 7196 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7197 parent = path->nodes[level + 1]->start; 7198 else 7199 BUG_ON(root->root_key.objectid != 7200 btrfs_header_owner(path->nodes[level + 1])); 7201 } 7202 7203 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 7204 out: 7205 wc->refs[level] = 0; 7206 wc->flags[level] = 0; 7207 return 0; 7208 } 7209 7210 static 
noinline int walk_down_tree(struct btrfs_trans_handle *trans, 7211 struct btrfs_root *root, 7212 struct btrfs_path *path, 7213 struct walk_control *wc) 7214 { 7215 int level = wc->level; 7216 int lookup_info = 1; 7217 int ret; 7218 7219 while (level >= 0) { 7220 ret = walk_down_proc(trans, root, path, wc, lookup_info); 7221 if (ret > 0) 7222 break; 7223 7224 if (level == 0) 7225 break; 7226 7227 if (path->slots[level] >= 7228 btrfs_header_nritems(path->nodes[level])) 7229 break; 7230 7231 ret = do_walk_down(trans, root, path, wc, &lookup_info); 7232 if (ret > 0) { 7233 path->slots[level]++; 7234 continue; 7235 } else if (ret < 0) 7236 return ret; 7237 level = wc->level; 7238 } 7239 return 0; 7240 } 7241 7242 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 7243 struct btrfs_root *root, 7244 struct btrfs_path *path, 7245 struct walk_control *wc, int max_level) 7246 { 7247 int level = wc->level; 7248 int ret; 7249 7250 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 7251 while (level < max_level && path->nodes[level]) { 7252 wc->level = level; 7253 if (path->slots[level] + 1 < 7254 btrfs_header_nritems(path->nodes[level])) { 7255 path->slots[level]++; 7256 return 0; 7257 } else { 7258 ret = walk_up_proc(trans, root, path, wc); 7259 if (ret > 0) 7260 return 0; 7261 7262 if (path->locks[level]) { 7263 btrfs_tree_unlock_rw(path->nodes[level], 7264 path->locks[level]); 7265 path->locks[level] = 0; 7266 } 7267 free_extent_buffer(path->nodes[level]); 7268 path->nodes[level] = NULL; 7269 level++; 7270 } 7271 } 7272 return 1; 7273 } 7274 7275 /* 7276 * drop a subvolume tree. 7277 * 7278 * this function traverses the tree freeing any blocks that only 7279 * referenced by the tree. 7280 * 7281 * when a shared tree block is found. this function decreases its 7282 * reference count by one. if update_ref is true, this function 7283 * also make sure backrefs for the shared block and all lower level 7284 * blocks are properly updated. 
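 *
 * The walk is also restartable: whenever the transaction is about to end,
 * the current position is recorded in root_item->drop_progress and
 * drop_level (see below), and a later call resumes from that key instead
 * of starting over at the root.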
7285 * 7286 * If called with for_reloc == 0, may exit early with -EAGAIN 7287 */ 7288 int btrfs_drop_snapshot(struct btrfs_root *root, 7289 struct btrfs_block_rsv *block_rsv, int update_ref, 7290 int for_reloc) 7291 { 7292 struct btrfs_path *path; 7293 struct btrfs_trans_handle *trans; 7294 struct btrfs_root *tree_root = root->fs_info->tree_root; 7295 struct btrfs_root_item *root_item = &root->root_item; 7296 struct walk_control *wc; 7297 struct btrfs_key key; 7298 int err = 0; 7299 int ret; 7300 int level; 7301 7302 path = btrfs_alloc_path(); 7303 if (!path) { 7304 err = -ENOMEM; 7305 goto out; 7306 } 7307 7308 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7309 if (!wc) { 7310 btrfs_free_path(path); 7311 err = -ENOMEM; 7312 goto out; 7313 } 7314 7315 trans = btrfs_start_transaction(tree_root, 0); 7316 if (IS_ERR(trans)) { 7317 err = PTR_ERR(trans); 7318 goto out_free; 7319 } 7320 7321 if (block_rsv) 7322 trans->block_rsv = block_rsv; 7323 7324 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 7325 level = btrfs_header_level(root->node); 7326 path->nodes[level] = btrfs_lock_root_node(root); 7327 btrfs_set_lock_blocking(path->nodes[level]); 7328 path->slots[level] = 0; 7329 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7330 memset(&wc->update_progress, 0, 7331 sizeof(wc->update_progress)); 7332 } else { 7333 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 7334 memcpy(&wc->update_progress, &key, 7335 sizeof(wc->update_progress)); 7336 7337 level = root_item->drop_level; 7338 BUG_ON(level == 0); 7339 path->lowest_level = level; 7340 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7341 path->lowest_level = 0; 7342 if (ret < 0) { 7343 err = ret; 7344 goto out_end_trans; 7345 } 7346 WARN_ON(ret > 0); 7347 7348 /* 7349 * unlock our path, this is safe because only this 7350 * function is allowed to delete this snapshot 7351 */ 7352 btrfs_unlock_up_safe(path, 0); 7353 7354 level = btrfs_header_level(root->node); 7355 while (1) { 7356 btrfs_tree_lock(path->nodes[level]); 7357 btrfs_set_lock_blocking(path->nodes[level]); 7358 7359 ret = btrfs_lookup_extent_info(trans, root, 7360 path->nodes[level]->start, 7361 level, 1, &wc->refs[level], 7362 &wc->flags[level]); 7363 if (ret < 0) { 7364 err = ret; 7365 goto out_end_trans; 7366 } 7367 BUG_ON(wc->refs[level] == 0); 7368 7369 if (level == root_item->drop_level) 7370 break; 7371 7372 btrfs_tree_unlock(path->nodes[level]); 7373 WARN_ON(wc->refs[level] != 1); 7374 level--; 7375 } 7376 } 7377 7378 wc->level = level; 7379 wc->shared_level = -1; 7380 wc->stage = DROP_REFERENCE; 7381 wc->update_ref = update_ref; 7382 wc->keep_locks = 0; 7383 wc->for_reloc = for_reloc; 7384 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7385 7386 while (1) { 7387 if (!for_reloc && btrfs_fs_closing(root->fs_info)) { 7388 pr_debug("btrfs: drop snapshot early exit\n"); 7389 err = -EAGAIN; 7390 goto out_end_trans; 7391 } 7392 7393 ret = walk_down_tree(trans, root, path, wc); 7394 if (ret < 0) { 7395 err = ret; 7396 break; 7397 } 7398 7399 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 7400 if (ret < 0) { 7401 err = ret; 7402 break; 7403 } 7404 7405 if (ret > 0) { 7406 BUG_ON(wc->stage != DROP_REFERENCE); 7407 break; 7408 } 7409 7410 if (wc->stage == DROP_REFERENCE) { 7411 level = wc->level; 7412 btrfs_node_key(path->nodes[level], 7413 &root_item->drop_progress, 7414 path->slots[level]); 7415 root_item->drop_level = level; 7416 } 7417 7418 BUG_ON(wc->level == 0); 7419 if (btrfs_should_end_transaction(trans, tree_root)) { 7420 ret = 
btrfs_update_root(trans, tree_root, 7421 &root->root_key, 7422 root_item); 7423 if (ret) { 7424 btrfs_abort_transaction(trans, tree_root, ret); 7425 err = ret; 7426 goto out_end_trans; 7427 } 7428 7429 btrfs_end_transaction_throttle(trans, tree_root); 7430 trans = btrfs_start_transaction(tree_root, 0); 7431 if (IS_ERR(trans)) { 7432 err = PTR_ERR(trans); 7433 goto out_free; 7434 } 7435 if (block_rsv) 7436 trans->block_rsv = block_rsv; 7437 } 7438 } 7439 btrfs_release_path(path); 7440 if (err) 7441 goto out_end_trans; 7442 7443 ret = btrfs_del_root(trans, tree_root, &root->root_key); 7444 if (ret) { 7445 btrfs_abort_transaction(trans, tree_root, ret); 7446 goto out_end_trans; 7447 } 7448 7449 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7450 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7451 NULL, NULL); 7452 if (ret < 0) { 7453 btrfs_abort_transaction(trans, tree_root, ret); 7454 err = ret; 7455 goto out_end_trans; 7456 } else if (ret > 0) { 7457 /* if we fail to delete the orphan item this time 7458 * around, it'll get picked up the next time. 7459 * 7460 * The most common failure here is just -ENOENT. 7461 */ 7462 btrfs_del_orphan_item(trans, tree_root, 7463 root->root_key.objectid); 7464 } 7465 } 7466 7467 if (root->in_radix) { 7468 btrfs_free_fs_root(tree_root->fs_info, root); 7469 } else { 7470 free_extent_buffer(root->node); 7471 free_extent_buffer(root->commit_root); 7472 kfree(root); 7473 } 7474 out_end_trans: 7475 btrfs_end_transaction_throttle(trans, tree_root); 7476 out_free: 7477 kfree(wc); 7478 btrfs_free_path(path); 7479 out: 7480 if (err) 7481 btrfs_std_error(root->fs_info, err); 7482 return err; 7483 } 7484 7485 /* 7486 * drop subtree rooted at tree block 'node'. 7487 * 7488 * NOTE: this function will unlock and release tree block 'node' 7489 * only used by relocation code 7490 */ 7491 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 7492 struct btrfs_root *root, 7493 struct extent_buffer *node, 7494 struct extent_buffer *parent) 7495 { 7496 struct btrfs_path *path; 7497 struct walk_control *wc; 7498 int level; 7499 int parent_level; 7500 int ret = 0; 7501 int wret; 7502 7503 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 7504 7505 path = btrfs_alloc_path(); 7506 if (!path) 7507 return -ENOMEM; 7508 7509 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7510 if (!wc) { 7511 btrfs_free_path(path); 7512 return -ENOMEM; 7513 } 7514 7515 btrfs_assert_tree_locked(parent); 7516 parent_level = btrfs_header_level(parent); 7517 extent_buffer_get(parent); 7518 path->nodes[parent_level] = parent; 7519 path->slots[parent_level] = btrfs_header_nritems(parent); 7520 7521 btrfs_assert_tree_locked(node); 7522 level = btrfs_header_level(node); 7523 path->nodes[level] = node; 7524 path->slots[level] = 0; 7525 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7526 7527 wc->refs[parent_level] = 1; 7528 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7529 wc->level = level; 7530 wc->shared_level = -1; 7531 wc->stage = DROP_REFERENCE; 7532 wc->update_ref = 0; 7533 wc->keep_locks = 1; 7534 wc->for_reloc = 1; 7535 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7536 7537 while (1) { 7538 wret = walk_down_tree(trans, root, path, wc); 7539 if (wret < 0) { 7540 ret = wret; 7541 break; 7542 } 7543 7544 wret = walk_up_tree(trans, root, path, wc, parent_level); 7545 if (wret < 0) 7546 ret = wret; 7547 if (wret != 0) 7548 break; 7549 } 7550 7551 kfree(wc); 7552 btrfs_free_path(path); 7553 return ret; 7554 } 7555 7556 static u64 
update_block_group_flags(struct btrfs_root *root, u64 flags) 7557 { 7558 u64 num_devices; 7559 u64 stripped; 7560 7561 /* 7562 * if restripe for this chunk_type is on pick target profile and 7563 * return, otherwise do the usual balance 7564 */ 7565 stripped = get_restripe_target(root->fs_info, flags); 7566 if (stripped) 7567 return extended_to_chunk(stripped); 7568 7569 /* 7570 * we add in the count of missing devices because we want 7571 * to make sure that any RAID levels on a degraded FS 7572 * continue to be honored. 7573 */ 7574 num_devices = root->fs_info->fs_devices->rw_devices + 7575 root->fs_info->fs_devices->missing_devices; 7576 7577 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7578 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7579 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7580 7581 if (num_devices == 1) { 7582 stripped |= BTRFS_BLOCK_GROUP_DUP; 7583 stripped = flags & ~stripped; 7584 7585 /* turn raid0 into single device chunks */ 7586 if (flags & BTRFS_BLOCK_GROUP_RAID0) 7587 return stripped; 7588 7589 /* turn mirroring into duplication */ 7590 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 7591 BTRFS_BLOCK_GROUP_RAID10)) 7592 return stripped | BTRFS_BLOCK_GROUP_DUP; 7593 } else { 7594 /* they already had raid on here, just return */ 7595 if (flags & stripped) 7596 return flags; 7597 7598 stripped |= BTRFS_BLOCK_GROUP_DUP; 7599 stripped = flags & ~stripped; 7600 7601 /* switch duplicated blocks with raid1 */ 7602 if (flags & BTRFS_BLOCK_GROUP_DUP) 7603 return stripped | BTRFS_BLOCK_GROUP_RAID1; 7604 7605 /* this is drive concat, leave it alone */ 7606 } 7607 7608 return flags; 7609 } 7610 7611 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 7612 { 7613 struct btrfs_space_info *sinfo = cache->space_info; 7614 u64 num_bytes; 7615 u64 min_allocable_bytes; 7616 int ret = -ENOSPC; 7617 7618 7619 /* 7620 * We need some metadata space and system metadata space for 7621 * allocating chunks in some corner cases until we force to set 7622 * it to be readonly. 
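 *
 * In practice that means a metadata or system space_info keeps 1MB
 * allocatable (min_allocable_bytes) unless 'force' is set, and the block
 * group is only flipped to read-only when the accounted bytes plus that
 * slack still fit within sinfo->total_bytes.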
7623 */ 7624 if ((sinfo->flags & 7625 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 7626 !force) 7627 min_allocable_bytes = 1 * 1024 * 1024; 7628 else 7629 min_allocable_bytes = 0; 7630 7631 spin_lock(&sinfo->lock); 7632 spin_lock(&cache->lock); 7633 7634 if (cache->ro) { 7635 ret = 0; 7636 goto out; 7637 } 7638 7639 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 7640 cache->bytes_super - btrfs_block_group_used(&cache->item); 7641 7642 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 7643 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 7644 min_allocable_bytes <= sinfo->total_bytes) { 7645 sinfo->bytes_readonly += num_bytes; 7646 cache->ro = 1; 7647 ret = 0; 7648 } 7649 out: 7650 spin_unlock(&cache->lock); 7651 spin_unlock(&sinfo->lock); 7652 return ret; 7653 } 7654 7655 int btrfs_set_block_group_ro(struct btrfs_root *root, 7656 struct btrfs_block_group_cache *cache) 7657 7658 { 7659 struct btrfs_trans_handle *trans; 7660 u64 alloc_flags; 7661 int ret; 7662 7663 BUG_ON(cache->ro); 7664 7665 trans = btrfs_join_transaction(root); 7666 if (IS_ERR(trans)) 7667 return PTR_ERR(trans); 7668 7669 alloc_flags = update_block_group_flags(root, cache->flags); 7670 if (alloc_flags != cache->flags) { 7671 ret = do_chunk_alloc(trans, root, alloc_flags, 7672 CHUNK_ALLOC_FORCE); 7673 if (ret < 0) 7674 goto out; 7675 } 7676 7677 ret = set_block_group_ro(cache, 0); 7678 if (!ret) 7679 goto out; 7680 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 7681 ret = do_chunk_alloc(trans, root, alloc_flags, 7682 CHUNK_ALLOC_FORCE); 7683 if (ret < 0) 7684 goto out; 7685 ret = set_block_group_ro(cache, 0); 7686 out: 7687 btrfs_end_transaction(trans, root); 7688 return ret; 7689 } 7690 7691 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 7692 struct btrfs_root *root, u64 type) 7693 { 7694 u64 alloc_flags = get_alloc_profile(root, type); 7695 return do_chunk_alloc(trans, root, alloc_flags, 7696 CHUNK_ALLOC_FORCE); 7697 } 7698 7699 /* 7700 * helper to account the unused space of all the readonly block group in the 7701 * list. takes mirrors into account. 7702 */ 7703 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 7704 { 7705 struct btrfs_block_group_cache *block_group; 7706 u64 free_bytes = 0; 7707 int factor; 7708 7709 list_for_each_entry(block_group, groups_list, list) { 7710 spin_lock(&block_group->lock); 7711 7712 if (!block_group->ro) { 7713 spin_unlock(&block_group->lock); 7714 continue; 7715 } 7716 7717 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 7718 BTRFS_BLOCK_GROUP_RAID10 | 7719 BTRFS_BLOCK_GROUP_DUP)) 7720 factor = 2; 7721 else 7722 factor = 1; 7723 7724 free_bytes += (block_group->key.offset - 7725 btrfs_block_group_used(&block_group->item)) * 7726 factor; 7727 7728 spin_unlock(&block_group->lock); 7729 } 7730 7731 return free_bytes; 7732 } 7733 7734 /* 7735 * helper to account the unused space of all the readonly block group in the 7736 * space_info. takes mirrors into account. 
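 *
 * "Takes mirrors into account" means the per-list helper above multiplies
 * the unused bytes of RAID1/RAID10/DUP groups by two; e.g. a read-only 1GB
 * RAID1 block group with 256MB used contributes (1GB - 256MB) * 2 to the
 * returned total.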

int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 type)
{
	u64 alloc_flags = get_alloc_profile(root, type);
	return do_chunk_alloc(trans, root, alloc_flags,
			      CHUNK_ALLOC_FORCE);
}

/*
 * Helper to account the unused space of all the read-only block groups
 * in the list.  Takes mirrors into account.
 */
static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
{
	struct btrfs_block_group_cache *block_group;
	u64 free_bytes = 0;
	int factor;

	list_for_each_entry(block_group, groups_list, list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
					  BTRFS_BLOCK_GROUP_RAID10 |
					  BTRFS_BLOCK_GROUP_DUP))
			factor = 2;
		else
			factor = 1;

		free_bytes += (block_group->key.offset -
			       btrfs_block_group_used(&block_group->item)) *
			       factor;

		spin_unlock(&block_group->lock);
	}

	return free_bytes;
}

/*
 * Helper to account the unused space of all the read-only block groups
 * in the space_info.  Takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	int i;
	u64 free_bytes = 0;

	spin_lock(&sinfo->lock);

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		if (!list_empty(&sinfo->block_groups[i]))
			free_bytes += __btrfs_get_ro_block_group_free_space(
						&sinfo->block_groups[i]);

	spin_unlock(&sinfo->lock);

	return free_bytes;
}

void btrfs_set_block_group_rw(struct btrfs_root *root,
			      struct btrfs_block_group_cache *cache)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;

	BUG_ON(!cache->ro);

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);
	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
		    cache->bytes_super - btrfs_block_group_used(&cache->item);
	sinfo->bytes_readonly -= num_bytes;
	cache->ro = 0;
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
}
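
/*
 * Worked example for the read-only accounting above (illustrative numbers
 * only): a 1GiB RAID1 block group with 400MiB used contributes
 * (1GiB - 400MiB) * 2 = 1248MiB of raw free space, because every byte in a
 * RAID1/RAID10/DUP group occupies two bytes on disk.  A RAID0 or single
 * group with the same size and usage contributes just 624MiB.
 */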

/*
 * Checks to see if it's even possible to relocate this block group.
 *
 * @return - -1 if it's not a good idea to relocate this block group, 0 if
 * it's ok to go ahead and try.
 */
int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_device *device;
	u64 min_free;
	u64 dev_min = 1;
	u64 dev_nr = 0;
	u64 target;
	int index;
	int full = 0;
	int ret = 0;

	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);

	/* odd, couldn't find the block group, leave it alone */
	if (!block_group)
		return -1;

	min_free = btrfs_block_group_used(&block_group->item);

	/* no bytes used, we're good */
	if (!min_free)
		goto out;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);

	full = space_info->full;

	/*
	 * If this is the last block group we have in this space, we can't
	 * relocate it unless we're able to allocate a new chunk below.
	 *
	 * Otherwise, we need to make sure we have room in the space to handle
	 * all of the extents from this block group.  If we can, we're good.
	 */
	if ((space_info->total_bytes != block_group->key.offset) &&
	    (space_info->bytes_used + space_info->bytes_reserved +
	     space_info->bytes_pinned + space_info->bytes_readonly +
	     min_free < space_info->total_bytes)) {
		spin_unlock(&space_info->lock);
		goto out;
	}
	spin_unlock(&space_info->lock);

	/*
	 * Ok, we don't have enough space, but maybe we have free space on our
	 * devices to allocate new chunks for relocation, so loop through our
	 * alloc devices and guess if we have enough space.  If this block
	 * group is going to be restriped, run checks against the target
	 * profile instead of the current one.
	 */
	ret = -1;

	/*
	 * index:
	 *	0: raid10
	 *	1: raid1
	 *	2: dup
	 *	3: raid0
	 *	4: single
	 */
	target = get_restripe_target(root->fs_info, block_group->flags);
	if (target) {
		index = __get_raid_index(extended_to_chunk(target));
	} else {
		/*
		 * this is just a balance, so if we were marked as full
		 * we know there is no space for a new chunk
		 */
		if (full)
			goto out;

		index = get_block_group_index(block_group);
	}

	if (index == BTRFS_RAID_RAID10) {
		dev_min = 4;
		/* Divide by 2 */
		min_free >>= 1;
	} else if (index == BTRFS_RAID_RAID1) {
		dev_min = 2;
	} else if (index == BTRFS_RAID_DUP) {
		/* Multiply by 2 */
		min_free <<= 1;
	} else if (index == BTRFS_RAID_RAID0) {
		dev_min = fs_devices->rw_devices;
		do_div(min_free, dev_min);
	}

	mutex_lock(&root->fs_info->chunk_mutex);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 dev_offset;

		/*
		 * check to make sure we can actually find a chunk with enough
		 * space to fit our block group in.
		 */
		if (device->total_bytes > device->bytes_used + min_free &&
		    !device->is_tgtdev_for_dev_replace) {
			ret = find_free_dev_extent(device, min_free,
						   &dev_offset, NULL);
			if (!ret)
				dev_nr++;

			if (dev_nr >= dev_min)
				break;

			ret = -1;
		}
	}
	mutex_unlock(&root->fs_info->chunk_mutex);
out:
	btrfs_put_block_group(block_group);
	return ret;
}
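
/*
 * Illustrative numbers for the per-profile check above (not from the
 * original source): relocating a block group with 600MiB of used data
 * requires, roughly,
 *
 *	RAID10:	4 devices with ~300MiB of free space each
 *	RAID1:	2 devices with ~600MiB each
 *	DUP:	1 device with ~1200MiB
 *	RAID0:	rw_devices devices, each with 600MiB / rw_devices
 *	single:	1 device with ~600MiB
 *
 * find_free_dev_extent() is then asked for a hole of that size on each
 * allocatable device until dev_min devices qualify.
 */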

static int find_first_block_group(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{
	int ret = 0;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			ret = 0;
			goto out;
		}
		path->slots[0]++;
	}
out:
	return ret;
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
		while (block_group) {
			spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = next_block_group(info->tree_root,
						       block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}

		inode = block_group->inode;
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
		last = block_group->key.objectid + block_group->key.offset;
		btrfs_put_block_group(block_group);
	}
}
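
/*
 * Note on the key layout assumed by the lookups above (summarised here for
 * clarity, not in the original source): a block group item uses
 * key.objectid = logical start of the chunk, key.type =
 * BTRFS_BLOCK_GROUP_ITEM_KEY and key.offset = length of the chunk.  A caller
 * walking every block group therefore advances its search key with
 * key.objectid = found_key.objectid + found_key.offset, which is exactly
 * what btrfs_read_block_groups() below does.
 */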

int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	down_write(&info->extent_commit_sem);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		put_caching_control(caching_ctl);
	}
	up_write(&info->extent_commit_sem);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
		spin_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		if (block_group->cached == BTRFS_CACHE_STARTED)
			wait_block_group_cache_done(block_group);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO)
			free_excluded_extents(info->extent_root, block_group);

		btrfs_remove_free_space_cache(block_group);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/*
	 * Now that all the block groups are freed, go through and free all
	 * the space_info structs.  This is only called during the final
	 * stages of unmount, and so we know nobody is using them.  We call
	 * synchronize_rcu() once before we start, just to be on the safe
	 * side.
	 */
	synchronize_rcu();

	release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);
		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
			if (space_info->bytes_pinned > 0 ||
			    space_info->bytes_reserved > 0 ||
			    space_info->bytes_may_use > 0) {
				WARN_ON(1);
				dump_space_info(space_info, 0, 0);
			}
		}
		list_del(&space_info->list);
		kfree(space_info);
	}
	return 0;
}

static void __link_block_group(struct btrfs_space_info *space_info,
			       struct btrfs_block_group_cache *cache)
{
	int index = get_block_group_index(cache);

	down_write(&space_info->groups_sem);
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);
}
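
/*
 * The index used by __link_block_group() selects one of the per-profile
 * lists in space_info->block_groups[].  Assuming the same mapping that is
 * documented in btrfs_can_relocate() above (0 raid10, 1 raid1, 2 dup,
 * 3 raid0, 4 single), the literal indices 3 and 4 used near the end of
 * btrfs_read_block_groups() below walk the RAID0 and single (un-mirrored)
 * lists.
 */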

int btrfs_read_block_groups(struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_block_group_cache *cache;
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int need_clear = 0;
	u64 cache_gen;

	root = info->extent_root;
	key.objectid = 0;
	key.offset = 0;
	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 1;

	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
	if (btrfs_test_opt(root, SPACE_CACHE) &&
	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
		need_clear = 1;
	if (btrfs_test_opt(root, CLEAR_CACHE))
		need_clear = 1;

	while (1) {
		ret = find_first_block_group(root, path, &key);
		if (ret > 0)
			break;
		if (ret != 0)
			goto error;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		cache = kzalloc(sizeof(*cache), GFP_NOFS);
		if (!cache) {
			ret = -ENOMEM;
			goto error;
		}
		cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
						GFP_NOFS);
		if (!cache->free_space_ctl) {
			kfree(cache);
			ret = -ENOMEM;
			goto error;
		}

		atomic_set(&cache->count, 1);
		spin_lock_init(&cache->lock);
		cache->fs_info = info;
		INIT_LIST_HEAD(&cache->list);
		INIT_LIST_HEAD(&cache->cluster_list);

		if (need_clear) {
			/*
			 * When we mount with an old space cache, we need to
			 * set BTRFS_DC_CLEAR and set the dirty flag.
			 *
			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
			 *    truncate the old free space cache inode and
			 *    setup a new one.
			 * b) Setting the 'dirty flag' makes sure that we
			 *    flush the new space cache info onto disk.
			 */
			cache->disk_cache_state = BTRFS_DC_CLEAR;
			if (btrfs_test_opt(root, SPACE_CACHE))
				cache->dirty = 1;
		}

		read_extent_buffer(leaf, &cache->item,
				   btrfs_item_ptr_offset(leaf, path->slots[0]),
				   sizeof(cache->item));
		memcpy(&cache->key, &found_key, sizeof(found_key));

		key.objectid = found_key.objectid + found_key.offset;
		btrfs_release_path(path);
		cache->flags = btrfs_block_group_flags(&cache->item);
		cache->sectorsize = root->sectorsize;
		cache->full_stripe_len = btrfs_full_stripe_len(root,
					       &root->fs_info->mapping_tree,
					       found_key.objectid);
		btrfs_init_free_space_ctl(cache);

		/*
		 * We need to exclude the super stripes now so that the space
		 * info has super bytes accounted for, otherwise we'll think
		 * we have more space than we actually do.
		 */
		ret = exclude_super_stripes(root, cache);
		if (ret) {
			/*
			 * We may have excluded something, so call this just
			 * in case.
			 */
			free_excluded_extents(root, cache);
			kfree(cache->free_space_ctl);
			kfree(cache);
			goto error;
		}

		/*
		 * Check for two cases: either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space, or we are empty, and we can just add all
		 * the space in and be done with it.  This saves us a lot of
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			free_excluded_extents(root, cache);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, root->fs_info,
					   found_key.objectid,
					   found_key.objectid +
					   found_key.offset);
			free_excluded_extents(root, cache);
		}

		ret = btrfs_add_block_group_cache(root->fs_info, cache);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			btrfs_put_block_group(cache);
			goto error;
		}

		ret = update_space_info(info, cache->flags, found_key.offset,
					btrfs_block_group_used(&cache->item),
					&space_info);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			spin_lock(&info->block_group_cache_lock);
			rb_erase(&cache->cache_node,
				 &info->block_group_cache_tree);
			spin_unlock(&info->block_group_cache_lock);
			btrfs_put_block_group(cache);
			goto error;
		}

		cache->space_info = space_info;
		spin_lock(&cache->space_info->lock);
		cache->space_info->bytes_readonly += cache->bytes_super;
		spin_unlock(&cache->space_info->lock);

		__link_block_group(space_info, cache);

		set_avail_alloc_bits(root->fs_info, cache->flags);
		if (btrfs_chunk_readonly(root, cache->key.objectid))
			set_block_group_ro(cache, 1);
	}

	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
		if (!(get_alloc_profile(root, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_RAID5 |
		       BTRFS_BLOCK_GROUP_RAID6 |
		       BTRFS_BLOCK_GROUP_DUP)))
			continue;
		/*
		 * Avoid allocating from un-mirrored block groups if there
		 * are mirrored block groups.
		 */
		list_for_each_entry(cache, &space_info->block_groups[3], list)
			set_block_group_ro(cache, 1);
		list_for_each_entry(cache, &space_info->block_groups[4], list)
			set_block_group_ro(cache, 1);
	}

	init_global_block_rsv(info);
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}
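
/*
 * Example of the need_clear logic above (informal summary, not in the
 * original source): with "-o space_cache", a super block whose
 * cache_generation does not match its generation means the on-disk free
 * space cache is stale, so every block group is marked BTRFS_DC_CLEAR and
 * dirtied to force a rewrite; "-o clear_cache" requests the same thing
 * unconditionally.
 */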

void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
{
	struct btrfs_block_group_cache *block_group, *tmp;
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	struct btrfs_block_group_item item;
	struct btrfs_key key;
	int ret = 0;

	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
				 new_bg_list) {
		list_del_init(&block_group->new_bg_list);

		if (ret)
			continue;

		spin_lock(&block_group->lock);
		memcpy(&item, &block_group->item, sizeof(item));
		memcpy(&key, &block_group->key, sizeof(key));
		spin_unlock(&block_group->lock);

		ret = btrfs_insert_item(trans, extent_root, &key, &item,
					sizeof(item));
		if (ret)
			btrfs_abort_transaction(trans, extent_root, ret);
	}
}

int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, u64 bytes_used,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size)
{
	int ret;
	struct btrfs_root *extent_root;
	struct btrfs_block_group_cache *cache;

	extent_root = root->fs_info->extent_root;

	root->fs_info->last_trans_log_full_commit = trans->transid;

	cache = kzalloc(sizeof(*cache), GFP_NOFS);
	if (!cache)
		return -ENOMEM;
	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
					GFP_NOFS);
	if (!cache->free_space_ctl) {
		kfree(cache);
		return -ENOMEM;
	}

	cache->key.objectid = chunk_offset;
	cache->key.offset = size;
	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	cache->sectorsize = root->sectorsize;
	cache->fs_info = root->fs_info;
	cache->full_stripe_len = btrfs_full_stripe_len(root,
					       &root->fs_info->mapping_tree,
					       chunk_offset);

	atomic_set(&cache->count, 1);
	spin_lock_init(&cache->lock);
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);
	INIT_LIST_HEAD(&cache->new_bg_list);

	btrfs_init_free_space_ctl(cache);

	btrfs_set_block_group_used(&cache->item, bytes_used);
	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
	cache->flags = type;
	btrfs_set_block_group_flags(&cache->item, type);

	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	ret = exclude_super_stripes(root, cache);
	if (ret) {
		/*
		 * We may have excluded something, so call this just in
		 * case.
		 */
		free_excluded_extents(root, cache);
		kfree(cache->free_space_ctl);
		kfree(cache);
		return ret;
	}

	add_new_free_space(cache, root->fs_info, chunk_offset,
			   chunk_offset + size);

	free_excluded_extents(root, cache);

	ret = btrfs_add_block_group_cache(root->fs_info, cache);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
				&cache->space_info);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		spin_lock(&root->fs_info->block_group_cache_lock);
		rb_erase(&cache->cache_node,
			 &root->fs_info->block_group_cache_tree);
		spin_unlock(&root->fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return ret;
	}
	update_global_block_rsv(root->fs_info);

	spin_lock(&cache->space_info->lock);
	cache->space_info->bytes_readonly += cache->bytes_super;
	spin_unlock(&cache->space_info->lock);

	__link_block_group(cache->space_info, cache);

	list_add_tail(&cache->new_bg_list, &trans->new_bgs);

	set_avail_alloc_bits(extent_root->fs_info, type);

	return 0;
}
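
/*
 * Flow note (summary, not in the original source): btrfs_make_block_group()
 * only sets up the in-memory cache and queues the new group on
 * trans->new_bgs; the BTRFS_BLOCK_GROUP_ITEM itself is inserted into the
 * extent tree later, when btrfs_create_pending_block_groups() above is run
 * for that transaction.  A hypothetical call site would look roughly like
 * the sketch below (identifiers are placeholders).
 */
#if 0
	/* inside a transaction, right after carving out a new chunk: */
	ret = btrfs_make_block_group(trans, root, 0, type,
				     chunk_objectid, chunk_offset, size);
	/* ... later, when the transaction's pending work is flushed: */
	btrfs_create_pending_block_groups(trans, root);
#endif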

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}
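
/*
 * Usage note (informal, not in the original source): this is the
 * counterpart of set_avail_alloc_bits(), which is called during mount and
 * block group creation above.  btrfs_remove_block_group() below calls it
 * once the last block group of a given profile disappears from its
 * space_info list, so the per-type avail_*_alloc_bits stop advertising a
 * RAID profile that no longer has any backing block groups.
 */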

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 group_start)
{
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	struct btrfs_key key;
	struct inode *inode;
	int ret;
	int index;
	int factor;

	root = root->fs_info->extent_root;

	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	free_excluded_extents(root, block_group);

	memcpy(&key, &block_group->key, sizeof(key));
	index = get_block_group_index(block_group);
	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
				  BTRFS_BLOCK_GROUP_RAID1 |
				  BTRFS_BLOCK_GROUP_RAID10))
		factor = 2;
	else
		factor = 1;

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &root->fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &root->fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(tree_root, block_group, path);
	if (!IS_ERR(inode)) {
		ret = btrfs_orphan_add(trans, inode);
		if (ret) {
			btrfs_add_delayed_iput(inode);
			goto out;
		}
		clear_nlink(inode);
		/* One for the block group's ref */
		spin_lock(&block_group->lock);
		if (block_group->iref) {
			block_group->iref = 0;
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		/* One for our lookup ref */
		btrfs_add_delayed_iput(inode);
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0)
		btrfs_release_path(path);
	if (ret == 0) {
		ret = btrfs_del_item(trans, tree_root, path);
		if (ret)
			goto out;
		btrfs_release_path(path);
	}

	spin_lock(&root->fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &root->fs_info->block_group_cache_tree);

	if (root->fs_info->first_logical_byte == block_group->key.objectid)
		root->fs_info->first_logical_byte = (u64)-1;
	spin_unlock(&root->fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index]))
		clear_avail_alloc_bits(root->fs_info, block_group->flags);
	up_write(&block_group->space_info->groups_sem);

	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	block_group->space_info->disk_total -= block_group->key.offset * factor;
	spin_unlock(&block_group->space_info->lock);

	memcpy(&key, &block_group->key, sizeof(key));

	btrfs_clear_space_info_full(root->fs_info);

	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -EIO;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}
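
/*
 * Two details of the function above worth spelling out (explanatory note,
 * not in the original source):
 *
 * - Two different items get deleted: first the free space cache item in
 *   the tree_root (objectid = BTRFS_FREE_SPACE_OBJECTID, offset = block
 *   group start), then the block group item in the extent root, using the
 *   block group's own key (objectid = start, offset = length).
 *
 * - btrfs_put_block_group() is called twice on purpose: once for the
 *   reference taken by btrfs_lookup_block_group() at the top of the
 *   function, and once for the reference held on behalf of the block group
 *   cache rb-tree that has just been erased.
 */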

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return 1;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	}
out:
	return ret;
}

int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
	return unpin_extent_range(root, start, end);
}

int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
			       u64 num_bytes, u64 *actual_bytes)
{
	return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
}

int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret = 0;

	/*
	 * Try to trim all FS space; our block groups may start from a
	 * non-zero offset.
	 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (!ret)
					wait_block_group_cache_done(cache);
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	range->len = trimmed;
	return ret;
}
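
/*
 * Minimal usage sketch for btrfs_trim_fs() (illustrative only; in the real
 * kernel the caller is the FITRIM ioctl path, which lives outside this
 * file).  struct fstrim_range comes from <linux/fs.h>; the caller picks a
 * byte range and a minimum extent length, and on return range->len holds
 * the number of bytes actually trimmed.  'fs_root' and 'fs_info' below are
 * placeholder names for an already-mounted filesystem.
 */
#if 0
	struct fstrim_range range = {
		.start	= 0,
		/* whole-FS length triggers the total_bytes shortcut above */
		.len	= btrfs_super_total_bytes(fs_info->super_copy),
		.minlen	= 512 * 1024,	/* skip free extents smaller than 512KiB */
	};
	int err;

	err = btrfs_trim_fs(fs_root, &range);
	if (!err)
		pr_info("btrfs: trimmed %llu bytes\n", range.len);
#endif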