1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/mm.h> 8 #include <linux/sched/signal.h> 9 #include <linux/pagemap.h> 10 #include <linux/writeback.h> 11 #include <linux/blkdev.h> 12 #include <linux/sort.h> 13 #include <linux/rcupdate.h> 14 #include <linux/kthread.h> 15 #include <linux/slab.h> 16 #include <linux/ratelimit.h> 17 #include <linux/percpu_counter.h> 18 #include <linux/lockdep.h> 19 #include <linux/crc32c.h> 20 #include "tree-log.h" 21 #include "disk-io.h" 22 #include "print-tree.h" 23 #include "volumes.h" 24 #include "raid56.h" 25 #include "locking.h" 26 #include "free-space-cache.h" 27 #include "free-space-tree.h" 28 #include "math.h" 29 #include "sysfs.h" 30 #include "qgroup.h" 31 #include "ref-verify.h" 32 #include "space-info.h" 33 #include "block-rsv.h" 34 #include "delalloc-space.h" 35 36 #undef SCRAMBLE_DELAYED_REFS 37 38 39 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 40 struct btrfs_delayed_ref_node *node, u64 parent, 41 u64 root_objectid, u64 owner_objectid, 42 u64 owner_offset, int refs_to_drop, 43 struct btrfs_delayed_extent_op *extra_op); 44 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 45 struct extent_buffer *leaf, 46 struct btrfs_extent_item *ei); 47 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 48 u64 parent, u64 root_objectid, 49 u64 flags, u64 owner, u64 offset, 50 struct btrfs_key *ins, int ref_mod); 51 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 52 struct btrfs_delayed_ref_node *node, 53 struct btrfs_delayed_extent_op *extent_op); 54 static int find_next_key(struct btrfs_path *path, int level, 55 struct btrfs_key *key); 56 57 static noinline int 58 block_group_cache_done(struct btrfs_block_group_cache *cache) 59 { 60 smp_mb(); 61 return cache->cached == BTRFS_CACHE_FINISHED || 62 cache->cached == BTRFS_CACHE_ERROR; 63 } 64 65 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 66 { 67 return (cache->flags & bits) == bits; 68 } 69 70 void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 71 { 72 atomic_inc(&cache->count); 73 } 74 75 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 76 { 77 if (atomic_dec_and_test(&cache->count)) { 78 WARN_ON(cache->pinned > 0); 79 WARN_ON(cache->reserved > 0); 80 81 /* 82 * If not empty, someone is still holding mutex of 83 * full_stripe_lock, which can only be released by caller. 84 * And it will definitely cause use-after-free when caller 85 * tries to release full stripe lock. 86 * 87 * No better way to resolve, but only to warn. 
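 *
 * Usage sketch (illustrative only, not taken from a real caller): a
 * reference obtained through btrfs_lookup_block_group() or
 * btrfs_get_block_group() below must be paired with a
 * btrfs_put_block_group(), e.g.
 *
 *	cache = btrfs_lookup_block_group(fs_info, bytenr);
 *	if (cache) {
 *		... use the block group ...
 *		btrfs_put_block_group(cache);
 *	}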
88 */ 89 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); 90 kfree(cache->free_space_ctl); 91 kfree(cache); 92 } 93 } 94 95 /* 96 * this adds the block group to the fs_info rb tree for the block group 97 * cache 98 */ 99 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 100 struct btrfs_block_group_cache *block_group) 101 { 102 struct rb_node **p; 103 struct rb_node *parent = NULL; 104 struct btrfs_block_group_cache *cache; 105 106 spin_lock(&info->block_group_cache_lock); 107 p = &info->block_group_cache_tree.rb_node; 108 109 while (*p) { 110 parent = *p; 111 cache = rb_entry(parent, struct btrfs_block_group_cache, 112 cache_node); 113 if (block_group->key.objectid < cache->key.objectid) { 114 p = &(*p)->rb_left; 115 } else if (block_group->key.objectid > cache->key.objectid) { 116 p = &(*p)->rb_right; 117 } else { 118 spin_unlock(&info->block_group_cache_lock); 119 return -EEXIST; 120 } 121 } 122 123 rb_link_node(&block_group->cache_node, parent, p); 124 rb_insert_color(&block_group->cache_node, 125 &info->block_group_cache_tree); 126 127 if (info->first_logical_byte > block_group->key.objectid) 128 info->first_logical_byte = block_group->key.objectid; 129 130 spin_unlock(&info->block_group_cache_lock); 131 132 return 0; 133 } 134 135 /* 136 * This will return the block group at or after bytenr if contains is 0, else 137 * it will return the block group that contains the bytenr 138 */ 139 static struct btrfs_block_group_cache * 140 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 141 int contains) 142 { 143 struct btrfs_block_group_cache *cache, *ret = NULL; 144 struct rb_node *n; 145 u64 end, start; 146 147 spin_lock(&info->block_group_cache_lock); 148 n = info->block_group_cache_tree.rb_node; 149 150 while (n) { 151 cache = rb_entry(n, struct btrfs_block_group_cache, 152 cache_node); 153 end = cache->key.objectid + cache->key.offset - 1; 154 start = cache->key.objectid; 155 156 if (bytenr < start) { 157 if (!contains && (!ret || start < ret->key.objectid)) 158 ret = cache; 159 n = n->rb_left; 160 } else if (bytenr > start) { 161 if (contains && bytenr <= end) { 162 ret = cache; 163 break; 164 } 165 n = n->rb_right; 166 } else { 167 ret = cache; 168 break; 169 } 170 } 171 if (ret) { 172 btrfs_get_block_group(ret); 173 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 174 info->first_logical_byte = ret->key.objectid; 175 } 176 spin_unlock(&info->block_group_cache_lock); 177 178 return ret; 179 } 180 181 static int add_excluded_extent(struct btrfs_fs_info *fs_info, 182 u64 start, u64 num_bytes) 183 { 184 u64 end = start + num_bytes - 1; 185 set_extent_bits(&fs_info->freed_extents[0], 186 start, end, EXTENT_UPTODATE); 187 set_extent_bits(&fs_info->freed_extents[1], 188 start, end, EXTENT_UPTODATE); 189 return 0; 190 } 191 192 static void free_excluded_extents(struct btrfs_block_group_cache *cache) 193 { 194 struct btrfs_fs_info *fs_info = cache->fs_info; 195 u64 start, end; 196 197 start = cache->key.objectid; 198 end = start + cache->key.offset - 1; 199 200 clear_extent_bits(&fs_info->freed_extents[0], 201 start, end, EXTENT_UPTODATE); 202 clear_extent_bits(&fs_info->freed_extents[1], 203 start, end, EXTENT_UPTODATE); 204 } 205 206 static int exclude_super_stripes(struct btrfs_block_group_cache *cache) 207 { 208 struct btrfs_fs_info *fs_info = cache->fs_info; 209 u64 bytenr; 210 u64 *logical; 211 int stripe_len; 212 int i, nr, ret; 213 214 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 215 stripe_len = 
BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 216 cache->bytes_super += stripe_len; 217 ret = add_excluded_extent(fs_info, cache->key.objectid, 218 stripe_len); 219 if (ret) 220 return ret; 221 } 222 223 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 224 bytenr = btrfs_sb_offset(i); 225 ret = btrfs_rmap_block(fs_info, cache->key.objectid, 226 bytenr, &logical, &nr, &stripe_len); 227 if (ret) 228 return ret; 229 230 while (nr--) { 231 u64 start, len; 232 233 if (logical[nr] > cache->key.objectid + 234 cache->key.offset) 235 continue; 236 237 if (logical[nr] + stripe_len <= cache->key.objectid) 238 continue; 239 240 start = logical[nr]; 241 if (start < cache->key.objectid) { 242 start = cache->key.objectid; 243 len = (logical[nr] + stripe_len) - start; 244 } else { 245 len = min_t(u64, stripe_len, 246 cache->key.objectid + 247 cache->key.offset - start); 248 } 249 250 cache->bytes_super += len; 251 ret = add_excluded_extent(fs_info, start, len); 252 if (ret) { 253 kfree(logical); 254 return ret; 255 } 256 } 257 258 kfree(logical); 259 } 260 return 0; 261 } 262 263 static struct btrfs_caching_control * 264 get_caching_control(struct btrfs_block_group_cache *cache) 265 { 266 struct btrfs_caching_control *ctl; 267 268 spin_lock(&cache->lock); 269 if (!cache->caching_ctl) { 270 spin_unlock(&cache->lock); 271 return NULL; 272 } 273 274 ctl = cache->caching_ctl; 275 refcount_inc(&ctl->count); 276 spin_unlock(&cache->lock); 277 return ctl; 278 } 279 280 static void put_caching_control(struct btrfs_caching_control *ctl) 281 { 282 if (refcount_dec_and_test(&ctl->count)) 283 kfree(ctl); 284 } 285 286 #ifdef CONFIG_BTRFS_DEBUG 287 static void fragment_free_space(struct btrfs_block_group_cache *block_group) 288 { 289 struct btrfs_fs_info *fs_info = block_group->fs_info; 290 u64 start = block_group->key.objectid; 291 u64 len = block_group->key.offset; 292 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 293 fs_info->nodesize : fs_info->sectorsize; 294 u64 step = chunk << 1; 295 296 while (len > chunk) { 297 btrfs_remove_free_space(block_group, start, chunk); 298 start += step; 299 if (len < step) 300 len = 0; 301 else 302 len -= step; 303 } 304 } 305 #endif 306 307 /* 308 * this is only called by cache_block_group, since we could have freed extents 309 * we need to check the pinned_extents for any extents that can't be used yet 310 * since their free space will be released as soon as the transaction commits. 
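 *
 * For example (illustrative): when caching the range [start, end) and a
 * pinned extent [p_start, p_end] falls inside it, only [start, p_start)
 * is handed to btrfs_add_free_space() and the scan resumes at
 * p_end + 1, so the pinned bytes are not exposed as free space until
 * the committing transaction releases them.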
311 */ 312 u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 313 u64 start, u64 end) 314 { 315 struct btrfs_fs_info *info = block_group->fs_info; 316 u64 extent_start, extent_end, size, total_added = 0; 317 int ret; 318 319 while (start < end) { 320 ret = find_first_extent_bit(info->pinned_extents, start, 321 &extent_start, &extent_end, 322 EXTENT_DIRTY | EXTENT_UPTODATE, 323 NULL); 324 if (ret) 325 break; 326 327 if (extent_start <= start) { 328 start = extent_end + 1; 329 } else if (extent_start > start && extent_start < end) { 330 size = extent_start - start; 331 total_added += size; 332 ret = btrfs_add_free_space(block_group, start, 333 size); 334 BUG_ON(ret); /* -ENOMEM or logic error */ 335 start = extent_end + 1; 336 } else { 337 break; 338 } 339 } 340 341 if (start < end) { 342 size = end - start; 343 total_added += size; 344 ret = btrfs_add_free_space(block_group, start, size); 345 BUG_ON(ret); /* -ENOMEM or logic error */ 346 } 347 348 return total_added; 349 } 350 351 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 352 { 353 struct btrfs_block_group_cache *block_group = caching_ctl->block_group; 354 struct btrfs_fs_info *fs_info = block_group->fs_info; 355 struct btrfs_root *extent_root = fs_info->extent_root; 356 struct btrfs_path *path; 357 struct extent_buffer *leaf; 358 struct btrfs_key key; 359 u64 total_found = 0; 360 u64 last = 0; 361 u32 nritems; 362 int ret; 363 bool wakeup = true; 364 365 path = btrfs_alloc_path(); 366 if (!path) 367 return -ENOMEM; 368 369 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 370 371 #ifdef CONFIG_BTRFS_DEBUG 372 /* 373 * If we're fragmenting we don't want to make anybody think we can 374 * allocate from this block group until we've had a chance to fragment 375 * the free space. 376 */ 377 if (btrfs_should_fragment_free_space(block_group)) 378 wakeup = false; 379 #endif 380 /* 381 * We don't want to deadlock with somebody trying to allocate a new 382 * extent for the extent root while also trying to search the extent 383 * root to add free space. 
So we skip locking and search the commit 384 * root, since its read-only 385 */ 386 path->skip_locking = 1; 387 path->search_commit_root = 1; 388 path->reada = READA_FORWARD; 389 390 key.objectid = last; 391 key.offset = 0; 392 key.type = BTRFS_EXTENT_ITEM_KEY; 393 394 next: 395 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 396 if (ret < 0) 397 goto out; 398 399 leaf = path->nodes[0]; 400 nritems = btrfs_header_nritems(leaf); 401 402 while (1) { 403 if (btrfs_fs_closing(fs_info) > 1) { 404 last = (u64)-1; 405 break; 406 } 407 408 if (path->slots[0] < nritems) { 409 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 410 } else { 411 ret = find_next_key(path, 0, &key); 412 if (ret) 413 break; 414 415 if (need_resched() || 416 rwsem_is_contended(&fs_info->commit_root_sem)) { 417 if (wakeup) 418 caching_ctl->progress = last; 419 btrfs_release_path(path); 420 up_read(&fs_info->commit_root_sem); 421 mutex_unlock(&caching_ctl->mutex); 422 cond_resched(); 423 mutex_lock(&caching_ctl->mutex); 424 down_read(&fs_info->commit_root_sem); 425 goto next; 426 } 427 428 ret = btrfs_next_leaf(extent_root, path); 429 if (ret < 0) 430 goto out; 431 if (ret) 432 break; 433 leaf = path->nodes[0]; 434 nritems = btrfs_header_nritems(leaf); 435 continue; 436 } 437 438 if (key.objectid < last) { 439 key.objectid = last; 440 key.offset = 0; 441 key.type = BTRFS_EXTENT_ITEM_KEY; 442 443 if (wakeup) 444 caching_ctl->progress = last; 445 btrfs_release_path(path); 446 goto next; 447 } 448 449 if (key.objectid < block_group->key.objectid) { 450 path->slots[0]++; 451 continue; 452 } 453 454 if (key.objectid >= block_group->key.objectid + 455 block_group->key.offset) 456 break; 457 458 if (key.type == BTRFS_EXTENT_ITEM_KEY || 459 key.type == BTRFS_METADATA_ITEM_KEY) { 460 total_found += add_new_free_space(block_group, last, 461 key.objectid); 462 if (key.type == BTRFS_METADATA_ITEM_KEY) 463 last = key.objectid + 464 fs_info->nodesize; 465 else 466 last = key.objectid + key.offset; 467 468 if (total_found > CACHING_CTL_WAKE_UP) { 469 total_found = 0; 470 if (wakeup) 471 wake_up(&caching_ctl->wait); 472 } 473 } 474 path->slots[0]++; 475 } 476 ret = 0; 477 478 total_found += add_new_free_space(block_group, last, 479 block_group->key.objectid + 480 block_group->key.offset); 481 caching_ctl->progress = (u64)-1; 482 483 out: 484 btrfs_free_path(path); 485 return ret; 486 } 487 488 static noinline void caching_thread(struct btrfs_work *work) 489 { 490 struct btrfs_block_group_cache *block_group; 491 struct btrfs_fs_info *fs_info; 492 struct btrfs_caching_control *caching_ctl; 493 int ret; 494 495 caching_ctl = container_of(work, struct btrfs_caching_control, work); 496 block_group = caching_ctl->block_group; 497 fs_info = block_group->fs_info; 498 499 mutex_lock(&caching_ctl->mutex); 500 down_read(&fs_info->commit_root_sem); 501 502 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 503 ret = load_free_space_tree(caching_ctl); 504 else 505 ret = load_extent_tree_free(caching_ctl); 506 507 spin_lock(&block_group->lock); 508 block_group->caching_ctl = NULL; 509 block_group->cached = ret ? 
BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; 510 spin_unlock(&block_group->lock); 511 512 #ifdef CONFIG_BTRFS_DEBUG 513 if (btrfs_should_fragment_free_space(block_group)) { 514 u64 bytes_used; 515 516 spin_lock(&block_group->space_info->lock); 517 spin_lock(&block_group->lock); 518 bytes_used = block_group->key.offset - 519 btrfs_block_group_used(&block_group->item); 520 block_group->space_info->bytes_used += bytes_used >> 1; 521 spin_unlock(&block_group->lock); 522 spin_unlock(&block_group->space_info->lock); 523 fragment_free_space(block_group); 524 } 525 #endif 526 527 caching_ctl->progress = (u64)-1; 528 529 up_read(&fs_info->commit_root_sem); 530 free_excluded_extents(block_group); 531 mutex_unlock(&caching_ctl->mutex); 532 533 wake_up(&caching_ctl->wait); 534 535 put_caching_control(caching_ctl); 536 btrfs_put_block_group(block_group); 537 } 538 539 static int cache_block_group(struct btrfs_block_group_cache *cache, 540 int load_cache_only) 541 { 542 DEFINE_WAIT(wait); 543 struct btrfs_fs_info *fs_info = cache->fs_info; 544 struct btrfs_caching_control *caching_ctl; 545 int ret = 0; 546 547 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 548 if (!caching_ctl) 549 return -ENOMEM; 550 551 INIT_LIST_HEAD(&caching_ctl->list); 552 mutex_init(&caching_ctl->mutex); 553 init_waitqueue_head(&caching_ctl->wait); 554 caching_ctl->block_group = cache; 555 caching_ctl->progress = cache->key.objectid; 556 refcount_set(&caching_ctl->count, 1); 557 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, 558 caching_thread, NULL, NULL); 559 560 spin_lock(&cache->lock); 561 /* 562 * This should be a rare occasion, but this could happen I think in the 563 * case where one thread starts to load the space cache info, and then 564 * some other thread starts a transaction commit which tries to do an 565 * allocation while the other thread is still loading the space cache 566 * info. The previous loop should have kept us from choosing this block 567 * group, but if we've moved to the state where we will wait on caching 568 * block groups we need to first check if we're doing a fast load here, 569 * so we can wait for it to finish, otherwise we could end up allocating 570 * from a block group who's cache gets evicted for one reason or 571 * another. 
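 *
 * The loop below is essentially an open-coded wait_event(): take a
 * reference on the caching control, prepare_to_wait() on its waitqueue,
 * drop cache->lock, schedule(), then re-take the lock before checking
 * cache->cached again.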
572 */ 573 while (cache->cached == BTRFS_CACHE_FAST) { 574 struct btrfs_caching_control *ctl; 575 576 ctl = cache->caching_ctl; 577 refcount_inc(&ctl->count); 578 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 579 spin_unlock(&cache->lock); 580 581 schedule(); 582 583 finish_wait(&ctl->wait, &wait); 584 put_caching_control(ctl); 585 spin_lock(&cache->lock); 586 } 587 588 if (cache->cached != BTRFS_CACHE_NO) { 589 spin_unlock(&cache->lock); 590 kfree(caching_ctl); 591 return 0; 592 } 593 WARN_ON(cache->caching_ctl); 594 cache->caching_ctl = caching_ctl; 595 cache->cached = BTRFS_CACHE_FAST; 596 spin_unlock(&cache->lock); 597 598 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 599 mutex_lock(&caching_ctl->mutex); 600 ret = load_free_space_cache(cache); 601 602 spin_lock(&cache->lock); 603 if (ret == 1) { 604 cache->caching_ctl = NULL; 605 cache->cached = BTRFS_CACHE_FINISHED; 606 cache->last_byte_to_unpin = (u64)-1; 607 caching_ctl->progress = (u64)-1; 608 } else { 609 if (load_cache_only) { 610 cache->caching_ctl = NULL; 611 cache->cached = BTRFS_CACHE_NO; 612 } else { 613 cache->cached = BTRFS_CACHE_STARTED; 614 cache->has_caching_ctl = 1; 615 } 616 } 617 spin_unlock(&cache->lock); 618 #ifdef CONFIG_BTRFS_DEBUG 619 if (ret == 1 && 620 btrfs_should_fragment_free_space(cache)) { 621 u64 bytes_used; 622 623 spin_lock(&cache->space_info->lock); 624 spin_lock(&cache->lock); 625 bytes_used = cache->key.offset - 626 btrfs_block_group_used(&cache->item); 627 cache->space_info->bytes_used += bytes_used >> 1; 628 spin_unlock(&cache->lock); 629 spin_unlock(&cache->space_info->lock); 630 fragment_free_space(cache); 631 } 632 #endif 633 mutex_unlock(&caching_ctl->mutex); 634 635 wake_up(&caching_ctl->wait); 636 if (ret == 1) { 637 put_caching_control(caching_ctl); 638 free_excluded_extents(cache); 639 return 0; 640 } 641 } else { 642 /* 643 * We're either using the free space tree or no caching at all. 644 * Set cached to the appropriate value and wakeup any waiters. 
645 */ 646 spin_lock(&cache->lock); 647 if (load_cache_only) { 648 cache->caching_ctl = NULL; 649 cache->cached = BTRFS_CACHE_NO; 650 } else { 651 cache->cached = BTRFS_CACHE_STARTED; 652 cache->has_caching_ctl = 1; 653 } 654 spin_unlock(&cache->lock); 655 wake_up(&caching_ctl->wait); 656 } 657 658 if (load_cache_only) { 659 put_caching_control(caching_ctl); 660 return 0; 661 } 662 663 down_write(&fs_info->commit_root_sem); 664 refcount_inc(&caching_ctl->count); 665 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 666 up_write(&fs_info->commit_root_sem); 667 668 btrfs_get_block_group(cache); 669 670 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 671 672 return ret; 673 } 674 675 /* 676 * return the block group that starts at or after bytenr 677 */ 678 static struct btrfs_block_group_cache * 679 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 680 { 681 return block_group_cache_tree_search(info, bytenr, 0); 682 } 683 684 /* 685 * return the block group that contains the given bytenr 686 */ 687 struct btrfs_block_group_cache *btrfs_lookup_block_group( 688 struct btrfs_fs_info *info, 689 u64 bytenr) 690 { 691 return block_group_cache_tree_search(info, bytenr, 1); 692 } 693 694 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) 695 { 696 if (ref->type == BTRFS_REF_METADATA) { 697 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) 698 return BTRFS_BLOCK_GROUP_SYSTEM; 699 else 700 return BTRFS_BLOCK_GROUP_METADATA; 701 } 702 return BTRFS_BLOCK_GROUP_DATA; 703 } 704 705 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, 706 struct btrfs_ref *ref) 707 { 708 struct btrfs_space_info *space_info; 709 u64 flags = generic_ref_to_space_flags(ref); 710 711 space_info = btrfs_find_space_info(fs_info, flags); 712 ASSERT(space_info); 713 percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, 714 BTRFS_TOTAL_BYTES_PINNED_BATCH); 715 } 716 717 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, 718 struct btrfs_ref *ref) 719 { 720 struct btrfs_space_info *space_info; 721 u64 flags = generic_ref_to_space_flags(ref); 722 723 space_info = btrfs_find_space_info(fs_info, flags); 724 ASSERT(space_info); 725 percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, 726 BTRFS_TOTAL_BYTES_PINNED_BATCH); 727 } 728 729 /* simple helper to search for an existing data extent at a given offset */ 730 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) 731 { 732 int ret; 733 struct btrfs_key key; 734 struct btrfs_path *path; 735 736 path = btrfs_alloc_path(); 737 if (!path) 738 return -ENOMEM; 739 740 key.objectid = start; 741 key.offset = len; 742 key.type = BTRFS_EXTENT_ITEM_KEY; 743 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 744 btrfs_free_path(path); 745 return ret; 746 } 747 748 /* 749 * helper function to lookup reference count and flags of a tree block. 750 * 751 * the head node for delayed ref is used to store the sum of all the 752 * reference count modifications queued up in the rbtree. the head 753 * node may also store the extent flags to set. This way you can check 754 * to see what the reference count and extent flags would be if all of 755 * the delayed refs are not processed. 
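 *
 * Usage sketch (hypothetical caller, illustrative only): to read the
 * combined reference count and flags of a tree block one could do
 *
 *	u64 refs, flags;
 *	int ret;
 *
 *	ret = btrfs_lookup_extent_info(trans, fs_info, eb->start,
 *				       btrfs_header_level(eb), 1,
 *				       &refs, &flags);
 *
 * Either of the last two pointers may be NULL if the caller only needs
 * one of the values.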
756 */ 757 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 758 struct btrfs_fs_info *fs_info, u64 bytenr, 759 u64 offset, int metadata, u64 *refs, u64 *flags) 760 { 761 struct btrfs_delayed_ref_head *head; 762 struct btrfs_delayed_ref_root *delayed_refs; 763 struct btrfs_path *path; 764 struct btrfs_extent_item *ei; 765 struct extent_buffer *leaf; 766 struct btrfs_key key; 767 u32 item_size; 768 u64 num_refs; 769 u64 extent_flags; 770 int ret; 771 772 /* 773 * If we don't have skinny metadata, don't bother doing anything 774 * different 775 */ 776 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 777 offset = fs_info->nodesize; 778 metadata = 0; 779 } 780 781 path = btrfs_alloc_path(); 782 if (!path) 783 return -ENOMEM; 784 785 if (!trans) { 786 path->skip_locking = 1; 787 path->search_commit_root = 1; 788 } 789 790 search_again: 791 key.objectid = bytenr; 792 key.offset = offset; 793 if (metadata) 794 key.type = BTRFS_METADATA_ITEM_KEY; 795 else 796 key.type = BTRFS_EXTENT_ITEM_KEY; 797 798 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); 799 if (ret < 0) 800 goto out_free; 801 802 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 803 if (path->slots[0]) { 804 path->slots[0]--; 805 btrfs_item_key_to_cpu(path->nodes[0], &key, 806 path->slots[0]); 807 if (key.objectid == bytenr && 808 key.type == BTRFS_EXTENT_ITEM_KEY && 809 key.offset == fs_info->nodesize) 810 ret = 0; 811 } 812 } 813 814 if (ret == 0) { 815 leaf = path->nodes[0]; 816 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 817 if (item_size >= sizeof(*ei)) { 818 ei = btrfs_item_ptr(leaf, path->slots[0], 819 struct btrfs_extent_item); 820 num_refs = btrfs_extent_refs(leaf, ei); 821 extent_flags = btrfs_extent_flags(leaf, ei); 822 } else { 823 ret = -EINVAL; 824 btrfs_print_v0_err(fs_info); 825 if (trans) 826 btrfs_abort_transaction(trans, ret); 827 else 828 btrfs_handle_fs_error(fs_info, ret, NULL); 829 830 goto out_free; 831 } 832 833 BUG_ON(num_refs == 0); 834 } else { 835 num_refs = 0; 836 extent_flags = 0; 837 ret = 0; 838 } 839 840 if (!trans) 841 goto out; 842 843 delayed_refs = &trans->transaction->delayed_refs; 844 spin_lock(&delayed_refs->lock); 845 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 846 if (head) { 847 if (!mutex_trylock(&head->mutex)) { 848 refcount_inc(&head->refs); 849 spin_unlock(&delayed_refs->lock); 850 851 btrfs_release_path(path); 852 853 /* 854 * Mutex was contended, block until it's released and try 855 * again 856 */ 857 mutex_lock(&head->mutex); 858 mutex_unlock(&head->mutex); 859 btrfs_put_delayed_ref_head(head); 860 goto search_again; 861 } 862 spin_lock(&head->lock); 863 if (head->extent_op && head->extent_op->update_flags) 864 extent_flags |= head->extent_op->flags_to_set; 865 else 866 BUG_ON(num_refs == 0); 867 868 num_refs += head->ref_mod; 869 spin_unlock(&head->lock); 870 mutex_unlock(&head->mutex); 871 } 872 spin_unlock(&delayed_refs->lock); 873 out: 874 WARN_ON(num_refs == 0); 875 if (refs) 876 *refs = num_refs; 877 if (flags) 878 *flags = extent_flags; 879 out_free: 880 btrfs_free_path(path); 881 return ret; 882 } 883 884 /* 885 * Back reference rules. Back refs have three main goals: 886 * 887 * 1) differentiate between all holders of references to an extent so that 888 * when a reference is dropped we can make sure it was a valid reference 889 * before freeing the extent. 
890 *
891 * 2) Provide enough information to quickly find the holders of an extent
892 * if we notice a given block is corrupted or bad.
893 *
894 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
895 * maintenance. This is actually the same as #2, but with a slightly
896 * different use case.
897 *
898 * There are two kinds of back refs. The implicit back refs is optimized
899 * for pointers in non-shared tree blocks. For a given pointer in a block,
900 * back refs of this kind provide information about the block's owner tree
901 * and the pointer's key. This information allows us to find the block by
902 * b-tree searching. The full back refs is for pointers in tree blocks not
903 * referenced by their owner trees. The location of the tree block is recorded
904 * in the back refs. Actually the full back refs is generic, and can be
905 * used in all cases the implicit back refs is used. The major shortcoming
906 * of the full back refs is its overhead. Every time a tree block gets
907 * COWed, we have to update the back refs entry for all pointers in it.
908 *
909 * For a newly allocated tree block, we use implicit back refs for
910 * pointers in it. This means most tree related operations only involve
911 * implicit back refs. For a tree block created in an old transaction, the
912 * only way to drop a reference to it is to COW it. So we can detect the
913 * event that a tree block loses its owner tree's reference and do the
914 * back refs conversion.
915 *
916 * When a tree block is COWed through a tree, there are four cases:
917 *
918 * The reference count of the block is one and the tree is the block's
919 * owner tree. Nothing to do in this case.
920 *
921 * The reference count of the block is one and the tree is not the
922 * block's owner tree. In this case, full back refs is used for pointers
923 * in the block. Remove these full back refs, add implicit back refs for
924 * every pointer in the new block.
925 *
926 * The reference count of the block is greater than one and the tree is
927 * the block's owner tree. In this case, implicit back refs is used for
928 * pointers in the block. Add full back refs for every pointer in the
929 * block, increase lower level extents' reference counts. The original
930 * implicit back refs are carried over to the new block.
931 *
932 * The reference count of the block is greater than one and the tree is
933 * not the block's owner tree. Add implicit back refs for every pointer in
934 * the new block, increase lower level extents' reference count.
935 *
936 * Back Reference Key composing:
937 *
938 * The key objectid corresponds to the first byte in the extent,
939 * the key type is used to differentiate between types of back refs.
940 * There are different meanings of the key offset for different types
941 * of back refs.
942 *
943 * File extents can be referenced by:
944 *
945 * - multiple snapshots, subvolumes, or different generations in one subvol
946 * - different files inside a single subvolume
947 * - different offsets inside a file (bookend extents in file.c)
948 *
949 * The extent ref structure for the implicit back refs has fields for:
950 *
951 * - Objectid of the subvolume root
952 * - objectid of the file holding the reference
953 * - original offset in the file
954 * - how many bookend extents
955 *
956 * The key offset for the implicit back refs is the hash of the first
957 * three fields.
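 *
 * As a concrete sketch (example values, using hash_extent_data_ref()
 * defined later in this file): an implicit data back ref for an extent
 * referenced by root 5, inode 257 at file offset 0 uses the key
 *
 *	(extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *	 hash_extent_data_ref(5, 257, 0))
 *
 * which is exactly how lookup_extent_data_ref() below builds its search
 * key when no parent is given.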
958 *
959 * The extent ref structure for the full back refs has a field for:
960 *
961 * - number of pointers in the tree leaf
962 *
963 * The key offset for the full back refs is the first byte of
964 * the tree leaf.
965 *
966 * When a file extent is allocated, the implicit back refs is used.
967 * The fields are filled in:
968 *
969 * (root_key.objectid, inode objectid, offset in file, 1)
970 *
971 * When a file extent is removed by file truncation, we find the
972 * corresponding implicit back refs and check the following fields:
973 *
974 * (btrfs_header_owner(leaf), inode objectid, offset in file)
975 *
976 * Btree extents can be referenced by:
977 *
978 * - Different subvolumes
979 *
980 * Both the implicit back refs and the full back refs for tree blocks
981 * only consist of a key. The key offset for the implicit back refs is
982 * the objectid of the block's owner tree. The key offset for the full back refs
983 * is the first byte of the parent block.
984 *
985 * When implicit back refs is used, information about the lowest key and
986 * level of the tree block are required. This information is stored in
987 * the tree block info structure.
988 */
989
990 /*
991 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
992 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
993 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
994 */
995 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
996 				     struct btrfs_extent_inline_ref *iref,
997 				     enum btrfs_inline_ref_type is_data)
998 {
999 	int type = btrfs_extent_inline_ref_type(eb, iref);
1000 	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1001
1002 	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1003 	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
1004 	    type == BTRFS_SHARED_DATA_REF_KEY ||
1005 	    type == BTRFS_EXTENT_DATA_REF_KEY) {
1006 		if (is_data == BTRFS_REF_TYPE_BLOCK) {
1007 			if (type == BTRFS_TREE_BLOCK_REF_KEY)
1008 				return type;
1009 			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1010 				ASSERT(eb->fs_info);
1011 				/*
1012 				 * Every shared one has parent tree
1013 				 * block, which must be aligned to
1014 				 * nodesize.
1015 				 */
1016 				if (offset &&
1017 				    IS_ALIGNED(offset, eb->fs_info->nodesize))
1018 					return type;
1019 			}
1020 		} else if (is_data == BTRFS_REF_TYPE_DATA) {
1021 			if (type == BTRFS_EXTENT_DATA_REF_KEY)
1022 				return type;
1023 			if (type == BTRFS_SHARED_DATA_REF_KEY) {
1024 				ASSERT(eb->fs_info);
1025 				/*
1026 				 * Every shared one has parent tree
1027 				 * block, which must be aligned to
1028 				 * nodesize.
1029 */ 1030 if (offset && 1031 IS_ALIGNED(offset, eb->fs_info->nodesize)) 1032 return type; 1033 } 1034 } else { 1035 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 1036 return type; 1037 } 1038 } 1039 1040 btrfs_print_leaf((struct extent_buffer *)eb); 1041 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", 1042 eb->start, type); 1043 WARN_ON(1); 1044 1045 return BTRFS_REF_TYPE_INVALID; 1046 } 1047 1048 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1049 { 1050 u32 high_crc = ~(u32)0; 1051 u32 low_crc = ~(u32)0; 1052 __le64 lenum; 1053 1054 lenum = cpu_to_le64(root_objectid); 1055 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1056 lenum = cpu_to_le64(owner); 1057 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1058 lenum = cpu_to_le64(offset); 1059 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1060 1061 return ((u64)high_crc << 31) ^ (u64)low_crc; 1062 } 1063 1064 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1065 struct btrfs_extent_data_ref *ref) 1066 { 1067 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1068 btrfs_extent_data_ref_objectid(leaf, ref), 1069 btrfs_extent_data_ref_offset(leaf, ref)); 1070 } 1071 1072 static int match_extent_data_ref(struct extent_buffer *leaf, 1073 struct btrfs_extent_data_ref *ref, 1074 u64 root_objectid, u64 owner, u64 offset) 1075 { 1076 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1077 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1078 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1079 return 0; 1080 return 1; 1081 } 1082 1083 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1084 struct btrfs_path *path, 1085 u64 bytenr, u64 parent, 1086 u64 root_objectid, 1087 u64 owner, u64 offset) 1088 { 1089 struct btrfs_root *root = trans->fs_info->extent_root; 1090 struct btrfs_key key; 1091 struct btrfs_extent_data_ref *ref; 1092 struct extent_buffer *leaf; 1093 u32 nritems; 1094 int ret; 1095 int recow; 1096 int err = -ENOENT; 1097 1098 key.objectid = bytenr; 1099 if (parent) { 1100 key.type = BTRFS_SHARED_DATA_REF_KEY; 1101 key.offset = parent; 1102 } else { 1103 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1104 key.offset = hash_extent_data_ref(root_objectid, 1105 owner, offset); 1106 } 1107 again: 1108 recow = 0; 1109 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1110 if (ret < 0) { 1111 err = ret; 1112 goto fail; 1113 } 1114 1115 if (parent) { 1116 if (!ret) 1117 return 0; 1118 goto fail; 1119 } 1120 1121 leaf = path->nodes[0]; 1122 nritems = btrfs_header_nritems(leaf); 1123 while (1) { 1124 if (path->slots[0] >= nritems) { 1125 ret = btrfs_next_leaf(root, path); 1126 if (ret < 0) 1127 err = ret; 1128 if (ret) 1129 goto fail; 1130 1131 leaf = path->nodes[0]; 1132 nritems = btrfs_header_nritems(leaf); 1133 recow = 1; 1134 } 1135 1136 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1137 if (key.objectid != bytenr || 1138 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1139 goto fail; 1140 1141 ref = btrfs_item_ptr(leaf, path->slots[0], 1142 struct btrfs_extent_data_ref); 1143 1144 if (match_extent_data_ref(leaf, ref, root_objectid, 1145 owner, offset)) { 1146 if (recow) { 1147 btrfs_release_path(path); 1148 goto again; 1149 } 1150 err = 0; 1151 break; 1152 } 1153 path->slots[0]++; 1154 } 1155 fail: 1156 return err; 1157 } 1158 1159 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1160 struct btrfs_path *path, 1161 u64 bytenr, u64 parent, 1162 u64 root_objectid, u64 owner, 
1163 u64 offset, int refs_to_add) 1164 { 1165 struct btrfs_root *root = trans->fs_info->extent_root; 1166 struct btrfs_key key; 1167 struct extent_buffer *leaf; 1168 u32 size; 1169 u32 num_refs; 1170 int ret; 1171 1172 key.objectid = bytenr; 1173 if (parent) { 1174 key.type = BTRFS_SHARED_DATA_REF_KEY; 1175 key.offset = parent; 1176 size = sizeof(struct btrfs_shared_data_ref); 1177 } else { 1178 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1179 key.offset = hash_extent_data_ref(root_objectid, 1180 owner, offset); 1181 size = sizeof(struct btrfs_extent_data_ref); 1182 } 1183 1184 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1185 if (ret && ret != -EEXIST) 1186 goto fail; 1187 1188 leaf = path->nodes[0]; 1189 if (parent) { 1190 struct btrfs_shared_data_ref *ref; 1191 ref = btrfs_item_ptr(leaf, path->slots[0], 1192 struct btrfs_shared_data_ref); 1193 if (ret == 0) { 1194 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1195 } else { 1196 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1197 num_refs += refs_to_add; 1198 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1199 } 1200 } else { 1201 struct btrfs_extent_data_ref *ref; 1202 while (ret == -EEXIST) { 1203 ref = btrfs_item_ptr(leaf, path->slots[0], 1204 struct btrfs_extent_data_ref); 1205 if (match_extent_data_ref(leaf, ref, root_objectid, 1206 owner, offset)) 1207 break; 1208 btrfs_release_path(path); 1209 key.offset++; 1210 ret = btrfs_insert_empty_item(trans, root, path, &key, 1211 size); 1212 if (ret && ret != -EEXIST) 1213 goto fail; 1214 1215 leaf = path->nodes[0]; 1216 } 1217 ref = btrfs_item_ptr(leaf, path->slots[0], 1218 struct btrfs_extent_data_ref); 1219 if (ret == 0) { 1220 btrfs_set_extent_data_ref_root(leaf, ref, 1221 root_objectid); 1222 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1223 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1224 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1225 } else { 1226 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1227 num_refs += refs_to_add; 1228 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1229 } 1230 } 1231 btrfs_mark_buffer_dirty(leaf); 1232 ret = 0; 1233 fail: 1234 btrfs_release_path(path); 1235 return ret; 1236 } 1237 1238 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1239 struct btrfs_path *path, 1240 int refs_to_drop, int *last_ref) 1241 { 1242 struct btrfs_key key; 1243 struct btrfs_extent_data_ref *ref1 = NULL; 1244 struct btrfs_shared_data_ref *ref2 = NULL; 1245 struct extent_buffer *leaf; 1246 u32 num_refs = 0; 1247 int ret = 0; 1248 1249 leaf = path->nodes[0]; 1250 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1251 1252 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1253 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1254 struct btrfs_extent_data_ref); 1255 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1256 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1257 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1258 struct btrfs_shared_data_ref); 1259 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1260 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 1261 btrfs_print_v0_err(trans->fs_info); 1262 btrfs_abort_transaction(trans, -EINVAL); 1263 return -EINVAL; 1264 } else { 1265 BUG(); 1266 } 1267 1268 BUG_ON(num_refs < refs_to_drop); 1269 num_refs -= refs_to_drop; 1270 1271 if (num_refs == 0) { 1272 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1273 *last_ref = 1; 1274 } else { 1275 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1276 
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1277 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1278 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1279 btrfs_mark_buffer_dirty(leaf); 1280 } 1281 return ret; 1282 } 1283 1284 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 1285 struct btrfs_extent_inline_ref *iref) 1286 { 1287 struct btrfs_key key; 1288 struct extent_buffer *leaf; 1289 struct btrfs_extent_data_ref *ref1; 1290 struct btrfs_shared_data_ref *ref2; 1291 u32 num_refs = 0; 1292 int type; 1293 1294 leaf = path->nodes[0]; 1295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1296 1297 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 1298 if (iref) { 1299 /* 1300 * If type is invalid, we should have bailed out earlier than 1301 * this call. 1302 */ 1303 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 1304 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1305 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1306 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1307 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1308 } else { 1309 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1310 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1311 } 1312 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1313 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1314 struct btrfs_extent_data_ref); 1315 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1316 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1317 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1318 struct btrfs_shared_data_ref); 1319 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1320 } else { 1321 WARN_ON(1); 1322 } 1323 return num_refs; 1324 } 1325 1326 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1327 struct btrfs_path *path, 1328 u64 bytenr, u64 parent, 1329 u64 root_objectid) 1330 { 1331 struct btrfs_root *root = trans->fs_info->extent_root; 1332 struct btrfs_key key; 1333 int ret; 1334 1335 key.objectid = bytenr; 1336 if (parent) { 1337 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1338 key.offset = parent; 1339 } else { 1340 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1341 key.offset = root_objectid; 1342 } 1343 1344 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1345 if (ret > 0) 1346 ret = -ENOENT; 1347 return ret; 1348 } 1349 1350 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1351 struct btrfs_path *path, 1352 u64 bytenr, u64 parent, 1353 u64 root_objectid) 1354 { 1355 struct btrfs_key key; 1356 int ret; 1357 1358 key.objectid = bytenr; 1359 if (parent) { 1360 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1361 key.offset = parent; 1362 } else { 1363 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1364 key.offset = root_objectid; 1365 } 1366 1367 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, 1368 path, &key, 0); 1369 btrfs_release_path(path); 1370 return ret; 1371 } 1372 1373 static inline int extent_ref_type(u64 parent, u64 owner) 1374 { 1375 int type; 1376 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1377 if (parent > 0) 1378 type = BTRFS_SHARED_BLOCK_REF_KEY; 1379 else 1380 type = BTRFS_TREE_BLOCK_REF_KEY; 1381 } else { 1382 if (parent > 0) 1383 type = BTRFS_SHARED_DATA_REF_KEY; 1384 else 1385 type = BTRFS_EXTENT_DATA_REF_KEY; 1386 } 1387 return type; 1388 } 1389 1390 static int find_next_key(struct btrfs_path *path, int level, 1391 struct btrfs_key *key) 1392 1393 { 1394 for (; level < BTRFS_MAX_LEVEL; level++) { 1395 if (!path->nodes[level]) 1396 break; 1397 if (path->slots[level] + 1 >= 1398 
btrfs_header_nritems(path->nodes[level])) 1399 continue; 1400 if (level == 0) 1401 btrfs_item_key_to_cpu(path->nodes[level], key, 1402 path->slots[level] + 1); 1403 else 1404 btrfs_node_key_to_cpu(path->nodes[level], key, 1405 path->slots[level] + 1); 1406 return 0; 1407 } 1408 return 1; 1409 } 1410 1411 /* 1412 * look for inline back ref. if back ref is found, *ref_ret is set 1413 * to the address of inline back ref, and 0 is returned. 1414 * 1415 * if back ref isn't found, *ref_ret is set to the address where it 1416 * should be inserted, and -ENOENT is returned. 1417 * 1418 * if insert is true and there are too many inline back refs, the path 1419 * points to the extent item, and -EAGAIN is returned. 1420 * 1421 * NOTE: inline back refs are ordered in the same way that back ref 1422 * items in the tree are ordered. 1423 */ 1424 static noinline_for_stack 1425 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1426 struct btrfs_path *path, 1427 struct btrfs_extent_inline_ref **ref_ret, 1428 u64 bytenr, u64 num_bytes, 1429 u64 parent, u64 root_objectid, 1430 u64 owner, u64 offset, int insert) 1431 { 1432 struct btrfs_fs_info *fs_info = trans->fs_info; 1433 struct btrfs_root *root = fs_info->extent_root; 1434 struct btrfs_key key; 1435 struct extent_buffer *leaf; 1436 struct btrfs_extent_item *ei; 1437 struct btrfs_extent_inline_ref *iref; 1438 u64 flags; 1439 u64 item_size; 1440 unsigned long ptr; 1441 unsigned long end; 1442 int extra_size; 1443 int type; 1444 int want; 1445 int ret; 1446 int err = 0; 1447 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 1448 int needed; 1449 1450 key.objectid = bytenr; 1451 key.type = BTRFS_EXTENT_ITEM_KEY; 1452 key.offset = num_bytes; 1453 1454 want = extent_ref_type(parent, owner); 1455 if (insert) { 1456 extra_size = btrfs_extent_inline_ref_size(want); 1457 path->keep_locks = 1; 1458 } else 1459 extra_size = -1; 1460 1461 /* 1462 * Owner is our level, so we can just add one to get the level for the 1463 * block we are interested in. 1464 */ 1465 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1466 key.type = BTRFS_METADATA_ITEM_KEY; 1467 key.offset = owner; 1468 } 1469 1470 again: 1471 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1472 if (ret < 0) { 1473 err = ret; 1474 goto out; 1475 } 1476 1477 /* 1478 * We may be a newly converted file system which still has the old fat 1479 * extent entries for metadata, so try and see if we have one of those. 
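 *
 * Sketch of the two on-disk layouts handled here: with SKINNY_METADATA
 * a tree block is keyed as
 *
 *	(bytenr, BTRFS_METADATA_ITEM_KEY, level)
 *
 * while the old "fat" layout keys it as
 *
 *	(bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes)
 *
 * so a miss on the skinny key falls back to searching with the old key
 * below.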
1480 */ 1481 if (ret > 0 && skinny_metadata) { 1482 skinny_metadata = false; 1483 if (path->slots[0]) { 1484 path->slots[0]--; 1485 btrfs_item_key_to_cpu(path->nodes[0], &key, 1486 path->slots[0]); 1487 if (key.objectid == bytenr && 1488 key.type == BTRFS_EXTENT_ITEM_KEY && 1489 key.offset == num_bytes) 1490 ret = 0; 1491 } 1492 if (ret) { 1493 key.objectid = bytenr; 1494 key.type = BTRFS_EXTENT_ITEM_KEY; 1495 key.offset = num_bytes; 1496 btrfs_release_path(path); 1497 goto again; 1498 } 1499 } 1500 1501 if (ret && !insert) { 1502 err = -ENOENT; 1503 goto out; 1504 } else if (WARN_ON(ret)) { 1505 err = -EIO; 1506 goto out; 1507 } 1508 1509 leaf = path->nodes[0]; 1510 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1511 if (unlikely(item_size < sizeof(*ei))) { 1512 err = -EINVAL; 1513 btrfs_print_v0_err(fs_info); 1514 btrfs_abort_transaction(trans, err); 1515 goto out; 1516 } 1517 1518 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1519 flags = btrfs_extent_flags(leaf, ei); 1520 1521 ptr = (unsigned long)(ei + 1); 1522 end = (unsigned long)ei + item_size; 1523 1524 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1525 ptr += sizeof(struct btrfs_tree_block_info); 1526 BUG_ON(ptr > end); 1527 } 1528 1529 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 1530 needed = BTRFS_REF_TYPE_DATA; 1531 else 1532 needed = BTRFS_REF_TYPE_BLOCK; 1533 1534 err = -ENOENT; 1535 while (1) { 1536 if (ptr >= end) { 1537 WARN_ON(ptr > end); 1538 break; 1539 } 1540 iref = (struct btrfs_extent_inline_ref *)ptr; 1541 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 1542 if (type == BTRFS_REF_TYPE_INVALID) { 1543 err = -EUCLEAN; 1544 goto out; 1545 } 1546 1547 if (want < type) 1548 break; 1549 if (want > type) { 1550 ptr += btrfs_extent_inline_ref_size(type); 1551 continue; 1552 } 1553 1554 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1555 struct btrfs_extent_data_ref *dref; 1556 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1557 if (match_extent_data_ref(leaf, dref, root_objectid, 1558 owner, offset)) { 1559 err = 0; 1560 break; 1561 } 1562 if (hash_extent_data_ref_item(leaf, dref) < 1563 hash_extent_data_ref(root_objectid, owner, offset)) 1564 break; 1565 } else { 1566 u64 ref_offset; 1567 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1568 if (parent > 0) { 1569 if (parent == ref_offset) { 1570 err = 0; 1571 break; 1572 } 1573 if (ref_offset < parent) 1574 break; 1575 } else { 1576 if (root_objectid == ref_offset) { 1577 err = 0; 1578 break; 1579 } 1580 if (ref_offset < root_objectid) 1581 break; 1582 } 1583 } 1584 ptr += btrfs_extent_inline_ref_size(type); 1585 } 1586 if (err == -ENOENT && insert) { 1587 if (item_size + extra_size >= 1588 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1589 err = -EAGAIN; 1590 goto out; 1591 } 1592 /* 1593 * To add new inline back ref, we have to make sure 1594 * there is no corresponding back ref item. 
1595 * For simplicity, we just do not add new inline back 1596 * ref if there is any kind of item for this block 1597 */ 1598 if (find_next_key(path, 0, &key) == 0 && 1599 key.objectid == bytenr && 1600 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1601 err = -EAGAIN; 1602 goto out; 1603 } 1604 } 1605 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1606 out: 1607 if (insert) { 1608 path->keep_locks = 0; 1609 btrfs_unlock_up_safe(path, 1); 1610 } 1611 return err; 1612 } 1613 1614 /* 1615 * helper to add new inline back ref 1616 */ 1617 static noinline_for_stack 1618 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 1619 struct btrfs_path *path, 1620 struct btrfs_extent_inline_ref *iref, 1621 u64 parent, u64 root_objectid, 1622 u64 owner, u64 offset, int refs_to_add, 1623 struct btrfs_delayed_extent_op *extent_op) 1624 { 1625 struct extent_buffer *leaf; 1626 struct btrfs_extent_item *ei; 1627 unsigned long ptr; 1628 unsigned long end; 1629 unsigned long item_offset; 1630 u64 refs; 1631 int size; 1632 int type; 1633 1634 leaf = path->nodes[0]; 1635 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1636 item_offset = (unsigned long)iref - (unsigned long)ei; 1637 1638 type = extent_ref_type(parent, owner); 1639 size = btrfs_extent_inline_ref_size(type); 1640 1641 btrfs_extend_item(path, size); 1642 1643 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1644 refs = btrfs_extent_refs(leaf, ei); 1645 refs += refs_to_add; 1646 btrfs_set_extent_refs(leaf, ei, refs); 1647 if (extent_op) 1648 __run_delayed_extent_op(extent_op, leaf, ei); 1649 1650 ptr = (unsigned long)ei + item_offset; 1651 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1652 if (ptr < end - size) 1653 memmove_extent_buffer(leaf, ptr + size, ptr, 1654 end - size - ptr); 1655 1656 iref = (struct btrfs_extent_inline_ref *)ptr; 1657 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1658 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1659 struct btrfs_extent_data_ref *dref; 1660 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1661 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1662 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1663 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1664 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1665 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1666 struct btrfs_shared_data_ref *sref; 1667 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1668 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1669 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1670 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1671 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1672 } else { 1673 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1674 } 1675 btrfs_mark_buffer_dirty(leaf); 1676 } 1677 1678 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1679 struct btrfs_path *path, 1680 struct btrfs_extent_inline_ref **ref_ret, 1681 u64 bytenr, u64 num_bytes, u64 parent, 1682 u64 root_objectid, u64 owner, u64 offset) 1683 { 1684 int ret; 1685 1686 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1687 num_bytes, parent, root_objectid, 1688 owner, offset, 0); 1689 if (ret != -ENOENT) 1690 return ret; 1691 1692 btrfs_release_path(path); 1693 *ref_ret = NULL; 1694 1695 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1696 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1697 root_objectid); 1698 } else { 1699 ret = lookup_extent_data_ref(trans, path, 
bytenr, parent, 1700 root_objectid, owner, offset); 1701 } 1702 return ret; 1703 } 1704 1705 /* 1706 * helper to update/remove inline back ref 1707 */ 1708 static noinline_for_stack 1709 void update_inline_extent_backref(struct btrfs_path *path, 1710 struct btrfs_extent_inline_ref *iref, 1711 int refs_to_mod, 1712 struct btrfs_delayed_extent_op *extent_op, 1713 int *last_ref) 1714 { 1715 struct extent_buffer *leaf = path->nodes[0]; 1716 struct btrfs_extent_item *ei; 1717 struct btrfs_extent_data_ref *dref = NULL; 1718 struct btrfs_shared_data_ref *sref = NULL; 1719 unsigned long ptr; 1720 unsigned long end; 1721 u32 item_size; 1722 int size; 1723 int type; 1724 u64 refs; 1725 1726 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1727 refs = btrfs_extent_refs(leaf, ei); 1728 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1729 refs += refs_to_mod; 1730 btrfs_set_extent_refs(leaf, ei, refs); 1731 if (extent_op) 1732 __run_delayed_extent_op(extent_op, leaf, ei); 1733 1734 /* 1735 * If type is invalid, we should have bailed out after 1736 * lookup_inline_extent_backref(). 1737 */ 1738 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1739 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1740 1741 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1742 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1743 refs = btrfs_extent_data_ref_count(leaf, dref); 1744 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1745 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1746 refs = btrfs_shared_data_ref_count(leaf, sref); 1747 } else { 1748 refs = 1; 1749 BUG_ON(refs_to_mod != -1); 1750 } 1751 1752 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1753 refs += refs_to_mod; 1754 1755 if (refs > 0) { 1756 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1757 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1758 else 1759 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1760 } else { 1761 *last_ref = 1; 1762 size = btrfs_extent_inline_ref_size(type); 1763 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1764 ptr = (unsigned long)iref; 1765 end = (unsigned long)ei + item_size; 1766 if (ptr + size < end) 1767 memmove_extent_buffer(leaf, ptr, ptr + size, 1768 end - ptr - size); 1769 item_size -= size; 1770 btrfs_truncate_item(path, item_size, 1); 1771 } 1772 btrfs_mark_buffer_dirty(leaf); 1773 } 1774 1775 static noinline_for_stack 1776 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1777 struct btrfs_path *path, 1778 u64 bytenr, u64 num_bytes, u64 parent, 1779 u64 root_objectid, u64 owner, 1780 u64 offset, int refs_to_add, 1781 struct btrfs_delayed_extent_op *extent_op) 1782 { 1783 struct btrfs_extent_inline_ref *iref; 1784 int ret; 1785 1786 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1787 num_bytes, parent, root_objectid, 1788 owner, offset, 1); 1789 if (ret == 0) { 1790 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1791 update_inline_extent_backref(path, iref, refs_to_add, 1792 extent_op, NULL); 1793 } else if (ret == -ENOENT) { 1794 setup_inline_extent_backref(trans->fs_info, path, iref, parent, 1795 root_objectid, owner, offset, 1796 refs_to_add, extent_op); 1797 ret = 0; 1798 } 1799 return ret; 1800 } 1801 1802 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1803 struct btrfs_path *path, 1804 u64 bytenr, u64 parent, u64 root_objectid, 1805 u64 owner, u64 offset, int refs_to_add) 1806 { 1807 int ret; 1808 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1809 BUG_ON(refs_to_add != 1); 1810 ret = insert_tree_block_ref(trans, path, 
bytenr, parent, 1811 root_objectid); 1812 } else { 1813 ret = insert_extent_data_ref(trans, path, bytenr, parent, 1814 root_objectid, owner, offset, 1815 refs_to_add); 1816 } 1817 return ret; 1818 } 1819 1820 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1821 struct btrfs_path *path, 1822 struct btrfs_extent_inline_ref *iref, 1823 int refs_to_drop, int is_data, int *last_ref) 1824 { 1825 int ret = 0; 1826 1827 BUG_ON(!is_data && refs_to_drop != 1); 1828 if (iref) { 1829 update_inline_extent_backref(path, iref, -refs_to_drop, NULL, 1830 last_ref); 1831 } else if (is_data) { 1832 ret = remove_extent_data_ref(trans, path, refs_to_drop, 1833 last_ref); 1834 } else { 1835 *last_ref = 1; 1836 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1837 } 1838 return ret; 1839 } 1840 1841 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1842 u64 *discarded_bytes) 1843 { 1844 int j, ret = 0; 1845 u64 bytes_left, end; 1846 u64 aligned_start = ALIGN(start, 1 << 9); 1847 1848 if (WARN_ON(start != aligned_start)) { 1849 len -= aligned_start - start; 1850 len = round_down(len, 1 << 9); 1851 start = aligned_start; 1852 } 1853 1854 *discarded_bytes = 0; 1855 1856 if (!len) 1857 return 0; 1858 1859 end = start + len; 1860 bytes_left = len; 1861 1862 /* Skip any superblocks on this device. */ 1863 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1864 u64 sb_start = btrfs_sb_offset(j); 1865 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1866 u64 size = sb_start - start; 1867 1868 if (!in_range(sb_start, start, bytes_left) && 1869 !in_range(sb_end, start, bytes_left) && 1870 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1871 continue; 1872 1873 /* 1874 * Superblock spans beginning of range. Adjust start and 1875 * try again. 1876 */ 1877 if (sb_start <= start) { 1878 start += sb_end - start; 1879 if (start > end) { 1880 bytes_left = 0; 1881 break; 1882 } 1883 bytes_left = end - start; 1884 continue; 1885 } 1886 1887 if (size) { 1888 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 1889 GFP_NOFS, 0); 1890 if (!ret) 1891 *discarded_bytes += size; 1892 else if (ret != -EOPNOTSUPP) 1893 return ret; 1894 } 1895 1896 start = sb_end; 1897 if (start > end) { 1898 bytes_left = 0; 1899 break; 1900 } 1901 bytes_left = end - start; 1902 } 1903 1904 if (bytes_left) { 1905 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 1906 GFP_NOFS, 0); 1907 if (!ret) 1908 *discarded_bytes += bytes_left; 1909 } 1910 return ret; 1911 } 1912 1913 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1914 u64 num_bytes, u64 *actual_bytes) 1915 { 1916 int ret; 1917 u64 discarded_bytes = 0; 1918 struct btrfs_bio *bbio = NULL; 1919 1920 1921 /* 1922 * Avoid races with device replace and make sure our bbio has devices 1923 * associated to its stripes that don't go away while we are discarding. 
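 *
 * Note that btrfs_bio_counter_inc_blocked() here is paired with the
 * btrfs_bio_counter_dec() after the per-stripe loop below; holding the
 * counter is what keeps the stripe devices from disappearing under a
 * concurrent device replace while we issue the discards.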
1924 */ 1925 btrfs_bio_counter_inc_blocked(fs_info); 1926 /* Tell the block device(s) that the sectors can be discarded */ 1927 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 1928 &bbio, 0); 1929 /* Error condition is -ENOMEM */ 1930 if (!ret) { 1931 struct btrfs_bio_stripe *stripe = bbio->stripes; 1932 int i; 1933 1934 1935 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1936 u64 bytes; 1937 struct request_queue *req_q; 1938 1939 if (!stripe->dev->bdev) { 1940 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 1941 continue; 1942 } 1943 req_q = bdev_get_queue(stripe->dev->bdev); 1944 if (!blk_queue_discard(req_q)) 1945 continue; 1946 1947 ret = btrfs_issue_discard(stripe->dev->bdev, 1948 stripe->physical, 1949 stripe->length, 1950 &bytes); 1951 if (!ret) 1952 discarded_bytes += bytes; 1953 else if (ret != -EOPNOTSUPP) 1954 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 1955 1956 /* 1957 * Just in case we get back EOPNOTSUPP for some reason, 1958 * just ignore the return value so we don't screw up 1959 * people calling discard_extent. 1960 */ 1961 ret = 0; 1962 } 1963 btrfs_put_bbio(bbio); 1964 } 1965 btrfs_bio_counter_dec(fs_info); 1966 1967 if (actual_bytes) 1968 *actual_bytes = discarded_bytes; 1969 1970 1971 if (ret == -EOPNOTSUPP) 1972 ret = 0; 1973 return ret; 1974 } 1975 1976 /* Can return -ENOMEM */ 1977 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1978 struct btrfs_ref *generic_ref) 1979 { 1980 struct btrfs_fs_info *fs_info = trans->fs_info; 1981 int old_ref_mod, new_ref_mod; 1982 int ret; 1983 1984 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && 1985 generic_ref->action); 1986 BUG_ON(generic_ref->type == BTRFS_REF_METADATA && 1987 generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); 1988 1989 if (generic_ref->type == BTRFS_REF_METADATA) 1990 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, 1991 NULL, &old_ref_mod, &new_ref_mod); 1992 else 1993 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, 1994 &old_ref_mod, &new_ref_mod); 1995 1996 btrfs_ref_tree_mod(fs_info, generic_ref); 1997 1998 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) 1999 sub_pinned_bytes(fs_info, generic_ref); 2000 2001 return ret; 2002 } 2003 2004 /* 2005 * __btrfs_inc_extent_ref - insert backreference for a given extent 2006 * 2007 * @trans: Handle of transaction 2008 * 2009 * @node: The delayed ref node used to get the bytenr/length for 2010 * extent whose references are incremented. 2011 * 2012 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ 2013 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical 2014 * bytenr of the parent block. Since new extents are always 2015 * created with indirect references, this will only be the case 2016 * when relocating a shared extent. In that case, root_objectid 2017 * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must 2018 * be 0 2019 * 2020 * @root_objectid: The id of the root where this modification has originated, 2021 * this can be either one of the well-known metadata trees or 2022 * the subvolume id which references this extent. 2023 * 2024 * @owner: For data extents it is the inode number of the owning file. 2025 * For metadata extents this parameter holds the level in the 2026 * tree of the extent. 2027 * 2028 * @offset: For metadata extents the offset is ignored and is currently 2029 * always passed as 0. For data extents it is the fileoffset 2030 * this extent belongs to. 
2031 * 2032 * @refs_to_add: Number of references to add 2033 * 2034 * @extent_op: Pointer to a structure, holding information necessary when 2035 * updating a tree block's flags 2036 * 2037 */ 2038 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2039 struct btrfs_delayed_ref_node *node, 2040 u64 parent, u64 root_objectid, 2041 u64 owner, u64 offset, int refs_to_add, 2042 struct btrfs_delayed_extent_op *extent_op) 2043 { 2044 struct btrfs_path *path; 2045 struct extent_buffer *leaf; 2046 struct btrfs_extent_item *item; 2047 struct btrfs_key key; 2048 u64 bytenr = node->bytenr; 2049 u64 num_bytes = node->num_bytes; 2050 u64 refs; 2051 int ret; 2052 2053 path = btrfs_alloc_path(); 2054 if (!path) 2055 return -ENOMEM; 2056 2057 path->reada = READA_FORWARD; 2058 path->leave_spinning = 1; 2059 /* this will set up the path even if it fails to insert the back ref */ 2060 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, 2061 parent, root_objectid, owner, 2062 offset, refs_to_add, extent_op); 2063 if ((ret < 0 && ret != -EAGAIN) || !ret) 2064 goto out; 2065 2066 /* 2067 * Ok we had -EAGAIN which means we didn't have space to insert an 2068 * inline extent ref, so just update the reference count and add a 2069 * normal backref. 2070 */ 2071 leaf = path->nodes[0]; 2072 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2073 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2074 refs = btrfs_extent_refs(leaf, item); 2075 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2076 if (extent_op) 2077 __run_delayed_extent_op(extent_op, leaf, item); 2078 2079 btrfs_mark_buffer_dirty(leaf); 2080 btrfs_release_path(path); 2081 2082 path->reada = READA_FORWARD; 2083 path->leave_spinning = 1; 2084 /* now insert the actual backref */ 2085 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, 2086 owner, offset, refs_to_add); 2087 if (ret) 2088 btrfs_abort_transaction(trans, ret); 2089 out: 2090 btrfs_free_path(path); 2091 return ret; 2092 } 2093 2094 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2095 struct btrfs_delayed_ref_node *node, 2096 struct btrfs_delayed_extent_op *extent_op, 2097 int insert_reserved) 2098 { 2099 int ret = 0; 2100 struct btrfs_delayed_data_ref *ref; 2101 struct btrfs_key ins; 2102 u64 parent = 0; 2103 u64 ref_root = 0; 2104 u64 flags = 0; 2105 2106 ins.objectid = node->bytenr; 2107 ins.offset = node->num_bytes; 2108 ins.type = BTRFS_EXTENT_ITEM_KEY; 2109 2110 ref = btrfs_delayed_node_to_data_ref(node); 2111 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); 2112 2113 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2114 parent = ref->parent; 2115 ref_root = ref->root; 2116 2117 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2118 if (extent_op) 2119 flags |= extent_op->flags_to_set; 2120 ret = alloc_reserved_file_extent(trans, parent, ref_root, 2121 flags, ref->objectid, 2122 ref->offset, &ins, 2123 node->ref_mod); 2124 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2125 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2126 ref->objectid, ref->offset, 2127 node->ref_mod, extent_op); 2128 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2129 ret = __btrfs_free_extent(trans, node, parent, 2130 ref_root, ref->objectid, 2131 ref->offset, node->ref_mod, 2132 extent_op); 2133 } else { 2134 BUG(); 2135 } 2136 return ret; 2137 } 2138 2139 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2140 struct extent_buffer *leaf,
2141 struct btrfs_extent_item *ei) 2142 { 2143 u64 flags = btrfs_extent_flags(leaf, ei); 2144 if (extent_op->update_flags) { 2145 flags |= extent_op->flags_to_set; 2146 btrfs_set_extent_flags(leaf, ei, flags); 2147 } 2148 2149 if (extent_op->update_key) { 2150 struct btrfs_tree_block_info *bi; 2151 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2152 bi = (struct btrfs_tree_block_info *)(ei + 1); 2153 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2154 } 2155 } 2156 2157 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2158 struct btrfs_delayed_ref_head *head, 2159 struct btrfs_delayed_extent_op *extent_op) 2160 { 2161 struct btrfs_fs_info *fs_info = trans->fs_info; 2162 struct btrfs_key key; 2163 struct btrfs_path *path; 2164 struct btrfs_extent_item *ei; 2165 struct extent_buffer *leaf; 2166 u32 item_size; 2167 int ret; 2168 int err = 0; 2169 int metadata = !extent_op->is_data; 2170 2171 if (trans->aborted) 2172 return 0; 2173 2174 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2175 metadata = 0; 2176 2177 path = btrfs_alloc_path(); 2178 if (!path) 2179 return -ENOMEM; 2180 2181 key.objectid = head->bytenr; 2182 2183 if (metadata) { 2184 key.type = BTRFS_METADATA_ITEM_KEY; 2185 key.offset = extent_op->level; 2186 } else { 2187 key.type = BTRFS_EXTENT_ITEM_KEY; 2188 key.offset = head->num_bytes; 2189 } 2190 2191 again: 2192 path->reada = READA_FORWARD; 2193 path->leave_spinning = 1; 2194 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2195 if (ret < 0) { 2196 err = ret; 2197 goto out; 2198 } 2199 if (ret > 0) { 2200 if (metadata) { 2201 if (path->slots[0] > 0) { 2202 path->slots[0]--; 2203 btrfs_item_key_to_cpu(path->nodes[0], &key, 2204 path->slots[0]); 2205 if (key.objectid == head->bytenr && 2206 key.type == BTRFS_EXTENT_ITEM_KEY && 2207 key.offset == head->num_bytes) 2208 ret = 0; 2209 } 2210 if (ret > 0) { 2211 btrfs_release_path(path); 2212 metadata = 0; 2213 2214 key.objectid = head->bytenr; 2215 key.offset = head->num_bytes; 2216 key.type = BTRFS_EXTENT_ITEM_KEY; 2217 goto again; 2218 } 2219 } else { 2220 err = -EIO; 2221 goto out; 2222 } 2223 } 2224 2225 leaf = path->nodes[0]; 2226 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2227 2228 if (unlikely(item_size < sizeof(*ei))) { 2229 err = -EINVAL; 2230 btrfs_print_v0_err(fs_info); 2231 btrfs_abort_transaction(trans, err); 2232 goto out; 2233 } 2234 2235 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2236 __run_delayed_extent_op(extent_op, leaf, ei); 2237 2238 btrfs_mark_buffer_dirty(leaf); 2239 out: 2240 btrfs_free_path(path); 2241 return err; 2242 } 2243 2244 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2245 struct btrfs_delayed_ref_node *node, 2246 struct btrfs_delayed_extent_op *extent_op, 2247 int insert_reserved) 2248 { 2249 int ret = 0; 2250 struct btrfs_delayed_tree_ref *ref; 2251 u64 parent = 0; 2252 u64 ref_root = 0; 2253 2254 ref = btrfs_delayed_node_to_tree_ref(node); 2255 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); 2256 2257 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2258 parent = ref->parent; 2259 ref_root = ref->root; 2260 2261 if (node->ref_mod != 1) { 2262 btrfs_err(trans->fs_info, 2263 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", 2264 node->bytenr, node->ref_mod, node->action, ref_root, 2265 parent); 2266 return -EIO; 2267 } 2268 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2269 BUG_ON(!extent_op || 
!extent_op->update_flags); 2270 ret = alloc_reserved_tree_block(trans, node, extent_op); 2271 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2272 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2273 ref->level, 0, 1, extent_op); 2274 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2275 ret = __btrfs_free_extent(trans, node, parent, ref_root, 2276 ref->level, 0, 1, extent_op); 2277 } else { 2278 BUG(); 2279 } 2280 return ret; 2281 } 2282 2283 /* helper function to actually process a single delayed ref entry */ 2284 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2285 struct btrfs_delayed_ref_node *node, 2286 struct btrfs_delayed_extent_op *extent_op, 2287 int insert_reserved) 2288 { 2289 int ret = 0; 2290 2291 if (trans->aborted) { 2292 if (insert_reserved) 2293 btrfs_pin_extent(trans->fs_info, node->bytenr, 2294 node->num_bytes, 1); 2295 return 0; 2296 } 2297 2298 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2299 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2300 ret = run_delayed_tree_ref(trans, node, extent_op, 2301 insert_reserved); 2302 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2303 node->type == BTRFS_SHARED_DATA_REF_KEY) 2304 ret = run_delayed_data_ref(trans, node, extent_op, 2305 insert_reserved); 2306 else 2307 BUG(); 2308 if (ret && insert_reserved) 2309 btrfs_pin_extent(trans->fs_info, node->bytenr, 2310 node->num_bytes, 1); 2311 return ret; 2312 } 2313 2314 static inline struct btrfs_delayed_ref_node * 2315 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2316 { 2317 struct btrfs_delayed_ref_node *ref; 2318 2319 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 2320 return NULL; 2321 2322 /* 2323 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2324 * This is to prevent a ref count from going down to zero, which deletes 2325 * the extent item from the extent tree, when there still are references 2326 * to add, which would fail because they would not find the extent item. 2327 */ 2328 if (!list_empty(&head->ref_add_list)) 2329 return list_first_entry(&head->ref_add_list, 2330 struct btrfs_delayed_ref_node, add_list); 2331 2332 ref = rb_entry(rb_first_cached(&head->ref_tree), 2333 struct btrfs_delayed_ref_node, ref_node); 2334 ASSERT(list_empty(&ref->add_list)); 2335 return ref; 2336 } 2337 2338 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 2339 struct btrfs_delayed_ref_head *head) 2340 { 2341 spin_lock(&delayed_refs->lock); 2342 head->processing = 0; 2343 delayed_refs->num_heads_ready++; 2344 spin_unlock(&delayed_refs->lock); 2345 btrfs_delayed_ref_unlock(head); 2346 } 2347 2348 static struct btrfs_delayed_extent_op *cleanup_extent_op( 2349 struct btrfs_delayed_ref_head *head) 2350 { 2351 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 2352 2353 if (!extent_op) 2354 return NULL; 2355 2356 if (head->must_insert_reserved) { 2357 head->extent_op = NULL; 2358 btrfs_free_delayed_extent_op(extent_op); 2359 return NULL; 2360 } 2361 return extent_op; 2362 } 2363 2364 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 2365 struct btrfs_delayed_ref_head *head) 2366 { 2367 struct btrfs_delayed_extent_op *extent_op; 2368 int ret; 2369 2370 extent_op = cleanup_extent_op(head); 2371 if (!extent_op) 2372 return 0; 2373 head->extent_op = NULL; 2374 spin_unlock(&head->lock); 2375 ret = run_delayed_extent_op(trans, head, extent_op); 2376 btrfs_free_delayed_extent_op(extent_op); 2377 return ret ? 
ret : 1; 2378 } 2379 2380 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, 2381 struct btrfs_delayed_ref_root *delayed_refs, 2382 struct btrfs_delayed_ref_head *head) 2383 { 2384 int nr_items = 1; /* Dropping this ref head update. */ 2385 2386 if (head->total_ref_mod < 0) { 2387 struct btrfs_space_info *space_info; 2388 u64 flags; 2389 2390 if (head->is_data) 2391 flags = BTRFS_BLOCK_GROUP_DATA; 2392 else if (head->is_system) 2393 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2394 else 2395 flags = BTRFS_BLOCK_GROUP_METADATA; 2396 space_info = btrfs_find_space_info(fs_info, flags); 2397 ASSERT(space_info); 2398 percpu_counter_add_batch(&space_info->total_bytes_pinned, 2399 -head->num_bytes, 2400 BTRFS_TOTAL_BYTES_PINNED_BATCH); 2401 2402 /* 2403 * We had csum deletions accounted for in our delayed refs rsv, 2404 * we need to drop the csum leaves for this update from our 2405 * delayed_refs_rsv. 2406 */ 2407 if (head->is_data) { 2408 spin_lock(&delayed_refs->lock); 2409 delayed_refs->pending_csums -= head->num_bytes; 2410 spin_unlock(&delayed_refs->lock); 2411 nr_items += btrfs_csum_bytes_to_leaves(fs_info, 2412 head->num_bytes); 2413 } 2414 } 2415 2416 btrfs_delayed_refs_rsv_release(fs_info, nr_items); 2417 } 2418 2419 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 2420 struct btrfs_delayed_ref_head *head) 2421 { 2422 2423 struct btrfs_fs_info *fs_info = trans->fs_info; 2424 struct btrfs_delayed_ref_root *delayed_refs; 2425 int ret; 2426 2427 delayed_refs = &trans->transaction->delayed_refs; 2428 2429 ret = run_and_cleanup_extent_op(trans, head); 2430 if (ret < 0) { 2431 unselect_delayed_ref_head(delayed_refs, head); 2432 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2433 return ret; 2434 } else if (ret) { 2435 return ret; 2436 } 2437 2438 /* 2439 * Need to drop our head ref lock and re-acquire the delayed ref lock 2440 * and then re-check to make sure nobody got added. 
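* head->lock is dropped below and both locks are then taken again in
* delayed_refs->lock -> head->lock order before re-checking the ref tree
* and extent_op.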
2441 */ 2442 spin_unlock(&head->lock); 2443 spin_lock(&delayed_refs->lock); 2444 spin_lock(&head->lock); 2445 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 2446 spin_unlock(&head->lock); 2447 spin_unlock(&delayed_refs->lock); 2448 return 1; 2449 } 2450 btrfs_delete_ref_head(delayed_refs, head); 2451 spin_unlock(&head->lock); 2452 spin_unlock(&delayed_refs->lock); 2453 2454 if (head->must_insert_reserved) { 2455 btrfs_pin_extent(fs_info, head->bytenr, 2456 head->num_bytes, 1); 2457 if (head->is_data) { 2458 ret = btrfs_del_csums(trans, fs_info, head->bytenr, 2459 head->num_bytes); 2460 } 2461 } 2462 2463 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 2464 2465 trace_run_delayed_ref_head(fs_info, head, 0); 2466 btrfs_delayed_ref_unlock(head); 2467 btrfs_put_delayed_ref_head(head); 2468 return 0; 2469 } 2470 2471 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( 2472 struct btrfs_trans_handle *trans) 2473 { 2474 struct btrfs_delayed_ref_root *delayed_refs = 2475 &trans->transaction->delayed_refs; 2476 struct btrfs_delayed_ref_head *head = NULL; 2477 int ret; 2478 2479 spin_lock(&delayed_refs->lock); 2480 head = btrfs_select_ref_head(delayed_refs); 2481 if (!head) { 2482 spin_unlock(&delayed_refs->lock); 2483 return head; 2484 } 2485 2486 /* 2487 * Grab the lock that says we are going to process all the refs for 2488 * this head 2489 */ 2490 ret = btrfs_delayed_ref_lock(delayed_refs, head); 2491 spin_unlock(&delayed_refs->lock); 2492 2493 /* 2494 * We may have dropped the spin lock to get the head mutex lock, and 2495 * that might have given someone else time to free the head. If that's 2496 * true, it has been removed from our list and we can move on. 2497 */ 2498 if (ret == -EAGAIN) 2499 head = ERR_PTR(-EAGAIN); 2500 2501 return head; 2502 } 2503 2504 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 2505 struct btrfs_delayed_ref_head *locked_ref, 2506 unsigned long *run_refs) 2507 { 2508 struct btrfs_fs_info *fs_info = trans->fs_info; 2509 struct btrfs_delayed_ref_root *delayed_refs; 2510 struct btrfs_delayed_extent_op *extent_op; 2511 struct btrfs_delayed_ref_node *ref; 2512 int must_insert_reserved = 0; 2513 int ret; 2514 2515 delayed_refs = &trans->transaction->delayed_refs; 2516 2517 lockdep_assert_held(&locked_ref->mutex); 2518 lockdep_assert_held(&locked_ref->lock); 2519 2520 while ((ref = select_delayed_ref(locked_ref))) { 2521 if (ref->seq && 2522 btrfs_check_delayed_seq(fs_info, ref->seq)) { 2523 spin_unlock(&locked_ref->lock); 2524 unselect_delayed_ref_head(delayed_refs, locked_ref); 2525 return -EAGAIN; 2526 } 2527 2528 (*run_refs)++; 2529 ref->in_tree = 0; 2530 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 2531 RB_CLEAR_NODE(&ref->ref_node); 2532 if (!list_empty(&ref->add_list)) 2533 list_del(&ref->add_list); 2534 /* 2535 * When we play the delayed ref, also correct the ref_mod on 2536 * head 2537 */ 2538 switch (ref->action) { 2539 case BTRFS_ADD_DELAYED_REF: 2540 case BTRFS_ADD_DELAYED_EXTENT: 2541 locked_ref->ref_mod -= ref->ref_mod; 2542 break; 2543 case BTRFS_DROP_DELAYED_REF: 2544 locked_ref->ref_mod += ref->ref_mod; 2545 break; 2546 default: 2547 WARN_ON(1); 2548 } 2549 atomic_dec(&delayed_refs->num_entries); 2550 2551 /* 2552 * Record the must_insert_reserved flag before we drop the 2553 * spin lock. 
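* The flag is also cleared while still under the lock so nobody else can
* act on it; run_one_delayed_ref() below then runs without
* locked_ref->lock held.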
2554 */ 2555 must_insert_reserved = locked_ref->must_insert_reserved; 2556 locked_ref->must_insert_reserved = 0; 2557 2558 extent_op = locked_ref->extent_op; 2559 locked_ref->extent_op = NULL; 2560 spin_unlock(&locked_ref->lock); 2561 2562 ret = run_one_delayed_ref(trans, ref, extent_op, 2563 must_insert_reserved); 2564 2565 btrfs_free_delayed_extent_op(extent_op); 2566 if (ret) { 2567 unselect_delayed_ref_head(delayed_refs, locked_ref); 2568 btrfs_put_delayed_ref(ref); 2569 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2570 ret); 2571 return ret; 2572 } 2573 2574 btrfs_put_delayed_ref(ref); 2575 cond_resched(); 2576 2577 spin_lock(&locked_ref->lock); 2578 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2579 } 2580 2581 return 0; 2582 } 2583 2584 /* 2585 * Returns 0 on success or if called with an already aborted transaction. 2586 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2587 */ 2588 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2589 unsigned long nr) 2590 { 2591 struct btrfs_fs_info *fs_info = trans->fs_info; 2592 struct btrfs_delayed_ref_root *delayed_refs; 2593 struct btrfs_delayed_ref_head *locked_ref = NULL; 2594 ktime_t start = ktime_get(); 2595 int ret; 2596 unsigned long count = 0; 2597 unsigned long actual_count = 0; 2598 2599 delayed_refs = &trans->transaction->delayed_refs; 2600 do { 2601 if (!locked_ref) { 2602 locked_ref = btrfs_obtain_ref_head(trans); 2603 if (IS_ERR_OR_NULL(locked_ref)) { 2604 if (PTR_ERR(locked_ref) == -EAGAIN) { 2605 continue; 2606 } else { 2607 break; 2608 } 2609 } 2610 count++; 2611 } 2612 /* 2613 * We need to try and merge add/drops of the same ref since we 2614 * can run into issues with relocate dropping the implicit ref 2615 * and then it being added back again before the drop can 2616 * finish. If we merged anything we need to re-loop so we can 2617 * get a good ref. 2618 * Or we can get node references of the same type that weren't 2619 * merged when created due to bumps in the tree mod seq, and 2620 * we need to merge them to prevent adding an inline extent 2621 * backref before dropping it (triggering a BUG_ON at 2622 * insert_inline_extent_backref()). 2623 */ 2624 spin_lock(&locked_ref->lock); 2625 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2626 2627 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, 2628 &actual_count); 2629 if (ret < 0 && ret != -EAGAIN) { 2630 /* 2631 * Error, btrfs_run_delayed_refs_for_head already 2632 * unlocked everything so just bail out 2633 */ 2634 return ret; 2635 } else if (!ret) { 2636 /* 2637 * Success, perform the usual cleanup of a processed 2638 * head 2639 */ 2640 ret = cleanup_ref_head(trans, locked_ref); 2641 if (ret > 0 ) { 2642 /* We dropped our lock, we need to loop. */ 2643 ret = 0; 2644 continue; 2645 } else if (ret) { 2646 return ret; 2647 } 2648 } 2649 2650 /* 2651 * Either success case or btrfs_run_delayed_refs_for_head 2652 * returned -EAGAIN, meaning we need to select another head 2653 */ 2654 2655 locked_ref = NULL; 2656 cond_resched(); 2657 } while ((nr != -1 && count < nr) || locked_ref); 2658 2659 /* 2660 * We don't want to include ref heads since we can have empty ref heads 2661 * and those will drastically skew our runtime down since we just do 2662 * accounting, no actual extent tree updates. 
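* The average kept below is a weighted one, new = (3 * old + runtime) / 4.
* For example, an old average of 8 and a measured runtime of 4 (both in
* nanoseconds here) give (3 * 8 + 4) / 4 = 7.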
2663 */ 2664 if (actual_count > 0) { 2665 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2666 u64 avg; 2667 2668 /* 2669 * We weigh the current average higher than our current runtime 2670 * to avoid large swings in the average. 2671 */ 2672 spin_lock(&delayed_refs->lock); 2673 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2674 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2675 spin_unlock(&delayed_refs->lock); 2676 } 2677 return 0; 2678 } 2679 2680 #ifdef SCRAMBLE_DELAYED_REFS 2681 /* 2682 * Normally delayed refs get processed in ascending bytenr order. This 2683 * correlates in most cases to the order added. To expose dependencies on this 2684 * order, we start to process the tree in the middle instead of the beginning 2685 */ 2686 static u64 find_middle(struct rb_root *root) 2687 { 2688 struct rb_node *n = root->rb_node; 2689 struct btrfs_delayed_ref_node *entry; 2690 int alt = 1; 2691 u64 middle; 2692 u64 first = 0, last = 0; 2693 2694 n = rb_first(root); 2695 if (n) { 2696 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2697 first = entry->bytenr; 2698 } 2699 n = rb_last(root); 2700 if (n) { 2701 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2702 last = entry->bytenr; 2703 } 2704 n = root->rb_node; 2705 2706 while (n) { 2707 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2708 WARN_ON(!entry->in_tree); 2709 2710 middle = entry->bytenr; 2711 2712 if (alt) 2713 n = n->rb_left; 2714 else 2715 n = n->rb_right; 2716 2717 alt = 1 - alt; 2718 } 2719 return middle; 2720 } 2721 #endif 2722 2723 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2724 { 2725 u64 num_bytes; 2726 2727 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2728 sizeof(struct btrfs_extent_inline_ref)); 2729 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2730 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2731 2732 /* 2733 * We don't ever fill up leaves all the way so multiply by 2 just to be 2734 * closer to what we're really going to want to use. 2735 */ 2736 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2737 } 2738 2739 /* 2740 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2741 * would require to store the csums for that many bytes. 2742 */ 2743 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2744 { 2745 u64 csum_size; 2746 u64 num_csums_per_leaf; 2747 u64 num_csums; 2748 2749 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2750 num_csums_per_leaf = div64_u64(csum_size, 2751 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2752 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2753 num_csums += num_csums_per_leaf - 1; 2754 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2755 return num_csums; 2756 } 2757 2758 /* 2759 * this starts processing the delayed reference count updates and 2760 * extent insertions we have queued up so far. count can be 2761 * 0, which means to process everything in the tree at the start 2762 * of the run (but not newly added entries), or it can be some target 2763 * number you'd like to process. 
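* A count of (unsigned long)-1 means keep going until no delayed ref heads
* remain, creating any pending block groups along the way.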
2764 * 2765 * Returns 0 on success or if called with an aborted transaction 2766 * Returns <0 on error and aborts the transaction 2767 */ 2768 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2769 unsigned long count) 2770 { 2771 struct btrfs_fs_info *fs_info = trans->fs_info; 2772 struct rb_node *node; 2773 struct btrfs_delayed_ref_root *delayed_refs; 2774 struct btrfs_delayed_ref_head *head; 2775 int ret; 2776 int run_all = count == (unsigned long)-1; 2777 2778 /* We'll clean this up in btrfs_cleanup_transaction */ 2779 if (trans->aborted) 2780 return 0; 2781 2782 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2783 return 0; 2784 2785 delayed_refs = &trans->transaction->delayed_refs; 2786 if (count == 0) 2787 count = atomic_read(&delayed_refs->num_entries) * 2; 2788 2789 again: 2790 #ifdef SCRAMBLE_DELAYED_REFS 2791 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2792 #endif 2793 ret = __btrfs_run_delayed_refs(trans, count); 2794 if (ret < 0) { 2795 btrfs_abort_transaction(trans, ret); 2796 return ret; 2797 } 2798 2799 if (run_all) { 2800 btrfs_create_pending_block_groups(trans); 2801 2802 spin_lock(&delayed_refs->lock); 2803 node = rb_first_cached(&delayed_refs->href_root); 2804 if (!node) { 2805 spin_unlock(&delayed_refs->lock); 2806 goto out; 2807 } 2808 head = rb_entry(node, struct btrfs_delayed_ref_head, 2809 href_node); 2810 refcount_inc(&head->refs); 2811 spin_unlock(&delayed_refs->lock); 2812 2813 /* Mutex was contended, block until it's released and retry. */ 2814 mutex_lock(&head->mutex); 2815 mutex_unlock(&head->mutex); 2816 2817 btrfs_put_delayed_ref_head(head); 2818 cond_resched(); 2819 goto again; 2820 } 2821 out: 2822 return 0; 2823 } 2824 2825 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2826 u64 bytenr, u64 num_bytes, u64 flags, 2827 int level, int is_data) 2828 { 2829 struct btrfs_delayed_extent_op *extent_op; 2830 int ret; 2831 2832 extent_op = btrfs_alloc_delayed_extent_op(); 2833 if (!extent_op) 2834 return -ENOMEM; 2835 2836 extent_op->flags_to_set = flags; 2837 extent_op->update_flags = true; 2838 extent_op->update_key = false; 2839 extent_op->is_data = is_data ? 
true : false; 2840 extent_op->level = level; 2841 2842 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2843 if (ret) 2844 btrfs_free_delayed_extent_op(extent_op); 2845 return ret; 2846 } 2847 2848 static noinline int check_delayed_ref(struct btrfs_root *root, 2849 struct btrfs_path *path, 2850 u64 objectid, u64 offset, u64 bytenr) 2851 { 2852 struct btrfs_delayed_ref_head *head; 2853 struct btrfs_delayed_ref_node *ref; 2854 struct btrfs_delayed_data_ref *data_ref; 2855 struct btrfs_delayed_ref_root *delayed_refs; 2856 struct btrfs_transaction *cur_trans; 2857 struct rb_node *node; 2858 int ret = 0; 2859 2860 spin_lock(&root->fs_info->trans_lock); 2861 cur_trans = root->fs_info->running_transaction; 2862 if (cur_trans) 2863 refcount_inc(&cur_trans->use_count); 2864 spin_unlock(&root->fs_info->trans_lock); 2865 if (!cur_trans) 2866 return 0; 2867 2868 delayed_refs = &cur_trans->delayed_refs; 2869 spin_lock(&delayed_refs->lock); 2870 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 2871 if (!head) { 2872 spin_unlock(&delayed_refs->lock); 2873 btrfs_put_transaction(cur_trans); 2874 return 0; 2875 } 2876 2877 if (!mutex_trylock(&head->mutex)) { 2878 refcount_inc(&head->refs); 2879 spin_unlock(&delayed_refs->lock); 2880 2881 btrfs_release_path(path); 2882 2883 /* 2884 * Mutex was contended, block until it's released and let 2885 * caller try again 2886 */ 2887 mutex_lock(&head->mutex); 2888 mutex_unlock(&head->mutex); 2889 btrfs_put_delayed_ref_head(head); 2890 btrfs_put_transaction(cur_trans); 2891 return -EAGAIN; 2892 } 2893 spin_unlock(&delayed_refs->lock); 2894 2895 spin_lock(&head->lock); 2896 /* 2897 * XXX: We should replace this with a proper search function in the 2898 * future. 2899 */ 2900 for (node = rb_first_cached(&head->ref_tree); node; 2901 node = rb_next(node)) { 2902 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 2903 /* If it's a shared ref we know a cross reference exists */ 2904 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2905 ret = 1; 2906 break; 2907 } 2908 2909 data_ref = btrfs_delayed_node_to_data_ref(ref); 2910 2911 /* 2912 * If our ref doesn't match the one we're currently looking at 2913 * then we have a cross reference. 
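* (a delayed ref from another root, inode number or file offset means
* these bytes are referenced by someone else as well).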
2914 */ 2915 if (data_ref->root != root->root_key.objectid || 2916 data_ref->objectid != objectid || 2917 data_ref->offset != offset) { 2918 ret = 1; 2919 break; 2920 } 2921 } 2922 spin_unlock(&head->lock); 2923 mutex_unlock(&head->mutex); 2924 btrfs_put_transaction(cur_trans); 2925 return ret; 2926 } 2927 2928 static noinline int check_committed_ref(struct btrfs_root *root, 2929 struct btrfs_path *path, 2930 u64 objectid, u64 offset, u64 bytenr) 2931 { 2932 struct btrfs_fs_info *fs_info = root->fs_info; 2933 struct btrfs_root *extent_root = fs_info->extent_root; 2934 struct extent_buffer *leaf; 2935 struct btrfs_extent_data_ref *ref; 2936 struct btrfs_extent_inline_ref *iref; 2937 struct btrfs_extent_item *ei; 2938 struct btrfs_key key; 2939 u32 item_size; 2940 int type; 2941 int ret; 2942 2943 key.objectid = bytenr; 2944 key.offset = (u64)-1; 2945 key.type = BTRFS_EXTENT_ITEM_KEY; 2946 2947 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2948 if (ret < 0) 2949 goto out; 2950 BUG_ON(ret == 0); /* Corruption */ 2951 2952 ret = -ENOENT; 2953 if (path->slots[0] == 0) 2954 goto out; 2955 2956 path->slots[0]--; 2957 leaf = path->nodes[0]; 2958 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2959 2960 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2961 goto out; 2962 2963 ret = 1; 2964 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2965 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2966 2967 if (item_size != sizeof(*ei) + 2968 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2969 goto out; 2970 2971 if (btrfs_extent_generation(leaf, ei) <= 2972 btrfs_root_last_snapshot(&root->root_item)) 2973 goto out; 2974 2975 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2976 2977 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 2978 if (type != BTRFS_EXTENT_DATA_REF_KEY) 2979 goto out; 2980 2981 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2982 if (btrfs_extent_refs(leaf, ei) != 2983 btrfs_extent_data_ref_count(leaf, ref) || 2984 btrfs_extent_data_ref_root(leaf, ref) != 2985 root->root_key.objectid || 2986 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2987 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2988 goto out; 2989 2990 ret = 0; 2991 out: 2992 return ret; 2993 } 2994 2995 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 2996 u64 bytenr) 2997 { 2998 struct btrfs_path *path; 2999 int ret; 3000 3001 path = btrfs_alloc_path(); 3002 if (!path) 3003 return -ENOMEM; 3004 3005 do { 3006 ret = check_committed_ref(root, path, objectid, 3007 offset, bytenr); 3008 if (ret && ret != -ENOENT) 3009 goto out; 3010 3011 ret = check_delayed_ref(root, path, objectid, offset, bytenr); 3012 } while (ret == -EAGAIN); 3013 3014 out: 3015 btrfs_free_path(path); 3016 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3017 WARN_ON(ret > 0); 3018 return ret; 3019 } 3020 3021 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3022 struct btrfs_root *root, 3023 struct extent_buffer *buf, 3024 int full_backref, int inc) 3025 { 3026 struct btrfs_fs_info *fs_info = root->fs_info; 3027 u64 bytenr; 3028 u64 num_bytes; 3029 u64 parent; 3030 u64 ref_root; 3031 u32 nritems; 3032 struct btrfs_key key; 3033 struct btrfs_file_extent_item *fi; 3034 struct btrfs_ref generic_ref = { 0 }; 3035 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); 3036 int i; 3037 int action; 3038 int level; 3039 int ret = 0; 3040 3041 if (btrfs_is_testing(fs_info)) 3042 
return 0; 3043 3044 ref_root = btrfs_header_owner(buf); 3045 nritems = btrfs_header_nritems(buf); 3046 level = btrfs_header_level(buf); 3047 3048 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3049 return 0; 3050 3051 if (full_backref) 3052 parent = buf->start; 3053 else 3054 parent = 0; 3055 if (inc) 3056 action = BTRFS_ADD_DELAYED_REF; 3057 else 3058 action = BTRFS_DROP_DELAYED_REF; 3059 3060 for (i = 0; i < nritems; i++) { 3061 if (level == 0) { 3062 btrfs_item_key_to_cpu(buf, &key, i); 3063 if (key.type != BTRFS_EXTENT_DATA_KEY) 3064 continue; 3065 fi = btrfs_item_ptr(buf, i, 3066 struct btrfs_file_extent_item); 3067 if (btrfs_file_extent_type(buf, fi) == 3068 BTRFS_FILE_EXTENT_INLINE) 3069 continue; 3070 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3071 if (bytenr == 0) 3072 continue; 3073 3074 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3075 key.offset -= btrfs_file_extent_offset(buf, fi); 3076 btrfs_init_generic_ref(&generic_ref, action, bytenr, 3077 num_bytes, parent); 3078 generic_ref.real_root = root->root_key.objectid; 3079 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, 3080 key.offset); 3081 generic_ref.skip_qgroup = for_reloc; 3082 if (inc) 3083 ret = btrfs_inc_extent_ref(trans, &generic_ref); 3084 else 3085 ret = btrfs_free_extent(trans, &generic_ref); 3086 if (ret) 3087 goto fail; 3088 } else { 3089 bytenr = btrfs_node_blockptr(buf, i); 3090 num_bytes = fs_info->nodesize; 3091 btrfs_init_generic_ref(&generic_ref, action, bytenr, 3092 num_bytes, parent); 3093 generic_ref.real_root = root->root_key.objectid; 3094 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); 3095 generic_ref.skip_qgroup = for_reloc; 3096 if (inc) 3097 ret = btrfs_inc_extent_ref(trans, &generic_ref); 3098 else 3099 ret = btrfs_free_extent(trans, &generic_ref); 3100 if (ret) 3101 goto fail; 3102 } 3103 } 3104 return 0; 3105 fail: 3106 return ret; 3107 } 3108 3109 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3110 struct extent_buffer *buf, int full_backref) 3111 { 3112 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3113 } 3114 3115 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3116 struct extent_buffer *buf, int full_backref) 3117 { 3118 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3119 } 3120 3121 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3122 struct btrfs_path *path, 3123 struct btrfs_block_group_cache *cache) 3124 { 3125 struct btrfs_fs_info *fs_info = trans->fs_info; 3126 int ret; 3127 struct btrfs_root *extent_root = fs_info->extent_root; 3128 unsigned long bi; 3129 struct extent_buffer *leaf; 3130 3131 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3132 if (ret) { 3133 if (ret > 0) 3134 ret = -ENOENT; 3135 goto fail; 3136 } 3137 3138 leaf = path->nodes[0]; 3139 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3140 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3141 btrfs_mark_buffer_dirty(leaf); 3142 fail: 3143 btrfs_release_path(path); 3144 return ret; 3145 3146 } 3147 3148 static struct btrfs_block_group_cache *next_block_group( 3149 struct btrfs_block_group_cache *cache) 3150 { 3151 struct btrfs_fs_info *fs_info = cache->fs_info; 3152 struct rb_node *node; 3153 3154 spin_lock(&fs_info->block_group_cache_lock); 3155 3156 /* If our block group was removed, we need a full search. 
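* Its cache_node is no longer linked into the rb tree, so rb_next() cannot
* be used; restart from the first block group at or after the following
* bytenr instead.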
*/ 3157 if (RB_EMPTY_NODE(&cache->cache_node)) { 3158 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3159 3160 spin_unlock(&fs_info->block_group_cache_lock); 3161 btrfs_put_block_group(cache); 3162 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3163 } 3164 node = rb_next(&cache->cache_node); 3165 btrfs_put_block_group(cache); 3166 if (node) { 3167 cache = rb_entry(node, struct btrfs_block_group_cache, 3168 cache_node); 3169 btrfs_get_block_group(cache); 3170 } else 3171 cache = NULL; 3172 spin_unlock(&fs_info->block_group_cache_lock); 3173 return cache; 3174 } 3175 3176 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3177 struct btrfs_trans_handle *trans, 3178 struct btrfs_path *path) 3179 { 3180 struct btrfs_fs_info *fs_info = block_group->fs_info; 3181 struct btrfs_root *root = fs_info->tree_root; 3182 struct inode *inode = NULL; 3183 struct extent_changeset *data_reserved = NULL; 3184 u64 alloc_hint = 0; 3185 int dcs = BTRFS_DC_ERROR; 3186 u64 num_pages = 0; 3187 int retries = 0; 3188 int ret = 0; 3189 3190 /* 3191 * If this block group is smaller than 100 megs don't bother caching the 3192 * block group. 3193 */ 3194 if (block_group->key.offset < (100 * SZ_1M)) { 3195 spin_lock(&block_group->lock); 3196 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3197 spin_unlock(&block_group->lock); 3198 return 0; 3199 } 3200 3201 if (trans->aborted) 3202 return 0; 3203 again: 3204 inode = lookup_free_space_inode(block_group, path); 3205 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3206 ret = PTR_ERR(inode); 3207 btrfs_release_path(path); 3208 goto out; 3209 } 3210 3211 if (IS_ERR(inode)) { 3212 BUG_ON(retries); 3213 retries++; 3214 3215 if (block_group->ro) 3216 goto out_free; 3217 3218 ret = create_free_space_inode(trans, block_group, path); 3219 if (ret) 3220 goto out_free; 3221 goto again; 3222 } 3223 3224 /* 3225 * We want to set the generation to 0, that way if anything goes wrong 3226 * from here on out we know not to trust this cache when we load up next 3227 * time. 3228 */ 3229 BTRFS_I(inode)->generation = 0; 3230 ret = btrfs_update_inode(trans, root, inode); 3231 if (ret) { 3232 /* 3233 * So theoretically we could recover from this, simply set the 3234 * super cache generation to 0 so we know to invalidate the 3235 * cache, but then we'd have to keep track of the block groups 3236 * that fail this way so we know we _have_ to reset this cache 3237 * before the next commit or risk reading stale cache. So to 3238 * limit our exposure to horrible edge cases lets just abort the 3239 * transaction, this only happens in really bad situations 3240 * anyway. 
3241 */ 3242 btrfs_abort_transaction(trans, ret); 3243 goto out_put; 3244 } 3245 WARN_ON(ret); 3246 3247 /* We've already setup this transaction, go ahead and exit */ 3248 if (block_group->cache_generation == trans->transid && 3249 i_size_read(inode)) { 3250 dcs = BTRFS_DC_SETUP; 3251 goto out_put; 3252 } 3253 3254 if (i_size_read(inode) > 0) { 3255 ret = btrfs_check_trunc_cache_free_space(fs_info, 3256 &fs_info->global_block_rsv); 3257 if (ret) 3258 goto out_put; 3259 3260 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3261 if (ret) 3262 goto out_put; 3263 } 3264 3265 spin_lock(&block_group->lock); 3266 if (block_group->cached != BTRFS_CACHE_FINISHED || 3267 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3268 /* 3269 * don't bother trying to write stuff out _if_ 3270 * a) we're not cached, 3271 * b) we're with nospace_cache mount option, 3272 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3273 */ 3274 dcs = BTRFS_DC_WRITTEN; 3275 spin_unlock(&block_group->lock); 3276 goto out_put; 3277 } 3278 spin_unlock(&block_group->lock); 3279 3280 /* 3281 * We hit an ENOSPC when setting up the cache in this transaction, just 3282 * skip doing the setup, we've already cleared the cache so we're safe. 3283 */ 3284 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3285 ret = -ENOSPC; 3286 goto out_put; 3287 } 3288 3289 /* 3290 * Try to preallocate enough space based on how big the block group is. 3291 * Keep in mind this has to include any pinned space which could end up 3292 * taking up quite a bit since it's not folded into the other space 3293 * cache. 3294 */ 3295 num_pages = div_u64(block_group->key.offset, SZ_256M); 3296 if (!num_pages) 3297 num_pages = 1; 3298 3299 num_pages *= 16; 3300 num_pages *= PAGE_SIZE; 3301 3302 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); 3303 if (ret) 3304 goto out_put; 3305 3306 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3307 num_pages, num_pages, 3308 &alloc_hint); 3309 /* 3310 * Our cache requires contiguous chunks so that we don't modify a bunch 3311 * of metadata or split extents when writing the cache out, which means 3312 * we can enospc if we are heavily fragmented in addition to just normal 3313 * out of space conditions. So if we hit this just skip setting up any 3314 * other block groups for this transaction, maybe we'll unpin enough 3315 * space the next time around. 
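* For scale: the preallocation above asks for 16 pages per 256M of block
* group, e.g. a 1G block group preallocates 64 pages, i.e. 256K with 4K
* pages.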
3316 */ 3317 if (!ret) 3318 dcs = BTRFS_DC_SETUP; 3319 else if (ret == -ENOSPC) 3320 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3321 3322 out_put: 3323 iput(inode); 3324 out_free: 3325 btrfs_release_path(path); 3326 out: 3327 spin_lock(&block_group->lock); 3328 if (!ret && dcs == BTRFS_DC_SETUP) 3329 block_group->cache_generation = trans->transid; 3330 block_group->disk_cache_state = dcs; 3331 spin_unlock(&block_group->lock); 3332 3333 extent_changeset_free(data_reserved); 3334 return ret; 3335 } 3336 3337 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 3338 { 3339 struct btrfs_fs_info *fs_info = trans->fs_info; 3340 struct btrfs_block_group_cache *cache, *tmp; 3341 struct btrfs_transaction *cur_trans = trans->transaction; 3342 struct btrfs_path *path; 3343 3344 if (list_empty(&cur_trans->dirty_bgs) || 3345 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3346 return 0; 3347 3348 path = btrfs_alloc_path(); 3349 if (!path) 3350 return -ENOMEM; 3351 3352 /* Could add new block groups, use _safe just in case */ 3353 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3354 dirty_list) { 3355 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3356 cache_save_setup(cache, trans, path); 3357 } 3358 3359 btrfs_free_path(path); 3360 return 0; 3361 } 3362 3363 /* 3364 * transaction commit does final block group cache writeback during a 3365 * critical section where nothing is allowed to change the FS. This is 3366 * required in order for the cache to actually match the block group, 3367 * but can introduce a lot of latency into the commit. 3368 * 3369 * So, btrfs_start_dirty_block_groups is here to kick off block group 3370 * cache IO. There's a chance we'll have to redo some of it if the 3371 * block group changes again during the commit, but it greatly reduces 3372 * the commit latency by getting rid of the easy block groups while 3373 * we're still allowing others to join the commit. 3374 */ 3375 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 3376 { 3377 struct btrfs_fs_info *fs_info = trans->fs_info; 3378 struct btrfs_block_group_cache *cache; 3379 struct btrfs_transaction *cur_trans = trans->transaction; 3380 int ret = 0; 3381 int should_put; 3382 struct btrfs_path *path = NULL; 3383 LIST_HEAD(dirty); 3384 struct list_head *io = &cur_trans->io_bgs; 3385 int num_started = 0; 3386 int loops = 0; 3387 3388 spin_lock(&cur_trans->dirty_bgs_lock); 3389 if (list_empty(&cur_trans->dirty_bgs)) { 3390 spin_unlock(&cur_trans->dirty_bgs_lock); 3391 return 0; 3392 } 3393 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3394 spin_unlock(&cur_trans->dirty_bgs_lock); 3395 3396 again: 3397 /* 3398 * make sure all the block groups on our dirty list actually 3399 * exist 3400 */ 3401 btrfs_create_pending_block_groups(trans); 3402 3403 if (!path) { 3404 path = btrfs_alloc_path(); 3405 if (!path) 3406 return -ENOMEM; 3407 } 3408 3409 /* 3410 * cache_write_mutex is here only to save us from balance or automatic 3411 * removal of empty block groups deleting this block group while we are 3412 * writing out the cache 3413 */ 3414 mutex_lock(&trans->transaction->cache_write_mutex); 3415 while (!list_empty(&dirty)) { 3416 bool drop_reserve = true; 3417 3418 cache = list_first_entry(&dirty, 3419 struct btrfs_block_group_cache, 3420 dirty_list); 3421 /* 3422 * this can happen if something re-dirties a block 3423 * group that is already under IO. 
Just wait for it to 3424 * finish and then do it all again 3425 */ 3426 if (!list_empty(&cache->io_list)) { 3427 list_del_init(&cache->io_list); 3428 btrfs_wait_cache_io(trans, cache, path); 3429 btrfs_put_block_group(cache); 3430 } 3431 3432 3433 /* 3434 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3435 * if it should update the cache_state. Don't delete 3436 * until after we wait. 3437 * 3438 * Since we're not running in the commit critical section 3439 * we need the dirty_bgs_lock to protect from update_block_group 3440 */ 3441 spin_lock(&cur_trans->dirty_bgs_lock); 3442 list_del_init(&cache->dirty_list); 3443 spin_unlock(&cur_trans->dirty_bgs_lock); 3444 3445 should_put = 1; 3446 3447 cache_save_setup(cache, trans, path); 3448 3449 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3450 cache->io_ctl.inode = NULL; 3451 ret = btrfs_write_out_cache(trans, cache, path); 3452 if (ret == 0 && cache->io_ctl.inode) { 3453 num_started++; 3454 should_put = 0; 3455 3456 /* 3457 * The cache_write_mutex is protecting the 3458 * io_list, also refer to the definition of 3459 * btrfs_transaction::io_bgs for more details 3460 */ 3461 list_add_tail(&cache->io_list, io); 3462 } else { 3463 /* 3464 * if we failed to write the cache, the 3465 * generation will be bad and life goes on 3466 */ 3467 ret = 0; 3468 } 3469 } 3470 if (!ret) { 3471 ret = write_one_cache_group(trans, path, cache); 3472 /* 3473 * Our block group might still be attached to the list 3474 * of new block groups in the transaction handle of some 3475 * other task (struct btrfs_trans_handle->new_bgs). This 3476 * means its block group item isn't yet in the extent 3477 * tree. If this happens ignore the error, as we will 3478 * try again later in the critical section of the 3479 * transaction commit. 3480 */ 3481 if (ret == -ENOENT) { 3482 ret = 0; 3483 spin_lock(&cur_trans->dirty_bgs_lock); 3484 if (list_empty(&cache->dirty_list)) { 3485 list_add_tail(&cache->dirty_list, 3486 &cur_trans->dirty_bgs); 3487 btrfs_get_block_group(cache); 3488 drop_reserve = false; 3489 } 3490 spin_unlock(&cur_trans->dirty_bgs_lock); 3491 } else if (ret) { 3492 btrfs_abort_transaction(trans, ret); 3493 } 3494 } 3495 3496 /* if it's not on the io list, we need to put the block group */ 3497 if (should_put) 3498 btrfs_put_block_group(cache); 3499 if (drop_reserve) 3500 btrfs_delayed_refs_rsv_release(fs_info, 1); 3501 3502 if (ret) 3503 break; 3504 3505 /* 3506 * Avoid blocking other tasks for too long. It might even save 3507 * us from writing caches for block groups that are going to be 3508 * removed. 3509 */ 3510 mutex_unlock(&trans->transaction->cache_write_mutex); 3511 mutex_lock(&trans->transaction->cache_write_mutex); 3512 } 3513 mutex_unlock(&trans->transaction->cache_write_mutex); 3514 3515 /* 3516 * go through delayed refs for all the stuff we've just kicked off 3517 * and then loop back (just once) 3518 */ 3519 ret = btrfs_run_delayed_refs(trans, 0); 3520 if (!ret && loops == 0) { 3521 loops++; 3522 spin_lock(&cur_trans->dirty_bgs_lock); 3523 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3524 /* 3525 * dirty_bgs_lock protects us from concurrent block group 3526 * deletes too (not just cache_write_mutex). 
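* Only one extra pass is made here (loops is capped at 1); anything that
* gets dirtied later is handled by btrfs_write_dirty_block_groups() in the
* commit critical section.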
3527 */ 3528 if (!list_empty(&dirty)) { 3529 spin_unlock(&cur_trans->dirty_bgs_lock); 3530 goto again; 3531 } 3532 spin_unlock(&cur_trans->dirty_bgs_lock); 3533 } else if (ret < 0) { 3534 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3535 } 3536 3537 btrfs_free_path(path); 3538 return ret; 3539 } 3540 3541 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 3542 { 3543 struct btrfs_fs_info *fs_info = trans->fs_info; 3544 struct btrfs_block_group_cache *cache; 3545 struct btrfs_transaction *cur_trans = trans->transaction; 3546 int ret = 0; 3547 int should_put; 3548 struct btrfs_path *path; 3549 struct list_head *io = &cur_trans->io_bgs; 3550 int num_started = 0; 3551 3552 path = btrfs_alloc_path(); 3553 if (!path) 3554 return -ENOMEM; 3555 3556 /* 3557 * Even though we are in the critical section of the transaction commit, 3558 * we can still have concurrent tasks adding elements to this 3559 * transaction's list of dirty block groups. These tasks correspond to 3560 * endio free space workers started when writeback finishes for a 3561 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3562 * allocate new block groups as a result of COWing nodes of the root 3563 * tree when updating the free space inode. The writeback for the space 3564 * caches is triggered by an earlier call to 3565 * btrfs_start_dirty_block_groups() and iterations of the following 3566 * loop. 3567 * Also we want to do the cache_save_setup first and then run the 3568 * delayed refs to make sure we have the best chance at doing this all 3569 * in one shot. 3570 */ 3571 spin_lock(&cur_trans->dirty_bgs_lock); 3572 while (!list_empty(&cur_trans->dirty_bgs)) { 3573 cache = list_first_entry(&cur_trans->dirty_bgs, 3574 struct btrfs_block_group_cache, 3575 dirty_list); 3576 3577 /* 3578 * this can happen if cache_save_setup re-dirties a block 3579 * group that is already under IO. Just wait for it to 3580 * finish and then do it all again 3581 */ 3582 if (!list_empty(&cache->io_list)) { 3583 spin_unlock(&cur_trans->dirty_bgs_lock); 3584 list_del_init(&cache->io_list); 3585 btrfs_wait_cache_io(trans, cache, path); 3586 btrfs_put_block_group(cache); 3587 spin_lock(&cur_trans->dirty_bgs_lock); 3588 } 3589 3590 /* 3591 * don't remove from the dirty list until after we've waited 3592 * on any pending IO 3593 */ 3594 list_del_init(&cache->dirty_list); 3595 spin_unlock(&cur_trans->dirty_bgs_lock); 3596 should_put = 1; 3597 3598 cache_save_setup(cache, trans, path); 3599 3600 if (!ret) 3601 ret = btrfs_run_delayed_refs(trans, 3602 (unsigned long) -1); 3603 3604 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3605 cache->io_ctl.inode = NULL; 3606 ret = btrfs_write_out_cache(trans, cache, path); 3607 if (ret == 0 && cache->io_ctl.inode) { 3608 num_started++; 3609 should_put = 0; 3610 list_add_tail(&cache->io_list, io); 3611 } else { 3612 /* 3613 * if we failed to write the cache, the 3614 * generation will be bad and life goes on 3615 */ 3616 ret = 0; 3617 } 3618 } 3619 if (!ret) { 3620 ret = write_one_cache_group(trans, path, cache); 3621 /* 3622 * One of the free space endio workers might have 3623 * created a new block group while updating a free space 3624 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3625 * and hasn't released its transaction handle yet, in 3626 * which case the new block group is still attached to 3627 * its transaction handle and its creation has not 3628 * finished yet (no block group item in the extent tree 3629 * yet, etc). 
If this is the case, wait for all free 3630 * space endio workers to finish and retry. This is a 3631 * very rare case so no need for a more efficient and 3632 * complex approach. 3633 */ 3634 if (ret == -ENOENT) { 3635 wait_event(cur_trans->writer_wait, 3636 atomic_read(&cur_trans->num_writers) == 1); 3637 ret = write_one_cache_group(trans, path, cache); 3638 } 3639 if (ret) 3640 btrfs_abort_transaction(trans, ret); 3641 } 3642 3643 /* if it's not on the io list, we need to put the block group */ 3644 if (should_put) 3645 btrfs_put_block_group(cache); 3646 btrfs_delayed_refs_rsv_release(fs_info, 1); 3647 spin_lock(&cur_trans->dirty_bgs_lock); 3648 } 3649 spin_unlock(&cur_trans->dirty_bgs_lock); 3650 3651 /* 3652 * Refer to the definition of the io_bgs member for details on why it's safe 3653 * to use it without any locking 3654 */ 3655 while (!list_empty(io)) { 3656 cache = list_first_entry(io, struct btrfs_block_group_cache, 3657 io_list); 3658 list_del_init(&cache->io_list); 3659 btrfs_wait_cache_io(trans, cache, path); 3660 btrfs_put_block_group(cache); 3661 } 3662 3663 btrfs_free_path(path); 3664 return ret; 3665 } 3666 3667 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3668 { 3669 struct btrfs_block_group_cache *block_group; 3670 int readonly = 0; 3671 3672 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3673 if (!block_group || block_group->ro) 3674 readonly = 1; 3675 if (block_group) 3676 btrfs_put_block_group(block_group); 3677 return readonly; 3678 } 3679 3680 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3681 { 3682 struct btrfs_block_group_cache *bg; 3683 bool ret = true; 3684 3685 bg = btrfs_lookup_block_group(fs_info, bytenr); 3686 if (!bg) 3687 return false; 3688 3689 spin_lock(&bg->lock); 3690 if (bg->ro) 3691 ret = false; 3692 else 3693 atomic_inc(&bg->nocow_writers); 3694 spin_unlock(&bg->lock); 3695 3696 /* no put on block group, done by btrfs_dec_nocow_writers */ 3697 if (!ret) 3698 btrfs_put_block_group(bg); 3699 3700 return ret; 3701 3702 } 3703 3704 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3705 { 3706 struct btrfs_block_group_cache *bg; 3707 3708 bg = btrfs_lookup_block_group(fs_info, bytenr); 3709 ASSERT(bg); 3710 if (atomic_dec_and_test(&bg->nocow_writers)) 3711 wake_up_var(&bg->nocow_writers); 3712 /* 3713 * Once for our lookup and once for the lookup done by a previous call 3714 * to btrfs_inc_nocow_writers() 3715 */ 3716 btrfs_put_block_group(bg); 3717 btrfs_put_block_group(bg); 3718 } 3719 3720 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3721 { 3722 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 3723 } 3724 3725 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3726 { 3727 u64 extra_flags = chunk_to_extended(flags) & 3728 BTRFS_EXTENDED_PROFILE_MASK; 3729 3730 write_seqlock(&fs_info->profiles_lock); 3731 if (flags & BTRFS_BLOCK_GROUP_DATA) 3732 fs_info->avail_data_alloc_bits |= extra_flags; 3733 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3734 fs_info->avail_metadata_alloc_bits |= extra_flags; 3735 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3736 fs_info->avail_system_alloc_bits |= extra_flags; 3737 write_sequnlock(&fs_info->profiles_lock); 3738 } 3739 3740 /* 3741 * returns target flags in extended format or 0 if restripe for this 3742 * chunk_type is not in progress 3743 * 3744 * should be called with balance_lock held 3745 */ 3746 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3747 { 3748
struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3749 u64 target = 0; 3750 3751 if (!bctl) 3752 return 0; 3753 3754 if (flags & BTRFS_BLOCK_GROUP_DATA && 3755 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3756 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3757 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3758 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3759 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3760 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3761 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3762 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3763 } 3764 3765 return target; 3766 } 3767 3768 /* 3769 * @flags: available profiles in extended format (see ctree.h) 3770 * 3771 * Returns reduced profile in chunk format. If profile changing is in 3772 * progress (either running or paused) picks the target profile (if it's 3773 * already available), otherwise falls back to plain reducing. 3774 */ 3775 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 3776 { 3777 u64 num_devices = fs_info->fs_devices->rw_devices; 3778 u64 target; 3779 u64 raid_type; 3780 u64 allowed = 0; 3781 3782 /* 3783 * see if restripe for this chunk_type is in progress, if so 3784 * try to reduce to the target profile 3785 */ 3786 spin_lock(&fs_info->balance_lock); 3787 target = get_restripe_target(fs_info, flags); 3788 if (target) { 3789 /* pick target profile only if it's already available */ 3790 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3791 spin_unlock(&fs_info->balance_lock); 3792 return extended_to_chunk(target); 3793 } 3794 } 3795 spin_unlock(&fs_info->balance_lock); 3796 3797 /* First, mask out the RAID levels which aren't possible */ 3798 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 3799 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 3800 allowed |= btrfs_raid_array[raid_type].bg_flag; 3801 } 3802 allowed &= flags; 3803 3804 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 3805 allowed = BTRFS_BLOCK_GROUP_RAID6; 3806 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 3807 allowed = BTRFS_BLOCK_GROUP_RAID5; 3808 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 3809 allowed = BTRFS_BLOCK_GROUP_RAID10; 3810 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 3811 allowed = BTRFS_BLOCK_GROUP_RAID1; 3812 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 3813 allowed = BTRFS_BLOCK_GROUP_RAID0; 3814 3815 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 3816 3817 return extended_to_chunk(flags | allowed); 3818 } 3819 3820 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 3821 { 3822 unsigned seq; 3823 u64 flags; 3824 3825 do { 3826 flags = orig_flags; 3827 seq = read_seqbegin(&fs_info->profiles_lock); 3828 3829 if (flags & BTRFS_BLOCK_GROUP_DATA) 3830 flags |= fs_info->avail_data_alloc_bits; 3831 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3832 flags |= fs_info->avail_system_alloc_bits; 3833 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3834 flags |= fs_info->avail_metadata_alloc_bits; 3835 } while (read_seqretry(&fs_info->profiles_lock, seq)); 3836 3837 return btrfs_reduce_alloc_profile(fs_info, flags); 3838 } 3839 3840 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 3841 { 3842 struct btrfs_fs_info *fs_info = root->fs_info; 3843 u64 flags; 3844 u64 ret; 3845 3846 if (data) 3847 flags = BTRFS_BLOCK_GROUP_DATA; 3848 else if (root == fs_info->chunk_root) 3849 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3850 else 3851 flags = BTRFS_BLOCK_GROUP_METADATA; 3852 3853 ret = get_alloc_profile(fs_info, 
flags); 3854 return ret; 3855 } 3856 3857 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) 3858 { 3859 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); 3860 } 3861 3862 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) 3863 { 3864 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3865 } 3866 3867 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) 3868 { 3869 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3870 } 3871 3872 static void force_metadata_allocation(struct btrfs_fs_info *info) 3873 { 3874 struct list_head *head = &info->space_info; 3875 struct btrfs_space_info *found; 3876 3877 rcu_read_lock(); 3878 list_for_each_entry_rcu(found, head, list) { 3879 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3880 found->force_alloc = CHUNK_ALLOC_FORCE; 3881 } 3882 rcu_read_unlock(); 3883 } 3884 3885 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 3886 struct btrfs_space_info *sinfo, int force) 3887 { 3888 u64 bytes_used = btrfs_space_info_used(sinfo, false); 3889 u64 thresh; 3890 3891 if (force == CHUNK_ALLOC_FORCE) 3892 return 1; 3893 3894 /* 3895 * in limited mode, we want to have some free space up to 3896 * about 1% of the FS size. 3897 */ 3898 if (force == CHUNK_ALLOC_LIMITED) { 3899 thresh = btrfs_super_total_bytes(fs_info->super_copy); 3900 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 3901 3902 if (sinfo->total_bytes - bytes_used < thresh) 3903 return 1; 3904 } 3905 3906 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 3907 return 0; 3908 return 1; 3909 } 3910 3911 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 3912 { 3913 u64 num_dev; 3914 3915 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 3916 if (!num_dev) 3917 num_dev = fs_info->fs_devices->rw_devices; 3918 3919 return num_dev; 3920 } 3921 3922 /* 3923 * If @is_allocation is true, reserve space in the system space info necessary 3924 * for allocating a chunk, otherwise if it's false, reserve space necessary for 3925 * removing a chunk. 3926 */ 3927 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 3928 { 3929 struct btrfs_fs_info *fs_info = trans->fs_info; 3930 struct btrfs_space_info *info; 3931 u64 left; 3932 u64 thresh; 3933 int ret = 0; 3934 u64 num_devs; 3935 3936 /* 3937 * Needed because we can end up allocating a system chunk and for an 3938 * atomic and race free space reservation in the chunk block reserve. 3939 */ 3940 lockdep_assert_held(&fs_info->chunk_mutex); 3941 3942 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3943 spin_lock(&info->lock); 3944 left = info->total_bytes - btrfs_space_info_used(info, true); 3945 spin_unlock(&info->lock); 3946 3947 num_devs = get_profile_num_devs(fs_info, type); 3948 3949 /* num_devs device items to update and 1 chunk item to add or remove */ 3950 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 3951 btrfs_calc_trans_metadata_size(fs_info, 1); 3952 3953 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 3954 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 3955 left, thresh, type); 3956 btrfs_dump_space_info(fs_info, info, 0, 0); 3957 } 3958 3959 if (left < thresh) { 3960 u64 flags = btrfs_system_alloc_profile(fs_info); 3961 3962 /* 3963 * Ignore failure to create system chunk. 
We might end up not 3964 * needing it, as we might not need to COW all nodes/leafs from 3965 * the paths we visit in the chunk tree (they were already COWed 3966 * or created in the current transaction for example). 3967 */ 3968 ret = btrfs_alloc_chunk(trans, flags); 3969 } 3970 3971 if (!ret) { 3972 ret = btrfs_block_rsv_add(fs_info->chunk_root, 3973 &fs_info->chunk_block_rsv, 3974 thresh, BTRFS_RESERVE_NO_FLUSH); 3975 if (!ret) 3976 trans->chunk_bytes_reserved += thresh; 3977 } 3978 } 3979 3980 /* 3981 * If force is CHUNK_ALLOC_FORCE: 3982 * - return 1 if it successfully allocates a chunk, 3983 * - return errors including -ENOSPC otherwise. 3984 * If force is NOT CHUNK_ALLOC_FORCE: 3985 * - return 0 if it doesn't need to allocate a new chunk, 3986 * - return 1 if it successfully allocates a chunk, 3987 * - return errors including -ENOSPC otherwise. 3988 */ 3989 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 3990 enum btrfs_chunk_alloc_enum force) 3991 { 3992 struct btrfs_fs_info *fs_info = trans->fs_info; 3993 struct btrfs_space_info *space_info; 3994 bool wait_for_alloc = false; 3995 bool should_alloc = false; 3996 int ret = 0; 3997 3998 /* Don't re-enter if we're already allocating a chunk */ 3999 if (trans->allocating_chunk) 4000 return -ENOSPC; 4001 4002 space_info = btrfs_find_space_info(fs_info, flags); 4003 ASSERT(space_info); 4004 4005 do { 4006 spin_lock(&space_info->lock); 4007 if (force < space_info->force_alloc) 4008 force = space_info->force_alloc; 4009 should_alloc = should_alloc_chunk(fs_info, space_info, force); 4010 if (space_info->full) { 4011 /* No more free physical space */ 4012 if (should_alloc) 4013 ret = -ENOSPC; 4014 else 4015 ret = 0; 4016 spin_unlock(&space_info->lock); 4017 return ret; 4018 } else if (!should_alloc) { 4019 spin_unlock(&space_info->lock); 4020 return 0; 4021 } else if (space_info->chunk_alloc) { 4022 /* 4023 * Someone is already allocating, so we need to block 4024 * until this someone is finished and then loop to 4025 * recheck if we should continue with our allocation 4026 * attempt. 4027 */ 4028 wait_for_alloc = true; 4029 spin_unlock(&space_info->lock); 4030 mutex_lock(&fs_info->chunk_mutex); 4031 mutex_unlock(&fs_info->chunk_mutex); 4032 } else { 4033 /* Proceed with allocation */ 4034 space_info->chunk_alloc = 1; 4035 wait_for_alloc = false; 4036 spin_unlock(&space_info->lock); 4037 } 4038 4039 cond_resched(); 4040 } while (wait_for_alloc); 4041 4042 mutex_lock(&fs_info->chunk_mutex); 4043 trans->allocating_chunk = true; 4044 4045 /* 4046 * If we have mixed data/metadata chunks we want to make sure we keep 4047 * allocating mixed chunks instead of individual chunks. 4048 */ 4049 if (btrfs_mixed_space_info(space_info)) 4050 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4051 4052 /* 4053 * if we're doing a data chunk, go ahead and make sure that 4054 * we keep a reasonable number of metadata chunks allocated in the 4055 * FS as well. 4056 */ 4057 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4058 fs_info->data_chunk_allocations++; 4059 if (!(fs_info->data_chunk_allocations % 4060 fs_info->metadata_ratio)) 4061 force_metadata_allocation(fs_info); 4062 } 4063 4064 /* 4065 * Check if we have enough space in SYSTEM chunk because we may need 4066 * to update devices. 
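Updating those device items (and inserting or removing the chunk item itself) may require COWing nodes in the chunk tree, so check_system_chunk() below reserves metadata space for num_devs device item updates plus one chunk item and, if the currently free SYSTEM space is below that threshold, allocates a new SYSTEM chunk first.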
4067 */ 4068 check_system_chunk(trans, flags); 4069 4070 ret = btrfs_alloc_chunk(trans, flags); 4071 trans->allocating_chunk = false; 4072 4073 spin_lock(&space_info->lock); 4074 if (ret < 0) { 4075 if (ret == -ENOSPC) 4076 space_info->full = 1; 4077 else 4078 goto out; 4079 } else { 4080 ret = 1; 4081 space_info->max_extent_size = 0; 4082 } 4083 4084 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4085 out: 4086 space_info->chunk_alloc = 0; 4087 spin_unlock(&space_info->lock); 4088 mutex_unlock(&fs_info->chunk_mutex); 4089 /* 4090 * When we allocate a new chunk we reserve space in the chunk block 4091 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4092 * add new nodes/leafs to it if we end up needing to do it when 4093 * inserting the chunk item and updating device items as part of the 4094 * second phase of chunk allocation, performed by 4095 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4096 * large number of new block groups to create in our transaction 4097 * handle's new_bgs list to avoid exhausting the chunk block reserve 4098 * in extreme cases - like having a single transaction create many new 4099 * block groups when starting to write out the free space caches of all 4100 * the block groups that were made dirty during the lifetime of the 4101 * transaction. 4102 */ 4103 if (trans->chunk_bytes_reserved >= (u64)SZ_2M) 4104 btrfs_create_pending_block_groups(trans); 4105 4106 return ret; 4107 } 4108 4109 static int update_block_group(struct btrfs_trans_handle *trans, 4110 u64 bytenr, u64 num_bytes, int alloc) 4111 { 4112 struct btrfs_fs_info *info = trans->fs_info; 4113 struct btrfs_block_group_cache *cache = NULL; 4114 u64 total = num_bytes; 4115 u64 old_val; 4116 u64 byte_in_group; 4117 int factor; 4118 int ret = 0; 4119 4120 /* block accounting for super block */ 4121 spin_lock(&info->delalloc_root_lock); 4122 old_val = btrfs_super_bytes_used(info->super_copy); 4123 if (alloc) 4124 old_val += num_bytes; 4125 else 4126 old_val -= num_bytes; 4127 btrfs_set_super_bytes_used(info->super_copy, old_val); 4128 spin_unlock(&info->delalloc_root_lock); 4129 4130 while (total) { 4131 cache = btrfs_lookup_block_group(info, bytenr); 4132 if (!cache) { 4133 ret = -ENOENT; 4134 break; 4135 } 4136 factor = btrfs_bg_type_to_factor(cache->flags); 4137 4138 /* 4139 * If this block group has free space cache written out, we 4140 * need to make sure to load it if we are removing space. This 4141 * is because we need the unpinning stage to actually add the 4142 * space back to the block group, otherwise we will leak space. 
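That is why cache_block_group() is called below with load_only set to 1, and only on the free path (!alloc); when allocating we do not need the free space cache loaded here.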
4143 */ 4144 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4145 cache_block_group(cache, 1); 4146 4147 byte_in_group = bytenr - cache->key.objectid; 4148 WARN_ON(byte_in_group > cache->key.offset); 4149 4150 spin_lock(&cache->space_info->lock); 4151 spin_lock(&cache->lock); 4152 4153 if (btrfs_test_opt(info, SPACE_CACHE) && 4154 cache->disk_cache_state < BTRFS_DC_CLEAR) 4155 cache->disk_cache_state = BTRFS_DC_CLEAR; 4156 4157 old_val = btrfs_block_group_used(&cache->item); 4158 num_bytes = min(total, cache->key.offset - byte_in_group); 4159 if (alloc) { 4160 old_val += num_bytes; 4161 btrfs_set_block_group_used(&cache->item, old_val); 4162 cache->reserved -= num_bytes; 4163 cache->space_info->bytes_reserved -= num_bytes; 4164 cache->space_info->bytes_used += num_bytes; 4165 cache->space_info->disk_used += num_bytes * factor; 4166 spin_unlock(&cache->lock); 4167 spin_unlock(&cache->space_info->lock); 4168 } else { 4169 old_val -= num_bytes; 4170 btrfs_set_block_group_used(&cache->item, old_val); 4171 cache->pinned += num_bytes; 4172 btrfs_space_info_update_bytes_pinned(info, 4173 cache->space_info, num_bytes); 4174 cache->space_info->bytes_used -= num_bytes; 4175 cache->space_info->disk_used -= num_bytes * factor; 4176 spin_unlock(&cache->lock); 4177 spin_unlock(&cache->space_info->lock); 4178 4179 trace_btrfs_space_reservation(info, "pinned", 4180 cache->space_info->flags, 4181 num_bytes, 1); 4182 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 4183 num_bytes, 4184 BTRFS_TOTAL_BYTES_PINNED_BATCH); 4185 set_extent_dirty(info->pinned_extents, 4186 bytenr, bytenr + num_bytes - 1, 4187 GFP_NOFS | __GFP_NOFAIL); 4188 } 4189 4190 spin_lock(&trans->transaction->dirty_bgs_lock); 4191 if (list_empty(&cache->dirty_list)) { 4192 list_add_tail(&cache->dirty_list, 4193 &trans->transaction->dirty_bgs); 4194 trans->delayed_ref_updates++; 4195 btrfs_get_block_group(cache); 4196 } 4197 spin_unlock(&trans->transaction->dirty_bgs_lock); 4198 4199 /* 4200 * No longer have used bytes in this block group, queue it for 4201 * deletion. We do this after adding the block group to the 4202 * dirty list to avoid races between cleaner kthread and space 4203 * cache writeout. 4204 */ 4205 if (!alloc && old_val == 0) 4206 btrfs_mark_bg_unused(cache); 4207 4208 btrfs_put_block_group(cache); 4209 total -= num_bytes; 4210 bytenr += num_bytes; 4211 } 4212 4213 /* Modified block groups are accounted for in the delayed_refs_rsv. 
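Every block group added to the transaction's dirty_bgs list in the loop above incremented trans->delayed_ref_updates, so the reservation is recomputed once here, after the loop.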
*/ 4214 btrfs_update_delayed_refs_rsv(trans); 4215 return ret; 4216 } 4217 4218 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 4219 { 4220 struct btrfs_block_group_cache *cache; 4221 u64 bytenr; 4222 4223 spin_lock(&fs_info->block_group_cache_lock); 4224 bytenr = fs_info->first_logical_byte; 4225 spin_unlock(&fs_info->block_group_cache_lock); 4226 4227 if (bytenr < (u64)-1) 4228 return bytenr; 4229 4230 cache = btrfs_lookup_first_block_group(fs_info, search_start); 4231 if (!cache) 4232 return 0; 4233 4234 bytenr = cache->key.objectid; 4235 btrfs_put_block_group(cache); 4236 4237 return bytenr; 4238 } 4239 4240 static int pin_down_extent(struct btrfs_block_group_cache *cache, 4241 u64 bytenr, u64 num_bytes, int reserved) 4242 { 4243 struct btrfs_fs_info *fs_info = cache->fs_info; 4244 4245 spin_lock(&cache->space_info->lock); 4246 spin_lock(&cache->lock); 4247 cache->pinned += num_bytes; 4248 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, 4249 num_bytes); 4250 if (reserved) { 4251 cache->reserved -= num_bytes; 4252 cache->space_info->bytes_reserved -= num_bytes; 4253 } 4254 spin_unlock(&cache->lock); 4255 spin_unlock(&cache->space_info->lock); 4256 4257 trace_btrfs_space_reservation(fs_info, "pinned", 4258 cache->space_info->flags, num_bytes, 1); 4259 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 4260 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); 4261 set_extent_dirty(fs_info->pinned_extents, bytenr, 4262 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 4263 return 0; 4264 } 4265 4266 /* 4267 * this function must be called within transaction 4268 */ 4269 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 4270 u64 bytenr, u64 num_bytes, int reserved) 4271 { 4272 struct btrfs_block_group_cache *cache; 4273 4274 cache = btrfs_lookup_block_group(fs_info, bytenr); 4275 BUG_ON(!cache); /* Logic error */ 4276 4277 pin_down_extent(cache, bytenr, num_bytes, reserved); 4278 4279 btrfs_put_block_group(cache); 4280 return 0; 4281 } 4282 4283 /* 4284 * this function must be called within transaction 4285 */ 4286 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 4287 u64 bytenr, u64 num_bytes) 4288 { 4289 struct btrfs_block_group_cache *cache; 4290 int ret; 4291 4292 cache = btrfs_lookup_block_group(fs_info, bytenr); 4293 if (!cache) 4294 return -EINVAL; 4295 4296 /* 4297 * pull in the free space cache (if any) so that our pin 4298 * removes the free space from the cache. We have load_only set 4299 * to one because the slow code to read in the free extents does check 4300 * the pinned extents. 
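Once the cache is loaded, btrfs_remove_free_space() below drops the pinned range from it so that the space cannot be handed out again while the log is being replayed.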
4301 */ 4302 cache_block_group(cache, 1); 4303 4304 pin_down_extent(cache, bytenr, num_bytes, 0); 4305 4306 /* remove us from the free space cache (if we're there at all) */ 4307 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 4308 btrfs_put_block_group(cache); 4309 return ret; 4310 } 4311 4312 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 4313 u64 start, u64 num_bytes) 4314 { 4315 int ret; 4316 struct btrfs_block_group_cache *block_group; 4317 struct btrfs_caching_control *caching_ctl; 4318 4319 block_group = btrfs_lookup_block_group(fs_info, start); 4320 if (!block_group) 4321 return -EINVAL; 4322 4323 cache_block_group(block_group, 0); 4324 caching_ctl = get_caching_control(block_group); 4325 4326 if (!caching_ctl) { 4327 /* Logic error */ 4328 BUG_ON(!block_group_cache_done(block_group)); 4329 ret = btrfs_remove_free_space(block_group, start, num_bytes); 4330 } else { 4331 mutex_lock(&caching_ctl->mutex); 4332 4333 if (start >= caching_ctl->progress) { 4334 ret = add_excluded_extent(fs_info, start, num_bytes); 4335 } else if (start + num_bytes <= caching_ctl->progress) { 4336 ret = btrfs_remove_free_space(block_group, 4337 start, num_bytes); 4338 } else { 4339 num_bytes = caching_ctl->progress - start; 4340 ret = btrfs_remove_free_space(block_group, 4341 start, num_bytes); 4342 if (ret) 4343 goto out_lock; 4344 4345 num_bytes = (start + num_bytes) - 4346 caching_ctl->progress; 4347 start = caching_ctl->progress; 4348 ret = add_excluded_extent(fs_info, start, num_bytes); 4349 } 4350 out_lock: 4351 mutex_unlock(&caching_ctl->mutex); 4352 put_caching_control(caching_ctl); 4353 } 4354 btrfs_put_block_group(block_group); 4355 return ret; 4356 } 4357 4358 int btrfs_exclude_logged_extents(struct extent_buffer *eb) 4359 { 4360 struct btrfs_fs_info *fs_info = eb->fs_info; 4361 struct btrfs_file_extent_item *item; 4362 struct btrfs_key key; 4363 int found_type; 4364 int i; 4365 int ret = 0; 4366 4367 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 4368 return 0; 4369 4370 for (i = 0; i < btrfs_header_nritems(eb); i++) { 4371 btrfs_item_key_to_cpu(eb, &key, i); 4372 if (key.type != BTRFS_EXTENT_DATA_KEY) 4373 continue; 4374 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 4375 found_type = btrfs_file_extent_type(eb, item); 4376 if (found_type == BTRFS_FILE_EXTENT_INLINE) 4377 continue; 4378 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 4379 continue; 4380 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 4381 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 4382 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 4383 if (ret) 4384 break; 4385 } 4386 4387 return ret; 4388 } 4389 4390 static void 4391 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 4392 { 4393 atomic_inc(&bg->reservations); 4394 } 4395 4396 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 4397 const u64 start) 4398 { 4399 struct btrfs_block_group_cache *bg; 4400 4401 bg = btrfs_lookup_block_group(fs_info, start); 4402 ASSERT(bg); 4403 if (atomic_dec_and_test(&bg->reservations)) 4404 wake_up_var(&bg->reservations); 4405 btrfs_put_block_group(bg); 4406 } 4407 4408 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 4409 { 4410 struct btrfs_space_info *space_info = bg->space_info; 4411 4412 ASSERT(bg->ro); 4413 4414 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 4415 return; 4416 4417 /* 4418 * Our block group is read only but before we set it to read only, 4419 * some task might have had allocated 
an extent from it already, but it 4420 * has not yet created a respective ordered extent (and added it to a 4421 * root's list of ordered extents). 4422 * Therefore wait for any task currently allocating extents, since the 4423 * block group's reservations counter is incremented while a read lock 4424 * on the groups' semaphore is held and decremented after releasing 4425 * the read access on that semaphore and creating the ordered extent. 4426 */ 4427 down_write(&space_info->groups_sem); 4428 up_write(&space_info->groups_sem); 4429 4430 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); 4431 } 4432 4433 /** 4434 * btrfs_add_reserved_bytes - update the block_group and space info counters 4435 * @cache: The cache we are manipulating 4436 * @ram_bytes: The number of bytes of file content, and will be same to 4437 * @num_bytes except for the compress path. 4438 * @num_bytes: The number of bytes in question 4439 * @delalloc: The blocks are allocated for the delalloc write 4440 * 4441 * This is called by the allocator when it reserves space. If this is a 4442 * reservation and the block group has become read only we cannot make the 4443 * reservation and return -EAGAIN, otherwise this function always succeeds. 4444 */ 4445 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, 4446 u64 ram_bytes, u64 num_bytes, int delalloc) 4447 { 4448 struct btrfs_space_info *space_info = cache->space_info; 4449 int ret = 0; 4450 4451 spin_lock(&space_info->lock); 4452 spin_lock(&cache->lock); 4453 if (cache->ro) { 4454 ret = -EAGAIN; 4455 } else { 4456 cache->reserved += num_bytes; 4457 space_info->bytes_reserved += num_bytes; 4458 btrfs_space_info_update_bytes_may_use(cache->fs_info, 4459 space_info, -ram_bytes); 4460 if (delalloc) 4461 cache->delalloc_bytes += num_bytes; 4462 } 4463 spin_unlock(&cache->lock); 4464 spin_unlock(&space_info->lock); 4465 return ret; 4466 } 4467 4468 /** 4469 * btrfs_free_reserved_bytes - update the block_group and space info counters 4470 * @cache: The cache we are manipulating 4471 * @num_bytes: The number of bytes in question 4472 * @delalloc: The blocks are allocated for the delalloc write 4473 * 4474 * This is called by somebody who is freeing space that was never actually used 4475 * on disk. For example if you reserve some space for a new leaf in transaction 4476 * A and before transaction A commits you free that leaf, you call this with 4477 * reserve set to 0 in order to clear the reservation. 
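Note that the current prototype has no 'reserve' argument (the wording above presumably predates it); the caller simply passes the number of bytes to release in @num_bytes together with the @delalloc flag. If the block group has meanwhile been marked read only, the released bytes are accounted to the space info's bytes_readonly.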
4478 */ 4479 4480 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 4481 u64 num_bytes, int delalloc) 4482 { 4483 struct btrfs_space_info *space_info = cache->space_info; 4484 4485 spin_lock(&space_info->lock); 4486 spin_lock(&cache->lock); 4487 if (cache->ro) 4488 space_info->bytes_readonly += num_bytes; 4489 cache->reserved -= num_bytes; 4490 space_info->bytes_reserved -= num_bytes; 4491 space_info->max_extent_size = 0; 4492 4493 if (delalloc) 4494 cache->delalloc_bytes -= num_bytes; 4495 spin_unlock(&cache->lock); 4496 spin_unlock(&space_info->lock); 4497 } 4498 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 4499 { 4500 struct btrfs_caching_control *next; 4501 struct btrfs_caching_control *caching_ctl; 4502 struct btrfs_block_group_cache *cache; 4503 4504 down_write(&fs_info->commit_root_sem); 4505 4506 list_for_each_entry_safe(caching_ctl, next, 4507 &fs_info->caching_block_groups, list) { 4508 cache = caching_ctl->block_group; 4509 if (block_group_cache_done(cache)) { 4510 cache->last_byte_to_unpin = (u64)-1; 4511 list_del_init(&caching_ctl->list); 4512 put_caching_control(caching_ctl); 4513 } else { 4514 cache->last_byte_to_unpin = caching_ctl->progress; 4515 } 4516 } 4517 4518 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4519 fs_info->pinned_extents = &fs_info->freed_extents[1]; 4520 else 4521 fs_info->pinned_extents = &fs_info->freed_extents[0]; 4522 4523 up_write(&fs_info->commit_root_sem); 4524 4525 btrfs_update_global_block_rsv(fs_info); 4526 } 4527 4528 /* 4529 * Returns the free cluster for the given space info and sets empty_cluster to 4530 * what it should be based on the mount options. 4531 */ 4532 static struct btrfs_free_cluster * 4533 fetch_cluster_info(struct btrfs_fs_info *fs_info, 4534 struct btrfs_space_info *space_info, u64 *empty_cluster) 4535 { 4536 struct btrfs_free_cluster *ret = NULL; 4537 4538 *empty_cluster = 0; 4539 if (btrfs_mixed_space_info(space_info)) 4540 return ret; 4541 4542 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 4543 ret = &fs_info->meta_alloc_cluster; 4544 if (btrfs_test_opt(fs_info, SSD)) 4545 *empty_cluster = SZ_2M; 4546 else 4547 *empty_cluster = SZ_64K; 4548 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 4549 btrfs_test_opt(fs_info, SSD_SPREAD)) { 4550 *empty_cluster = SZ_2M; 4551 ret = &fs_info->data_alloc_cluster; 4552 } 4553 4554 return ret; 4555 } 4556 4557 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 4558 u64 start, u64 end, 4559 const bool return_free_space) 4560 { 4561 struct btrfs_block_group_cache *cache = NULL; 4562 struct btrfs_space_info *space_info; 4563 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4564 struct btrfs_free_cluster *cluster = NULL; 4565 u64 len; 4566 u64 total_unpinned = 0; 4567 u64 empty_cluster = 0; 4568 bool readonly; 4569 4570 while (start <= end) { 4571 readonly = false; 4572 if (!cache || 4573 start >= cache->key.objectid + cache->key.offset) { 4574 if (cache) 4575 btrfs_put_block_group(cache); 4576 total_unpinned = 0; 4577 cache = btrfs_lookup_block_group(fs_info, start); 4578 BUG_ON(!cache); /* Logic error */ 4579 4580 cluster = fetch_cluster_info(fs_info, 4581 cache->space_info, 4582 &empty_cluster); 4583 empty_cluster <<= 1; 4584 } 4585 4586 len = cache->key.objectid + cache->key.offset - start; 4587 len = min(len, end + 1 - start); 4588 4589 if (start < cache->last_byte_to_unpin) { 4590 len = min(len, cache->last_byte_to_unpin - start); 4591 if (return_free_space) 4592 
btrfs_add_free_space(cache, start, len); 4593 } 4594 4595 start += len; 4596 total_unpinned += len; 4597 space_info = cache->space_info; 4598 4599 /* 4600 * If this space cluster has been marked as fragmented and we've 4601 * unpinned enough in this block group to potentially allow a 4602 * cluster to be created inside of it go ahead and clear the 4603 * fragmented check. 4604 */ 4605 if (cluster && cluster->fragmented && 4606 total_unpinned > empty_cluster) { 4607 spin_lock(&cluster->lock); 4608 cluster->fragmented = 0; 4609 spin_unlock(&cluster->lock); 4610 } 4611 4612 spin_lock(&space_info->lock); 4613 spin_lock(&cache->lock); 4614 cache->pinned -= len; 4615 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); 4616 4617 trace_btrfs_space_reservation(fs_info, "pinned", 4618 space_info->flags, len, 0); 4619 space_info->max_extent_size = 0; 4620 percpu_counter_add_batch(&space_info->total_bytes_pinned, 4621 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); 4622 if (cache->ro) { 4623 space_info->bytes_readonly += len; 4624 readonly = true; 4625 } 4626 spin_unlock(&cache->lock); 4627 if (!readonly && return_free_space && 4628 global_rsv->space_info == space_info) { 4629 u64 to_add = len; 4630 4631 spin_lock(&global_rsv->lock); 4632 if (!global_rsv->full) { 4633 to_add = min(len, global_rsv->size - 4634 global_rsv->reserved); 4635 global_rsv->reserved += to_add; 4636 btrfs_space_info_update_bytes_may_use(fs_info, 4637 space_info, to_add); 4638 if (global_rsv->reserved >= global_rsv->size) 4639 global_rsv->full = 1; 4640 trace_btrfs_space_reservation(fs_info, 4641 "space_info", 4642 space_info->flags, 4643 to_add, 1); 4644 len -= to_add; 4645 } 4646 spin_unlock(&global_rsv->lock); 4647 /* Add to any tickets we may have */ 4648 if (len) 4649 btrfs_space_info_add_new_bytes(fs_info, 4650 space_info, len); 4651 } 4652 spin_unlock(&space_info->lock); 4653 } 4654 4655 if (cache) 4656 btrfs_put_block_group(cache); 4657 return 0; 4658 } 4659 4660 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 4661 { 4662 struct btrfs_fs_info *fs_info = trans->fs_info; 4663 struct btrfs_block_group_cache *block_group, *tmp; 4664 struct list_head *deleted_bgs; 4665 struct extent_io_tree *unpin; 4666 u64 start; 4667 u64 end; 4668 int ret; 4669 4670 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4671 unpin = &fs_info->freed_extents[1]; 4672 else 4673 unpin = &fs_info->freed_extents[0]; 4674 4675 while (!trans->aborted) { 4676 struct extent_state *cached_state = NULL; 4677 4678 mutex_lock(&fs_info->unused_bg_unpin_mutex); 4679 ret = find_first_extent_bit(unpin, 0, &start, &end, 4680 EXTENT_DIRTY, &cached_state); 4681 if (ret) { 4682 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 4683 break; 4684 } 4685 4686 if (btrfs_test_opt(fs_info, DISCARD)) 4687 ret = btrfs_discard_extent(fs_info, start, 4688 end + 1 - start, NULL); 4689 4690 clear_extent_dirty(unpin, start, end, &cached_state); 4691 unpin_extent_range(fs_info, start, end, true); 4692 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 4693 free_extent_state(cached_state); 4694 cond_resched(); 4695 } 4696 4697 /* 4698 * Transaction is finished. We don't need the lock anymore. We 4699 * do need to clean up the block groups in case of a transaction 4700 * abort. 
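Each block group deleted during this transaction still gets a discard of its whole range (skipped, leaving -EROFS, if the transaction aborted), is removed from the deleted_bgs list, and has its trimming and block group references dropped.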
4701 */ 4702 deleted_bgs = &trans->transaction->deleted_bgs; 4703 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 4704 u64 trimmed = 0; 4705 4706 ret = -EROFS; 4707 if (!trans->aborted) 4708 ret = btrfs_discard_extent(fs_info, 4709 block_group->key.objectid, 4710 block_group->key.offset, 4711 &trimmed); 4712 4713 list_del_init(&block_group->bg_list); 4714 btrfs_put_block_group_trimming(block_group); 4715 btrfs_put_block_group(block_group); 4716 4717 if (ret) { 4718 const char *errstr = btrfs_decode_error(ret); 4719 btrfs_warn(fs_info, 4720 "discard failed while removing blockgroup: errno=%d %s", 4721 ret, errstr); 4722 } 4723 } 4724 4725 return 0; 4726 } 4727 4728 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 4729 struct btrfs_delayed_ref_node *node, u64 parent, 4730 u64 root_objectid, u64 owner_objectid, 4731 u64 owner_offset, int refs_to_drop, 4732 struct btrfs_delayed_extent_op *extent_op) 4733 { 4734 struct btrfs_fs_info *info = trans->fs_info; 4735 struct btrfs_key key; 4736 struct btrfs_path *path; 4737 struct btrfs_root *extent_root = info->extent_root; 4738 struct extent_buffer *leaf; 4739 struct btrfs_extent_item *ei; 4740 struct btrfs_extent_inline_ref *iref; 4741 int ret; 4742 int is_data; 4743 int extent_slot = 0; 4744 int found_extent = 0; 4745 int num_to_del = 1; 4746 u32 item_size; 4747 u64 refs; 4748 u64 bytenr = node->bytenr; 4749 u64 num_bytes = node->num_bytes; 4750 int last_ref = 0; 4751 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 4752 4753 path = btrfs_alloc_path(); 4754 if (!path) 4755 return -ENOMEM; 4756 4757 path->reada = READA_FORWARD; 4758 path->leave_spinning = 1; 4759 4760 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 4761 BUG_ON(!is_data && refs_to_drop != 1); 4762 4763 if (is_data) 4764 skinny_metadata = false; 4765 4766 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 4767 parent, root_objectid, owner_objectid, 4768 owner_offset); 4769 if (ret == 0) { 4770 extent_slot = path->slots[0]; 4771 while (extent_slot >= 0) { 4772 btrfs_item_key_to_cpu(path->nodes[0], &key, 4773 extent_slot); 4774 if (key.objectid != bytenr) 4775 break; 4776 if (key.type == BTRFS_EXTENT_ITEM_KEY && 4777 key.offset == num_bytes) { 4778 found_extent = 1; 4779 break; 4780 } 4781 if (key.type == BTRFS_METADATA_ITEM_KEY && 4782 key.offset == owner_objectid) { 4783 found_extent = 1; 4784 break; 4785 } 4786 if (path->slots[0] - extent_slot > 5) 4787 break; 4788 extent_slot--; 4789 } 4790 4791 if (!found_extent) { 4792 BUG_ON(iref); 4793 ret = remove_extent_backref(trans, path, NULL, 4794 refs_to_drop, 4795 is_data, &last_ref); 4796 if (ret) { 4797 btrfs_abort_transaction(trans, ret); 4798 goto out; 4799 } 4800 btrfs_release_path(path); 4801 path->leave_spinning = 1; 4802 4803 key.objectid = bytenr; 4804 key.type = BTRFS_EXTENT_ITEM_KEY; 4805 key.offset = num_bytes; 4806 4807 if (!is_data && skinny_metadata) { 4808 key.type = BTRFS_METADATA_ITEM_KEY; 4809 key.offset = owner_objectid; 4810 } 4811 4812 ret = btrfs_search_slot(trans, extent_root, 4813 &key, path, -1, 1); 4814 if (ret > 0 && skinny_metadata && path->slots[0]) { 4815 /* 4816 * Couldn't find our skinny metadata item, 4817 * see if we have ye olde extent item. 
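Back up one slot and accept the previous item if it is a regular EXTENT_ITEM for the same bytenr and size; failing that, the search is retried further below with skinny_metadata disabled and an EXTENT_ITEM key.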
4818 */ 4819 path->slots[0]--; 4820 btrfs_item_key_to_cpu(path->nodes[0], &key, 4821 path->slots[0]); 4822 if (key.objectid == bytenr && 4823 key.type == BTRFS_EXTENT_ITEM_KEY && 4824 key.offset == num_bytes) 4825 ret = 0; 4826 } 4827 4828 if (ret > 0 && skinny_metadata) { 4829 skinny_metadata = false; 4830 key.objectid = bytenr; 4831 key.type = BTRFS_EXTENT_ITEM_KEY; 4832 key.offset = num_bytes; 4833 btrfs_release_path(path); 4834 ret = btrfs_search_slot(trans, extent_root, 4835 &key, path, -1, 1); 4836 } 4837 4838 if (ret) { 4839 btrfs_err(info, 4840 "umm, got %d back from search, was looking for %llu", 4841 ret, bytenr); 4842 if (ret > 0) 4843 btrfs_print_leaf(path->nodes[0]); 4844 } 4845 if (ret < 0) { 4846 btrfs_abort_transaction(trans, ret); 4847 goto out; 4848 } 4849 extent_slot = path->slots[0]; 4850 } 4851 } else if (WARN_ON(ret == -ENOENT)) { 4852 btrfs_print_leaf(path->nodes[0]); 4853 btrfs_err(info, 4854 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 4855 bytenr, parent, root_objectid, owner_objectid, 4856 owner_offset); 4857 btrfs_abort_transaction(trans, ret); 4858 goto out; 4859 } else { 4860 btrfs_abort_transaction(trans, ret); 4861 goto out; 4862 } 4863 4864 leaf = path->nodes[0]; 4865 item_size = btrfs_item_size_nr(leaf, extent_slot); 4866 if (unlikely(item_size < sizeof(*ei))) { 4867 ret = -EINVAL; 4868 btrfs_print_v0_err(info); 4869 btrfs_abort_transaction(trans, ret); 4870 goto out; 4871 } 4872 ei = btrfs_item_ptr(leaf, extent_slot, 4873 struct btrfs_extent_item); 4874 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 4875 key.type == BTRFS_EXTENT_ITEM_KEY) { 4876 struct btrfs_tree_block_info *bi; 4877 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 4878 bi = (struct btrfs_tree_block_info *)(ei + 1); 4879 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 4880 } 4881 4882 refs = btrfs_extent_refs(leaf, ei); 4883 if (refs < refs_to_drop) { 4884 btrfs_err(info, 4885 "trying to drop %d refs but we only have %Lu for bytenr %Lu", 4886 refs_to_drop, refs, bytenr); 4887 ret = -EINVAL; 4888 btrfs_abort_transaction(trans, ret); 4889 goto out; 4890 } 4891 refs -= refs_to_drop; 4892 4893 if (refs > 0) { 4894 if (extent_op) 4895 __run_delayed_extent_op(extent_op, leaf, ei); 4896 /* 4897 * In the case of inline back ref, reference count will 4898 * be updated by remove_extent_backref 4899 */ 4900 if (iref) { 4901 BUG_ON(!found_extent); 4902 } else { 4903 btrfs_set_extent_refs(leaf, ei, refs); 4904 btrfs_mark_buffer_dirty(leaf); 4905 } 4906 if (found_extent) { 4907 ret = remove_extent_backref(trans, path, iref, 4908 refs_to_drop, is_data, 4909 &last_ref); 4910 if (ret) { 4911 btrfs_abort_transaction(trans, ret); 4912 goto out; 4913 } 4914 } 4915 } else { 4916 if (found_extent) { 4917 BUG_ON(is_data && refs_to_drop != 4918 extent_data_ref_count(path, iref)); 4919 if (iref) { 4920 BUG_ON(path->slots[0] != extent_slot); 4921 } else { 4922 BUG_ON(path->slots[0] != extent_slot + 1); 4923 path->slots[0] = extent_slot; 4924 num_to_del = 2; 4925 } 4926 } 4927 4928 last_ref = 1; 4929 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4930 num_to_del); 4931 if (ret) { 4932 btrfs_abort_transaction(trans, ret); 4933 goto out; 4934 } 4935 btrfs_release_path(path); 4936 4937 if (is_data) { 4938 ret = btrfs_del_csums(trans, info, bytenr, num_bytes); 4939 if (ret) { 4940 btrfs_abort_transaction(trans, ret); 4941 goto out; 4942 } 4943 } 4944 4945 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 4946 if (ret) { 4947 
btrfs_abort_transaction(trans, ret); 4948 goto out; 4949 } 4950 4951 ret = update_block_group(trans, bytenr, num_bytes, 0); 4952 if (ret) { 4953 btrfs_abort_transaction(trans, ret); 4954 goto out; 4955 } 4956 } 4957 btrfs_release_path(path); 4958 4959 out: 4960 btrfs_free_path(path); 4961 return ret; 4962 } 4963 4964 /* 4965 * when we free an block, it is possible (and likely) that we free the last 4966 * delayed ref for that extent as well. This searches the delayed ref tree for 4967 * a given extent, and if there are no other delayed refs to be processed, it 4968 * removes it from the tree. 4969 */ 4970 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 4971 u64 bytenr) 4972 { 4973 struct btrfs_delayed_ref_head *head; 4974 struct btrfs_delayed_ref_root *delayed_refs; 4975 int ret = 0; 4976 4977 delayed_refs = &trans->transaction->delayed_refs; 4978 spin_lock(&delayed_refs->lock); 4979 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 4980 if (!head) 4981 goto out_delayed_unlock; 4982 4983 spin_lock(&head->lock); 4984 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 4985 goto out; 4986 4987 if (cleanup_extent_op(head) != NULL) 4988 goto out; 4989 4990 /* 4991 * waiting for the lock here would deadlock. If someone else has it 4992 * locked they are already in the process of dropping it anyway 4993 */ 4994 if (!mutex_trylock(&head->mutex)) 4995 goto out; 4996 4997 btrfs_delete_ref_head(delayed_refs, head); 4998 head->processing = 0; 4999 5000 spin_unlock(&head->lock); 5001 spin_unlock(&delayed_refs->lock); 5002 5003 BUG_ON(head->extent_op); 5004 if (head->must_insert_reserved) 5005 ret = 1; 5006 5007 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); 5008 mutex_unlock(&head->mutex); 5009 btrfs_put_delayed_ref_head(head); 5010 return ret; 5011 out: 5012 spin_unlock(&head->lock); 5013 5014 out_delayed_unlock: 5015 spin_unlock(&delayed_refs->lock); 5016 return 0; 5017 } 5018 5019 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5020 struct btrfs_root *root, 5021 struct extent_buffer *buf, 5022 u64 parent, int last_ref) 5023 { 5024 struct btrfs_fs_info *fs_info = root->fs_info; 5025 struct btrfs_ref generic_ref = { 0 }; 5026 int pin = 1; 5027 int ret; 5028 5029 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, 5030 buf->start, buf->len, parent); 5031 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 5032 root->root_key.objectid); 5033 5034 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5035 int old_ref_mod, new_ref_mod; 5036 5037 btrfs_ref_tree_mod(fs_info, &generic_ref); 5038 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, 5039 &old_ref_mod, &new_ref_mod); 5040 BUG_ON(ret); /* -ENOMEM */ 5041 pin = old_ref_mod >= 0 && new_ref_mod < 0; 5042 } 5043 5044 if (last_ref && btrfs_header_generation(buf) == trans->transid) { 5045 struct btrfs_block_group_cache *cache; 5046 5047 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5048 ret = check_ref_cleanup(trans, buf->start); 5049 if (!ret) 5050 goto out; 5051 } 5052 5053 pin = 0; 5054 cache = btrfs_lookup_block_group(fs_info, buf->start); 5055 5056 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 5057 pin_down_extent(cache, buf->start, buf->len, 1); 5058 btrfs_put_block_group(cache); 5059 goto out; 5060 } 5061 5062 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 5063 5064 btrfs_add_free_space(cache, buf->start, buf->len); 5065 btrfs_free_reserved_bytes(cache, buf->len, 0); 5066 btrfs_put_block_group(cache); 5067 
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 5068 } 5069 out: 5070 if (pin) 5071 add_pinned_bytes(fs_info, &generic_ref); 5072 5073 if (last_ref) { 5074 /* 5075 * Deleting the buffer, clear the corrupt flag since it doesn't 5076 * matter anymore. 5077 */ 5078 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 5079 } 5080 } 5081 5082 /* Can return -ENOMEM */ 5083 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) 5084 { 5085 struct btrfs_fs_info *fs_info = trans->fs_info; 5086 int old_ref_mod, new_ref_mod; 5087 int ret; 5088 5089 if (btrfs_is_testing(fs_info)) 5090 return 0; 5091 5092 /* 5093 * tree log blocks never actually go into the extent allocation 5094 * tree, just update pinning info and exit early. 5095 */ 5096 if ((ref->type == BTRFS_REF_METADATA && 5097 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || 5098 (ref->type == BTRFS_REF_DATA && 5099 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { 5100 /* unlocks the pinned mutex */ 5101 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); 5102 old_ref_mod = new_ref_mod = 0; 5103 ret = 0; 5104 } else if (ref->type == BTRFS_REF_METADATA) { 5105 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, 5106 &old_ref_mod, &new_ref_mod); 5107 } else { 5108 ret = btrfs_add_delayed_data_ref(trans, ref, 0, 5109 &old_ref_mod, &new_ref_mod); 5110 } 5111 5112 if (!((ref->type == BTRFS_REF_METADATA && 5113 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || 5114 (ref->type == BTRFS_REF_DATA && 5115 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) 5116 btrfs_ref_tree_mod(fs_info, ref); 5117 5118 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) 5119 add_pinned_bytes(fs_info, ref); 5120 5121 return ret; 5122 } 5123 5124 /* 5125 * when we wait for progress in the block group caching, its because 5126 * our allocation attempt failed at least once. So, we must sleep 5127 * and let some progress happen before we try again. 5128 * 5129 * This function will sleep at least once waiting for new free space to 5130 * show up, and then it will check the block group free space numbers 5131 * for our min num_bytes. Another option is to have it go ahead 5132 * and look in the rbtree for a free extent of a given size, but this 5133 * is a good start. 5134 * 5135 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 5136 * any of the information in this block group. 5137 */ 5138 static noinline void 5139 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 5140 u64 num_bytes) 5141 { 5142 struct btrfs_caching_control *caching_ctl; 5143 5144 caching_ctl = get_caching_control(cache); 5145 if (!caching_ctl) 5146 return; 5147 5148 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 5149 (cache->free_space_ctl->free_space >= num_bytes)); 5150 5151 put_caching_control(caching_ctl); 5152 } 5153 5154 static noinline int 5155 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5156 { 5157 struct btrfs_caching_control *caching_ctl; 5158 int ret = 0; 5159 5160 caching_ctl = get_caching_control(cache); 5161 if (!caching_ctl) 5162 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 5163 5164 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 5165 if (cache->cached == BTRFS_CACHE_ERROR) 5166 ret = -EIO; 5167 put_caching_control(caching_ctl); 5168 return ret; 5169 } 5170 5171 enum btrfs_loop_type { 5172 LOOP_CACHING_NOWAIT, 5173 LOOP_CACHING_WAIT, 5174 LOOP_ALLOC_CHUNK, 5175 LOOP_NO_EMPTY_SIZE, 5176 }; 5177 5178 static inline void 5179 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 5180 int delalloc) 5181 { 5182 if (delalloc) 5183 down_read(&cache->data_rwsem); 5184 } 5185 5186 static inline void 5187 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 5188 int delalloc) 5189 { 5190 btrfs_get_block_group(cache); 5191 if (delalloc) 5192 down_read(&cache->data_rwsem); 5193 } 5194 5195 static struct btrfs_block_group_cache * 5196 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 5197 struct btrfs_free_cluster *cluster, 5198 int delalloc) 5199 { 5200 struct btrfs_block_group_cache *used_bg = NULL; 5201 5202 spin_lock(&cluster->refill_lock); 5203 while (1) { 5204 used_bg = cluster->block_group; 5205 if (!used_bg) 5206 return NULL; 5207 5208 if (used_bg == block_group) 5209 return used_bg; 5210 5211 btrfs_get_block_group(used_bg); 5212 5213 if (!delalloc) 5214 return used_bg; 5215 5216 if (down_read_trylock(&used_bg->data_rwsem)) 5217 return used_bg; 5218 5219 spin_unlock(&cluster->refill_lock); 5220 5221 /* We should only have one-level nested. */ 5222 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 5223 5224 spin_lock(&cluster->refill_lock); 5225 if (used_bg == cluster->block_group) 5226 return used_bg; 5227 5228 up_read(&used_bg->data_rwsem); 5229 btrfs_put_block_group(used_bg); 5230 } 5231 } 5232 5233 static inline void 5234 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 5235 int delalloc) 5236 { 5237 if (delalloc) 5238 up_read(&cache->data_rwsem); 5239 btrfs_put_block_group(cache); 5240 } 5241 5242 /* 5243 * Structure used internally for find_free_extent() function. Wraps needed 5244 * parameters. 5245 */ 5246 struct find_free_extent_ctl { 5247 /* Basic allocation info */ 5248 u64 ram_bytes; 5249 u64 num_bytes; 5250 u64 empty_size; 5251 u64 flags; 5252 int delalloc; 5253 5254 /* Where to start the search inside the bg */ 5255 u64 search_start; 5256 5257 /* For clustered allocation */ 5258 u64 empty_cluster; 5259 5260 bool have_caching_bg; 5261 bool orig_have_caching_bg; 5262 5263 /* RAID index, converted from flags */ 5264 int index; 5265 5266 /* 5267 * Current loop number, check find_free_extent_update_loop() for details 5268 */ 5269 int loop; 5270 5271 /* 5272 * Whether we're refilling a cluster, if true we need to re-search 5273 * current block group but don't try to refill the cluster again. 5274 */ 5275 bool retry_clustered; 5276 5277 /* 5278 * Whether we're updating free space cache, if true we need to re-search 5279 * current block group but don't try updating free space cache again. 5280 */ 5281 bool retry_unclustered; 5282 5283 /* If current block group is cached */ 5284 int cached; 5285 5286 /* Max contiguous hole found */ 5287 u64 max_extent_size; 5288 5289 /* Total free space from free space cache, not always contiguous */ 5290 u64 total_free_space; 5291 5292 /* Found result */ 5293 u64 found_offset; 5294 }; 5295 5296 5297 /* 5298 * Helper function for find_free_extent(). 5299 * 5300 * Return -ENOENT to inform caller that we need fallback to unclustered mode. 
5301 * Return -EAGAIN to inform caller that we need to re-search this block group 5302 * Return >0 to inform caller that we find nothing 5303 * Return 0 means we have found a location and set ffe_ctl->found_offset. 5304 */ 5305 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, 5306 struct btrfs_free_cluster *last_ptr, 5307 struct find_free_extent_ctl *ffe_ctl, 5308 struct btrfs_block_group_cache **cluster_bg_ret) 5309 { 5310 struct btrfs_block_group_cache *cluster_bg; 5311 u64 aligned_cluster; 5312 u64 offset; 5313 int ret; 5314 5315 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); 5316 if (!cluster_bg) 5317 goto refill_cluster; 5318 if (cluster_bg != bg && (cluster_bg->ro || 5319 !block_group_bits(cluster_bg, ffe_ctl->flags))) 5320 goto release_cluster; 5321 5322 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, 5323 ffe_ctl->num_bytes, cluster_bg->key.objectid, 5324 &ffe_ctl->max_extent_size); 5325 if (offset) { 5326 /* We have a block, we're done */ 5327 spin_unlock(&last_ptr->refill_lock); 5328 trace_btrfs_reserve_extent_cluster(cluster_bg, 5329 ffe_ctl->search_start, ffe_ctl->num_bytes); 5330 *cluster_bg_ret = cluster_bg; 5331 ffe_ctl->found_offset = offset; 5332 return 0; 5333 } 5334 WARN_ON(last_ptr->block_group != cluster_bg); 5335 5336 release_cluster: 5337 /* 5338 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so 5339 * lets just skip it and let the allocator find whatever block it can 5340 * find. If we reach this point, we will have tried the cluster 5341 * allocator plenty of times and not have found anything, so we are 5342 * likely way too fragmented for the clustering stuff to find anything. 5343 * 5344 * However, if the cluster is taken from the current block group, 5345 * release the cluster first, so that we stand a better chance of 5346 * succeeding in the unclustered allocation. 
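Put differently: on LOOP_NO_EMPTY_SIZE we give up with -ENOENT right away only if the cluster belongs to another block group; if it is our own, the cluster is first returned to the free space and -ENOENT is then returned from the refill_cluster path below.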
5347 */ 5348 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { 5349 spin_unlock(&last_ptr->refill_lock); 5350 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 5351 return -ENOENT; 5352 } 5353 5354 /* This cluster didn't work out, free it and start over */ 5355 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5356 5357 if (cluster_bg != bg) 5358 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 5359 5360 refill_cluster: 5361 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { 5362 spin_unlock(&last_ptr->refill_lock); 5363 return -ENOENT; 5364 } 5365 5366 aligned_cluster = max_t(u64, 5367 ffe_ctl->empty_cluster + ffe_ctl->empty_size, 5368 bg->full_stripe_len); 5369 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, 5370 ffe_ctl->num_bytes, aligned_cluster); 5371 if (ret == 0) { 5372 /* Now pull our allocation out of this cluster */ 5373 offset = btrfs_alloc_from_cluster(bg, last_ptr, 5374 ffe_ctl->num_bytes, ffe_ctl->search_start, 5375 &ffe_ctl->max_extent_size); 5376 if (offset) { 5377 /* We found one, proceed */ 5378 spin_unlock(&last_ptr->refill_lock); 5379 trace_btrfs_reserve_extent_cluster(bg, 5380 ffe_ctl->search_start, 5381 ffe_ctl->num_bytes); 5382 ffe_ctl->found_offset = offset; 5383 return 0; 5384 } 5385 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && 5386 !ffe_ctl->retry_clustered) { 5387 spin_unlock(&last_ptr->refill_lock); 5388 5389 ffe_ctl->retry_clustered = true; 5390 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 5391 ffe_ctl->empty_cluster + ffe_ctl->empty_size); 5392 return -EAGAIN; 5393 } 5394 /* 5395 * At this point we either didn't find a cluster or we weren't able to 5396 * allocate a block from our cluster. Free the cluster we've been 5397 * trying to use, and go to the next block group. 5398 */ 5399 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5400 spin_unlock(&last_ptr->refill_lock); 5401 return 1; 5402 } 5403 5404 /* 5405 * Return >0 to inform caller that we find nothing 5406 * Return 0 when we found an free extent and set ffe_ctrl->found_offset 5407 * Return -EAGAIN to inform caller that we need to re-search this block group 5408 */ 5409 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, 5410 struct btrfs_free_cluster *last_ptr, 5411 struct find_free_extent_ctl *ffe_ctl) 5412 { 5413 u64 offset; 5414 5415 /* 5416 * We are doing an unclustered allocation, set the fragmented flag so 5417 * we don't bother trying to setup a cluster again until we get more 5418 * space. 
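The fragmented flag lives on the cluster itself and is cleared again in unpin_extent_range() once enough space has been unpinned in a block group to make clustering worth trying again.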
5419 */ 5420 if (unlikely(last_ptr)) { 5421 spin_lock(&last_ptr->lock); 5422 last_ptr->fragmented = 1; 5423 spin_unlock(&last_ptr->lock); 5424 } 5425 if (ffe_ctl->cached) { 5426 struct btrfs_free_space_ctl *free_space_ctl; 5427 5428 free_space_ctl = bg->free_space_ctl; 5429 spin_lock(&free_space_ctl->tree_lock); 5430 if (free_space_ctl->free_space < 5431 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 5432 ffe_ctl->empty_size) { 5433 ffe_ctl->total_free_space = max_t(u64, 5434 ffe_ctl->total_free_space, 5435 free_space_ctl->free_space); 5436 spin_unlock(&free_space_ctl->tree_lock); 5437 return 1; 5438 } 5439 spin_unlock(&free_space_ctl->tree_lock); 5440 } 5441 5442 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 5443 ffe_ctl->num_bytes, ffe_ctl->empty_size, 5444 &ffe_ctl->max_extent_size); 5445 5446 /* 5447 * If we didn't find a chunk, and we haven't failed on this block group 5448 * before, and this block group is in the middle of caching and we are 5449 * ok with waiting, then go ahead and wait for progress to be made, and 5450 * set @retry_unclustered to true. 5451 * 5452 * If @retry_unclustered is true then we've already waited on this 5453 * block group once and should move on to the next block group. 5454 */ 5455 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && 5456 ffe_ctl->loop > LOOP_CACHING_NOWAIT) { 5457 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 5458 ffe_ctl->empty_size); 5459 ffe_ctl->retry_unclustered = true; 5460 return -EAGAIN; 5461 } else if (!offset) { 5462 return 1; 5463 } 5464 ffe_ctl->found_offset = offset; 5465 return 0; 5466 } 5467 5468 /* 5469 * Return >0 means caller needs to re-search for free extent 5470 * Return 0 means we have the needed free extent. 5471 * Return <0 means we failed to locate any free extent. 5472 */ 5473 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, 5474 struct btrfs_free_cluster *last_ptr, 5475 struct btrfs_key *ins, 5476 struct find_free_extent_ctl *ffe_ctl, 5477 int full_search, bool use_cluster) 5478 { 5479 struct btrfs_root *root = fs_info->extent_root; 5480 int ret; 5481 5482 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && 5483 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) 5484 ffe_ctl->orig_have_caching_bg = true; 5485 5486 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && 5487 ffe_ctl->have_caching_bg) 5488 return 1; 5489 5490 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) 5491 return 1; 5492 5493 if (ins->objectid) { 5494 if (!use_cluster && last_ptr) { 5495 spin_lock(&last_ptr->lock); 5496 last_ptr->window_start = ins->objectid; 5497 spin_unlock(&last_ptr->lock); 5498 } 5499 return 0; 5500 } 5501 5502 /* 5503 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 5504 * caching kthreads as we move along 5505 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 5506 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 5507 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 5508 * again 5509 */ 5510 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { 5511 ffe_ctl->index = 0; 5512 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { 5513 /* 5514 * We want to skip the LOOP_CACHING_WAIT step if we 5515 * don't have any uncached bgs and we've already done a 5516 * full search through. 
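So we only advance to LOOP_CACHING_WAIT when the first pass saw an uncached block group (orig_have_caching_bg) or was not a full search; otherwise we go straight to LOOP_ALLOC_CHUNK.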
5517 */ 5518 if (ffe_ctl->orig_have_caching_bg || !full_search) 5519 ffe_ctl->loop = LOOP_CACHING_WAIT; 5520 else 5521 ffe_ctl->loop = LOOP_ALLOC_CHUNK; 5522 } else { 5523 ffe_ctl->loop++; 5524 } 5525 5526 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 5527 struct btrfs_trans_handle *trans; 5528 int exist = 0; 5529 5530 trans = current->journal_info; 5531 if (trans) 5532 exist = 1; 5533 else 5534 trans = btrfs_join_transaction(root); 5535 5536 if (IS_ERR(trans)) { 5537 ret = PTR_ERR(trans); 5538 return ret; 5539 } 5540 5541 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, 5542 CHUNK_ALLOC_FORCE); 5543 5544 /* 5545 * If we can't allocate a new chunk we've already looped 5546 * through at least once, move on to the NO_EMPTY_SIZE 5547 * case. 5548 */ 5549 if (ret == -ENOSPC) 5550 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; 5551 5552 /* Do not bail out on ENOSPC since we can do more. */ 5553 if (ret < 0 && ret != -ENOSPC) 5554 btrfs_abort_transaction(trans, ret); 5555 else 5556 ret = 0; 5557 if (!exist) 5558 btrfs_end_transaction(trans); 5559 if (ret) 5560 return ret; 5561 } 5562 5563 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 5564 /* 5565 * Don't loop again if we already have no empty_size and 5566 * no empty_cluster. 5567 */ 5568 if (ffe_ctl->empty_size == 0 && 5569 ffe_ctl->empty_cluster == 0) 5570 return -ENOSPC; 5571 ffe_ctl->empty_size = 0; 5572 ffe_ctl->empty_cluster = 0; 5573 } 5574 return 1; 5575 } 5576 return -ENOSPC; 5577 } 5578 5579 /* 5580 * walks the btree of allocated extents and find a hole of a given size. 5581 * The key ins is changed to record the hole: 5582 * ins->objectid == start position 5583 * ins->flags = BTRFS_EXTENT_ITEM_KEY 5584 * ins->offset == the size of the hole. 5585 * Any available blocks before search_start are skipped. 5586 * 5587 * If there is no suitable free space, we will record the max size of 5588 * the free space extent currently. 
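That recorded size ends up in the space info's max_extent_size, which lets callers such as btrfs_reserve_extent() retry with a smaller request on -ENOSPC.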
5589 * 5590 * The overall logic and call chain: 5591 * 5592 * find_free_extent() 5593 * |- Iterate through all block groups 5594 * | |- Get a valid block group 5595 * | |- Try to do clustered allocation in that block group 5596 * | |- Try to do unclustered allocation in that block group 5597 * | |- Check if the result is valid 5598 * | | |- If valid, then exit 5599 * | |- Jump to next block group 5600 * | 5601 * |- Push harder to find free extents 5602 * |- If not found, re-iterate all block groups 5603 */ 5604 static noinline int find_free_extent(struct btrfs_fs_info *fs_info, 5605 u64 ram_bytes, u64 num_bytes, u64 empty_size, 5606 u64 hint_byte, struct btrfs_key *ins, 5607 u64 flags, int delalloc) 5608 { 5609 int ret = 0; 5610 struct btrfs_free_cluster *last_ptr = NULL; 5611 struct btrfs_block_group_cache *block_group = NULL; 5612 struct find_free_extent_ctl ffe_ctl = {0}; 5613 struct btrfs_space_info *space_info; 5614 bool use_cluster = true; 5615 bool full_search = false; 5616 5617 WARN_ON(num_bytes < fs_info->sectorsize); 5618 5619 ffe_ctl.ram_bytes = ram_bytes; 5620 ffe_ctl.num_bytes = num_bytes; 5621 ffe_ctl.empty_size = empty_size; 5622 ffe_ctl.flags = flags; 5623 ffe_ctl.search_start = 0; 5624 ffe_ctl.retry_clustered = false; 5625 ffe_ctl.retry_unclustered = false; 5626 ffe_ctl.delalloc = delalloc; 5627 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); 5628 ffe_ctl.have_caching_bg = false; 5629 ffe_ctl.orig_have_caching_bg = false; 5630 ffe_ctl.found_offset = 0; 5631 5632 ins->type = BTRFS_EXTENT_ITEM_KEY; 5633 ins->objectid = 0; 5634 ins->offset = 0; 5635 5636 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 5637 5638 space_info = btrfs_find_space_info(fs_info, flags); 5639 if (!space_info) { 5640 btrfs_err(fs_info, "No space info for %llu", flags); 5641 return -ENOSPC; 5642 } 5643 5644 /* 5645 * If our free space is heavily fragmented we may not be able to make 5646 * big contiguous allocations, so instead of doing the expensive search 5647 * for free space, simply return ENOSPC with our max_extent_size so we 5648 * can go ahead and search for a more manageable chunk. 5649 * 5650 * If our max_extent_size is large enough for our allocation simply 5651 * disable clustering since we will likely not be able to find enough 5652 * space to create a cluster and induce latency trying. 5653 */ 5654 if (unlikely(space_info->max_extent_size)) { 5655 spin_lock(&space_info->lock); 5656 if (space_info->max_extent_size && 5657 num_bytes > space_info->max_extent_size) { 5658 ins->offset = space_info->max_extent_size; 5659 spin_unlock(&space_info->lock); 5660 return -ENOSPC; 5661 } else if (space_info->max_extent_size) { 5662 use_cluster = false; 5663 } 5664 spin_unlock(&space_info->lock); 5665 } 5666 5667 last_ptr = fetch_cluster_info(fs_info, space_info, 5668 &ffe_ctl.empty_cluster); 5669 if (last_ptr) { 5670 spin_lock(&last_ptr->lock); 5671 if (last_ptr->block_group) 5672 hint_byte = last_ptr->window_start; 5673 if (last_ptr->fragmented) { 5674 /* 5675 * We still set window_start so we can keep track of the 5676 * last place we found an allocation to try and save 5677 * some time. 
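The cluster itself is not used in that case: only its window_start is taken as the search hint, and use_cluster is set to false below.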
5678 */ 5679 hint_byte = last_ptr->window_start; 5680 use_cluster = false; 5681 } 5682 spin_unlock(&last_ptr->lock); 5683 } 5684 5685 ffe_ctl.search_start = max(ffe_ctl.search_start, 5686 first_logical_byte(fs_info, 0)); 5687 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); 5688 if (ffe_ctl.search_start == hint_byte) { 5689 block_group = btrfs_lookup_block_group(fs_info, 5690 ffe_ctl.search_start); 5691 /* 5692 * we don't want to use the block group if it doesn't match our 5693 * allocation bits, or if its not cached. 5694 * 5695 * However if we are re-searching with an ideal block group 5696 * picked out then we don't care that the block group is cached. 5697 */ 5698 if (block_group && block_group_bits(block_group, flags) && 5699 block_group->cached != BTRFS_CACHE_NO) { 5700 down_read(&space_info->groups_sem); 5701 if (list_empty(&block_group->list) || 5702 block_group->ro) { 5703 /* 5704 * someone is removing this block group, 5705 * we can't jump into the have_block_group 5706 * target because our list pointers are not 5707 * valid 5708 */ 5709 btrfs_put_block_group(block_group); 5710 up_read(&space_info->groups_sem); 5711 } else { 5712 ffe_ctl.index = btrfs_bg_flags_to_raid_index( 5713 block_group->flags); 5714 btrfs_lock_block_group(block_group, delalloc); 5715 goto have_block_group; 5716 } 5717 } else if (block_group) { 5718 btrfs_put_block_group(block_group); 5719 } 5720 } 5721 search: 5722 ffe_ctl.have_caching_bg = false; 5723 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || 5724 ffe_ctl.index == 0) 5725 full_search = true; 5726 down_read(&space_info->groups_sem); 5727 list_for_each_entry(block_group, 5728 &space_info->block_groups[ffe_ctl.index], list) { 5729 /* If the block group is read-only, we can skip it entirely. */ 5730 if (unlikely(block_group->ro)) 5731 continue; 5732 5733 btrfs_grab_block_group(block_group, delalloc); 5734 ffe_ctl.search_start = block_group->key.objectid; 5735 5736 /* 5737 * this can happen if we end up cycling through all the 5738 * raid types, but we want to make sure we only allocate 5739 * for the proper type. 5740 */ 5741 if (!block_group_bits(block_group, flags)) { 5742 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5743 BTRFS_BLOCK_GROUP_RAID1_MASK | 5744 BTRFS_BLOCK_GROUP_RAID56_MASK | 5745 BTRFS_BLOCK_GROUP_RAID10; 5746 5747 /* 5748 * if they asked for extra copies and this block group 5749 * doesn't provide them, bail. This does allow us to 5750 * fill raid0 from raid1. 
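The 'extra' mask built above covers DUP, the RAID1 profiles, RAID5/6 and RAID10, i.e. every profile that keeps more than one copy of the data or adds parity.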
5751 */ 5752 if ((flags & extra) && !(block_group->flags & extra)) 5753 goto loop; 5754 } 5755 5756 have_block_group: 5757 ffe_ctl.cached = block_group_cache_done(block_group); 5758 if (unlikely(!ffe_ctl.cached)) { 5759 ffe_ctl.have_caching_bg = true; 5760 ret = cache_block_group(block_group, 0); 5761 BUG_ON(ret < 0); 5762 ret = 0; 5763 } 5764 5765 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 5766 goto loop; 5767 5768 /* 5769 * Ok we want to try and use the cluster allocator, so 5770 * lets look there 5771 */ 5772 if (last_ptr && use_cluster) { 5773 struct btrfs_block_group_cache *cluster_bg = NULL; 5774 5775 ret = find_free_extent_clustered(block_group, last_ptr, 5776 &ffe_ctl, &cluster_bg); 5777 5778 if (ret == 0) { 5779 if (cluster_bg && cluster_bg != block_group) { 5780 btrfs_release_block_group(block_group, 5781 delalloc); 5782 block_group = cluster_bg; 5783 } 5784 goto checks; 5785 } else if (ret == -EAGAIN) { 5786 goto have_block_group; 5787 } else if (ret > 0) { 5788 goto loop; 5789 } 5790 /* ret == -ENOENT case falls through */ 5791 } 5792 5793 ret = find_free_extent_unclustered(block_group, last_ptr, 5794 &ffe_ctl); 5795 if (ret == -EAGAIN) 5796 goto have_block_group; 5797 else if (ret > 0) 5798 goto loop; 5799 /* ret == 0 case falls through */ 5800 checks: 5801 ffe_ctl.search_start = round_up(ffe_ctl.found_offset, 5802 fs_info->stripesize); 5803 5804 /* move on to the next group */ 5805 if (ffe_ctl.search_start + num_bytes > 5806 block_group->key.objectid + block_group->key.offset) { 5807 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 5808 num_bytes); 5809 goto loop; 5810 } 5811 5812 if (ffe_ctl.found_offset < ffe_ctl.search_start) 5813 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 5814 ffe_ctl.search_start - ffe_ctl.found_offset); 5815 5816 ret = btrfs_add_reserved_bytes(block_group, ram_bytes, 5817 num_bytes, delalloc); 5818 if (ret == -EAGAIN) { 5819 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 5820 num_bytes); 5821 goto loop; 5822 } 5823 btrfs_inc_block_group_reservations(block_group); 5824 5825 /* we are all good, lets return */ 5826 ins->objectid = ffe_ctl.search_start; 5827 ins->offset = num_bytes; 5828 5829 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, 5830 num_bytes); 5831 btrfs_release_block_group(block_group, delalloc); 5832 break; 5833 loop: 5834 ffe_ctl.retry_clustered = false; 5835 ffe_ctl.retry_unclustered = false; 5836 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 5837 ffe_ctl.index); 5838 btrfs_release_block_group(block_group, delalloc); 5839 cond_resched(); 5840 } 5841 up_read(&space_info->groups_sem); 5842 5843 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, 5844 full_search, use_cluster); 5845 if (ret > 0) 5846 goto search; 5847 5848 if (ret == -ENOSPC) { 5849 /* 5850 * Use ffe_ctl->total_free_space as fallback if we can't find 5851 * any contiguous hole. 5852 */ 5853 if (!ffe_ctl.max_extent_size) 5854 ffe_ctl.max_extent_size = ffe_ctl.total_free_space; 5855 spin_lock(&space_info->lock); 5856 space_info->max_extent_size = ffe_ctl.max_extent_size; 5857 spin_unlock(&space_info->lock); 5858 ins->offset = ffe_ctl.max_extent_size; 5859 } 5860 return ret; 5861 } 5862 5863 /* 5864 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 5865 * hole that is at least as big as @num_bytes. 5866 * 5867 * @root - The root that will contain this extent 5868 * 5869 * @ram_bytes - The amount of space in ram that @num_bytes take. 
This 5870 * is used for accounting purposes. This value differs 5871 * from @num_bytes only in the case of compressed extents. 5872 * 5873 * @num_bytes - Number of bytes to allocate on-disk. 5874 * 5875 * @min_alloc_size - Indicates the minimum amount of space that the 5876 * allocator should try to satisfy. In some cases 5877 * @num_bytes may be larger than what is required and if 5878 * the filesystem is fragmented then allocation fails. 5879 * However, the presence of @min_alloc_size gives a 5880 * chance to try and satisfy the smaller allocation. 5881 * 5882 * @empty_size - A hint that you plan on doing more COW. This is the 5883 * size in bytes the allocator should try to find free 5884 * next to the block it returns. This is just a hint and 5885 * may be ignored by the allocator. 5886 * 5887 * @hint_byte - Hint to the allocator to start searching above the byte 5888 * address passed. It might be ignored. 5889 * 5890 * @ins - This key is modified to record the found hole. It will 5891 * have the following values: 5892 * ins->objectid == start position 5893 * ins->flags = BTRFS_EXTENT_ITEM_KEY 5894 * ins->offset == the size of the hole. 5895 * 5896 * @is_data - Boolean flag indicating whether an extent is 5897 * allocated for data (true) or metadata (false) 5898 * 5899 * @delalloc - Boolean flag indicating whether this allocation is for 5900 * delalloc or not. If 'true' data_rwsem of block groups 5901 * is going to be acquired. 5902 * 5903 * 5904 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In 5905 * case -ENOSPC is returned then @ins->offset will contain the size of the 5906 * largest available hole the allocator managed to find. 5907 */ 5908 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 5909 u64 num_bytes, u64 min_alloc_size, 5910 u64 empty_size, u64 hint_byte, 5911 struct btrfs_key *ins, int is_data, int delalloc) 5912 { 5913 struct btrfs_fs_info *fs_info = root->fs_info; 5914 bool final_tried = num_bytes == min_alloc_size; 5915 u64 flags; 5916 int ret; 5917 5918 flags = get_alloc_profile_by_root(root, is_data); 5919 again: 5920 WARN_ON(num_bytes < fs_info->sectorsize); 5921 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 5922 hint_byte, ins, flags, delalloc); 5923 if (!ret && !is_data) { 5924 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 5925 } else if (ret == -ENOSPC) { 5926 if (!final_tried && ins->offset) { 5927 num_bytes = min(num_bytes >> 1, ins->offset); 5928 num_bytes = round_down(num_bytes, 5929 fs_info->sectorsize); 5930 num_bytes = max(num_bytes, min_alloc_size); 5931 ram_bytes = num_bytes; 5932 if (num_bytes == min_alloc_size) 5933 final_tried = true; 5934 goto again; 5935 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 5936 struct btrfs_space_info *sinfo; 5937 5938 sinfo = btrfs_find_space_info(fs_info, flags); 5939 btrfs_err(fs_info, 5940 "allocation failed flags %llu, wanted %llu", 5941 flags, num_bytes); 5942 if (sinfo) 5943 btrfs_dump_space_info(fs_info, sinfo, 5944 num_bytes, 1); 5945 } 5946 } 5947 5948 return ret; 5949 } 5950 5951 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 5952 u64 start, u64 len, 5953 int pin, int delalloc) 5954 { 5955 struct btrfs_block_group_cache *cache; 5956 int ret = 0; 5957 5958 cache = btrfs_lookup_block_group(fs_info, start); 5959 if (!cache) { 5960 btrfs_err(fs_info, "Unable to find block group for %llu", 5961 start); 5962 return -ENOSPC; 5963 } 5964 5965 if (pin) 5966 pin_down_extent(cache, start, len, 1); 5967 else { 5968 if 
(btrfs_test_opt(fs_info, DISCARD)) 5969 ret = btrfs_discard_extent(fs_info, start, len, NULL); 5970 btrfs_add_free_space(cache, start, len); 5971 btrfs_free_reserved_bytes(cache, len, delalloc); 5972 trace_btrfs_reserved_extent_free(fs_info, start, len); 5973 } 5974 5975 btrfs_put_block_group(cache); 5976 return ret; 5977 } 5978 5979 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 5980 u64 start, u64 len, int delalloc) 5981 { 5982 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 5983 } 5984 5985 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 5986 u64 start, u64 len) 5987 { 5988 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 5989 } 5990 5991 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5992 u64 parent, u64 root_objectid, 5993 u64 flags, u64 owner, u64 offset, 5994 struct btrfs_key *ins, int ref_mod) 5995 { 5996 struct btrfs_fs_info *fs_info = trans->fs_info; 5997 int ret; 5998 struct btrfs_extent_item *extent_item; 5999 struct btrfs_extent_inline_ref *iref; 6000 struct btrfs_path *path; 6001 struct extent_buffer *leaf; 6002 int type; 6003 u32 size; 6004 6005 if (parent > 0) 6006 type = BTRFS_SHARED_DATA_REF_KEY; 6007 else 6008 type = BTRFS_EXTENT_DATA_REF_KEY; 6009 6010 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 6011 6012 path = btrfs_alloc_path(); 6013 if (!path) 6014 return -ENOMEM; 6015 6016 path->leave_spinning = 1; 6017 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6018 ins, size); 6019 if (ret) { 6020 btrfs_free_path(path); 6021 return ret; 6022 } 6023 6024 leaf = path->nodes[0]; 6025 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6026 struct btrfs_extent_item); 6027 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 6028 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6029 btrfs_set_extent_flags(leaf, extent_item, 6030 flags | BTRFS_EXTENT_FLAG_DATA); 6031 6032 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6033 btrfs_set_extent_inline_ref_type(leaf, iref, type); 6034 if (parent > 0) { 6035 struct btrfs_shared_data_ref *ref; 6036 ref = (struct btrfs_shared_data_ref *)(iref + 1); 6037 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6038 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 6039 } else { 6040 struct btrfs_extent_data_ref *ref; 6041 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 6042 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 6043 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 6044 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 6045 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 6046 } 6047 6048 btrfs_mark_buffer_dirty(path->nodes[0]); 6049 btrfs_free_path(path); 6050 6051 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); 6052 if (ret) 6053 return ret; 6054 6055 ret = update_block_group(trans, ins->objectid, ins->offset, 1); 6056 if (ret) { /* -ENOENT, logic error */ 6057 btrfs_err(fs_info, "update block group failed for %llu %llu", 6058 ins->objectid, ins->offset); 6059 BUG(); 6060 } 6061 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 6062 return ret; 6063 } 6064 6065 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 6066 struct btrfs_delayed_ref_node *node, 6067 struct btrfs_delayed_extent_op *extent_op) 6068 { 6069 struct btrfs_fs_info *fs_info = trans->fs_info; 6070 int ret; 6071 struct btrfs_extent_item *extent_item; 6072 struct btrfs_key extent_key; 6073 struct 
btrfs_tree_block_info *block_info; 6074 struct btrfs_extent_inline_ref *iref; 6075 struct btrfs_path *path; 6076 struct extent_buffer *leaf; 6077 struct btrfs_delayed_tree_ref *ref; 6078 u32 size = sizeof(*extent_item) + sizeof(*iref); 6079 u64 num_bytes; 6080 u64 flags = extent_op->flags_to_set; 6081 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 6082 6083 ref = btrfs_delayed_node_to_tree_ref(node); 6084 6085 extent_key.objectid = node->bytenr; 6086 if (skinny_metadata) { 6087 extent_key.offset = ref->level; 6088 extent_key.type = BTRFS_METADATA_ITEM_KEY; 6089 num_bytes = fs_info->nodesize; 6090 } else { 6091 extent_key.offset = node->num_bytes; 6092 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 6093 size += sizeof(*block_info); 6094 num_bytes = node->num_bytes; 6095 } 6096 6097 path = btrfs_alloc_path(); 6098 if (!path) 6099 return -ENOMEM; 6100 6101 path->leave_spinning = 1; 6102 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6103 &extent_key, size); 6104 if (ret) { 6105 btrfs_free_path(path); 6106 return ret; 6107 } 6108 6109 leaf = path->nodes[0]; 6110 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6111 struct btrfs_extent_item); 6112 btrfs_set_extent_refs(leaf, extent_item, 1); 6113 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6114 btrfs_set_extent_flags(leaf, extent_item, 6115 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 6116 6117 if (skinny_metadata) { 6118 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6119 } else { 6120 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 6121 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 6122 btrfs_set_tree_block_level(leaf, block_info, ref->level); 6123 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 6124 } 6125 6126 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 6127 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 6128 btrfs_set_extent_inline_ref_type(leaf, iref, 6129 BTRFS_SHARED_BLOCK_REF_KEY); 6130 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); 6131 } else { 6132 btrfs_set_extent_inline_ref_type(leaf, iref, 6133 BTRFS_TREE_BLOCK_REF_KEY); 6134 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); 6135 } 6136 6137 btrfs_mark_buffer_dirty(leaf); 6138 btrfs_free_path(path); 6139 6140 ret = remove_from_free_space_tree(trans, extent_key.objectid, 6141 num_bytes); 6142 if (ret) 6143 return ret; 6144 6145 ret = update_block_group(trans, extent_key.objectid, 6146 fs_info->nodesize, 1); 6147 if (ret) { /* -ENOENT, logic error */ 6148 btrfs_err(fs_info, "update block group failed for %llu %llu", 6149 extent_key.objectid, extent_key.offset); 6150 BUG(); 6151 } 6152 6153 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, 6154 fs_info->nodesize); 6155 return ret; 6156 } 6157 6158 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6159 struct btrfs_root *root, u64 owner, 6160 u64 offset, u64 ram_bytes, 6161 struct btrfs_key *ins) 6162 { 6163 struct btrfs_ref generic_ref = { 0 }; 6164 int ret; 6165 6166 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 6167 6168 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 6169 ins->objectid, ins->offset, 0); 6170 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); 6171 btrfs_ref_tree_mod(root->fs_info, &generic_ref); 6172 ret = btrfs_add_delayed_data_ref(trans, &generic_ref, 6173 ram_bytes, NULL, NULL); 6174 return ret; 6175 } 6176 6177 /* 6178 * this is used by the tree logging recovery code. 
It records that 6179 * an extent has been allocated and makes sure to clear the free 6180 * space cache bits as well 6181 */ 6182 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 6183 u64 root_objectid, u64 owner, u64 offset, 6184 struct btrfs_key *ins) 6185 { 6186 struct btrfs_fs_info *fs_info = trans->fs_info; 6187 int ret; 6188 struct btrfs_block_group_cache *block_group; 6189 struct btrfs_space_info *space_info; 6190 6191 /* 6192 * Mixed block groups will exclude before processing the log so we only 6193 * need to do the exclude dance if this fs isn't mixed. 6194 */ 6195 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 6196 ret = __exclude_logged_extent(fs_info, ins->objectid, 6197 ins->offset); 6198 if (ret) 6199 return ret; 6200 } 6201 6202 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 6203 if (!block_group) 6204 return -EINVAL; 6205 6206 space_info = block_group->space_info; 6207 spin_lock(&space_info->lock); 6208 spin_lock(&block_group->lock); 6209 space_info->bytes_reserved += ins->offset; 6210 block_group->reserved += ins->offset; 6211 spin_unlock(&block_group->lock); 6212 spin_unlock(&space_info->lock); 6213 6214 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 6215 offset, ins, 1); 6216 btrfs_put_block_group(block_group); 6217 return ret; 6218 } 6219 6220 static struct extent_buffer * 6221 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6222 u64 bytenr, int level, u64 owner) 6223 { 6224 struct btrfs_fs_info *fs_info = root->fs_info; 6225 struct extent_buffer *buf; 6226 6227 buf = btrfs_find_create_tree_block(fs_info, bytenr); 6228 if (IS_ERR(buf)) 6229 return buf; 6230 6231 /* 6232 * Extra safety check in case the extent tree is corrupted and extent 6233 * allocator chooses to use a tree block which is already used and 6234 * locked. 6235 */ 6236 if (buf->lock_owner == current->pid) { 6237 btrfs_err_rl(fs_info, 6238 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 6239 buf->start, btrfs_header_owner(buf), current->pid); 6240 free_extent_buffer(buf); 6241 return ERR_PTR(-EUCLEAN); 6242 } 6243 6244 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 6245 btrfs_tree_lock(buf); 6246 btrfs_clean_tree_block(buf); 6247 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 6248 6249 btrfs_set_lock_blocking_write(buf); 6250 set_extent_buffer_uptodate(buf); 6251 6252 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 6253 btrfs_set_header_level(buf, level); 6254 btrfs_set_header_bytenr(buf, buf->start); 6255 btrfs_set_header_generation(buf, trans->transid); 6256 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 6257 btrfs_set_header_owner(buf, owner); 6258 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 6259 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 6260 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 6261 buf->log_index = root->log_transid % 2; 6262 /* 6263 * we allow two log transactions at a time, use different 6264 * EXTENT bit to differentiate dirty pages. 
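		 *
		 * A sketch of the mapping implemented just below (this only
		 * mirrors the code, it is not an additional rule):
		 *
		 *	log_transid even -> log_index 0 -> EXTENT_DIRTY bit in
		 *			    root->dirty_log_pages
		 *	log_transid odd  -> log_index 1 -> EXTENT_NEW bit in
		 *			    root->dirty_log_pages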
6265 */ 6266 if (buf->log_index == 0) 6267 set_extent_dirty(&root->dirty_log_pages, buf->start, 6268 buf->start + buf->len - 1, GFP_NOFS); 6269 else 6270 set_extent_new(&root->dirty_log_pages, buf->start, 6271 buf->start + buf->len - 1); 6272 } else { 6273 buf->log_index = -1; 6274 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 6275 buf->start + buf->len - 1, GFP_NOFS); 6276 } 6277 trans->dirty = true; 6278 /* this returns a buffer locked for blocking */ 6279 return buf; 6280 } 6281 6282 /* 6283 * finds a free extent and does all the dirty work required for allocation 6284 * returns the tree buffer or an ERR_PTR on error. 6285 */ 6286 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 6287 struct btrfs_root *root, 6288 u64 parent, u64 root_objectid, 6289 const struct btrfs_disk_key *key, 6290 int level, u64 hint, 6291 u64 empty_size) 6292 { 6293 struct btrfs_fs_info *fs_info = root->fs_info; 6294 struct btrfs_key ins; 6295 struct btrfs_block_rsv *block_rsv; 6296 struct extent_buffer *buf; 6297 struct btrfs_delayed_extent_op *extent_op; 6298 struct btrfs_ref generic_ref = { 0 }; 6299 u64 flags = 0; 6300 int ret; 6301 u32 blocksize = fs_info->nodesize; 6302 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 6303 6304 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6305 if (btrfs_is_testing(fs_info)) { 6306 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 6307 level, root_objectid); 6308 if (!IS_ERR(buf)) 6309 root->alloc_bytenr += blocksize; 6310 return buf; 6311 } 6312 #endif 6313 6314 block_rsv = btrfs_use_block_rsv(trans, root, blocksize); 6315 if (IS_ERR(block_rsv)) 6316 return ERR_CAST(block_rsv); 6317 6318 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 6319 empty_size, hint, &ins, 0, 0); 6320 if (ret) 6321 goto out_unuse; 6322 6323 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 6324 root_objectid); 6325 if (IS_ERR(buf)) { 6326 ret = PTR_ERR(buf); 6327 goto out_free_reserved; 6328 } 6329 6330 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 6331 if (parent == 0) 6332 parent = ins.objectid; 6333 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 6334 } else 6335 BUG_ON(parent > 0); 6336 6337 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6338 extent_op = btrfs_alloc_delayed_extent_op(); 6339 if (!extent_op) { 6340 ret = -ENOMEM; 6341 goto out_free_buf; 6342 } 6343 if (key) 6344 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6345 else 6346 memset(&extent_op->key, 0, sizeof(extent_op->key)); 6347 extent_op->flags_to_set = flags; 6348 extent_op->update_key = skinny_metadata ? 
false : true; 6349 extent_op->update_flags = true; 6350 extent_op->is_data = false; 6351 extent_op->level = level; 6352 6353 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 6354 ins.objectid, ins.offset, parent); 6355 generic_ref.real_root = root->root_key.objectid; 6356 btrfs_init_tree_ref(&generic_ref, level, root_objectid); 6357 btrfs_ref_tree_mod(fs_info, &generic_ref); 6358 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, 6359 extent_op, NULL, NULL); 6360 if (ret) 6361 goto out_free_delayed; 6362 } 6363 return buf; 6364 6365 out_free_delayed: 6366 btrfs_free_delayed_extent_op(extent_op); 6367 out_free_buf: 6368 free_extent_buffer(buf); 6369 out_free_reserved: 6370 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 6371 out_unuse: 6372 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); 6373 return ERR_PTR(ret); 6374 } 6375 6376 struct walk_control { 6377 u64 refs[BTRFS_MAX_LEVEL]; 6378 u64 flags[BTRFS_MAX_LEVEL]; 6379 struct btrfs_key update_progress; 6380 struct btrfs_key drop_progress; 6381 int drop_level; 6382 int stage; 6383 int level; 6384 int shared_level; 6385 int update_ref; 6386 int keep_locks; 6387 int reada_slot; 6388 int reada_count; 6389 int restarted; 6390 }; 6391 6392 #define DROP_REFERENCE 1 6393 #define UPDATE_BACKREF 2 6394 6395 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 6396 struct btrfs_root *root, 6397 struct walk_control *wc, 6398 struct btrfs_path *path) 6399 { 6400 struct btrfs_fs_info *fs_info = root->fs_info; 6401 u64 bytenr; 6402 u64 generation; 6403 u64 refs; 6404 u64 flags; 6405 u32 nritems; 6406 struct btrfs_key key; 6407 struct extent_buffer *eb; 6408 int ret; 6409 int slot; 6410 int nread = 0; 6411 6412 if (path->slots[wc->level] < wc->reada_slot) { 6413 wc->reada_count = wc->reada_count * 2 / 3; 6414 wc->reada_count = max(wc->reada_count, 2); 6415 } else { 6416 wc->reada_count = wc->reada_count * 3 / 2; 6417 wc->reada_count = min_t(int, wc->reada_count, 6418 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 6419 } 6420 6421 eb = path->nodes[wc->level]; 6422 nritems = btrfs_header_nritems(eb); 6423 6424 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 6425 if (nread >= wc->reada_count) 6426 break; 6427 6428 cond_resched(); 6429 bytenr = btrfs_node_blockptr(eb, slot); 6430 generation = btrfs_node_ptr_generation(eb, slot); 6431 6432 if (slot == path->slots[wc->level]) 6433 goto reada; 6434 6435 if (wc->stage == UPDATE_BACKREF && 6436 generation <= root->root_key.offset) 6437 continue; 6438 6439 /* We don't lock the tree block, it's OK to be racy here */ 6440 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 6441 wc->level - 1, 1, &refs, 6442 &flags); 6443 /* We don't care about errors in readahead. */ 6444 if (ret < 0) 6445 continue; 6446 BUG_ON(refs == 0); 6447 6448 if (wc->stage == DROP_REFERENCE) { 6449 if (refs == 1) 6450 goto reada; 6451 6452 if (wc->level == 1 && 6453 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6454 continue; 6455 if (!wc->update_ref || 6456 generation <= root->root_key.offset) 6457 continue; 6458 btrfs_node_key_to_cpu(eb, &key, slot); 6459 ret = btrfs_comp_cpu_keys(&key, 6460 &wc->update_progress); 6461 if (ret < 0) 6462 continue; 6463 } else { 6464 if (wc->level == 1 && 6465 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6466 continue; 6467 } 6468 reada: 6469 readahead_tree_block(fs_info, bytenr); 6470 nread++; 6471 } 6472 wc->reada_slot = slot; 6473 } 6474 6475 /* 6476 * helper to process tree block while walking down the tree. 
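 *
 * This is one half of the drop/update state machine driven by
 * walk_down_tree() and walk_up_tree() below.  A minimal sketch of that
 * driver loop (see btrfs_drop_snapshot() for the real version, which also
 * handles transactions and drop_progress):
 *
 *	while (1) {
 *		ret = walk_down_tree(trans, root, path, wc);
 *		if (ret < 0)
 *			break;
 *		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
 *		if (ret != 0)
 *			break;
 *	}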
6477 * 6478 * when wc->stage == UPDATE_BACKREF, this function updates 6479 * back refs for pointers in the block. 6480 * 6481 * NOTE: return value 1 means we should stop walking down. 6482 */ 6483 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 6484 struct btrfs_root *root, 6485 struct btrfs_path *path, 6486 struct walk_control *wc, int lookup_info) 6487 { 6488 struct btrfs_fs_info *fs_info = root->fs_info; 6489 int level = wc->level; 6490 struct extent_buffer *eb = path->nodes[level]; 6491 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6492 int ret; 6493 6494 if (wc->stage == UPDATE_BACKREF && 6495 btrfs_header_owner(eb) != root->root_key.objectid) 6496 return 1; 6497 6498 /* 6499 * when reference count of tree block is 1, it won't increase 6500 * again. once full backref flag is set, we never clear it. 6501 */ 6502 if (lookup_info && 6503 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 6504 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 6505 BUG_ON(!path->locks[level]); 6506 ret = btrfs_lookup_extent_info(trans, fs_info, 6507 eb->start, level, 1, 6508 &wc->refs[level], 6509 &wc->flags[level]); 6510 BUG_ON(ret == -ENOMEM); 6511 if (ret) 6512 return ret; 6513 BUG_ON(wc->refs[level] == 0); 6514 } 6515 6516 if (wc->stage == DROP_REFERENCE) { 6517 if (wc->refs[level] > 1) 6518 return 1; 6519 6520 if (path->locks[level] && !wc->keep_locks) { 6521 btrfs_tree_unlock_rw(eb, path->locks[level]); 6522 path->locks[level] = 0; 6523 } 6524 return 0; 6525 } 6526 6527 /* wc->stage == UPDATE_BACKREF */ 6528 if (!(wc->flags[level] & flag)) { 6529 BUG_ON(!path->locks[level]); 6530 ret = btrfs_inc_ref(trans, root, eb, 1); 6531 BUG_ON(ret); /* -ENOMEM */ 6532 ret = btrfs_dec_ref(trans, root, eb, 0); 6533 BUG_ON(ret); /* -ENOMEM */ 6534 ret = btrfs_set_disk_extent_flags(trans, eb->start, 6535 eb->len, flag, 6536 btrfs_header_level(eb), 0); 6537 BUG_ON(ret); /* -ENOMEM */ 6538 wc->flags[level] |= flag; 6539 } 6540 6541 /* 6542 * the block is shared by multiple trees, so it's not good to 6543 * keep the tree lock 6544 */ 6545 if (path->locks[level] && level > 0) { 6546 btrfs_tree_unlock_rw(eb, path->locks[level]); 6547 path->locks[level] = 0; 6548 } 6549 return 0; 6550 } 6551 6552 /* 6553 * This is used to verify a ref exists for this root to deal with a bug where we 6554 * would have a drop_progress key that hadn't been updated properly. 6555 */ 6556 static int check_ref_exists(struct btrfs_trans_handle *trans, 6557 struct btrfs_root *root, u64 bytenr, u64 parent, 6558 int level) 6559 { 6560 struct btrfs_path *path; 6561 struct btrfs_extent_inline_ref *iref; 6562 int ret; 6563 6564 path = btrfs_alloc_path(); 6565 if (!path) 6566 return -ENOMEM; 6567 6568 ret = lookup_extent_backref(trans, path, &iref, bytenr, 6569 root->fs_info->nodesize, parent, 6570 root->root_key.objectid, level, 0); 6571 btrfs_free_path(path); 6572 if (ret == -ENOENT) 6573 return 0; 6574 if (ret < 0) 6575 return ret; 6576 return 1; 6577 } 6578 6579 /* 6580 * helper to process tree block pointer. 6581 * 6582 * when wc->stage == DROP_REFERENCE, this function checks 6583 * reference count of the block pointed to. if the block 6584 * is shared and we need update back refs for the subtree 6585 * rooted at the block, this function changes wc->stage to 6586 * UPDATE_BACKREF. if the block is shared and there is no 6587 * need to update back, this function drops the reference 6588 * to the block. 6589 * 6590 * NOTE: return value 1 means we should stop walking down. 
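 *
 * A compressed sketch of the DROP_REFERENCE decision taken here for the
 * child block (illustrative; the authoritative checks, including the
 * full-backref and generation tests, are in the function body):
 *
 *	if (refs on child > 1) {
 *		if (wc->update_ref && child key >= wc->update_progress)
 *			switch to UPDATE_BACKREF for this subtree;
 *		else
 *			drop our one ref and skip the subtree;
 *	} else {
 *		descend and keep freeing blocks;
 *	}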
6591 */ 6592 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 6593 struct btrfs_root *root, 6594 struct btrfs_path *path, 6595 struct walk_control *wc, int *lookup_info) 6596 { 6597 struct btrfs_fs_info *fs_info = root->fs_info; 6598 u64 bytenr; 6599 u64 generation; 6600 u64 parent; 6601 struct btrfs_key key; 6602 struct btrfs_key first_key; 6603 struct btrfs_ref ref = { 0 }; 6604 struct extent_buffer *next; 6605 int level = wc->level; 6606 int reada = 0; 6607 int ret = 0; 6608 bool need_account = false; 6609 6610 generation = btrfs_node_ptr_generation(path->nodes[level], 6611 path->slots[level]); 6612 /* 6613 * if the lower level block was created before the snapshot 6614 * was created, we know there is no need to update back refs 6615 * for the subtree 6616 */ 6617 if (wc->stage == UPDATE_BACKREF && 6618 generation <= root->root_key.offset) { 6619 *lookup_info = 1; 6620 return 1; 6621 } 6622 6623 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 6624 btrfs_node_key_to_cpu(path->nodes[level], &first_key, 6625 path->slots[level]); 6626 6627 next = find_extent_buffer(fs_info, bytenr); 6628 if (!next) { 6629 next = btrfs_find_create_tree_block(fs_info, bytenr); 6630 if (IS_ERR(next)) 6631 return PTR_ERR(next); 6632 6633 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 6634 level - 1); 6635 reada = 1; 6636 } 6637 btrfs_tree_lock(next); 6638 btrfs_set_lock_blocking_write(next); 6639 6640 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 6641 &wc->refs[level - 1], 6642 &wc->flags[level - 1]); 6643 if (ret < 0) 6644 goto out_unlock; 6645 6646 if (unlikely(wc->refs[level - 1] == 0)) { 6647 btrfs_err(fs_info, "Missing references."); 6648 ret = -EIO; 6649 goto out_unlock; 6650 } 6651 *lookup_info = 0; 6652 6653 if (wc->stage == DROP_REFERENCE) { 6654 if (wc->refs[level - 1] > 1) { 6655 need_account = true; 6656 if (level == 1 && 6657 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6658 goto skip; 6659 6660 if (!wc->update_ref || 6661 generation <= root->root_key.offset) 6662 goto skip; 6663 6664 btrfs_node_key_to_cpu(path->nodes[level], &key, 6665 path->slots[level]); 6666 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 6667 if (ret < 0) 6668 goto skip; 6669 6670 wc->stage = UPDATE_BACKREF; 6671 wc->shared_level = level - 1; 6672 } 6673 } else { 6674 if (level == 1 && 6675 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 6676 goto skip; 6677 } 6678 6679 if (!btrfs_buffer_uptodate(next, generation, 0)) { 6680 btrfs_tree_unlock(next); 6681 free_extent_buffer(next); 6682 next = NULL; 6683 *lookup_info = 1; 6684 } 6685 6686 if (!next) { 6687 if (reada && level == 1) 6688 reada_walk_down(trans, root, wc, path); 6689 next = read_tree_block(fs_info, bytenr, generation, level - 1, 6690 &first_key); 6691 if (IS_ERR(next)) { 6692 return PTR_ERR(next); 6693 } else if (!extent_buffer_uptodate(next)) { 6694 free_extent_buffer(next); 6695 return -EIO; 6696 } 6697 btrfs_tree_lock(next); 6698 btrfs_set_lock_blocking_write(next); 6699 } 6700 6701 level--; 6702 ASSERT(level == btrfs_header_level(next)); 6703 if (level != btrfs_header_level(next)) { 6704 btrfs_err(root->fs_info, "mismatched level"); 6705 ret = -EIO; 6706 goto out_unlock; 6707 } 6708 path->nodes[level] = next; 6709 path->slots[level] = 0; 6710 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6711 wc->level = level; 6712 if (wc->level == 1) 6713 wc->reada_slot = 0; 6714 return 0; 6715 skip: 6716 wc->refs[level - 1] = 0; 6717 wc->flags[level - 1] = 0; 6718 if (wc->stage == 
DROP_REFERENCE) { 6719 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 6720 parent = path->nodes[level]->start; 6721 } else { 6722 ASSERT(root->root_key.objectid == 6723 btrfs_header_owner(path->nodes[level])); 6724 if (root->root_key.objectid != 6725 btrfs_header_owner(path->nodes[level])) { 6726 btrfs_err(root->fs_info, 6727 "mismatched block owner"); 6728 ret = -EIO; 6729 goto out_unlock; 6730 } 6731 parent = 0; 6732 } 6733 6734 /* 6735 * If we had a drop_progress we need to verify the refs are set 6736 * as expected. If we find our ref then we know that from here 6737 * on out everything should be correct, and we can clear the 6738 * ->restarted flag. 6739 */ 6740 if (wc->restarted) { 6741 ret = check_ref_exists(trans, root, bytenr, parent, 6742 level - 1); 6743 if (ret < 0) 6744 goto out_unlock; 6745 if (ret == 0) 6746 goto no_delete; 6747 ret = 0; 6748 wc->restarted = 0; 6749 } 6750 6751 /* 6752 * Reloc tree doesn't contribute to qgroup numbers, and we have 6753 * already accounted them at merge time (replace_path), 6754 * thus we could skip expensive subtree trace here. 6755 */ 6756 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 6757 need_account) { 6758 ret = btrfs_qgroup_trace_subtree(trans, next, 6759 generation, level - 1); 6760 if (ret) { 6761 btrfs_err_rl(fs_info, 6762 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 6763 ret); 6764 } 6765 } 6766 6767 /* 6768 * We need to update the next key in our walk control so we can 6769 * update the drop_progress key accordingly. We don't care if 6770 * find_next_key doesn't find a key because that means we're at 6771 * the end and are going to clean up now. 6772 */ 6773 wc->drop_level = level; 6774 find_next_key(path, level, &wc->drop_progress); 6775 6776 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 6777 fs_info->nodesize, parent); 6778 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); 6779 ret = btrfs_free_extent(trans, &ref); 6780 if (ret) 6781 goto out_unlock; 6782 } 6783 no_delete: 6784 *lookup_info = 1; 6785 ret = 1; 6786 6787 out_unlock: 6788 btrfs_tree_unlock(next); 6789 free_extent_buffer(next); 6790 6791 return ret; 6792 } 6793 6794 /* 6795 * helper to process tree block while walking up the tree. 6796 * 6797 * when wc->stage == DROP_REFERENCE, this function drops 6798 * reference count on the block. 6799 * 6800 * when wc->stage == UPDATE_BACKREF, this function changes 6801 * wc->stage back to DROP_REFERENCE if we changed wc->stage 6802 * to UPDATE_BACKREF previously while processing the block. 6803 * 6804 * NOTE: return value 1 means we should stop walking up. 6805 */ 6806 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 6807 struct btrfs_root *root, 6808 struct btrfs_path *path, 6809 struct walk_control *wc) 6810 { 6811 struct btrfs_fs_info *fs_info = root->fs_info; 6812 int ret; 6813 int level = wc->level; 6814 struct extent_buffer *eb = path->nodes[level]; 6815 u64 parent = 0; 6816 6817 if (wc->stage == UPDATE_BACKREF) { 6818 BUG_ON(wc->shared_level < level); 6819 if (level < wc->shared_level) 6820 goto out; 6821 6822 ret = find_next_key(path, level + 1, &wc->update_progress); 6823 if (ret > 0) 6824 wc->update_ref = 0; 6825 6826 wc->stage = DROP_REFERENCE; 6827 wc->shared_level = -1; 6828 path->slots[level] = 0; 6829 6830 /* 6831 * check reference count again if the block isn't locked. 6832 * we should start walking down the tree again if reference 6833 * count is one. 
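		 *
		 * Sketch of the outcome of the re-check below (illustrative):
		 *
		 *	refs == 1 -> return 1; the caller walks back down and
		 *		     frees the now exclusively owned subtree
		 *	refs  > 1 -> keep the lock and fall through to the
		 *		     normal DROP_REFERENCE handling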
6834 */ 6835 if (!path->locks[level]) { 6836 BUG_ON(level == 0); 6837 btrfs_tree_lock(eb); 6838 btrfs_set_lock_blocking_write(eb); 6839 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6840 6841 ret = btrfs_lookup_extent_info(trans, fs_info, 6842 eb->start, level, 1, 6843 &wc->refs[level], 6844 &wc->flags[level]); 6845 if (ret < 0) { 6846 btrfs_tree_unlock_rw(eb, path->locks[level]); 6847 path->locks[level] = 0; 6848 return ret; 6849 } 6850 BUG_ON(wc->refs[level] == 0); 6851 if (wc->refs[level] == 1) { 6852 btrfs_tree_unlock_rw(eb, path->locks[level]); 6853 path->locks[level] = 0; 6854 return 1; 6855 } 6856 } 6857 } 6858 6859 /* wc->stage == DROP_REFERENCE */ 6860 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 6861 6862 if (wc->refs[level] == 1) { 6863 if (level == 0) { 6864 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6865 ret = btrfs_dec_ref(trans, root, eb, 1); 6866 else 6867 ret = btrfs_dec_ref(trans, root, eb, 0); 6868 BUG_ON(ret); /* -ENOMEM */ 6869 if (is_fstree(root->root_key.objectid)) { 6870 ret = btrfs_qgroup_trace_leaf_items(trans, eb); 6871 if (ret) { 6872 btrfs_err_rl(fs_info, 6873 "error %d accounting leaf items, quota is out of sync, rescan required", 6874 ret); 6875 } 6876 } 6877 } 6878 /* make block locked assertion in btrfs_clean_tree_block happy */ 6879 if (!path->locks[level] && 6880 btrfs_header_generation(eb) == trans->transid) { 6881 btrfs_tree_lock(eb); 6882 btrfs_set_lock_blocking_write(eb); 6883 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 6884 } 6885 btrfs_clean_tree_block(eb); 6886 } 6887 6888 if (eb == root->node) { 6889 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6890 parent = eb->start; 6891 else if (root->root_key.objectid != btrfs_header_owner(eb)) 6892 goto owner_mismatch; 6893 } else { 6894 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6895 parent = path->nodes[level + 1]->start; 6896 else if (root->root_key.objectid != 6897 btrfs_header_owner(path->nodes[level + 1])) 6898 goto owner_mismatch; 6899 } 6900 6901 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 6902 out: 6903 wc->refs[level] = 0; 6904 wc->flags[level] = 0; 6905 return 0; 6906 6907 owner_mismatch: 6908 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", 6909 btrfs_header_owner(eb), root->root_key.objectid); 6910 return -EUCLEAN; 6911 } 6912 6913 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 6914 struct btrfs_root *root, 6915 struct btrfs_path *path, 6916 struct walk_control *wc) 6917 { 6918 int level = wc->level; 6919 int lookup_info = 1; 6920 int ret; 6921 6922 while (level >= 0) { 6923 ret = walk_down_proc(trans, root, path, wc, lookup_info); 6924 if (ret > 0) 6925 break; 6926 6927 if (level == 0) 6928 break; 6929 6930 if (path->slots[level] >= 6931 btrfs_header_nritems(path->nodes[level])) 6932 break; 6933 6934 ret = do_walk_down(trans, root, path, wc, &lookup_info); 6935 if (ret > 0) { 6936 path->slots[level]++; 6937 continue; 6938 } else if (ret < 0) 6939 return ret; 6940 level = wc->level; 6941 } 6942 return 0; 6943 } 6944 6945 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 6946 struct btrfs_root *root, 6947 struct btrfs_path *path, 6948 struct walk_control *wc, int max_level) 6949 { 6950 int level = wc->level; 6951 int ret; 6952 6953 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 6954 while (level < max_level && path->nodes[level]) { 6955 wc->level = level; 6956 if (path->slots[level] + 1 < 6957 btrfs_header_nritems(path->nodes[level])) { 6958 
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root,
			struct btrfs_block_rsv *block_rsv, int update_ref,
			int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	if (block_rsv)
		trans->block_rsv = block_rsv;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it. It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change. If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
7044 */ 7045 set_bit(BTRFS_ROOT_DELETING, &root->state); 7046 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 7047 level = btrfs_header_level(root->node); 7048 path->nodes[level] = btrfs_lock_root_node(root); 7049 btrfs_set_lock_blocking_write(path->nodes[level]); 7050 path->slots[level] = 0; 7051 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7052 memset(&wc->update_progress, 0, 7053 sizeof(wc->update_progress)); 7054 } else { 7055 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 7056 memcpy(&wc->update_progress, &key, 7057 sizeof(wc->update_progress)); 7058 7059 level = root_item->drop_level; 7060 BUG_ON(level == 0); 7061 path->lowest_level = level; 7062 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7063 path->lowest_level = 0; 7064 if (ret < 0) { 7065 err = ret; 7066 goto out_end_trans; 7067 } 7068 WARN_ON(ret > 0); 7069 7070 /* 7071 * unlock our path, this is safe because only this 7072 * function is allowed to delete this snapshot 7073 */ 7074 btrfs_unlock_up_safe(path, 0); 7075 7076 level = btrfs_header_level(root->node); 7077 while (1) { 7078 btrfs_tree_lock(path->nodes[level]); 7079 btrfs_set_lock_blocking_write(path->nodes[level]); 7080 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7081 7082 ret = btrfs_lookup_extent_info(trans, fs_info, 7083 path->nodes[level]->start, 7084 level, 1, &wc->refs[level], 7085 &wc->flags[level]); 7086 if (ret < 0) { 7087 err = ret; 7088 goto out_end_trans; 7089 } 7090 BUG_ON(wc->refs[level] == 0); 7091 7092 if (level == root_item->drop_level) 7093 break; 7094 7095 btrfs_tree_unlock(path->nodes[level]); 7096 path->locks[level] = 0; 7097 WARN_ON(wc->refs[level] != 1); 7098 level--; 7099 } 7100 } 7101 7102 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); 7103 wc->level = level; 7104 wc->shared_level = -1; 7105 wc->stage = DROP_REFERENCE; 7106 wc->update_ref = update_ref; 7107 wc->keep_locks = 0; 7108 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 7109 7110 while (1) { 7111 7112 ret = walk_down_tree(trans, root, path, wc); 7113 if (ret < 0) { 7114 err = ret; 7115 break; 7116 } 7117 7118 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 7119 if (ret < 0) { 7120 err = ret; 7121 break; 7122 } 7123 7124 if (ret > 0) { 7125 BUG_ON(wc->stage != DROP_REFERENCE); 7126 break; 7127 } 7128 7129 if (wc->stage == DROP_REFERENCE) { 7130 wc->drop_level = wc->level; 7131 btrfs_node_key_to_cpu(path->nodes[wc->drop_level], 7132 &wc->drop_progress, 7133 path->slots[wc->drop_level]); 7134 } 7135 btrfs_cpu_key_to_disk(&root_item->drop_progress, 7136 &wc->drop_progress); 7137 root_item->drop_level = wc->drop_level; 7138 7139 BUG_ON(wc->level == 0); 7140 if (btrfs_should_end_transaction(trans) || 7141 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 7142 ret = btrfs_update_root(trans, tree_root, 7143 &root->root_key, 7144 root_item); 7145 if (ret) { 7146 btrfs_abort_transaction(trans, ret); 7147 err = ret; 7148 goto out_end_trans; 7149 } 7150 7151 btrfs_end_transaction_throttle(trans); 7152 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 7153 btrfs_debug(fs_info, 7154 "drop snapshot early exit"); 7155 err = -EAGAIN; 7156 goto out_free; 7157 } 7158 7159 trans = btrfs_start_transaction(tree_root, 0); 7160 if (IS_ERR(trans)) { 7161 err = PTR_ERR(trans); 7162 goto out_free; 7163 } 7164 if (block_rsv) 7165 trans->block_rsv = block_rsv; 7166 } 7167 } 7168 btrfs_release_path(path); 7169 if (err) 7170 goto out_end_trans; 7171 7172 ret = btrfs_del_root(trans, &root->root_key); 7173 if (ret) { 7174 
btrfs_abort_transaction(trans, ret); 7175 err = ret; 7176 goto out_end_trans; 7177 } 7178 7179 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7180 ret = btrfs_find_root(tree_root, &root->root_key, path, 7181 NULL, NULL); 7182 if (ret < 0) { 7183 btrfs_abort_transaction(trans, ret); 7184 err = ret; 7185 goto out_end_trans; 7186 } else if (ret > 0) { 7187 /* if we fail to delete the orphan item this time 7188 * around, it'll get picked up the next time. 7189 * 7190 * The most common failure here is just -ENOENT. 7191 */ 7192 btrfs_del_orphan_item(trans, tree_root, 7193 root->root_key.objectid); 7194 } 7195 } 7196 7197 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 7198 btrfs_add_dropped_root(trans, root); 7199 } else { 7200 free_extent_buffer(root->node); 7201 free_extent_buffer(root->commit_root); 7202 btrfs_put_fs_root(root); 7203 } 7204 root_dropped = true; 7205 out_end_trans: 7206 btrfs_end_transaction_throttle(trans); 7207 out_free: 7208 kfree(wc); 7209 btrfs_free_path(path); 7210 out: 7211 /* 7212 * So if we need to stop dropping the snapshot for whatever reason we 7213 * need to make sure to add it back to the dead root list so that we 7214 * keep trying to do the work later. This also cleans up roots if we 7215 * don't have it in the radix (like when we recover after a power fail 7216 * or unmount) so we don't leak memory. 7217 */ 7218 if (!for_reloc && !root_dropped) 7219 btrfs_add_dead_root(root); 7220 if (err && err != -EAGAIN) 7221 btrfs_handle_fs_error(fs_info, err, NULL); 7222 return err; 7223 } 7224 7225 /* 7226 * drop subtree rooted at tree block 'node'. 7227 * 7228 * NOTE: this function will unlock and release tree block 'node' 7229 * only used by relocation code 7230 */ 7231 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 7232 struct btrfs_root *root, 7233 struct extent_buffer *node, 7234 struct extent_buffer *parent) 7235 { 7236 struct btrfs_fs_info *fs_info = root->fs_info; 7237 struct btrfs_path *path; 7238 struct walk_control *wc; 7239 int level; 7240 int parent_level; 7241 int ret = 0; 7242 int wret; 7243 7244 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 7245 7246 path = btrfs_alloc_path(); 7247 if (!path) 7248 return -ENOMEM; 7249 7250 wc = kzalloc(sizeof(*wc), GFP_NOFS); 7251 if (!wc) { 7252 btrfs_free_path(path); 7253 return -ENOMEM; 7254 } 7255 7256 btrfs_assert_tree_locked(parent); 7257 parent_level = btrfs_header_level(parent); 7258 extent_buffer_get(parent); 7259 path->nodes[parent_level] = parent; 7260 path->slots[parent_level] = btrfs_header_nritems(parent); 7261 7262 btrfs_assert_tree_locked(node); 7263 level = btrfs_header_level(node); 7264 path->nodes[level] = node; 7265 path->slots[level] = 0; 7266 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7267 7268 wc->refs[parent_level] = 1; 7269 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7270 wc->level = level; 7271 wc->shared_level = -1; 7272 wc->stage = DROP_REFERENCE; 7273 wc->update_ref = 0; 7274 wc->keep_locks = 1; 7275 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 7276 7277 while (1) { 7278 wret = walk_down_tree(trans, root, path, wc); 7279 if (wret < 0) { 7280 ret = wret; 7281 break; 7282 } 7283 7284 wret = walk_up_tree(trans, root, path, wc, parent_level); 7285 if (wret < 0) 7286 ret = wret; 7287 if (wret != 0) 7288 break; 7289 } 7290 7291 kfree(wc); 7292 btrfs_free_path(path); 7293 return ret; 7294 } 7295 7296 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 7297 { 7298 u64 num_devices; 7299 u64 stripped; 7300 
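	/*
	 * Worked example of the mapping computed below (values are
	 * illustrative only), for a filesystem that is down to a single
	 * rw device:
	 *
	 *	RAID1 | METADATA  ->  DUP | METADATA
	 *	RAID0 | DATA      ->  DATA (single, no profile bit set)
	 */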
7301 /* 7302 * if restripe for this chunk_type is on pick target profile and 7303 * return, otherwise do the usual balance 7304 */ 7305 stripped = get_restripe_target(fs_info, flags); 7306 if (stripped) 7307 return extended_to_chunk(stripped); 7308 7309 num_devices = fs_info->fs_devices->rw_devices; 7310 7311 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | 7312 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; 7313 7314 if (num_devices == 1) { 7315 stripped |= BTRFS_BLOCK_GROUP_DUP; 7316 stripped = flags & ~stripped; 7317 7318 /* turn raid0 into single device chunks */ 7319 if (flags & BTRFS_BLOCK_GROUP_RAID0) 7320 return stripped; 7321 7322 /* turn mirroring into duplication */ 7323 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | 7324 BTRFS_BLOCK_GROUP_RAID10)) 7325 return stripped | BTRFS_BLOCK_GROUP_DUP; 7326 } else { 7327 /* they already had raid on here, just return */ 7328 if (flags & stripped) 7329 return flags; 7330 7331 stripped |= BTRFS_BLOCK_GROUP_DUP; 7332 stripped = flags & ~stripped; 7333 7334 /* switch duplicated blocks with raid1 */ 7335 if (flags & BTRFS_BLOCK_GROUP_DUP) 7336 return stripped | BTRFS_BLOCK_GROUP_RAID1; 7337 7338 /* this is drive concat, leave it alone */ 7339 } 7340 7341 return flags; 7342 } 7343 7344 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 7345 { 7346 struct btrfs_space_info *sinfo = cache->space_info; 7347 u64 num_bytes; 7348 u64 sinfo_used; 7349 u64 min_allocable_bytes; 7350 int ret = -ENOSPC; 7351 7352 /* 7353 * We need some metadata space and system metadata space for 7354 * allocating chunks in some corner cases until we force to set 7355 * it to be readonly. 7356 */ 7357 if ((sinfo->flags & 7358 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 7359 !force) 7360 min_allocable_bytes = SZ_1M; 7361 else 7362 min_allocable_bytes = 0; 7363 7364 spin_lock(&sinfo->lock); 7365 spin_lock(&cache->lock); 7366 7367 if (cache->ro) { 7368 cache->ro++; 7369 ret = 0; 7370 goto out; 7371 } 7372 7373 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 7374 cache->bytes_super - btrfs_block_group_used(&cache->item); 7375 sinfo_used = btrfs_space_info_used(sinfo, true); 7376 7377 if (sinfo_used + num_bytes + min_allocable_bytes <= 7378 sinfo->total_bytes) { 7379 sinfo->bytes_readonly += num_bytes; 7380 cache->ro++; 7381 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 7382 ret = 0; 7383 } 7384 out: 7385 spin_unlock(&cache->lock); 7386 spin_unlock(&sinfo->lock); 7387 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { 7388 btrfs_info(cache->fs_info, 7389 "unable to make block group %llu ro", 7390 cache->key.objectid); 7391 btrfs_info(cache->fs_info, 7392 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", 7393 sinfo_used, num_bytes, min_allocable_bytes); 7394 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); 7395 } 7396 return ret; 7397 } 7398 7399 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) 7400 7401 { 7402 struct btrfs_fs_info *fs_info = cache->fs_info; 7403 struct btrfs_trans_handle *trans; 7404 u64 alloc_flags; 7405 int ret; 7406 7407 again: 7408 trans = btrfs_join_transaction(fs_info->extent_root); 7409 if (IS_ERR(trans)) 7410 return PTR_ERR(trans); 7411 7412 /* 7413 * we're not allowed to set block groups readonly after the dirty 7414 * block groups cache has started writing. 
If it already started, 7415 * back off and let this transaction commit 7416 */ 7417 mutex_lock(&fs_info->ro_block_group_mutex); 7418 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 7419 u64 transid = trans->transid; 7420 7421 mutex_unlock(&fs_info->ro_block_group_mutex); 7422 btrfs_end_transaction(trans); 7423 7424 ret = btrfs_wait_for_commit(fs_info, transid); 7425 if (ret) 7426 return ret; 7427 goto again; 7428 } 7429 7430 /* 7431 * if we are changing raid levels, try to allocate a corresponding 7432 * block group with the new raid level. 7433 */ 7434 alloc_flags = update_block_group_flags(fs_info, cache->flags); 7435 if (alloc_flags != cache->flags) { 7436 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7437 /* 7438 * ENOSPC is allowed here, we may have enough space 7439 * already allocated at the new raid level to 7440 * carry on 7441 */ 7442 if (ret == -ENOSPC) 7443 ret = 0; 7444 if (ret < 0) 7445 goto out; 7446 } 7447 7448 ret = inc_block_group_ro(cache, 0); 7449 if (!ret) 7450 goto out; 7451 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 7452 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7453 if (ret < 0) 7454 goto out; 7455 ret = inc_block_group_ro(cache, 0); 7456 out: 7457 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 7458 alloc_flags = update_block_group_flags(fs_info, cache->flags); 7459 mutex_lock(&fs_info->chunk_mutex); 7460 check_system_chunk(trans, alloc_flags); 7461 mutex_unlock(&fs_info->chunk_mutex); 7462 } 7463 mutex_unlock(&fs_info->ro_block_group_mutex); 7464 7465 btrfs_end_transaction(trans); 7466 return ret; 7467 } 7468 7469 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 7470 { 7471 u64 alloc_flags = get_alloc_profile(trans->fs_info, type); 7472 7473 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7474 } 7475 7476 /* 7477 * helper to account the unused space of all the readonly block group in the 7478 * space_info. takes mirrors into account. 7479 */ 7480 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 7481 { 7482 struct btrfs_block_group_cache *block_group; 7483 u64 free_bytes = 0; 7484 int factor; 7485 7486 /* It's df, we don't care if it's racy */ 7487 if (list_empty(&sinfo->ro_bgs)) 7488 return 0; 7489 7490 spin_lock(&sinfo->lock); 7491 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 7492 spin_lock(&block_group->lock); 7493 7494 if (!block_group->ro) { 7495 spin_unlock(&block_group->lock); 7496 continue; 7497 } 7498 7499 factor = btrfs_bg_type_to_factor(block_group->flags); 7500 free_bytes += (block_group->key.offset - 7501 btrfs_block_group_used(&block_group->item)) * 7502 factor; 7503 7504 spin_unlock(&block_group->lock); 7505 } 7506 spin_unlock(&sinfo->lock); 7507 7508 return free_bytes; 7509 } 7510 7511 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) 7512 { 7513 struct btrfs_space_info *sinfo = cache->space_info; 7514 u64 num_bytes; 7515 7516 BUG_ON(!cache->ro); 7517 7518 spin_lock(&sinfo->lock); 7519 spin_lock(&cache->lock); 7520 if (!--cache->ro) { 7521 num_bytes = cache->key.offset - cache->reserved - 7522 cache->pinned - cache->bytes_super - 7523 btrfs_block_group_used(&cache->item); 7524 sinfo->bytes_readonly -= num_bytes; 7525 list_del_init(&cache->ro_list); 7526 } 7527 spin_unlock(&cache->lock); 7528 spin_unlock(&sinfo->lock); 7529 } 7530 7531 /* 7532 * Checks to see if it's even possible to relocate this block group. 
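 *
 * Rough sizing sketch used by the profile checks below (the 1GiB figure is
 * only an illustration): for a block group with 1GiB of used space,
 *
 *	RAID10: need 4 rw devices, each with >= 512MiB unallocated
 *	RAID1:  need 2 rw devices, each with >= 1GiB unallocated
 *	DUP:    need 1 rw device with >= 2GiB unallocated
 *	RAID0:  the 1GiB is divided across all rw devices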
7533 * 7534 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 7535 * ok to go ahead and try. 7536 */ 7537 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) 7538 { 7539 struct btrfs_block_group_cache *block_group; 7540 struct btrfs_space_info *space_info; 7541 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7542 struct btrfs_device *device; 7543 u64 min_free; 7544 u64 dev_min = 1; 7545 u64 dev_nr = 0; 7546 u64 target; 7547 int debug; 7548 int index; 7549 int full = 0; 7550 int ret = 0; 7551 7552 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); 7553 7554 block_group = btrfs_lookup_block_group(fs_info, bytenr); 7555 7556 /* odd, couldn't find the block group, leave it alone */ 7557 if (!block_group) { 7558 if (debug) 7559 btrfs_warn(fs_info, 7560 "can't find block group for bytenr %llu", 7561 bytenr); 7562 return -1; 7563 } 7564 7565 min_free = btrfs_block_group_used(&block_group->item); 7566 7567 /* no bytes used, we're good */ 7568 if (!min_free) 7569 goto out; 7570 7571 space_info = block_group->space_info; 7572 spin_lock(&space_info->lock); 7573 7574 full = space_info->full; 7575 7576 /* 7577 * if this is the last block group we have in this space, we can't 7578 * relocate it unless we're able to allocate a new chunk below. 7579 * 7580 * Otherwise, we need to make sure we have room in the space to handle 7581 * all of the extents from this block group. If we can, we're good 7582 */ 7583 if ((space_info->total_bytes != block_group->key.offset) && 7584 (btrfs_space_info_used(space_info, false) + min_free < 7585 space_info->total_bytes)) { 7586 spin_unlock(&space_info->lock); 7587 goto out; 7588 } 7589 spin_unlock(&space_info->lock); 7590 7591 /* 7592 * ok we don't have enough space, but maybe we have free space on our 7593 * devices to allocate new chunks for relocation, so loop through our 7594 * alloc devices and guess if we have enough space. if this block 7595 * group is going to be restriped, run checks against the target 7596 * profile instead of the current one. 7597 */ 7598 ret = -1; 7599 7600 /* 7601 * index: 7602 * 0: raid10 7603 * 1: raid1 7604 * 2: dup 7605 * 3: raid0 7606 * 4: single 7607 */ 7608 target = get_restripe_target(fs_info, block_group->flags); 7609 if (target) { 7610 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); 7611 } else { 7612 /* 7613 * this is just a balance, so if we were marked as full 7614 * we know there is no space for a new chunk 7615 */ 7616 if (full) { 7617 if (debug) 7618 btrfs_warn(fs_info, 7619 "no space to alloc new chunk for block group %llu", 7620 block_group->key.objectid); 7621 goto out; 7622 } 7623 7624 index = btrfs_bg_flags_to_raid_index(block_group->flags); 7625 } 7626 7627 if (index == BTRFS_RAID_RAID10) { 7628 dev_min = 4; 7629 /* Divide by 2 */ 7630 min_free >>= 1; 7631 } else if (index == BTRFS_RAID_RAID1) { 7632 dev_min = 2; 7633 } else if (index == BTRFS_RAID_DUP) { 7634 /* Multiply by 2 */ 7635 min_free <<= 1; 7636 } else if (index == BTRFS_RAID_RAID0) { 7637 dev_min = fs_devices->rw_devices; 7638 min_free = div64_u64(min_free, dev_min); 7639 } 7640 7641 mutex_lock(&fs_info->chunk_mutex); 7642 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 7643 u64 dev_offset; 7644 7645 /* 7646 * check to make sure we can actually find a chunk with enough 7647 * space to fit our block group in. 
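		 *
		 * A sketch of the per-device test (illustrative; the real
		 * check also skips replace targets):
		 *
		 *	if (total_bytes - bytes_used > min_free &&
		 *	    find_free_dev_extent(device, min_free, ...) == 0)
		 *		dev_nr++;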
7648 */ 7649 if (device->total_bytes > device->bytes_used + min_free && 7650 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7651 ret = find_free_dev_extent(device, min_free, 7652 &dev_offset, NULL); 7653 if (!ret) 7654 dev_nr++; 7655 7656 if (dev_nr >= dev_min) 7657 break; 7658 7659 ret = -1; 7660 } 7661 } 7662 if (debug && ret == -1) 7663 btrfs_warn(fs_info, 7664 "no space to allocate a new chunk for block group %llu", 7665 block_group->key.objectid); 7666 mutex_unlock(&fs_info->chunk_mutex); 7667 out: 7668 btrfs_put_block_group(block_group); 7669 return ret; 7670 } 7671 7672 static int find_first_block_group(struct btrfs_fs_info *fs_info, 7673 struct btrfs_path *path, 7674 struct btrfs_key *key) 7675 { 7676 struct btrfs_root *root = fs_info->extent_root; 7677 int ret = 0; 7678 struct btrfs_key found_key; 7679 struct extent_buffer *leaf; 7680 struct btrfs_block_group_item bg; 7681 u64 flags; 7682 int slot; 7683 7684 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 7685 if (ret < 0) 7686 goto out; 7687 7688 while (1) { 7689 slot = path->slots[0]; 7690 leaf = path->nodes[0]; 7691 if (slot >= btrfs_header_nritems(leaf)) { 7692 ret = btrfs_next_leaf(root, path); 7693 if (ret == 0) 7694 continue; 7695 if (ret < 0) 7696 goto out; 7697 break; 7698 } 7699 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7700 7701 if (found_key.objectid >= key->objectid && 7702 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 7703 struct extent_map_tree *em_tree; 7704 struct extent_map *em; 7705 7706 em_tree = &root->fs_info->mapping_tree; 7707 read_lock(&em_tree->lock); 7708 em = lookup_extent_mapping(em_tree, found_key.objectid, 7709 found_key.offset); 7710 read_unlock(&em_tree->lock); 7711 if (!em) { 7712 btrfs_err(fs_info, 7713 "logical %llu len %llu found bg but no related chunk", 7714 found_key.objectid, found_key.offset); 7715 ret = -ENOENT; 7716 } else if (em->start != found_key.objectid || 7717 em->len != found_key.offset) { 7718 btrfs_err(fs_info, 7719 "block group %llu len %llu mismatch with chunk %llu len %llu", 7720 found_key.objectid, found_key.offset, 7721 em->start, em->len); 7722 ret = -EUCLEAN; 7723 } else { 7724 read_extent_buffer(leaf, &bg, 7725 btrfs_item_ptr_offset(leaf, slot), 7726 sizeof(bg)); 7727 flags = btrfs_block_group_flags(&bg) & 7728 BTRFS_BLOCK_GROUP_TYPE_MASK; 7729 7730 if (flags != (em->map_lookup->type & 7731 BTRFS_BLOCK_GROUP_TYPE_MASK)) { 7732 btrfs_err(fs_info, 7733 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", 7734 found_key.objectid, 7735 found_key.offset, flags, 7736 (BTRFS_BLOCK_GROUP_TYPE_MASK & 7737 em->map_lookup->type)); 7738 ret = -EUCLEAN; 7739 } else { 7740 ret = 0; 7741 } 7742 } 7743 free_extent_map(em); 7744 goto out; 7745 } 7746 path->slots[0]++; 7747 } 7748 out: 7749 return ret; 7750 } 7751 7752 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 7753 { 7754 struct btrfs_block_group_cache *block_group; 7755 u64 last = 0; 7756 7757 while (1) { 7758 struct inode *inode; 7759 7760 block_group = btrfs_lookup_first_block_group(info, last); 7761 while (block_group) { 7762 wait_block_group_cache_done(block_group); 7763 spin_lock(&block_group->lock); 7764 if (block_group->iref) 7765 break; 7766 spin_unlock(&block_group->lock); 7767 block_group = next_block_group(block_group); 7768 } 7769 if (!block_group) { 7770 if (last == 0) 7771 break; 7772 last = 0; 7773 continue; 7774 } 7775 7776 inode = block_group->inode; 7777 block_group->iref = 0; 7778 block_group->inode = NULL; 7779 
spin_unlock(&block_group->lock); 7780 ASSERT(block_group->io_ctl.inode == NULL); 7781 iput(inode); 7782 last = block_group->key.objectid + block_group->key.offset; 7783 btrfs_put_block_group(block_group); 7784 } 7785 } 7786 7787 /* 7788 * Must be called only after stopping all workers, since we could have block 7789 * group caching kthreads running, and therefore they could race with us if we 7790 * freed the block groups before stopping them. 7791 */ 7792 int btrfs_free_block_groups(struct btrfs_fs_info *info) 7793 { 7794 struct btrfs_block_group_cache *block_group; 7795 struct btrfs_space_info *space_info; 7796 struct btrfs_caching_control *caching_ctl; 7797 struct rb_node *n; 7798 7799 down_write(&info->commit_root_sem); 7800 while (!list_empty(&info->caching_block_groups)) { 7801 caching_ctl = list_entry(info->caching_block_groups.next, 7802 struct btrfs_caching_control, list); 7803 list_del(&caching_ctl->list); 7804 put_caching_control(caching_ctl); 7805 } 7806 up_write(&info->commit_root_sem); 7807 7808 spin_lock(&info->unused_bgs_lock); 7809 while (!list_empty(&info->unused_bgs)) { 7810 block_group = list_first_entry(&info->unused_bgs, 7811 struct btrfs_block_group_cache, 7812 bg_list); 7813 list_del_init(&block_group->bg_list); 7814 btrfs_put_block_group(block_group); 7815 } 7816 spin_unlock(&info->unused_bgs_lock); 7817 7818 spin_lock(&info->block_group_cache_lock); 7819 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 7820 block_group = rb_entry(n, struct btrfs_block_group_cache, 7821 cache_node); 7822 rb_erase(&block_group->cache_node, 7823 &info->block_group_cache_tree); 7824 RB_CLEAR_NODE(&block_group->cache_node); 7825 spin_unlock(&info->block_group_cache_lock); 7826 7827 down_write(&block_group->space_info->groups_sem); 7828 list_del(&block_group->list); 7829 up_write(&block_group->space_info->groups_sem); 7830 7831 /* 7832 * We haven't cached this block group, which means we could 7833 * possibly have excluded extents on this block group. 7834 */ 7835 if (block_group->cached == BTRFS_CACHE_NO || 7836 block_group->cached == BTRFS_CACHE_ERROR) 7837 free_excluded_extents(block_group); 7838 7839 btrfs_remove_free_space_cache(block_group); 7840 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 7841 ASSERT(list_empty(&block_group->dirty_list)); 7842 ASSERT(list_empty(&block_group->io_list)); 7843 ASSERT(list_empty(&block_group->bg_list)); 7844 ASSERT(atomic_read(&block_group->count) == 1); 7845 btrfs_put_block_group(block_group); 7846 7847 spin_lock(&info->block_group_cache_lock); 7848 } 7849 spin_unlock(&info->block_group_cache_lock); 7850 7851 /* now that all the block groups are freed, go through and 7852 * free all the space_info structs. This is only called during 7853 * the final stages of unmount, and so we know nobody is 7854 * using them. We call synchronize_rcu() once before we start, 7855 * just to be on the safe side. 7856 */ 7857 synchronize_rcu(); 7858 7859 btrfs_release_global_block_rsv(info); 7860 7861 while (!list_empty(&info->space_info)) { 7862 int i; 7863 7864 space_info = list_entry(info->space_info.next, 7865 struct btrfs_space_info, 7866 list); 7867 7868 /* 7869 * Do not hide this behind enospc_debug, this is actually 7870 * important and indicates a real bug if this happens. 
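 *
 * Leftover bytes_pinned, bytes_reserved or bytes_may_use at this point
 * means some reservation or pinned extent was leaked earlier, since all
 * block groups have already been freed above.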
7871 */ 7872 if (WARN_ON(space_info->bytes_pinned > 0 || 7873 space_info->bytes_reserved > 0 || 7874 space_info->bytes_may_use > 0)) 7875 btrfs_dump_space_info(info, space_info, 0, 0); 7876 list_del(&space_info->list); 7877 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 7878 struct kobject *kobj; 7879 kobj = space_info->block_group_kobjs[i]; 7880 space_info->block_group_kobjs[i] = NULL; 7881 if (kobj) { 7882 kobject_del(kobj); 7883 kobject_put(kobj); 7884 } 7885 } 7886 kobject_del(&space_info->kobj); 7887 kobject_put(&space_info->kobj); 7888 } 7889 return 0; 7890 } 7891 7892 static void link_block_group(struct btrfs_block_group_cache *cache) 7893 { 7894 struct btrfs_space_info *space_info = cache->space_info; 7895 struct btrfs_fs_info *fs_info = cache->fs_info; 7896 int index = btrfs_bg_flags_to_raid_index(cache->flags); 7897 bool first = false; 7898 7899 down_write(&space_info->groups_sem); 7900 if (list_empty(&space_info->block_groups[index])) 7901 first = true; 7902 list_add_tail(&cache->list, &space_info->block_groups[index]); 7903 up_write(&space_info->groups_sem); 7904 7905 if (first) { 7906 struct raid_kobject *rkobj; 7907 unsigned int nofs_flag; 7908 int ret; 7909 7910 /* 7911 * Setup a NOFS context because kobject_add(), deep in its call 7912 * chain, does GFP_KERNEL allocations, and we are often called 7913 * in a context where if reclaim is triggered we can deadlock 7914 * (we are either holding a transaction handle or some lock 7915 * required for a transaction commit). 7916 */ 7917 nofs_flag = memalloc_nofs_save(); 7918 rkobj = kzalloc(sizeof(*rkobj), GFP_KERNEL); 7919 if (!rkobj) { 7920 memalloc_nofs_restore(nofs_flag); 7921 btrfs_warn(cache->fs_info, 7922 "couldn't alloc memory for raid level kobject"); 7923 return; 7924 } 7925 rkobj->flags = cache->flags; 7926 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 7927 ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", 7928 btrfs_bg_type_to_raid_name(rkobj->flags)); 7929 memalloc_nofs_restore(nofs_flag); 7930 if (ret) { 7931 kobject_put(&rkobj->kobj); 7932 btrfs_warn(fs_info, 7933 "failed to add kobject for block cache, ignoring"); 7934 return; 7935 } 7936 space_info->block_group_kobjs[index] = &rkobj->kobj; 7937 } 7938 } 7939 7940 static struct btrfs_block_group_cache * 7941 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, 7942 u64 start, u64 size) 7943 { 7944 struct btrfs_block_group_cache *cache; 7945 7946 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7947 if (!cache) 7948 return NULL; 7949 7950 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 7951 GFP_NOFS); 7952 if (!cache->free_space_ctl) { 7953 kfree(cache); 7954 return NULL; 7955 } 7956 7957 cache->key.objectid = start; 7958 cache->key.offset = size; 7959 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7960 7961 cache->fs_info = fs_info; 7962 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 7963 set_free_space_tree_thresholds(cache); 7964 7965 atomic_set(&cache->count, 1); 7966 spin_lock_init(&cache->lock); 7967 init_rwsem(&cache->data_rwsem); 7968 INIT_LIST_HEAD(&cache->list); 7969 INIT_LIST_HEAD(&cache->cluster_list); 7970 INIT_LIST_HEAD(&cache->bg_list); 7971 INIT_LIST_HEAD(&cache->ro_list); 7972 INIT_LIST_HEAD(&cache->dirty_list); 7973 INIT_LIST_HEAD(&cache->io_list); 7974 btrfs_init_free_space_ctl(cache); 7975 atomic_set(&cache->trimming, 0); 7976 mutex_init(&cache->free_space_lock); 7977 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); 7978 7979 return cache; 7980 } 7981 7982 7983 /* 7984 * Iterate all chunks 
and verify that each of them has the corresponding block 7985 * group 7986 */ 7987 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 7988 { 7989 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7990 struct extent_map *em; 7991 struct btrfs_block_group_cache *bg; 7992 u64 start = 0; 7993 int ret = 0; 7994 7995 while (1) { 7996 read_lock(&map_tree->lock); 7997 /* 7998 * lookup_extent_mapping will return the first extent map 7999 * intersecting the range, so setting @len to 1 is enough to 8000 * get the first chunk. 8001 */ 8002 em = lookup_extent_mapping(map_tree, start, 1); 8003 read_unlock(&map_tree->lock); 8004 if (!em) 8005 break; 8006 8007 bg = btrfs_lookup_block_group(fs_info, em->start); 8008 if (!bg) { 8009 btrfs_err(fs_info, 8010 "chunk start=%llu len=%llu doesn't have corresponding block group", 8011 em->start, em->len); 8012 ret = -EUCLEAN; 8013 free_extent_map(em); 8014 break; 8015 } 8016 if (bg->key.objectid != em->start || 8017 bg->key.offset != em->len || 8018 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 8019 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 8020 btrfs_err(fs_info, 8021 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 8022 em->start, em->len, 8023 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 8024 bg->key.objectid, bg->key.offset, 8025 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 8026 ret = -EUCLEAN; 8027 free_extent_map(em); 8028 btrfs_put_block_group(bg); 8029 break; 8030 } 8031 start = em->start + em->len; 8032 free_extent_map(em); 8033 btrfs_put_block_group(bg); 8034 } 8035 return ret; 8036 } 8037 8038 int btrfs_read_block_groups(struct btrfs_fs_info *info) 8039 { 8040 struct btrfs_path *path; 8041 int ret; 8042 struct btrfs_block_group_cache *cache; 8043 struct btrfs_space_info *space_info; 8044 struct btrfs_key key; 8045 struct btrfs_key found_key; 8046 struct extent_buffer *leaf; 8047 int need_clear = 0; 8048 u64 cache_gen; 8049 u64 feature; 8050 int mixed; 8051 8052 feature = btrfs_super_incompat_flags(info->super_copy); 8053 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); 8054 8055 key.objectid = 0; 8056 key.offset = 0; 8057 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8058 path = btrfs_alloc_path(); 8059 if (!path) 8060 return -ENOMEM; 8061 path->reada = READA_FORWARD; 8062 8063 cache_gen = btrfs_super_cache_generation(info->super_copy); 8064 if (btrfs_test_opt(info, SPACE_CACHE) && 8065 btrfs_super_generation(info->super_copy) != cache_gen) 8066 need_clear = 1; 8067 if (btrfs_test_opt(info, CLEAR_CACHE)) 8068 need_clear = 1; 8069 8070 while (1) { 8071 ret = find_first_block_group(info, path, &key); 8072 if (ret > 0) 8073 break; 8074 if (ret != 0) 8075 goto error; 8076 8077 leaf = path->nodes[0]; 8078 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8079 8080 cache = btrfs_create_block_group_cache(info, found_key.objectid, 8081 found_key.offset); 8082 if (!cache) { 8083 ret = -ENOMEM; 8084 goto error; 8085 } 8086 8087 if (need_clear) { 8088 /* 8089 * When we mount with old space cache, we need to 8090 * set BTRFS_DC_CLEAR and set dirty flag. 8091 * 8092 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 8093 * truncate the old free space cache inode and 8094 * setup a new one. 8095 * b) Setting 'dirty flag' makes sure that we flush 8096 * the new space cache info onto disk. 
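 *
 * A stale cache is detected above by comparing the superblock
 * generation with the cache generation recorded in the superblock; a
 * mismatch means the last commit did not write the space cache out.
 * Mounting with -o clear_cache forces the same treatment.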
8097 */ 8098 if (btrfs_test_opt(info, SPACE_CACHE)) 8099 cache->disk_cache_state = BTRFS_DC_CLEAR; 8100 } 8101 8102 read_extent_buffer(leaf, &cache->item, 8103 btrfs_item_ptr_offset(leaf, path->slots[0]), 8104 sizeof(cache->item)); 8105 cache->flags = btrfs_block_group_flags(&cache->item); 8106 if (!mixed && 8107 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 8108 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 8109 btrfs_err(info, 8110 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 8111 cache->key.objectid); 8112 ret = -EINVAL; 8113 goto error; 8114 } 8115 8116 key.objectid = found_key.objectid + found_key.offset; 8117 btrfs_release_path(path); 8118 8119 /* 8120 * We need to exclude the super stripes now so that the space 8121 * info has super bytes accounted for, otherwise we'll think 8122 * we have more space than we actually do. 8123 */ 8124 ret = exclude_super_stripes(cache); 8125 if (ret) { 8126 /* 8127 * We may have excluded something, so call this just in 8128 * case. 8129 */ 8130 free_excluded_extents(cache); 8131 btrfs_put_block_group(cache); 8132 goto error; 8133 } 8134 8135 /* 8136 * check for two cases, either we are full, and therefore 8137 * don't need to bother with the caching work since we won't 8138 * find any space, or we are empty, and we can just add all 8139 * the space in and be done with it. This saves us _a_lot_ of 8140 * time, particularly in the full case. 8141 */ 8142 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8143 cache->last_byte_to_unpin = (u64)-1; 8144 cache->cached = BTRFS_CACHE_FINISHED; 8145 free_excluded_extents(cache); 8146 } else if (btrfs_block_group_used(&cache->item) == 0) { 8147 cache->last_byte_to_unpin = (u64)-1; 8148 cache->cached = BTRFS_CACHE_FINISHED; 8149 add_new_free_space(cache, found_key.objectid, 8150 found_key.objectid + 8151 found_key.offset); 8152 free_excluded_extents(cache); 8153 } 8154 8155 ret = btrfs_add_block_group_cache(info, cache); 8156 if (ret) { 8157 btrfs_remove_free_space_cache(cache); 8158 btrfs_put_block_group(cache); 8159 goto error; 8160 } 8161 8162 trace_btrfs_add_block_group(info, cache, 0); 8163 btrfs_update_space_info(info, cache->flags, found_key.offset, 8164 btrfs_block_group_used(&cache->item), 8165 cache->bytes_super, &space_info); 8166 8167 cache->space_info = space_info; 8168 8169 link_block_group(cache); 8170 8171 set_avail_alloc_bits(info, cache->flags); 8172 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 8173 inc_block_group_ro(cache, 1); 8174 } else if (btrfs_block_group_used(&cache->item) == 0) { 8175 ASSERT(list_empty(&cache->bg_list)); 8176 btrfs_mark_bg_unused(cache); 8177 } 8178 } 8179 8180 list_for_each_entry_rcu(space_info, &info->space_info, list) { 8181 if (!(get_alloc_profile(info, space_info->flags) & 8182 (BTRFS_BLOCK_GROUP_RAID10 | 8183 BTRFS_BLOCK_GROUP_RAID1_MASK | 8184 BTRFS_BLOCK_GROUP_RAID56_MASK | 8185 BTRFS_BLOCK_GROUP_DUP))) 8186 continue; 8187 /* 8188 * avoid allocating from un-mirrored block group if there are 8189 * mirrored block groups. 
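 *
 * That is, when a space_info can allocate DUP, RAID1*, RAID10 or
 * RAID5/6 chunks, force its existing RAID0 and SINGLE block groups
 * read only below so that new writes only land in the redundant
 * profiles.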
8190 */ 8191 list_for_each_entry(cache, 8192 &space_info->block_groups[BTRFS_RAID_RAID0], 8193 list) 8194 inc_block_group_ro(cache, 1); 8195 list_for_each_entry(cache, 8196 &space_info->block_groups[BTRFS_RAID_SINGLE], 8197 list) 8198 inc_block_group_ro(cache, 1); 8199 } 8200 8201 btrfs_init_global_block_rsv(info); 8202 ret = check_chunk_block_group_mappings(info); 8203 error: 8204 btrfs_free_path(path); 8205 return ret; 8206 } 8207 8208 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 8209 { 8210 struct btrfs_fs_info *fs_info = trans->fs_info; 8211 struct btrfs_block_group_cache *block_group; 8212 struct btrfs_root *extent_root = fs_info->extent_root; 8213 struct btrfs_block_group_item item; 8214 struct btrfs_key key; 8215 int ret = 0; 8216 8217 if (!trans->can_flush_pending_bgs) 8218 return; 8219 8220 while (!list_empty(&trans->new_bgs)) { 8221 block_group = list_first_entry(&trans->new_bgs, 8222 struct btrfs_block_group_cache, 8223 bg_list); 8224 if (ret) 8225 goto next; 8226 8227 spin_lock(&block_group->lock); 8228 memcpy(&item, &block_group->item, sizeof(item)); 8229 memcpy(&key, &block_group->key, sizeof(key)); 8230 spin_unlock(&block_group->lock); 8231 8232 ret = btrfs_insert_item(trans, extent_root, &key, &item, 8233 sizeof(item)); 8234 if (ret) 8235 btrfs_abort_transaction(trans, ret); 8236 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); 8237 if (ret) 8238 btrfs_abort_transaction(trans, ret); 8239 add_block_group_free_space(trans, block_group); 8240 /* already aborted the transaction if it failed. */ 8241 next: 8242 btrfs_delayed_refs_rsv_release(fs_info, 1); 8243 list_del_init(&block_group->bg_list); 8244 } 8245 btrfs_trans_release_chunk_metadata(trans); 8246 } 8247 8248 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, 8249 u64 type, u64 chunk_offset, u64 size) 8250 { 8251 struct btrfs_fs_info *fs_info = trans->fs_info; 8252 struct btrfs_block_group_cache *cache; 8253 int ret; 8254 8255 btrfs_set_log_full_commit(trans); 8256 8257 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 8258 if (!cache) 8259 return -ENOMEM; 8260 8261 btrfs_set_block_group_used(&cache->item, bytes_used); 8262 btrfs_set_block_group_chunk_objectid(&cache->item, 8263 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 8264 btrfs_set_block_group_flags(&cache->item, type); 8265 8266 cache->flags = type; 8267 cache->last_byte_to_unpin = (u64)-1; 8268 cache->cached = BTRFS_CACHE_FINISHED; 8269 cache->needs_free_space = 1; 8270 ret = exclude_super_stripes(cache); 8271 if (ret) { 8272 /* 8273 * We may have excluded something, so call this just in 8274 * case. 8275 */ 8276 free_excluded_extents(cache); 8277 btrfs_put_block_group(cache); 8278 return ret; 8279 } 8280 8281 add_new_free_space(cache, chunk_offset, chunk_offset + size); 8282 8283 free_excluded_extents(cache); 8284 8285 #ifdef CONFIG_BTRFS_DEBUG 8286 if (btrfs_should_fragment_free_space(cache)) { 8287 u64 new_bytes_used = size - bytes_used; 8288 8289 bytes_used += new_bytes_used >> 1; 8290 fragment_free_space(cache); 8291 } 8292 #endif 8293 /* 8294 * Ensure the corresponding space_info object is created and 8295 * assigned to our block group. We want our bg to be added to the rbtree 8296 * with its ->space_info set. 
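 *
 * btrfs_find_space_info() only looks up an existing space_info, it does
 * not create one, so a space_info for this block group type is expected
 * to exist already; the ASSERT below catches the case where it doesn't.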
8297 */ 8298 cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 8299 ASSERT(cache->space_info); 8300 8301 ret = btrfs_add_block_group_cache(fs_info, cache); 8302 if (ret) { 8303 btrfs_remove_free_space_cache(cache); 8304 btrfs_put_block_group(cache); 8305 return ret; 8306 } 8307 8308 /* 8309 * Now that our block group has its ->space_info set and is inserted in 8310 * the rbtree, update the space info's counters. 8311 */ 8312 trace_btrfs_add_block_group(fs_info, cache, 1); 8313 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, 8314 cache->bytes_super, &cache->space_info); 8315 btrfs_update_global_block_rsv(fs_info); 8316 8317 link_block_group(cache); 8318 8319 list_add_tail(&cache->bg_list, &trans->new_bgs); 8320 trans->delayed_ref_updates++; 8321 btrfs_update_delayed_refs_rsv(trans); 8322 8323 set_avail_alloc_bits(fs_info, type); 8324 return 0; 8325 } 8326 8327 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 8328 { 8329 u64 extra_flags = chunk_to_extended(flags) & 8330 BTRFS_EXTENDED_PROFILE_MASK; 8331 8332 write_seqlock(&fs_info->profiles_lock); 8333 if (flags & BTRFS_BLOCK_GROUP_DATA) 8334 fs_info->avail_data_alloc_bits &= ~extra_flags; 8335 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8336 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8337 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8338 fs_info->avail_system_alloc_bits &= ~extra_flags; 8339 write_sequnlock(&fs_info->profiles_lock); 8340 } 8341 8342 /* 8343 * Clear incompat bits for the following feature(s): 8344 * 8345 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group 8346 * in the whole filesystem 8347 */ 8348 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) 8349 { 8350 if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { 8351 struct list_head *head = &fs_info->space_info; 8352 struct btrfs_space_info *sinfo; 8353 8354 list_for_each_entry_rcu(sinfo, head, list) { 8355 bool found = false; 8356 8357 down_read(&sinfo->groups_sem); 8358 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) 8359 found = true; 8360 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) 8361 found = true; 8362 up_read(&sinfo->groups_sem); 8363 8364 if (found) 8365 return; 8366 } 8367 btrfs_clear_fs_incompat(fs_info, RAID56); 8368 } 8369 } 8370 8371 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8372 u64 group_start, struct extent_map *em) 8373 { 8374 struct btrfs_fs_info *fs_info = trans->fs_info; 8375 struct btrfs_root *root = fs_info->extent_root; 8376 struct btrfs_path *path; 8377 struct btrfs_block_group_cache *block_group; 8378 struct btrfs_free_cluster *cluster; 8379 struct btrfs_root *tree_root = fs_info->tree_root; 8380 struct btrfs_key key; 8381 struct inode *inode; 8382 struct kobject *kobj = NULL; 8383 int ret; 8384 int index; 8385 int factor; 8386 struct btrfs_caching_control *caching_ctl = NULL; 8387 bool remove_em; 8388 bool remove_rsv = false; 8389 8390 block_group = btrfs_lookup_block_group(fs_info, group_start); 8391 BUG_ON(!block_group); 8392 BUG_ON(!block_group->ro); 8393 8394 trace_btrfs_remove_block_group(block_group); 8395 /* 8396 * Free the reserved super bytes from this block group before 8397 * remove it. 
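 *
 * These are the ranges that exclude_super_stripes() marked as excluded
 * when the block group was created or read from disk.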
8398 */ 8399 free_excluded_extents(block_group); 8400 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, 8401 block_group->key.offset); 8402 8403 memcpy(&key, &block_group->key, sizeof(key)); 8404 index = btrfs_bg_flags_to_raid_index(block_group->flags); 8405 factor = btrfs_bg_type_to_factor(block_group->flags); 8406 8407 /* make sure this block group isn't part of an allocation cluster */ 8408 cluster = &fs_info->data_alloc_cluster; 8409 spin_lock(&cluster->refill_lock); 8410 btrfs_return_cluster_to_free_space(block_group, cluster); 8411 spin_unlock(&cluster->refill_lock); 8412 8413 /* 8414 * make sure this block group isn't part of a metadata 8415 * allocation cluster 8416 */ 8417 cluster = &fs_info->meta_alloc_cluster; 8418 spin_lock(&cluster->refill_lock); 8419 btrfs_return_cluster_to_free_space(block_group, cluster); 8420 spin_unlock(&cluster->refill_lock); 8421 8422 path = btrfs_alloc_path(); 8423 if (!path) { 8424 ret = -ENOMEM; 8425 goto out; 8426 } 8427 8428 /* 8429 * get the inode first so any iput calls done for the io_list 8430 * aren't the final iput (no unlinks allowed now) 8431 */ 8432 inode = lookup_free_space_inode(block_group, path); 8433 8434 mutex_lock(&trans->transaction->cache_write_mutex); 8435 /* 8436 * Make sure our free space cache IO is done before removing the 8437 * free space inode 8438 */ 8439 spin_lock(&trans->transaction->dirty_bgs_lock); 8440 if (!list_empty(&block_group->io_list)) { 8441 list_del_init(&block_group->io_list); 8442 8443 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 8444 8445 spin_unlock(&trans->transaction->dirty_bgs_lock); 8446 btrfs_wait_cache_io(trans, block_group, path); 8447 btrfs_put_block_group(block_group); 8448 spin_lock(&trans->transaction->dirty_bgs_lock); 8449 } 8450 8451 if (!list_empty(&block_group->dirty_list)) { 8452 list_del_init(&block_group->dirty_list); 8453 remove_rsv = true; 8454 btrfs_put_block_group(block_group); 8455 } 8456 spin_unlock(&trans->transaction->dirty_bgs_lock); 8457 mutex_unlock(&trans->transaction->cache_write_mutex); 8458 8459 if (!IS_ERR(inode)) { 8460 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 8461 if (ret) { 8462 btrfs_add_delayed_iput(inode); 8463 goto out; 8464 } 8465 clear_nlink(inode); 8466 /* One for the block groups ref */ 8467 spin_lock(&block_group->lock); 8468 if (block_group->iref) { 8469 block_group->iref = 0; 8470 block_group->inode = NULL; 8471 spin_unlock(&block_group->lock); 8472 iput(inode); 8473 } else { 8474 spin_unlock(&block_group->lock); 8475 } 8476 /* One for our lookup ref */ 8477 btrfs_add_delayed_iput(inode); 8478 } 8479 8480 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 8481 key.offset = block_group->key.objectid; 8482 key.type = 0; 8483 8484 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 8485 if (ret < 0) 8486 goto out; 8487 if (ret > 0) 8488 btrfs_release_path(path); 8489 if (ret == 0) { 8490 ret = btrfs_del_item(trans, tree_root, path); 8491 if (ret) 8492 goto out; 8493 btrfs_release_path(path); 8494 } 8495 8496 spin_lock(&fs_info->block_group_cache_lock); 8497 rb_erase(&block_group->cache_node, 8498 &fs_info->block_group_cache_tree); 8499 RB_CLEAR_NODE(&block_group->cache_node); 8500 8501 if (fs_info->first_logical_byte == block_group->key.objectid) 8502 fs_info->first_logical_byte = (u64)-1; 8503 spin_unlock(&fs_info->block_group_cache_lock); 8504 8505 down_write(&block_group->space_info->groups_sem); 8506 /* 8507 * we must use list_del_init so people can check to see if they 8508 * are still on the list after taking the 
semaphore
8509 */
8510 list_del_init(&block_group->list);
8511 if (list_empty(&block_group->space_info->block_groups[index])) {
8512 kobj = block_group->space_info->block_group_kobjs[index];
8513 block_group->space_info->block_group_kobjs[index] = NULL;
8514 clear_avail_alloc_bits(fs_info, block_group->flags);
8515 }
8516 up_write(&block_group->space_info->groups_sem);
8517 clear_incompat_bg_bits(fs_info, block_group->flags);
8518 if (kobj) {
8519 kobject_del(kobj);
8520 kobject_put(kobj);
8521 }
8522
8523 if (block_group->has_caching_ctl)
8524 caching_ctl = get_caching_control(block_group);
8525 if (block_group->cached == BTRFS_CACHE_STARTED)
8526 wait_block_group_cache_done(block_group);
8527 if (block_group->has_caching_ctl) {
8528 down_write(&fs_info->commit_root_sem);
8529 if (!caching_ctl) {
8530 struct btrfs_caching_control *ctl;
8531
8532 list_for_each_entry(ctl,
8533 &fs_info->caching_block_groups, list)
8534 if (ctl->block_group == block_group) {
8535 caching_ctl = ctl;
8536 refcount_inc(&caching_ctl->count);
8537 break;
8538 }
8539 }
8540 if (caching_ctl)
8541 list_del_init(&caching_ctl->list);
8542 up_write(&fs_info->commit_root_sem);
8543 if (caching_ctl) {
8544 /* Once for the caching bgs list and once for us. */
8545 put_caching_control(caching_ctl);
8546 put_caching_control(caching_ctl);
8547 }
8548 }
8549
8550 spin_lock(&trans->transaction->dirty_bgs_lock);
8551 WARN_ON(!list_empty(&block_group->dirty_list));
8552 WARN_ON(!list_empty(&block_group->io_list));
8553 spin_unlock(&trans->transaction->dirty_bgs_lock);
8554
8555 btrfs_remove_free_space_cache(block_group);
8556
8557 spin_lock(&block_group->space_info->lock);
8558 list_del_init(&block_group->ro_list);
8559
8560 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8561 WARN_ON(block_group->space_info->total_bytes
8562 < block_group->key.offset);
8563 WARN_ON(block_group->space_info->bytes_readonly
8564 < block_group->key.offset);
8565 WARN_ON(block_group->space_info->disk_total
8566 < block_group->key.offset * factor);
8567 }
8568 block_group->space_info->total_bytes -= block_group->key.offset;
8569 block_group->space_info->bytes_readonly -= block_group->key.offset;
8570 block_group->space_info->disk_total -= block_group->key.offset * factor;
8571
8572 spin_unlock(&block_group->space_info->lock);
8573
8574 memcpy(&key, &block_group->key, sizeof(key));
8575
8576 mutex_lock(&fs_info->chunk_mutex);
8577 spin_lock(&block_group->lock);
8578 block_group->removed = 1;
8579 /*
8580 * At this point trimming can't start on this block group, because we
8581 * removed the block group from the tree fs_info->block_group_cache_tree
8582 * so no one can find it anymore and even if someone already got this
8583 * block group before we removed it from the rbtree, they have already
8584 * incremented block_group->trimming - if they didn't, they won't find
8585 * any free space entries because we already removed them all when we
8586 * called btrfs_remove_free_space_cache().
8587 *
8588 * And we must not remove the extent map from the fs_info->mapping_tree
8589 * to prevent the same logical address range and physical device space
8590 * ranges from being reused for a new block group.
This is because our 8591 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 8592 * completely transactionless, so while it is trimming a range the 8593 * currently running transaction might finish and a new one start, 8594 * allowing for new block groups to be created that can reuse the same 8595 * physical device locations unless we take this special care. 8596 * 8597 * There may also be an implicit trim operation if the file system 8598 * is mounted with -odiscard. The same protections must remain 8599 * in place until the extents have been discarded completely when 8600 * the transaction commit has completed. 8601 */ 8602 remove_em = (atomic_read(&block_group->trimming) == 0); 8603 spin_unlock(&block_group->lock); 8604 8605 mutex_unlock(&fs_info->chunk_mutex); 8606 8607 ret = remove_block_group_free_space(trans, block_group); 8608 if (ret) 8609 goto out; 8610 8611 btrfs_put_block_group(block_group); 8612 btrfs_put_block_group(block_group); 8613 8614 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 8615 if (ret > 0) 8616 ret = -EIO; 8617 if (ret < 0) 8618 goto out; 8619 8620 ret = btrfs_del_item(trans, root, path); 8621 if (ret) 8622 goto out; 8623 8624 if (remove_em) { 8625 struct extent_map_tree *em_tree; 8626 8627 em_tree = &fs_info->mapping_tree; 8628 write_lock(&em_tree->lock); 8629 remove_extent_mapping(em_tree, em); 8630 write_unlock(&em_tree->lock); 8631 /* once for the tree */ 8632 free_extent_map(em); 8633 } 8634 out: 8635 if (remove_rsv) 8636 btrfs_delayed_refs_rsv_release(fs_info, 1); 8637 btrfs_free_path(path); 8638 return ret; 8639 } 8640 8641 struct btrfs_trans_handle * 8642 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 8643 const u64 chunk_offset) 8644 { 8645 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 8646 struct extent_map *em; 8647 struct map_lookup *map; 8648 unsigned int num_items; 8649 8650 read_lock(&em_tree->lock); 8651 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 8652 read_unlock(&em_tree->lock); 8653 ASSERT(em && em->start == chunk_offset); 8654 8655 /* 8656 * We need to reserve 3 + N units from the metadata space info in order 8657 * to remove a block group (done at btrfs_remove_chunk() and at 8658 * btrfs_remove_block_group()), which are used for: 8659 * 8660 * 1 unit for adding the free space inode's orphan (located in the tree 8661 * of tree roots). 8662 * 1 unit for deleting the block group item (located in the extent 8663 * tree). 8664 * 1 unit for deleting the free space item (located in tree of tree 8665 * roots). 8666 * N units for deleting N device extent items corresponding to each 8667 * stripe (located in the device tree). 8668 * 8669 * In order to remove a block group we also need to reserve units in the 8670 * system space info in order to update the chunk tree (update one or 8671 * more device items and remove one chunk item), but this is done at 8672 * btrfs_remove_chunk() through a call to check_system_chunk(). 8673 */ 8674 map = em->map_lookup; 8675 num_items = 3 + map->num_stripes; 8676 free_extent_map(em); 8677 8678 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 8679 num_items, 1); 8680 } 8681 8682 /* 8683 * Process the unused_bgs list and remove any that don't have any allocated 8684 * space inside of them. 
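 *
 * Block groups end up on the unused_bgs list via btrfs_mark_bg_unused(),
 * for example when btrfs_read_block_groups() finds a block group with
 * zero used bytes.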
8685 */ 8686 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 8687 { 8688 struct btrfs_block_group_cache *block_group; 8689 struct btrfs_space_info *space_info; 8690 struct btrfs_trans_handle *trans; 8691 int ret = 0; 8692 8693 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 8694 return; 8695 8696 spin_lock(&fs_info->unused_bgs_lock); 8697 while (!list_empty(&fs_info->unused_bgs)) { 8698 u64 start, end; 8699 int trimming; 8700 8701 block_group = list_first_entry(&fs_info->unused_bgs, 8702 struct btrfs_block_group_cache, 8703 bg_list); 8704 list_del_init(&block_group->bg_list); 8705 8706 space_info = block_group->space_info; 8707 8708 if (ret || btrfs_mixed_space_info(space_info)) { 8709 btrfs_put_block_group(block_group); 8710 continue; 8711 } 8712 spin_unlock(&fs_info->unused_bgs_lock); 8713 8714 mutex_lock(&fs_info->delete_unused_bgs_mutex); 8715 8716 /* Don't want to race with allocators so take the groups_sem */ 8717 down_write(&space_info->groups_sem); 8718 spin_lock(&block_group->lock); 8719 if (block_group->reserved || block_group->pinned || 8720 btrfs_block_group_used(&block_group->item) || 8721 block_group->ro || 8722 list_is_singular(&block_group->list)) { 8723 /* 8724 * We want to bail if we made new allocations or have 8725 * outstanding allocations in this block group. We do 8726 * the ro check in case balance is currently acting on 8727 * this block group. 8728 */ 8729 trace_btrfs_skip_unused_block_group(block_group); 8730 spin_unlock(&block_group->lock); 8731 up_write(&space_info->groups_sem); 8732 goto next; 8733 } 8734 spin_unlock(&block_group->lock); 8735 8736 /* We don't want to force the issue, only flip if it's ok. */ 8737 ret = inc_block_group_ro(block_group, 0); 8738 up_write(&space_info->groups_sem); 8739 if (ret < 0) { 8740 ret = 0; 8741 goto next; 8742 } 8743 8744 /* 8745 * Want to do this before we do anything else so we can recover 8746 * properly if we fail to join the transaction. 8747 */ 8748 trans = btrfs_start_trans_remove_block_group(fs_info, 8749 block_group->key.objectid); 8750 if (IS_ERR(trans)) { 8751 btrfs_dec_block_group_ro(block_group); 8752 ret = PTR_ERR(trans); 8753 goto next; 8754 } 8755 8756 /* 8757 * We could have pending pinned extents for this block group, 8758 * just delete them, we don't care about them anymore. 8759 */ 8760 start = block_group->key.objectid; 8761 end = start + block_group->key.offset - 1; 8762 /* 8763 * Hold the unused_bg_unpin_mutex lock to avoid racing with 8764 * btrfs_finish_extent_commit(). If we are at transaction N, 8765 * another task might be running finish_extent_commit() for the 8766 * previous transaction N - 1, and have seen a range belonging 8767 * to the block group in freed_extents[] before we were able to 8768 * clear the whole block group range from freed_extents[]. This 8769 * means that task can lookup for the block group after we 8770 * unpinned it from freed_extents[] and removed it, leading to 8771 * a BUG_ON() at btrfs_unpin_extent_range(). 
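 *
 * Clearing the ranges from both freed_extents trees below, while
 * holding unused_bg_unpin_mutex, is what closes that window.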
8772 */ 8773 mutex_lock(&fs_info->unused_bg_unpin_mutex); 8774 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 8775 EXTENT_DIRTY); 8776 if (ret) { 8777 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 8778 btrfs_dec_block_group_ro(block_group); 8779 goto end_trans; 8780 } 8781 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 8782 EXTENT_DIRTY); 8783 if (ret) { 8784 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 8785 btrfs_dec_block_group_ro(block_group); 8786 goto end_trans; 8787 } 8788 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 8789 8790 /* Reset pinned so btrfs_put_block_group doesn't complain */ 8791 spin_lock(&space_info->lock); 8792 spin_lock(&block_group->lock); 8793 8794 btrfs_space_info_update_bytes_pinned(fs_info, space_info, 8795 -block_group->pinned); 8796 space_info->bytes_readonly += block_group->pinned; 8797 percpu_counter_add_batch(&space_info->total_bytes_pinned, 8798 -block_group->pinned, 8799 BTRFS_TOTAL_BYTES_PINNED_BATCH); 8800 block_group->pinned = 0; 8801 8802 spin_unlock(&block_group->lock); 8803 spin_unlock(&space_info->lock); 8804 8805 /* DISCARD can flip during remount */ 8806 trimming = btrfs_test_opt(fs_info, DISCARD); 8807 8808 /* Implicit trim during transaction commit. */ 8809 if (trimming) 8810 btrfs_get_block_group_trimming(block_group); 8811 8812 /* 8813 * Btrfs_remove_chunk will abort the transaction if things go 8814 * horribly wrong. 8815 */ 8816 ret = btrfs_remove_chunk(trans, block_group->key.objectid); 8817 8818 if (ret) { 8819 if (trimming) 8820 btrfs_put_block_group_trimming(block_group); 8821 goto end_trans; 8822 } 8823 8824 /* 8825 * If we're not mounted with -odiscard, we can just forget 8826 * about this block group. Otherwise we'll need to wait 8827 * until transaction commit to do the actual discard. 8828 */ 8829 if (trimming) { 8830 spin_lock(&fs_info->unused_bgs_lock); 8831 /* 8832 * A concurrent scrub might have added us to the list 8833 * fs_info->unused_bgs, so use a list_move operation 8834 * to add the block group to the deleted_bgs list. 8835 */ 8836 list_move(&block_group->bg_list, 8837 &trans->transaction->deleted_bgs); 8838 spin_unlock(&fs_info->unused_bgs_lock); 8839 btrfs_get_block_group(block_group); 8840 } 8841 end_trans: 8842 btrfs_end_transaction(trans); 8843 next: 8844 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 8845 btrfs_put_block_group(block_group); 8846 spin_lock(&fs_info->unused_bgs_lock); 8847 } 8848 spin_unlock(&fs_info->unused_bgs_lock); 8849 } 8850 8851 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 8852 u64 start, u64 end) 8853 { 8854 return unpin_extent_range(fs_info, start, end, false); 8855 } 8856 8857 /* 8858 * It used to be that old block groups would be left around forever. 8859 * Iterating over them would be enough to trim unused space. Since we 8860 * now automatically remove them, we also need to iterate over unallocated 8861 * space. 8862 * 8863 * We don't want a transaction for this since the discard may take a 8864 * substantial amount of time. We don't require that a transaction be 8865 * running, but we do need to take a running transaction into account 8866 * to ensure that we're not discarding chunks that were released or 8867 * allocated in the current transaction. 8868 * 8869 * Holding the chunks lock will prevent other threads from allocating 8870 * or releasing chunks, but it won't prevent a running transaction 8871 * from committing and releasing the memory that the pending chunks 8872 * list head uses. 
For that, we need to take a reference to the 8873 * transaction and hold the commit root sem. We only need to hold 8874 * it while performing the free space search since we have already 8875 * held back allocations. 8876 */ 8877 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) 8878 { 8879 u64 start = SZ_1M, len = 0, end = 0; 8880 int ret; 8881 8882 *trimmed = 0; 8883 8884 /* Discard not supported = nothing to do. */ 8885 if (!blk_queue_discard(bdev_get_queue(device->bdev))) 8886 return 0; 8887 8888 /* Not writable = nothing to do. */ 8889 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 8890 return 0; 8891 8892 /* No free space = nothing to do. */ 8893 if (device->total_bytes <= device->bytes_used) 8894 return 0; 8895 8896 ret = 0; 8897 8898 while (1) { 8899 struct btrfs_fs_info *fs_info = device->fs_info; 8900 u64 bytes; 8901 8902 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 8903 if (ret) 8904 break; 8905 8906 find_first_clear_extent_bit(&device->alloc_state, start, 8907 &start, &end, 8908 CHUNK_TRIMMED | CHUNK_ALLOCATED); 8909 8910 /* Ensure we skip the reserved area in the first 1M */ 8911 start = max_t(u64, start, SZ_1M); 8912 8913 /* 8914 * If find_first_clear_extent_bit find a range that spans the 8915 * end of the device it will set end to -1, in this case it's up 8916 * to the caller to trim the value to the size of the device. 8917 */ 8918 end = min(end, device->total_bytes - 1); 8919 8920 len = end - start + 1; 8921 8922 /* We didn't find any extents */ 8923 if (!len) { 8924 mutex_unlock(&fs_info->chunk_mutex); 8925 ret = 0; 8926 break; 8927 } 8928 8929 ret = btrfs_issue_discard(device->bdev, start, len, 8930 &bytes); 8931 if (!ret) 8932 set_extent_bits(&device->alloc_state, start, 8933 start + bytes - 1, 8934 CHUNK_TRIMMED); 8935 mutex_unlock(&fs_info->chunk_mutex); 8936 8937 if (ret) 8938 break; 8939 8940 start += len; 8941 *trimmed += bytes; 8942 8943 if (fatal_signal_pending(current)) { 8944 ret = -ERESTARTSYS; 8945 break; 8946 } 8947 8948 cond_resched(); 8949 } 8950 8951 return ret; 8952 } 8953 8954 /* 8955 * Trim the whole filesystem by: 8956 * 1) trimming the free space in each block group 8957 * 2) trimming the unallocated space on each device 8958 * 8959 * This will also continue trimming even if a block group or device encounters 8960 * an error. The return value will be the last error, or 0 if nothing bad 8961 * happens. 8962 */ 8963 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 8964 { 8965 struct btrfs_block_group_cache *cache = NULL; 8966 struct btrfs_device *device; 8967 struct list_head *devices; 8968 u64 group_trimmed; 8969 u64 range_end = U64_MAX; 8970 u64 start; 8971 u64 end; 8972 u64 trimmed = 0; 8973 u64 bg_failed = 0; 8974 u64 dev_failed = 0; 8975 int bg_ret = 0; 8976 int dev_ret = 0; 8977 int ret = 0; 8978 8979 /* 8980 * Check range overflow if range->len is set. 8981 * The default range->len is U64_MAX. 
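 *
 * For example, a start close to U64_MAX combined with a large len would
 * wrap around; check_add_overflow() below catches that and we return
 * -EINVAL.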
8982 */ 8983 if (range->len != U64_MAX && 8984 check_add_overflow(range->start, range->len, &range_end)) 8985 return -EINVAL; 8986 8987 cache = btrfs_lookup_first_block_group(fs_info, range->start); 8988 for (; cache; cache = next_block_group(cache)) { 8989 if (cache->key.objectid >= range_end) { 8990 btrfs_put_block_group(cache); 8991 break; 8992 } 8993 8994 start = max(range->start, cache->key.objectid); 8995 end = min(range_end, cache->key.objectid + cache->key.offset); 8996 8997 if (end - start >= range->minlen) { 8998 if (!block_group_cache_done(cache)) { 8999 ret = cache_block_group(cache, 0); 9000 if (ret) { 9001 bg_failed++; 9002 bg_ret = ret; 9003 continue; 9004 } 9005 ret = wait_block_group_cache_done(cache); 9006 if (ret) { 9007 bg_failed++; 9008 bg_ret = ret; 9009 continue; 9010 } 9011 } 9012 ret = btrfs_trim_block_group(cache, 9013 &group_trimmed, 9014 start, 9015 end, 9016 range->minlen); 9017 9018 trimmed += group_trimmed; 9019 if (ret) { 9020 bg_failed++; 9021 bg_ret = ret; 9022 continue; 9023 } 9024 } 9025 } 9026 9027 if (bg_failed) 9028 btrfs_warn(fs_info, 9029 "failed to trim %llu block group(s), last error %d", 9030 bg_failed, bg_ret); 9031 mutex_lock(&fs_info->fs_devices->device_list_mutex); 9032 devices = &fs_info->fs_devices->devices; 9033 list_for_each_entry(device, devices, dev_list) { 9034 ret = btrfs_trim_free_extents(device, &group_trimmed); 9035 if (ret) { 9036 dev_failed++; 9037 dev_ret = ret; 9038 break; 9039 } 9040 9041 trimmed += group_trimmed; 9042 } 9043 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 9044 9045 if (dev_failed) 9046 btrfs_warn(fs_info, 9047 "failed to trim %llu device(s), last error %d", 9048 dev_failed, dev_ret); 9049 range->len = trimmed; 9050 if (bg_ret) 9051 return bg_ret; 9052 return dev_ret; 9053 } 9054 9055 /* 9056 * btrfs_{start,end}_write_no_snapshotting() are similar to 9057 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing 9058 * data into the page cache through nocow before the subvolume is snapshoted, 9059 * but flush the data into disk after the snapshot creation, or to prevent 9060 * operations while snapshotting is ongoing and that cause the snapshot to be 9061 * inconsistent (writes followed by expanding truncates for example). 9062 */ 9063 void btrfs_end_write_no_snapshotting(struct btrfs_root *root) 9064 { 9065 percpu_counter_dec(&root->subv_writers->counter); 9066 cond_wake_up(&root->subv_writers->wait); 9067 } 9068 9069 int btrfs_start_write_no_snapshotting(struct btrfs_root *root) 9070 { 9071 if (atomic_read(&root->will_be_snapshotted)) 9072 return 0; 9073 9074 percpu_counter_inc(&root->subv_writers->counter); 9075 /* 9076 * Make sure counter is updated before we check for snapshot creation. 
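 *
 * That is, the increment of subv_writers->counter above must be visible
 * to the snapshot side before we re-read will_be_snapshotted below,
 * otherwise this writer and a snapshot started in between could miss
 * each other.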
9077 */ 9078 smp_mb(); 9079 if (atomic_read(&root->will_be_snapshotted)) { 9080 btrfs_end_write_no_snapshotting(root); 9081 return 0; 9082 } 9083 return 1; 9084 } 9085 9086 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) 9087 { 9088 while (true) { 9089 int ret; 9090 9091 ret = btrfs_start_write_no_snapshotting(root); 9092 if (ret) 9093 break; 9094 wait_var_event(&root->will_be_snapshotted, 9095 !atomic_read(&root->will_be_snapshotted)); 9096 } 9097 } 9098 9099 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) 9100 { 9101 struct btrfs_fs_info *fs_info = bg->fs_info; 9102 9103 spin_lock(&fs_info->unused_bgs_lock); 9104 if (list_empty(&bg->bg_list)) { 9105 btrfs_get_block_group(bg); 9106 trace_btrfs_add_unused_block_group(bg); 9107 list_add_tail(&bg->bg_list, &fs_info->unused_bgs); 9108 } 9109 spin_unlock(&fs_info->unused_bgs_lock); 9110 } 9111