1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/signal.h> 8 #include <linux/pagemap.h> 9 #include <linux/writeback.h> 10 #include <linux/blkdev.h> 11 #include <linux/sort.h> 12 #include <linux/rcupdate.h> 13 #include <linux/kthread.h> 14 #include <linux/slab.h> 15 #include <linux/ratelimit.h> 16 #include <linux/percpu_counter.h> 17 #include <linux/lockdep.h> 18 #include <linux/crc32c.h> 19 #include "tree-log.h" 20 #include "disk-io.h" 21 #include "print-tree.h" 22 #include "volumes.h" 23 #include "raid56.h" 24 #include "locking.h" 25 #include "free-space-cache.h" 26 #include "free-space-tree.h" 27 #include "math.h" 28 #include "sysfs.h" 29 #include "qgroup.h" 30 #include "ref-verify.h" 31 32 #undef SCRAMBLE_DELAYED_REFS 33 34 /* 35 * control flags for do_chunk_alloc's force field 36 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 37 * if we really need one. 38 * 39 * CHUNK_ALLOC_LIMITED means to only try and allocate one 40 * if we have very few chunks already allocated. This is 41 * used as part of the clustering code to help make sure 42 * we have a good pool of storage to cluster in, without 43 * filling the FS with empty chunks 44 * 45 * CHUNK_ALLOC_FORCE means it must try to allocate one 46 * 47 */ 48 enum { 49 CHUNK_ALLOC_NO_FORCE = 0, 50 CHUNK_ALLOC_LIMITED = 1, 51 CHUNK_ALLOC_FORCE = 2, 52 }; 53 54 /* 55 * Declare a helper function to detect underflow of various space info members 56 */ 57 #define DECLARE_SPACE_INFO_UPDATE(name) \ 58 static inline void update_##name(struct btrfs_space_info *sinfo, \ 59 s64 bytes) \ 60 { \ 61 if (bytes < 0 && sinfo->name < -bytes) { \ 62 WARN_ON(1); \ 63 sinfo->name = 0; \ 64 return; \ 65 } \ 66 sinfo->name += bytes; \ 67 } 68 69 DECLARE_SPACE_INFO_UPDATE(bytes_may_use); 70 DECLARE_SPACE_INFO_UPDATE(bytes_pinned); 71 72 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 73 struct btrfs_delayed_ref_node *node, u64 parent, 74 u64 root_objectid, u64 owner_objectid, 75 u64 owner_offset, int refs_to_drop, 76 struct btrfs_delayed_extent_op *extra_op); 77 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 78 struct extent_buffer *leaf, 79 struct btrfs_extent_item *ei); 80 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 81 u64 parent, u64 root_objectid, 82 u64 flags, u64 owner, u64 offset, 83 struct btrfs_key *ins, int ref_mod); 84 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 85 struct btrfs_delayed_ref_node *node, 86 struct btrfs_delayed_extent_op *extent_op); 87 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 88 int force); 89 static int find_next_key(struct btrfs_path *path, int level, 90 struct btrfs_key *key); 91 static void dump_space_info(struct btrfs_fs_info *fs_info, 92 struct btrfs_space_info *info, u64 bytes, 93 int dump_block_groups); 94 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 95 u64 num_bytes); 96 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 97 struct btrfs_space_info *space_info, 98 u64 num_bytes); 99 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 100 struct btrfs_space_info *space_info, 101 u64 num_bytes); 102 103 static noinline int 104 block_group_cache_done(struct btrfs_block_group_cache *cache) 105 { 106 smp_mb(); 107 return cache->cached == BTRFS_CACHE_FINISHED || 108 cache->cached == BTRFS_CACHE_ERROR; 109 } 110 111 static int 
block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 112 { 113 return (cache->flags & bits) == bits; 114 } 115 116 void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 117 { 118 atomic_inc(&cache->count); 119 } 120 121 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 122 { 123 if (atomic_dec_and_test(&cache->count)) { 124 WARN_ON(cache->pinned > 0); 125 WARN_ON(cache->reserved > 0); 126 127 /* 128 * If not empty, someone is still holding mutex of 129 * full_stripe_lock, which can only be released by caller. 130 * And it will definitely cause use-after-free when caller 131 * tries to release full stripe lock. 132 * 133 * No better way to resolve, but only to warn. 134 */ 135 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); 136 kfree(cache->free_space_ctl); 137 kfree(cache); 138 } 139 } 140 141 /* 142 * this adds the block group to the fs_info rb tree for the block group 143 * cache 144 */ 145 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 146 struct btrfs_block_group_cache *block_group) 147 { 148 struct rb_node **p; 149 struct rb_node *parent = NULL; 150 struct btrfs_block_group_cache *cache; 151 152 spin_lock(&info->block_group_cache_lock); 153 p = &info->block_group_cache_tree.rb_node; 154 155 while (*p) { 156 parent = *p; 157 cache = rb_entry(parent, struct btrfs_block_group_cache, 158 cache_node); 159 if (block_group->key.objectid < cache->key.objectid) { 160 p = &(*p)->rb_left; 161 } else if (block_group->key.objectid > cache->key.objectid) { 162 p = &(*p)->rb_right; 163 } else { 164 spin_unlock(&info->block_group_cache_lock); 165 return -EEXIST; 166 } 167 } 168 169 rb_link_node(&block_group->cache_node, parent, p); 170 rb_insert_color(&block_group->cache_node, 171 &info->block_group_cache_tree); 172 173 if (info->first_logical_byte > block_group->key.objectid) 174 info->first_logical_byte = block_group->key.objectid; 175 176 spin_unlock(&info->block_group_cache_lock); 177 178 return 0; 179 } 180 181 /* 182 * This will return the block group at or after bytenr if contains is 0, else 183 * it will return the block group that contains the bytenr 184 */ 185 static struct btrfs_block_group_cache * 186 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 187 int contains) 188 { 189 struct btrfs_block_group_cache *cache, *ret = NULL; 190 struct rb_node *n; 191 u64 end, start; 192 193 spin_lock(&info->block_group_cache_lock); 194 n = info->block_group_cache_tree.rb_node; 195 196 while (n) { 197 cache = rb_entry(n, struct btrfs_block_group_cache, 198 cache_node); 199 end = cache->key.objectid + cache->key.offset - 1; 200 start = cache->key.objectid; 201 202 if (bytenr < start) { 203 if (!contains && (!ret || start < ret->key.objectid)) 204 ret = cache; 205 n = n->rb_left; 206 } else if (bytenr > start) { 207 if (contains && bytenr <= end) { 208 ret = cache; 209 break; 210 } 211 n = n->rb_right; 212 } else { 213 ret = cache; 214 break; 215 } 216 } 217 if (ret) { 218 btrfs_get_block_group(ret); 219 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 220 info->first_logical_byte = ret->key.objectid; 221 } 222 spin_unlock(&info->block_group_cache_lock); 223 224 return ret; 225 } 226 227 static int add_excluded_extent(struct btrfs_fs_info *fs_info, 228 u64 start, u64 num_bytes) 229 { 230 u64 end = start + num_bytes - 1; 231 set_extent_bits(&fs_info->freed_extents[0], 232 start, end, EXTENT_UPTODATE); 233 set_extent_bits(&fs_info->freed_extents[1], 234 start, end, EXTENT_UPTODATE); 235 return 0; 
236 } 237 238 static void free_excluded_extents(struct btrfs_block_group_cache *cache) 239 { 240 struct btrfs_fs_info *fs_info = cache->fs_info; 241 u64 start, end; 242 243 start = cache->key.objectid; 244 end = start + cache->key.offset - 1; 245 246 clear_extent_bits(&fs_info->freed_extents[0], 247 start, end, EXTENT_UPTODATE); 248 clear_extent_bits(&fs_info->freed_extents[1], 249 start, end, EXTENT_UPTODATE); 250 } 251 252 static int exclude_super_stripes(struct btrfs_block_group_cache *cache) 253 { 254 struct btrfs_fs_info *fs_info = cache->fs_info; 255 u64 bytenr; 256 u64 *logical; 257 int stripe_len; 258 int i, nr, ret; 259 260 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 261 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 262 cache->bytes_super += stripe_len; 263 ret = add_excluded_extent(fs_info, cache->key.objectid, 264 stripe_len); 265 if (ret) 266 return ret; 267 } 268 269 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 270 bytenr = btrfs_sb_offset(i); 271 ret = btrfs_rmap_block(fs_info, cache->key.objectid, 272 bytenr, &logical, &nr, &stripe_len); 273 if (ret) 274 return ret; 275 276 while (nr--) { 277 u64 start, len; 278 279 if (logical[nr] > cache->key.objectid + 280 cache->key.offset) 281 continue; 282 283 if (logical[nr] + stripe_len <= cache->key.objectid) 284 continue; 285 286 start = logical[nr]; 287 if (start < cache->key.objectid) { 288 start = cache->key.objectid; 289 len = (logical[nr] + stripe_len) - start; 290 } else { 291 len = min_t(u64, stripe_len, 292 cache->key.objectid + 293 cache->key.offset - start); 294 } 295 296 cache->bytes_super += len; 297 ret = add_excluded_extent(fs_info, start, len); 298 if (ret) { 299 kfree(logical); 300 return ret; 301 } 302 } 303 304 kfree(logical); 305 } 306 return 0; 307 } 308 309 static struct btrfs_caching_control * 310 get_caching_control(struct btrfs_block_group_cache *cache) 311 { 312 struct btrfs_caching_control *ctl; 313 314 spin_lock(&cache->lock); 315 if (!cache->caching_ctl) { 316 spin_unlock(&cache->lock); 317 return NULL; 318 } 319 320 ctl = cache->caching_ctl; 321 refcount_inc(&ctl->count); 322 spin_unlock(&cache->lock); 323 return ctl; 324 } 325 326 static void put_caching_control(struct btrfs_caching_control *ctl) 327 { 328 if (refcount_dec_and_test(&ctl->count)) 329 kfree(ctl); 330 } 331 332 #ifdef CONFIG_BTRFS_DEBUG 333 static void fragment_free_space(struct btrfs_block_group_cache *block_group) 334 { 335 struct btrfs_fs_info *fs_info = block_group->fs_info; 336 u64 start = block_group->key.objectid; 337 u64 len = block_group->key.offset; 338 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 339 fs_info->nodesize : fs_info->sectorsize; 340 u64 step = chunk << 1; 341 342 while (len > chunk) { 343 btrfs_remove_free_space(block_group, start, chunk); 344 start += step; 345 if (len < step) 346 len = 0; 347 else 348 len -= step; 349 } 350 } 351 #endif 352 353 /* 354 * this is only called by cache_block_group, since we could have freed extents 355 * we need to check the pinned_extents for any extents that can't be used yet 356 * since their free space will be released as soon as the transaction commits. 
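 *
 * The total number of bytes put back into the free space cache is returned,
 * so the caller can throttle wakeups of anybody waiting on the caching
 * control.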
 */
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
		       u64 start, u64 end)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.
So we skip locking and search the commit 430 * root, since its read-only 431 */ 432 path->skip_locking = 1; 433 path->search_commit_root = 1; 434 path->reada = READA_FORWARD; 435 436 key.objectid = last; 437 key.offset = 0; 438 key.type = BTRFS_EXTENT_ITEM_KEY; 439 440 next: 441 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 442 if (ret < 0) 443 goto out; 444 445 leaf = path->nodes[0]; 446 nritems = btrfs_header_nritems(leaf); 447 448 while (1) { 449 if (btrfs_fs_closing(fs_info) > 1) { 450 last = (u64)-1; 451 break; 452 } 453 454 if (path->slots[0] < nritems) { 455 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 456 } else { 457 ret = find_next_key(path, 0, &key); 458 if (ret) 459 break; 460 461 if (need_resched() || 462 rwsem_is_contended(&fs_info->commit_root_sem)) { 463 if (wakeup) 464 caching_ctl->progress = last; 465 btrfs_release_path(path); 466 up_read(&fs_info->commit_root_sem); 467 mutex_unlock(&caching_ctl->mutex); 468 cond_resched(); 469 mutex_lock(&caching_ctl->mutex); 470 down_read(&fs_info->commit_root_sem); 471 goto next; 472 } 473 474 ret = btrfs_next_leaf(extent_root, path); 475 if (ret < 0) 476 goto out; 477 if (ret) 478 break; 479 leaf = path->nodes[0]; 480 nritems = btrfs_header_nritems(leaf); 481 continue; 482 } 483 484 if (key.objectid < last) { 485 key.objectid = last; 486 key.offset = 0; 487 key.type = BTRFS_EXTENT_ITEM_KEY; 488 489 if (wakeup) 490 caching_ctl->progress = last; 491 btrfs_release_path(path); 492 goto next; 493 } 494 495 if (key.objectid < block_group->key.objectid) { 496 path->slots[0]++; 497 continue; 498 } 499 500 if (key.objectid >= block_group->key.objectid + 501 block_group->key.offset) 502 break; 503 504 if (key.type == BTRFS_EXTENT_ITEM_KEY || 505 key.type == BTRFS_METADATA_ITEM_KEY) { 506 total_found += add_new_free_space(block_group, last, 507 key.objectid); 508 if (key.type == BTRFS_METADATA_ITEM_KEY) 509 last = key.objectid + 510 fs_info->nodesize; 511 else 512 last = key.objectid + key.offset; 513 514 if (total_found > CACHING_CTL_WAKE_UP) { 515 total_found = 0; 516 if (wakeup) 517 wake_up(&caching_ctl->wait); 518 } 519 } 520 path->slots[0]++; 521 } 522 ret = 0; 523 524 total_found += add_new_free_space(block_group, last, 525 block_group->key.objectid + 526 block_group->key.offset); 527 caching_ctl->progress = (u64)-1; 528 529 out: 530 btrfs_free_path(path); 531 return ret; 532 } 533 534 static noinline void caching_thread(struct btrfs_work *work) 535 { 536 struct btrfs_block_group_cache *block_group; 537 struct btrfs_fs_info *fs_info; 538 struct btrfs_caching_control *caching_ctl; 539 int ret; 540 541 caching_ctl = container_of(work, struct btrfs_caching_control, work); 542 block_group = caching_ctl->block_group; 543 fs_info = block_group->fs_info; 544 545 mutex_lock(&caching_ctl->mutex); 546 down_read(&fs_info->commit_root_sem); 547 548 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 549 ret = load_free_space_tree(caching_ctl); 550 else 551 ret = load_extent_tree_free(caching_ctl); 552 553 spin_lock(&block_group->lock); 554 block_group->caching_ctl = NULL; 555 block_group->cached = ret ? 
		BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info and some other thread starts a
	 * transaction commit which tries to do an allocation while the first
	 * thread is still loading the space cache info. The previous loop
	 * should have kept us from choosing this block group, but if we've
	 * moved to the state where we will wait on caching block groups we
	 * need to first check if we're doing a fast load here, so we can wait
	 * for it to finish, otherwise we could end up allocating from a
	 * block group whose cache gets evicted for one reason or
	 * another.
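	 *
	 * The wait loop below re-checks cache->cached under cache->lock on
	 * every iteration: the lock is dropped before schedule(), so the
	 * caching thread may have finished (or hit an error) while we slept.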
618 */ 619 while (cache->cached == BTRFS_CACHE_FAST) { 620 struct btrfs_caching_control *ctl; 621 622 ctl = cache->caching_ctl; 623 refcount_inc(&ctl->count); 624 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 625 spin_unlock(&cache->lock); 626 627 schedule(); 628 629 finish_wait(&ctl->wait, &wait); 630 put_caching_control(ctl); 631 spin_lock(&cache->lock); 632 } 633 634 if (cache->cached != BTRFS_CACHE_NO) { 635 spin_unlock(&cache->lock); 636 kfree(caching_ctl); 637 return 0; 638 } 639 WARN_ON(cache->caching_ctl); 640 cache->caching_ctl = caching_ctl; 641 cache->cached = BTRFS_CACHE_FAST; 642 spin_unlock(&cache->lock); 643 644 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 645 mutex_lock(&caching_ctl->mutex); 646 ret = load_free_space_cache(cache); 647 648 spin_lock(&cache->lock); 649 if (ret == 1) { 650 cache->caching_ctl = NULL; 651 cache->cached = BTRFS_CACHE_FINISHED; 652 cache->last_byte_to_unpin = (u64)-1; 653 caching_ctl->progress = (u64)-1; 654 } else { 655 if (load_cache_only) { 656 cache->caching_ctl = NULL; 657 cache->cached = BTRFS_CACHE_NO; 658 } else { 659 cache->cached = BTRFS_CACHE_STARTED; 660 cache->has_caching_ctl = 1; 661 } 662 } 663 spin_unlock(&cache->lock); 664 #ifdef CONFIG_BTRFS_DEBUG 665 if (ret == 1 && 666 btrfs_should_fragment_free_space(cache)) { 667 u64 bytes_used; 668 669 spin_lock(&cache->space_info->lock); 670 spin_lock(&cache->lock); 671 bytes_used = cache->key.offset - 672 btrfs_block_group_used(&cache->item); 673 cache->space_info->bytes_used += bytes_used >> 1; 674 spin_unlock(&cache->lock); 675 spin_unlock(&cache->space_info->lock); 676 fragment_free_space(cache); 677 } 678 #endif 679 mutex_unlock(&caching_ctl->mutex); 680 681 wake_up(&caching_ctl->wait); 682 if (ret == 1) { 683 put_caching_control(caching_ctl); 684 free_excluded_extents(cache); 685 return 0; 686 } 687 } else { 688 /* 689 * We're either using the free space tree or no caching at all. 690 * Set cached to the appropriate value and wakeup any waiters. 
691 */ 692 spin_lock(&cache->lock); 693 if (load_cache_only) { 694 cache->caching_ctl = NULL; 695 cache->cached = BTRFS_CACHE_NO; 696 } else { 697 cache->cached = BTRFS_CACHE_STARTED; 698 cache->has_caching_ctl = 1; 699 } 700 spin_unlock(&cache->lock); 701 wake_up(&caching_ctl->wait); 702 } 703 704 if (load_cache_only) { 705 put_caching_control(caching_ctl); 706 return 0; 707 } 708 709 down_write(&fs_info->commit_root_sem); 710 refcount_inc(&caching_ctl->count); 711 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 712 up_write(&fs_info->commit_root_sem); 713 714 btrfs_get_block_group(cache); 715 716 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 717 718 return ret; 719 } 720 721 /* 722 * return the block group that starts at or after bytenr 723 */ 724 static struct btrfs_block_group_cache * 725 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 726 { 727 return block_group_cache_tree_search(info, bytenr, 0); 728 } 729 730 /* 731 * return the block group that contains the given bytenr 732 */ 733 struct btrfs_block_group_cache *btrfs_lookup_block_group( 734 struct btrfs_fs_info *info, 735 u64 bytenr) 736 { 737 return block_group_cache_tree_search(info, bytenr, 1); 738 } 739 740 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 741 u64 flags) 742 { 743 struct list_head *head = &info->space_info; 744 struct btrfs_space_info *found; 745 746 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 747 748 rcu_read_lock(); 749 list_for_each_entry_rcu(found, head, list) { 750 if (found->flags & flags) { 751 rcu_read_unlock(); 752 return found; 753 } 754 } 755 rcu_read_unlock(); 756 return NULL; 757 } 758 759 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, 760 struct btrfs_ref *ref) 761 { 762 struct btrfs_space_info *space_info; 763 s64 num_bytes = -ref->len; 764 u64 flags; 765 766 if (ref->type == BTRFS_REF_METADATA) { 767 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) 768 flags = BTRFS_BLOCK_GROUP_SYSTEM; 769 else 770 flags = BTRFS_BLOCK_GROUP_METADATA; 771 } else { 772 flags = BTRFS_BLOCK_GROUP_DATA; 773 } 774 775 space_info = __find_space_info(fs_info, flags); 776 ASSERT(space_info); 777 percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, 778 BTRFS_TOTAL_BYTES_PINNED_BATCH); 779 } 780 781 /* 782 * after adding space to the filesystem, we need to clear the full flags 783 * on all the space infos. 784 */ 785 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 786 { 787 struct list_head *head = &info->space_info; 788 struct btrfs_space_info *found; 789 790 rcu_read_lock(); 791 list_for_each_entry_rcu(found, head, list) 792 found->full = 0; 793 rcu_read_unlock(); 794 } 795 796 /* simple helper to search for an existing data extent at a given offset */ 797 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) 798 { 799 int ret; 800 struct btrfs_key key; 801 struct btrfs_path *path; 802 803 path = btrfs_alloc_path(); 804 if (!path) 805 return -ENOMEM; 806 807 key.objectid = start; 808 key.offset = len; 809 key.type = BTRFS_EXTENT_ITEM_KEY; 810 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 811 btrfs_free_path(path); 812 return ret; 813 } 814 815 /* 816 * helper function to lookup reference count and flags of a tree block. 817 * 818 * the head node for delayed ref is used to store the sum of all the 819 * reference count modifications queued up in the rbtree. the head 820 * node may also store the extent flags to set. 
This way you can check 821 * to see what the reference count and extent flags would be if all of 822 * the delayed refs are not processed. 823 */ 824 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 825 struct btrfs_fs_info *fs_info, u64 bytenr, 826 u64 offset, int metadata, u64 *refs, u64 *flags) 827 { 828 struct btrfs_delayed_ref_head *head; 829 struct btrfs_delayed_ref_root *delayed_refs; 830 struct btrfs_path *path; 831 struct btrfs_extent_item *ei; 832 struct extent_buffer *leaf; 833 struct btrfs_key key; 834 u32 item_size; 835 u64 num_refs; 836 u64 extent_flags; 837 int ret; 838 839 /* 840 * If we don't have skinny metadata, don't bother doing anything 841 * different 842 */ 843 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 844 offset = fs_info->nodesize; 845 metadata = 0; 846 } 847 848 path = btrfs_alloc_path(); 849 if (!path) 850 return -ENOMEM; 851 852 if (!trans) { 853 path->skip_locking = 1; 854 path->search_commit_root = 1; 855 } 856 857 search_again: 858 key.objectid = bytenr; 859 key.offset = offset; 860 if (metadata) 861 key.type = BTRFS_METADATA_ITEM_KEY; 862 else 863 key.type = BTRFS_EXTENT_ITEM_KEY; 864 865 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); 866 if (ret < 0) 867 goto out_free; 868 869 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 870 if (path->slots[0]) { 871 path->slots[0]--; 872 btrfs_item_key_to_cpu(path->nodes[0], &key, 873 path->slots[0]); 874 if (key.objectid == bytenr && 875 key.type == BTRFS_EXTENT_ITEM_KEY && 876 key.offset == fs_info->nodesize) 877 ret = 0; 878 } 879 } 880 881 if (ret == 0) { 882 leaf = path->nodes[0]; 883 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 884 if (item_size >= sizeof(*ei)) { 885 ei = btrfs_item_ptr(leaf, path->slots[0], 886 struct btrfs_extent_item); 887 num_refs = btrfs_extent_refs(leaf, ei); 888 extent_flags = btrfs_extent_flags(leaf, ei); 889 } else { 890 ret = -EINVAL; 891 btrfs_print_v0_err(fs_info); 892 if (trans) 893 btrfs_abort_transaction(trans, ret); 894 else 895 btrfs_handle_fs_error(fs_info, ret, NULL); 896 897 goto out_free; 898 } 899 900 BUG_ON(num_refs == 0); 901 } else { 902 num_refs = 0; 903 extent_flags = 0; 904 ret = 0; 905 } 906 907 if (!trans) 908 goto out; 909 910 delayed_refs = &trans->transaction->delayed_refs; 911 spin_lock(&delayed_refs->lock); 912 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 913 if (head) { 914 if (!mutex_trylock(&head->mutex)) { 915 refcount_inc(&head->refs); 916 spin_unlock(&delayed_refs->lock); 917 918 btrfs_release_path(path); 919 920 /* 921 * Mutex was contended, block until it's released and try 922 * again 923 */ 924 mutex_lock(&head->mutex); 925 mutex_unlock(&head->mutex); 926 btrfs_put_delayed_ref_head(head); 927 goto search_again; 928 } 929 spin_lock(&head->lock); 930 if (head->extent_op && head->extent_op->update_flags) 931 extent_flags |= head->extent_op->flags_to_set; 932 else 933 BUG_ON(num_refs == 0); 934 935 num_refs += head->ref_mod; 936 spin_unlock(&head->lock); 937 mutex_unlock(&head->mutex); 938 } 939 spin_unlock(&delayed_refs->lock); 940 out: 941 WARN_ON(num_refs == 0); 942 if (refs) 943 *refs = num_refs; 944 if (flags) 945 *flags = extent_flags; 946 out_free: 947 btrfs_free_path(path); 948 return ret; 949 } 950 951 /* 952 * Back reference rules. 
 * Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance. This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. The full back ref is actually generic and can
 * be used in all the cases where implicit back refs are used. The major
 * shortcoming of full back refs is their overhead: every time a tree block
 * gets COWed, we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs and add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts. The
 * original implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block and increase the lower level extents' reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
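 *
 * As a purely illustrative sketch (the concrete numbers are made up), an
 * implicit back ref for a data extent referenced by inode 257 at file
 * offset 0 in subvolume 5 would be keyed as
 *
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *      hash_extent_data_ref(5, 257, 0))
 *
 * while a full (shared) back ref for the same extent would use the parent
 * block's bytenr as the key offset instead:
 *
 *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, parent bytenr)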
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used
 * and the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has a parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has a parent tree
				 * block, which must be aligned to
				 * nodesize.
1096 */ 1097 if (offset && 1098 IS_ALIGNED(offset, eb->fs_info->nodesize)) 1099 return type; 1100 } 1101 } else { 1102 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 1103 return type; 1104 } 1105 } 1106 1107 btrfs_print_leaf((struct extent_buffer *)eb); 1108 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", 1109 eb->start, type); 1110 WARN_ON(1); 1111 1112 return BTRFS_REF_TYPE_INVALID; 1113 } 1114 1115 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1116 { 1117 u32 high_crc = ~(u32)0; 1118 u32 low_crc = ~(u32)0; 1119 __le64 lenum; 1120 1121 lenum = cpu_to_le64(root_objectid); 1122 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 1123 lenum = cpu_to_le64(owner); 1124 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1125 lenum = cpu_to_le64(offset); 1126 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1127 1128 return ((u64)high_crc << 31) ^ (u64)low_crc; 1129 } 1130 1131 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1132 struct btrfs_extent_data_ref *ref) 1133 { 1134 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1135 btrfs_extent_data_ref_objectid(leaf, ref), 1136 btrfs_extent_data_ref_offset(leaf, ref)); 1137 } 1138 1139 static int match_extent_data_ref(struct extent_buffer *leaf, 1140 struct btrfs_extent_data_ref *ref, 1141 u64 root_objectid, u64 owner, u64 offset) 1142 { 1143 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1144 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1145 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1146 return 0; 1147 return 1; 1148 } 1149 1150 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1151 struct btrfs_path *path, 1152 u64 bytenr, u64 parent, 1153 u64 root_objectid, 1154 u64 owner, u64 offset) 1155 { 1156 struct btrfs_root *root = trans->fs_info->extent_root; 1157 struct btrfs_key key; 1158 struct btrfs_extent_data_ref *ref; 1159 struct extent_buffer *leaf; 1160 u32 nritems; 1161 int ret; 1162 int recow; 1163 int err = -ENOENT; 1164 1165 key.objectid = bytenr; 1166 if (parent) { 1167 key.type = BTRFS_SHARED_DATA_REF_KEY; 1168 key.offset = parent; 1169 } else { 1170 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1171 key.offset = hash_extent_data_ref(root_objectid, 1172 owner, offset); 1173 } 1174 again: 1175 recow = 0; 1176 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1177 if (ret < 0) { 1178 err = ret; 1179 goto fail; 1180 } 1181 1182 if (parent) { 1183 if (!ret) 1184 return 0; 1185 goto fail; 1186 } 1187 1188 leaf = path->nodes[0]; 1189 nritems = btrfs_header_nritems(leaf); 1190 while (1) { 1191 if (path->slots[0] >= nritems) { 1192 ret = btrfs_next_leaf(root, path); 1193 if (ret < 0) 1194 err = ret; 1195 if (ret) 1196 goto fail; 1197 1198 leaf = path->nodes[0]; 1199 nritems = btrfs_header_nritems(leaf); 1200 recow = 1; 1201 } 1202 1203 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1204 if (key.objectid != bytenr || 1205 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1206 goto fail; 1207 1208 ref = btrfs_item_ptr(leaf, path->slots[0], 1209 struct btrfs_extent_data_ref); 1210 1211 if (match_extent_data_ref(leaf, ref, root_objectid, 1212 owner, offset)) { 1213 if (recow) { 1214 btrfs_release_path(path); 1215 goto again; 1216 } 1217 err = 0; 1218 break; 1219 } 1220 path->slots[0]++; 1221 } 1222 fail: 1223 return err; 1224 } 1225 1226 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1227 struct btrfs_path *path, 1228 u64 bytenr, u64 parent, 1229 u64 root_objectid, u64 owner, 1230 u64 offset, 
int refs_to_add) 1231 { 1232 struct btrfs_root *root = trans->fs_info->extent_root; 1233 struct btrfs_key key; 1234 struct extent_buffer *leaf; 1235 u32 size; 1236 u32 num_refs; 1237 int ret; 1238 1239 key.objectid = bytenr; 1240 if (parent) { 1241 key.type = BTRFS_SHARED_DATA_REF_KEY; 1242 key.offset = parent; 1243 size = sizeof(struct btrfs_shared_data_ref); 1244 } else { 1245 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1246 key.offset = hash_extent_data_ref(root_objectid, 1247 owner, offset); 1248 size = sizeof(struct btrfs_extent_data_ref); 1249 } 1250 1251 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1252 if (ret && ret != -EEXIST) 1253 goto fail; 1254 1255 leaf = path->nodes[0]; 1256 if (parent) { 1257 struct btrfs_shared_data_ref *ref; 1258 ref = btrfs_item_ptr(leaf, path->slots[0], 1259 struct btrfs_shared_data_ref); 1260 if (ret == 0) { 1261 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1262 } else { 1263 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1264 num_refs += refs_to_add; 1265 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1266 } 1267 } else { 1268 struct btrfs_extent_data_ref *ref; 1269 while (ret == -EEXIST) { 1270 ref = btrfs_item_ptr(leaf, path->slots[0], 1271 struct btrfs_extent_data_ref); 1272 if (match_extent_data_ref(leaf, ref, root_objectid, 1273 owner, offset)) 1274 break; 1275 btrfs_release_path(path); 1276 key.offset++; 1277 ret = btrfs_insert_empty_item(trans, root, path, &key, 1278 size); 1279 if (ret && ret != -EEXIST) 1280 goto fail; 1281 1282 leaf = path->nodes[0]; 1283 } 1284 ref = btrfs_item_ptr(leaf, path->slots[0], 1285 struct btrfs_extent_data_ref); 1286 if (ret == 0) { 1287 btrfs_set_extent_data_ref_root(leaf, ref, 1288 root_objectid); 1289 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1290 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1291 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1292 } else { 1293 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1294 num_refs += refs_to_add; 1295 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1296 } 1297 } 1298 btrfs_mark_buffer_dirty(leaf); 1299 ret = 0; 1300 fail: 1301 btrfs_release_path(path); 1302 return ret; 1303 } 1304 1305 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1306 struct btrfs_path *path, 1307 int refs_to_drop, int *last_ref) 1308 { 1309 struct btrfs_key key; 1310 struct btrfs_extent_data_ref *ref1 = NULL; 1311 struct btrfs_shared_data_ref *ref2 = NULL; 1312 struct extent_buffer *leaf; 1313 u32 num_refs = 0; 1314 int ret = 0; 1315 1316 leaf = path->nodes[0]; 1317 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1318 1319 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1320 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1321 struct btrfs_extent_data_ref); 1322 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1323 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1324 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1325 struct btrfs_shared_data_ref); 1326 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1327 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 1328 btrfs_print_v0_err(trans->fs_info); 1329 btrfs_abort_transaction(trans, -EINVAL); 1330 return -EINVAL; 1331 } else { 1332 BUG(); 1333 } 1334 1335 BUG_ON(num_refs < refs_to_drop); 1336 num_refs -= refs_to_drop; 1337 1338 if (num_refs == 0) { 1339 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1340 *last_ref = 1; 1341 } else { 1342 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1343 btrfs_set_extent_data_ref_count(leaf, 
ref1, num_refs); 1344 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1345 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1346 btrfs_mark_buffer_dirty(leaf); 1347 } 1348 return ret; 1349 } 1350 1351 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 1352 struct btrfs_extent_inline_ref *iref) 1353 { 1354 struct btrfs_key key; 1355 struct extent_buffer *leaf; 1356 struct btrfs_extent_data_ref *ref1; 1357 struct btrfs_shared_data_ref *ref2; 1358 u32 num_refs = 0; 1359 int type; 1360 1361 leaf = path->nodes[0]; 1362 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1363 1364 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 1365 if (iref) { 1366 /* 1367 * If type is invalid, we should have bailed out earlier than 1368 * this call. 1369 */ 1370 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 1371 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1372 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1373 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1374 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1375 } else { 1376 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1377 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1378 } 1379 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1380 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1381 struct btrfs_extent_data_ref); 1382 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1383 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1384 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1385 struct btrfs_shared_data_ref); 1386 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1387 } else { 1388 WARN_ON(1); 1389 } 1390 return num_refs; 1391 } 1392 1393 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1394 struct btrfs_path *path, 1395 u64 bytenr, u64 parent, 1396 u64 root_objectid) 1397 { 1398 struct btrfs_root *root = trans->fs_info->extent_root; 1399 struct btrfs_key key; 1400 int ret; 1401 1402 key.objectid = bytenr; 1403 if (parent) { 1404 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1405 key.offset = parent; 1406 } else { 1407 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1408 key.offset = root_objectid; 1409 } 1410 1411 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1412 if (ret > 0) 1413 ret = -ENOENT; 1414 return ret; 1415 } 1416 1417 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1418 struct btrfs_path *path, 1419 u64 bytenr, u64 parent, 1420 u64 root_objectid) 1421 { 1422 struct btrfs_key key; 1423 int ret; 1424 1425 key.objectid = bytenr; 1426 if (parent) { 1427 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1428 key.offset = parent; 1429 } else { 1430 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1431 key.offset = root_objectid; 1432 } 1433 1434 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, 1435 path, &key, 0); 1436 btrfs_release_path(path); 1437 return ret; 1438 } 1439 1440 static inline int extent_ref_type(u64 parent, u64 owner) 1441 { 1442 int type; 1443 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1444 if (parent > 0) 1445 type = BTRFS_SHARED_BLOCK_REF_KEY; 1446 else 1447 type = BTRFS_TREE_BLOCK_REF_KEY; 1448 } else { 1449 if (parent > 0) 1450 type = BTRFS_SHARED_DATA_REF_KEY; 1451 else 1452 type = BTRFS_EXTENT_DATA_REF_KEY; 1453 } 1454 return type; 1455 } 1456 1457 static int find_next_key(struct btrfs_path *path, int level, 1458 struct btrfs_key *key) 1459 1460 { 1461 for (; level < BTRFS_MAX_LEVEL; level++) { 1462 if (!path->nodes[level]) 1463 break; 1464 if (path->slots[level] + 1 >= 1465 btrfs_header_nritems(path->nodes[level])) 1466 
continue; 1467 if (level == 0) 1468 btrfs_item_key_to_cpu(path->nodes[level], key, 1469 path->slots[level] + 1); 1470 else 1471 btrfs_node_key_to_cpu(path->nodes[level], key, 1472 path->slots[level] + 1); 1473 return 0; 1474 } 1475 return 1; 1476 } 1477 1478 /* 1479 * look for inline back ref. if back ref is found, *ref_ret is set 1480 * to the address of inline back ref, and 0 is returned. 1481 * 1482 * if back ref isn't found, *ref_ret is set to the address where it 1483 * should be inserted, and -ENOENT is returned. 1484 * 1485 * if insert is true and there are too many inline back refs, the path 1486 * points to the extent item, and -EAGAIN is returned. 1487 * 1488 * NOTE: inline back refs are ordered in the same way that back ref 1489 * items in the tree are ordered. 1490 */ 1491 static noinline_for_stack 1492 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1493 struct btrfs_path *path, 1494 struct btrfs_extent_inline_ref **ref_ret, 1495 u64 bytenr, u64 num_bytes, 1496 u64 parent, u64 root_objectid, 1497 u64 owner, u64 offset, int insert) 1498 { 1499 struct btrfs_fs_info *fs_info = trans->fs_info; 1500 struct btrfs_root *root = fs_info->extent_root; 1501 struct btrfs_key key; 1502 struct extent_buffer *leaf; 1503 struct btrfs_extent_item *ei; 1504 struct btrfs_extent_inline_ref *iref; 1505 u64 flags; 1506 u64 item_size; 1507 unsigned long ptr; 1508 unsigned long end; 1509 int extra_size; 1510 int type; 1511 int want; 1512 int ret; 1513 int err = 0; 1514 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 1515 int needed; 1516 1517 key.objectid = bytenr; 1518 key.type = BTRFS_EXTENT_ITEM_KEY; 1519 key.offset = num_bytes; 1520 1521 want = extent_ref_type(parent, owner); 1522 if (insert) { 1523 extra_size = btrfs_extent_inline_ref_size(want); 1524 path->keep_locks = 1; 1525 } else 1526 extra_size = -1; 1527 1528 /* 1529 * Owner is our level, so we can just add one to get the level for the 1530 * block we are interested in. 1531 */ 1532 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1533 key.type = BTRFS_METADATA_ITEM_KEY; 1534 key.offset = owner; 1535 } 1536 1537 again: 1538 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1539 if (ret < 0) { 1540 err = ret; 1541 goto out; 1542 } 1543 1544 /* 1545 * We may be a newly converted file system which still has the old fat 1546 * extent entries for metadata, so try and see if we have one of those. 
1547 */ 1548 if (ret > 0 && skinny_metadata) { 1549 skinny_metadata = false; 1550 if (path->slots[0]) { 1551 path->slots[0]--; 1552 btrfs_item_key_to_cpu(path->nodes[0], &key, 1553 path->slots[0]); 1554 if (key.objectid == bytenr && 1555 key.type == BTRFS_EXTENT_ITEM_KEY && 1556 key.offset == num_bytes) 1557 ret = 0; 1558 } 1559 if (ret) { 1560 key.objectid = bytenr; 1561 key.type = BTRFS_EXTENT_ITEM_KEY; 1562 key.offset = num_bytes; 1563 btrfs_release_path(path); 1564 goto again; 1565 } 1566 } 1567 1568 if (ret && !insert) { 1569 err = -ENOENT; 1570 goto out; 1571 } else if (WARN_ON(ret)) { 1572 err = -EIO; 1573 goto out; 1574 } 1575 1576 leaf = path->nodes[0]; 1577 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1578 if (unlikely(item_size < sizeof(*ei))) { 1579 err = -EINVAL; 1580 btrfs_print_v0_err(fs_info); 1581 btrfs_abort_transaction(trans, err); 1582 goto out; 1583 } 1584 1585 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1586 flags = btrfs_extent_flags(leaf, ei); 1587 1588 ptr = (unsigned long)(ei + 1); 1589 end = (unsigned long)ei + item_size; 1590 1591 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1592 ptr += sizeof(struct btrfs_tree_block_info); 1593 BUG_ON(ptr > end); 1594 } 1595 1596 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 1597 needed = BTRFS_REF_TYPE_DATA; 1598 else 1599 needed = BTRFS_REF_TYPE_BLOCK; 1600 1601 err = -ENOENT; 1602 while (1) { 1603 if (ptr >= end) { 1604 WARN_ON(ptr > end); 1605 break; 1606 } 1607 iref = (struct btrfs_extent_inline_ref *)ptr; 1608 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 1609 if (type == BTRFS_REF_TYPE_INVALID) { 1610 err = -EUCLEAN; 1611 goto out; 1612 } 1613 1614 if (want < type) 1615 break; 1616 if (want > type) { 1617 ptr += btrfs_extent_inline_ref_size(type); 1618 continue; 1619 } 1620 1621 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1622 struct btrfs_extent_data_ref *dref; 1623 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1624 if (match_extent_data_ref(leaf, dref, root_objectid, 1625 owner, offset)) { 1626 err = 0; 1627 break; 1628 } 1629 if (hash_extent_data_ref_item(leaf, dref) < 1630 hash_extent_data_ref(root_objectid, owner, offset)) 1631 break; 1632 } else { 1633 u64 ref_offset; 1634 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1635 if (parent > 0) { 1636 if (parent == ref_offset) { 1637 err = 0; 1638 break; 1639 } 1640 if (ref_offset < parent) 1641 break; 1642 } else { 1643 if (root_objectid == ref_offset) { 1644 err = 0; 1645 break; 1646 } 1647 if (ref_offset < root_objectid) 1648 break; 1649 } 1650 } 1651 ptr += btrfs_extent_inline_ref_size(type); 1652 } 1653 if (err == -ENOENT && insert) { 1654 if (item_size + extra_size >= 1655 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1656 err = -EAGAIN; 1657 goto out; 1658 } 1659 /* 1660 * To add new inline back ref, we have to make sure 1661 * there is no corresponding back ref item. 
1662 * For simplicity, we just do not add new inline back 1663 * ref if there is any kind of item for this block 1664 */ 1665 if (find_next_key(path, 0, &key) == 0 && 1666 key.objectid == bytenr && 1667 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1668 err = -EAGAIN; 1669 goto out; 1670 } 1671 } 1672 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1673 out: 1674 if (insert) { 1675 path->keep_locks = 0; 1676 btrfs_unlock_up_safe(path, 1); 1677 } 1678 return err; 1679 } 1680 1681 /* 1682 * helper to add new inline back ref 1683 */ 1684 static noinline_for_stack 1685 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 1686 struct btrfs_path *path, 1687 struct btrfs_extent_inline_ref *iref, 1688 u64 parent, u64 root_objectid, 1689 u64 owner, u64 offset, int refs_to_add, 1690 struct btrfs_delayed_extent_op *extent_op) 1691 { 1692 struct extent_buffer *leaf; 1693 struct btrfs_extent_item *ei; 1694 unsigned long ptr; 1695 unsigned long end; 1696 unsigned long item_offset; 1697 u64 refs; 1698 int size; 1699 int type; 1700 1701 leaf = path->nodes[0]; 1702 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1703 item_offset = (unsigned long)iref - (unsigned long)ei; 1704 1705 type = extent_ref_type(parent, owner); 1706 size = btrfs_extent_inline_ref_size(type); 1707 1708 btrfs_extend_item(path, size); 1709 1710 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1711 refs = btrfs_extent_refs(leaf, ei); 1712 refs += refs_to_add; 1713 btrfs_set_extent_refs(leaf, ei, refs); 1714 if (extent_op) 1715 __run_delayed_extent_op(extent_op, leaf, ei); 1716 1717 ptr = (unsigned long)ei + item_offset; 1718 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1719 if (ptr < end - size) 1720 memmove_extent_buffer(leaf, ptr + size, ptr, 1721 end - size - ptr); 1722 1723 iref = (struct btrfs_extent_inline_ref *)ptr; 1724 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1725 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1726 struct btrfs_extent_data_ref *dref; 1727 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1728 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1729 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1730 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1731 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1732 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1733 struct btrfs_shared_data_ref *sref; 1734 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1735 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1736 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1737 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1738 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1739 } else { 1740 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1741 } 1742 btrfs_mark_buffer_dirty(leaf); 1743 } 1744 1745 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1746 struct btrfs_path *path, 1747 struct btrfs_extent_inline_ref **ref_ret, 1748 u64 bytenr, u64 num_bytes, u64 parent, 1749 u64 root_objectid, u64 owner, u64 offset) 1750 { 1751 int ret; 1752 1753 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1754 num_bytes, parent, root_objectid, 1755 owner, offset, 0); 1756 if (ret != -ENOENT) 1757 return ret; 1758 1759 btrfs_release_path(path); 1760 *ref_ret = NULL; 1761 1762 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1763 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1764 root_objectid); 1765 } else { 1766 ret = lookup_extent_data_ref(trans, path, 
bytenr, parent, 1767 root_objectid, owner, offset); 1768 } 1769 return ret; 1770 } 1771 1772 /* 1773 * helper to update/remove inline back ref 1774 */ 1775 static noinline_for_stack 1776 void update_inline_extent_backref(struct btrfs_path *path, 1777 struct btrfs_extent_inline_ref *iref, 1778 int refs_to_mod, 1779 struct btrfs_delayed_extent_op *extent_op, 1780 int *last_ref) 1781 { 1782 struct extent_buffer *leaf = path->nodes[0]; 1783 struct btrfs_extent_item *ei; 1784 struct btrfs_extent_data_ref *dref = NULL; 1785 struct btrfs_shared_data_ref *sref = NULL; 1786 unsigned long ptr; 1787 unsigned long end; 1788 u32 item_size; 1789 int size; 1790 int type; 1791 u64 refs; 1792 1793 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1794 refs = btrfs_extent_refs(leaf, ei); 1795 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1796 refs += refs_to_mod; 1797 btrfs_set_extent_refs(leaf, ei, refs); 1798 if (extent_op) 1799 __run_delayed_extent_op(extent_op, leaf, ei); 1800 1801 /* 1802 * If type is invalid, we should have bailed out after 1803 * lookup_inline_extent_backref(). 1804 */ 1805 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1806 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1807 1808 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1809 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1810 refs = btrfs_extent_data_ref_count(leaf, dref); 1811 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1812 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1813 refs = btrfs_shared_data_ref_count(leaf, sref); 1814 } else { 1815 refs = 1; 1816 BUG_ON(refs_to_mod != -1); 1817 } 1818 1819 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1820 refs += refs_to_mod; 1821 1822 if (refs > 0) { 1823 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1824 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1825 else 1826 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1827 } else { 1828 *last_ref = 1; 1829 size = btrfs_extent_inline_ref_size(type); 1830 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1831 ptr = (unsigned long)iref; 1832 end = (unsigned long)ei + item_size; 1833 if (ptr + size < end) 1834 memmove_extent_buffer(leaf, ptr, ptr + size, 1835 end - ptr - size); 1836 item_size -= size; 1837 btrfs_truncate_item(path, item_size, 1); 1838 } 1839 btrfs_mark_buffer_dirty(leaf); 1840 } 1841 1842 static noinline_for_stack 1843 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1844 struct btrfs_path *path, 1845 u64 bytenr, u64 num_bytes, u64 parent, 1846 u64 root_objectid, u64 owner, 1847 u64 offset, int refs_to_add, 1848 struct btrfs_delayed_extent_op *extent_op) 1849 { 1850 struct btrfs_extent_inline_ref *iref; 1851 int ret; 1852 1853 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1854 num_bytes, parent, root_objectid, 1855 owner, offset, 1); 1856 if (ret == 0) { 1857 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1858 update_inline_extent_backref(path, iref, refs_to_add, 1859 extent_op, NULL); 1860 } else if (ret == -ENOENT) { 1861 setup_inline_extent_backref(trans->fs_info, path, iref, parent, 1862 root_objectid, owner, offset, 1863 refs_to_add, extent_op); 1864 ret = 0; 1865 } 1866 return ret; 1867 } 1868 1869 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1870 struct btrfs_path *path, 1871 u64 bytenr, u64 parent, u64 root_objectid, 1872 u64 owner, u64 offset, int refs_to_add) 1873 { 1874 int ret; 1875 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1876 BUG_ON(refs_to_add != 1); 1877 ret = insert_tree_block_ref(trans, path, 
bytenr, parent, 1878 root_objectid); 1879 } else { 1880 ret = insert_extent_data_ref(trans, path, bytenr, parent, 1881 root_objectid, owner, offset, 1882 refs_to_add); 1883 } 1884 return ret; 1885 } 1886 1887 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1888 struct btrfs_path *path, 1889 struct btrfs_extent_inline_ref *iref, 1890 int refs_to_drop, int is_data, int *last_ref) 1891 { 1892 int ret = 0; 1893 1894 BUG_ON(!is_data && refs_to_drop != 1); 1895 if (iref) { 1896 update_inline_extent_backref(path, iref, -refs_to_drop, NULL, 1897 last_ref); 1898 } else if (is_data) { 1899 ret = remove_extent_data_ref(trans, path, refs_to_drop, 1900 last_ref); 1901 } else { 1902 *last_ref = 1; 1903 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1904 } 1905 return ret; 1906 } 1907 1908 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1909 u64 *discarded_bytes) 1910 { 1911 int j, ret = 0; 1912 u64 bytes_left, end; 1913 u64 aligned_start = ALIGN(start, 1 << 9); 1914 1915 if (WARN_ON(start != aligned_start)) { 1916 len -= aligned_start - start; 1917 len = round_down(len, 1 << 9); 1918 start = aligned_start; 1919 } 1920 1921 *discarded_bytes = 0; 1922 1923 if (!len) 1924 return 0; 1925 1926 end = start + len; 1927 bytes_left = len; 1928 1929 /* Skip any superblocks on this device. */ 1930 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1931 u64 sb_start = btrfs_sb_offset(j); 1932 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1933 u64 size = sb_start - start; 1934 1935 if (!in_range(sb_start, start, bytes_left) && 1936 !in_range(sb_end, start, bytes_left) && 1937 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1938 continue; 1939 1940 /* 1941 * Superblock spans beginning of range. Adjust start and 1942 * try again. 1943 */ 1944 if (sb_start <= start) { 1945 start += sb_end - start; 1946 if (start > end) { 1947 bytes_left = 0; 1948 break; 1949 } 1950 bytes_left = end - start; 1951 continue; 1952 } 1953 1954 if (size) { 1955 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 1956 GFP_NOFS, 0); 1957 if (!ret) 1958 *discarded_bytes += size; 1959 else if (ret != -EOPNOTSUPP) 1960 return ret; 1961 } 1962 1963 start = sb_end; 1964 if (start > end) { 1965 bytes_left = 0; 1966 break; 1967 } 1968 bytes_left = end - start; 1969 } 1970 1971 if (bytes_left) { 1972 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 1973 GFP_NOFS, 0); 1974 if (!ret) 1975 *discarded_bytes += bytes_left; 1976 } 1977 return ret; 1978 } 1979 1980 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1981 u64 num_bytes, u64 *actual_bytes) 1982 { 1983 int ret; 1984 u64 discarded_bytes = 0; 1985 struct btrfs_bio *bbio = NULL; 1986 1987 1988 /* 1989 * Avoid races with device replace and make sure our bbio has devices 1990 * associated to its stripes that don't go away while we are discarding. 
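	 * The counter taken here is dropped again via btrfs_bio_counter_dec()
	 * once the per-stripe discards below have been issued.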
1991 */ 1992 btrfs_bio_counter_inc_blocked(fs_info); 1993 /* Tell the block device(s) that the sectors can be discarded */ 1994 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 1995 &bbio, 0); 1996 /* Error condition is -ENOMEM */ 1997 if (!ret) { 1998 struct btrfs_bio_stripe *stripe = bbio->stripes; 1999 int i; 2000 2001 2002 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2003 u64 bytes; 2004 struct request_queue *req_q; 2005 2006 if (!stripe->dev->bdev) { 2007 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 2008 continue; 2009 } 2010 req_q = bdev_get_queue(stripe->dev->bdev); 2011 if (!blk_queue_discard(req_q)) 2012 continue; 2013 2014 ret = btrfs_issue_discard(stripe->dev->bdev, 2015 stripe->physical, 2016 stripe->length, 2017 &bytes); 2018 if (!ret) 2019 discarded_bytes += bytes; 2020 else if (ret != -EOPNOTSUPP) 2021 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 2022 2023 /* 2024 * Just in case we get back EOPNOTSUPP for some reason, 2025 * just ignore the return value so we don't screw up 2026 * people calling discard_extent. 2027 */ 2028 ret = 0; 2029 } 2030 btrfs_put_bbio(bbio); 2031 } 2032 btrfs_bio_counter_dec(fs_info); 2033 2034 if (actual_bytes) 2035 *actual_bytes = discarded_bytes; 2036 2037 2038 if (ret == -EOPNOTSUPP) 2039 ret = 0; 2040 return ret; 2041 } 2042 2043 /* Can return -ENOMEM */ 2044 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2045 struct btrfs_ref *generic_ref) 2046 { 2047 struct btrfs_fs_info *fs_info = trans->fs_info; 2048 int old_ref_mod, new_ref_mod; 2049 int ret; 2050 2051 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && 2052 generic_ref->action); 2053 BUG_ON(generic_ref->type == BTRFS_REF_METADATA && 2054 generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); 2055 2056 if (generic_ref->type == BTRFS_REF_METADATA) 2057 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, 2058 NULL, &old_ref_mod, &new_ref_mod); 2059 else 2060 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, 2061 &old_ref_mod, &new_ref_mod); 2062 2063 btrfs_ref_tree_mod(fs_info, generic_ref); 2064 2065 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) 2066 add_pinned_bytes(fs_info, generic_ref); 2067 2068 return ret; 2069 } 2070 2071 /* 2072 * __btrfs_inc_extent_ref - insert backreference for a given extent 2073 * 2074 * @trans: Handle of transaction 2075 * 2076 * @node: The delayed ref node used to get the bytenr/length for 2077 * extent whose references are incremented. 2078 * 2079 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ 2080 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical 2081 * bytenr of the parent block. Since new extents are always 2082 * created with indirect references, this will only be the case 2083 * when relocating a shared extent. In that case, root_objectid 2084 * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must 2085 * be 0 2086 * 2087 * @root_objectid: The id of the root where this modification has originated, 2088 * this can be either one of the well-known metadata trees or 2089 * the subvolume id which references this extent. 2090 * 2091 * @owner: For data extents it is the inode number of the owning file. 2092 * For metadata extents this parameter holds the level in the 2093 * tree of the extent. 2094 * 2095 * @offset: For metadata extents the offset is ignored and is currently 2096 * always passed as 0. For data extents it is the fileoffset 2097 * this extent belongs to. 
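 *			(in practice this is the file offset minus the extent's
 *			data offset, see how __btrfs_mod_ref() adjusts key.offset
 *			before calling btrfs_init_data_ref())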
2098 * 2099 * @refs_to_add Number of references to add 2100 * 2101 * @extent_op Pointer to a structure, holding information necessary when 2102 * updating a tree block's flags 2103 * 2104 */ 2105 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2106 struct btrfs_delayed_ref_node *node, 2107 u64 parent, u64 root_objectid, 2108 u64 owner, u64 offset, int refs_to_add, 2109 struct btrfs_delayed_extent_op *extent_op) 2110 { 2111 struct btrfs_path *path; 2112 struct extent_buffer *leaf; 2113 struct btrfs_extent_item *item; 2114 struct btrfs_key key; 2115 u64 bytenr = node->bytenr; 2116 u64 num_bytes = node->num_bytes; 2117 u64 refs; 2118 int ret; 2119 2120 path = btrfs_alloc_path(); 2121 if (!path) 2122 return -ENOMEM; 2123 2124 path->reada = READA_FORWARD; 2125 path->leave_spinning = 1; 2126 /* this will setup the path even if it fails to insert the back ref */ 2127 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, 2128 parent, root_objectid, owner, 2129 offset, refs_to_add, extent_op); 2130 if ((ret < 0 && ret != -EAGAIN) || !ret) 2131 goto out; 2132 2133 /* 2134 * Ok we had -EAGAIN which means we didn't have space to insert and 2135 * inline extent ref, so just update the reference count and add a 2136 * normal backref. 2137 */ 2138 leaf = path->nodes[0]; 2139 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2140 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2141 refs = btrfs_extent_refs(leaf, item); 2142 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2143 if (extent_op) 2144 __run_delayed_extent_op(extent_op, leaf, item); 2145 2146 btrfs_mark_buffer_dirty(leaf); 2147 btrfs_release_path(path); 2148 2149 path->reada = READA_FORWARD; 2150 path->leave_spinning = 1; 2151 /* now insert the actual backref */ 2152 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, 2153 owner, offset, refs_to_add); 2154 if (ret) 2155 btrfs_abort_transaction(trans, ret); 2156 out: 2157 btrfs_free_path(path); 2158 return ret; 2159 } 2160 2161 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2162 struct btrfs_delayed_ref_node *node, 2163 struct btrfs_delayed_extent_op *extent_op, 2164 int insert_reserved) 2165 { 2166 int ret = 0; 2167 struct btrfs_delayed_data_ref *ref; 2168 struct btrfs_key ins; 2169 u64 parent = 0; 2170 u64 ref_root = 0; 2171 u64 flags = 0; 2172 2173 ins.objectid = node->bytenr; 2174 ins.offset = node->num_bytes; 2175 ins.type = BTRFS_EXTENT_ITEM_KEY; 2176 2177 ref = btrfs_delayed_node_to_data_ref(node); 2178 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); 2179 2180 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2181 parent = ref->parent; 2182 ref_root = ref->root; 2183 2184 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2185 if (extent_op) 2186 flags |= extent_op->flags_to_set; 2187 ret = alloc_reserved_file_extent(trans, parent, ref_root, 2188 flags, ref->objectid, 2189 ref->offset, &ins, 2190 node->ref_mod); 2191 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2192 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2193 ref->objectid, ref->offset, 2194 node->ref_mod, extent_op); 2195 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2196 ret = __btrfs_free_extent(trans, node, parent, 2197 ref_root, ref->objectid, 2198 ref->offset, node->ref_mod, 2199 extent_op); 2200 } else { 2201 BUG(); 2202 } 2203 return ret; 2204 } 2205 2206 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2207 struct extent_buffer *leaf, 
2208 struct btrfs_extent_item *ei) 2209 { 2210 u64 flags = btrfs_extent_flags(leaf, ei); 2211 if (extent_op->update_flags) { 2212 flags |= extent_op->flags_to_set; 2213 btrfs_set_extent_flags(leaf, ei, flags); 2214 } 2215 2216 if (extent_op->update_key) { 2217 struct btrfs_tree_block_info *bi; 2218 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2219 bi = (struct btrfs_tree_block_info *)(ei + 1); 2220 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2221 } 2222 } 2223 2224 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2225 struct btrfs_delayed_ref_head *head, 2226 struct btrfs_delayed_extent_op *extent_op) 2227 { 2228 struct btrfs_fs_info *fs_info = trans->fs_info; 2229 struct btrfs_key key; 2230 struct btrfs_path *path; 2231 struct btrfs_extent_item *ei; 2232 struct extent_buffer *leaf; 2233 u32 item_size; 2234 int ret; 2235 int err = 0; 2236 int metadata = !extent_op->is_data; 2237 2238 if (trans->aborted) 2239 return 0; 2240 2241 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2242 metadata = 0; 2243 2244 path = btrfs_alloc_path(); 2245 if (!path) 2246 return -ENOMEM; 2247 2248 key.objectid = head->bytenr; 2249 2250 if (metadata) { 2251 key.type = BTRFS_METADATA_ITEM_KEY; 2252 key.offset = extent_op->level; 2253 } else { 2254 key.type = BTRFS_EXTENT_ITEM_KEY; 2255 key.offset = head->num_bytes; 2256 } 2257 2258 again: 2259 path->reada = READA_FORWARD; 2260 path->leave_spinning = 1; 2261 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2262 if (ret < 0) { 2263 err = ret; 2264 goto out; 2265 } 2266 if (ret > 0) { 2267 if (metadata) { 2268 if (path->slots[0] > 0) { 2269 path->slots[0]--; 2270 btrfs_item_key_to_cpu(path->nodes[0], &key, 2271 path->slots[0]); 2272 if (key.objectid == head->bytenr && 2273 key.type == BTRFS_EXTENT_ITEM_KEY && 2274 key.offset == head->num_bytes) 2275 ret = 0; 2276 } 2277 if (ret > 0) { 2278 btrfs_release_path(path); 2279 metadata = 0; 2280 2281 key.objectid = head->bytenr; 2282 key.offset = head->num_bytes; 2283 key.type = BTRFS_EXTENT_ITEM_KEY; 2284 goto again; 2285 } 2286 } else { 2287 err = -EIO; 2288 goto out; 2289 } 2290 } 2291 2292 leaf = path->nodes[0]; 2293 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2294 2295 if (unlikely(item_size < sizeof(*ei))) { 2296 err = -EINVAL; 2297 btrfs_print_v0_err(fs_info); 2298 btrfs_abort_transaction(trans, err); 2299 goto out; 2300 } 2301 2302 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2303 __run_delayed_extent_op(extent_op, leaf, ei); 2304 2305 btrfs_mark_buffer_dirty(leaf); 2306 out: 2307 btrfs_free_path(path); 2308 return err; 2309 } 2310 2311 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2312 struct btrfs_delayed_ref_node *node, 2313 struct btrfs_delayed_extent_op *extent_op, 2314 int insert_reserved) 2315 { 2316 int ret = 0; 2317 struct btrfs_delayed_tree_ref *ref; 2318 u64 parent = 0; 2319 u64 ref_root = 0; 2320 2321 ref = btrfs_delayed_node_to_tree_ref(node); 2322 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); 2323 2324 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2325 parent = ref->parent; 2326 ref_root = ref->root; 2327 2328 if (node->ref_mod != 1) { 2329 btrfs_err(trans->fs_info, 2330 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", 2331 node->bytenr, node->ref_mod, node->action, ref_root, 2332 parent); 2333 return -EIO; 2334 } 2335 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2336 BUG_ON(!extent_op || 
!extent_op->update_flags); 2337 ret = alloc_reserved_tree_block(trans, node, extent_op); 2338 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2339 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2340 ref->level, 0, 1, extent_op); 2341 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2342 ret = __btrfs_free_extent(trans, node, parent, ref_root, 2343 ref->level, 0, 1, extent_op); 2344 } else { 2345 BUG(); 2346 } 2347 return ret; 2348 } 2349 2350 /* helper function to actually process a single delayed ref entry */ 2351 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2352 struct btrfs_delayed_ref_node *node, 2353 struct btrfs_delayed_extent_op *extent_op, 2354 int insert_reserved) 2355 { 2356 int ret = 0; 2357 2358 if (trans->aborted) { 2359 if (insert_reserved) 2360 btrfs_pin_extent(trans->fs_info, node->bytenr, 2361 node->num_bytes, 1); 2362 return 0; 2363 } 2364 2365 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2366 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2367 ret = run_delayed_tree_ref(trans, node, extent_op, 2368 insert_reserved); 2369 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2370 node->type == BTRFS_SHARED_DATA_REF_KEY) 2371 ret = run_delayed_data_ref(trans, node, extent_op, 2372 insert_reserved); 2373 else 2374 BUG(); 2375 if (ret && insert_reserved) 2376 btrfs_pin_extent(trans->fs_info, node->bytenr, 2377 node->num_bytes, 1); 2378 return ret; 2379 } 2380 2381 static inline struct btrfs_delayed_ref_node * 2382 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2383 { 2384 struct btrfs_delayed_ref_node *ref; 2385 2386 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 2387 return NULL; 2388 2389 /* 2390 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2391 * This is to prevent a ref count from going down to zero, which deletes 2392 * the extent item from the extent tree, when there still are references 2393 * to add, which would fail because they would not find the extent item. 2394 */ 2395 if (!list_empty(&head->ref_add_list)) 2396 return list_first_entry(&head->ref_add_list, 2397 struct btrfs_delayed_ref_node, add_list); 2398 2399 ref = rb_entry(rb_first_cached(&head->ref_tree), 2400 struct btrfs_delayed_ref_node, ref_node); 2401 ASSERT(list_empty(&ref->add_list)); 2402 return ref; 2403 } 2404 2405 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 2406 struct btrfs_delayed_ref_head *head) 2407 { 2408 spin_lock(&delayed_refs->lock); 2409 head->processing = 0; 2410 delayed_refs->num_heads_ready++; 2411 spin_unlock(&delayed_refs->lock); 2412 btrfs_delayed_ref_unlock(head); 2413 } 2414 2415 static struct btrfs_delayed_extent_op *cleanup_extent_op( 2416 struct btrfs_delayed_ref_head *head) 2417 { 2418 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 2419 2420 if (!extent_op) 2421 return NULL; 2422 2423 if (head->must_insert_reserved) { 2424 head->extent_op = NULL; 2425 btrfs_free_delayed_extent_op(extent_op); 2426 return NULL; 2427 } 2428 return extent_op; 2429 } 2430 2431 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 2432 struct btrfs_delayed_ref_head *head) 2433 { 2434 struct btrfs_delayed_extent_op *extent_op; 2435 int ret; 2436 2437 extent_op = cleanup_extent_op(head); 2438 if (!extent_op) 2439 return 0; 2440 head->extent_op = NULL; 2441 spin_unlock(&head->lock); 2442 ret = run_delayed_extent_op(trans, head, extent_op); 2443 btrfs_free_delayed_extent_op(extent_op); 2444 return ret ? 
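/* 1 tells cleanup_ref_head() we dropped head->lock and the caller must loop */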
ret : 1; 2445 } 2446 2447 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, 2448 struct btrfs_delayed_ref_root *delayed_refs, 2449 struct btrfs_delayed_ref_head *head) 2450 { 2451 int nr_items = 1; /* Dropping this ref head update. */ 2452 2453 if (head->total_ref_mod < 0) { 2454 struct btrfs_space_info *space_info; 2455 u64 flags; 2456 2457 if (head->is_data) 2458 flags = BTRFS_BLOCK_GROUP_DATA; 2459 else if (head->is_system) 2460 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2461 else 2462 flags = BTRFS_BLOCK_GROUP_METADATA; 2463 space_info = __find_space_info(fs_info, flags); 2464 ASSERT(space_info); 2465 percpu_counter_add_batch(&space_info->total_bytes_pinned, 2466 -head->num_bytes, 2467 BTRFS_TOTAL_BYTES_PINNED_BATCH); 2468 2469 /* 2470 * We had csum deletions accounted for in our delayed refs rsv, 2471 * we need to drop the csum leaves for this update from our 2472 * delayed_refs_rsv. 2473 */ 2474 if (head->is_data) { 2475 spin_lock(&delayed_refs->lock); 2476 delayed_refs->pending_csums -= head->num_bytes; 2477 spin_unlock(&delayed_refs->lock); 2478 nr_items += btrfs_csum_bytes_to_leaves(fs_info, 2479 head->num_bytes); 2480 } 2481 } 2482 2483 btrfs_delayed_refs_rsv_release(fs_info, nr_items); 2484 } 2485 2486 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 2487 struct btrfs_delayed_ref_head *head) 2488 { 2489 2490 struct btrfs_fs_info *fs_info = trans->fs_info; 2491 struct btrfs_delayed_ref_root *delayed_refs; 2492 int ret; 2493 2494 delayed_refs = &trans->transaction->delayed_refs; 2495 2496 ret = run_and_cleanup_extent_op(trans, head); 2497 if (ret < 0) { 2498 unselect_delayed_ref_head(delayed_refs, head); 2499 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2500 return ret; 2501 } else if (ret) { 2502 return ret; 2503 } 2504 2505 /* 2506 * Need to drop our head ref lock and re-acquire the delayed ref lock 2507 * and then re-check to make sure nobody got added. 
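 * The locking order is delayed_refs->lock first, then head->lock, so the
 * head lock must be dropped before the delayed_refs lock can be taken.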
2508 */ 2509 spin_unlock(&head->lock); 2510 spin_lock(&delayed_refs->lock); 2511 spin_lock(&head->lock); 2512 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 2513 spin_unlock(&head->lock); 2514 spin_unlock(&delayed_refs->lock); 2515 return 1; 2516 } 2517 btrfs_delete_ref_head(delayed_refs, head); 2518 spin_unlock(&head->lock); 2519 spin_unlock(&delayed_refs->lock); 2520 2521 if (head->must_insert_reserved) { 2522 btrfs_pin_extent(fs_info, head->bytenr, 2523 head->num_bytes, 1); 2524 if (head->is_data) { 2525 ret = btrfs_del_csums(trans, fs_info, head->bytenr, 2526 head->num_bytes); 2527 } 2528 } 2529 2530 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 2531 2532 trace_run_delayed_ref_head(fs_info, head, 0); 2533 btrfs_delayed_ref_unlock(head); 2534 btrfs_put_delayed_ref_head(head); 2535 return 0; 2536 } 2537 2538 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( 2539 struct btrfs_trans_handle *trans) 2540 { 2541 struct btrfs_delayed_ref_root *delayed_refs = 2542 &trans->transaction->delayed_refs; 2543 struct btrfs_delayed_ref_head *head = NULL; 2544 int ret; 2545 2546 spin_lock(&delayed_refs->lock); 2547 head = btrfs_select_ref_head(delayed_refs); 2548 if (!head) { 2549 spin_unlock(&delayed_refs->lock); 2550 return head; 2551 } 2552 2553 /* 2554 * Grab the lock that says we are going to process all the refs for 2555 * this head 2556 */ 2557 ret = btrfs_delayed_ref_lock(delayed_refs, head); 2558 spin_unlock(&delayed_refs->lock); 2559 2560 /* 2561 * We may have dropped the spin lock to get the head mutex lock, and 2562 * that might have given someone else time to free the head. If that's 2563 * true, it has been removed from our list and we can move on. 2564 */ 2565 if (ret == -EAGAIN) 2566 head = ERR_PTR(-EAGAIN); 2567 2568 return head; 2569 } 2570 2571 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 2572 struct btrfs_delayed_ref_head *locked_ref, 2573 unsigned long *run_refs) 2574 { 2575 struct btrfs_fs_info *fs_info = trans->fs_info; 2576 struct btrfs_delayed_ref_root *delayed_refs; 2577 struct btrfs_delayed_extent_op *extent_op; 2578 struct btrfs_delayed_ref_node *ref; 2579 int must_insert_reserved = 0; 2580 int ret; 2581 2582 delayed_refs = &trans->transaction->delayed_refs; 2583 2584 lockdep_assert_held(&locked_ref->mutex); 2585 lockdep_assert_held(&locked_ref->lock); 2586 2587 while ((ref = select_delayed_ref(locked_ref))) { 2588 if (ref->seq && 2589 btrfs_check_delayed_seq(fs_info, ref->seq)) { 2590 spin_unlock(&locked_ref->lock); 2591 unselect_delayed_ref_head(delayed_refs, locked_ref); 2592 return -EAGAIN; 2593 } 2594 2595 (*run_refs)++; 2596 ref->in_tree = 0; 2597 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 2598 RB_CLEAR_NODE(&ref->ref_node); 2599 if (!list_empty(&ref->add_list)) 2600 list_del(&ref->add_list); 2601 /* 2602 * When we play the delayed ref, also correct the ref_mod on 2603 * head 2604 */ 2605 switch (ref->action) { 2606 case BTRFS_ADD_DELAYED_REF: 2607 case BTRFS_ADD_DELAYED_EXTENT: 2608 locked_ref->ref_mod -= ref->ref_mod; 2609 break; 2610 case BTRFS_DROP_DELAYED_REF: 2611 locked_ref->ref_mod += ref->ref_mod; 2612 break; 2613 default: 2614 WARN_ON(1); 2615 } 2616 atomic_dec(&delayed_refs->num_entries); 2617 2618 /* 2619 * Record the must_insert_reserved flag before we drop the 2620 * spin lock. 
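 * The flag is cleared while we still hold the lock and the local copy is
 * what gets passed to run_one_delayed_ref().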
2621 */ 2622 must_insert_reserved = locked_ref->must_insert_reserved; 2623 locked_ref->must_insert_reserved = 0; 2624 2625 extent_op = locked_ref->extent_op; 2626 locked_ref->extent_op = NULL; 2627 spin_unlock(&locked_ref->lock); 2628 2629 ret = run_one_delayed_ref(trans, ref, extent_op, 2630 must_insert_reserved); 2631 2632 btrfs_free_delayed_extent_op(extent_op); 2633 if (ret) { 2634 unselect_delayed_ref_head(delayed_refs, locked_ref); 2635 btrfs_put_delayed_ref(ref); 2636 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2637 ret); 2638 return ret; 2639 } 2640 2641 btrfs_put_delayed_ref(ref); 2642 cond_resched(); 2643 2644 spin_lock(&locked_ref->lock); 2645 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2646 } 2647 2648 return 0; 2649 } 2650 2651 /* 2652 * Returns 0 on success or if called with an already aborted transaction. 2653 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2654 */ 2655 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2656 unsigned long nr) 2657 { 2658 struct btrfs_fs_info *fs_info = trans->fs_info; 2659 struct btrfs_delayed_ref_root *delayed_refs; 2660 struct btrfs_delayed_ref_head *locked_ref = NULL; 2661 ktime_t start = ktime_get(); 2662 int ret; 2663 unsigned long count = 0; 2664 unsigned long actual_count = 0; 2665 2666 delayed_refs = &trans->transaction->delayed_refs; 2667 do { 2668 if (!locked_ref) { 2669 locked_ref = btrfs_obtain_ref_head(trans); 2670 if (IS_ERR_OR_NULL(locked_ref)) { 2671 if (PTR_ERR(locked_ref) == -EAGAIN) { 2672 continue; 2673 } else { 2674 break; 2675 } 2676 } 2677 count++; 2678 } 2679 /* 2680 * We need to try and merge add/drops of the same ref since we 2681 * can run into issues with relocate dropping the implicit ref 2682 * and then it being added back again before the drop can 2683 * finish. If we merged anything we need to re-loop so we can 2684 * get a good ref. 2685 * Or we can get node references of the same type that weren't 2686 * merged when created due to bumps in the tree mod seq, and 2687 * we need to merge them to prevent adding an inline extent 2688 * backref before dropping it (triggering a BUG_ON at 2689 * insert_inline_extent_backref()). 2690 */ 2691 spin_lock(&locked_ref->lock); 2692 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2693 2694 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, 2695 &actual_count); 2696 if (ret < 0 && ret != -EAGAIN) { 2697 /* 2698 * Error, btrfs_run_delayed_refs_for_head already 2699 * unlocked everything so just bail out 2700 */ 2701 return ret; 2702 } else if (!ret) { 2703 /* 2704 * Success, perform the usual cleanup of a processed 2705 * head 2706 */ 2707 ret = cleanup_ref_head(trans, locked_ref); 2708 if (ret > 0 ) { 2709 /* We dropped our lock, we need to loop. */ 2710 ret = 0; 2711 continue; 2712 } else if (ret) { 2713 return ret; 2714 } 2715 } 2716 2717 /* 2718 * Either success case or btrfs_run_delayed_refs_for_head 2719 * returned -EAGAIN, meaning we need to select another head 2720 */ 2721 2722 locked_ref = NULL; 2723 cond_resched(); 2724 } while ((nr != -1 && count < nr) || locked_ref); 2725 2726 /* 2727 * We don't want to include ref heads since we can have empty ref heads 2728 * and those will drastically skew our runtime down since we just do 2729 * accounting, no actual extent tree updates. 
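 *
 * The average kept below is a simple weighted moving average,
 * new_avg = (3 * old_avg + runtime) / 4, with the division done by the
 * right shift. For example old_avg = 100 and runtime = 20 give
 * (300 + 20) / 4 = 80.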
2730 */ 2731 if (actual_count > 0) { 2732 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2733 u64 avg; 2734 2735 /* 2736 * We weigh the current average higher than our current runtime 2737 * to avoid large swings in the average. 2738 */ 2739 spin_lock(&delayed_refs->lock); 2740 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2741 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2742 spin_unlock(&delayed_refs->lock); 2743 } 2744 return 0; 2745 } 2746 2747 #ifdef SCRAMBLE_DELAYED_REFS 2748 /* 2749 * Normally delayed refs get processed in ascending bytenr order. This 2750 * correlates in most cases to the order added. To expose dependencies on this 2751 * order, we start to process the tree in the middle instead of the beginning 2752 */ 2753 static u64 find_middle(struct rb_root *root) 2754 { 2755 struct rb_node *n = root->rb_node; 2756 struct btrfs_delayed_ref_node *entry; 2757 int alt = 1; 2758 u64 middle; 2759 u64 first = 0, last = 0; 2760 2761 n = rb_first(root); 2762 if (n) { 2763 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2764 first = entry->bytenr; 2765 } 2766 n = rb_last(root); 2767 if (n) { 2768 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2769 last = entry->bytenr; 2770 } 2771 n = root->rb_node; 2772 2773 while (n) { 2774 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2775 WARN_ON(!entry->in_tree); 2776 2777 middle = entry->bytenr; 2778 2779 if (alt) 2780 n = n->rb_left; 2781 else 2782 n = n->rb_right; 2783 2784 alt = 1 - alt; 2785 } 2786 return middle; 2787 } 2788 #endif 2789 2790 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2791 { 2792 u64 num_bytes; 2793 2794 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2795 sizeof(struct btrfs_extent_inline_ref)); 2796 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2797 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2798 2799 /* 2800 * We don't ever fill up leaves all the way so multiply by 2 just to be 2801 * closer to what we're really going to want to use. 2802 */ 2803 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2804 } 2805 2806 /* 2807 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2808 * would require to store the csums for that many bytes. 2809 */ 2810 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2811 { 2812 u64 csum_size; 2813 u64 num_csums_per_leaf; 2814 u64 num_csums; 2815 2816 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2817 num_csums_per_leaf = div64_u64(csum_size, 2818 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2819 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2820 num_csums += num_csums_per_leaf - 1; 2821 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2822 return num_csums; 2823 } 2824 2825 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) 2826 { 2827 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 2828 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2829 bool ret = false; 2830 u64 reserved; 2831 2832 spin_lock(&global_rsv->lock); 2833 reserved = global_rsv->reserved; 2834 spin_unlock(&global_rsv->lock); 2835 2836 /* 2837 * Since the global reserve is just kind of magic we don't really want 2838 * to rely on it to save our bacon, so if our size is more than the 2839 * delayed_refs_rsv and the global rsv then it's time to think about 2840 * bailing. 
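 * That is, we return true once delayed_refs_rsv->size meets or exceeds the
 * combined reserved bytes of the two reserves.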
2841 */ 2842 spin_lock(&delayed_refs_rsv->lock); 2843 reserved += delayed_refs_rsv->reserved; 2844 if (delayed_refs_rsv->size >= reserved) 2845 ret = true; 2846 spin_unlock(&delayed_refs_rsv->lock); 2847 return ret; 2848 } 2849 2850 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) 2851 { 2852 u64 num_entries = 2853 atomic_read(&trans->transaction->delayed_refs.num_entries); 2854 u64 avg_runtime; 2855 u64 val; 2856 2857 smp_mb(); 2858 avg_runtime = trans->fs_info->avg_delayed_ref_runtime; 2859 val = num_entries * avg_runtime; 2860 if (val >= NSEC_PER_SEC) 2861 return 1; 2862 if (val >= NSEC_PER_SEC / 2) 2863 return 2; 2864 2865 return btrfs_check_space_for_delayed_refs(trans->fs_info); 2866 } 2867 2868 /* 2869 * this starts processing the delayed reference count updates and 2870 * extent insertions we have queued up so far. count can be 2871 * 0, which means to process everything in the tree at the start 2872 * of the run (but not newly added entries), or it can be some target 2873 * number you'd like to process. 2874 * 2875 * Returns 0 on success or if called with an aborted transaction 2876 * Returns <0 on error and aborts the transaction 2877 */ 2878 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2879 unsigned long count) 2880 { 2881 struct btrfs_fs_info *fs_info = trans->fs_info; 2882 struct rb_node *node; 2883 struct btrfs_delayed_ref_root *delayed_refs; 2884 struct btrfs_delayed_ref_head *head; 2885 int ret; 2886 int run_all = count == (unsigned long)-1; 2887 2888 /* We'll clean this up in btrfs_cleanup_transaction */ 2889 if (trans->aborted) 2890 return 0; 2891 2892 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2893 return 0; 2894 2895 delayed_refs = &trans->transaction->delayed_refs; 2896 if (count == 0) 2897 count = atomic_read(&delayed_refs->num_entries) * 2; 2898 2899 again: 2900 #ifdef SCRAMBLE_DELAYED_REFS 2901 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2902 #endif 2903 ret = __btrfs_run_delayed_refs(trans, count); 2904 if (ret < 0) { 2905 btrfs_abort_transaction(trans, ret); 2906 return ret; 2907 } 2908 2909 if (run_all) { 2910 btrfs_create_pending_block_groups(trans); 2911 2912 spin_lock(&delayed_refs->lock); 2913 node = rb_first_cached(&delayed_refs->href_root); 2914 if (!node) { 2915 spin_unlock(&delayed_refs->lock); 2916 goto out; 2917 } 2918 head = rb_entry(node, struct btrfs_delayed_ref_head, 2919 href_node); 2920 refcount_inc(&head->refs); 2921 spin_unlock(&delayed_refs->lock); 2922 2923 /* Mutex was contended, block until it's released and retry. */ 2924 mutex_lock(&head->mutex); 2925 mutex_unlock(&head->mutex); 2926 2927 btrfs_put_delayed_ref_head(head); 2928 cond_resched(); 2929 goto again; 2930 } 2931 out: 2932 return 0; 2933 } 2934 2935 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2936 u64 bytenr, u64 num_bytes, u64 flags, 2937 int level, int is_data) 2938 { 2939 struct btrfs_delayed_extent_op *extent_op; 2940 int ret; 2941 2942 extent_op = btrfs_alloc_delayed_extent_op(); 2943 if (!extent_op) 2944 return -ENOMEM; 2945 2946 extent_op->flags_to_set = flags; 2947 extent_op->update_flags = true; 2948 extent_op->update_key = false; 2949 extent_op->is_data = is_data ? 
true : false; 2950 extent_op->level = level; 2951 2952 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2953 if (ret) 2954 btrfs_free_delayed_extent_op(extent_op); 2955 return ret; 2956 } 2957 2958 static noinline int check_delayed_ref(struct btrfs_root *root, 2959 struct btrfs_path *path, 2960 u64 objectid, u64 offset, u64 bytenr) 2961 { 2962 struct btrfs_delayed_ref_head *head; 2963 struct btrfs_delayed_ref_node *ref; 2964 struct btrfs_delayed_data_ref *data_ref; 2965 struct btrfs_delayed_ref_root *delayed_refs; 2966 struct btrfs_transaction *cur_trans; 2967 struct rb_node *node; 2968 int ret = 0; 2969 2970 spin_lock(&root->fs_info->trans_lock); 2971 cur_trans = root->fs_info->running_transaction; 2972 if (cur_trans) 2973 refcount_inc(&cur_trans->use_count); 2974 spin_unlock(&root->fs_info->trans_lock); 2975 if (!cur_trans) 2976 return 0; 2977 2978 delayed_refs = &cur_trans->delayed_refs; 2979 spin_lock(&delayed_refs->lock); 2980 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 2981 if (!head) { 2982 spin_unlock(&delayed_refs->lock); 2983 btrfs_put_transaction(cur_trans); 2984 return 0; 2985 } 2986 2987 if (!mutex_trylock(&head->mutex)) { 2988 refcount_inc(&head->refs); 2989 spin_unlock(&delayed_refs->lock); 2990 2991 btrfs_release_path(path); 2992 2993 /* 2994 * Mutex was contended, block until it's released and let 2995 * caller try again 2996 */ 2997 mutex_lock(&head->mutex); 2998 mutex_unlock(&head->mutex); 2999 btrfs_put_delayed_ref_head(head); 3000 btrfs_put_transaction(cur_trans); 3001 return -EAGAIN; 3002 } 3003 spin_unlock(&delayed_refs->lock); 3004 3005 spin_lock(&head->lock); 3006 /* 3007 * XXX: We should replace this with a proper search function in the 3008 * future. 3009 */ 3010 for (node = rb_first_cached(&head->ref_tree); node; 3011 node = rb_next(node)) { 3012 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 3013 /* If it's a shared ref we know a cross reference exists */ 3014 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3015 ret = 1; 3016 break; 3017 } 3018 3019 data_ref = btrfs_delayed_node_to_data_ref(ref); 3020 3021 /* 3022 * If our ref doesn't match the one we're currently looking at 3023 * then we have a cross reference. 
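 * A delayed ref against a different root, inode or offset means the extent
 * is shared (or is about to be).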
3024 */ 3025 if (data_ref->root != root->root_key.objectid || 3026 data_ref->objectid != objectid || 3027 data_ref->offset != offset) { 3028 ret = 1; 3029 break; 3030 } 3031 } 3032 spin_unlock(&head->lock); 3033 mutex_unlock(&head->mutex); 3034 btrfs_put_transaction(cur_trans); 3035 return ret; 3036 } 3037 3038 static noinline int check_committed_ref(struct btrfs_root *root, 3039 struct btrfs_path *path, 3040 u64 objectid, u64 offset, u64 bytenr) 3041 { 3042 struct btrfs_fs_info *fs_info = root->fs_info; 3043 struct btrfs_root *extent_root = fs_info->extent_root; 3044 struct extent_buffer *leaf; 3045 struct btrfs_extent_data_ref *ref; 3046 struct btrfs_extent_inline_ref *iref; 3047 struct btrfs_extent_item *ei; 3048 struct btrfs_key key; 3049 u32 item_size; 3050 int type; 3051 int ret; 3052 3053 key.objectid = bytenr; 3054 key.offset = (u64)-1; 3055 key.type = BTRFS_EXTENT_ITEM_KEY; 3056 3057 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3058 if (ret < 0) 3059 goto out; 3060 BUG_ON(ret == 0); /* Corruption */ 3061 3062 ret = -ENOENT; 3063 if (path->slots[0] == 0) 3064 goto out; 3065 3066 path->slots[0]--; 3067 leaf = path->nodes[0]; 3068 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3069 3070 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3071 goto out; 3072 3073 ret = 1; 3074 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3075 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3076 3077 if (item_size != sizeof(*ei) + 3078 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3079 goto out; 3080 3081 if (btrfs_extent_generation(leaf, ei) <= 3082 btrfs_root_last_snapshot(&root->root_item)) 3083 goto out; 3084 3085 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3086 3087 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 3088 if (type != BTRFS_EXTENT_DATA_REF_KEY) 3089 goto out; 3090 3091 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3092 if (btrfs_extent_refs(leaf, ei) != 3093 btrfs_extent_data_ref_count(leaf, ref) || 3094 btrfs_extent_data_ref_root(leaf, ref) != 3095 root->root_key.objectid || 3096 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3097 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3098 goto out; 3099 3100 ret = 0; 3101 out: 3102 return ret; 3103 } 3104 3105 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 3106 u64 bytenr) 3107 { 3108 struct btrfs_path *path; 3109 int ret; 3110 3111 path = btrfs_alloc_path(); 3112 if (!path) 3113 return -ENOMEM; 3114 3115 do { 3116 ret = check_committed_ref(root, path, objectid, 3117 offset, bytenr); 3118 if (ret && ret != -ENOENT) 3119 goto out; 3120 3121 ret = check_delayed_ref(root, path, objectid, offset, bytenr); 3122 } while (ret == -EAGAIN); 3123 3124 out: 3125 btrfs_free_path(path); 3126 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3127 WARN_ON(ret > 0); 3128 return ret; 3129 } 3130 3131 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3132 struct btrfs_root *root, 3133 struct extent_buffer *buf, 3134 int full_backref, int inc) 3135 { 3136 struct btrfs_fs_info *fs_info = root->fs_info; 3137 u64 bytenr; 3138 u64 num_bytes; 3139 u64 parent; 3140 u64 ref_root; 3141 u32 nritems; 3142 struct btrfs_key key; 3143 struct btrfs_file_extent_item *fi; 3144 struct btrfs_ref generic_ref = { 0 }; 3145 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); 3146 int i; 3147 int action; 3148 int level; 3149 int ret = 0; 3150 3151 if (btrfs_is_testing(fs_info)) 3152 
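/* sanity tests run on a dummy fs_info, there are no refs to update */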
return 0; 3153 3154 ref_root = btrfs_header_owner(buf); 3155 nritems = btrfs_header_nritems(buf); 3156 level = btrfs_header_level(buf); 3157 3158 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3159 return 0; 3160 3161 if (full_backref) 3162 parent = buf->start; 3163 else 3164 parent = 0; 3165 if (inc) 3166 action = BTRFS_ADD_DELAYED_REF; 3167 else 3168 action = BTRFS_DROP_DELAYED_REF; 3169 3170 for (i = 0; i < nritems; i++) { 3171 if (level == 0) { 3172 btrfs_item_key_to_cpu(buf, &key, i); 3173 if (key.type != BTRFS_EXTENT_DATA_KEY) 3174 continue; 3175 fi = btrfs_item_ptr(buf, i, 3176 struct btrfs_file_extent_item); 3177 if (btrfs_file_extent_type(buf, fi) == 3178 BTRFS_FILE_EXTENT_INLINE) 3179 continue; 3180 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3181 if (bytenr == 0) 3182 continue; 3183 3184 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3185 key.offset -= btrfs_file_extent_offset(buf, fi); 3186 btrfs_init_generic_ref(&generic_ref, action, bytenr, 3187 num_bytes, parent); 3188 generic_ref.real_root = root->root_key.objectid; 3189 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, 3190 key.offset); 3191 generic_ref.skip_qgroup = for_reloc; 3192 if (inc) 3193 ret = btrfs_inc_extent_ref(trans, &generic_ref); 3194 else 3195 ret = btrfs_free_extent(trans, &generic_ref); 3196 if (ret) 3197 goto fail; 3198 } else { 3199 bytenr = btrfs_node_blockptr(buf, i); 3200 num_bytes = fs_info->nodesize; 3201 btrfs_init_generic_ref(&generic_ref, action, bytenr, 3202 num_bytes, parent); 3203 generic_ref.real_root = root->root_key.objectid; 3204 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); 3205 generic_ref.skip_qgroup = for_reloc; 3206 if (inc) 3207 ret = btrfs_inc_extent_ref(trans, &generic_ref); 3208 else 3209 ret = btrfs_free_extent(trans, &generic_ref); 3210 if (ret) 3211 goto fail; 3212 } 3213 } 3214 return 0; 3215 fail: 3216 return ret; 3217 } 3218 3219 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3220 struct extent_buffer *buf, int full_backref) 3221 { 3222 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3223 } 3224 3225 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3226 struct extent_buffer *buf, int full_backref) 3227 { 3228 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3229 } 3230 3231 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3232 struct btrfs_path *path, 3233 struct btrfs_block_group_cache *cache) 3234 { 3235 struct btrfs_fs_info *fs_info = trans->fs_info; 3236 int ret; 3237 struct btrfs_root *extent_root = fs_info->extent_root; 3238 unsigned long bi; 3239 struct extent_buffer *leaf; 3240 3241 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3242 if (ret) { 3243 if (ret > 0) 3244 ret = -ENOENT; 3245 goto fail; 3246 } 3247 3248 leaf = path->nodes[0]; 3249 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3250 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3251 btrfs_mark_buffer_dirty(leaf); 3252 fail: 3253 btrfs_release_path(path); 3254 return ret; 3255 3256 } 3257 3258 static struct btrfs_block_group_cache *next_block_group( 3259 struct btrfs_block_group_cache *cache) 3260 { 3261 struct btrfs_fs_info *fs_info = cache->fs_info; 3262 struct rb_node *node; 3263 3264 spin_lock(&fs_info->block_group_cache_lock); 3265 3266 /* If our block group was removed, we need a full search. 
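 * The search starts at the first byte past the removed group, since its rb
 * node is no longer linked into the block group cache tree.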
*/ 3267 if (RB_EMPTY_NODE(&cache->cache_node)) { 3268 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3269 3270 spin_unlock(&fs_info->block_group_cache_lock); 3271 btrfs_put_block_group(cache); 3272 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3273 } 3274 node = rb_next(&cache->cache_node); 3275 btrfs_put_block_group(cache); 3276 if (node) { 3277 cache = rb_entry(node, struct btrfs_block_group_cache, 3278 cache_node); 3279 btrfs_get_block_group(cache); 3280 } else 3281 cache = NULL; 3282 spin_unlock(&fs_info->block_group_cache_lock); 3283 return cache; 3284 } 3285 3286 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3287 struct btrfs_trans_handle *trans, 3288 struct btrfs_path *path) 3289 { 3290 struct btrfs_fs_info *fs_info = block_group->fs_info; 3291 struct btrfs_root *root = fs_info->tree_root; 3292 struct inode *inode = NULL; 3293 struct extent_changeset *data_reserved = NULL; 3294 u64 alloc_hint = 0; 3295 int dcs = BTRFS_DC_ERROR; 3296 u64 num_pages = 0; 3297 int retries = 0; 3298 int ret = 0; 3299 3300 /* 3301 * If this block group is smaller than 100 megs don't bother caching the 3302 * block group. 3303 */ 3304 if (block_group->key.offset < (100 * SZ_1M)) { 3305 spin_lock(&block_group->lock); 3306 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3307 spin_unlock(&block_group->lock); 3308 return 0; 3309 } 3310 3311 if (trans->aborted) 3312 return 0; 3313 again: 3314 inode = lookup_free_space_inode(block_group, path); 3315 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3316 ret = PTR_ERR(inode); 3317 btrfs_release_path(path); 3318 goto out; 3319 } 3320 3321 if (IS_ERR(inode)) { 3322 BUG_ON(retries); 3323 retries++; 3324 3325 if (block_group->ro) 3326 goto out_free; 3327 3328 ret = create_free_space_inode(trans, block_group, path); 3329 if (ret) 3330 goto out_free; 3331 goto again; 3332 } 3333 3334 /* 3335 * We want to set the generation to 0, that way if anything goes wrong 3336 * from here on out we know not to trust this cache when we load up next 3337 * time. 3338 */ 3339 BTRFS_I(inode)->generation = 0; 3340 ret = btrfs_update_inode(trans, root, inode); 3341 if (ret) { 3342 /* 3343 * So theoretically we could recover from this, simply set the 3344 * super cache generation to 0 so we know to invalidate the 3345 * cache, but then we'd have to keep track of the block groups 3346 * that fail this way so we know we _have_ to reset this cache 3347 * before the next commit or risk reading stale cache. So to 3348 * limit our exposure to horrible edge cases lets just abort the 3349 * transaction, this only happens in really bad situations 3350 * anyway. 
3351 */ 3352 btrfs_abort_transaction(trans, ret); 3353 goto out_put; 3354 } 3355 WARN_ON(ret); 3356 3357 /* We've already setup this transaction, go ahead and exit */ 3358 if (block_group->cache_generation == trans->transid && 3359 i_size_read(inode)) { 3360 dcs = BTRFS_DC_SETUP; 3361 goto out_put; 3362 } 3363 3364 if (i_size_read(inode) > 0) { 3365 ret = btrfs_check_trunc_cache_free_space(fs_info, 3366 &fs_info->global_block_rsv); 3367 if (ret) 3368 goto out_put; 3369 3370 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3371 if (ret) 3372 goto out_put; 3373 } 3374 3375 spin_lock(&block_group->lock); 3376 if (block_group->cached != BTRFS_CACHE_FINISHED || 3377 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3378 /* 3379 * don't bother trying to write stuff out _if_ 3380 * a) we're not cached, 3381 * b) we're with nospace_cache mount option, 3382 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3383 */ 3384 dcs = BTRFS_DC_WRITTEN; 3385 spin_unlock(&block_group->lock); 3386 goto out_put; 3387 } 3388 spin_unlock(&block_group->lock); 3389 3390 /* 3391 * We hit an ENOSPC when setting up the cache in this transaction, just 3392 * skip doing the setup, we've already cleared the cache so we're safe. 3393 */ 3394 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3395 ret = -ENOSPC; 3396 goto out_put; 3397 } 3398 3399 /* 3400 * Try to preallocate enough space based on how big the block group is. 3401 * Keep in mind this has to include any pinned space which could end up 3402 * taking up quite a bit since it's not folded into the other space 3403 * cache. 3404 */ 3405 num_pages = div_u64(block_group->key.offset, SZ_256M); 3406 if (!num_pages) 3407 num_pages = 1; 3408 3409 num_pages *= 16; 3410 num_pages *= PAGE_SIZE; 3411 3412 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); 3413 if (ret) 3414 goto out_put; 3415 3416 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3417 num_pages, num_pages, 3418 &alloc_hint); 3419 /* 3420 * Our cache requires contiguous chunks so that we don't modify a bunch 3421 * of metadata or split extents when writing the cache out, which means 3422 * we can enospc if we are heavily fragmented in addition to just normal 3423 * out of space conditions. So if we hit this just skip setting up any 3424 * other block groups for this transaction, maybe we'll unpin enough 3425 * space the next time around. 
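 * The preallocation above asks for 16 pages per 256MiB of block group, so
 * for example a 1GiB block group with 4KiB pages preallocates 64 pages,
 * i.e. 256KiB of cache space.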
3426 */ 3427 if (!ret) 3428 dcs = BTRFS_DC_SETUP; 3429 else if (ret == -ENOSPC) 3430 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3431 3432 out_put: 3433 iput(inode); 3434 out_free: 3435 btrfs_release_path(path); 3436 out: 3437 spin_lock(&block_group->lock); 3438 if (!ret && dcs == BTRFS_DC_SETUP) 3439 block_group->cache_generation = trans->transid; 3440 block_group->disk_cache_state = dcs; 3441 spin_unlock(&block_group->lock); 3442 3443 extent_changeset_free(data_reserved); 3444 return ret; 3445 } 3446 3447 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 3448 { 3449 struct btrfs_fs_info *fs_info = trans->fs_info; 3450 struct btrfs_block_group_cache *cache, *tmp; 3451 struct btrfs_transaction *cur_trans = trans->transaction; 3452 struct btrfs_path *path; 3453 3454 if (list_empty(&cur_trans->dirty_bgs) || 3455 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3456 return 0; 3457 3458 path = btrfs_alloc_path(); 3459 if (!path) 3460 return -ENOMEM; 3461 3462 /* Could add new block groups, use _safe just in case */ 3463 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3464 dirty_list) { 3465 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3466 cache_save_setup(cache, trans, path); 3467 } 3468 3469 btrfs_free_path(path); 3470 return 0; 3471 } 3472 3473 /* 3474 * transaction commit does final block group cache writeback during a 3475 * critical section where nothing is allowed to change the FS. This is 3476 * required in order for the cache to actually match the block group, 3477 * but can introduce a lot of latency into the commit. 3478 * 3479 * So, btrfs_start_dirty_block_groups is here to kick off block group 3480 * cache IO. There's a chance we'll have to redo some of it if the 3481 * block group changes again during the commit, but it greatly reduces 3482 * the commit latency by getting rid of the easy block groups while 3483 * we're still allowing others to join the commit. 3484 */ 3485 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 3486 { 3487 struct btrfs_fs_info *fs_info = trans->fs_info; 3488 struct btrfs_block_group_cache *cache; 3489 struct btrfs_transaction *cur_trans = trans->transaction; 3490 int ret = 0; 3491 int should_put; 3492 struct btrfs_path *path = NULL; 3493 LIST_HEAD(dirty); 3494 struct list_head *io = &cur_trans->io_bgs; 3495 int num_started = 0; 3496 int loops = 0; 3497 3498 spin_lock(&cur_trans->dirty_bgs_lock); 3499 if (list_empty(&cur_trans->dirty_bgs)) { 3500 spin_unlock(&cur_trans->dirty_bgs_lock); 3501 return 0; 3502 } 3503 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3504 spin_unlock(&cur_trans->dirty_bgs_lock); 3505 3506 again: 3507 /* 3508 * make sure all the block groups on our dirty list actually 3509 * exist 3510 */ 3511 btrfs_create_pending_block_groups(trans); 3512 3513 if (!path) { 3514 path = btrfs_alloc_path(); 3515 if (!path) 3516 return -ENOMEM; 3517 } 3518 3519 /* 3520 * cache_write_mutex is here only to save us from balance or automatic 3521 * removal of empty block groups deleting this block group while we are 3522 * writing out the cache 3523 */ 3524 mutex_lock(&trans->transaction->cache_write_mutex); 3525 while (!list_empty(&dirty)) { 3526 bool drop_reserve = true; 3527 3528 cache = list_first_entry(&dirty, 3529 struct btrfs_block_group_cache, 3530 dirty_list); 3531 /* 3532 * this can happen if something re-dirties a block 3533 * group that is already under IO. 
Just wait for it to 3534 * finish and then do it all again 3535 */ 3536 if (!list_empty(&cache->io_list)) { 3537 list_del_init(&cache->io_list); 3538 btrfs_wait_cache_io(trans, cache, path); 3539 btrfs_put_block_group(cache); 3540 } 3541 3542 3543 /* 3544 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3545 * if it should update the cache_state. Don't delete 3546 * until after we wait. 3547 * 3548 * Since we're not running in the commit critical section 3549 * we need the dirty_bgs_lock to protect from update_block_group 3550 */ 3551 spin_lock(&cur_trans->dirty_bgs_lock); 3552 list_del_init(&cache->dirty_list); 3553 spin_unlock(&cur_trans->dirty_bgs_lock); 3554 3555 should_put = 1; 3556 3557 cache_save_setup(cache, trans, path); 3558 3559 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3560 cache->io_ctl.inode = NULL; 3561 ret = btrfs_write_out_cache(trans, cache, path); 3562 if (ret == 0 && cache->io_ctl.inode) { 3563 num_started++; 3564 should_put = 0; 3565 3566 /* 3567 * The cache_write_mutex is protecting the 3568 * io_list, also refer to the definition of 3569 * btrfs_transaction::io_bgs for more details 3570 */ 3571 list_add_tail(&cache->io_list, io); 3572 } else { 3573 /* 3574 * if we failed to write the cache, the 3575 * generation will be bad and life goes on 3576 */ 3577 ret = 0; 3578 } 3579 } 3580 if (!ret) { 3581 ret = write_one_cache_group(trans, path, cache); 3582 /* 3583 * Our block group might still be attached to the list 3584 * of new block groups in the transaction handle of some 3585 * other task (struct btrfs_trans_handle->new_bgs). This 3586 * means its block group item isn't yet in the extent 3587 * tree. If this happens ignore the error, as we will 3588 * try again later in the critical section of the 3589 * transaction commit. 3590 */ 3591 if (ret == -ENOENT) { 3592 ret = 0; 3593 spin_lock(&cur_trans->dirty_bgs_lock); 3594 if (list_empty(&cache->dirty_list)) { 3595 list_add_tail(&cache->dirty_list, 3596 &cur_trans->dirty_bgs); 3597 btrfs_get_block_group(cache); 3598 drop_reserve = false; 3599 } 3600 spin_unlock(&cur_trans->dirty_bgs_lock); 3601 } else if (ret) { 3602 btrfs_abort_transaction(trans, ret); 3603 } 3604 } 3605 3606 /* if it's not on the io list, we need to put the block group */ 3607 if (should_put) 3608 btrfs_put_block_group(cache); 3609 if (drop_reserve) 3610 btrfs_delayed_refs_rsv_release(fs_info, 1); 3611 3612 if (ret) 3613 break; 3614 3615 /* 3616 * Avoid blocking other tasks for too long. It might even save 3617 * us from writing caches for block groups that are going to be 3618 * removed. 3619 */ 3620 mutex_unlock(&trans->transaction->cache_write_mutex); 3621 mutex_lock(&trans->transaction->cache_write_mutex); 3622 } 3623 mutex_unlock(&trans->transaction->cache_write_mutex); 3624 3625 /* 3626 * go through delayed refs for all the stuff we've just kicked off 3627 * and then loop back (just once) 3628 */ 3629 ret = btrfs_run_delayed_refs(trans, 0); 3630 if (!ret && loops == 0) { 3631 loops++; 3632 spin_lock(&cur_trans->dirty_bgs_lock); 3633 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3634 /* 3635 * dirty_bgs_lock protects us from concurrent block group 3636 * deletes too (not just cache_write_mutex). 
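 * We only splice the list back and retry once (the loops check above);
 * anything still dirty after that is handled during the commit critical
 * section by btrfs_write_dirty_block_groups().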
3637 */ 3638 if (!list_empty(&dirty)) { 3639 spin_unlock(&cur_trans->dirty_bgs_lock); 3640 goto again; 3641 } 3642 spin_unlock(&cur_trans->dirty_bgs_lock); 3643 } else if (ret < 0) { 3644 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3645 } 3646 3647 btrfs_free_path(path); 3648 return ret; 3649 } 3650 3651 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 3652 { 3653 struct btrfs_fs_info *fs_info = trans->fs_info; 3654 struct btrfs_block_group_cache *cache; 3655 struct btrfs_transaction *cur_trans = trans->transaction; 3656 int ret = 0; 3657 int should_put; 3658 struct btrfs_path *path; 3659 struct list_head *io = &cur_trans->io_bgs; 3660 int num_started = 0; 3661 3662 path = btrfs_alloc_path(); 3663 if (!path) 3664 return -ENOMEM; 3665 3666 /* 3667 * Even though we are in the critical section of the transaction commit, 3668 * we can still have concurrent tasks adding elements to this 3669 * transaction's list of dirty block groups. These tasks correspond to 3670 * endio free space workers started when writeback finishes for a 3671 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3672 * allocate new block groups as a result of COWing nodes of the root 3673 * tree when updating the free space inode. The writeback for the space 3674 * caches is triggered by an earlier call to 3675 * btrfs_start_dirty_block_groups() and iterations of the following 3676 * loop. 3677 * Also we want to do the cache_save_setup first and then run the 3678 * delayed refs to make sure we have the best chance at doing this all 3679 * in one shot. 3680 */ 3681 spin_lock(&cur_trans->dirty_bgs_lock); 3682 while (!list_empty(&cur_trans->dirty_bgs)) { 3683 cache = list_first_entry(&cur_trans->dirty_bgs, 3684 struct btrfs_block_group_cache, 3685 dirty_list); 3686 3687 /* 3688 * this can happen if cache_save_setup re-dirties a block 3689 * group that is already under IO. Just wait for it to 3690 * finish and then do it all again 3691 */ 3692 if (!list_empty(&cache->io_list)) { 3693 spin_unlock(&cur_trans->dirty_bgs_lock); 3694 list_del_init(&cache->io_list); 3695 btrfs_wait_cache_io(trans, cache, path); 3696 btrfs_put_block_group(cache); 3697 spin_lock(&cur_trans->dirty_bgs_lock); 3698 } 3699 3700 /* 3701 * don't remove from the dirty list until after we've waited 3702 * on any pending IO 3703 */ 3704 list_del_init(&cache->dirty_list); 3705 spin_unlock(&cur_trans->dirty_bgs_lock); 3706 should_put = 1; 3707 3708 cache_save_setup(cache, trans, path); 3709 3710 if (!ret) 3711 ret = btrfs_run_delayed_refs(trans, 3712 (unsigned long) -1); 3713 3714 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3715 cache->io_ctl.inode = NULL; 3716 ret = btrfs_write_out_cache(trans, cache, path); 3717 if (ret == 0 && cache->io_ctl.inode) { 3718 num_started++; 3719 should_put = 0; 3720 list_add_tail(&cache->io_list, io); 3721 } else { 3722 /* 3723 * if we failed to write the cache, the 3724 * generation will be bad and life goes on 3725 */ 3726 ret = 0; 3727 } 3728 } 3729 if (!ret) { 3730 ret = write_one_cache_group(trans, path, cache); 3731 /* 3732 * One of the free space endio workers might have 3733 * created a new block group while updating a free space 3734 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3735 * and hasn't released its transaction handle yet, in 3736 * which case the new block group is still attached to 3737 * its transaction handle and its creation has not 3738 * finished yet (no block group item in the extent tree 3739 * yet, etc). 
If this is the case, wait for all free 3740 * space endio workers to finish and retry. This is a 3741 * a very rare case so no need for a more efficient and 3742 * complex approach. 3743 */ 3744 if (ret == -ENOENT) { 3745 wait_event(cur_trans->writer_wait, 3746 atomic_read(&cur_trans->num_writers) == 1); 3747 ret = write_one_cache_group(trans, path, cache); 3748 } 3749 if (ret) 3750 btrfs_abort_transaction(trans, ret); 3751 } 3752 3753 /* if its not on the io list, we need to put the block group */ 3754 if (should_put) 3755 btrfs_put_block_group(cache); 3756 btrfs_delayed_refs_rsv_release(fs_info, 1); 3757 spin_lock(&cur_trans->dirty_bgs_lock); 3758 } 3759 spin_unlock(&cur_trans->dirty_bgs_lock); 3760 3761 /* 3762 * Refer to the definition of io_bgs member for details why it's safe 3763 * to use it without any locking 3764 */ 3765 while (!list_empty(io)) { 3766 cache = list_first_entry(io, struct btrfs_block_group_cache, 3767 io_list); 3768 list_del_init(&cache->io_list); 3769 btrfs_wait_cache_io(trans, cache, path); 3770 btrfs_put_block_group(cache); 3771 } 3772 3773 btrfs_free_path(path); 3774 return ret; 3775 } 3776 3777 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3778 { 3779 struct btrfs_block_group_cache *block_group; 3780 int readonly = 0; 3781 3782 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3783 if (!block_group || block_group->ro) 3784 readonly = 1; 3785 if (block_group) 3786 btrfs_put_block_group(block_group); 3787 return readonly; 3788 } 3789 3790 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3791 { 3792 struct btrfs_block_group_cache *bg; 3793 bool ret = true; 3794 3795 bg = btrfs_lookup_block_group(fs_info, bytenr); 3796 if (!bg) 3797 return false; 3798 3799 spin_lock(&bg->lock); 3800 if (bg->ro) 3801 ret = false; 3802 else 3803 atomic_inc(&bg->nocow_writers); 3804 spin_unlock(&bg->lock); 3805 3806 /* no put on block group, done by btrfs_dec_nocow_writers */ 3807 if (!ret) 3808 btrfs_put_block_group(bg); 3809 3810 return ret; 3811 3812 } 3813 3814 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3815 { 3816 struct btrfs_block_group_cache *bg; 3817 3818 bg = btrfs_lookup_block_group(fs_info, bytenr); 3819 ASSERT(bg); 3820 if (atomic_dec_and_test(&bg->nocow_writers)) 3821 wake_up_var(&bg->nocow_writers); 3822 /* 3823 * Once for our lookup and once for the lookup done by a previous call 3824 * to btrfs_inc_nocow_writers() 3825 */ 3826 btrfs_put_block_group(bg); 3827 btrfs_put_block_group(bg); 3828 } 3829 3830 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3831 { 3832 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 3833 } 3834 3835 static const char *alloc_name(u64 flags) 3836 { 3837 switch (flags) { 3838 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3839 return "mixed"; 3840 case BTRFS_BLOCK_GROUP_METADATA: 3841 return "metadata"; 3842 case BTRFS_BLOCK_GROUP_DATA: 3843 return "data"; 3844 case BTRFS_BLOCK_GROUP_SYSTEM: 3845 return "system"; 3846 default: 3847 WARN_ON(1); 3848 return "invalid-combination"; 3849 }; 3850 } 3851 3852 static int create_space_info(struct btrfs_fs_info *info, u64 flags) 3853 { 3854 3855 struct btrfs_space_info *space_info; 3856 int i; 3857 int ret; 3858 3859 space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 3860 if (!space_info) 3861 return -ENOMEM; 3862 3863 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, 3864 GFP_KERNEL); 3865 if (ret) { 3866 kfree(space_info); 3867 return ret; 3868 } 3869 3870 for 
(i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3871 INIT_LIST_HEAD(&space_info->block_groups[i]); 3872 init_rwsem(&space_info->groups_sem); 3873 spin_lock_init(&space_info->lock); 3874 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3875 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3876 init_waitqueue_head(&space_info->wait); 3877 INIT_LIST_HEAD(&space_info->ro_bgs); 3878 INIT_LIST_HEAD(&space_info->tickets); 3879 INIT_LIST_HEAD(&space_info->priority_tickets); 3880 3881 ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, 3882 info->space_info_kobj, "%s", 3883 alloc_name(space_info->flags)); 3884 if (ret) { 3885 percpu_counter_destroy(&space_info->total_bytes_pinned); 3886 kfree(space_info); 3887 return ret; 3888 } 3889 3890 list_add_rcu(&space_info->list, &info->space_info); 3891 if (flags & BTRFS_BLOCK_GROUP_DATA) 3892 info->data_sinfo = space_info; 3893 3894 return ret; 3895 } 3896 3897 static void update_space_info(struct btrfs_fs_info *info, u64 flags, 3898 u64 total_bytes, u64 bytes_used, 3899 u64 bytes_readonly, 3900 struct btrfs_space_info **space_info) 3901 { 3902 struct btrfs_space_info *found; 3903 int factor; 3904 3905 factor = btrfs_bg_type_to_factor(flags); 3906 3907 found = __find_space_info(info, flags); 3908 ASSERT(found); 3909 spin_lock(&found->lock); 3910 found->total_bytes += total_bytes; 3911 found->disk_total += total_bytes * factor; 3912 found->bytes_used += bytes_used; 3913 found->disk_used += bytes_used * factor; 3914 found->bytes_readonly += bytes_readonly; 3915 if (total_bytes > 0) 3916 found->full = 0; 3917 space_info_add_new_bytes(info, found, total_bytes - 3918 bytes_used - bytes_readonly); 3919 spin_unlock(&found->lock); 3920 *space_info = found; 3921 } 3922 3923 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3924 { 3925 u64 extra_flags = chunk_to_extended(flags) & 3926 BTRFS_EXTENDED_PROFILE_MASK; 3927 3928 write_seqlock(&fs_info->profiles_lock); 3929 if (flags & BTRFS_BLOCK_GROUP_DATA) 3930 fs_info->avail_data_alloc_bits |= extra_flags; 3931 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3932 fs_info->avail_metadata_alloc_bits |= extra_flags; 3933 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3934 fs_info->avail_system_alloc_bits |= extra_flags; 3935 write_sequnlock(&fs_info->profiles_lock); 3936 } 3937 3938 /* 3939 * returns target flags in extended format or 0 if restripe for this 3940 * chunk_type is not in progress 3941 * 3942 * should be called with balance_lock held 3943 */ 3944 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3945 { 3946 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3947 u64 target = 0; 3948 3949 if (!bctl) 3950 return 0; 3951 3952 if (flags & BTRFS_BLOCK_GROUP_DATA && 3953 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3954 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3955 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3956 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3957 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3958 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3959 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3960 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3961 } 3962 3963 return target; 3964 } 3965 3966 /* 3967 * @flags: available profiles in extended format (see ctree.h) 3968 * 3969 * Returns reduced profile in chunk format. If profile changing is in 3970 * progress (either running or paused) picks the target profile (if it's 3971 * already available), otherwise falls back to plain reducing. 
3972 */ 3973 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 3974 { 3975 u64 num_devices = fs_info->fs_devices->rw_devices; 3976 u64 target; 3977 u64 raid_type; 3978 u64 allowed = 0; 3979 3980 /* 3981 * see if restripe for this chunk_type is in progress, if so 3982 * try to reduce to the target profile 3983 */ 3984 spin_lock(&fs_info->balance_lock); 3985 target = get_restripe_target(fs_info, flags); 3986 if (target) { 3987 /* pick target profile only if it's already available */ 3988 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3989 spin_unlock(&fs_info->balance_lock); 3990 return extended_to_chunk(target); 3991 } 3992 } 3993 spin_unlock(&fs_info->balance_lock); 3994 3995 /* First, mask out the RAID levels which aren't possible */ 3996 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 3997 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 3998 allowed |= btrfs_raid_array[raid_type].bg_flag; 3999 } 4000 allowed &= flags; 4001 4002 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4003 allowed = BTRFS_BLOCK_GROUP_RAID6; 4004 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4005 allowed = BTRFS_BLOCK_GROUP_RAID5; 4006 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4007 allowed = BTRFS_BLOCK_GROUP_RAID10; 4008 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4009 allowed = BTRFS_BLOCK_GROUP_RAID1; 4010 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4011 allowed = BTRFS_BLOCK_GROUP_RAID0; 4012 4013 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4014 4015 return extended_to_chunk(flags | allowed); 4016 } 4017 4018 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 4019 { 4020 unsigned seq; 4021 u64 flags; 4022 4023 do { 4024 flags = orig_flags; 4025 seq = read_seqbegin(&fs_info->profiles_lock); 4026 4027 if (flags & BTRFS_BLOCK_GROUP_DATA) 4028 flags |= fs_info->avail_data_alloc_bits; 4029 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4030 flags |= fs_info->avail_system_alloc_bits; 4031 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4032 flags |= fs_info->avail_metadata_alloc_bits; 4033 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4034 4035 return btrfs_reduce_alloc_profile(fs_info, flags); 4036 } 4037 4038 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 4039 { 4040 struct btrfs_fs_info *fs_info = root->fs_info; 4041 u64 flags; 4042 u64 ret; 4043 4044 if (data) 4045 flags = BTRFS_BLOCK_GROUP_DATA; 4046 else if (root == fs_info->chunk_root) 4047 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4048 else 4049 flags = BTRFS_BLOCK_GROUP_METADATA; 4050 4051 ret = get_alloc_profile(fs_info, flags); 4052 return ret; 4053 } 4054 4055 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) 4056 { 4057 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); 4058 } 4059 4060 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) 4061 { 4062 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4063 } 4064 4065 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) 4066 { 4067 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4068 } 4069 4070 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4071 bool may_use_included) 4072 { 4073 ASSERT(s_info); 4074 return s_info->bytes_used + s_info->bytes_reserved + 4075 s_info->bytes_pinned + s_info->bytes_readonly + 4076 (may_use_included ? 
s_info->bytes_may_use : 0); 4077 } 4078 4079 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 4080 { 4081 struct btrfs_root *root = inode->root; 4082 struct btrfs_fs_info *fs_info = root->fs_info; 4083 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 4084 u64 used; 4085 int ret = 0; 4086 int need_commit = 2; 4087 int have_pinned_space; 4088 4089 /* make sure bytes are sectorsize aligned */ 4090 bytes = ALIGN(bytes, fs_info->sectorsize); 4091 4092 if (btrfs_is_free_space_inode(inode)) { 4093 need_commit = 0; 4094 ASSERT(current->journal_info); 4095 } 4096 4097 again: 4098 /* make sure we have enough space to handle the data first */ 4099 spin_lock(&data_sinfo->lock); 4100 used = btrfs_space_info_used(data_sinfo, true); 4101 4102 if (used + bytes > data_sinfo->total_bytes) { 4103 struct btrfs_trans_handle *trans; 4104 4105 /* 4106 * if we don't have enough free bytes in this space then we need 4107 * to alloc a new chunk. 4108 */ 4109 if (!data_sinfo->full) { 4110 u64 alloc_target; 4111 4112 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4113 spin_unlock(&data_sinfo->lock); 4114 4115 alloc_target = btrfs_data_alloc_profile(fs_info); 4116 /* 4117 * It is ugly that we don't call nolock join 4118 * transaction for the free space inode case here. 4119 * But it is safe because we only do the data space 4120 * reservation for the free space cache in the 4121 * transaction context, the common join transaction 4122 * just increase the counter of the current transaction 4123 * handler, doesn't try to acquire the trans_lock of 4124 * the fs. 4125 */ 4126 trans = btrfs_join_transaction(root); 4127 if (IS_ERR(trans)) 4128 return PTR_ERR(trans); 4129 4130 ret = do_chunk_alloc(trans, alloc_target, 4131 CHUNK_ALLOC_NO_FORCE); 4132 btrfs_end_transaction(trans); 4133 if (ret < 0) { 4134 if (ret != -ENOSPC) 4135 return ret; 4136 else { 4137 have_pinned_space = 1; 4138 goto commit_trans; 4139 } 4140 } 4141 4142 goto again; 4143 } 4144 4145 /* 4146 * If we don't have enough pinned space to deal with this 4147 * allocation, and no removed chunk in current transaction, 4148 * don't bother committing the transaction. 4149 */ 4150 have_pinned_space = __percpu_counter_compare( 4151 &data_sinfo->total_bytes_pinned, 4152 used + bytes - data_sinfo->total_bytes, 4153 BTRFS_TOTAL_BYTES_PINNED_BATCH); 4154 spin_unlock(&data_sinfo->lock); 4155 4156 /* commit the current transaction and try again */ 4157 commit_trans: 4158 if (need_commit) { 4159 need_commit--; 4160 4161 if (need_commit > 0) { 4162 btrfs_start_delalloc_roots(fs_info, -1); 4163 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, 4164 (u64)-1); 4165 } 4166 4167 trans = btrfs_join_transaction(root); 4168 if (IS_ERR(trans)) 4169 return PTR_ERR(trans); 4170 if (have_pinned_space >= 0 || 4171 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4172 &trans->transaction->flags) || 4173 need_commit > 0) { 4174 ret = btrfs_commit_transaction(trans); 4175 if (ret) 4176 return ret; 4177 /* 4178 * The cleaner kthread might still be doing iput 4179 * operations. Wait for it to finish so that 4180 * more space is released. We don't need to 4181 * explicitly run the delayed iputs here because 4182 * the commit_transaction would have woken up 4183 * the cleaner. 
4184 */ 4185 ret = btrfs_wait_on_delayed_iputs(fs_info); 4186 if (ret) 4187 return ret; 4188 goto again; 4189 } else { 4190 btrfs_end_transaction(trans); 4191 } 4192 } 4193 4194 trace_btrfs_space_reservation(fs_info, 4195 "space_info:enospc", 4196 data_sinfo->flags, bytes, 1); 4197 return -ENOSPC; 4198 } 4199 update_bytes_may_use(data_sinfo, bytes); 4200 trace_btrfs_space_reservation(fs_info, "space_info", 4201 data_sinfo->flags, bytes, 1); 4202 spin_unlock(&data_sinfo->lock); 4203 4204 return 0; 4205 } 4206 4207 int btrfs_check_data_free_space(struct inode *inode, 4208 struct extent_changeset **reserved, u64 start, u64 len) 4209 { 4210 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4211 int ret; 4212 4213 /* align the range */ 4214 len = round_up(start + len, fs_info->sectorsize) - 4215 round_down(start, fs_info->sectorsize); 4216 start = round_down(start, fs_info->sectorsize); 4217 4218 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); 4219 if (ret < 0) 4220 return ret; 4221 4222 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ 4223 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); 4224 if (ret < 0) 4225 btrfs_free_reserved_data_space_noquota(inode, start, len); 4226 else 4227 ret = 0; 4228 return ret; 4229 } 4230 4231 /* 4232 * Called if we need to clear a data reservation for this inode 4233 * Normally in a error case. 4234 * 4235 * This one will *NOT* use accurate qgroup reserved space API, just for case 4236 * which we can't sleep and is sure it won't affect qgroup reserved space. 4237 * Like clear_bit_hook(). 4238 */ 4239 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4240 u64 len) 4241 { 4242 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4243 struct btrfs_space_info *data_sinfo; 4244 4245 /* Make sure the range is aligned to sectorsize */ 4246 len = round_up(start + len, fs_info->sectorsize) - 4247 round_down(start, fs_info->sectorsize); 4248 start = round_down(start, fs_info->sectorsize); 4249 4250 data_sinfo = fs_info->data_sinfo; 4251 spin_lock(&data_sinfo->lock); 4252 update_bytes_may_use(data_sinfo, -len); 4253 trace_btrfs_space_reservation(fs_info, "space_info", 4254 data_sinfo->flags, len, 0); 4255 spin_unlock(&data_sinfo->lock); 4256 } 4257 4258 /* 4259 * Called if we need to clear a data reservation for this inode 4260 * Normally in a error case. 4261 * 4262 * This one will handle the per-inode data rsv map for accurate reserved 4263 * space framework. 
4264 */ 4265 void btrfs_free_reserved_data_space(struct inode *inode, 4266 struct extent_changeset *reserved, u64 start, u64 len) 4267 { 4268 struct btrfs_root *root = BTRFS_I(inode)->root; 4269 4270 /* Make sure the range is aligned to sectorsize */ 4271 len = round_up(start + len, root->fs_info->sectorsize) - 4272 round_down(start, root->fs_info->sectorsize); 4273 start = round_down(start, root->fs_info->sectorsize); 4274 4275 btrfs_free_reserved_data_space_noquota(inode, start, len); 4276 btrfs_qgroup_free_data(inode, reserved, start, len); 4277 } 4278 4279 static void force_metadata_allocation(struct btrfs_fs_info *info) 4280 { 4281 struct list_head *head = &info->space_info; 4282 struct btrfs_space_info *found; 4283 4284 rcu_read_lock(); 4285 list_for_each_entry_rcu(found, head, list) { 4286 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4287 found->force_alloc = CHUNK_ALLOC_FORCE; 4288 } 4289 rcu_read_unlock(); 4290 } 4291 4292 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4293 { 4294 return (global->size << 1); 4295 } 4296 4297 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4298 struct btrfs_space_info *sinfo, int force) 4299 { 4300 u64 bytes_used = btrfs_space_info_used(sinfo, false); 4301 u64 thresh; 4302 4303 if (force == CHUNK_ALLOC_FORCE) 4304 return 1; 4305 4306 /* 4307 * in limited mode, we want to have some free space up to 4308 * about 1% of the FS size. 4309 */ 4310 if (force == CHUNK_ALLOC_LIMITED) { 4311 thresh = btrfs_super_total_bytes(fs_info->super_copy); 4312 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4313 4314 if (sinfo->total_bytes - bytes_used < thresh) 4315 return 1; 4316 } 4317 4318 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 4319 return 0; 4320 return 1; 4321 } 4322 4323 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4324 { 4325 u64 num_dev; 4326 4327 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4328 BTRFS_BLOCK_GROUP_RAID0 | 4329 BTRFS_BLOCK_GROUP_RAID5 | 4330 BTRFS_BLOCK_GROUP_RAID6)) 4331 num_dev = fs_info->fs_devices->rw_devices; 4332 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4333 num_dev = 2; 4334 else 4335 num_dev = 1; /* DUP or single */ 4336 4337 return num_dev; 4338 } 4339 4340 /* 4341 * If @is_allocation is true, reserve space in the system space info necessary 4342 * for allocating a chunk, otherwise if it's false, reserve space necessary for 4343 * removing a chunk. 4344 */ 4345 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 4346 { 4347 struct btrfs_fs_info *fs_info = trans->fs_info; 4348 struct btrfs_space_info *info; 4349 u64 left; 4350 u64 thresh; 4351 int ret = 0; 4352 u64 num_devs; 4353 4354 /* 4355 * Needed because we can end up allocating a system chunk and for an 4356 * atomic and race free space reservation in the chunk block reserve. 
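 * The caller is expected to already hold fs_info->chunk_mutex; the lockdep
 * assertion below checks exactly that.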
4357 */ 4358 lockdep_assert_held(&fs_info->chunk_mutex); 4359 4360 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4361 spin_lock(&info->lock); 4362 left = info->total_bytes - btrfs_space_info_used(info, true); 4363 spin_unlock(&info->lock); 4364 4365 num_devs = get_profile_num_devs(fs_info, type); 4366 4367 /* num_devs device items to update and 1 chunk item to add or remove */ 4368 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 4369 btrfs_calc_trans_metadata_size(fs_info, 1); 4370 4371 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4372 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4373 left, thresh, type); 4374 dump_space_info(fs_info, info, 0, 0); 4375 } 4376 4377 if (left < thresh) { 4378 u64 flags = btrfs_system_alloc_profile(fs_info); 4379 4380 /* 4381 * Ignore failure to create system chunk. We might end up not 4382 * needing it, as we might not need to COW all nodes/leafs from 4383 * the paths we visit in the chunk tree (they were already COWed 4384 * or created in the current transaction for example). 4385 */ 4386 ret = btrfs_alloc_chunk(trans, flags); 4387 } 4388 4389 if (!ret) { 4390 ret = btrfs_block_rsv_add(fs_info->chunk_root, 4391 &fs_info->chunk_block_rsv, 4392 thresh, BTRFS_RESERVE_NO_FLUSH); 4393 if (!ret) 4394 trans->chunk_bytes_reserved += thresh; 4395 } 4396 } 4397 4398 /* 4399 * If force is CHUNK_ALLOC_FORCE: 4400 * - return 1 if it successfully allocates a chunk, 4401 * - return errors including -ENOSPC otherwise. 4402 * If force is NOT CHUNK_ALLOC_FORCE: 4403 * - return 0 if it doesn't need to allocate a new chunk, 4404 * - return 1 if it successfully allocates a chunk, 4405 * - return errors including -ENOSPC otherwise. 4406 */ 4407 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 4408 int force) 4409 { 4410 struct btrfs_fs_info *fs_info = trans->fs_info; 4411 struct btrfs_space_info *space_info; 4412 bool wait_for_alloc = false; 4413 bool should_alloc = false; 4414 int ret = 0; 4415 4416 /* Don't re-enter if we're already allocating a chunk */ 4417 if (trans->allocating_chunk) 4418 return -ENOSPC; 4419 4420 space_info = __find_space_info(fs_info, flags); 4421 ASSERT(space_info); 4422 4423 do { 4424 spin_lock(&space_info->lock); 4425 if (force < space_info->force_alloc) 4426 force = space_info->force_alloc; 4427 should_alloc = should_alloc_chunk(fs_info, space_info, force); 4428 if (space_info->full) { 4429 /* No more free physical space */ 4430 if (should_alloc) 4431 ret = -ENOSPC; 4432 else 4433 ret = 0; 4434 spin_unlock(&space_info->lock); 4435 return ret; 4436 } else if (!should_alloc) { 4437 spin_unlock(&space_info->lock); 4438 return 0; 4439 } else if (space_info->chunk_alloc) { 4440 /* 4441 * Someone is already allocating, so we need to block 4442 * until this someone is finished and then loop to 4443 * recheck if we should continue with our allocation 4444 * attempt. 4445 */ 4446 wait_for_alloc = true; 4447 spin_unlock(&space_info->lock); 4448 mutex_lock(&fs_info->chunk_mutex); 4449 mutex_unlock(&fs_info->chunk_mutex); 4450 } else { 4451 /* Proceed with allocation */ 4452 space_info->chunk_alloc = 1; 4453 wait_for_alloc = false; 4454 spin_unlock(&space_info->lock); 4455 } 4456 4457 cond_resched(); 4458 } while (wait_for_alloc); 4459 4460 mutex_lock(&fs_info->chunk_mutex); 4461 trans->allocating_chunk = true; 4462 4463 /* 4464 * If we have mixed data/metadata chunks we want to make sure we keep 4465 * allocating mixed chunks instead of individual chunks. 
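 * (In that case the requested flags are widened below to
 * BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA.)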
4466 */ 4467 if (btrfs_mixed_space_info(space_info)) 4468 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4469 4470 /* 4471 * if we're doing a data chunk, go ahead and make sure that 4472 * we keep a reasonable number of metadata chunks allocated in the 4473 * FS as well. 4474 */ 4475 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4476 fs_info->data_chunk_allocations++; 4477 if (!(fs_info->data_chunk_allocations % 4478 fs_info->metadata_ratio)) 4479 force_metadata_allocation(fs_info); 4480 } 4481 4482 /* 4483 * Check if we have enough space in SYSTEM chunk because we may need 4484 * to update devices. 4485 */ 4486 check_system_chunk(trans, flags); 4487 4488 ret = btrfs_alloc_chunk(trans, flags); 4489 trans->allocating_chunk = false; 4490 4491 spin_lock(&space_info->lock); 4492 if (ret < 0) { 4493 if (ret == -ENOSPC) 4494 space_info->full = 1; 4495 else 4496 goto out; 4497 } else { 4498 ret = 1; 4499 space_info->max_extent_size = 0; 4500 } 4501 4502 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4503 out: 4504 space_info->chunk_alloc = 0; 4505 spin_unlock(&space_info->lock); 4506 mutex_unlock(&fs_info->chunk_mutex); 4507 /* 4508 * When we allocate a new chunk we reserve space in the chunk block 4509 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4510 * add new nodes/leafs to it if we end up needing to do it when 4511 * inserting the chunk item and updating device items as part of the 4512 * second phase of chunk allocation, performed by 4513 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4514 * large number of new block groups to create in our transaction 4515 * handle's new_bgs list to avoid exhausting the chunk block reserve 4516 * in extreme cases - like having a single transaction create many new 4517 * block groups when starting to write out the free space caches of all 4518 * the block groups that were made dirty during the lifetime of the 4519 * transaction. 4520 */ 4521 if (trans->chunk_bytes_reserved >= (u64)SZ_2M) 4522 btrfs_create_pending_block_groups(trans); 4523 4524 return ret; 4525 } 4526 4527 static int can_overcommit(struct btrfs_fs_info *fs_info, 4528 struct btrfs_space_info *space_info, u64 bytes, 4529 enum btrfs_reserve_flush_enum flush, 4530 bool system_chunk) 4531 { 4532 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4533 u64 profile; 4534 u64 space_size; 4535 u64 avail; 4536 u64 used; 4537 int factor; 4538 4539 /* Don't overcommit when in mixed mode. */ 4540 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4541 return 0; 4542 4543 if (system_chunk) 4544 profile = btrfs_system_alloc_profile(fs_info); 4545 else 4546 profile = btrfs_metadata_alloc_profile(fs_info); 4547 4548 used = btrfs_space_info_used(space_info, false); 4549 4550 /* 4551 * We only want to allow over committing if we have lots of actual space 4552 * free, but if we don't have enough space to handle the global reserve 4553 * space then we could end up having a real enospc problem when trying 4554 * to allocate a chunk or some other such important allocation. 4555 */ 4556 spin_lock(&global_rsv->lock); 4557 space_size = calc_global_rsv_need_space(global_rsv); 4558 spin_unlock(&global_rsv->lock); 4559 if (used + space_size >= space_info->total_bytes) 4560 return 0; 4561 4562 used += space_info->bytes_may_use; 4563 4564 avail = atomic64_read(&fs_info->free_chunk_space); 4565 4566 /* 4567 * If we have dup, raid1 or raid10 then only half of the free 4568 * space is actually usable. 
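 * For example, with RAID1 (factor 2), 8G of unallocated device space can
 * only hold about 4G of new allocations, hence the division by the factor
 * from btrfs_bg_type_to_factor() below.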
For raid56, the space info used 4569 * doesn't include the parity drive, so we don't have to 4570 * change the math 4571 */ 4572 factor = btrfs_bg_type_to_factor(profile); 4573 avail = div_u64(avail, factor); 4574 4575 /* 4576 * If we aren't flushing all things, let us overcommit up to 4577 * 1/2th of the space. If we can flush, don't let us overcommit 4578 * too much, let it overcommit up to 1/8 of the space. 4579 */ 4580 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4581 avail >>= 3; 4582 else 4583 avail >>= 1; 4584 4585 if (used + bytes < space_info->total_bytes + avail) 4586 return 1; 4587 return 0; 4588 } 4589 4590 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4591 unsigned long nr_pages, int nr_items) 4592 { 4593 struct super_block *sb = fs_info->sb; 4594 4595 if (down_read_trylock(&sb->s_umount)) { 4596 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4597 up_read(&sb->s_umount); 4598 } else { 4599 /* 4600 * We needn't worry the filesystem going from r/w to r/o though 4601 * we don't acquire ->s_umount mutex, because the filesystem 4602 * should guarantee the delalloc inodes list be empty after 4603 * the filesystem is readonly(all dirty pages are written to 4604 * the disk). 4605 */ 4606 btrfs_start_delalloc_roots(fs_info, nr_items); 4607 if (!current->journal_info) 4608 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4609 } 4610 } 4611 4612 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4613 u64 to_reclaim) 4614 { 4615 u64 bytes; 4616 u64 nr; 4617 4618 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4619 nr = div64_u64(to_reclaim, bytes); 4620 if (!nr) 4621 nr = 1; 4622 return nr; 4623 } 4624 4625 #define EXTENT_SIZE_PER_ITEM SZ_256K 4626 4627 /* 4628 * shrink metadata reservation for delalloc 4629 */ 4630 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 4631 u64 orig, bool wait_ordered) 4632 { 4633 struct btrfs_space_info *space_info; 4634 struct btrfs_trans_handle *trans; 4635 u64 delalloc_bytes; 4636 u64 dio_bytes; 4637 u64 async_pages; 4638 u64 items; 4639 long time_left; 4640 unsigned long nr_pages; 4641 int loops; 4642 4643 /* Calc the number of the pages we need flush for space reservation */ 4644 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4645 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4646 4647 trans = (struct btrfs_trans_handle *)current->journal_info; 4648 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4649 4650 delalloc_bytes = percpu_counter_sum_positive( 4651 &fs_info->delalloc_bytes); 4652 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 4653 if (delalloc_bytes == 0 && dio_bytes == 0) { 4654 if (trans) 4655 return; 4656 if (wait_ordered) 4657 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4658 return; 4659 } 4660 4661 /* 4662 * If we are doing more ordered than delalloc we need to just wait on 4663 * ordered extents, otherwise we'll waste time trying to flush delalloc 4664 * that likely won't give us the space back we need. 4665 */ 4666 if (dio_bytes > delalloc_bytes) 4667 wait_ordered = true; 4668 4669 loops = 0; 4670 while ((delalloc_bytes || dio_bytes) && loops < 3) { 4671 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 4672 4673 /* 4674 * Triggers inode writeback for up to nr_pages. This will invoke 4675 * ->writepages callback and trigger delalloc filling 4676 * (btrfs_run_delalloc_range()). 
4677 */ 4678 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4679 4680 /* 4681 * We need to wait for the compressed pages to start before 4682 * we continue. 4683 */ 4684 async_pages = atomic_read(&fs_info->async_delalloc_pages); 4685 if (!async_pages) 4686 goto skip_async; 4687 4688 /* 4689 * Calculate how many compressed pages we want to be written 4690 * before we continue. I.e if there are more async pages than we 4691 * require wait_event will wait until nr_pages are written. 4692 */ 4693 if (async_pages <= nr_pages) 4694 async_pages = 0; 4695 else 4696 async_pages -= nr_pages; 4697 4698 wait_event(fs_info->async_submit_wait, 4699 atomic_read(&fs_info->async_delalloc_pages) <= 4700 (int)async_pages); 4701 skip_async: 4702 spin_lock(&space_info->lock); 4703 if (list_empty(&space_info->tickets) && 4704 list_empty(&space_info->priority_tickets)) { 4705 spin_unlock(&space_info->lock); 4706 break; 4707 } 4708 spin_unlock(&space_info->lock); 4709 4710 loops++; 4711 if (wait_ordered && !trans) { 4712 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4713 } else { 4714 time_left = schedule_timeout_killable(1); 4715 if (time_left) 4716 break; 4717 } 4718 delalloc_bytes = percpu_counter_sum_positive( 4719 &fs_info->delalloc_bytes); 4720 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 4721 } 4722 } 4723 4724 struct reserve_ticket { 4725 u64 orig_bytes; 4726 u64 bytes; 4727 int error; 4728 struct list_head list; 4729 wait_queue_head_t wait; 4730 }; 4731 4732 /** 4733 * maybe_commit_transaction - possibly commit the transaction if its ok to 4734 * @root - the root we're allocating for 4735 * @bytes - the number of bytes we want to reserve 4736 * @force - force the commit 4737 * 4738 * This will check to make sure that committing the transaction will actually 4739 * get us somewhere and then commit the transaction if it does. Otherwise it 4740 * will return -ENOSPC. 4741 */ 4742 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4743 struct btrfs_space_info *space_info) 4744 { 4745 struct reserve_ticket *ticket = NULL; 4746 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4747 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 4748 struct btrfs_trans_handle *trans; 4749 u64 bytes_needed; 4750 u64 reclaim_bytes = 0; 4751 4752 trans = (struct btrfs_trans_handle *)current->journal_info; 4753 if (trans) 4754 return -EAGAIN; 4755 4756 spin_lock(&space_info->lock); 4757 if (!list_empty(&space_info->priority_tickets)) 4758 ticket = list_first_entry(&space_info->priority_tickets, 4759 struct reserve_ticket, list); 4760 else if (!list_empty(&space_info->tickets)) 4761 ticket = list_first_entry(&space_info->tickets, 4762 struct reserve_ticket, list); 4763 bytes_needed = (ticket) ? ticket->bytes : 0; 4764 spin_unlock(&space_info->lock); 4765 4766 if (!bytes_needed) 4767 return 0; 4768 4769 trans = btrfs_join_transaction(fs_info->extent_root); 4770 if (IS_ERR(trans)) 4771 return PTR_ERR(trans); 4772 4773 /* 4774 * See if there is enough pinned space to make this reservation, or if 4775 * we have block groups that are going to be freed, allowing us to 4776 * possibly do a chunk allocation the next loop through. 
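 * Failing both, space sitting in the delayed inode and delayed refs block
 * reserves is counted as reclaimable below before we give up and return
 * -ENOSPC.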
4777 */ 4778 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || 4779 __percpu_counter_compare(&space_info->total_bytes_pinned, 4780 bytes_needed, 4781 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 4782 goto commit; 4783 4784 /* 4785 * See if there is some space in the delayed insertion reservation for 4786 * this reservation. 4787 */ 4788 if (space_info != delayed_rsv->space_info) 4789 goto enospc; 4790 4791 spin_lock(&delayed_rsv->lock); 4792 reclaim_bytes += delayed_rsv->reserved; 4793 spin_unlock(&delayed_rsv->lock); 4794 4795 spin_lock(&delayed_refs_rsv->lock); 4796 reclaim_bytes += delayed_refs_rsv->reserved; 4797 spin_unlock(&delayed_refs_rsv->lock); 4798 if (reclaim_bytes >= bytes_needed) 4799 goto commit; 4800 bytes_needed -= reclaim_bytes; 4801 4802 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 4803 bytes_needed, 4804 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) 4805 goto enospc; 4806 4807 commit: 4808 return btrfs_commit_transaction(trans); 4809 enospc: 4810 btrfs_end_transaction(trans); 4811 return -ENOSPC; 4812 } 4813 4814 /* 4815 * Try to flush some data based on policy set by @state. This is only advisory 4816 * and may fail for various reasons. The caller is supposed to examine the 4817 * state of @space_info to detect the outcome. 4818 */ 4819 static void flush_space(struct btrfs_fs_info *fs_info, 4820 struct btrfs_space_info *space_info, u64 num_bytes, 4821 int state) 4822 { 4823 struct btrfs_root *root = fs_info->extent_root; 4824 struct btrfs_trans_handle *trans; 4825 int nr; 4826 int ret = 0; 4827 4828 switch (state) { 4829 case FLUSH_DELAYED_ITEMS_NR: 4830 case FLUSH_DELAYED_ITEMS: 4831 if (state == FLUSH_DELAYED_ITEMS_NR) 4832 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 4833 else 4834 nr = -1; 4835 4836 trans = btrfs_join_transaction(root); 4837 if (IS_ERR(trans)) { 4838 ret = PTR_ERR(trans); 4839 break; 4840 } 4841 ret = btrfs_run_delayed_items_nr(trans, nr); 4842 btrfs_end_transaction(trans); 4843 break; 4844 case FLUSH_DELALLOC: 4845 case FLUSH_DELALLOC_WAIT: 4846 shrink_delalloc(fs_info, num_bytes * 2, num_bytes, 4847 state == FLUSH_DELALLOC_WAIT); 4848 break; 4849 case FLUSH_DELAYED_REFS_NR: 4850 case FLUSH_DELAYED_REFS: 4851 trans = btrfs_join_transaction(root); 4852 if (IS_ERR(trans)) { 4853 ret = PTR_ERR(trans); 4854 break; 4855 } 4856 if (state == FLUSH_DELAYED_REFS_NR) 4857 nr = calc_reclaim_items_nr(fs_info, num_bytes); 4858 else 4859 nr = 0; 4860 btrfs_run_delayed_refs(trans, nr); 4861 btrfs_end_transaction(trans); 4862 break; 4863 case ALLOC_CHUNK: 4864 case ALLOC_CHUNK_FORCE: 4865 trans = btrfs_join_transaction(root); 4866 if (IS_ERR(trans)) { 4867 ret = PTR_ERR(trans); 4868 break; 4869 } 4870 ret = do_chunk_alloc(trans, 4871 btrfs_metadata_alloc_profile(fs_info), 4872 (state == ALLOC_CHUNK) ? 4873 CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); 4874 btrfs_end_transaction(trans); 4875 if (ret > 0 || ret == -ENOSPC) 4876 ret = 0; 4877 break; 4878 case COMMIT_TRANS: 4879 /* 4880 * If we have pending delayed iputs then we could free up a 4881 * bunch of pinned space, so make sure we run the iputs before 4882 * we do our pinned bytes check below. 
4883 */ 4884 btrfs_run_delayed_iputs(fs_info); 4885 btrfs_wait_on_delayed_iputs(fs_info); 4886 4887 ret = may_commit_transaction(fs_info, space_info); 4888 break; 4889 default: 4890 ret = -ENOSPC; 4891 break; 4892 } 4893 4894 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 4895 ret); 4896 return; 4897 } 4898 4899 static inline u64 4900 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 4901 struct btrfs_space_info *space_info, 4902 bool system_chunk) 4903 { 4904 struct reserve_ticket *ticket; 4905 u64 used; 4906 u64 expected; 4907 u64 to_reclaim = 0; 4908 4909 list_for_each_entry(ticket, &space_info->tickets, list) 4910 to_reclaim += ticket->bytes; 4911 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4912 to_reclaim += ticket->bytes; 4913 if (to_reclaim) 4914 return to_reclaim; 4915 4916 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4917 if (can_overcommit(fs_info, space_info, to_reclaim, 4918 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 4919 return 0; 4920 4921 used = btrfs_space_info_used(space_info, true); 4922 4923 if (can_overcommit(fs_info, space_info, SZ_1M, 4924 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 4925 expected = div_factor_fine(space_info->total_bytes, 95); 4926 else 4927 expected = div_factor_fine(space_info->total_bytes, 90); 4928 4929 if (used > expected) 4930 to_reclaim = used - expected; 4931 else 4932 to_reclaim = 0; 4933 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4934 space_info->bytes_reserved); 4935 return to_reclaim; 4936 } 4937 4938 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, 4939 struct btrfs_space_info *space_info, 4940 u64 used, bool system_chunk) 4941 { 4942 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4943 4944 /* If we're just plain full then async reclaim just slows us down. */ 4945 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4946 return 0; 4947 4948 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, 4949 system_chunk)) 4950 return 0; 4951 4952 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4953 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4954 } 4955 4956 static bool wake_all_tickets(struct list_head *head) 4957 { 4958 struct reserve_ticket *ticket; 4959 4960 while (!list_empty(head)) { 4961 ticket = list_first_entry(head, struct reserve_ticket, list); 4962 list_del_init(&ticket->list); 4963 ticket->error = -ENOSPC; 4964 wake_up(&ticket->wait); 4965 if (ticket->bytes != ticket->orig_bytes) 4966 return true; 4967 } 4968 return false; 4969 } 4970 4971 /* 4972 * This is for normal flushers, we can wait all goddamned day if we want to. We 4973 * will loop and continuously try to flush as long as we are making progress. 4974 * We count progress as clearing off tickets each time we have to loop. 
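 * Each pass walks the flush states handled by flush_space(), from
 * FLUSH_DELAYED_ITEMS_NR up to COMMIT_TRANS, and restarts from the first
 * state whenever a ticket has been satisfied (tickets_id changed).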
4975 */ 4976 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4977 { 4978 struct btrfs_fs_info *fs_info; 4979 struct btrfs_space_info *space_info; 4980 u64 to_reclaim; 4981 int flush_state; 4982 int commit_cycles = 0; 4983 u64 last_tickets_id; 4984 4985 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4986 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4987 4988 spin_lock(&space_info->lock); 4989 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 4990 false); 4991 if (!to_reclaim) { 4992 space_info->flush = 0; 4993 spin_unlock(&space_info->lock); 4994 return; 4995 } 4996 last_tickets_id = space_info->tickets_id; 4997 spin_unlock(&space_info->lock); 4998 4999 flush_state = FLUSH_DELAYED_ITEMS_NR; 5000 do { 5001 flush_space(fs_info, space_info, to_reclaim, flush_state); 5002 spin_lock(&space_info->lock); 5003 if (list_empty(&space_info->tickets)) { 5004 space_info->flush = 0; 5005 spin_unlock(&space_info->lock); 5006 return; 5007 } 5008 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, 5009 space_info, 5010 false); 5011 if (last_tickets_id == space_info->tickets_id) { 5012 flush_state++; 5013 } else { 5014 last_tickets_id = space_info->tickets_id; 5015 flush_state = FLUSH_DELAYED_ITEMS_NR; 5016 if (commit_cycles) 5017 commit_cycles--; 5018 } 5019 5020 /* 5021 * We don't want to force a chunk allocation until we've tried 5022 * pretty hard to reclaim space. Think of the case where we 5023 * freed up a bunch of space and so have a lot of pinned space 5024 * to reclaim. We would rather use that than possibly create an 5025 * underutilized metadata chunk. So if this is our first run 5026 * through the flushing state machine skip ALLOC_CHUNK_FORCE and 5027 * commit the transaction. If nothing has changed the next go 5028 * around then we can force a chunk allocation.
5029 */ 5030 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) 5031 flush_state++; 5032 5033 if (flush_state > COMMIT_TRANS) { 5034 commit_cycles++; 5035 if (commit_cycles > 2) { 5036 if (wake_all_tickets(&space_info->tickets)) { 5037 flush_state = FLUSH_DELAYED_ITEMS_NR; 5038 commit_cycles--; 5039 } else { 5040 space_info->flush = 0; 5041 } 5042 } else { 5043 flush_state = FLUSH_DELAYED_ITEMS_NR; 5044 } 5045 } 5046 spin_unlock(&space_info->lock); 5047 } while (flush_state <= COMMIT_TRANS); 5048 } 5049 5050 void btrfs_init_async_reclaim_work(struct work_struct *work) 5051 { 5052 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5053 } 5054 5055 static const enum btrfs_flush_state priority_flush_states[] = { 5056 FLUSH_DELAYED_ITEMS_NR, 5057 FLUSH_DELAYED_ITEMS, 5058 ALLOC_CHUNK, 5059 }; 5060 5061 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5062 struct btrfs_space_info *space_info, 5063 struct reserve_ticket *ticket) 5064 { 5065 u64 to_reclaim; 5066 int flush_state; 5067 5068 spin_lock(&space_info->lock); 5069 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 5070 false); 5071 if (!to_reclaim) { 5072 spin_unlock(&space_info->lock); 5073 return; 5074 } 5075 spin_unlock(&space_info->lock); 5076 5077 flush_state = 0; 5078 do { 5079 flush_space(fs_info, space_info, to_reclaim, 5080 priority_flush_states[flush_state]); 5081 flush_state++; 5082 spin_lock(&space_info->lock); 5083 if (ticket->bytes == 0) { 5084 spin_unlock(&space_info->lock); 5085 return; 5086 } 5087 spin_unlock(&space_info->lock); 5088 } while (flush_state < ARRAY_SIZE(priority_flush_states)); 5089 } 5090 5091 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5092 struct btrfs_space_info *space_info, 5093 struct reserve_ticket *ticket) 5094 5095 { 5096 DEFINE_WAIT(wait); 5097 u64 reclaim_bytes = 0; 5098 int ret = 0; 5099 5100 spin_lock(&space_info->lock); 5101 while (ticket->bytes > 0 && ticket->error == 0) { 5102 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5103 if (ret) { 5104 ret = -EINTR; 5105 break; 5106 } 5107 spin_unlock(&space_info->lock); 5108 5109 schedule(); 5110 5111 finish_wait(&ticket->wait, &wait); 5112 spin_lock(&space_info->lock); 5113 } 5114 if (!ret) 5115 ret = ticket->error; 5116 if (!list_empty(&ticket->list)) 5117 list_del_init(&ticket->list); 5118 if (ticket->bytes && ticket->bytes < ticket->orig_bytes) 5119 reclaim_bytes = ticket->orig_bytes - ticket->bytes; 5120 spin_unlock(&space_info->lock); 5121 5122 if (reclaim_bytes) 5123 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); 5124 return ret; 5125 } 5126 5127 /** 5128 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5129 * @root - the root we're allocating for 5130 * @space_info - the space info we want to allocate from 5131 * @orig_bytes - the number of bytes we want 5132 * @flush - whether or not we can flush to make our reservation 5133 * 5134 * This will reserve orig_bytes number of bytes from the space info associated 5135 * with the block_rsv. If there is not enough space it will make an attempt to 5136 * flush out space to make room. It will do this by flushing delalloc if 5137 * possible or committing the transaction. If flush is 0 then no attempts to 5138 * regain reservations will be made and this will fail if there is not enough 5139 * space already. 
5140 */ 5141 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 5142 struct btrfs_space_info *space_info, 5143 u64 orig_bytes, 5144 enum btrfs_reserve_flush_enum flush, 5145 bool system_chunk) 5146 { 5147 struct reserve_ticket ticket; 5148 u64 used; 5149 u64 reclaim_bytes = 0; 5150 int ret = 0; 5151 5152 ASSERT(orig_bytes); 5153 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5154 5155 spin_lock(&space_info->lock); 5156 ret = -ENOSPC; 5157 used = btrfs_space_info_used(space_info, true); 5158 5159 /* 5160 * If we have enough space then hooray, make our reservation and carry 5161 * on. If not see if we can overcommit, and if we can, hooray carry on. 5162 * If not things get more complicated. 5163 */ 5164 if (used + orig_bytes <= space_info->total_bytes) { 5165 update_bytes_may_use(space_info, orig_bytes); 5166 trace_btrfs_space_reservation(fs_info, "space_info", 5167 space_info->flags, orig_bytes, 1); 5168 ret = 0; 5169 } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, 5170 system_chunk)) { 5171 update_bytes_may_use(space_info, orig_bytes); 5172 trace_btrfs_space_reservation(fs_info, "space_info", 5173 space_info->flags, orig_bytes, 1); 5174 ret = 0; 5175 } 5176 5177 /* 5178 * If we couldn't make a reservation then setup our reservation ticket 5179 * and kick the async worker if it's not already running. 5180 * 5181 * If we are a priority flusher then we just need to add our ticket to 5182 * the list and we will do our own flushing further down. 5183 */ 5184 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5185 ticket.orig_bytes = orig_bytes; 5186 ticket.bytes = orig_bytes; 5187 ticket.error = 0; 5188 init_waitqueue_head(&ticket.wait); 5189 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5190 list_add_tail(&ticket.list, &space_info->tickets); 5191 if (!space_info->flush) { 5192 space_info->flush = 1; 5193 trace_btrfs_trigger_flush(fs_info, 5194 space_info->flags, 5195 orig_bytes, flush, 5196 "enospc"); 5197 queue_work(system_unbound_wq, 5198 &fs_info->async_reclaim_work); 5199 } 5200 } else { 5201 list_add_tail(&ticket.list, 5202 &space_info->priority_tickets); 5203 } 5204 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5205 used += orig_bytes; 5206 /* 5207 * We will do the space reservation dance during log replay, 5208 * which means we won't have fs_info->fs_root set, so don't do 5209 * the async reclaim as we will panic. 
5210 */ 5211 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5212 need_do_async_reclaim(fs_info, space_info, 5213 used, system_chunk) && 5214 !work_busy(&fs_info->async_reclaim_work)) { 5215 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5216 orig_bytes, flush, "preempt"); 5217 queue_work(system_unbound_wq, 5218 &fs_info->async_reclaim_work); 5219 } 5220 } 5221 spin_unlock(&space_info->lock); 5222 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5223 return ret; 5224 5225 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5226 return wait_reserve_ticket(fs_info, space_info, &ticket); 5227 5228 ret = 0; 5229 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5230 spin_lock(&space_info->lock); 5231 if (ticket.bytes) { 5232 if (ticket.bytes < orig_bytes) 5233 reclaim_bytes = orig_bytes - ticket.bytes; 5234 list_del_init(&ticket.list); 5235 ret = -ENOSPC; 5236 } 5237 spin_unlock(&space_info->lock); 5238 5239 if (reclaim_bytes) 5240 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); 5241 ASSERT(list_empty(&ticket.list)); 5242 return ret; 5243 } 5244 5245 /** 5246 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5247 * @root - the root we're allocating for 5248 * @block_rsv - the block_rsv we're allocating for 5249 * @orig_bytes - the number of bytes we want 5250 * @flush - whether or not we can flush to make our reservation 5251 * 5252 * This will reserve orig_bytes number of bytes from the space info associated 5253 * with the block_rsv. If there is not enough space it will make an attempt to 5254 * flush out space to make room. It will do this by flushing delalloc if 5255 * possible or committing the transaction. If flush is 0 then no attempts to 5256 * regain reservations will be made and this will fail if there is not enough 5257 * space already. 
5258 */ 5259 static int reserve_metadata_bytes(struct btrfs_root *root, 5260 struct btrfs_block_rsv *block_rsv, 5261 u64 orig_bytes, 5262 enum btrfs_reserve_flush_enum flush) 5263 { 5264 struct btrfs_fs_info *fs_info = root->fs_info; 5265 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5266 int ret; 5267 bool system_chunk = (root == fs_info->chunk_root); 5268 5269 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, 5270 orig_bytes, flush, system_chunk); 5271 if (ret == -ENOSPC && 5272 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5273 if (block_rsv != global_rsv && 5274 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5275 ret = 0; 5276 } 5277 if (ret == -ENOSPC) { 5278 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 5279 block_rsv->space_info->flags, 5280 orig_bytes, 1); 5281 5282 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 5283 dump_space_info(fs_info, block_rsv->space_info, 5284 orig_bytes, 0); 5285 } 5286 return ret; 5287 } 5288 5289 static struct btrfs_block_rsv *get_block_rsv( 5290 const struct btrfs_trans_handle *trans, 5291 const struct btrfs_root *root) 5292 { 5293 struct btrfs_fs_info *fs_info = root->fs_info; 5294 struct btrfs_block_rsv *block_rsv = NULL; 5295 5296 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5297 (root == fs_info->csum_root && trans->adding_csums) || 5298 (root == fs_info->uuid_root)) 5299 block_rsv = trans->block_rsv; 5300 5301 if (!block_rsv) 5302 block_rsv = root->block_rsv; 5303 5304 if (!block_rsv) 5305 block_rsv = &fs_info->empty_block_rsv; 5306 5307 return block_rsv; 5308 } 5309 5310 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5311 u64 num_bytes) 5312 { 5313 int ret = -ENOSPC; 5314 spin_lock(&block_rsv->lock); 5315 if (block_rsv->reserved >= num_bytes) { 5316 block_rsv->reserved -= num_bytes; 5317 if (block_rsv->reserved < block_rsv->size) 5318 block_rsv->full = 0; 5319 ret = 0; 5320 } 5321 spin_unlock(&block_rsv->lock); 5322 return ret; 5323 } 5324 5325 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5326 u64 num_bytes, bool update_size) 5327 { 5328 spin_lock(&block_rsv->lock); 5329 block_rsv->reserved += num_bytes; 5330 if (update_size) 5331 block_rsv->size += num_bytes; 5332 else if (block_rsv->reserved >= block_rsv->size) 5333 block_rsv->full = 1; 5334 spin_unlock(&block_rsv->lock); 5335 } 5336 5337 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5338 struct btrfs_block_rsv *dest, u64 num_bytes, 5339 int min_factor) 5340 { 5341 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5342 u64 min_bytes; 5343 5344 if (global_rsv->space_info != dest->space_info) 5345 return -ENOSPC; 5346 5347 spin_lock(&global_rsv->lock); 5348 min_bytes = div_factor(global_rsv->size, min_factor); 5349 if (global_rsv->reserved < min_bytes + num_bytes) { 5350 spin_unlock(&global_rsv->lock); 5351 return -ENOSPC; 5352 } 5353 global_rsv->reserved -= num_bytes; 5354 if (global_rsv->reserved < global_rsv->size) 5355 global_rsv->full = 0; 5356 spin_unlock(&global_rsv->lock); 5357 5358 block_rsv_add_bytes(dest, num_bytes, true); 5359 return 0; 5360 } 5361 5362 /** 5363 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. 5364 * @fs_info - the fs info for our fs. 5365 * @src - the source block rsv to transfer from. 5366 * @num_bytes - the number of bytes to transfer. 5367 * 5368 * This transfers up to the num_bytes amount from the src rsv to the 5369 * delayed_refs_rsv. Any extra bytes are returned to the space info. 
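 * For example, if the delayed_refs_rsv is only short by delta bytes and
 * num_bytes > delta, delta bytes are kept and the remaining
 * (num_bytes - delta) is handed back via space_info_add_old_bytes().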
5370 */ 5371 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, 5372 struct btrfs_block_rsv *src, 5373 u64 num_bytes) 5374 { 5375 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 5376 u64 to_free = 0; 5377 5378 spin_lock(&src->lock); 5379 src->reserved -= num_bytes; 5380 src->size -= num_bytes; 5381 spin_unlock(&src->lock); 5382 5383 spin_lock(&delayed_refs_rsv->lock); 5384 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { 5385 u64 delta = delayed_refs_rsv->size - 5386 delayed_refs_rsv->reserved; 5387 if (num_bytes > delta) { 5388 to_free = num_bytes - delta; 5389 num_bytes = delta; 5390 } 5391 } else { 5392 to_free = num_bytes; 5393 num_bytes = 0; 5394 } 5395 5396 if (num_bytes) 5397 delayed_refs_rsv->reserved += num_bytes; 5398 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) 5399 delayed_refs_rsv->full = 1; 5400 spin_unlock(&delayed_refs_rsv->lock); 5401 5402 if (num_bytes) 5403 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5404 0, num_bytes, 1); 5405 if (to_free) 5406 space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, 5407 to_free); 5408 } 5409 5410 /** 5411 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. 5412 * @fs_info - the fs_info for our fs. 5413 * @flush - control how we can flush for this reservation. 5414 * 5415 * This will refill the delayed block_rsv up to 1 items size worth of space and 5416 * will return -ENOSPC if we can't make the reservation. 5417 */ 5418 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 5419 enum btrfs_reserve_flush_enum flush) 5420 { 5421 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 5422 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); 5423 u64 num_bytes = 0; 5424 int ret = -ENOSPC; 5425 5426 spin_lock(&block_rsv->lock); 5427 if (block_rsv->reserved < block_rsv->size) { 5428 num_bytes = block_rsv->size - block_rsv->reserved; 5429 num_bytes = min(num_bytes, limit); 5430 } 5431 spin_unlock(&block_rsv->lock); 5432 5433 if (!num_bytes) 5434 return 0; 5435 5436 ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, 5437 num_bytes, flush); 5438 if (ret) 5439 return ret; 5440 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5441 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5442 0, num_bytes, 1); 5443 return 0; 5444 } 5445 5446 /* 5447 * This is for space we already have accounted in space_info->bytes_may_use, so 5448 * basically when we're returning space from block_rsv's. 5449 */ 5450 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5451 struct btrfs_space_info *space_info, 5452 u64 num_bytes) 5453 { 5454 struct reserve_ticket *ticket; 5455 struct list_head *head; 5456 u64 used; 5457 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5458 bool check_overcommit = false; 5459 5460 spin_lock(&space_info->lock); 5461 head = &space_info->priority_tickets; 5462 5463 /* 5464 * If we are over our limit then we need to check and see if we can 5465 * overcommit, and if we can't then we just need to free up our space 5466 * and not satisfy any requests. 5467 */ 5468 used = btrfs_space_info_used(space_info, true); 5469 if (used - num_bytes >= space_info->total_bytes) 5470 check_overcommit = true; 5471 again: 5472 while (!list_empty(head) && num_bytes) { 5473 ticket = list_first_entry(head, struct reserve_ticket, 5474 list); 5475 /* 5476 * We use 0 bytes because this space is already reserved, so 5477 * adding the ticket space would be a double count. 
5478 */ 5479 if (check_overcommit && 5480 !can_overcommit(fs_info, space_info, 0, flush, false)) 5481 break; 5482 if (num_bytes >= ticket->bytes) { 5483 list_del_init(&ticket->list); 5484 num_bytes -= ticket->bytes; 5485 ticket->bytes = 0; 5486 space_info->tickets_id++; 5487 wake_up(&ticket->wait); 5488 } else { 5489 ticket->bytes -= num_bytes; 5490 num_bytes = 0; 5491 } 5492 } 5493 5494 if (num_bytes && head == &space_info->priority_tickets) { 5495 head = &space_info->tickets; 5496 flush = BTRFS_RESERVE_FLUSH_ALL; 5497 goto again; 5498 } 5499 update_bytes_may_use(space_info, -num_bytes); 5500 trace_btrfs_space_reservation(fs_info, "space_info", 5501 space_info->flags, num_bytes, 0); 5502 spin_unlock(&space_info->lock); 5503 } 5504 5505 /* 5506 * This is for newly allocated space that isn't accounted in 5507 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5508 * we use this helper. 5509 */ 5510 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5511 struct btrfs_space_info *space_info, 5512 u64 num_bytes) 5513 { 5514 struct reserve_ticket *ticket; 5515 struct list_head *head = &space_info->priority_tickets; 5516 5517 again: 5518 while (!list_empty(head) && num_bytes) { 5519 ticket = list_first_entry(head, struct reserve_ticket, 5520 list); 5521 if (num_bytes >= ticket->bytes) { 5522 trace_btrfs_space_reservation(fs_info, "space_info", 5523 space_info->flags, 5524 ticket->bytes, 1); 5525 list_del_init(&ticket->list); 5526 num_bytes -= ticket->bytes; 5527 update_bytes_may_use(space_info, ticket->bytes); 5528 ticket->bytes = 0; 5529 space_info->tickets_id++; 5530 wake_up(&ticket->wait); 5531 } else { 5532 trace_btrfs_space_reservation(fs_info, "space_info", 5533 space_info->flags, 5534 num_bytes, 1); 5535 update_bytes_may_use(space_info, num_bytes); 5536 ticket->bytes -= num_bytes; 5537 num_bytes = 0; 5538 } 5539 } 5540 5541 if (num_bytes && head == &space_info->priority_tickets) { 5542 head = &space_info->tickets; 5543 goto again; 5544 } 5545 } 5546 5547 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5548 struct btrfs_block_rsv *block_rsv, 5549 struct btrfs_block_rsv *dest, u64 num_bytes, 5550 u64 *qgroup_to_release_ret) 5551 { 5552 struct btrfs_space_info *space_info = block_rsv->space_info; 5553 u64 qgroup_to_release = 0; 5554 u64 ret; 5555 5556 spin_lock(&block_rsv->lock); 5557 if (num_bytes == (u64)-1) { 5558 num_bytes = block_rsv->size; 5559 qgroup_to_release = block_rsv->qgroup_rsv_size; 5560 } 5561 block_rsv->size -= num_bytes; 5562 if (block_rsv->reserved >= block_rsv->size) { 5563 num_bytes = block_rsv->reserved - block_rsv->size; 5564 block_rsv->reserved = block_rsv->size; 5565 block_rsv->full = 1; 5566 } else { 5567 num_bytes = 0; 5568 } 5569 if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { 5570 qgroup_to_release = block_rsv->qgroup_rsv_reserved - 5571 block_rsv->qgroup_rsv_size; 5572 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; 5573 } else { 5574 qgroup_to_release = 0; 5575 } 5576 spin_unlock(&block_rsv->lock); 5577 5578 ret = num_bytes; 5579 if (num_bytes > 0) { 5580 if (dest) { 5581 spin_lock(&dest->lock); 5582 if (!dest->full) { 5583 u64 bytes_to_add; 5584 5585 bytes_to_add = dest->size - dest->reserved; 5586 bytes_to_add = min(num_bytes, bytes_to_add); 5587 dest->reserved += bytes_to_add; 5588 if (dest->reserved >= dest->size) 5589 dest->full = 1; 5590 num_bytes -= bytes_to_add; 5591 } 5592 spin_unlock(&dest->lock); 5593 } 5594 if (num_bytes) 5595 
space_info_add_old_bytes(fs_info, space_info, 5596 num_bytes); 5597 } 5598 if (qgroup_to_release_ret) 5599 *qgroup_to_release_ret = qgroup_to_release; 5600 return ret; 5601 } 5602 5603 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5604 struct btrfs_block_rsv *dst, u64 num_bytes, 5605 bool update_size) 5606 { 5607 int ret; 5608 5609 ret = block_rsv_use_bytes(src, num_bytes); 5610 if (ret) 5611 return ret; 5612 5613 block_rsv_add_bytes(dst, num_bytes, update_size); 5614 return 0; 5615 } 5616 5617 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5618 { 5619 memset(rsv, 0, sizeof(*rsv)); 5620 spin_lock_init(&rsv->lock); 5621 rsv->type = type; 5622 } 5623 5624 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 5625 struct btrfs_block_rsv *rsv, 5626 unsigned short type) 5627 { 5628 btrfs_init_block_rsv(rsv, type); 5629 rsv->space_info = __find_space_info(fs_info, 5630 BTRFS_BLOCK_GROUP_METADATA); 5631 } 5632 5633 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5634 unsigned short type) 5635 { 5636 struct btrfs_block_rsv *block_rsv; 5637 5638 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5639 if (!block_rsv) 5640 return NULL; 5641 5642 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); 5643 return block_rsv; 5644 } 5645 5646 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 5647 struct btrfs_block_rsv *rsv) 5648 { 5649 if (!rsv) 5650 return; 5651 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5652 kfree(rsv); 5653 } 5654 5655 int btrfs_block_rsv_add(struct btrfs_root *root, 5656 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 5657 enum btrfs_reserve_flush_enum flush) 5658 { 5659 int ret; 5660 5661 if (num_bytes == 0) 5662 return 0; 5663 5664 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5665 if (!ret) 5666 block_rsv_add_bytes(block_rsv, num_bytes, true); 5667 5668 return ret; 5669 } 5670 5671 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 5672 { 5673 u64 num_bytes = 0; 5674 int ret = -ENOSPC; 5675 5676 if (!block_rsv) 5677 return 0; 5678 5679 spin_lock(&block_rsv->lock); 5680 num_bytes = div_factor(block_rsv->size, min_factor); 5681 if (block_rsv->reserved >= num_bytes) 5682 ret = 0; 5683 spin_unlock(&block_rsv->lock); 5684 5685 return ret; 5686 } 5687 5688 int btrfs_block_rsv_refill(struct btrfs_root *root, 5689 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5690 enum btrfs_reserve_flush_enum flush) 5691 { 5692 u64 num_bytes = 0; 5693 int ret = -ENOSPC; 5694 5695 if (!block_rsv) 5696 return 0; 5697 5698 spin_lock(&block_rsv->lock); 5699 num_bytes = min_reserved; 5700 if (block_rsv->reserved >= num_bytes) 5701 ret = 0; 5702 else 5703 num_bytes -= block_rsv->reserved; 5704 spin_unlock(&block_rsv->lock); 5705 5706 if (!ret) 5707 return 0; 5708 5709 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5710 if (!ret) { 5711 block_rsv_add_bytes(block_rsv, num_bytes, false); 5712 return 0; 5713 } 5714 5715 return ret; 5716 } 5717 5718 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5719 struct btrfs_block_rsv *block_rsv, 5720 u64 num_bytes, u64 *qgroup_to_release) 5721 { 5722 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5723 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 5724 struct btrfs_block_rsv *target = delayed_rsv; 5725 5726 if (target->full || target == block_rsv) 5727 target = global_rsv; 5728 5729 if (block_rsv->space_info != target->space_info) 5730 target = NULL; 
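	/*
	 * target selected above: the delayed refs rsv first, the global rsv
	 * when the former is full (or is the rsv being released), or NULL so
	 * the excess goes straight back to the space info.
	 */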
5731 5732 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, 5733 qgroup_to_release); 5734 } 5735 5736 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5737 struct btrfs_block_rsv *block_rsv, 5738 u64 num_bytes) 5739 { 5740 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); 5741 } 5742 5743 /** 5744 * btrfs_inode_rsv_release - release any excessive reservation. 5745 * @inode - the inode we need to release from. 5746 * @qgroup_free - free or convert qgroup meta. 5747 * Unlike normal operation, qgroup meta reservation needs to know if we are 5748 * freeing qgroup reservation or just converting it into per-trans. Normally 5749 * @qgroup_free is true for error handling, and false for normal release. 5750 * 5751 * This is the same as btrfs_block_rsv_release, except that it handles the 5752 * tracepoint for the reservation. 5753 */ 5754 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) 5755 { 5756 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5757 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5758 u64 released = 0; 5759 u64 qgroup_to_release = 0; 5760 5761 /* 5762 * Since we statically set the block_rsv->size we just want to say we 5763 * are releasing 0 bytes, and then we'll just get the reservation over 5764 * the size free'd. 5765 */ 5766 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, 5767 &qgroup_to_release); 5768 if (released > 0) 5769 trace_btrfs_space_reservation(fs_info, "delalloc", 5770 btrfs_ino(inode), released, 0); 5771 if (qgroup_free) 5772 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); 5773 else 5774 btrfs_qgroup_convert_reserved_meta(inode->root, 5775 qgroup_to_release); 5776 } 5777 5778 /** 5779 * btrfs_delayed_refs_rsv_release - release a ref head's reservation. 5780 * @fs_info - the fs_info for our fs. 5781 * @nr - the number of items to drop. 5782 * 5783 * This drops the delayed ref head's count from the delayed refs rsv and frees 5784 * any excess reservation we had. 5785 */ 5786 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) 5787 { 5788 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 5789 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5790 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); 5791 u64 released = 0; 5792 5793 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 5794 num_bytes, NULL); 5795 if (released) 5796 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5797 0, released, 0); 5798 } 5799 5800 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5801 { 5802 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5803 struct btrfs_space_info *sinfo = block_rsv->space_info; 5804 u64 num_bytes; 5805 5806 /* 5807 * The global block rsv is based on the size of the extent tree, the 5808 * checksum tree and the root tree. If the fs is empty we want to set 5809 * it to a minimal amount for safety. 
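	 * In other words the target size is roughly
	 *
	 *   clamp(used(extent root) + used(csum root) + used(tree root),
	 *         16M, 512M)
	 *
	 * which is what the max_t()/min_t() pair below computes.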
5810 */ 5811 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5812 btrfs_root_used(&fs_info->csum_root->root_item) + 5813 btrfs_root_used(&fs_info->tree_root->root_item); 5814 num_bytes = max_t(u64, num_bytes, SZ_16M); 5815 5816 spin_lock(&sinfo->lock); 5817 spin_lock(&block_rsv->lock); 5818 5819 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5820 5821 if (block_rsv->reserved < block_rsv->size) { 5822 num_bytes = btrfs_space_info_used(sinfo, true); 5823 if (sinfo->total_bytes > num_bytes) { 5824 num_bytes = sinfo->total_bytes - num_bytes; 5825 num_bytes = min(num_bytes, 5826 block_rsv->size - block_rsv->reserved); 5827 block_rsv->reserved += num_bytes; 5828 update_bytes_may_use(sinfo, num_bytes); 5829 trace_btrfs_space_reservation(fs_info, "space_info", 5830 sinfo->flags, num_bytes, 5831 1); 5832 } 5833 } else if (block_rsv->reserved > block_rsv->size) { 5834 num_bytes = block_rsv->reserved - block_rsv->size; 5835 update_bytes_may_use(sinfo, -num_bytes); 5836 trace_btrfs_space_reservation(fs_info, "space_info", 5837 sinfo->flags, num_bytes, 0); 5838 block_rsv->reserved = block_rsv->size; 5839 } 5840 5841 if (block_rsv->reserved == block_rsv->size) 5842 block_rsv->full = 1; 5843 else 5844 block_rsv->full = 0; 5845 5846 spin_unlock(&block_rsv->lock); 5847 spin_unlock(&sinfo->lock); 5848 } 5849 5850 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5851 { 5852 struct btrfs_space_info *space_info; 5853 5854 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5855 fs_info->chunk_block_rsv.space_info = space_info; 5856 5857 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5858 fs_info->global_block_rsv.space_info = space_info; 5859 fs_info->trans_block_rsv.space_info = space_info; 5860 fs_info->empty_block_rsv.space_info = space_info; 5861 fs_info->delayed_block_rsv.space_info = space_info; 5862 fs_info->delayed_refs_rsv.space_info = space_info; 5863 5864 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; 5865 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; 5866 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5867 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5868 if (fs_info->quota_root) 5869 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5870 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5871 5872 update_global_block_rsv(fs_info); 5873 } 5874 5875 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5876 { 5877 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5878 (u64)-1, NULL); 5879 WARN_ON(fs_info->trans_block_rsv.size > 0); 5880 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5881 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5882 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5883 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5884 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5885 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); 5886 WARN_ON(fs_info->delayed_refs_rsv.size > 0); 5887 } 5888 5889 /* 5890 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv 5891 * @trans - the trans that may have generated delayed refs 5892 * 5893 * This is to be called anytime we may have adjusted trans->delayed_ref_updates, 5894 * it'll calculate the additional size and add it to the delayed_refs_rsv. 
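 *
 * Roughly, a caller that has just queued N more ref updates does:
 *
 *   trans->delayed_ref_updates += N;
 *   btrfs_update_delayed_refs_rsv(trans);
 *
 * which grows the rsv by btrfs_calc_trans_metadata_size(fs_info, N).
 * (See update_block_group() below for one such caller.)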
5895 */ 5896 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) 5897 { 5898 struct btrfs_fs_info *fs_info = trans->fs_info; 5899 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 5900 u64 num_bytes; 5901 5902 if (!trans->delayed_ref_updates) 5903 return; 5904 5905 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 5906 trans->delayed_ref_updates); 5907 spin_lock(&delayed_rsv->lock); 5908 delayed_rsv->size += num_bytes; 5909 delayed_rsv->full = 0; 5910 spin_unlock(&delayed_rsv->lock); 5911 trans->delayed_ref_updates = 0; 5912 } 5913 5914 /* 5915 * To be called after all the new block groups attached to the transaction 5916 * handle have been created (btrfs_create_pending_block_groups()). 5917 */ 5918 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5919 { 5920 struct btrfs_fs_info *fs_info = trans->fs_info; 5921 5922 if (!trans->chunk_bytes_reserved) 5923 return; 5924 5925 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5926 5927 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5928 trans->chunk_bytes_reserved, NULL); 5929 trans->chunk_bytes_reserved = 0; 5930 } 5931 5932 /* 5933 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5934 * root: the root of the parent directory 5935 * rsv: block reservation 5936 * items: the number of items that we need do reservation 5937 * use_global_rsv: allow fallback to the global block reservation 5938 * 5939 * This function is used to reserve the space for snapshot/subvolume 5940 * creation and deletion. Those operations are different with the 5941 * common file/directory operations, they change two fs/file trees 5942 * and root tree, the number of items that the qgroup reserves is 5943 * different with the free space reservation. So we can not use 5944 * the space reservation mechanism in start_transaction(). 
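 *
 * Concretely, the code below reserves roughly:
 *
 *   qgroup:   3 * nodesize when quotas are enabled
 *             (parent inode plus two dir entries)
 *   metadata: btrfs_calc_trans_metadata_size(fs_info, items),
 *             falling back to the global rsv on -ENOSPC when
 *             use_global_rsv is set.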
5945 */ 5946 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 5947 struct btrfs_block_rsv *rsv, int items, 5948 bool use_global_rsv) 5949 { 5950 u64 qgroup_num_bytes = 0; 5951 u64 num_bytes; 5952 int ret; 5953 struct btrfs_fs_info *fs_info = root->fs_info; 5954 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5955 5956 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5957 /* One for parent inode, two for dir entries */ 5958 qgroup_num_bytes = 3 * fs_info->nodesize; 5959 ret = btrfs_qgroup_reserve_meta_prealloc(root, 5960 qgroup_num_bytes, true); 5961 if (ret) 5962 return ret; 5963 } 5964 5965 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 5966 rsv->space_info = __find_space_info(fs_info, 5967 BTRFS_BLOCK_GROUP_METADATA); 5968 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5969 BTRFS_RESERVE_FLUSH_ALL); 5970 5971 if (ret == -ENOSPC && use_global_rsv) 5972 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); 5973 5974 if (ret && qgroup_num_bytes) 5975 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); 5976 5977 return ret; 5978 } 5979 5980 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 5981 struct btrfs_block_rsv *rsv) 5982 { 5983 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5984 } 5985 5986 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, 5987 struct btrfs_inode *inode) 5988 { 5989 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5990 u64 reserve_size = 0; 5991 u64 qgroup_rsv_size = 0; 5992 u64 csum_leaves; 5993 unsigned outstanding_extents; 5994 5995 lockdep_assert_held(&inode->lock); 5996 outstanding_extents = inode->outstanding_extents; 5997 if (outstanding_extents) 5998 reserve_size = btrfs_calc_trans_metadata_size(fs_info, 5999 outstanding_extents + 1); 6000 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, 6001 inode->csum_bytes); 6002 reserve_size += btrfs_calc_trans_metadata_size(fs_info, 6003 csum_leaves); 6004 /* 6005 * For qgroup rsv, the calculation is very simple: 6006 * account one nodesize for each outstanding extent 6007 * 6008 * This is overestimating in most cases. 6009 */ 6010 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; 6011 6012 spin_lock(&block_rsv->lock); 6013 block_rsv->size = reserve_size; 6014 block_rsv->qgroup_rsv_size = qgroup_rsv_size; 6015 spin_unlock(&block_rsv->lock); 6016 } 6017 6018 static void calc_inode_reservations(struct btrfs_fs_info *fs_info, 6019 u64 num_bytes, u64 *meta_reserve, 6020 u64 *qgroup_reserve) 6021 { 6022 u64 nr_extents = count_max_extents(num_bytes); 6023 u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); 6024 6025 /* We add one for the inode update at finish ordered time */ 6026 *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, 6027 nr_extents + csum_leaves + 1); 6028 *qgroup_reserve = nr_extents * fs_info->nodesize; 6029 } 6030 6031 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 6032 { 6033 struct btrfs_root *root = inode->root; 6034 struct btrfs_fs_info *fs_info = root->fs_info; 6035 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 6036 u64 meta_reserve, qgroup_reserve; 6037 unsigned nr_extents; 6038 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 6039 int ret = 0; 6040 bool delalloc_lock = true; 6041 6042 /* If we are a free space inode we need to not flush since we will be in 6043 * the middle of a transaction commit. We also don't need the delalloc 6044 * mutex since we won't race with anybody. 
We need this mostly to make 6045 * lockdep shut its filthy mouth. 6046 * 6047 * If we have a transaction open (can happen if we call truncate_block 6048 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 6049 */ 6050 if (btrfs_is_free_space_inode(inode)) { 6051 flush = BTRFS_RESERVE_NO_FLUSH; 6052 delalloc_lock = false; 6053 } else { 6054 if (current->journal_info) 6055 flush = BTRFS_RESERVE_FLUSH_LIMIT; 6056 6057 if (btrfs_transaction_in_commit(fs_info)) 6058 schedule_timeout(1); 6059 } 6060 6061 if (delalloc_lock) 6062 mutex_lock(&inode->delalloc_mutex); 6063 6064 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6065 6066 /* 6067 * We always want to do it this way, every other way is wrong and ends 6068 * in tears. Pre-reserving the amount we are going to add will always 6069 * be the right way, because otherwise if we have enough parallelism we 6070 * could end up with thousands of inodes all holding little bits of 6071 * reservations they were able to make previously and the only way to 6072 * reclaim that space is to ENOSPC out the operations and clear 6073 * everything out and try again, which is bad. This way we just 6074 * over-reserve slightly, and clean up the mess when we are done. 6075 */ 6076 calc_inode_reservations(fs_info, num_bytes, &meta_reserve, 6077 &qgroup_reserve); 6078 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); 6079 if (ret) 6080 goto out_fail; 6081 ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); 6082 if (ret) 6083 goto out_qgroup; 6084 6085 /* 6086 * Now we need to update our outstanding extents and csum bytes _first_ 6087 * and then add the reservation to the block_rsv. This keeps us from 6088 * racing with an ordered completion or some such that would think it 6089 * needs to free the reservation we just made. 6090 */ 6091 spin_lock(&inode->lock); 6092 nr_extents = count_max_extents(num_bytes); 6093 btrfs_mod_outstanding_extents(inode, nr_extents); 6094 inode->csum_bytes += num_bytes; 6095 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6096 spin_unlock(&inode->lock); 6097 6098 /* Now we can safely add our space to our block rsv */ 6099 block_rsv_add_bytes(block_rsv, meta_reserve, false); 6100 trace_btrfs_space_reservation(root->fs_info, "delalloc", 6101 btrfs_ino(inode), meta_reserve, 1); 6102 6103 spin_lock(&block_rsv->lock); 6104 block_rsv->qgroup_rsv_reserved += qgroup_reserve; 6105 spin_unlock(&block_rsv->lock); 6106 6107 if (delalloc_lock) 6108 mutex_unlock(&inode->delalloc_mutex); 6109 return 0; 6110 out_qgroup: 6111 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); 6112 out_fail: 6113 btrfs_inode_rsv_release(inode, true); 6114 if (delalloc_lock) 6115 mutex_unlock(&inode->delalloc_mutex); 6116 return ret; 6117 } 6118 6119 /** 6120 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6121 * @inode: the inode to release the reservation for. 6122 * @num_bytes: the number of bytes we are releasing. 6123 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation 6124 * 6125 * This will release the metadata reservation for an inode. This can be called 6126 * once we complete IO for a given set of bytes to release their metadata 6127 * reservations, or on error for the same reason. 
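 *
 * This pairs with btrfs_delalloc_reserve_metadata().  A simplified sketch
 * (not an exact caller) is:
 *
 *   btrfs_delalloc_reserve_metadata(inode, len);
 *   ... set the range delalloc / create the ordered extent ...
 *   btrfs_delalloc_release_metadata(inode, len, false);
 *
 * with @qgroup_free set to true instead when tearing down after an error.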
6128 */ 6129 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, 6130 bool qgroup_free) 6131 { 6132 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6133 6134 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6135 spin_lock(&inode->lock); 6136 inode->csum_bytes -= num_bytes; 6137 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6138 spin_unlock(&inode->lock); 6139 6140 if (btrfs_is_testing(fs_info)) 6141 return; 6142 6143 btrfs_inode_rsv_release(inode, qgroup_free); 6144 } 6145 6146 /** 6147 * btrfs_delalloc_release_extents - release our outstanding_extents 6148 * @inode: the inode to balance the reservation for. 6149 * @num_bytes: the number of bytes we originally reserved with 6150 * @qgroup_free: do we need to free qgroup meta reservation or convert them. 6151 * 6152 * When we reserve space we increase outstanding_extents for the extents we may 6153 * add. Once we've set the range as delalloc or created our ordered extents we 6154 * have outstanding_extents to track the real usage, so we use this to free our 6155 * temporarily tracked outstanding_extents. This _must_ be used in conjunction 6156 * with btrfs_delalloc_reserve_metadata. 6157 */ 6158 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, 6159 bool qgroup_free) 6160 { 6161 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6162 unsigned num_extents; 6163 6164 spin_lock(&inode->lock); 6165 num_extents = count_max_extents(num_bytes); 6166 btrfs_mod_outstanding_extents(inode, -num_extents); 6167 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6168 spin_unlock(&inode->lock); 6169 6170 if (btrfs_is_testing(fs_info)) 6171 return; 6172 6173 btrfs_inode_rsv_release(inode, qgroup_free); 6174 } 6175 6176 /** 6177 * btrfs_delalloc_reserve_space - reserve data and metadata space for 6178 * delalloc 6179 * @inode: inode we're writing to 6180 * @start: start range we are writing to 6181 * @len: how long the range we are writing to 6182 * @reserved: mandatory parameter, record actually reserved qgroup ranges of 6183 * current reservation. 6184 * 6185 * This will do the following things 6186 * 6187 * o reserve space in data space info for num bytes 6188 * and reserve precious corresponding qgroup space 6189 * (Done in check_data_free_space) 6190 * 6191 * o reserve space for metadata space, based on the number of outstanding 6192 * extents and how much csums will be needed 6193 * also reserve metadata space in a per root over-reserve method. 6194 * o add to the inodes->delalloc_bytes 6195 * o add it to the fs_info's delalloc inodes list. 
6196 * (Above 3 all done in delalloc_reserve_metadata) 6197 * 6198 * Return 0 for success 6199 * Return <0 for error(-ENOSPC or -EQUOT) 6200 */ 6201 int btrfs_delalloc_reserve_space(struct inode *inode, 6202 struct extent_changeset **reserved, u64 start, u64 len) 6203 { 6204 int ret; 6205 6206 ret = btrfs_check_data_free_space(inode, reserved, start, len); 6207 if (ret < 0) 6208 return ret; 6209 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); 6210 if (ret < 0) 6211 btrfs_free_reserved_data_space(inode, *reserved, start, len); 6212 return ret; 6213 } 6214 6215 /** 6216 * btrfs_delalloc_release_space - release data and metadata space for delalloc 6217 * @inode: inode we're releasing space for 6218 * @start: start position of the space already reserved 6219 * @len: the len of the space already reserved 6220 * @release_bytes: the len of the space we consumed or didn't use 6221 * 6222 * This function will release the metadata space that was not used and will 6223 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6224 * list if there are no delalloc bytes left. 6225 * Also it will handle the qgroup reserved space. 6226 */ 6227 void btrfs_delalloc_release_space(struct inode *inode, 6228 struct extent_changeset *reserved, 6229 u64 start, u64 len, bool qgroup_free) 6230 { 6231 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); 6232 btrfs_free_reserved_data_space(inode, reserved, start, len); 6233 } 6234 6235 static int update_block_group(struct btrfs_trans_handle *trans, 6236 u64 bytenr, u64 num_bytes, int alloc) 6237 { 6238 struct btrfs_fs_info *info = trans->fs_info; 6239 struct btrfs_block_group_cache *cache = NULL; 6240 u64 total = num_bytes; 6241 u64 old_val; 6242 u64 byte_in_group; 6243 int factor; 6244 int ret = 0; 6245 6246 /* block accounting for super block */ 6247 spin_lock(&info->delalloc_root_lock); 6248 old_val = btrfs_super_bytes_used(info->super_copy); 6249 if (alloc) 6250 old_val += num_bytes; 6251 else 6252 old_val -= num_bytes; 6253 btrfs_set_super_bytes_used(info->super_copy, old_val); 6254 spin_unlock(&info->delalloc_root_lock); 6255 6256 while (total) { 6257 cache = btrfs_lookup_block_group(info, bytenr); 6258 if (!cache) { 6259 ret = -ENOENT; 6260 break; 6261 } 6262 factor = btrfs_bg_type_to_factor(cache->flags); 6263 6264 /* 6265 * If this block group has free space cache written out, we 6266 * need to make sure to load it if we are removing space. This 6267 * is because we need the unpinning stage to actually add the 6268 * space back to the block group, otherwise we will leak space. 
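		 * (The unpin path, unpin_extent_range(), returns the bytes
		 * with btrfs_add_free_space(), hence the cache_block_group()
		 * call below.)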
6269 */ 6270 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6271 cache_block_group(cache, 1); 6272 6273 byte_in_group = bytenr - cache->key.objectid; 6274 WARN_ON(byte_in_group > cache->key.offset); 6275 6276 spin_lock(&cache->space_info->lock); 6277 spin_lock(&cache->lock); 6278 6279 if (btrfs_test_opt(info, SPACE_CACHE) && 6280 cache->disk_cache_state < BTRFS_DC_CLEAR) 6281 cache->disk_cache_state = BTRFS_DC_CLEAR; 6282 6283 old_val = btrfs_block_group_used(&cache->item); 6284 num_bytes = min(total, cache->key.offset - byte_in_group); 6285 if (alloc) { 6286 old_val += num_bytes; 6287 btrfs_set_block_group_used(&cache->item, old_val); 6288 cache->reserved -= num_bytes; 6289 cache->space_info->bytes_reserved -= num_bytes; 6290 cache->space_info->bytes_used += num_bytes; 6291 cache->space_info->disk_used += num_bytes * factor; 6292 spin_unlock(&cache->lock); 6293 spin_unlock(&cache->space_info->lock); 6294 } else { 6295 old_val -= num_bytes; 6296 btrfs_set_block_group_used(&cache->item, old_val); 6297 cache->pinned += num_bytes; 6298 update_bytes_pinned(cache->space_info, num_bytes); 6299 cache->space_info->bytes_used -= num_bytes; 6300 cache->space_info->disk_used -= num_bytes * factor; 6301 spin_unlock(&cache->lock); 6302 spin_unlock(&cache->space_info->lock); 6303 6304 trace_btrfs_space_reservation(info, "pinned", 6305 cache->space_info->flags, 6306 num_bytes, 1); 6307 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 6308 num_bytes, 6309 BTRFS_TOTAL_BYTES_PINNED_BATCH); 6310 set_extent_dirty(info->pinned_extents, 6311 bytenr, bytenr + num_bytes - 1, 6312 GFP_NOFS | __GFP_NOFAIL); 6313 } 6314 6315 spin_lock(&trans->transaction->dirty_bgs_lock); 6316 if (list_empty(&cache->dirty_list)) { 6317 list_add_tail(&cache->dirty_list, 6318 &trans->transaction->dirty_bgs); 6319 trans->delayed_ref_updates++; 6320 btrfs_get_block_group(cache); 6321 } 6322 spin_unlock(&trans->transaction->dirty_bgs_lock); 6323 6324 /* 6325 * No longer have used bytes in this block group, queue it for 6326 * deletion. We do this after adding the block group to the 6327 * dirty list to avoid races between cleaner kthread and space 6328 * cache writeout. 6329 */ 6330 if (!alloc && old_val == 0) 6331 btrfs_mark_bg_unused(cache); 6332 6333 btrfs_put_block_group(cache); 6334 total -= num_bytes; 6335 bytenr += num_bytes; 6336 } 6337 6338 /* Modified block groups are accounted for in the delayed_refs_rsv. 
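	 * Each block group added to ->dirty_bgs above also bumped
	 * trans->delayed_ref_updates, so fold that into the rsv size now.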
*/ 6339 btrfs_update_delayed_refs_rsv(trans); 6340 return ret; 6341 } 6342 6343 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6344 { 6345 struct btrfs_block_group_cache *cache; 6346 u64 bytenr; 6347 6348 spin_lock(&fs_info->block_group_cache_lock); 6349 bytenr = fs_info->first_logical_byte; 6350 spin_unlock(&fs_info->block_group_cache_lock); 6351 6352 if (bytenr < (u64)-1) 6353 return bytenr; 6354 6355 cache = btrfs_lookup_first_block_group(fs_info, search_start); 6356 if (!cache) 6357 return 0; 6358 6359 bytenr = cache->key.objectid; 6360 btrfs_put_block_group(cache); 6361 6362 return bytenr; 6363 } 6364 6365 static int pin_down_extent(struct btrfs_block_group_cache *cache, 6366 u64 bytenr, u64 num_bytes, int reserved) 6367 { 6368 struct btrfs_fs_info *fs_info = cache->fs_info; 6369 6370 spin_lock(&cache->space_info->lock); 6371 spin_lock(&cache->lock); 6372 cache->pinned += num_bytes; 6373 update_bytes_pinned(cache->space_info, num_bytes); 6374 if (reserved) { 6375 cache->reserved -= num_bytes; 6376 cache->space_info->bytes_reserved -= num_bytes; 6377 } 6378 spin_unlock(&cache->lock); 6379 spin_unlock(&cache->space_info->lock); 6380 6381 trace_btrfs_space_reservation(fs_info, "pinned", 6382 cache->space_info->flags, num_bytes, 1); 6383 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 6384 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); 6385 set_extent_dirty(fs_info->pinned_extents, bytenr, 6386 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6387 return 0; 6388 } 6389 6390 /* 6391 * this function must be called within transaction 6392 */ 6393 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 6394 u64 bytenr, u64 num_bytes, int reserved) 6395 { 6396 struct btrfs_block_group_cache *cache; 6397 6398 cache = btrfs_lookup_block_group(fs_info, bytenr); 6399 BUG_ON(!cache); /* Logic error */ 6400 6401 pin_down_extent(cache, bytenr, num_bytes, reserved); 6402 6403 btrfs_put_block_group(cache); 6404 return 0; 6405 } 6406 6407 /* 6408 * this function must be called within transaction 6409 */ 6410 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 6411 u64 bytenr, u64 num_bytes) 6412 { 6413 struct btrfs_block_group_cache *cache; 6414 int ret; 6415 6416 cache = btrfs_lookup_block_group(fs_info, bytenr); 6417 if (!cache) 6418 return -EINVAL; 6419 6420 /* 6421 * pull in the free space cache (if any) so that our pin 6422 * removes the free space from the cache. We have load_only set 6423 * to one because the slow code to read in the free extents does check 6424 * the pinned extents. 
6425 */ 6426 cache_block_group(cache, 1); 6427 6428 pin_down_extent(cache, bytenr, num_bytes, 0); 6429 6430 /* remove us from the free space cache (if we're there at all) */ 6431 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6432 btrfs_put_block_group(cache); 6433 return ret; 6434 } 6435 6436 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 6437 u64 start, u64 num_bytes) 6438 { 6439 int ret; 6440 struct btrfs_block_group_cache *block_group; 6441 struct btrfs_caching_control *caching_ctl; 6442 6443 block_group = btrfs_lookup_block_group(fs_info, start); 6444 if (!block_group) 6445 return -EINVAL; 6446 6447 cache_block_group(block_group, 0); 6448 caching_ctl = get_caching_control(block_group); 6449 6450 if (!caching_ctl) { 6451 /* Logic error */ 6452 BUG_ON(!block_group_cache_done(block_group)); 6453 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6454 } else { 6455 mutex_lock(&caching_ctl->mutex); 6456 6457 if (start >= caching_ctl->progress) { 6458 ret = add_excluded_extent(fs_info, start, num_bytes); 6459 } else if (start + num_bytes <= caching_ctl->progress) { 6460 ret = btrfs_remove_free_space(block_group, 6461 start, num_bytes); 6462 } else { 6463 num_bytes = caching_ctl->progress - start; 6464 ret = btrfs_remove_free_space(block_group, 6465 start, num_bytes); 6466 if (ret) 6467 goto out_lock; 6468 6469 num_bytes = (start + num_bytes) - 6470 caching_ctl->progress; 6471 start = caching_ctl->progress; 6472 ret = add_excluded_extent(fs_info, start, num_bytes); 6473 } 6474 out_lock: 6475 mutex_unlock(&caching_ctl->mutex); 6476 put_caching_control(caching_ctl); 6477 } 6478 btrfs_put_block_group(block_group); 6479 return ret; 6480 } 6481 6482 int btrfs_exclude_logged_extents(struct extent_buffer *eb) 6483 { 6484 struct btrfs_fs_info *fs_info = eb->fs_info; 6485 struct btrfs_file_extent_item *item; 6486 struct btrfs_key key; 6487 int found_type; 6488 int i; 6489 int ret = 0; 6490 6491 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 6492 return 0; 6493 6494 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6495 btrfs_item_key_to_cpu(eb, &key, i); 6496 if (key.type != BTRFS_EXTENT_DATA_KEY) 6497 continue; 6498 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6499 found_type = btrfs_file_extent_type(eb, item); 6500 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6501 continue; 6502 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6503 continue; 6504 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6505 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6506 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 6507 if (ret) 6508 break; 6509 } 6510 6511 return ret; 6512 } 6513 6514 static void 6515 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6516 { 6517 atomic_inc(&bg->reservations); 6518 } 6519 6520 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6521 const u64 start) 6522 { 6523 struct btrfs_block_group_cache *bg; 6524 6525 bg = btrfs_lookup_block_group(fs_info, start); 6526 ASSERT(bg); 6527 if (atomic_dec_and_test(&bg->reservations)) 6528 wake_up_var(&bg->reservations); 6529 btrfs_put_block_group(bg); 6530 } 6531 6532 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6533 { 6534 struct btrfs_space_info *space_info = bg->space_info; 6535 6536 ASSERT(bg->ro); 6537 6538 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6539 return; 6540 6541 /* 6542 * Our block group is read only but before we set it to read only, 6543 * some task might have had allocated 
an extent from it already, but it 6544 * has not yet created a respective ordered extent (and added it to a 6545 * root's list of ordered extents). 6546 * Therefore wait for any task currently allocating extents, since the 6547 * block group's reservations counter is incremented while a read lock 6548 * on the groups' semaphore is held and decremented after releasing 6549 * the read access on that semaphore and creating the ordered extent. 6550 */ 6551 down_write(&space_info->groups_sem); 6552 up_write(&space_info->groups_sem); 6553 6554 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); 6555 } 6556 6557 /** 6558 * btrfs_add_reserved_bytes - update the block_group and space info counters 6559 * @cache: The cache we are manipulating 6560 * @ram_bytes: The number of bytes of file content, and will be same to 6561 * @num_bytes except for the compress path. 6562 * @num_bytes: The number of bytes in question 6563 * @delalloc: The blocks are allocated for the delalloc write 6564 * 6565 * This is called by the allocator when it reserves space. If this is a 6566 * reservation and the block group has become read only we cannot make the 6567 * reservation and return -EAGAIN, otherwise this function always succeeds. 6568 */ 6569 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, 6570 u64 ram_bytes, u64 num_bytes, int delalloc) 6571 { 6572 struct btrfs_space_info *space_info = cache->space_info; 6573 int ret = 0; 6574 6575 spin_lock(&space_info->lock); 6576 spin_lock(&cache->lock); 6577 if (cache->ro) { 6578 ret = -EAGAIN; 6579 } else { 6580 cache->reserved += num_bytes; 6581 space_info->bytes_reserved += num_bytes; 6582 update_bytes_may_use(space_info, -ram_bytes); 6583 if (delalloc) 6584 cache->delalloc_bytes += num_bytes; 6585 } 6586 spin_unlock(&cache->lock); 6587 spin_unlock(&space_info->lock); 6588 return ret; 6589 } 6590 6591 /** 6592 * btrfs_free_reserved_bytes - update the block_group and space info counters 6593 * @cache: The cache we are manipulating 6594 * @num_bytes: The number of bytes in question 6595 * @delalloc: The blocks are allocated for the delalloc write 6596 * 6597 * This is called by somebody who is freeing space that was never actually used 6598 * on disk. For example if you reserve some space for a new leaf in transaction 6599 * A and before transaction A commits you free that leaf, you call this with 6600 * reserve set to 0 in order to clear the reservation. 
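 *
 * A sketch of that example:
 *
 *   btrfs_add_reserved_bytes(cache, len, len, 0);   reserve the new leaf
 *   ... the leaf is freed again before the commit ...
 *   btrfs_free_reserved_bytes(cache, len, 0);       drop the reservation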
6601 */ 6602 6603 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 6604 u64 num_bytes, int delalloc) 6605 { 6606 struct btrfs_space_info *space_info = cache->space_info; 6607 6608 spin_lock(&space_info->lock); 6609 spin_lock(&cache->lock); 6610 if (cache->ro) 6611 space_info->bytes_readonly += num_bytes; 6612 cache->reserved -= num_bytes; 6613 space_info->bytes_reserved -= num_bytes; 6614 space_info->max_extent_size = 0; 6615 6616 if (delalloc) 6617 cache->delalloc_bytes -= num_bytes; 6618 spin_unlock(&cache->lock); 6619 spin_unlock(&space_info->lock); 6620 } 6621 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 6622 { 6623 struct btrfs_caching_control *next; 6624 struct btrfs_caching_control *caching_ctl; 6625 struct btrfs_block_group_cache *cache; 6626 6627 down_write(&fs_info->commit_root_sem); 6628 6629 list_for_each_entry_safe(caching_ctl, next, 6630 &fs_info->caching_block_groups, list) { 6631 cache = caching_ctl->block_group; 6632 if (block_group_cache_done(cache)) { 6633 cache->last_byte_to_unpin = (u64)-1; 6634 list_del_init(&caching_ctl->list); 6635 put_caching_control(caching_ctl); 6636 } else { 6637 cache->last_byte_to_unpin = caching_ctl->progress; 6638 } 6639 } 6640 6641 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6642 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6643 else 6644 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6645 6646 up_write(&fs_info->commit_root_sem); 6647 6648 update_global_block_rsv(fs_info); 6649 } 6650 6651 /* 6652 * Returns the free cluster for the given space info and sets empty_cluster to 6653 * what it should be based on the mount options. 6654 */ 6655 static struct btrfs_free_cluster * 6656 fetch_cluster_info(struct btrfs_fs_info *fs_info, 6657 struct btrfs_space_info *space_info, u64 *empty_cluster) 6658 { 6659 struct btrfs_free_cluster *ret = NULL; 6660 6661 *empty_cluster = 0; 6662 if (btrfs_mixed_space_info(space_info)) 6663 return ret; 6664 6665 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6666 ret = &fs_info->meta_alloc_cluster; 6667 if (btrfs_test_opt(fs_info, SSD)) 6668 *empty_cluster = SZ_2M; 6669 else 6670 *empty_cluster = SZ_64K; 6671 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 6672 btrfs_test_opt(fs_info, SSD_SPREAD)) { 6673 *empty_cluster = SZ_2M; 6674 ret = &fs_info->data_alloc_cluster; 6675 } 6676 6677 return ret; 6678 } 6679 6680 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 6681 u64 start, u64 end, 6682 const bool return_free_space) 6683 { 6684 struct btrfs_block_group_cache *cache = NULL; 6685 struct btrfs_space_info *space_info; 6686 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6687 struct btrfs_free_cluster *cluster = NULL; 6688 u64 len; 6689 u64 total_unpinned = 0; 6690 u64 empty_cluster = 0; 6691 bool readonly; 6692 6693 while (start <= end) { 6694 readonly = false; 6695 if (!cache || 6696 start >= cache->key.objectid + cache->key.offset) { 6697 if (cache) 6698 btrfs_put_block_group(cache); 6699 total_unpinned = 0; 6700 cache = btrfs_lookup_block_group(fs_info, start); 6701 BUG_ON(!cache); /* Logic error */ 6702 6703 cluster = fetch_cluster_info(fs_info, 6704 cache->space_info, 6705 &empty_cluster); 6706 empty_cluster <<= 1; 6707 } 6708 6709 len = cache->key.objectid + cache->key.offset - start; 6710 len = min(len, end + 1 - start); 6711 6712 if (start < cache->last_byte_to_unpin) { 6713 len = min(len, cache->last_byte_to_unpin - start); 6714 if (return_free_space) 6715 
btrfs_add_free_space(cache, start, len); 6716 } 6717 6718 start += len; 6719 total_unpinned += len; 6720 space_info = cache->space_info; 6721 6722 /* 6723 * If this space cluster has been marked as fragmented and we've 6724 * unpinned enough in this block group to potentially allow a 6725 * cluster to be created inside of it go ahead and clear the 6726 * fragmented check. 6727 */ 6728 if (cluster && cluster->fragmented && 6729 total_unpinned > empty_cluster) { 6730 spin_lock(&cluster->lock); 6731 cluster->fragmented = 0; 6732 spin_unlock(&cluster->lock); 6733 } 6734 6735 spin_lock(&space_info->lock); 6736 spin_lock(&cache->lock); 6737 cache->pinned -= len; 6738 update_bytes_pinned(space_info, -len); 6739 6740 trace_btrfs_space_reservation(fs_info, "pinned", 6741 space_info->flags, len, 0); 6742 space_info->max_extent_size = 0; 6743 percpu_counter_add_batch(&space_info->total_bytes_pinned, 6744 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); 6745 if (cache->ro) { 6746 space_info->bytes_readonly += len; 6747 readonly = true; 6748 } 6749 spin_unlock(&cache->lock); 6750 if (!readonly && return_free_space && 6751 global_rsv->space_info == space_info) { 6752 u64 to_add = len; 6753 6754 spin_lock(&global_rsv->lock); 6755 if (!global_rsv->full) { 6756 to_add = min(len, global_rsv->size - 6757 global_rsv->reserved); 6758 global_rsv->reserved += to_add; 6759 update_bytes_may_use(space_info, to_add); 6760 if (global_rsv->reserved >= global_rsv->size) 6761 global_rsv->full = 1; 6762 trace_btrfs_space_reservation(fs_info, 6763 "space_info", 6764 space_info->flags, 6765 to_add, 1); 6766 len -= to_add; 6767 } 6768 spin_unlock(&global_rsv->lock); 6769 /* Add to any tickets we may have */ 6770 if (len) 6771 space_info_add_new_bytes(fs_info, space_info, 6772 len); 6773 } 6774 spin_unlock(&space_info->lock); 6775 } 6776 6777 if (cache) 6778 btrfs_put_block_group(cache); 6779 return 0; 6780 } 6781 6782 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 6783 { 6784 struct btrfs_fs_info *fs_info = trans->fs_info; 6785 struct btrfs_block_group_cache *block_group, *tmp; 6786 struct list_head *deleted_bgs; 6787 struct extent_io_tree *unpin; 6788 u64 start; 6789 u64 end; 6790 int ret; 6791 6792 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6793 unpin = &fs_info->freed_extents[1]; 6794 else 6795 unpin = &fs_info->freed_extents[0]; 6796 6797 while (!trans->aborted) { 6798 struct extent_state *cached_state = NULL; 6799 6800 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6801 ret = find_first_extent_bit(unpin, 0, &start, &end, 6802 EXTENT_DIRTY, &cached_state); 6803 if (ret) { 6804 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6805 break; 6806 } 6807 6808 if (btrfs_test_opt(fs_info, DISCARD)) 6809 ret = btrfs_discard_extent(fs_info, start, 6810 end + 1 - start, NULL); 6811 6812 clear_extent_dirty(unpin, start, end, &cached_state); 6813 unpin_extent_range(fs_info, start, end, true); 6814 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6815 free_extent_state(cached_state); 6816 cond_resched(); 6817 } 6818 6819 /* 6820 * Transaction is finished. We don't need the lock anymore. We 6821 * do need to clean up the block groups in case of a transaction 6822 * abort. 
6823 */ 6824 deleted_bgs = &trans->transaction->deleted_bgs; 6825 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6826 u64 trimmed = 0; 6827 6828 ret = -EROFS; 6829 if (!trans->aborted) 6830 ret = btrfs_discard_extent(fs_info, 6831 block_group->key.objectid, 6832 block_group->key.offset, 6833 &trimmed); 6834 6835 list_del_init(&block_group->bg_list); 6836 btrfs_put_block_group_trimming(block_group); 6837 btrfs_put_block_group(block_group); 6838 6839 if (ret) { 6840 const char *errstr = btrfs_decode_error(ret); 6841 btrfs_warn(fs_info, 6842 "discard failed while removing blockgroup: errno=%d %s", 6843 ret, errstr); 6844 } 6845 } 6846 6847 return 0; 6848 } 6849 6850 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6851 struct btrfs_delayed_ref_node *node, u64 parent, 6852 u64 root_objectid, u64 owner_objectid, 6853 u64 owner_offset, int refs_to_drop, 6854 struct btrfs_delayed_extent_op *extent_op) 6855 { 6856 struct btrfs_fs_info *info = trans->fs_info; 6857 struct btrfs_key key; 6858 struct btrfs_path *path; 6859 struct btrfs_root *extent_root = info->extent_root; 6860 struct extent_buffer *leaf; 6861 struct btrfs_extent_item *ei; 6862 struct btrfs_extent_inline_ref *iref; 6863 int ret; 6864 int is_data; 6865 int extent_slot = 0; 6866 int found_extent = 0; 6867 int num_to_del = 1; 6868 u32 item_size; 6869 u64 refs; 6870 u64 bytenr = node->bytenr; 6871 u64 num_bytes = node->num_bytes; 6872 int last_ref = 0; 6873 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 6874 6875 path = btrfs_alloc_path(); 6876 if (!path) 6877 return -ENOMEM; 6878 6879 path->reada = READA_FORWARD; 6880 path->leave_spinning = 1; 6881 6882 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 6883 BUG_ON(!is_data && refs_to_drop != 1); 6884 6885 if (is_data) 6886 skinny_metadata = false; 6887 6888 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 6889 parent, root_objectid, owner_objectid, 6890 owner_offset); 6891 if (ret == 0) { 6892 extent_slot = path->slots[0]; 6893 while (extent_slot >= 0) { 6894 btrfs_item_key_to_cpu(path->nodes[0], &key, 6895 extent_slot); 6896 if (key.objectid != bytenr) 6897 break; 6898 if (key.type == BTRFS_EXTENT_ITEM_KEY && 6899 key.offset == num_bytes) { 6900 found_extent = 1; 6901 break; 6902 } 6903 if (key.type == BTRFS_METADATA_ITEM_KEY && 6904 key.offset == owner_objectid) { 6905 found_extent = 1; 6906 break; 6907 } 6908 if (path->slots[0] - extent_slot > 5) 6909 break; 6910 extent_slot--; 6911 } 6912 6913 if (!found_extent) { 6914 BUG_ON(iref); 6915 ret = remove_extent_backref(trans, path, NULL, 6916 refs_to_drop, 6917 is_data, &last_ref); 6918 if (ret) { 6919 btrfs_abort_transaction(trans, ret); 6920 goto out; 6921 } 6922 btrfs_release_path(path); 6923 path->leave_spinning = 1; 6924 6925 key.objectid = bytenr; 6926 key.type = BTRFS_EXTENT_ITEM_KEY; 6927 key.offset = num_bytes; 6928 6929 if (!is_data && skinny_metadata) { 6930 key.type = BTRFS_METADATA_ITEM_KEY; 6931 key.offset = owner_objectid; 6932 } 6933 6934 ret = btrfs_search_slot(trans, extent_root, 6935 &key, path, -1, 1); 6936 if (ret > 0 && skinny_metadata && path->slots[0]) { 6937 /* 6938 * Couldn't find our skinny metadata item, 6939 * see if we have ye olde extent item. 
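				 * Skinny metadata items use the key
				 * (bytenr, METADATA_ITEM, level), the old
				 * style uses (bytenr, EXTENT_ITEM, num_bytes).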
6940 */ 6941 path->slots[0]--; 6942 btrfs_item_key_to_cpu(path->nodes[0], &key, 6943 path->slots[0]); 6944 if (key.objectid == bytenr && 6945 key.type == BTRFS_EXTENT_ITEM_KEY && 6946 key.offset == num_bytes) 6947 ret = 0; 6948 } 6949 6950 if (ret > 0 && skinny_metadata) { 6951 skinny_metadata = false; 6952 key.objectid = bytenr; 6953 key.type = BTRFS_EXTENT_ITEM_KEY; 6954 key.offset = num_bytes; 6955 btrfs_release_path(path); 6956 ret = btrfs_search_slot(trans, extent_root, 6957 &key, path, -1, 1); 6958 } 6959 6960 if (ret) { 6961 btrfs_err(info, 6962 "umm, got %d back from search, was looking for %llu", 6963 ret, bytenr); 6964 if (ret > 0) 6965 btrfs_print_leaf(path->nodes[0]); 6966 } 6967 if (ret < 0) { 6968 btrfs_abort_transaction(trans, ret); 6969 goto out; 6970 } 6971 extent_slot = path->slots[0]; 6972 } 6973 } else if (WARN_ON(ret == -ENOENT)) { 6974 btrfs_print_leaf(path->nodes[0]); 6975 btrfs_err(info, 6976 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 6977 bytenr, parent, root_objectid, owner_objectid, 6978 owner_offset); 6979 btrfs_abort_transaction(trans, ret); 6980 goto out; 6981 } else { 6982 btrfs_abort_transaction(trans, ret); 6983 goto out; 6984 } 6985 6986 leaf = path->nodes[0]; 6987 item_size = btrfs_item_size_nr(leaf, extent_slot); 6988 if (unlikely(item_size < sizeof(*ei))) { 6989 ret = -EINVAL; 6990 btrfs_print_v0_err(info); 6991 btrfs_abort_transaction(trans, ret); 6992 goto out; 6993 } 6994 ei = btrfs_item_ptr(leaf, extent_slot, 6995 struct btrfs_extent_item); 6996 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6997 key.type == BTRFS_EXTENT_ITEM_KEY) { 6998 struct btrfs_tree_block_info *bi; 6999 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 7000 bi = (struct btrfs_tree_block_info *)(ei + 1); 7001 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 7002 } 7003 7004 refs = btrfs_extent_refs(leaf, ei); 7005 if (refs < refs_to_drop) { 7006 btrfs_err(info, 7007 "trying to drop %d refs but we only have %Lu for bytenr %Lu", 7008 refs_to_drop, refs, bytenr); 7009 ret = -EINVAL; 7010 btrfs_abort_transaction(trans, ret); 7011 goto out; 7012 } 7013 refs -= refs_to_drop; 7014 7015 if (refs > 0) { 7016 if (extent_op) 7017 __run_delayed_extent_op(extent_op, leaf, ei); 7018 /* 7019 * In the case of inline back ref, reference count will 7020 * be updated by remove_extent_backref 7021 */ 7022 if (iref) { 7023 BUG_ON(!found_extent); 7024 } else { 7025 btrfs_set_extent_refs(leaf, ei, refs); 7026 btrfs_mark_buffer_dirty(leaf); 7027 } 7028 if (found_extent) { 7029 ret = remove_extent_backref(trans, path, iref, 7030 refs_to_drop, is_data, 7031 &last_ref); 7032 if (ret) { 7033 btrfs_abort_transaction(trans, ret); 7034 goto out; 7035 } 7036 } 7037 } else { 7038 if (found_extent) { 7039 BUG_ON(is_data && refs_to_drop != 7040 extent_data_ref_count(path, iref)); 7041 if (iref) { 7042 BUG_ON(path->slots[0] != extent_slot); 7043 } else { 7044 BUG_ON(path->slots[0] != extent_slot + 1); 7045 path->slots[0] = extent_slot; 7046 num_to_del = 2; 7047 } 7048 } 7049 7050 last_ref = 1; 7051 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 7052 num_to_del); 7053 if (ret) { 7054 btrfs_abort_transaction(trans, ret); 7055 goto out; 7056 } 7057 btrfs_release_path(path); 7058 7059 if (is_data) { 7060 ret = btrfs_del_csums(trans, info, bytenr, num_bytes); 7061 if (ret) { 7062 btrfs_abort_transaction(trans, ret); 7063 goto out; 7064 } 7065 } 7066 7067 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 7068 if (ret) { 7069 
btrfs_abort_transaction(trans, ret); 7070 goto out; 7071 } 7072 7073 ret = update_block_group(trans, bytenr, num_bytes, 0); 7074 if (ret) { 7075 btrfs_abort_transaction(trans, ret); 7076 goto out; 7077 } 7078 } 7079 btrfs_release_path(path); 7080 7081 out: 7082 btrfs_free_path(path); 7083 return ret; 7084 } 7085 7086 /* 7087 * when we free an block, it is possible (and likely) that we free the last 7088 * delayed ref for that extent as well. This searches the delayed ref tree for 7089 * a given extent, and if there are no other delayed refs to be processed, it 7090 * removes it from the tree. 7091 */ 7092 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 7093 u64 bytenr) 7094 { 7095 struct btrfs_delayed_ref_head *head; 7096 struct btrfs_delayed_ref_root *delayed_refs; 7097 int ret = 0; 7098 7099 delayed_refs = &trans->transaction->delayed_refs; 7100 spin_lock(&delayed_refs->lock); 7101 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 7102 if (!head) 7103 goto out_delayed_unlock; 7104 7105 spin_lock(&head->lock); 7106 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 7107 goto out; 7108 7109 if (cleanup_extent_op(head) != NULL) 7110 goto out; 7111 7112 /* 7113 * waiting for the lock here would deadlock. If someone else has it 7114 * locked they are already in the process of dropping it anyway 7115 */ 7116 if (!mutex_trylock(&head->mutex)) 7117 goto out; 7118 7119 btrfs_delete_ref_head(delayed_refs, head); 7120 head->processing = 0; 7121 7122 spin_unlock(&head->lock); 7123 spin_unlock(&delayed_refs->lock); 7124 7125 BUG_ON(head->extent_op); 7126 if (head->must_insert_reserved) 7127 ret = 1; 7128 7129 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); 7130 mutex_unlock(&head->mutex); 7131 btrfs_put_delayed_ref_head(head); 7132 return ret; 7133 out: 7134 spin_unlock(&head->lock); 7135 7136 out_delayed_unlock: 7137 spin_unlock(&delayed_refs->lock); 7138 return 0; 7139 } 7140 7141 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 7142 struct btrfs_root *root, 7143 struct extent_buffer *buf, 7144 u64 parent, int last_ref) 7145 { 7146 struct btrfs_fs_info *fs_info = root->fs_info; 7147 struct btrfs_ref generic_ref = { 0 }; 7148 int pin = 1; 7149 int ret; 7150 7151 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, 7152 buf->start, buf->len, parent); 7153 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 7154 root->root_key.objectid); 7155 7156 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7157 int old_ref_mod, new_ref_mod; 7158 7159 btrfs_ref_tree_mod(fs_info, &generic_ref); 7160 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, 7161 &old_ref_mod, &new_ref_mod); 7162 BUG_ON(ret); /* -ENOMEM */ 7163 pin = old_ref_mod >= 0 && new_ref_mod < 0; 7164 } 7165 7166 if (last_ref && btrfs_header_generation(buf) == trans->transid) { 7167 struct btrfs_block_group_cache *cache; 7168 7169 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7170 ret = check_ref_cleanup(trans, buf->start); 7171 if (!ret) 7172 goto out; 7173 } 7174 7175 pin = 0; 7176 cache = btrfs_lookup_block_group(fs_info, buf->start); 7177 7178 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7179 pin_down_extent(cache, buf->start, buf->len, 1); 7180 btrfs_put_block_group(cache); 7181 goto out; 7182 } 7183 7184 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7185 7186 btrfs_add_free_space(cache, buf->start, buf->len); 7187 btrfs_free_reserved_bytes(cache, buf->len, 0); 7188 btrfs_put_block_group(cache); 7189 
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 7190 } 7191 out: 7192 if (pin) 7193 add_pinned_bytes(fs_info, &generic_ref); 7194 7195 if (last_ref) { 7196 /* 7197 * Deleting the buffer, clear the corrupt flag since it doesn't 7198 * matter anymore. 7199 */ 7200 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7201 } 7202 } 7203 7204 /* Can return -ENOMEM */ 7205 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) 7206 { 7207 struct btrfs_fs_info *fs_info = trans->fs_info; 7208 int old_ref_mod, new_ref_mod; 7209 int ret; 7210 7211 if (btrfs_is_testing(fs_info)) 7212 return 0; 7213 7214 /* 7215 * tree log blocks never actually go into the extent allocation 7216 * tree, just update pinning info and exit early. 7217 */ 7218 if ((ref->type == BTRFS_REF_METADATA && 7219 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || 7220 (ref->type == BTRFS_REF_DATA && 7221 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { 7222 /* unlocks the pinned mutex */ 7223 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); 7224 old_ref_mod = new_ref_mod = 0; 7225 ret = 0; 7226 } else if (ref->type == BTRFS_REF_METADATA) { 7227 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, 7228 &old_ref_mod, &new_ref_mod); 7229 } else { 7230 ret = btrfs_add_delayed_data_ref(trans, ref, 0, 7231 &old_ref_mod, &new_ref_mod); 7232 } 7233 7234 if (!((ref->type == BTRFS_REF_METADATA && 7235 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || 7236 (ref->type == BTRFS_REF_DATA && 7237 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) 7238 btrfs_ref_tree_mod(fs_info, ref); 7239 7240 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) 7241 add_pinned_bytes(fs_info, ref); 7242 7243 return ret; 7244 } 7245 7246 /* 7247 * when we wait for progress in the block group caching, its because 7248 * our allocation attempt failed at least once. So, we must sleep 7249 * and let some progress happen before we try again. 7250 * 7251 * This function will sleep at least once waiting for new free space to 7252 * show up, and then it will check the block group free space numbers 7253 * for our min num_bytes. Another option is to have it go ahead 7254 * and look in the rbtree for a free extent of a given size, but this 7255 * is a good start. 7256 * 7257 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7258 * any of the information in this block group. 7259 */ 7260 static noinline void 7261 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7262 u64 num_bytes) 7263 { 7264 struct btrfs_caching_control *caching_ctl; 7265 7266 caching_ctl = get_caching_control(cache); 7267 if (!caching_ctl) 7268 return; 7269 7270 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7271 (cache->free_space_ctl->free_space >= num_bytes)); 7272 7273 put_caching_control(caching_ctl); 7274 } 7275 7276 static noinline int 7277 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7278 { 7279 struct btrfs_caching_control *caching_ctl; 7280 int ret = 0; 7281 7282 caching_ctl = get_caching_control(cache); 7283 if (!caching_ctl) 7284 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 7285 7286 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7287 if (cache->cached == BTRFS_CACHE_ERROR) 7288 ret = -EIO; 7289 put_caching_control(caching_ctl); 7290 return ret; 7291 } 7292 7293 enum btrfs_loop_type { 7294 LOOP_CACHING_NOWAIT = 0, 7295 LOOP_CACHING_WAIT = 1, 7296 LOOP_ALLOC_CHUNK = 2, 7297 LOOP_NO_EMPTY_SIZE = 3, 7298 }; 7299 7300 static inline void 7301 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7302 int delalloc) 7303 { 7304 if (delalloc) 7305 down_read(&cache->data_rwsem); 7306 } 7307 7308 static inline void 7309 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7310 int delalloc) 7311 { 7312 btrfs_get_block_group(cache); 7313 if (delalloc) 7314 down_read(&cache->data_rwsem); 7315 } 7316 7317 static struct btrfs_block_group_cache * 7318 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7319 struct btrfs_free_cluster *cluster, 7320 int delalloc) 7321 { 7322 struct btrfs_block_group_cache *used_bg = NULL; 7323 7324 spin_lock(&cluster->refill_lock); 7325 while (1) { 7326 used_bg = cluster->block_group; 7327 if (!used_bg) 7328 return NULL; 7329 7330 if (used_bg == block_group) 7331 return used_bg; 7332 7333 btrfs_get_block_group(used_bg); 7334 7335 if (!delalloc) 7336 return used_bg; 7337 7338 if (down_read_trylock(&used_bg->data_rwsem)) 7339 return used_bg; 7340 7341 spin_unlock(&cluster->refill_lock); 7342 7343 /* We should only have one-level nested. */ 7344 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 7345 7346 spin_lock(&cluster->refill_lock); 7347 if (used_bg == cluster->block_group) 7348 return used_bg; 7349 7350 up_read(&used_bg->data_rwsem); 7351 btrfs_put_block_group(used_bg); 7352 } 7353 } 7354 7355 static inline void 7356 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7357 int delalloc) 7358 { 7359 if (delalloc) 7360 up_read(&cache->data_rwsem); 7361 btrfs_put_block_group(cache); 7362 } 7363 7364 /* 7365 * Structure used internally for find_free_extent() function. Wraps needed 7366 * parameters. 7367 */ 7368 struct find_free_extent_ctl { 7369 /* Basic allocation info */ 7370 u64 ram_bytes; 7371 u64 num_bytes; 7372 u64 empty_size; 7373 u64 flags; 7374 int delalloc; 7375 7376 /* Where to start the search inside the bg */ 7377 u64 search_start; 7378 7379 /* For clustered allocation */ 7380 u64 empty_cluster; 7381 7382 bool have_caching_bg; 7383 bool orig_have_caching_bg; 7384 7385 /* RAID index, converted from flags */ 7386 int index; 7387 7388 /* 7389 * Current loop number, check find_free_extent_update_loop() for details 7390 */ 7391 int loop; 7392 7393 /* 7394 * Whether we're refilling a cluster, if true we need to re-search 7395 * current block group but don't try to refill the cluster again. 7396 */ 7397 bool retry_clustered; 7398 7399 /* 7400 * Whether we're updating free space cache, if true we need to re-search 7401 * current block group but don't try updating free space cache again. 7402 */ 7403 bool retry_unclustered; 7404 7405 /* If current block group is cached */ 7406 int cached; 7407 7408 /* Max contiguous hole found */ 7409 u64 max_extent_size; 7410 7411 /* Total free space from free space cache, not always contiguous */ 7412 u64 total_free_space; 7413 7414 /* Found result */ 7415 u64 found_offset; 7416 }; 7417 7418 7419 /* 7420 * Helper function for find_free_extent(). 7421 * 7422 * Return -ENOENT to inform caller that we need fallback to unclustered mode. 
7423 * Return -EAGAIN to inform caller that we need to re-search this block group 7424 * Return >0 to inform caller that we find nothing 7425 * Return 0 means we have found a location and set ffe_ctl->found_offset. 7426 */ 7427 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, 7428 struct btrfs_free_cluster *last_ptr, 7429 struct find_free_extent_ctl *ffe_ctl, 7430 struct btrfs_block_group_cache **cluster_bg_ret) 7431 { 7432 struct btrfs_block_group_cache *cluster_bg; 7433 u64 aligned_cluster; 7434 u64 offset; 7435 int ret; 7436 7437 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); 7438 if (!cluster_bg) 7439 goto refill_cluster; 7440 if (cluster_bg != bg && (cluster_bg->ro || 7441 !block_group_bits(cluster_bg, ffe_ctl->flags))) 7442 goto release_cluster; 7443 7444 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, 7445 ffe_ctl->num_bytes, cluster_bg->key.objectid, 7446 &ffe_ctl->max_extent_size); 7447 if (offset) { 7448 /* We have a block, we're done */ 7449 spin_unlock(&last_ptr->refill_lock); 7450 trace_btrfs_reserve_extent_cluster(cluster_bg, 7451 ffe_ctl->search_start, ffe_ctl->num_bytes); 7452 *cluster_bg_ret = cluster_bg; 7453 ffe_ctl->found_offset = offset; 7454 return 0; 7455 } 7456 WARN_ON(last_ptr->block_group != cluster_bg); 7457 7458 release_cluster: 7459 /* 7460 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so 7461 * lets just skip it and let the allocator find whatever block it can 7462 * find. If we reach this point, we will have tried the cluster 7463 * allocator plenty of times and not have found anything, so we are 7464 * likely way too fragmented for the clustering stuff to find anything. 7465 * 7466 * However, if the cluster is taken from the current block group, 7467 * release the cluster first, so that we stand a better chance of 7468 * succeeding in the unclustered allocation. 
7469 */ 7470 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { 7471 spin_unlock(&last_ptr->refill_lock); 7472 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 7473 return -ENOENT; 7474 } 7475 7476 /* This cluster didn't work out, free it and start over */ 7477 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7478 7479 if (cluster_bg != bg) 7480 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 7481 7482 refill_cluster: 7483 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { 7484 spin_unlock(&last_ptr->refill_lock); 7485 return -ENOENT; 7486 } 7487 7488 aligned_cluster = max_t(u64, 7489 ffe_ctl->empty_cluster + ffe_ctl->empty_size, 7490 bg->full_stripe_len); 7491 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, 7492 ffe_ctl->num_bytes, aligned_cluster); 7493 if (ret == 0) { 7494 /* Now pull our allocation out of this cluster */ 7495 offset = btrfs_alloc_from_cluster(bg, last_ptr, 7496 ffe_ctl->num_bytes, ffe_ctl->search_start, 7497 &ffe_ctl->max_extent_size); 7498 if (offset) { 7499 /* We found one, proceed */ 7500 spin_unlock(&last_ptr->refill_lock); 7501 trace_btrfs_reserve_extent_cluster(bg, 7502 ffe_ctl->search_start, 7503 ffe_ctl->num_bytes); 7504 ffe_ctl->found_offset = offset; 7505 return 0; 7506 } 7507 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && 7508 !ffe_ctl->retry_clustered) { 7509 spin_unlock(&last_ptr->refill_lock); 7510 7511 ffe_ctl->retry_clustered = true; 7512 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 7513 ffe_ctl->empty_cluster + ffe_ctl->empty_size); 7514 return -EAGAIN; 7515 } 7516 /* 7517 * At this point we either didn't find a cluster or we weren't able to 7518 * allocate a block from our cluster. Free the cluster we've been 7519 * trying to use, and go to the next block group. 7520 */ 7521 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7522 spin_unlock(&last_ptr->refill_lock); 7523 return 1; 7524 } 7525 7526 /* 7527 * Return >0 to inform caller that we find nothing 7528 * Return 0 when we found an free extent and set ffe_ctrl->found_offset 7529 * Return -EAGAIN to inform caller that we need to re-search this block group 7530 */ 7531 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, 7532 struct btrfs_free_cluster *last_ptr, 7533 struct find_free_extent_ctl *ffe_ctl) 7534 { 7535 u64 offset; 7536 7537 /* 7538 * We are doing an unclustered allocation, set the fragmented flag so 7539 * we don't bother trying to setup a cluster again until we get more 7540 * space. 
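	 * (The flag is cleared again in unpin_extent_range() once enough
	 * space has been unpinned in this block group.)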
7541 */ 7542 if (unlikely(last_ptr)) { 7543 spin_lock(&last_ptr->lock); 7544 last_ptr->fragmented = 1; 7545 spin_unlock(&last_ptr->lock); 7546 } 7547 if (ffe_ctl->cached) { 7548 struct btrfs_free_space_ctl *free_space_ctl; 7549 7550 free_space_ctl = bg->free_space_ctl; 7551 spin_lock(&free_space_ctl->tree_lock); 7552 if (free_space_ctl->free_space < 7553 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 7554 ffe_ctl->empty_size) { 7555 ffe_ctl->total_free_space = max_t(u64, 7556 ffe_ctl->total_free_space, 7557 free_space_ctl->free_space); 7558 spin_unlock(&free_space_ctl->tree_lock); 7559 return 1; 7560 } 7561 spin_unlock(&free_space_ctl->tree_lock); 7562 } 7563 7564 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 7565 ffe_ctl->num_bytes, ffe_ctl->empty_size, 7566 &ffe_ctl->max_extent_size); 7567 7568 /* 7569 * If we didn't find a chunk, and we haven't failed on this block group 7570 * before, and this block group is in the middle of caching and we are 7571 * ok with waiting, then go ahead and wait for progress to be made, and 7572 * set @retry_unclustered to true. 7573 * 7574 * If @retry_unclustered is true then we've already waited on this 7575 * block group once and should move on to the next block group. 7576 */ 7577 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && 7578 ffe_ctl->loop > LOOP_CACHING_NOWAIT) { 7579 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 7580 ffe_ctl->empty_size); 7581 ffe_ctl->retry_unclustered = true; 7582 return -EAGAIN; 7583 } else if (!offset) { 7584 return 1; 7585 } 7586 ffe_ctl->found_offset = offset; 7587 return 0; 7588 } 7589 7590 /* 7591 * Return >0 means caller needs to re-search for free extent 7592 * Return 0 means we have the needed free extent. 7593 * Return <0 means we failed to locate any free extent. 7594 */ 7595 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, 7596 struct btrfs_free_cluster *last_ptr, 7597 struct btrfs_key *ins, 7598 struct find_free_extent_ctl *ffe_ctl, 7599 int full_search, bool use_cluster) 7600 { 7601 struct btrfs_root *root = fs_info->extent_root; 7602 int ret; 7603 7604 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && 7605 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) 7606 ffe_ctl->orig_have_caching_bg = true; 7607 7608 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && 7609 ffe_ctl->have_caching_bg) 7610 return 1; 7611 7612 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) 7613 return 1; 7614 7615 if (ins->objectid) { 7616 if (!use_cluster && last_ptr) { 7617 spin_lock(&last_ptr->lock); 7618 last_ptr->window_start = ins->objectid; 7619 spin_unlock(&last_ptr->lock); 7620 } 7621 return 0; 7622 } 7623 7624 /* 7625 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7626 * caching kthreads as we move along 7627 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7628 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7629 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7630 * again 7631 */ 7632 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { 7633 ffe_ctl->index = 0; 7634 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { 7635 /* 7636 * We want to skip the LOOP_CACHING_WAIT step if we 7637 * don't have any uncached bgs and we've already done a 7638 * full search through. 
7639 */ 7640 if (ffe_ctl->orig_have_caching_bg || !full_search) 7641 ffe_ctl->loop = LOOP_CACHING_WAIT; 7642 else 7643 ffe_ctl->loop = LOOP_ALLOC_CHUNK; 7644 } else { 7645 ffe_ctl->loop++; 7646 } 7647 7648 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 7649 struct btrfs_trans_handle *trans; 7650 int exist = 0; 7651 7652 trans = current->journal_info; 7653 if (trans) 7654 exist = 1; 7655 else 7656 trans = btrfs_join_transaction(root); 7657 7658 if (IS_ERR(trans)) { 7659 ret = PTR_ERR(trans); 7660 return ret; 7661 } 7662 7663 ret = do_chunk_alloc(trans, ffe_ctl->flags, 7664 CHUNK_ALLOC_FORCE); 7665 7666 /* 7667 * If we can't allocate a new chunk we've already looped 7668 * through at least once, move on to the NO_EMPTY_SIZE 7669 * case. 7670 */ 7671 if (ret == -ENOSPC) 7672 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; 7673 7674 /* Do not bail out on ENOSPC since we can do more. */ 7675 if (ret < 0 && ret != -ENOSPC) 7676 btrfs_abort_transaction(trans, ret); 7677 else 7678 ret = 0; 7679 if (!exist) 7680 btrfs_end_transaction(trans); 7681 if (ret) 7682 return ret; 7683 } 7684 7685 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 7686 /* 7687 * Don't loop again if we already have no empty_size and 7688 * no empty_cluster. 7689 */ 7690 if (ffe_ctl->empty_size == 0 && 7691 ffe_ctl->empty_cluster == 0) 7692 return -ENOSPC; 7693 ffe_ctl->empty_size = 0; 7694 ffe_ctl->empty_cluster = 0; 7695 } 7696 return 1; 7697 } 7698 return -ENOSPC; 7699 } 7700 7701 /* 7702 * walks the btree of allocated extents and find a hole of a given size. 7703 * The key ins is changed to record the hole: 7704 * ins->objectid == start position 7705 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7706 * ins->offset == the size of the hole. 7707 * Any available blocks before search_start are skipped. 7708 * 7709 * If there is no suitable free space, we will record the max size of 7710 * the free space extent currently. 
7711 * 7712 * The overall logic and call chain: 7713 * 7714 * find_free_extent() 7715 * |- Iterate through all block groups 7716 * | |- Get a valid block group 7717 * | |- Try to do clustered allocation in that block group 7718 * | |- Try to do unclustered allocation in that block group 7719 * | |- Check if the result is valid 7720 * | | |- If valid, then exit 7721 * | |- Jump to next block group 7722 * | 7723 * |- Push harder to find free extents 7724 * |- If not found, re-iterate all block groups 7725 */ 7726 static noinline int find_free_extent(struct btrfs_fs_info *fs_info, 7727 u64 ram_bytes, u64 num_bytes, u64 empty_size, 7728 u64 hint_byte, struct btrfs_key *ins, 7729 u64 flags, int delalloc) 7730 { 7731 int ret = 0; 7732 struct btrfs_free_cluster *last_ptr = NULL; 7733 struct btrfs_block_group_cache *block_group = NULL; 7734 struct find_free_extent_ctl ffe_ctl = {0}; 7735 struct btrfs_space_info *space_info; 7736 bool use_cluster = true; 7737 bool full_search = false; 7738 7739 WARN_ON(num_bytes < fs_info->sectorsize); 7740 7741 ffe_ctl.ram_bytes = ram_bytes; 7742 ffe_ctl.num_bytes = num_bytes; 7743 ffe_ctl.empty_size = empty_size; 7744 ffe_ctl.flags = flags; 7745 ffe_ctl.search_start = 0; 7746 ffe_ctl.retry_clustered = false; 7747 ffe_ctl.retry_unclustered = false; 7748 ffe_ctl.delalloc = delalloc; 7749 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); 7750 ffe_ctl.have_caching_bg = false; 7751 ffe_ctl.orig_have_caching_bg = false; 7752 ffe_ctl.found_offset = 0; 7753 7754 ins->type = BTRFS_EXTENT_ITEM_KEY; 7755 ins->objectid = 0; 7756 ins->offset = 0; 7757 7758 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 7759 7760 space_info = __find_space_info(fs_info, flags); 7761 if (!space_info) { 7762 btrfs_err(fs_info, "No space info for %llu", flags); 7763 return -ENOSPC; 7764 } 7765 7766 /* 7767 * If our free space is heavily fragmented we may not be able to make 7768 * big contiguous allocations, so instead of doing the expensive search 7769 * for free space, simply return ENOSPC with our max_extent_size so we 7770 * can go ahead and search for a more manageable chunk. 7771 * 7772 * If our max_extent_size is large enough for our allocation simply 7773 * disable clustering since we will likely not be able to find enough 7774 * space to create a cluster and induce latency trying. 7775 */ 7776 if (unlikely(space_info->max_extent_size)) { 7777 spin_lock(&space_info->lock); 7778 if (space_info->max_extent_size && 7779 num_bytes > space_info->max_extent_size) { 7780 ins->offset = space_info->max_extent_size; 7781 spin_unlock(&space_info->lock); 7782 return -ENOSPC; 7783 } else if (space_info->max_extent_size) { 7784 use_cluster = false; 7785 } 7786 spin_unlock(&space_info->lock); 7787 } 7788 7789 last_ptr = fetch_cluster_info(fs_info, space_info, 7790 &ffe_ctl.empty_cluster); 7791 if (last_ptr) { 7792 spin_lock(&last_ptr->lock); 7793 if (last_ptr->block_group) 7794 hint_byte = last_ptr->window_start; 7795 if (last_ptr->fragmented) { 7796 /* 7797 * We still set window_start so we can keep track of the 7798 * last place we found an allocation to try and save 7799 * some time. 
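			 *
			 * With the cluster marked fragmented we also disable
			 * clustered allocation below (use_cluster = false), so
			 * only the unclustered path is attempted on this pass.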
7800 */ 7801 hint_byte = last_ptr->window_start; 7802 use_cluster = false; 7803 } 7804 spin_unlock(&last_ptr->lock); 7805 } 7806 7807 ffe_ctl.search_start = max(ffe_ctl.search_start, 7808 first_logical_byte(fs_info, 0)); 7809 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); 7810 if (ffe_ctl.search_start == hint_byte) { 7811 block_group = btrfs_lookup_block_group(fs_info, 7812 ffe_ctl.search_start); 7813 /* 7814 * we don't want to use the block group if it doesn't match our 7815 * allocation bits, or if its not cached. 7816 * 7817 * However if we are re-searching with an ideal block group 7818 * picked out then we don't care that the block group is cached. 7819 */ 7820 if (block_group && block_group_bits(block_group, flags) && 7821 block_group->cached != BTRFS_CACHE_NO) { 7822 down_read(&space_info->groups_sem); 7823 if (list_empty(&block_group->list) || 7824 block_group->ro) { 7825 /* 7826 * someone is removing this block group, 7827 * we can't jump into the have_block_group 7828 * target because our list pointers are not 7829 * valid 7830 */ 7831 btrfs_put_block_group(block_group); 7832 up_read(&space_info->groups_sem); 7833 } else { 7834 ffe_ctl.index = btrfs_bg_flags_to_raid_index( 7835 block_group->flags); 7836 btrfs_lock_block_group(block_group, delalloc); 7837 goto have_block_group; 7838 } 7839 } else if (block_group) { 7840 btrfs_put_block_group(block_group); 7841 } 7842 } 7843 search: 7844 ffe_ctl.have_caching_bg = false; 7845 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || 7846 ffe_ctl.index == 0) 7847 full_search = true; 7848 down_read(&space_info->groups_sem); 7849 list_for_each_entry(block_group, 7850 &space_info->block_groups[ffe_ctl.index], list) { 7851 /* If the block group is read-only, we can skip it entirely. */ 7852 if (unlikely(block_group->ro)) 7853 continue; 7854 7855 btrfs_grab_block_group(block_group, delalloc); 7856 ffe_ctl.search_start = block_group->key.objectid; 7857 7858 /* 7859 * this can happen if we end up cycling through all the 7860 * raid types, but we want to make sure we only allocate 7861 * for the proper type. 7862 */ 7863 if (!block_group_bits(block_group, flags)) { 7864 u64 extra = BTRFS_BLOCK_GROUP_DUP | 7865 BTRFS_BLOCK_GROUP_RAID1 | 7866 BTRFS_BLOCK_GROUP_RAID5 | 7867 BTRFS_BLOCK_GROUP_RAID6 | 7868 BTRFS_BLOCK_GROUP_RAID10; 7869 7870 /* 7871 * if they asked for extra copies and this block group 7872 * doesn't provide them, bail. This does allow us to 7873 * fill raid0 from raid1. 
7874 */ 7875 if ((flags & extra) && !(block_group->flags & extra)) 7876 goto loop; 7877 } 7878 7879 have_block_group: 7880 ffe_ctl.cached = block_group_cache_done(block_group); 7881 if (unlikely(!ffe_ctl.cached)) { 7882 ffe_ctl.have_caching_bg = true; 7883 ret = cache_block_group(block_group, 0); 7884 BUG_ON(ret < 0); 7885 ret = 0; 7886 } 7887 7888 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 7889 goto loop; 7890 7891 /* 7892 * Ok we want to try and use the cluster allocator, so 7893 * lets look there 7894 */ 7895 if (last_ptr && use_cluster) { 7896 struct btrfs_block_group_cache *cluster_bg = NULL; 7897 7898 ret = find_free_extent_clustered(block_group, last_ptr, 7899 &ffe_ctl, &cluster_bg); 7900 7901 if (ret == 0) { 7902 if (cluster_bg && cluster_bg != block_group) { 7903 btrfs_release_block_group(block_group, 7904 delalloc); 7905 block_group = cluster_bg; 7906 } 7907 goto checks; 7908 } else if (ret == -EAGAIN) { 7909 goto have_block_group; 7910 } else if (ret > 0) { 7911 goto loop; 7912 } 7913 /* ret == -ENOENT case falls through */ 7914 } 7915 7916 ret = find_free_extent_unclustered(block_group, last_ptr, 7917 &ffe_ctl); 7918 if (ret == -EAGAIN) 7919 goto have_block_group; 7920 else if (ret > 0) 7921 goto loop; 7922 /* ret == 0 case falls through */ 7923 checks: 7924 ffe_ctl.search_start = round_up(ffe_ctl.found_offset, 7925 fs_info->stripesize); 7926 7927 /* move on to the next group */ 7928 if (ffe_ctl.search_start + num_bytes > 7929 block_group->key.objectid + block_group->key.offset) { 7930 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 7931 num_bytes); 7932 goto loop; 7933 } 7934 7935 if (ffe_ctl.found_offset < ffe_ctl.search_start) 7936 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 7937 ffe_ctl.search_start - ffe_ctl.found_offset); 7938 7939 ret = btrfs_add_reserved_bytes(block_group, ram_bytes, 7940 num_bytes, delalloc); 7941 if (ret == -EAGAIN) { 7942 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 7943 num_bytes); 7944 goto loop; 7945 } 7946 btrfs_inc_block_group_reservations(block_group); 7947 7948 /* we are all good, lets return */ 7949 ins->objectid = ffe_ctl.search_start; 7950 ins->offset = num_bytes; 7951 7952 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, 7953 num_bytes); 7954 btrfs_release_block_group(block_group, delalloc); 7955 break; 7956 loop: 7957 ffe_ctl.retry_clustered = false; 7958 ffe_ctl.retry_unclustered = false; 7959 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 7960 ffe_ctl.index); 7961 btrfs_release_block_group(block_group, delalloc); 7962 cond_resched(); 7963 } 7964 up_read(&space_info->groups_sem); 7965 7966 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, 7967 full_search, use_cluster); 7968 if (ret > 0) 7969 goto search; 7970 7971 if (ret == -ENOSPC) { 7972 /* 7973 * Use ffe_ctl->total_free_space as fallback if we can't find 7974 * any contiguous hole. 
7975 */ 7976 if (!ffe_ctl.max_extent_size) 7977 ffe_ctl.max_extent_size = ffe_ctl.total_free_space; 7978 spin_lock(&space_info->lock); 7979 space_info->max_extent_size = ffe_ctl.max_extent_size; 7980 spin_unlock(&space_info->lock); 7981 ins->offset = ffe_ctl.max_extent_size; 7982 } 7983 return ret; 7984 } 7985 7986 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 7987 do { \ 7988 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 7989 spin_lock(&__rsv->lock); \ 7990 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 7991 __rsv->size, __rsv->reserved); \ 7992 spin_unlock(&__rsv->lock); \ 7993 } while (0) 7994 7995 static void dump_space_info(struct btrfs_fs_info *fs_info, 7996 struct btrfs_space_info *info, u64 bytes, 7997 int dump_block_groups) 7998 { 7999 struct btrfs_block_group_cache *cache; 8000 int index = 0; 8001 8002 spin_lock(&info->lock); 8003 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 8004 info->flags, 8005 info->total_bytes - btrfs_space_info_used(info, true), 8006 info->full ? "" : "not "); 8007 btrfs_info(fs_info, 8008 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 8009 info->total_bytes, info->bytes_used, info->bytes_pinned, 8010 info->bytes_reserved, info->bytes_may_use, 8011 info->bytes_readonly); 8012 spin_unlock(&info->lock); 8013 8014 DUMP_BLOCK_RSV(fs_info, global_block_rsv); 8015 DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 8016 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 8017 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 8018 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 8019 8020 if (!dump_block_groups) 8021 return; 8022 8023 down_read(&info->groups_sem); 8024 again: 8025 list_for_each_entry(cache, &info->block_groups[index], list) { 8026 spin_lock(&cache->lock); 8027 btrfs_info(fs_info, 8028 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 8029 cache->key.objectid, cache->key.offset, 8030 btrfs_block_group_used(&cache->item), cache->pinned, 8031 cache->reserved, cache->ro ? "[readonly]" : ""); 8032 btrfs_dump_free_space(cache, bytes); 8033 spin_unlock(&cache->lock); 8034 } 8035 if (++index < BTRFS_NR_RAID_TYPES) 8036 goto again; 8037 up_read(&info->groups_sem); 8038 } 8039 8040 /* 8041 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 8042 * hole that is at least as big as @num_bytes. 8043 * 8044 * @root - The root that will contain this extent 8045 * 8046 * @ram_bytes - The amount of space in ram that @num_bytes take. This 8047 * is used for accounting purposes. This value differs 8048 * from @num_bytes only in the case of compressed extents. 8049 * 8050 * @num_bytes - Number of bytes to allocate on-disk. 8051 * 8052 * @min_alloc_size - Indicates the minimum amount of space that the 8053 * allocator should try to satisfy. In some cases 8054 * @num_bytes may be larger than what is required and if 8055 * the filesystem is fragmented then allocation fails. 8056 * However, the presence of @min_alloc_size gives a 8057 * chance to try and satisfy the smaller allocation. 8058 * 8059 * @empty_size - A hint that you plan on doing more COW. This is the 8060 * size in bytes the allocator should try to find free 8061 * next to the block it returns. This is just a hint and 8062 * may be ignored by the allocator. 8063 * 8064 * @hint_byte - Hint to the allocator to start searching above the byte 8065 * address passed. It might be ignored. 8066 * 8067 * @ins - This key is modified to record the found hole. 
It will 8068 * have the following values: 8069 * ins->objectid == start position 8070 * ins->flags = BTRFS_EXTENT_ITEM_KEY 8071 * ins->offset == the size of the hole. 8072 * 8073 * @is_data - Boolean flag indicating whether an extent is 8074 * allocated for data (true) or metadata (false) 8075 * 8076 * @delalloc - Boolean flag indicating whether this allocation is for 8077 * delalloc or not. If 'true' data_rwsem of block groups 8078 * is going to be acquired. 8079 * 8080 * 8081 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In 8082 * case -ENOSPC is returned then @ins->offset will contain the size of the 8083 * largest available hole the allocator managed to find. 8084 */ 8085 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 8086 u64 num_bytes, u64 min_alloc_size, 8087 u64 empty_size, u64 hint_byte, 8088 struct btrfs_key *ins, int is_data, int delalloc) 8089 { 8090 struct btrfs_fs_info *fs_info = root->fs_info; 8091 bool final_tried = num_bytes == min_alloc_size; 8092 u64 flags; 8093 int ret; 8094 8095 flags = get_alloc_profile_by_root(root, is_data); 8096 again: 8097 WARN_ON(num_bytes < fs_info->sectorsize); 8098 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 8099 hint_byte, ins, flags, delalloc); 8100 if (!ret && !is_data) { 8101 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 8102 } else if (ret == -ENOSPC) { 8103 if (!final_tried && ins->offset) { 8104 num_bytes = min(num_bytes >> 1, ins->offset); 8105 num_bytes = round_down(num_bytes, 8106 fs_info->sectorsize); 8107 num_bytes = max(num_bytes, min_alloc_size); 8108 ram_bytes = num_bytes; 8109 if (num_bytes == min_alloc_size) 8110 final_tried = true; 8111 goto again; 8112 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8113 struct btrfs_space_info *sinfo; 8114 8115 sinfo = __find_space_info(fs_info, flags); 8116 btrfs_err(fs_info, 8117 "allocation failed flags %llu, wanted %llu", 8118 flags, num_bytes); 8119 if (sinfo) 8120 dump_space_info(fs_info, sinfo, num_bytes, 1); 8121 } 8122 } 8123 8124 return ret; 8125 } 8126 8127 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8128 u64 start, u64 len, 8129 int pin, int delalloc) 8130 { 8131 struct btrfs_block_group_cache *cache; 8132 int ret = 0; 8133 8134 cache = btrfs_lookup_block_group(fs_info, start); 8135 if (!cache) { 8136 btrfs_err(fs_info, "Unable to find block group for %llu", 8137 start); 8138 return -ENOSPC; 8139 } 8140 8141 if (pin) 8142 pin_down_extent(cache, start, len, 1); 8143 else { 8144 if (btrfs_test_opt(fs_info, DISCARD)) 8145 ret = btrfs_discard_extent(fs_info, start, len, NULL); 8146 btrfs_add_free_space(cache, start, len); 8147 btrfs_free_reserved_bytes(cache, len, delalloc); 8148 trace_btrfs_reserved_extent_free(fs_info, start, len); 8149 } 8150 8151 btrfs_put_block_group(cache); 8152 return ret; 8153 } 8154 8155 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8156 u64 start, u64 len, int delalloc) 8157 { 8158 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 8159 } 8160 8161 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 8162 u64 start, u64 len) 8163 { 8164 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 8165 } 8166 8167 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8168 u64 parent, u64 root_objectid, 8169 u64 flags, u64 owner, u64 offset, 8170 struct btrfs_key *ins, int ref_mod) 8171 { 8172 struct btrfs_fs_info *fs_info = trans->fs_info; 8173 int ret; 8174 struct 
btrfs_extent_item *extent_item; 8175 struct btrfs_extent_inline_ref *iref; 8176 struct btrfs_path *path; 8177 struct extent_buffer *leaf; 8178 int type; 8179 u32 size; 8180 8181 if (parent > 0) 8182 type = BTRFS_SHARED_DATA_REF_KEY; 8183 else 8184 type = BTRFS_EXTENT_DATA_REF_KEY; 8185 8186 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8187 8188 path = btrfs_alloc_path(); 8189 if (!path) 8190 return -ENOMEM; 8191 8192 path->leave_spinning = 1; 8193 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8194 ins, size); 8195 if (ret) { 8196 btrfs_free_path(path); 8197 return ret; 8198 } 8199 8200 leaf = path->nodes[0]; 8201 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8202 struct btrfs_extent_item); 8203 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 8204 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8205 btrfs_set_extent_flags(leaf, extent_item, 8206 flags | BTRFS_EXTENT_FLAG_DATA); 8207 8208 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8209 btrfs_set_extent_inline_ref_type(leaf, iref, type); 8210 if (parent > 0) { 8211 struct btrfs_shared_data_ref *ref; 8212 ref = (struct btrfs_shared_data_ref *)(iref + 1); 8213 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8214 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 8215 } else { 8216 struct btrfs_extent_data_ref *ref; 8217 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 8218 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 8219 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 8220 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 8221 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 8222 } 8223 8224 btrfs_mark_buffer_dirty(path->nodes[0]); 8225 btrfs_free_path(path); 8226 8227 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); 8228 if (ret) 8229 return ret; 8230 8231 ret = update_block_group(trans, ins->objectid, ins->offset, 1); 8232 if (ret) { /* -ENOENT, logic error */ 8233 btrfs_err(fs_info, "update block group failed for %llu %llu", 8234 ins->objectid, ins->offset); 8235 BUG(); 8236 } 8237 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 8238 return ret; 8239 } 8240 8241 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 8242 struct btrfs_delayed_ref_node *node, 8243 struct btrfs_delayed_extent_op *extent_op) 8244 { 8245 struct btrfs_fs_info *fs_info = trans->fs_info; 8246 int ret; 8247 struct btrfs_extent_item *extent_item; 8248 struct btrfs_key extent_key; 8249 struct btrfs_tree_block_info *block_info; 8250 struct btrfs_extent_inline_ref *iref; 8251 struct btrfs_path *path; 8252 struct extent_buffer *leaf; 8253 struct btrfs_delayed_tree_ref *ref; 8254 u32 size = sizeof(*extent_item) + sizeof(*iref); 8255 u64 num_bytes; 8256 u64 flags = extent_op->flags_to_set; 8257 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8258 8259 ref = btrfs_delayed_node_to_tree_ref(node); 8260 8261 extent_key.objectid = node->bytenr; 8262 if (skinny_metadata) { 8263 extent_key.offset = ref->level; 8264 extent_key.type = BTRFS_METADATA_ITEM_KEY; 8265 num_bytes = fs_info->nodesize; 8266 } else { 8267 extent_key.offset = node->num_bytes; 8268 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 8269 size += sizeof(*block_info); 8270 num_bytes = node->num_bytes; 8271 } 8272 8273 path = btrfs_alloc_path(); 8274 if (!path) 8275 return -ENOMEM; 8276 8277 path->leave_spinning = 1; 8278 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8279 &extent_key, size); 8280 if 
(ret) { 8281 btrfs_free_path(path); 8282 return ret; 8283 } 8284 8285 leaf = path->nodes[0]; 8286 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8287 struct btrfs_extent_item); 8288 btrfs_set_extent_refs(leaf, extent_item, 1); 8289 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8290 btrfs_set_extent_flags(leaf, extent_item, 8291 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 8292 8293 if (skinny_metadata) { 8294 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8295 } else { 8296 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 8297 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 8298 btrfs_set_tree_block_level(leaf, block_info, ref->level); 8299 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 8300 } 8301 8302 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 8303 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 8304 btrfs_set_extent_inline_ref_type(leaf, iref, 8305 BTRFS_SHARED_BLOCK_REF_KEY); 8306 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); 8307 } else { 8308 btrfs_set_extent_inline_ref_type(leaf, iref, 8309 BTRFS_TREE_BLOCK_REF_KEY); 8310 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); 8311 } 8312 8313 btrfs_mark_buffer_dirty(leaf); 8314 btrfs_free_path(path); 8315 8316 ret = remove_from_free_space_tree(trans, extent_key.objectid, 8317 num_bytes); 8318 if (ret) 8319 return ret; 8320 8321 ret = update_block_group(trans, extent_key.objectid, 8322 fs_info->nodesize, 1); 8323 if (ret) { /* -ENOENT, logic error */ 8324 btrfs_err(fs_info, "update block group failed for %llu %llu", 8325 extent_key.objectid, extent_key.offset); 8326 BUG(); 8327 } 8328 8329 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, 8330 fs_info->nodesize); 8331 return ret; 8332 } 8333 8334 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8335 struct btrfs_root *root, u64 owner, 8336 u64 offset, u64 ram_bytes, 8337 struct btrfs_key *ins) 8338 { 8339 struct btrfs_ref generic_ref = { 0 }; 8340 int ret; 8341 8342 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 8343 8344 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 8345 ins->objectid, ins->offset, 0); 8346 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); 8347 btrfs_ref_tree_mod(root->fs_info, &generic_ref); 8348 ret = btrfs_add_delayed_data_ref(trans, &generic_ref, 8349 ram_bytes, NULL, NULL); 8350 return ret; 8351 } 8352 8353 /* 8354 * this is used by the tree logging recovery code. It records that 8355 * an extent has been allocated and makes sure to clear the free 8356 * space cache bits as well 8357 */ 8358 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 8359 u64 root_objectid, u64 owner, u64 offset, 8360 struct btrfs_key *ins) 8361 { 8362 struct btrfs_fs_info *fs_info = trans->fs_info; 8363 int ret; 8364 struct btrfs_block_group_cache *block_group; 8365 struct btrfs_space_info *space_info; 8366 8367 /* 8368 * Mixed block groups will exclude before processing the log so we only 8369 * need to do the exclude dance if this fs isn't mixed. 
8370 */ 8371 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 8372 ret = __exclude_logged_extent(fs_info, ins->objectid, 8373 ins->offset); 8374 if (ret) 8375 return ret; 8376 } 8377 8378 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 8379 if (!block_group) 8380 return -EINVAL; 8381 8382 space_info = block_group->space_info; 8383 spin_lock(&space_info->lock); 8384 spin_lock(&block_group->lock); 8385 space_info->bytes_reserved += ins->offset; 8386 block_group->reserved += ins->offset; 8387 spin_unlock(&block_group->lock); 8388 spin_unlock(&space_info->lock); 8389 8390 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 8391 offset, ins, 1); 8392 btrfs_put_block_group(block_group); 8393 return ret; 8394 } 8395 8396 static struct extent_buffer * 8397 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8398 u64 bytenr, int level, u64 owner) 8399 { 8400 struct btrfs_fs_info *fs_info = root->fs_info; 8401 struct extent_buffer *buf; 8402 8403 buf = btrfs_find_create_tree_block(fs_info, bytenr); 8404 if (IS_ERR(buf)) 8405 return buf; 8406 8407 /* 8408 * Extra safety check in case the extent tree is corrupted and extent 8409 * allocator chooses to use a tree block which is already used and 8410 * locked. 8411 */ 8412 if (buf->lock_owner == current->pid) { 8413 btrfs_err_rl(fs_info, 8414 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 8415 buf->start, btrfs_header_owner(buf), current->pid); 8416 free_extent_buffer(buf); 8417 return ERR_PTR(-EUCLEAN); 8418 } 8419 8420 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8421 btrfs_tree_lock(buf); 8422 btrfs_clean_tree_block(buf); 8423 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8424 8425 btrfs_set_lock_blocking_write(buf); 8426 set_extent_buffer_uptodate(buf); 8427 8428 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 8429 btrfs_set_header_level(buf, level); 8430 btrfs_set_header_bytenr(buf, buf->start); 8431 btrfs_set_header_generation(buf, trans->transid); 8432 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 8433 btrfs_set_header_owner(buf, owner); 8434 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 8435 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 8436 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8437 buf->log_index = root->log_transid % 2; 8438 /* 8439 * we allow two log transactions at a time, use different 8440 * EXTENT bit to differentiate dirty pages. 
8441 */ 8442 if (buf->log_index == 0) 8443 set_extent_dirty(&root->dirty_log_pages, buf->start, 8444 buf->start + buf->len - 1, GFP_NOFS); 8445 else 8446 set_extent_new(&root->dirty_log_pages, buf->start, 8447 buf->start + buf->len - 1); 8448 } else { 8449 buf->log_index = -1; 8450 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8451 buf->start + buf->len - 1, GFP_NOFS); 8452 } 8453 trans->dirty = true; 8454 /* this returns a buffer locked for blocking */ 8455 return buf; 8456 } 8457 8458 static struct btrfs_block_rsv * 8459 use_block_rsv(struct btrfs_trans_handle *trans, 8460 struct btrfs_root *root, u32 blocksize) 8461 { 8462 struct btrfs_fs_info *fs_info = root->fs_info; 8463 struct btrfs_block_rsv *block_rsv; 8464 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 8465 int ret; 8466 bool global_updated = false; 8467 8468 block_rsv = get_block_rsv(trans, root); 8469 8470 if (unlikely(block_rsv->size == 0)) 8471 goto try_reserve; 8472 again: 8473 ret = block_rsv_use_bytes(block_rsv, blocksize); 8474 if (!ret) 8475 return block_rsv; 8476 8477 if (block_rsv->failfast) 8478 return ERR_PTR(ret); 8479 8480 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8481 global_updated = true; 8482 update_global_block_rsv(fs_info); 8483 goto again; 8484 } 8485 8486 /* 8487 * The global reserve still exists to save us from ourselves, so don't 8488 * warn_on if we are short on our delayed refs reserve. 8489 */ 8490 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && 8491 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8492 static DEFINE_RATELIMIT_STATE(_rs, 8493 DEFAULT_RATELIMIT_INTERVAL * 10, 8494 /*DEFAULT_RATELIMIT_BURST*/ 1); 8495 if (__ratelimit(&_rs)) 8496 WARN(1, KERN_DEBUG 8497 "BTRFS: block rsv returned %d\n", ret); 8498 } 8499 try_reserve: 8500 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8501 BTRFS_RESERVE_NO_FLUSH); 8502 if (!ret) 8503 return block_rsv; 8504 /* 8505 * If we couldn't reserve metadata bytes try and use some from 8506 * the global reserve if its space type is the same as the global 8507 * reservation. 8508 */ 8509 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8510 block_rsv->space_info == global_rsv->space_info) { 8511 ret = block_rsv_use_bytes(global_rsv, blocksize); 8512 if (!ret) 8513 return global_rsv; 8514 } 8515 return ERR_PTR(ret); 8516 } 8517 8518 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8519 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8520 { 8521 block_rsv_add_bytes(block_rsv, blocksize, false); 8522 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); 8523 } 8524 8525 /* 8526 * finds a free extent and does all the dirty work required for allocation 8527 * returns the tree buffer or an ERR_PTR on error. 
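 *
 * Roughly, the steps below are: reserve metadata space via use_block_rsv(),
 * reserve a physical extent with btrfs_reserve_extent(), initialize the new
 * buffer, and (unless this is for the log tree) queue a delayed ref that will
 * insert the extent item later. Each error label unwinds only what was set up
 * before it.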
8528 */ 8529 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8530 struct btrfs_root *root, 8531 u64 parent, u64 root_objectid, 8532 const struct btrfs_disk_key *key, 8533 int level, u64 hint, 8534 u64 empty_size) 8535 { 8536 struct btrfs_fs_info *fs_info = root->fs_info; 8537 struct btrfs_key ins; 8538 struct btrfs_block_rsv *block_rsv; 8539 struct extent_buffer *buf; 8540 struct btrfs_delayed_extent_op *extent_op; 8541 struct btrfs_ref generic_ref = { 0 }; 8542 u64 flags = 0; 8543 int ret; 8544 u32 blocksize = fs_info->nodesize; 8545 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8546 8547 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8548 if (btrfs_is_testing(fs_info)) { 8549 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8550 level, root_objectid); 8551 if (!IS_ERR(buf)) 8552 root->alloc_bytenr += blocksize; 8553 return buf; 8554 } 8555 #endif 8556 8557 block_rsv = use_block_rsv(trans, root, blocksize); 8558 if (IS_ERR(block_rsv)) 8559 return ERR_CAST(block_rsv); 8560 8561 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 8562 empty_size, hint, &ins, 0, 0); 8563 if (ret) 8564 goto out_unuse; 8565 8566 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 8567 root_objectid); 8568 if (IS_ERR(buf)) { 8569 ret = PTR_ERR(buf); 8570 goto out_free_reserved; 8571 } 8572 8573 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8574 if (parent == 0) 8575 parent = ins.objectid; 8576 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8577 } else 8578 BUG_ON(parent > 0); 8579 8580 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8581 extent_op = btrfs_alloc_delayed_extent_op(); 8582 if (!extent_op) { 8583 ret = -ENOMEM; 8584 goto out_free_buf; 8585 } 8586 if (key) 8587 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8588 else 8589 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8590 extent_op->flags_to_set = flags; 8591 extent_op->update_key = skinny_metadata ? 
false : true; 8592 extent_op->update_flags = true; 8593 extent_op->is_data = false; 8594 extent_op->level = level; 8595 8596 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 8597 ins.objectid, ins.offset, parent); 8598 generic_ref.real_root = root->root_key.objectid; 8599 btrfs_init_tree_ref(&generic_ref, level, root_objectid); 8600 btrfs_ref_tree_mod(fs_info, &generic_ref); 8601 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, 8602 extent_op, NULL, NULL); 8603 if (ret) 8604 goto out_free_delayed; 8605 } 8606 return buf; 8607 8608 out_free_delayed: 8609 btrfs_free_delayed_extent_op(extent_op); 8610 out_free_buf: 8611 free_extent_buffer(buf); 8612 out_free_reserved: 8613 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 8614 out_unuse: 8615 unuse_block_rsv(fs_info, block_rsv, blocksize); 8616 return ERR_PTR(ret); 8617 } 8618 8619 struct walk_control { 8620 u64 refs[BTRFS_MAX_LEVEL]; 8621 u64 flags[BTRFS_MAX_LEVEL]; 8622 struct btrfs_key update_progress; 8623 struct btrfs_key drop_progress; 8624 int drop_level; 8625 int stage; 8626 int level; 8627 int shared_level; 8628 int update_ref; 8629 int keep_locks; 8630 int reada_slot; 8631 int reada_count; 8632 int restarted; 8633 }; 8634 8635 #define DROP_REFERENCE 1 8636 #define UPDATE_BACKREF 2 8637 8638 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8639 struct btrfs_root *root, 8640 struct walk_control *wc, 8641 struct btrfs_path *path) 8642 { 8643 struct btrfs_fs_info *fs_info = root->fs_info; 8644 u64 bytenr; 8645 u64 generation; 8646 u64 refs; 8647 u64 flags; 8648 u32 nritems; 8649 struct btrfs_key key; 8650 struct extent_buffer *eb; 8651 int ret; 8652 int slot; 8653 int nread = 0; 8654 8655 if (path->slots[wc->level] < wc->reada_slot) { 8656 wc->reada_count = wc->reada_count * 2 / 3; 8657 wc->reada_count = max(wc->reada_count, 2); 8658 } else { 8659 wc->reada_count = wc->reada_count * 3 / 2; 8660 wc->reada_count = min_t(int, wc->reada_count, 8661 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 8662 } 8663 8664 eb = path->nodes[wc->level]; 8665 nritems = btrfs_header_nritems(eb); 8666 8667 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8668 if (nread >= wc->reada_count) 8669 break; 8670 8671 cond_resched(); 8672 bytenr = btrfs_node_blockptr(eb, slot); 8673 generation = btrfs_node_ptr_generation(eb, slot); 8674 8675 if (slot == path->slots[wc->level]) 8676 goto reada; 8677 8678 if (wc->stage == UPDATE_BACKREF && 8679 generation <= root->root_key.offset) 8680 continue; 8681 8682 /* We don't lock the tree block, it's OK to be racy here */ 8683 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 8684 wc->level - 1, 1, &refs, 8685 &flags); 8686 /* We don't care about errors in readahead. */ 8687 if (ret < 0) 8688 continue; 8689 BUG_ON(refs == 0); 8690 8691 if (wc->stage == DROP_REFERENCE) { 8692 if (refs == 1) 8693 goto reada; 8694 8695 if (wc->level == 1 && 8696 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8697 continue; 8698 if (!wc->update_ref || 8699 generation <= root->root_key.offset) 8700 continue; 8701 btrfs_node_key_to_cpu(eb, &key, slot); 8702 ret = btrfs_comp_cpu_keys(&key, 8703 &wc->update_progress); 8704 if (ret < 0) 8705 continue; 8706 } else { 8707 if (wc->level == 1 && 8708 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8709 continue; 8710 } 8711 reada: 8712 readahead_tree_block(fs_info, bytenr); 8713 nread++; 8714 } 8715 wc->reada_slot = slot; 8716 } 8717 8718 /* 8719 * helper to process tree block while walking down the tree. 
8720 * 8721 * when wc->stage == UPDATE_BACKREF, this function updates 8722 * back refs for pointers in the block. 8723 * 8724 * NOTE: return value 1 means we should stop walking down. 8725 */ 8726 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8727 struct btrfs_root *root, 8728 struct btrfs_path *path, 8729 struct walk_control *wc, int lookup_info) 8730 { 8731 struct btrfs_fs_info *fs_info = root->fs_info; 8732 int level = wc->level; 8733 struct extent_buffer *eb = path->nodes[level]; 8734 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8735 int ret; 8736 8737 if (wc->stage == UPDATE_BACKREF && 8738 btrfs_header_owner(eb) != root->root_key.objectid) 8739 return 1; 8740 8741 /* 8742 * when reference count of tree block is 1, it won't increase 8743 * again. once full backref flag is set, we never clear it. 8744 */ 8745 if (lookup_info && 8746 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8747 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8748 BUG_ON(!path->locks[level]); 8749 ret = btrfs_lookup_extent_info(trans, fs_info, 8750 eb->start, level, 1, 8751 &wc->refs[level], 8752 &wc->flags[level]); 8753 BUG_ON(ret == -ENOMEM); 8754 if (ret) 8755 return ret; 8756 BUG_ON(wc->refs[level] == 0); 8757 } 8758 8759 if (wc->stage == DROP_REFERENCE) { 8760 if (wc->refs[level] > 1) 8761 return 1; 8762 8763 if (path->locks[level] && !wc->keep_locks) { 8764 btrfs_tree_unlock_rw(eb, path->locks[level]); 8765 path->locks[level] = 0; 8766 } 8767 return 0; 8768 } 8769 8770 /* wc->stage == UPDATE_BACKREF */ 8771 if (!(wc->flags[level] & flag)) { 8772 BUG_ON(!path->locks[level]); 8773 ret = btrfs_inc_ref(trans, root, eb, 1); 8774 BUG_ON(ret); /* -ENOMEM */ 8775 ret = btrfs_dec_ref(trans, root, eb, 0); 8776 BUG_ON(ret); /* -ENOMEM */ 8777 ret = btrfs_set_disk_extent_flags(trans, eb->start, 8778 eb->len, flag, 8779 btrfs_header_level(eb), 0); 8780 BUG_ON(ret); /* -ENOMEM */ 8781 wc->flags[level] |= flag; 8782 } 8783 8784 /* 8785 * the block is shared by multiple trees, so it's not good to 8786 * keep the tree lock 8787 */ 8788 if (path->locks[level] && level > 0) { 8789 btrfs_tree_unlock_rw(eb, path->locks[level]); 8790 path->locks[level] = 0; 8791 } 8792 return 0; 8793 } 8794 8795 /* 8796 * This is used to verify a ref exists for this root to deal with a bug where we 8797 * would have a drop_progress key that hadn't been updated properly. 8798 */ 8799 static int check_ref_exists(struct btrfs_trans_handle *trans, 8800 struct btrfs_root *root, u64 bytenr, u64 parent, 8801 int level) 8802 { 8803 struct btrfs_path *path; 8804 struct btrfs_extent_inline_ref *iref; 8805 int ret; 8806 8807 path = btrfs_alloc_path(); 8808 if (!path) 8809 return -ENOMEM; 8810 8811 ret = lookup_extent_backref(trans, path, &iref, bytenr, 8812 root->fs_info->nodesize, parent, 8813 root->root_key.objectid, level, 0); 8814 btrfs_free_path(path); 8815 if (ret == -ENOENT) 8816 return 0; 8817 if (ret < 0) 8818 return ret; 8819 return 1; 8820 } 8821 8822 /* 8823 * helper to process tree block pointer. 8824 * 8825 * when wc->stage == DROP_REFERENCE, this function checks 8826 * reference count of the block pointed to. if the block 8827 * is shared and we need update back refs for the subtree 8828 * rooted at the block, this function changes wc->stage to 8829 * UPDATE_BACKREF. if the block is shared and there is no 8830 * need to update back, this function drops the reference 8831 * to the block. 8832 * 8833 * NOTE: return value 1 means we should stop walking down. 
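 *
 * Note: when we decide not to descend into a shared child block (the skip
 * path below), the DROP_REFERENCE stage instead drops this tree's reference
 * on the child via a delayed ref, tracing the subtree for qgroups first when
 * that is needed.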
8834 */ 8835 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8836 struct btrfs_root *root, 8837 struct btrfs_path *path, 8838 struct walk_control *wc, int *lookup_info) 8839 { 8840 struct btrfs_fs_info *fs_info = root->fs_info; 8841 u64 bytenr; 8842 u64 generation; 8843 u64 parent; 8844 struct btrfs_key key; 8845 struct btrfs_key first_key; 8846 struct btrfs_ref ref = { 0 }; 8847 struct extent_buffer *next; 8848 int level = wc->level; 8849 int reada = 0; 8850 int ret = 0; 8851 bool need_account = false; 8852 8853 generation = btrfs_node_ptr_generation(path->nodes[level], 8854 path->slots[level]); 8855 /* 8856 * if the lower level block was created before the snapshot 8857 * was created, we know there is no need to update back refs 8858 * for the subtree 8859 */ 8860 if (wc->stage == UPDATE_BACKREF && 8861 generation <= root->root_key.offset) { 8862 *lookup_info = 1; 8863 return 1; 8864 } 8865 8866 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8867 btrfs_node_key_to_cpu(path->nodes[level], &first_key, 8868 path->slots[level]); 8869 8870 next = find_extent_buffer(fs_info, bytenr); 8871 if (!next) { 8872 next = btrfs_find_create_tree_block(fs_info, bytenr); 8873 if (IS_ERR(next)) 8874 return PTR_ERR(next); 8875 8876 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 8877 level - 1); 8878 reada = 1; 8879 } 8880 btrfs_tree_lock(next); 8881 btrfs_set_lock_blocking_write(next); 8882 8883 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 8884 &wc->refs[level - 1], 8885 &wc->flags[level - 1]); 8886 if (ret < 0) 8887 goto out_unlock; 8888 8889 if (unlikely(wc->refs[level - 1] == 0)) { 8890 btrfs_err(fs_info, "Missing references."); 8891 ret = -EIO; 8892 goto out_unlock; 8893 } 8894 *lookup_info = 0; 8895 8896 if (wc->stage == DROP_REFERENCE) { 8897 if (wc->refs[level - 1] > 1) { 8898 need_account = true; 8899 if (level == 1 && 8900 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8901 goto skip; 8902 8903 if (!wc->update_ref || 8904 generation <= root->root_key.offset) 8905 goto skip; 8906 8907 btrfs_node_key_to_cpu(path->nodes[level], &key, 8908 path->slots[level]); 8909 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 8910 if (ret < 0) 8911 goto skip; 8912 8913 wc->stage = UPDATE_BACKREF; 8914 wc->shared_level = level - 1; 8915 } 8916 } else { 8917 if (level == 1 && 8918 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8919 goto skip; 8920 } 8921 8922 if (!btrfs_buffer_uptodate(next, generation, 0)) { 8923 btrfs_tree_unlock(next); 8924 free_extent_buffer(next); 8925 next = NULL; 8926 *lookup_info = 1; 8927 } 8928 8929 if (!next) { 8930 if (reada && level == 1) 8931 reada_walk_down(trans, root, wc, path); 8932 next = read_tree_block(fs_info, bytenr, generation, level - 1, 8933 &first_key); 8934 if (IS_ERR(next)) { 8935 return PTR_ERR(next); 8936 } else if (!extent_buffer_uptodate(next)) { 8937 free_extent_buffer(next); 8938 return -EIO; 8939 } 8940 btrfs_tree_lock(next); 8941 btrfs_set_lock_blocking_write(next); 8942 } 8943 8944 level--; 8945 ASSERT(level == btrfs_header_level(next)); 8946 if (level != btrfs_header_level(next)) { 8947 btrfs_err(root->fs_info, "mismatched level"); 8948 ret = -EIO; 8949 goto out_unlock; 8950 } 8951 path->nodes[level] = next; 8952 path->slots[level] = 0; 8953 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8954 wc->level = level; 8955 if (wc->level == 1) 8956 wc->reada_slot = 0; 8957 return 0; 8958 skip: 8959 wc->refs[level - 1] = 0; 8960 wc->flags[level - 1] = 0; 8961 if (wc->stage == 
DROP_REFERENCE) { 8962 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 8963 parent = path->nodes[level]->start; 8964 } else { 8965 ASSERT(root->root_key.objectid == 8966 btrfs_header_owner(path->nodes[level])); 8967 if (root->root_key.objectid != 8968 btrfs_header_owner(path->nodes[level])) { 8969 btrfs_err(root->fs_info, 8970 "mismatched block owner"); 8971 ret = -EIO; 8972 goto out_unlock; 8973 } 8974 parent = 0; 8975 } 8976 8977 /* 8978 * If we had a drop_progress we need to verify the refs are set 8979 * as expected. If we find our ref then we know that from here 8980 * on out everything should be correct, and we can clear the 8981 * ->restarted flag. 8982 */ 8983 if (wc->restarted) { 8984 ret = check_ref_exists(trans, root, bytenr, parent, 8985 level - 1); 8986 if (ret < 0) 8987 goto out_unlock; 8988 if (ret == 0) 8989 goto no_delete; 8990 ret = 0; 8991 wc->restarted = 0; 8992 } 8993 8994 /* 8995 * Reloc tree doesn't contribute to qgroup numbers, and we have 8996 * already accounted them at merge time (replace_path), 8997 * thus we could skip expensive subtree trace here. 8998 */ 8999 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 9000 need_account) { 9001 ret = btrfs_qgroup_trace_subtree(trans, next, 9002 generation, level - 1); 9003 if (ret) { 9004 btrfs_err_rl(fs_info, 9005 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 9006 ret); 9007 } 9008 } 9009 9010 /* 9011 * We need to update the next key in our walk control so we can 9012 * update the drop_progress key accordingly. We don't care if 9013 * find_next_key doesn't find a key because that means we're at 9014 * the end and are going to clean up now. 9015 */ 9016 wc->drop_level = level; 9017 find_next_key(path, level, &wc->drop_progress); 9018 9019 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 9020 fs_info->nodesize, parent); 9021 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); 9022 ret = btrfs_free_extent(trans, &ref); 9023 if (ret) 9024 goto out_unlock; 9025 } 9026 no_delete: 9027 *lookup_info = 1; 9028 ret = 1; 9029 9030 out_unlock: 9031 btrfs_tree_unlock(next); 9032 free_extent_buffer(next); 9033 9034 return ret; 9035 } 9036 9037 /* 9038 * helper to process tree block while walking up the tree. 9039 * 9040 * when wc->stage == DROP_REFERENCE, this function drops 9041 * reference count on the block. 9042 * 9043 * when wc->stage == UPDATE_BACKREF, this function changes 9044 * wc->stage back to DROP_REFERENCE if we changed wc->stage 9045 * to UPDATE_BACKREF previously while processing the block. 9046 * 9047 * NOTE: return value 1 means we should stop walking up. 9048 */ 9049 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 9050 struct btrfs_root *root, 9051 struct btrfs_path *path, 9052 struct walk_control *wc) 9053 { 9054 struct btrfs_fs_info *fs_info = root->fs_info; 9055 int ret; 9056 int level = wc->level; 9057 struct extent_buffer *eb = path->nodes[level]; 9058 u64 parent = 0; 9059 9060 if (wc->stage == UPDATE_BACKREF) { 9061 BUG_ON(wc->shared_level < level); 9062 if (level < wc->shared_level) 9063 goto out; 9064 9065 ret = find_next_key(path, level + 1, &wc->update_progress); 9066 if (ret > 0) 9067 wc->update_ref = 0; 9068 9069 wc->stage = DROP_REFERENCE; 9070 wc->shared_level = -1; 9071 path->slots[level] = 0; 9072 9073 /* 9074 * check reference count again if the block isn't locked. 9075 * we should start walking down the tree again if reference 9076 * count is one. 
9077 */ 9078 if (!path->locks[level]) { 9079 BUG_ON(level == 0); 9080 btrfs_tree_lock(eb); 9081 btrfs_set_lock_blocking_write(eb); 9082 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9083 9084 ret = btrfs_lookup_extent_info(trans, fs_info, 9085 eb->start, level, 1, 9086 &wc->refs[level], 9087 &wc->flags[level]); 9088 if (ret < 0) { 9089 btrfs_tree_unlock_rw(eb, path->locks[level]); 9090 path->locks[level] = 0; 9091 return ret; 9092 } 9093 BUG_ON(wc->refs[level] == 0); 9094 if (wc->refs[level] == 1) { 9095 btrfs_tree_unlock_rw(eb, path->locks[level]); 9096 path->locks[level] = 0; 9097 return 1; 9098 } 9099 } 9100 } 9101 9102 /* wc->stage == DROP_REFERENCE */ 9103 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 9104 9105 if (wc->refs[level] == 1) { 9106 if (level == 0) { 9107 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9108 ret = btrfs_dec_ref(trans, root, eb, 1); 9109 else 9110 ret = btrfs_dec_ref(trans, root, eb, 0); 9111 BUG_ON(ret); /* -ENOMEM */ 9112 if (is_fstree(root->root_key.objectid)) { 9113 ret = btrfs_qgroup_trace_leaf_items(trans, eb); 9114 if (ret) { 9115 btrfs_err_rl(fs_info, 9116 "error %d accounting leaf items, quota is out of sync, rescan required", 9117 ret); 9118 } 9119 } 9120 } 9121 /* make block locked assertion in btrfs_clean_tree_block happy */ 9122 if (!path->locks[level] && 9123 btrfs_header_generation(eb) == trans->transid) { 9124 btrfs_tree_lock(eb); 9125 btrfs_set_lock_blocking_write(eb); 9126 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9127 } 9128 btrfs_clean_tree_block(eb); 9129 } 9130 9131 if (eb == root->node) { 9132 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9133 parent = eb->start; 9134 else if (root->root_key.objectid != btrfs_header_owner(eb)) 9135 goto owner_mismatch; 9136 } else { 9137 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9138 parent = path->nodes[level + 1]->start; 9139 else if (root->root_key.objectid != 9140 btrfs_header_owner(path->nodes[level + 1])) 9141 goto owner_mismatch; 9142 } 9143 9144 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 9145 out: 9146 wc->refs[level] = 0; 9147 wc->flags[level] = 0; 9148 return 0; 9149 9150 owner_mismatch: 9151 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", 9152 btrfs_header_owner(eb), root->root_key.objectid); 9153 return -EUCLEAN; 9154 } 9155 9156 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 9157 struct btrfs_root *root, 9158 struct btrfs_path *path, 9159 struct walk_control *wc) 9160 { 9161 int level = wc->level; 9162 int lookup_info = 1; 9163 int ret; 9164 9165 while (level >= 0) { 9166 ret = walk_down_proc(trans, root, path, wc, lookup_info); 9167 if (ret > 0) 9168 break; 9169 9170 if (level == 0) 9171 break; 9172 9173 if (path->slots[level] >= 9174 btrfs_header_nritems(path->nodes[level])) 9175 break; 9176 9177 ret = do_walk_down(trans, root, path, wc, &lookup_info); 9178 if (ret > 0) { 9179 path->slots[level]++; 9180 continue; 9181 } else if (ret < 0) 9182 return ret; 9183 level = wc->level; 9184 } 9185 return 0; 9186 } 9187 9188 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 9189 struct btrfs_root *root, 9190 struct btrfs_path *path, 9191 struct walk_control *wc, int max_level) 9192 { 9193 int level = wc->level; 9194 int ret; 9195 9196 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 9197 while (level < max_level && path->nodes[level]) { 9198 wc->level = level; 9199 if (path->slots[level] + 1 < 9200 btrfs_header_nritems(path->nodes[level])) { 9201 
path->slots[level]++;
9202 			return 0;
9203 		} else {
9204 			ret = walk_up_proc(trans, root, path, wc);
9205 			if (ret > 0)
9206 				return 0;
9207 			if (ret < 0)
9208 				return ret;
9209 
9210 			if (path->locks[level]) {
9211 				btrfs_tree_unlock_rw(path->nodes[level],
9212 					path->locks[level]);
9213 				path->locks[level] = 0;
9214 			}
9215 			free_extent_buffer(path->nodes[level]);
9216 			path->nodes[level] = NULL;
9217 			level++;
9218 		}
9219 	}
9220 	return 1;
9221 }
9222 
9223 /*
9224  * drop a subvolume tree.
9225  *
9226  * this function traverses the tree freeing any blocks that are only
9227  * referenced by the tree.
9228  *
9229  * when a shared tree block is found, this function decreases its
9230  * reference count by one. if update_ref is true, this function
9231  * also makes sure backrefs for the shared block and all lower level
9232  * blocks are properly updated.
9233  *
9234  * If called with for_reloc == 0, may exit early with -EAGAIN
9235  */
9236 int btrfs_drop_snapshot(struct btrfs_root *root,
9237 			 struct btrfs_block_rsv *block_rsv, int update_ref,
9238 			 int for_reloc)
9239 {
9240 	struct btrfs_fs_info *fs_info = root->fs_info;
9241 	struct btrfs_path *path;
9242 	struct btrfs_trans_handle *trans;
9243 	struct btrfs_root *tree_root = fs_info->tree_root;
9244 	struct btrfs_root_item *root_item = &root->root_item;
9245 	struct walk_control *wc;
9246 	struct btrfs_key key;
9247 	int err = 0;
9248 	int ret;
9249 	int level;
9250 	bool root_dropped = false;
9251 
9252 	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
9253 
9254 	path = btrfs_alloc_path();
9255 	if (!path) {
9256 		err = -ENOMEM;
9257 		goto out;
9258 	}
9259 
9260 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9261 	if (!wc) {
9262 		btrfs_free_path(path);
9263 		err = -ENOMEM;
9264 		goto out;
9265 	}
9266 
9267 	trans = btrfs_start_transaction(tree_root, 0);
9268 	if (IS_ERR(trans)) {
9269 		err = PTR_ERR(trans);
9270 		goto out_free;
9271 	}
9272 
9273 	err = btrfs_run_delayed_items(trans);
9274 	if (err)
9275 		goto out_end_trans;
9276 
9277 	if (block_rsv)
9278 		trans->block_rsv = block_rsv;
9279 
9280 	/*
9281 	 * This will help us catch people modifying the fs tree while we're
9282 	 * dropping it. It is unsafe to mess with the fs tree while it's being
9283 	 * dropped as we unlock the root node and parent nodes as we walk down
9284 	 * the tree, assuming nothing will change. If something does change
9285 	 * then we'll have stale information and drop references to blocks we've
9286 	 * already dropped.
9287 */ 9288 set_bit(BTRFS_ROOT_DELETING, &root->state); 9289 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9290 level = btrfs_header_level(root->node); 9291 path->nodes[level] = btrfs_lock_root_node(root); 9292 btrfs_set_lock_blocking_write(path->nodes[level]); 9293 path->slots[level] = 0; 9294 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9295 memset(&wc->update_progress, 0, 9296 sizeof(wc->update_progress)); 9297 } else { 9298 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9299 memcpy(&wc->update_progress, &key, 9300 sizeof(wc->update_progress)); 9301 9302 level = root_item->drop_level; 9303 BUG_ON(level == 0); 9304 path->lowest_level = level; 9305 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9306 path->lowest_level = 0; 9307 if (ret < 0) { 9308 err = ret; 9309 goto out_end_trans; 9310 } 9311 WARN_ON(ret > 0); 9312 9313 /* 9314 * unlock our path, this is safe because only this 9315 * function is allowed to delete this snapshot 9316 */ 9317 btrfs_unlock_up_safe(path, 0); 9318 9319 level = btrfs_header_level(root->node); 9320 while (1) { 9321 btrfs_tree_lock(path->nodes[level]); 9322 btrfs_set_lock_blocking_write(path->nodes[level]); 9323 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9324 9325 ret = btrfs_lookup_extent_info(trans, fs_info, 9326 path->nodes[level]->start, 9327 level, 1, &wc->refs[level], 9328 &wc->flags[level]); 9329 if (ret < 0) { 9330 err = ret; 9331 goto out_end_trans; 9332 } 9333 BUG_ON(wc->refs[level] == 0); 9334 9335 if (level == root_item->drop_level) 9336 break; 9337 9338 btrfs_tree_unlock(path->nodes[level]); 9339 path->locks[level] = 0; 9340 WARN_ON(wc->refs[level] != 1); 9341 level--; 9342 } 9343 } 9344 9345 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); 9346 wc->level = level; 9347 wc->shared_level = -1; 9348 wc->stage = DROP_REFERENCE; 9349 wc->update_ref = update_ref; 9350 wc->keep_locks = 0; 9351 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9352 9353 while (1) { 9354 9355 ret = walk_down_tree(trans, root, path, wc); 9356 if (ret < 0) { 9357 err = ret; 9358 break; 9359 } 9360 9361 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9362 if (ret < 0) { 9363 err = ret; 9364 break; 9365 } 9366 9367 if (ret > 0) { 9368 BUG_ON(wc->stage != DROP_REFERENCE); 9369 break; 9370 } 9371 9372 if (wc->stage == DROP_REFERENCE) { 9373 wc->drop_level = wc->level; 9374 btrfs_node_key_to_cpu(path->nodes[wc->drop_level], 9375 &wc->drop_progress, 9376 path->slots[wc->drop_level]); 9377 } 9378 btrfs_cpu_key_to_disk(&root_item->drop_progress, 9379 &wc->drop_progress); 9380 root_item->drop_level = wc->drop_level; 9381 9382 BUG_ON(wc->level == 0); 9383 if (btrfs_should_end_transaction(trans) || 9384 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 9385 ret = btrfs_update_root(trans, tree_root, 9386 &root->root_key, 9387 root_item); 9388 if (ret) { 9389 btrfs_abort_transaction(trans, ret); 9390 err = ret; 9391 goto out_end_trans; 9392 } 9393 9394 btrfs_end_transaction_throttle(trans); 9395 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 9396 btrfs_debug(fs_info, 9397 "drop snapshot early exit"); 9398 err = -EAGAIN; 9399 goto out_free; 9400 } 9401 9402 trans = btrfs_start_transaction(tree_root, 0); 9403 if (IS_ERR(trans)) { 9404 err = PTR_ERR(trans); 9405 goto out_free; 9406 } 9407 if (block_rsv) 9408 trans->block_rsv = block_rsv; 9409 } 9410 } 9411 btrfs_release_path(path); 9412 if (err) 9413 goto out_end_trans; 9414 9415 ret = btrfs_del_root(trans, &root->root_key); 9416 if (ret) { 9417 
btrfs_abort_transaction(trans, ret); 9418 err = ret; 9419 goto out_end_trans; 9420 } 9421 9422 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9423 ret = btrfs_find_root(tree_root, &root->root_key, path, 9424 NULL, NULL); 9425 if (ret < 0) { 9426 btrfs_abort_transaction(trans, ret); 9427 err = ret; 9428 goto out_end_trans; 9429 } else if (ret > 0) { 9430 /* if we fail to delete the orphan item this time 9431 * around, it'll get picked up the next time. 9432 * 9433 * The most common failure here is just -ENOENT. 9434 */ 9435 btrfs_del_orphan_item(trans, tree_root, 9436 root->root_key.objectid); 9437 } 9438 } 9439 9440 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9441 btrfs_add_dropped_root(trans, root); 9442 } else { 9443 free_extent_buffer(root->node); 9444 free_extent_buffer(root->commit_root); 9445 btrfs_put_fs_root(root); 9446 } 9447 root_dropped = true; 9448 out_end_trans: 9449 btrfs_end_transaction_throttle(trans); 9450 out_free: 9451 kfree(wc); 9452 btrfs_free_path(path); 9453 out: 9454 /* 9455 * So if we need to stop dropping the snapshot for whatever reason we 9456 * need to make sure to add it back to the dead root list so that we 9457 * keep trying to do the work later. This also cleans up roots if we 9458 * don't have it in the radix (like when we recover after a power fail 9459 * or unmount) so we don't leak memory. 9460 */ 9461 if (!for_reloc && !root_dropped) 9462 btrfs_add_dead_root(root); 9463 if (err && err != -EAGAIN) 9464 btrfs_handle_fs_error(fs_info, err, NULL); 9465 return err; 9466 } 9467 9468 /* 9469 * drop subtree rooted at tree block 'node'. 9470 * 9471 * NOTE: this function will unlock and release tree block 'node' 9472 * only used by relocation code 9473 */ 9474 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9475 struct btrfs_root *root, 9476 struct extent_buffer *node, 9477 struct extent_buffer *parent) 9478 { 9479 struct btrfs_fs_info *fs_info = root->fs_info; 9480 struct btrfs_path *path; 9481 struct walk_control *wc; 9482 int level; 9483 int parent_level; 9484 int ret = 0; 9485 int wret; 9486 9487 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9488 9489 path = btrfs_alloc_path(); 9490 if (!path) 9491 return -ENOMEM; 9492 9493 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9494 if (!wc) { 9495 btrfs_free_path(path); 9496 return -ENOMEM; 9497 } 9498 9499 btrfs_assert_tree_locked(parent); 9500 parent_level = btrfs_header_level(parent); 9501 extent_buffer_get(parent); 9502 path->nodes[parent_level] = parent; 9503 path->slots[parent_level] = btrfs_header_nritems(parent); 9504 9505 btrfs_assert_tree_locked(node); 9506 level = btrfs_header_level(node); 9507 path->nodes[level] = node; 9508 path->slots[level] = 0; 9509 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9510 9511 wc->refs[parent_level] = 1; 9512 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9513 wc->level = level; 9514 wc->shared_level = -1; 9515 wc->stage = DROP_REFERENCE; 9516 wc->update_ref = 0; 9517 wc->keep_locks = 1; 9518 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9519 9520 while (1) { 9521 wret = walk_down_tree(trans, root, path, wc); 9522 if (wret < 0) { 9523 ret = wret; 9524 break; 9525 } 9526 9527 wret = walk_up_tree(trans, root, path, wc, parent_level); 9528 if (wret < 0) 9529 ret = wret; 9530 if (wret != 0) 9531 break; 9532 } 9533 9534 kfree(wc); 9535 btrfs_free_path(path); 9536 return ret; 9537 } 9538 9539 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 9540 { 9541 u64 num_devices; 9542 u64 stripped; 9543 
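	/*
	 * (Used below by btrfs_inc_block_group_ro(): given the current block
	 * group flags, work out which profile new chunks should be allocated
	 * with, based on how many rw devices are currently present.)
	 */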
9544 /* 9545 * if restripe for this chunk_type is on pick target profile and 9546 * return, otherwise do the usual balance 9547 */ 9548 stripped = get_restripe_target(fs_info, flags); 9549 if (stripped) 9550 return extended_to_chunk(stripped); 9551 9552 num_devices = fs_info->fs_devices->rw_devices; 9553 9554 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9555 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9556 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9557 9558 if (num_devices == 1) { 9559 stripped |= BTRFS_BLOCK_GROUP_DUP; 9560 stripped = flags & ~stripped; 9561 9562 /* turn raid0 into single device chunks */ 9563 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9564 return stripped; 9565 9566 /* turn mirroring into duplication */ 9567 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9568 BTRFS_BLOCK_GROUP_RAID10)) 9569 return stripped | BTRFS_BLOCK_GROUP_DUP; 9570 } else { 9571 /* they already had raid on here, just return */ 9572 if (flags & stripped) 9573 return flags; 9574 9575 stripped |= BTRFS_BLOCK_GROUP_DUP; 9576 stripped = flags & ~stripped; 9577 9578 /* switch duplicated blocks with raid1 */ 9579 if (flags & BTRFS_BLOCK_GROUP_DUP) 9580 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9581 9582 /* this is drive concat, leave it alone */ 9583 } 9584 9585 return flags; 9586 } 9587 9588 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9589 { 9590 struct btrfs_space_info *sinfo = cache->space_info; 9591 u64 num_bytes; 9592 u64 sinfo_used; 9593 u64 min_allocable_bytes; 9594 int ret = -ENOSPC; 9595 9596 /* 9597 * We need some metadata space and system metadata space for 9598 * allocating chunks in some corner cases until we force to set 9599 * it to be readonly. 9600 */ 9601 if ((sinfo->flags & 9602 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9603 !force) 9604 min_allocable_bytes = SZ_1M; 9605 else 9606 min_allocable_bytes = 0; 9607 9608 spin_lock(&sinfo->lock); 9609 spin_lock(&cache->lock); 9610 9611 if (cache->ro) { 9612 cache->ro++; 9613 ret = 0; 9614 goto out; 9615 } 9616 9617 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9618 cache->bytes_super - btrfs_block_group_used(&cache->item); 9619 sinfo_used = btrfs_space_info_used(sinfo, true); 9620 9621 if (sinfo_used + num_bytes + min_allocable_bytes <= 9622 sinfo->total_bytes) { 9623 sinfo->bytes_readonly += num_bytes; 9624 cache->ro++; 9625 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9626 ret = 0; 9627 } 9628 out: 9629 spin_unlock(&cache->lock); 9630 spin_unlock(&sinfo->lock); 9631 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { 9632 btrfs_info(cache->fs_info, 9633 "unable to make block group %llu ro", 9634 cache->key.objectid); 9635 btrfs_info(cache->fs_info, 9636 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", 9637 sinfo_used, num_bytes, min_allocable_bytes); 9638 dump_space_info(cache->fs_info, cache->space_info, 0, 0); 9639 } 9640 return ret; 9641 } 9642 9643 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) 9644 9645 { 9646 struct btrfs_fs_info *fs_info = cache->fs_info; 9647 struct btrfs_trans_handle *trans; 9648 u64 alloc_flags; 9649 int ret; 9650 9651 again: 9652 trans = btrfs_join_transaction(fs_info->extent_root); 9653 if (IS_ERR(trans)) 9654 return PTR_ERR(trans); 9655 9656 /* 9657 * we're not allowed to set block groups readonly after the dirty 9658 * block groups cache has started writing. 
If it already started, 9659 * back off and let this transaction commit 9660 */ 9661 mutex_lock(&fs_info->ro_block_group_mutex); 9662 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9663 u64 transid = trans->transid; 9664 9665 mutex_unlock(&fs_info->ro_block_group_mutex); 9666 btrfs_end_transaction(trans); 9667 9668 ret = btrfs_wait_for_commit(fs_info, transid); 9669 if (ret) 9670 return ret; 9671 goto again; 9672 } 9673 9674 /* 9675 * if we are changing raid levels, try to allocate a corresponding 9676 * block group with the new raid level. 9677 */ 9678 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9679 if (alloc_flags != cache->flags) { 9680 ret = do_chunk_alloc(trans, alloc_flags, 9681 CHUNK_ALLOC_FORCE); 9682 /* 9683 * ENOSPC is allowed here, we may have enough space 9684 * already allocated at the new raid level to 9685 * carry on 9686 */ 9687 if (ret == -ENOSPC) 9688 ret = 0; 9689 if (ret < 0) 9690 goto out; 9691 } 9692 9693 ret = inc_block_group_ro(cache, 0); 9694 if (!ret) 9695 goto out; 9696 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 9697 ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9698 if (ret < 0) 9699 goto out; 9700 ret = inc_block_group_ro(cache, 0); 9701 out: 9702 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9703 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9704 mutex_lock(&fs_info->chunk_mutex); 9705 check_system_chunk(trans, alloc_flags); 9706 mutex_unlock(&fs_info->chunk_mutex); 9707 } 9708 mutex_unlock(&fs_info->ro_block_group_mutex); 9709 9710 btrfs_end_transaction(trans); 9711 return ret; 9712 } 9713 9714 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 9715 { 9716 u64 alloc_flags = get_alloc_profile(trans->fs_info, type); 9717 9718 return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9719 } 9720 9721 /* 9722 * helper to account the unused space of all the readonly block group in the 9723 * space_info. takes mirrors into account. 9724 */ 9725 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9726 { 9727 struct btrfs_block_group_cache *block_group; 9728 u64 free_bytes = 0; 9729 int factor; 9730 9731 /* It's df, we don't care if it's racy */ 9732 if (list_empty(&sinfo->ro_bgs)) 9733 return 0; 9734 9735 spin_lock(&sinfo->lock); 9736 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9737 spin_lock(&block_group->lock); 9738 9739 if (!block_group->ro) { 9740 spin_unlock(&block_group->lock); 9741 continue; 9742 } 9743 9744 factor = btrfs_bg_type_to_factor(block_group->flags); 9745 free_bytes += (block_group->key.offset - 9746 btrfs_block_group_used(&block_group->item)) * 9747 factor; 9748 9749 spin_unlock(&block_group->lock); 9750 } 9751 spin_unlock(&sinfo->lock); 9752 9753 return free_bytes; 9754 } 9755 9756 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) 9757 { 9758 struct btrfs_space_info *sinfo = cache->space_info; 9759 u64 num_bytes; 9760 9761 BUG_ON(!cache->ro); 9762 9763 spin_lock(&sinfo->lock); 9764 spin_lock(&cache->lock); 9765 if (!--cache->ro) { 9766 num_bytes = cache->key.offset - cache->reserved - 9767 cache->pinned - cache->bytes_super - 9768 btrfs_block_group_used(&cache->item); 9769 sinfo->bytes_readonly -= num_bytes; 9770 list_del_init(&cache->ro_list); 9771 } 9772 spin_unlock(&cache->lock); 9773 spin_unlock(&sinfo->lock); 9774 } 9775 9776 /* 9777 * Checks to see if it's even possible to relocate this block group. 
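 * (This is only a best-effort check: the space info counters and per-device
 * free space are sampled under their own locks and may change again as soon
 * as those locks are dropped, so a positive answer is not a guarantee.)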
9778 * 9779 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9780 * ok to go ahead and try. 9781 */ 9782 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) 9783 { 9784 struct btrfs_block_group_cache *block_group; 9785 struct btrfs_space_info *space_info; 9786 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 9787 struct btrfs_device *device; 9788 u64 min_free; 9789 u64 dev_min = 1; 9790 u64 dev_nr = 0; 9791 u64 target; 9792 int debug; 9793 int index; 9794 int full = 0; 9795 int ret = 0; 9796 9797 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); 9798 9799 block_group = btrfs_lookup_block_group(fs_info, bytenr); 9800 9801 /* odd, couldn't find the block group, leave it alone */ 9802 if (!block_group) { 9803 if (debug) 9804 btrfs_warn(fs_info, 9805 "can't find block group for bytenr %llu", 9806 bytenr); 9807 return -1; 9808 } 9809 9810 min_free = btrfs_block_group_used(&block_group->item); 9811 9812 /* no bytes used, we're good */ 9813 if (!min_free) 9814 goto out; 9815 9816 space_info = block_group->space_info; 9817 spin_lock(&space_info->lock); 9818 9819 full = space_info->full; 9820 9821 /* 9822 * if this is the last block group we have in this space, we can't 9823 * relocate it unless we're able to allocate a new chunk below. 9824 * 9825 * Otherwise, we need to make sure we have room in the space to handle 9826 * all of the extents from this block group. If we can, we're good 9827 */ 9828 if ((space_info->total_bytes != block_group->key.offset) && 9829 (btrfs_space_info_used(space_info, false) + min_free < 9830 space_info->total_bytes)) { 9831 spin_unlock(&space_info->lock); 9832 goto out; 9833 } 9834 spin_unlock(&space_info->lock); 9835 9836 /* 9837 * ok we don't have enough space, but maybe we have free space on our 9838 * devices to allocate new chunks for relocation, so loop through our 9839 * alloc devices and guess if we have enough space. if this block 9840 * group is going to be restriped, run checks against the target 9841 * profile instead of the current one. 9842 */ 9843 ret = -1; 9844 9845 /* 9846 * index: 9847 * 0: raid10 9848 * 1: raid1 9849 * 2: dup 9850 * 3: raid0 9851 * 4: single 9852 */ 9853 target = get_restripe_target(fs_info, block_group->flags); 9854 if (target) { 9855 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); 9856 } else { 9857 /* 9858 * this is just a balance, so if we were marked as full 9859 * we know there is no space for a new chunk 9860 */ 9861 if (full) { 9862 if (debug) 9863 btrfs_warn(fs_info, 9864 "no space to alloc new chunk for block group %llu", 9865 block_group->key.objectid); 9866 goto out; 9867 } 9868 9869 index = btrfs_bg_flags_to_raid_index(block_group->flags); 9870 } 9871 9872 if (index == BTRFS_RAID_RAID10) { 9873 dev_min = 4; 9874 /* Divide by 2 */ 9875 min_free >>= 1; 9876 } else if (index == BTRFS_RAID_RAID1) { 9877 dev_min = 2; 9878 } else if (index == BTRFS_RAID_DUP) { 9879 /* Multiply by 2 */ 9880 min_free <<= 1; 9881 } else if (index == BTRFS_RAID_RAID0) { 9882 dev_min = fs_devices->rw_devices; 9883 min_free = div64_u64(min_free, dev_min); 9884 } 9885 9886 mutex_lock(&fs_info->chunk_mutex); 9887 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 9888 u64 dev_offset; 9889 9890 /* 9891 * check to make sure we can actually find a chunk with enough 9892 * space to fit our block group in. 
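		 *
		 * (With the adjustments made above this means, for example,
		 * that a RAID1 group needs a min_free sized hole on at least
		 * two devices, while DUP needs a single device with room for
		 * two copies, which is why min_free was doubled for it.)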
9893 */ 9894 if (device->total_bytes > device->bytes_used + min_free && 9895 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 9896 ret = find_free_dev_extent(device, min_free, 9897 &dev_offset, NULL); 9898 if (!ret) 9899 dev_nr++; 9900 9901 if (dev_nr >= dev_min) 9902 break; 9903 9904 ret = -1; 9905 } 9906 } 9907 if (debug && ret == -1) 9908 btrfs_warn(fs_info, 9909 "no space to allocate a new chunk for block group %llu", 9910 block_group->key.objectid); 9911 mutex_unlock(&fs_info->chunk_mutex); 9912 out: 9913 btrfs_put_block_group(block_group); 9914 return ret; 9915 } 9916 9917 static int find_first_block_group(struct btrfs_fs_info *fs_info, 9918 struct btrfs_path *path, 9919 struct btrfs_key *key) 9920 { 9921 struct btrfs_root *root = fs_info->extent_root; 9922 int ret = 0; 9923 struct btrfs_key found_key; 9924 struct extent_buffer *leaf; 9925 struct btrfs_block_group_item bg; 9926 u64 flags; 9927 int slot; 9928 9929 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 9930 if (ret < 0) 9931 goto out; 9932 9933 while (1) { 9934 slot = path->slots[0]; 9935 leaf = path->nodes[0]; 9936 if (slot >= btrfs_header_nritems(leaf)) { 9937 ret = btrfs_next_leaf(root, path); 9938 if (ret == 0) 9939 continue; 9940 if (ret < 0) 9941 goto out; 9942 break; 9943 } 9944 btrfs_item_key_to_cpu(leaf, &found_key, slot); 9945 9946 if (found_key.objectid >= key->objectid && 9947 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 9948 struct extent_map_tree *em_tree; 9949 struct extent_map *em; 9950 9951 em_tree = &root->fs_info->mapping_tree.map_tree; 9952 read_lock(&em_tree->lock); 9953 em = lookup_extent_mapping(em_tree, found_key.objectid, 9954 found_key.offset); 9955 read_unlock(&em_tree->lock); 9956 if (!em) { 9957 btrfs_err(fs_info, 9958 "logical %llu len %llu found bg but no related chunk", 9959 found_key.objectid, found_key.offset); 9960 ret = -ENOENT; 9961 } else if (em->start != found_key.objectid || 9962 em->len != found_key.offset) { 9963 btrfs_err(fs_info, 9964 "block group %llu len %llu mismatch with chunk %llu len %llu", 9965 found_key.objectid, found_key.offset, 9966 em->start, em->len); 9967 ret = -EUCLEAN; 9968 } else { 9969 read_extent_buffer(leaf, &bg, 9970 btrfs_item_ptr_offset(leaf, slot), 9971 sizeof(bg)); 9972 flags = btrfs_block_group_flags(&bg) & 9973 BTRFS_BLOCK_GROUP_TYPE_MASK; 9974 9975 if (flags != (em->map_lookup->type & 9976 BTRFS_BLOCK_GROUP_TYPE_MASK)) { 9977 btrfs_err(fs_info, 9978 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", 9979 found_key.objectid, 9980 found_key.offset, flags, 9981 (BTRFS_BLOCK_GROUP_TYPE_MASK & 9982 em->map_lookup->type)); 9983 ret = -EUCLEAN; 9984 } else { 9985 ret = 0; 9986 } 9987 } 9988 free_extent_map(em); 9989 goto out; 9990 } 9991 path->slots[0]++; 9992 } 9993 out: 9994 return ret; 9995 } 9996 9997 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 9998 { 9999 struct btrfs_block_group_cache *block_group; 10000 u64 last = 0; 10001 10002 while (1) { 10003 struct inode *inode; 10004 10005 block_group = btrfs_lookup_first_block_group(info, last); 10006 while (block_group) { 10007 wait_block_group_cache_done(block_group); 10008 spin_lock(&block_group->lock); 10009 if (block_group->iref) 10010 break; 10011 spin_unlock(&block_group->lock); 10012 block_group = next_block_group(block_group); 10013 } 10014 if (!block_group) { 10015 if (last == 0) 10016 break; 10017 last = 0; 10018 continue; 10019 } 10020 10021 inode = block_group->inode; 10022 block_group->iref = 0; 10023 block_group->inode = NULL; 
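		/*
		 * (The iput() is done below, after dropping block_group->lock,
		 * since a final iput() can block and must not run under a
		 * spinlock.)
		 */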
10024 spin_unlock(&block_group->lock); 10025 ASSERT(block_group->io_ctl.inode == NULL); 10026 iput(inode); 10027 last = block_group->key.objectid + block_group->key.offset; 10028 btrfs_put_block_group(block_group); 10029 } 10030 } 10031 10032 /* 10033 * Must be called only after stopping all workers, since we could have block 10034 * group caching kthreads running, and therefore they could race with us if we 10035 * freed the block groups before stopping them. 10036 */ 10037 int btrfs_free_block_groups(struct btrfs_fs_info *info) 10038 { 10039 struct btrfs_block_group_cache *block_group; 10040 struct btrfs_space_info *space_info; 10041 struct btrfs_caching_control *caching_ctl; 10042 struct rb_node *n; 10043 10044 down_write(&info->commit_root_sem); 10045 while (!list_empty(&info->caching_block_groups)) { 10046 caching_ctl = list_entry(info->caching_block_groups.next, 10047 struct btrfs_caching_control, list); 10048 list_del(&caching_ctl->list); 10049 put_caching_control(caching_ctl); 10050 } 10051 up_write(&info->commit_root_sem); 10052 10053 spin_lock(&info->unused_bgs_lock); 10054 while (!list_empty(&info->unused_bgs)) { 10055 block_group = list_first_entry(&info->unused_bgs, 10056 struct btrfs_block_group_cache, 10057 bg_list); 10058 list_del_init(&block_group->bg_list); 10059 btrfs_put_block_group(block_group); 10060 } 10061 spin_unlock(&info->unused_bgs_lock); 10062 10063 spin_lock(&info->block_group_cache_lock); 10064 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 10065 block_group = rb_entry(n, struct btrfs_block_group_cache, 10066 cache_node); 10067 rb_erase(&block_group->cache_node, 10068 &info->block_group_cache_tree); 10069 RB_CLEAR_NODE(&block_group->cache_node); 10070 spin_unlock(&info->block_group_cache_lock); 10071 10072 down_write(&block_group->space_info->groups_sem); 10073 list_del(&block_group->list); 10074 up_write(&block_group->space_info->groups_sem); 10075 10076 /* 10077 * We haven't cached this block group, which means we could 10078 * possibly have excluded extents on this block group. 10079 */ 10080 if (block_group->cached == BTRFS_CACHE_NO || 10081 block_group->cached == BTRFS_CACHE_ERROR) 10082 free_excluded_extents(block_group); 10083 10084 btrfs_remove_free_space_cache(block_group); 10085 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 10086 ASSERT(list_empty(&block_group->dirty_list)); 10087 ASSERT(list_empty(&block_group->io_list)); 10088 ASSERT(list_empty(&block_group->bg_list)); 10089 ASSERT(atomic_read(&block_group->count) == 1); 10090 btrfs_put_block_group(block_group); 10091 10092 spin_lock(&info->block_group_cache_lock); 10093 } 10094 spin_unlock(&info->block_group_cache_lock); 10095 10096 /* now that all the block groups are freed, go through and 10097 * free all the space_info structs. This is only called during 10098 * the final stages of unmount, and so we know nobody is 10099 * using them. We call synchronize_rcu() once before we start, 10100 * just to be on the safe side. 10101 */ 10102 synchronize_rcu(); 10103 10104 release_global_block_rsv(info); 10105 10106 while (!list_empty(&info->space_info)) { 10107 int i; 10108 10109 space_info = list_entry(info->space_info.next, 10110 struct btrfs_space_info, 10111 list); 10112 10113 /* 10114 * Do not hide this behind enospc_debug, this is actually 10115 * important and indicates a real bug if this happens. 
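		 *
		 * (By this point every block group has been freed and all
		 * workers have been stopped, so any bytes still accounted as
		 * pinned, reserved or may_use almost certainly mean a space
		 * reservation was leaked somewhere.)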
10116 */ 10117 if (WARN_ON(space_info->bytes_pinned > 0 || 10118 space_info->bytes_reserved > 0 || 10119 space_info->bytes_may_use > 0)) 10120 dump_space_info(info, space_info, 0, 0); 10121 list_del(&space_info->list); 10122 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 10123 struct kobject *kobj; 10124 kobj = space_info->block_group_kobjs[i]; 10125 space_info->block_group_kobjs[i] = NULL; 10126 if (kobj) { 10127 kobject_del(kobj); 10128 kobject_put(kobj); 10129 } 10130 } 10131 kobject_del(&space_info->kobj); 10132 kobject_put(&space_info->kobj); 10133 } 10134 return 0; 10135 } 10136 10137 /* link_block_group will queue up kobjects to add when we're reclaim-safe */ 10138 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) 10139 { 10140 struct btrfs_space_info *space_info; 10141 struct raid_kobject *rkobj; 10142 LIST_HEAD(list); 10143 int index; 10144 int ret = 0; 10145 10146 spin_lock(&fs_info->pending_raid_kobjs_lock); 10147 list_splice_init(&fs_info->pending_raid_kobjs, &list); 10148 spin_unlock(&fs_info->pending_raid_kobjs_lock); 10149 10150 list_for_each_entry(rkobj, &list, list) { 10151 space_info = __find_space_info(fs_info, rkobj->flags); 10152 index = btrfs_bg_flags_to_raid_index(rkobj->flags); 10153 10154 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 10155 "%s", get_raid_name(index)); 10156 if (ret) { 10157 kobject_put(&rkobj->kobj); 10158 break; 10159 } 10160 } 10161 if (ret) 10162 btrfs_warn(fs_info, 10163 "failed to add kobject for block cache, ignoring"); 10164 } 10165 10166 static void link_block_group(struct btrfs_block_group_cache *cache) 10167 { 10168 struct btrfs_space_info *space_info = cache->space_info; 10169 struct btrfs_fs_info *fs_info = cache->fs_info; 10170 int index = btrfs_bg_flags_to_raid_index(cache->flags); 10171 bool first = false; 10172 10173 down_write(&space_info->groups_sem); 10174 if (list_empty(&space_info->block_groups[index])) 10175 first = true; 10176 list_add_tail(&cache->list, &space_info->block_groups[index]); 10177 up_write(&space_info->groups_sem); 10178 10179 if (first) { 10180 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 10181 if (!rkobj) { 10182 btrfs_warn(cache->fs_info, 10183 "couldn't alloc memory for raid level kobject"); 10184 return; 10185 } 10186 rkobj->flags = cache->flags; 10187 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 10188 10189 spin_lock(&fs_info->pending_raid_kobjs_lock); 10190 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); 10191 spin_unlock(&fs_info->pending_raid_kobjs_lock); 10192 space_info->block_group_kobjs[index] = &rkobj->kobj; 10193 } 10194 } 10195 10196 static struct btrfs_block_group_cache * 10197 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, 10198 u64 start, u64 size) 10199 { 10200 struct btrfs_block_group_cache *cache; 10201 10202 cache = kzalloc(sizeof(*cache), GFP_NOFS); 10203 if (!cache) 10204 return NULL; 10205 10206 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 10207 GFP_NOFS); 10208 if (!cache->free_space_ctl) { 10209 kfree(cache); 10210 return NULL; 10211 } 10212 10213 cache->key.objectid = start; 10214 cache->key.offset = size; 10215 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10216 10217 cache->fs_info = fs_info; 10218 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 10219 set_free_space_tree_thresholds(cache); 10220 10221 atomic_set(&cache->count, 1); 10222 spin_lock_init(&cache->lock); 10223 init_rwsem(&cache->data_rwsem); 10224 INIT_LIST_HEAD(&cache->list); 10225 INIT_LIST_HEAD(&cache->cluster_list); 10226 
INIT_LIST_HEAD(&cache->bg_list); 10227 INIT_LIST_HEAD(&cache->ro_list); 10228 INIT_LIST_HEAD(&cache->dirty_list); 10229 INIT_LIST_HEAD(&cache->io_list); 10230 btrfs_init_free_space_ctl(cache); 10231 atomic_set(&cache->trimming, 0); 10232 mutex_init(&cache->free_space_lock); 10233 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); 10234 10235 return cache; 10236 } 10237 10238 10239 /* 10240 * Iterate all chunks and verify that each of them has the corresponding block 10241 * group 10242 */ 10243 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 10244 { 10245 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 10246 struct extent_map *em; 10247 struct btrfs_block_group_cache *bg; 10248 u64 start = 0; 10249 int ret = 0; 10250 10251 while (1) { 10252 read_lock(&map_tree->map_tree.lock); 10253 /* 10254 * lookup_extent_mapping will return the first extent map 10255 * intersecting the range, so setting @len to 1 is enough to 10256 * get the first chunk. 10257 */ 10258 em = lookup_extent_mapping(&map_tree->map_tree, start, 1); 10259 read_unlock(&map_tree->map_tree.lock); 10260 if (!em) 10261 break; 10262 10263 bg = btrfs_lookup_block_group(fs_info, em->start); 10264 if (!bg) { 10265 btrfs_err(fs_info, 10266 "chunk start=%llu len=%llu doesn't have corresponding block group", 10267 em->start, em->len); 10268 ret = -EUCLEAN; 10269 free_extent_map(em); 10270 break; 10271 } 10272 if (bg->key.objectid != em->start || 10273 bg->key.offset != em->len || 10274 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 10275 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 10276 btrfs_err(fs_info, 10277 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 10278 em->start, em->len, 10279 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 10280 bg->key.objectid, bg->key.offset, 10281 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 10282 ret = -EUCLEAN; 10283 free_extent_map(em); 10284 btrfs_put_block_group(bg); 10285 break; 10286 } 10287 start = em->start + em->len; 10288 free_extent_map(em); 10289 btrfs_put_block_group(bg); 10290 } 10291 return ret; 10292 } 10293 10294 int btrfs_read_block_groups(struct btrfs_fs_info *info) 10295 { 10296 struct btrfs_path *path; 10297 int ret; 10298 struct btrfs_block_group_cache *cache; 10299 struct btrfs_space_info *space_info; 10300 struct btrfs_key key; 10301 struct btrfs_key found_key; 10302 struct extent_buffer *leaf; 10303 int need_clear = 0; 10304 u64 cache_gen; 10305 u64 feature; 10306 int mixed; 10307 10308 feature = btrfs_super_incompat_flags(info->super_copy); 10309 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); 10310 10311 key.objectid = 0; 10312 key.offset = 0; 10313 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10314 path = btrfs_alloc_path(); 10315 if (!path) 10316 return -ENOMEM; 10317 path->reada = READA_FORWARD; 10318 10319 cache_gen = btrfs_super_cache_generation(info->super_copy); 10320 if (btrfs_test_opt(info, SPACE_CACHE) && 10321 btrfs_super_generation(info->super_copy) != cache_gen) 10322 need_clear = 1; 10323 if (btrfs_test_opt(info, CLEAR_CACHE)) 10324 need_clear = 1; 10325 10326 while (1) { 10327 ret = find_first_block_group(info, path, &key); 10328 if (ret > 0) 10329 break; 10330 if (ret != 0) 10331 goto error; 10332 10333 leaf = path->nodes[0]; 10334 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 10335 10336 cache = btrfs_create_block_group_cache(info, found_key.objectid, 10337 found_key.offset); 10338 if (!cache) { 10339 ret = -ENOMEM; 10340 
goto error; 10341 } 10342 10343 if (need_clear) { 10344 /* 10345 * When we mount with old space cache, we need to 10346 * set BTRFS_DC_CLEAR and set dirty flag. 10347 * 10348 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 10349 * truncate the old free space cache inode and 10350 * setup a new one. 10351 * b) Setting 'dirty flag' makes sure that we flush 10352 * the new space cache info onto disk. 10353 */ 10354 if (btrfs_test_opt(info, SPACE_CACHE)) 10355 cache->disk_cache_state = BTRFS_DC_CLEAR; 10356 } 10357 10358 read_extent_buffer(leaf, &cache->item, 10359 btrfs_item_ptr_offset(leaf, path->slots[0]), 10360 sizeof(cache->item)); 10361 cache->flags = btrfs_block_group_flags(&cache->item); 10362 if (!mixed && 10363 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 10364 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 10365 btrfs_err(info, 10366 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 10367 cache->key.objectid); 10368 ret = -EINVAL; 10369 goto error; 10370 } 10371 10372 key.objectid = found_key.objectid + found_key.offset; 10373 btrfs_release_path(path); 10374 10375 /* 10376 * We need to exclude the super stripes now so that the space 10377 * info has super bytes accounted for, otherwise we'll think 10378 * we have more space than we actually do. 10379 */ 10380 ret = exclude_super_stripes(cache); 10381 if (ret) { 10382 /* 10383 * We may have excluded something, so call this just in 10384 * case. 10385 */ 10386 free_excluded_extents(cache); 10387 btrfs_put_block_group(cache); 10388 goto error; 10389 } 10390 10391 /* 10392 * check for two cases, either we are full, and therefore 10393 * don't need to bother with the caching work since we won't 10394 * find any space, or we are empty, and we can just add all 10395 * the space in and be done with it. This saves us _a_lot_ of 10396 * time, particularly in the full case. 
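		 *
		 * (found_key.offset is the block group's length, so "full"
		 * below means used == length and "empty" means used == 0;
		 * both can skip the slow extent tree walk that caching the
		 * block group would otherwise require.)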
10397 */ 10398 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 10399 cache->last_byte_to_unpin = (u64)-1; 10400 cache->cached = BTRFS_CACHE_FINISHED; 10401 free_excluded_extents(cache); 10402 } else if (btrfs_block_group_used(&cache->item) == 0) { 10403 cache->last_byte_to_unpin = (u64)-1; 10404 cache->cached = BTRFS_CACHE_FINISHED; 10405 add_new_free_space(cache, found_key.objectid, 10406 found_key.objectid + 10407 found_key.offset); 10408 free_excluded_extents(cache); 10409 } 10410 10411 ret = btrfs_add_block_group_cache(info, cache); 10412 if (ret) { 10413 btrfs_remove_free_space_cache(cache); 10414 btrfs_put_block_group(cache); 10415 goto error; 10416 } 10417 10418 trace_btrfs_add_block_group(info, cache, 0); 10419 update_space_info(info, cache->flags, found_key.offset, 10420 btrfs_block_group_used(&cache->item), 10421 cache->bytes_super, &space_info); 10422 10423 cache->space_info = space_info; 10424 10425 link_block_group(cache); 10426 10427 set_avail_alloc_bits(info, cache->flags); 10428 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 10429 inc_block_group_ro(cache, 1); 10430 } else if (btrfs_block_group_used(&cache->item) == 0) { 10431 ASSERT(list_empty(&cache->bg_list)); 10432 btrfs_mark_bg_unused(cache); 10433 } 10434 } 10435 10436 list_for_each_entry_rcu(space_info, &info->space_info, list) { 10437 if (!(get_alloc_profile(info, space_info->flags) & 10438 (BTRFS_BLOCK_GROUP_RAID10 | 10439 BTRFS_BLOCK_GROUP_RAID1 | 10440 BTRFS_BLOCK_GROUP_RAID5 | 10441 BTRFS_BLOCK_GROUP_RAID6 | 10442 BTRFS_BLOCK_GROUP_DUP))) 10443 continue; 10444 /* 10445 * avoid allocating from un-mirrored block group if there are 10446 * mirrored block groups. 10447 */ 10448 list_for_each_entry(cache, 10449 &space_info->block_groups[BTRFS_RAID_RAID0], 10450 list) 10451 inc_block_group_ro(cache, 1); 10452 list_for_each_entry(cache, 10453 &space_info->block_groups[BTRFS_RAID_SINGLE], 10454 list) 10455 inc_block_group_ro(cache, 1); 10456 } 10457 10458 btrfs_add_raid_kobjects(info); 10459 init_global_block_rsv(info); 10460 ret = check_chunk_block_group_mappings(info); 10461 error: 10462 btrfs_free_path(path); 10463 return ret; 10464 } 10465 10466 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 10467 { 10468 struct btrfs_fs_info *fs_info = trans->fs_info; 10469 struct btrfs_block_group_cache *block_group; 10470 struct btrfs_root *extent_root = fs_info->extent_root; 10471 struct btrfs_block_group_item item; 10472 struct btrfs_key key; 10473 int ret = 0; 10474 10475 if (!trans->can_flush_pending_bgs) 10476 return; 10477 10478 while (!list_empty(&trans->new_bgs)) { 10479 block_group = list_first_entry(&trans->new_bgs, 10480 struct btrfs_block_group_cache, 10481 bg_list); 10482 if (ret) 10483 goto next; 10484 10485 spin_lock(&block_group->lock); 10486 memcpy(&item, &block_group->item, sizeof(item)); 10487 memcpy(&key, &block_group->key, sizeof(key)); 10488 spin_unlock(&block_group->lock); 10489 10490 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10491 sizeof(item)); 10492 if (ret) 10493 btrfs_abort_transaction(trans, ret); 10494 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); 10495 if (ret) 10496 btrfs_abort_transaction(trans, ret); 10497 add_block_group_free_space(trans, block_group); 10498 /* already aborted the transaction if it failed. 
*/ 10499 next: 10500 btrfs_delayed_refs_rsv_release(fs_info, 1); 10501 list_del_init(&block_group->bg_list); 10502 } 10503 btrfs_trans_release_chunk_metadata(trans); 10504 } 10505 10506 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, 10507 u64 type, u64 chunk_offset, u64 size) 10508 { 10509 struct btrfs_fs_info *fs_info = trans->fs_info; 10510 struct btrfs_block_group_cache *cache; 10511 int ret; 10512 10513 btrfs_set_log_full_commit(trans); 10514 10515 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 10516 if (!cache) 10517 return -ENOMEM; 10518 10519 btrfs_set_block_group_used(&cache->item, bytes_used); 10520 btrfs_set_block_group_chunk_objectid(&cache->item, 10521 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 10522 btrfs_set_block_group_flags(&cache->item, type); 10523 10524 cache->flags = type; 10525 cache->last_byte_to_unpin = (u64)-1; 10526 cache->cached = BTRFS_CACHE_FINISHED; 10527 cache->needs_free_space = 1; 10528 ret = exclude_super_stripes(cache); 10529 if (ret) { 10530 /* 10531 * We may have excluded something, so call this just in 10532 * case. 10533 */ 10534 free_excluded_extents(cache); 10535 btrfs_put_block_group(cache); 10536 return ret; 10537 } 10538 10539 add_new_free_space(cache, chunk_offset, chunk_offset + size); 10540 10541 free_excluded_extents(cache); 10542 10543 #ifdef CONFIG_BTRFS_DEBUG 10544 if (btrfs_should_fragment_free_space(cache)) { 10545 u64 new_bytes_used = size - bytes_used; 10546 10547 bytes_used += new_bytes_used >> 1; 10548 fragment_free_space(cache); 10549 } 10550 #endif 10551 /* 10552 * Ensure the corresponding space_info object is created and 10553 * assigned to our block group. We want our bg to be added to the rbtree 10554 * with its ->space_info set. 10555 */ 10556 cache->space_info = __find_space_info(fs_info, cache->flags); 10557 ASSERT(cache->space_info); 10558 10559 ret = btrfs_add_block_group_cache(fs_info, cache); 10560 if (ret) { 10561 btrfs_remove_free_space_cache(cache); 10562 btrfs_put_block_group(cache); 10563 return ret; 10564 } 10565 10566 /* 10567 * Now that our block group has its ->space_info set and is inserted in 10568 * the rbtree, update the space info's counters. 
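	 *
	 * (The block group item itself is not inserted here; the group is
	 * queued on trans->new_bgs below and the item is written out later
	 * by btrfs_create_pending_block_groups().)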
10569 */ 10570 trace_btrfs_add_block_group(fs_info, cache, 1); 10571 update_space_info(fs_info, cache->flags, size, bytes_used, 10572 cache->bytes_super, &cache->space_info); 10573 update_global_block_rsv(fs_info); 10574 10575 link_block_group(cache); 10576 10577 list_add_tail(&cache->bg_list, &trans->new_bgs); 10578 trans->delayed_ref_updates++; 10579 btrfs_update_delayed_refs_rsv(trans); 10580 10581 set_avail_alloc_bits(fs_info, type); 10582 return 0; 10583 } 10584 10585 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 10586 { 10587 u64 extra_flags = chunk_to_extended(flags) & 10588 BTRFS_EXTENDED_PROFILE_MASK; 10589 10590 write_seqlock(&fs_info->profiles_lock); 10591 if (flags & BTRFS_BLOCK_GROUP_DATA) 10592 fs_info->avail_data_alloc_bits &= ~extra_flags; 10593 if (flags & BTRFS_BLOCK_GROUP_METADATA) 10594 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 10595 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 10596 fs_info->avail_system_alloc_bits &= ~extra_flags; 10597 write_sequnlock(&fs_info->profiles_lock); 10598 } 10599 10600 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 10601 u64 group_start, struct extent_map *em) 10602 { 10603 struct btrfs_fs_info *fs_info = trans->fs_info; 10604 struct btrfs_root *root = fs_info->extent_root; 10605 struct btrfs_path *path; 10606 struct btrfs_block_group_cache *block_group; 10607 struct btrfs_free_cluster *cluster; 10608 struct btrfs_root *tree_root = fs_info->tree_root; 10609 struct btrfs_key key; 10610 struct inode *inode; 10611 struct kobject *kobj = NULL; 10612 int ret; 10613 int index; 10614 int factor; 10615 struct btrfs_caching_control *caching_ctl = NULL; 10616 bool remove_em; 10617 bool remove_rsv = false; 10618 10619 block_group = btrfs_lookup_block_group(fs_info, group_start); 10620 BUG_ON(!block_group); 10621 BUG_ON(!block_group->ro); 10622 10623 trace_btrfs_remove_block_group(block_group); 10624 /* 10625 * Free the reserved super bytes from this block group before 10626 * remove it. 
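	 *
	 * (These are the super stripe ranges excluded by
	 * exclude_super_stripes() when the block group was created or read
	 * in, accounted in bytes_super.)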
10627 */ 10628 free_excluded_extents(block_group); 10629 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, 10630 block_group->key.offset); 10631 10632 memcpy(&key, &block_group->key, sizeof(key)); 10633 index = btrfs_bg_flags_to_raid_index(block_group->flags); 10634 factor = btrfs_bg_type_to_factor(block_group->flags); 10635 10636 /* make sure this block group isn't part of an allocation cluster */ 10637 cluster = &fs_info->data_alloc_cluster; 10638 spin_lock(&cluster->refill_lock); 10639 btrfs_return_cluster_to_free_space(block_group, cluster); 10640 spin_unlock(&cluster->refill_lock); 10641 10642 /* 10643 * make sure this block group isn't part of a metadata 10644 * allocation cluster 10645 */ 10646 cluster = &fs_info->meta_alloc_cluster; 10647 spin_lock(&cluster->refill_lock); 10648 btrfs_return_cluster_to_free_space(block_group, cluster); 10649 spin_unlock(&cluster->refill_lock); 10650 10651 path = btrfs_alloc_path(); 10652 if (!path) { 10653 ret = -ENOMEM; 10654 goto out; 10655 } 10656 10657 /* 10658 * get the inode first so any iput calls done for the io_list 10659 * aren't the final iput (no unlinks allowed now) 10660 */ 10661 inode = lookup_free_space_inode(block_group, path); 10662 10663 mutex_lock(&trans->transaction->cache_write_mutex); 10664 /* 10665 * Make sure our free space cache IO is done before removing the 10666 * free space inode 10667 */ 10668 spin_lock(&trans->transaction->dirty_bgs_lock); 10669 if (!list_empty(&block_group->io_list)) { 10670 list_del_init(&block_group->io_list); 10671 10672 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 10673 10674 spin_unlock(&trans->transaction->dirty_bgs_lock); 10675 btrfs_wait_cache_io(trans, block_group, path); 10676 btrfs_put_block_group(block_group); 10677 spin_lock(&trans->transaction->dirty_bgs_lock); 10678 } 10679 10680 if (!list_empty(&block_group->dirty_list)) { 10681 list_del_init(&block_group->dirty_list); 10682 remove_rsv = true; 10683 btrfs_put_block_group(block_group); 10684 } 10685 spin_unlock(&trans->transaction->dirty_bgs_lock); 10686 mutex_unlock(&trans->transaction->cache_write_mutex); 10687 10688 if (!IS_ERR(inode)) { 10689 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10690 if (ret) { 10691 btrfs_add_delayed_iput(inode); 10692 goto out; 10693 } 10694 clear_nlink(inode); 10695 /* One for the block groups ref */ 10696 spin_lock(&block_group->lock); 10697 if (block_group->iref) { 10698 block_group->iref = 0; 10699 block_group->inode = NULL; 10700 spin_unlock(&block_group->lock); 10701 iput(inode); 10702 } else { 10703 spin_unlock(&block_group->lock); 10704 } 10705 /* One for our lookup ref */ 10706 btrfs_add_delayed_iput(inode); 10707 } 10708 10709 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 10710 key.offset = block_group->key.objectid; 10711 key.type = 0; 10712 10713 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 10714 if (ret < 0) 10715 goto out; 10716 if (ret > 0) 10717 btrfs_release_path(path); 10718 if (ret == 0) { 10719 ret = btrfs_del_item(trans, tree_root, path); 10720 if (ret) 10721 goto out; 10722 btrfs_release_path(path); 10723 } 10724 10725 spin_lock(&fs_info->block_group_cache_lock); 10726 rb_erase(&block_group->cache_node, 10727 &fs_info->block_group_cache_tree); 10728 RB_CLEAR_NODE(&block_group->cache_node); 10729 10730 if (fs_info->first_logical_byte == block_group->key.objectid) 10731 fs_info->first_logical_byte = (u64)-1; 10732 spin_unlock(&fs_info->block_group_cache_lock); 10733 10734 down_write(&block_group->space_info->groups_sem); 10735 /* 10736 * we 
must use list_del_init so people can check to see if they 10737 * are still on the list after taking the semaphore 10738 */ 10739 list_del_init(&block_group->list); 10740 if (list_empty(&block_group->space_info->block_groups[index])) { 10741 kobj = block_group->space_info->block_group_kobjs[index]; 10742 block_group->space_info->block_group_kobjs[index] = NULL; 10743 clear_avail_alloc_bits(fs_info, block_group->flags); 10744 } 10745 up_write(&block_group->space_info->groups_sem); 10746 if (kobj) { 10747 kobject_del(kobj); 10748 kobject_put(kobj); 10749 } 10750 10751 if (block_group->has_caching_ctl) 10752 caching_ctl = get_caching_control(block_group); 10753 if (block_group->cached == BTRFS_CACHE_STARTED) 10754 wait_block_group_cache_done(block_group); 10755 if (block_group->has_caching_ctl) { 10756 down_write(&fs_info->commit_root_sem); 10757 if (!caching_ctl) { 10758 struct btrfs_caching_control *ctl; 10759 10760 list_for_each_entry(ctl, 10761 &fs_info->caching_block_groups, list) 10762 if (ctl->block_group == block_group) { 10763 caching_ctl = ctl; 10764 refcount_inc(&caching_ctl->count); 10765 break; 10766 } 10767 } 10768 if (caching_ctl) 10769 list_del_init(&caching_ctl->list); 10770 up_write(&fs_info->commit_root_sem); 10771 if (caching_ctl) { 10772 /* Once for the caching bgs list and once for us. */ 10773 put_caching_control(caching_ctl); 10774 put_caching_control(caching_ctl); 10775 } 10776 } 10777 10778 spin_lock(&trans->transaction->dirty_bgs_lock); 10779 WARN_ON(!list_empty(&block_group->dirty_list)); 10780 WARN_ON(!list_empty(&block_group->io_list)); 10781 spin_unlock(&trans->transaction->dirty_bgs_lock); 10782 10783 btrfs_remove_free_space_cache(block_group); 10784 10785 spin_lock(&block_group->space_info->lock); 10786 list_del_init(&block_group->ro_list); 10787 10788 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 10789 WARN_ON(block_group->space_info->total_bytes 10790 < block_group->key.offset); 10791 WARN_ON(block_group->space_info->bytes_readonly 10792 < block_group->key.offset); 10793 WARN_ON(block_group->space_info->disk_total 10794 < block_group->key.offset * factor); 10795 } 10796 block_group->space_info->total_bytes -= block_group->key.offset; 10797 block_group->space_info->bytes_readonly -= block_group->key.offset; 10798 block_group->space_info->disk_total -= block_group->key.offset * factor; 10799 10800 spin_unlock(&block_group->space_info->lock); 10801 10802 memcpy(&key, &block_group->key, sizeof(key)); 10803 10804 mutex_lock(&fs_info->chunk_mutex); 10805 spin_lock(&block_group->lock); 10806 block_group->removed = 1; 10807 /* 10808 * At this point trimming can't start on this block group, because we 10809 * removed the block group from the tree fs_info->block_group_cache_tree 10810 * so no one can't find it anymore and even if someone already got this 10811 * block group before we removed it from the rbtree, they have already 10812 * incremented block_group->trimming - if they didn't, they won't find 10813 * any free space entries because we already removed them all when we 10814 * called btrfs_remove_free_space_cache(). 10815 * 10816 * And we must not remove the extent map from the fs_info->mapping_tree 10817 * to prevent the same logical address range and physical device space 10818 * ranges from being reused for a new block group. 
This is because our 10819 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10820 * completely transactionless, so while it is trimming a range the 10821 * currently running transaction might finish and a new one start, 10822 * allowing for new block groups to be created that can reuse the same 10823 * physical device locations unless we take this special care. 10824 * 10825 * There may also be an implicit trim operation if the file system 10826 * is mounted with -odiscard. The same protections must remain 10827 * in place until the extents have been discarded completely when 10828 * the transaction commit has completed. 10829 */ 10830 remove_em = (atomic_read(&block_group->trimming) == 0); 10831 spin_unlock(&block_group->lock); 10832 10833 if (remove_em) { 10834 struct extent_map_tree *em_tree; 10835 10836 em_tree = &fs_info->mapping_tree.map_tree; 10837 write_lock(&em_tree->lock); 10838 remove_extent_mapping(em_tree, em); 10839 write_unlock(&em_tree->lock); 10840 /* once for the tree */ 10841 free_extent_map(em); 10842 } 10843 10844 mutex_unlock(&fs_info->chunk_mutex); 10845 10846 ret = remove_block_group_free_space(trans, block_group); 10847 if (ret) 10848 goto out; 10849 10850 btrfs_put_block_group(block_group); 10851 btrfs_put_block_group(block_group); 10852 10853 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 10854 if (ret > 0) 10855 ret = -EIO; 10856 if (ret < 0) 10857 goto out; 10858 10859 ret = btrfs_del_item(trans, root, path); 10860 out: 10861 if (remove_rsv) 10862 btrfs_delayed_refs_rsv_release(fs_info, 1); 10863 btrfs_free_path(path); 10864 return ret; 10865 } 10866 10867 struct btrfs_trans_handle * 10868 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 10869 const u64 chunk_offset) 10870 { 10871 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10872 struct extent_map *em; 10873 struct map_lookup *map; 10874 unsigned int num_items; 10875 10876 read_lock(&em_tree->lock); 10877 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 10878 read_unlock(&em_tree->lock); 10879 ASSERT(em && em->start == chunk_offset); 10880 10881 /* 10882 * We need to reserve 3 + N units from the metadata space info in order 10883 * to remove a block group (done at btrfs_remove_chunk() and at 10884 * btrfs_remove_block_group()), which are used for: 10885 * 10886 * 1 unit for adding the free space inode's orphan (located in the tree 10887 * of tree roots). 10888 * 1 unit for deleting the block group item (located in the extent 10889 * tree). 10890 * 1 unit for deleting the free space item (located in tree of tree 10891 * roots). 10892 * N units for deleting N device extent items corresponding to each 10893 * stripe (located in the device tree). 10894 * 10895 * In order to remove a block group we also need to reserve units in the 10896 * system space info in order to update the chunk tree (update one or 10897 * more device items and remove one chunk item), but this is done at 10898 * btrfs_remove_chunk() through a call to check_system_chunk(). 10899 */ 10900 map = em->map_lookup; 10901 num_items = 3 + map->num_stripes; 10902 free_extent_map(em); 10903 10904 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10905 num_items, 1); 10906 } 10907 10908 /* 10909 * Process the unused_bgs list and remove any that don't have any allocated 10910 * space inside of them. 
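 *
 * (Block groups get onto fs_info->unused_bgs via btrfs_mark_bg_unused(),
 * e.g. when their last extent is freed or when an empty group is found at
 * mount time; this function is normally driven from the cleaner kthread.)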
10911 */ 10912 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 10913 { 10914 struct btrfs_block_group_cache *block_group; 10915 struct btrfs_space_info *space_info; 10916 struct btrfs_trans_handle *trans; 10917 int ret = 0; 10918 10919 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 10920 return; 10921 10922 spin_lock(&fs_info->unused_bgs_lock); 10923 while (!list_empty(&fs_info->unused_bgs)) { 10924 u64 start, end; 10925 int trimming; 10926 10927 block_group = list_first_entry(&fs_info->unused_bgs, 10928 struct btrfs_block_group_cache, 10929 bg_list); 10930 list_del_init(&block_group->bg_list); 10931 10932 space_info = block_group->space_info; 10933 10934 if (ret || btrfs_mixed_space_info(space_info)) { 10935 btrfs_put_block_group(block_group); 10936 continue; 10937 } 10938 spin_unlock(&fs_info->unused_bgs_lock); 10939 10940 mutex_lock(&fs_info->delete_unused_bgs_mutex); 10941 10942 /* Don't want to race with allocators so take the groups_sem */ 10943 down_write(&space_info->groups_sem); 10944 spin_lock(&block_group->lock); 10945 if (block_group->reserved || block_group->pinned || 10946 btrfs_block_group_used(&block_group->item) || 10947 block_group->ro || 10948 list_is_singular(&block_group->list)) { 10949 /* 10950 * We want to bail if we made new allocations or have 10951 * outstanding allocations in this block group. We do 10952 * the ro check in case balance is currently acting on 10953 * this block group. 10954 */ 10955 trace_btrfs_skip_unused_block_group(block_group); 10956 spin_unlock(&block_group->lock); 10957 up_write(&space_info->groups_sem); 10958 goto next; 10959 } 10960 spin_unlock(&block_group->lock); 10961 10962 /* We don't want to force the issue, only flip if it's ok. */ 10963 ret = inc_block_group_ro(block_group, 0); 10964 up_write(&space_info->groups_sem); 10965 if (ret < 0) { 10966 ret = 0; 10967 goto next; 10968 } 10969 10970 /* 10971 * Want to do this before we do anything else so we can recover 10972 * properly if we fail to join the transaction. 10973 */ 10974 trans = btrfs_start_trans_remove_block_group(fs_info, 10975 block_group->key.objectid); 10976 if (IS_ERR(trans)) { 10977 btrfs_dec_block_group_ro(block_group); 10978 ret = PTR_ERR(trans); 10979 goto next; 10980 } 10981 10982 /* 10983 * We could have pending pinned extents for this block group, 10984 * just delete them, we don't care about them anymore. 10985 */ 10986 start = block_group->key.objectid; 10987 end = start + block_group->key.offset - 1; 10988 /* 10989 * Hold the unused_bg_unpin_mutex lock to avoid racing with 10990 * btrfs_finish_extent_commit(). If we are at transaction N, 10991 * another task might be running finish_extent_commit() for the 10992 * previous transaction N - 1, and have seen a range belonging 10993 * to the block group in freed_extents[] before we were able to 10994 * clear the whole block group range from freed_extents[]. This 10995 * means that task can lookup for the block group after we 10996 * unpinned it from freed_extents[] and removed it, leading to 10997 * a BUG_ON() at btrfs_unpin_extent_range(). 
10998 */ 10999 mutex_lock(&fs_info->unused_bg_unpin_mutex); 11000 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 11001 EXTENT_DIRTY); 11002 if (ret) { 11003 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11004 btrfs_dec_block_group_ro(block_group); 11005 goto end_trans; 11006 } 11007 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 11008 EXTENT_DIRTY); 11009 if (ret) { 11010 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11011 btrfs_dec_block_group_ro(block_group); 11012 goto end_trans; 11013 } 11014 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11015 11016 /* Reset pinned so btrfs_put_block_group doesn't complain */ 11017 spin_lock(&space_info->lock); 11018 spin_lock(&block_group->lock); 11019 11020 update_bytes_pinned(space_info, -block_group->pinned); 11021 space_info->bytes_readonly += block_group->pinned; 11022 percpu_counter_add_batch(&space_info->total_bytes_pinned, 11023 -block_group->pinned, 11024 BTRFS_TOTAL_BYTES_PINNED_BATCH); 11025 block_group->pinned = 0; 11026 11027 spin_unlock(&block_group->lock); 11028 spin_unlock(&space_info->lock); 11029 11030 /* DISCARD can flip during remount */ 11031 trimming = btrfs_test_opt(fs_info, DISCARD); 11032 11033 /* Implicit trim during transaction commit. */ 11034 if (trimming) 11035 btrfs_get_block_group_trimming(block_group); 11036 11037 /* 11038 * Btrfs_remove_chunk will abort the transaction if things go 11039 * horribly wrong. 11040 */ 11041 ret = btrfs_remove_chunk(trans, block_group->key.objectid); 11042 11043 if (ret) { 11044 if (trimming) 11045 btrfs_put_block_group_trimming(block_group); 11046 goto end_trans; 11047 } 11048 11049 /* 11050 * If we're not mounted with -odiscard, we can just forget 11051 * about this block group. Otherwise we'll need to wait 11052 * until transaction commit to do the actual discard. 11053 */ 11054 if (trimming) { 11055 spin_lock(&fs_info->unused_bgs_lock); 11056 /* 11057 * A concurrent scrub might have added us to the list 11058 * fs_info->unused_bgs, so use a list_move operation 11059 * to add the block group to the deleted_bgs list. 
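			 *
			 * (The deleted_bgs list is expected to be walked when
			 * the transaction commits, which is where the actual
			 * discard of this group's space should happen.)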
11060 */ 11061 list_move(&block_group->bg_list, 11062 &trans->transaction->deleted_bgs); 11063 spin_unlock(&fs_info->unused_bgs_lock); 11064 btrfs_get_block_group(block_group); 11065 } 11066 end_trans: 11067 btrfs_end_transaction(trans); 11068 next: 11069 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 11070 btrfs_put_block_group(block_group); 11071 spin_lock(&fs_info->unused_bgs_lock); 11072 } 11073 spin_unlock(&fs_info->unused_bgs_lock); 11074 } 11075 11076 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 11077 { 11078 struct btrfs_super_block *disk_super; 11079 u64 features; 11080 u64 flags; 11081 int mixed = 0; 11082 int ret; 11083 11084 disk_super = fs_info->super_copy; 11085 if (!btrfs_super_root(disk_super)) 11086 return -EINVAL; 11087 11088 features = btrfs_super_incompat_flags(disk_super); 11089 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 11090 mixed = 1; 11091 11092 flags = BTRFS_BLOCK_GROUP_SYSTEM; 11093 ret = create_space_info(fs_info, flags); 11094 if (ret) 11095 goto out; 11096 11097 if (mixed) { 11098 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 11099 ret = create_space_info(fs_info, flags); 11100 } else { 11101 flags = BTRFS_BLOCK_GROUP_METADATA; 11102 ret = create_space_info(fs_info, flags); 11103 if (ret) 11104 goto out; 11105 11106 flags = BTRFS_BLOCK_GROUP_DATA; 11107 ret = create_space_info(fs_info, flags); 11108 } 11109 out: 11110 return ret; 11111 } 11112 11113 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 11114 u64 start, u64 end) 11115 { 11116 return unpin_extent_range(fs_info, start, end, false); 11117 } 11118 11119 /* 11120 * It used to be that old block groups would be left around forever. 11121 * Iterating over them would be enough to trim unused space. Since we 11122 * now automatically remove them, we also need to iterate over unallocated 11123 * space. 11124 * 11125 * We don't want a transaction for this since the discard may take a 11126 * substantial amount of time. We don't require that a transaction be 11127 * running, but we do need to take a running transaction into account 11128 * to ensure that we're not discarding chunks that were released or 11129 * allocated in the current transaction. 11130 * 11131 * Holding the chunks lock will prevent other threads from allocating 11132 * or releasing chunks, but it won't prevent a running transaction 11133 * from committing and releasing the memory that the pending chunks 11134 * list head uses. For that, we need to take a reference to the 11135 * transaction and hold the commit root sem. We only need to hold 11136 * it while performing the free space search since we have already 11137 * held back allocations. 11138 */ 11139 static int btrfs_trim_free_extents(struct btrfs_device *device, 11140 struct fstrim_range *range, u64 *trimmed) 11141 { 11142 u64 start, len = 0, end = 0; 11143 int ret; 11144 11145 start = max_t(u64, range->start, SZ_1M); 11146 *trimmed = 0; 11147 11148 /* Discard not supported = nothing to do. */ 11149 if (!blk_queue_discard(bdev_get_queue(device->bdev))) 11150 return 0; 11151 11152 /* Not writable = nothing to do. */ 11153 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 11154 return 0; 11155 11156 /* No free space = nothing to do. 
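	 *
	 * (total_bytes <= bytes_used here means the whole device is already
	 * allocated to chunks, so there is no unallocated range left for this
	 * function to discard.)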
*/ 11157 if (device->total_bytes <= device->bytes_used) 11158 return 0; 11159 11160 ret = 0; 11161 11162 while (1) { 11163 struct btrfs_fs_info *fs_info = device->fs_info; 11164 u64 bytes; 11165 11166 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 11167 if (ret) 11168 break; 11169 11170 find_first_clear_extent_bit(&device->alloc_state, start, 11171 &start, &end, 11172 CHUNK_TRIMMED | CHUNK_ALLOCATED); 11173 /* 11174 * If find_first_clear_extent_bit find a range that spans the 11175 * end of the device it will set end to -1, in this case it's up 11176 * to the caller to trim the value to the size of the device. 11177 */ 11178 end = min(end, device->total_bytes - 1); 11179 len = end - start + 1; 11180 11181 /* We didn't find any extents */ 11182 if (!len) { 11183 mutex_unlock(&fs_info->chunk_mutex); 11184 ret = 0; 11185 break; 11186 } 11187 11188 /* Keep going until we satisfy minlen or reach end of space */ 11189 if (len < range->minlen) { 11190 mutex_unlock(&fs_info->chunk_mutex); 11191 start += len; 11192 continue; 11193 } 11194 11195 /* If we are out of the passed range break */ 11196 if (start > range->start + range->len - 1) { 11197 mutex_unlock(&fs_info->chunk_mutex); 11198 break; 11199 } 11200 11201 start = max(range->start, start); 11202 len = min(range->len, len); 11203 11204 ret = btrfs_issue_discard(device->bdev, start, len, 11205 &bytes); 11206 if (!ret) 11207 set_extent_bits(&device->alloc_state, start, 11208 start + bytes - 1, 11209 CHUNK_TRIMMED); 11210 mutex_unlock(&fs_info->chunk_mutex); 11211 11212 if (ret) 11213 break; 11214 11215 start += len; 11216 *trimmed += bytes; 11217 11218 /* We've trimmed enough */ 11219 if (*trimmed >= range->len) 11220 break; 11221 11222 if (fatal_signal_pending(current)) { 11223 ret = -ERESTARTSYS; 11224 break; 11225 } 11226 11227 cond_resched(); 11228 } 11229 11230 return ret; 11231 } 11232 11233 /* 11234 * Trim the whole filesystem by: 11235 * 1) trimming the free space in each block group 11236 * 2) trimming the unallocated space on each device 11237 * 11238 * This will also continue trimming even if a block group or device encounters 11239 * an error. The return value will be the last error, or 0 if nothing bad 11240 * happens. 
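 *
 * (On return, range->len is updated to the total number of bytes actually
 * trimmed, which the FITRIM ioctl can then hand back to user space.)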
/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = next_block_group(cache)) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		ret = btrfs_trim_free_extents(device, range, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}

/*
 * btrfs_{start,end}_write_no_snapshotting() are similar to
 * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted,
 * and instead flush that data to disk after the snapshot creation, or to
 * prevent operations while snapshotting is ongoing that would cause the
 * snapshot to be inconsistent (writes followed by expanding truncates, for
 * example).
 */
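/*
 * Editorial sketch of the typical calling pattern (not lifted from a
 * specific caller): a nocow writer brackets its work with the pair below and
 * falls back to cow when a snapshot is pending:
 *
 *	if (btrfs_start_write_no_snapshotting(root)) {
 *		// no snapshot can start until we call the end helper,
 *		// so it is safe to write through nocow here
 *		do_nocow_write();
 *		btrfs_end_write_no_snapshotting(root);
 *	} else {
 *		// a snapshot is about to be or is being created,
 *		// fall back to cow (or wait and retry)
 *	}
 *
 * do_nocow_write() is a placeholder, not a real function in this file.
 */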
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	cond_wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshotted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshotted)) {
		btrfs_end_write_no_snapshotting(root);
		return 0;
	}
	return 1;
}

void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
	while (true) {
		int ret;

		ret = btrfs_start_write_no_snapshotting(root);
		if (ret)
			break;
		wait_var_event(&root->will_be_snapshotted,
			       !atomic_read(&root->will_be_snapshotted));
	}
}

void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_unused_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}
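/*
 * Editorial note on btrfs_mark_bg_unused() above: it only queues the block
 * group on fs_info->unused_bgs; the group is not deleted here.  The cleaner
 * thread later walks that list in btrfs_delete_unused_bgs() (the tail of
 * which appears earlier in this section), and the reference taken here via
 * btrfs_get_block_group() is dropped there once the block group has been
 * processed.
 */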