// SPDX-License-Identifier: GPL-2.0

#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format. If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		spin_unlock(&fs_info->balance_lock);
		return extended_to_chunk(target);
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, flags);
}
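
/*
 * Illustration of the reduction above (numbers are not taken from any
 * particular caller): on a two-device filesystem whose data chunks use RAID1
 * and RAID0, the extended bits passed in are DATA | RAID1 | RAID0. Both
 * profiles fit on two devices, RAID1 has the higher priority in the
 * reduction, so the result returned in chunk format is DATA | RAID1.
 */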

void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
	if (refcount_dec_and_test(&cache->refs)) {
		WARN_ON(cache->pinned > 0);
		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on reserved > 0 in that
		 * case.
		 */
		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
			WARN_ON(cache->reserved > 0);

		/*
		 * A block_group shouldn't be on the discard_list anymore.
		 * Remove the block_group from the discard_list to prevent us
		 * from causing a panic due to NULL pointer dereference.
		 */
		if (WARN_ON(!list_empty(&cache->discard_list)))
			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
						  cache);

		/*
		 * If not empty, someone is still holding the full stripe lock
		 * mutex, which can only be released by its holder. Freeing the
		 * block group now would make that later unlock a
		 * use-after-free. There is no better way to resolve this, so
		 * only warn.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache->physical_map);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				       struct btrfs_block_group *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group *cache;
	bool leftmost = true;

	ASSERT(block_group->length != 0);

	write_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_root.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
		if (block_group->start < cache->start) {
			p = &(*p)->rb_left;
		} else if (block_group->start > cache->start) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			write_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color_cached(&block_group->cache_node,
			       &info->block_group_cache_tree, leftmost);

	write_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
		struct btrfs_fs_info *info, u64 bytenr, int contains)
{
	struct btrfs_block_group *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	read_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_root.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group, cache_node);
		end = cache->start + cache->length - 1;
		start = cache->start;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->start))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	read_unlock(&info->block_group_cache_lock);

	return ret;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct rb_node *node;

	read_lock(&fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->start + cache->length;

		read_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
		cache = rb_entry(node, struct btrfs_block_group, cache_node);
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	read_unlock(&fs_info->block_group_cache_lock);
	return cache;
}

/**
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:	The filesystem information object.
 * @bytenr:	Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and increment the
 * number of NOCOW writers in the block group that contains the extent, as long
 * as the block group exists and it's currently not in read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
 * is responsible for calling btrfs_dec_nocow_writers() later.
 *
 * Or NULL if we cannot do a NOCOW write
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool can_nocow = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return NULL;

	spin_lock(&bg->lock);
	if (bg->ro)
		can_nocow = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	if (!can_nocow) {
		btrfs_put_block_group(bg);
		return NULL;
	}

	/* No put on block group, done by btrfs_dec_nocow_writers(). */
	return bg;
}

/**
 * Decrement the number of NOCOW writers in a block group.
 *
 * @bg: The block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. If it wants
 * to use it, then it should get a reference on it before calling this function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);

	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(bg);
}

void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}
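
/*
 * Rough usage sketch of the NOCOW writer accounting above (not tied to any
 * specific caller):
 *
 *	bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
 *	if (bg) {
 *		... set up the NOCOW write / ordered extent ...
 *		btrfs_dec_nocow_writers(bg);
 *	} else {
 *		... fall back to a COW write ...
 *	}
 *
 * Tasks that need the block group quiescent first mark it read-only, which
 * makes btrfs_inc_nocow_writers() refuse new writers, and then call
 * btrfs_wait_nocow_writers() to wait for the in-flight ones to finish.
 */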

void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, start);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->reservations))
		wake_up_var(&bg->reservations);
	btrfs_put_block_group(bg);
}

void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have already allocated an extent from it, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once. So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes. Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return;

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
		   (cache->free_space_ctl->free_space >= num_bytes));

	btrfs_put_caching_control(caching_ctl);
}

static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
				       struct btrfs_caching_control *caching_ctl)
{
	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
	return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
}

static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	btrfs_put_caching_control(caching_ctl);
	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->start;
	u64 len = block_group->length;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * This is only called by btrfs_cache_block_group, since we could have freed
 * extents we need to check the pinned_extents for any extents that can't be
 * used yet since their free space will be released as soon as the transaction
 * commits.
 */
u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(&info->excluded_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space_async_trimmed(block_group,
								 start, size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space_async_trimmed(block_group, start,
							 size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}
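
/*
 * Worked example for add_new_free_space(), with made-up numbers: for a block
 * group spanning [1G, 1G + 256M) where [1G + 64M, 1G + 65M) is still in
 * excluded_extents (e.g. pinned until the transaction commits), the loop adds
 * [1G, 1G + 64M) and the tail adds [1G + 65M, 1G + 256M), so 255M of new free
 * space is returned.
 */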

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
	extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space. So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->start) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->start + block_group->length)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, last,
				block_group->start + block_group->length);

out:
	btrfs_free_path(path);
	return ret;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		ret = load_free_space_cache(block_group);
		if (ret == 1) {
			ret = 0;
			goto done;
		}

		/*
		 * We failed to load the space cache, set ourselves to
		 * CACHE_STARTED and carry on.
		 */
		spin_lock(&block_group->lock);
		block_group->cached = BTRFS_CACHE_STARTED;
		spin_unlock(&block_group->lock);
		wake_up(&caching_ctl->wait);
	}

	/*
	 * If we are in the transaction that populated the free space tree we
	 * can't actually cache from the free space tree as our commit root and
	 * real root are the same, so we could change the contents of the blocks
	 * while caching. Instead do the slow caching in this case, and after
	 * the transaction has committed we will be safe.
	 */
	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);
done:
	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->length - block_group->used;
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	btrfs_put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl = NULL;
	int ret = 0;

	/* Allocator for zoned filesystems does not use the cache at all */
	if (btrfs_is_zoned(fs_info))
		return 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	refcount_set(&caching_ctl->count, 2);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		kfree(caching_ctl);

		caching_ctl = cache->caching_ctl;
		if (caching_ctl)
			refcount_inc(&caching_ctl->count);
		spin_unlock(&cache->lock);
		goto out;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	spin_unlock(&cache->lock);

	write_lock(&fs_info->block_group_cache_lock);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	write_unlock(&fs_info->block_group_cache_lock);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
	if (wait && caching_ctl)
		ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	if (caching_ctl)
		btrfs_put_caching_control(caching_ctl);

	return ret;
}
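
/*
 * Rough caller pattern for the caching interface above: an allocator that
 * needs num_bytes of free space from @cache can kick off caching without
 * blocking and then wait only for enough progress, e.g.
 *
 *	ret = btrfs_cache_block_group(cache, false);
 *	...
 *	btrfs_wait_block_group_cache_progress(cache, num_bytes);
 *
 * while passing wait == true blocks until caching has finished (or failed, in
 * which case cache->cached is BTRFS_CACHE_ERROR and -EIO is returned).
 */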

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	bool found_raid56 = false;
	bool found_raid1c34 = false;

	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
		struct list_head *head = &fs_info->space_info;
		struct btrfs_space_info *sinfo;

		list_for_each_entry_rcu(sinfo, head, list) {
			down_read(&sinfo->groups_sem);
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				found_raid1c34 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				found_raid1c34 = true;
			up_read(&sinfo->groups_sem);
		}
		if (!found_raid56)
			btrfs_clear_fs_incompat(fs_info, RAID56);
		if (!found_raid1c34)
			btrfs_clear_fs_incompat(fs_info, RAID1C34);
	}
}

static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	int ret;

	root = btrfs_block_group_root(fs_info);
	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = block_group->length;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		return ret;

	ret = btrfs_del_item(trans, root, path);
	return ret;
}

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     u64 group_start, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *path;
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	btrfs_free_excluded_extents(block_group);
	btrfs_free_ref_tree_range(fs_info, block_group->start,
				  block_group->length);

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	factor = btrfs_bg_type_to_factor(block_group->flags);

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * Make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		remove_rsv = true;
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
	if (ret)
		goto out;

	write_lock(&fs_info->block_group_cache_lock);
	rb_erase_cached(&block_group->cache_node,
			&fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	write_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);

	write_lock(&fs_info->block_group_cache_lock);
	caching_ctl = btrfs_get_caching_control(block_group);
	if (!caching_ctl) {
		struct btrfs_caching_control *ctl;

		list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
			if (ctl->block_group == block_group) {
				caching_ctl = ctl;
				refcount_inc(&caching_ctl->count);
				break;
			}
		}
	}
	if (caching_ctl)
		list_del_init(&caching_ctl->list);
	write_unlock(&fs_info->block_group_cache_lock);

	if (caching_ctl) {
		/* Once for the caching bgs list and once for us. */
		btrfs_put_caching_control(caching_ctl);
		btrfs_put_caching_control(caching_ctl);
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length - block_group->zone_unusable);
		WARN_ON(block_group->space_info->bytes_zone_unusable
			< block_group->zone_unusable);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
		WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
				 &block_group->runtime_flags) &&
			block_group->space_info->active_total_bytes
			< block_group->length);
	}
	block_group->space_info->total_bytes -= block_group->length;
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
		block_group->space_info->active_total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -=
		(block_group->length - block_group->zone_unusable);
	block_group->space_info->bytes_zone_unusable -=
		block_group->zone_unusable;
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	/*
	 * Remove the free space for the block group from the free space tree
	 * and the block group's item from the extent tree before marking the
	 * block group as removed. This is to prevent races with tasks that
	 * freeze and unfreeze a block group, this task and another task
	 * allocating a new block group - the unfreeze task ends up removing
	 * the block group's extent map before the task calling this function
	 * deletes the block group item from the extent tree, allowing for
	 * another task to attempt to create another block group with the same
	 * item key (and failing with -EEXIST and a transaction abort).
	 */
	ret = remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	ret = remove_block_group_item(trans, path, block_group);
	if (ret < 0)
		goto out;

	spin_lock(&block_group->lock);
	set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);

	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
	 * fs_info->block_group_cache_tree so no one can find it anymore and
	 * even if someone already got this block group before we removed it
	 * from the rbtree, they have already incremented block_group->frozen -
	 * if they didn't, for the trimming case they won't find any free space
	 * entries because we already removed them all when we called
	 * btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is needed to
	 * avoid races with trimming and scrub.
	 *
	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_em = (atomic_read(&block_group->frozen) == 0);
	spin_unlock(&block_group->lock);

	if (remove_em) {
		struct extent_map_tree *em_tree;

		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		/* once for the tree */
		free_extent_map(em);
	}

out:
	/* Once for the lookup reference */
	btrfs_put_block_group(block_group);
	if (remove_rsv)
		btrfs_delayed_refs_rsv_release(fs_info, 1);
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	unsigned int num_items;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	ASSERT(em && em->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	map = em->map_lookup;
	num_items = 3 + map->num_stripes;
	free_extent_map(em);

	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
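
/*
 * Example of the reservation above, with an assumed chunk layout: removing a
 * RAID1 block group striped over two devices has map->num_stripes == 2, so
 * num_items == 5 (free space inode orphan + block group item + free space
 * item + two device extent items).
 */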

/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	int ret = -ENOSPC;

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->swap_extents) {
		ret = -ETXTBSY;
		goto out;
	}

	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	num_bytes = cache->length - cache->reserved - cache->pinned -
		    cache->bytes_super - cache->zone_unusable - cache->used;

	/*
	 * Data never overcommits, even in mixed mode, so do just the straight
	 * check of left over space in how much we have allocated.
	 */
	if (force) {
		ret = 0;
	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

		/*
		 * Here we make sure if we mark this bg RO, we still have enough
		 * free space as buffer.
		 */
		if (sinfo_used + num_bytes <= sinfo->total_bytes)
			ret = 0;
	} else {
		/*
		 * We overcommit metadata, so we need to do the
		 * btrfs_can_overcommit check here, and we need to pass in
		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
		 * leeway to allow us to mark this block group as read only.
		 */
		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
					 BTRFS_RESERVE_NO_FLUSH))
			ret = 0;
	}

	if (!ret) {
		sinfo->bytes_readonly += num_bytes;
		if (btrfs_is_zoned(cache->fs_info)) {
			/* Migrate zone_unusable bytes to readonly */
			sinfo->bytes_readonly += cache->zone_unusable;
			sinfo->bytes_zone_unusable -= cache->zone_unusable;
			cache->zone_unusable = 0;
		}
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
		btrfs_info(cache->fs_info,
			"unable to make block group %llu ro", cache->start);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
}
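
/*
 * Numeric illustration of the data case above (made-up sizes): for a 1G data
 * block group with 700M used and nothing pinned, reserved or zone_unusable,
 * num_bytes is about 324M (less bytes_super). Marking it read-only only
 * succeeds if the data space_info can still absorb those ~324M, i.e.
 * sinfo_used + num_bytes <= sinfo->total_bytes.
 */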

static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_transaction *prev_trans = NULL;
	const u64 start = bg->start;
	const u64 end = start + bg->length - 1;
	int ret;

	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->list.prev != &fs_info->trans_list) {
		prev_trans = list_last_entry(&trans->transaction->list,
					     struct btrfs_transaction, list);
		refcount_inc(&prev_trans->use_count);
	}
	spin_unlock(&fs_info->trans_lock);

	/*
	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
	 * btrfs_finish_extent_commit(). If we are at transaction N, another
	 * task might be running finish_extent_commit() for the previous
	 * transaction N - 1, and have seen a range belonging to the block
	 * group in pinned_extents before we were able to clear the whole block
	 * group range from pinned_extents. This means that task can lookup for
	 * the block group after we unpinned it from pinned_extents and removed
	 * it, leading to a BUG_ON() at unpin_extent_range().
	 */
	mutex_lock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans) {
		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
					EXTENT_DIRTY);
		if (ret)
			goto out;
	}

	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
				EXTENT_DIRTY);
out:
	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans)
		btrfs_put_transaction(prev_trans);

	return ret == 0;
}

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (btrfs_fs_closing(fs_info))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		/*
		 * Async discard moves the final block group discard to be prior
		 * to the unused_bgs code path. Therefore, if it's not fully
		 * trimmed, punt it back to the async discard lists.
		 */
		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
		    !btrfs_is_free_space_trimmed(block_group)) {
			trace_btrfs_skip_unused_block_group(block_group);
			up_write(&space_info->groups_sem);
			/* Requeue if we failed because of async discard */
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto next;
		}

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    block_group->used || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group. We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
			goto next;
		}

		ret = btrfs_zone_finish(block_group);
		if (ret < 0) {
			btrfs_dec_block_group_ro(block_group);
			if (ret == -EAGAIN)
				ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
							     block_group->start);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		if (!clean_pinned_extents(trans, block_group)) {
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}

		/*
		 * At this point, the block_group is read only and should fail
		 * new allocations. However, btrfs_finish_extent_commit() can
		 * cause this block_group to be placed back on the discard
		 * lists because now the block_group isn't fully discarded.
		 * Bail here and try again later after discarding everything.
		 */
		spin_lock(&fs_info->discard_ctl.lock);
		if (!list_empty(&block_group->discard_list)) {
			spin_unlock(&fs_info->discard_ctl.lock);
			btrfs_dec_block_group_ro(block_group);
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto end_trans;
		}
		spin_unlock(&fs_info->discard_ctl.lock);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
						     -block_group->pinned);
		space_info->bytes_readonly += block_group->pinned;
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/*
		 * The normal path here is an unused block group is passed here,
		 * then trimming is handled in the transaction commit path.
		 * Async discard interposes before this to do the trimming
		 * before coming down the unused block group path as trimming
		 * will no longer be done later in the transaction commit path.
		 */
		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
			goto flip_async;

		/*
		 * DISCARD can flip during remount. On zoned filesystems, we
		 * need to reset sequential-required zones.
		 */
		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
				btrfs_is_zoned(fs_info);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_freeze_block_group(block_group);

		/*
		 * btrfs_remove_chunk() will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->start);

		if (ret) {
			if (trimming)
				btrfs_unfreeze_block_group(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}

void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_unused_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * We want block groups with a low number of used bytes to be in the beginning
 * of the list, so they will get reclaimed first.
 */
static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
			   const struct list_head *b)
{
	const struct btrfs_block_group *bg1, *bg2;

	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
	bg2 = list_entry(b, struct btrfs_block_group, bg_list);

	return bg1->used > bg2->used;
}

static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
{
	if (btrfs_is_zoned(fs_info))
		return btrfs_zoned_should_reclaim(fs_info);
	return true;
}

void btrfs_reclaim_bgs_work(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info =
		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
	struct btrfs_block_group *bg;
	struct btrfs_space_info *space_info;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (btrfs_fs_closing(fs_info))
		return;

	if (!btrfs_should_reclaim(fs_info))
		return;

	sb_start_write(fs_info->sb);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		sb_end_write(fs_info->sb);
		return;
	}

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip reclaim if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
		btrfs_exclop_finish(fs_info);
		sb_end_write(fs_info->sb);
		return;
	}

	spin_lock(&fs_info->unused_bgs_lock);
	/*
	 * Sort happens under lock because we can't simply splice it and sort.
	 * The block groups might still be in use and reachable via bg_list,
	 * and their presence in the reclaim_bgs list must be preserved.
	 */
	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
	while (!list_empty(&fs_info->reclaim_bgs)) {
		u64 zone_unusable;
		int ret = 0;

		bg = list_first_entry(&fs_info->reclaim_bgs,
				      struct btrfs_block_group,
				      bg_list);
		list_del_init(&bg->bg_list);

		space_info = bg->space_info;
		spin_unlock(&fs_info->unused_bgs_lock);

		/* Don't race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		spin_lock(&bg->lock);
		if (bg->reserved || bg->pinned || bg->ro) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group. We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			spin_unlock(&bg->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&bg->lock);

		/* Get out fast, in case we're unmounting the filesystem */
		if (btrfs_fs_closing(fs_info)) {
			up_write(&space_info->groups_sem);
			goto next;
		}

		/*
		 * Cache the zone_unusable value before turning the block group
		 * to read only. As soon as the block group is read only its
		 * zone_unusable value gets moved to the block group's read-only
		 * bytes and isn't available for calculations anymore.
		 */
		zone_unusable = bg->zone_unusable;
		ret = inc_block_group_ro(bg, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0)
			goto next;

		btrfs_info(fs_info,
			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
				bg->start, div_u64(bg->used * 100, bg->length),
				div64_u64(zone_unusable * 100, bg->length));
		trace_btrfs_reclaim_block_group(bg);
		ret = btrfs_relocate_chunk(fs_info, bg->start);
		if (ret) {
			btrfs_dec_block_group_ro(bg);
			btrfs_err(fs_info, "error relocating chunk %llu",
				  bg->start);
		}

next:
		btrfs_put_block_group(bg);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);
	sb_end_write(fs_info->sb);
}

void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
	spin_lock(&fs_info->unused_bgs_lock);
	if (!list_empty(&fs_info->reclaim_bgs))
		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
	spin_unlock(&fs_info->unused_bgs_lock);
}

void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_reclaim_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			   struct btrfs_path *path)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_block_group_item bg;
	struct extent_buffer *leaf;
	int slot;
	u64 flags;
	int ret = 0;

	slot = path->slots[0];
	leaf = path->nodes[0];

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
	read_unlock(&em_tree->lock);
	if (!em) {
		btrfs_err(fs_info,
			  "logical %llu len %llu found bg but no related chunk",
			  key->objectid, key->offset);
		return -ENOENT;
	}

	if (em->start != key->objectid || em->len != key->offset) {
		btrfs_err(fs_info,
			"block group %llu len %llu mismatch with chunk %llu len %llu",
			key->objectid, key->offset, em->start, em->len);
		ret = -EUCLEAN;
		goto out_free_em;
	}

	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
			   sizeof(bg));
	flags = btrfs_stack_block_group_flags(&bg) &
		BTRFS_BLOCK_GROUP_TYPE_MASK;

	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
			  key->objectid, key->offset, flags,
			  (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
		ret = -EUCLEAN;
	}

out_free_em:
	free_extent_map(em);
	return ret;
}

static int find_first_block_group(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	int ret;
	struct btrfs_key found_key;

	btrfs_for_each_slot(root, key, &found_key, path, ret) {
		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			return read_bg_from_eb(fs_info, &found_key, path);
		}
	}
	return ret;
}

static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits |= extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/**
 * Map a physical disk address to a list of logical addresses
 *
 * @fs_info:     the filesystem
 * @chunk_start: logical address of block group
 * @bdev:        physical device to resolve, can be NULL to indicate any device
 * @physical:    physical address to map to logical addresses
 * @logical:     return array of logical addresses which map to @physical
 * @naddrs:      length of @logical
 * @stripe_len:  size of IO stripe for the given block group
 *
 * Maps a particular @physical disk address to a list of @logical addresses.
 * Used primarily to exclude those portions of a block group that contain super
 * block copies.
 */
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     struct block_device *bdev, u64 physical, u64 **logical,
		     int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 data_stripe_length;
	u64 io_stripe_size;
	int i, nr = 0;
	int ret = 0;

	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	data_stripe_length = em->orig_block_len;
	io_stripe_size = map->stripe_len;
	chunk_start = em->start;

	/* For RAID5/6 adjust to a full IO stripe length */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		io_stripe_size = map->stripe_len * nr_data_stripes(map);

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool already_inserted = false;
		u64 stripe_nr;
		u64 offset;
		int j;

		if (!in_range(physical, map->stripes[i].physical,
			      data_stripe_length))
			continue;

		if (bdev && map->stripes[i].dev->bdev != bdev)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		}
		/*
		 * The remaining case would be for RAID56, multiply by
		 * nr_data_stripes(). Alternatively, just use rmap_len below
		 * instead of map->stripe_len
		 */

		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;

		/* Ensure we don't add duplicate addresses */
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr) {
				already_inserted = true;
				break;
			}
		}

		if (!already_inserted)
			buf[nr++] = bytenr;
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = io_stripe_size;
out:
	free_extent_map(em);
	return ret;
}

static int exclude_super_stripes(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	const bool zoned = btrfs_is_zoned(fs_info);
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
		cache->bytes_super += stripe_len;
		ret = btrfs_add_excluded_extent(fs_info, cache->start,
						stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(fs_info, cache->start, NULL,
				       bytenr, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		/* Shouldn't have super stripes in sequential zones */
		if (zoned && nr) {
			btrfs_err(fs_info,
			"zoned: block group %llu must not contain super block",
				  cache->start);
			return -EUCLEAN;
		}

		while (nr--) {
			u64 len = min_t(u64, stripe_len,
				cache->start + cache->length - logical[nr]);

			cache->bytes_super += len;
			ret = btrfs_add_excluded_extent(fs_info, logical[nr],
							len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}
*btrfs_create_block_group_cache( 1891 struct btrfs_fs_info *fs_info, u64 start) 1892 { 1893 struct btrfs_block_group *cache; 1894 1895 cache = kzalloc(sizeof(*cache), GFP_NOFS); 1896 if (!cache) 1897 return NULL; 1898 1899 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 1900 GFP_NOFS); 1901 if (!cache->free_space_ctl) { 1902 kfree(cache); 1903 return NULL; 1904 } 1905 1906 cache->start = start; 1907 1908 cache->fs_info = fs_info; 1909 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 1910 1911 cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; 1912 1913 refcount_set(&cache->refs, 1); 1914 spin_lock_init(&cache->lock); 1915 init_rwsem(&cache->data_rwsem); 1916 INIT_LIST_HEAD(&cache->list); 1917 INIT_LIST_HEAD(&cache->cluster_list); 1918 INIT_LIST_HEAD(&cache->bg_list); 1919 INIT_LIST_HEAD(&cache->ro_list); 1920 INIT_LIST_HEAD(&cache->discard_list); 1921 INIT_LIST_HEAD(&cache->dirty_list); 1922 INIT_LIST_HEAD(&cache->io_list); 1923 INIT_LIST_HEAD(&cache->active_bg_list); 1924 btrfs_init_free_space_ctl(cache, cache->free_space_ctl); 1925 atomic_set(&cache->frozen, 0); 1926 mutex_init(&cache->free_space_lock); 1927 cache->full_stripe_locks_root.root = RB_ROOT; 1928 mutex_init(&cache->full_stripe_locks_root.lock); 1929 1930 return cache; 1931 } 1932 1933 /* 1934 * Iterate all chunks and verify that each of them has the corresponding block 1935 * group 1936 */ 1937 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 1938 { 1939 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 1940 struct extent_map *em; 1941 struct btrfs_block_group *bg; 1942 u64 start = 0; 1943 int ret = 0; 1944 1945 while (1) { 1946 read_lock(&map_tree->lock); 1947 /* 1948 * lookup_extent_mapping will return the first extent map 1949 * intersecting the range, so setting @len to 1 is enough to 1950 * get the first chunk. 
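		 * The loop advances start past each returned mapping, so every
		 * chunk is visited exactly once.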
1951 */ 1952 em = lookup_extent_mapping(map_tree, start, 1); 1953 read_unlock(&map_tree->lock); 1954 if (!em) 1955 break; 1956 1957 bg = btrfs_lookup_block_group(fs_info, em->start); 1958 if (!bg) { 1959 btrfs_err(fs_info, 1960 "chunk start=%llu len=%llu doesn't have corresponding block group", 1961 em->start, em->len); 1962 ret = -EUCLEAN; 1963 free_extent_map(em); 1964 break; 1965 } 1966 if (bg->start != em->start || bg->length != em->len || 1967 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 1968 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 1969 btrfs_err(fs_info, 1970 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 1971 em->start, em->len, 1972 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 1973 bg->start, bg->length, 1974 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 1975 ret = -EUCLEAN; 1976 free_extent_map(em); 1977 btrfs_put_block_group(bg); 1978 break; 1979 } 1980 start = em->start + em->len; 1981 free_extent_map(em); 1982 btrfs_put_block_group(bg); 1983 } 1984 return ret; 1985 } 1986 1987 static int read_one_block_group(struct btrfs_fs_info *info, 1988 struct btrfs_block_group_item *bgi, 1989 const struct btrfs_key *key, 1990 int need_clear) 1991 { 1992 struct btrfs_block_group *cache; 1993 const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); 1994 int ret; 1995 1996 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); 1997 1998 cache = btrfs_create_block_group_cache(info, key->objectid); 1999 if (!cache) 2000 return -ENOMEM; 2001 2002 cache->length = key->offset; 2003 cache->used = btrfs_stack_block_group_used(bgi); 2004 cache->flags = btrfs_stack_block_group_flags(bgi); 2005 cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); 2006 2007 set_free_space_tree_thresholds(cache); 2008 2009 if (need_clear) { 2010 /* 2011 * When we mount with old space cache, we need to 2012 * set BTRFS_DC_CLEAR and set dirty flag. 2013 * 2014 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 2015 * truncate the old free space cache inode and 2016 * setup a new one. 2017 * b) Setting 'dirty flag' makes sure that we flush 2018 * the new space cache info onto disk. 2019 */ 2020 if (btrfs_test_opt(info, SPACE_CACHE)) 2021 cache->disk_cache_state = BTRFS_DC_CLEAR; 2022 } 2023 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 2024 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 2025 btrfs_err(info, 2026 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 2027 cache->start); 2028 ret = -EINVAL; 2029 goto error; 2030 } 2031 2032 ret = btrfs_load_block_group_zone_info(cache, false); 2033 if (ret) { 2034 btrfs_err(info, "zoned: failed to load zone info of bg %llu", 2035 cache->start); 2036 goto error; 2037 } 2038 2039 /* 2040 * We need to exclude the super stripes now so that the space info has 2041 * super bytes accounted for, otherwise we'll think we have more space 2042 * than we actually do. 2043 */ 2044 ret = exclude_super_stripes(cache); 2045 if (ret) { 2046 /* We may have excluded something, so call this just in case. */ 2047 btrfs_free_excluded_extents(cache); 2048 goto error; 2049 } 2050 2051 /* 2052 * For zoned filesystem, space after the allocation offset is the only 2053 * free space for a block group. So, we don't need any caching work. 2054 * btrfs_calc_zone_unusable() will set the amount of free space and 2055 * zone_unusable space. 
2056 * 2057 * For regular filesystem, check for two cases, either we are full, and 2058 * therefore don't need to bother with the caching work since we won't 2059 * find any space, or we are empty, and we can just add all the space 2060 * in and be done with it. This saves us _a_lot_ of time, particularly 2061 * in the full case. 2062 */ 2063 if (btrfs_is_zoned(info)) { 2064 btrfs_calc_zone_unusable(cache); 2065 /* Should not have any excluded extents. Just in case, though. */ 2066 btrfs_free_excluded_extents(cache); 2067 } else if (cache->length == cache->used) { 2068 cache->cached = BTRFS_CACHE_FINISHED; 2069 btrfs_free_excluded_extents(cache); 2070 } else if (cache->used == 0) { 2071 cache->cached = BTRFS_CACHE_FINISHED; 2072 add_new_free_space(cache, cache->start, 2073 cache->start + cache->length); 2074 btrfs_free_excluded_extents(cache); 2075 } 2076 2077 ret = btrfs_add_block_group_cache(info, cache); 2078 if (ret) { 2079 btrfs_remove_free_space_cache(cache); 2080 goto error; 2081 } 2082 trace_btrfs_add_block_group(info, cache, 0); 2083 btrfs_add_bg_to_space_info(info, cache); 2084 2085 set_avail_alloc_bits(info, cache->flags); 2086 if (btrfs_chunk_writeable(info, cache->start)) { 2087 if (cache->used == 0) { 2088 ASSERT(list_empty(&cache->bg_list)); 2089 if (btrfs_test_opt(info, DISCARD_ASYNC)) 2090 btrfs_discard_queue_work(&info->discard_ctl, cache); 2091 else 2092 btrfs_mark_bg_unused(cache); 2093 } 2094 } else { 2095 inc_block_group_ro(cache, 1); 2096 } 2097 2098 return 0; 2099 error: 2100 btrfs_put_block_group(cache); 2101 return ret; 2102 } 2103 2104 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) 2105 { 2106 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 2107 struct rb_node *node; 2108 int ret = 0; 2109 2110 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 2111 struct extent_map *em; 2112 struct map_lookup *map; 2113 struct btrfs_block_group *bg; 2114 2115 em = rb_entry(node, struct extent_map, rb_node); 2116 map = em->map_lookup; 2117 bg = btrfs_create_block_group_cache(fs_info, em->start); 2118 if (!bg) { 2119 ret = -ENOMEM; 2120 break; 2121 } 2122 2123 /* Fill dummy cache as FULL */ 2124 bg->length = em->len; 2125 bg->flags = map->type; 2126 bg->cached = BTRFS_CACHE_FINISHED; 2127 bg->used = em->len; 2128 bg->flags = map->type; 2129 ret = btrfs_add_block_group_cache(fs_info, bg); 2130 /* 2131 * We may have some valid block group cache added already, in 2132 * that case we skip to the next one. 2133 */ 2134 if (ret == -EEXIST) { 2135 ret = 0; 2136 btrfs_put_block_group(bg); 2137 continue; 2138 } 2139 2140 if (ret) { 2141 btrfs_remove_free_space_cache(bg); 2142 btrfs_put_block_group(bg); 2143 break; 2144 } 2145 2146 btrfs_add_bg_to_space_info(fs_info, bg); 2147 2148 set_avail_alloc_bits(fs_info, bg->flags); 2149 } 2150 if (!ret) 2151 btrfs_init_global_block_rsv(fs_info); 2152 return ret; 2153 } 2154 2155 int btrfs_read_block_groups(struct btrfs_fs_info *info) 2156 { 2157 struct btrfs_root *root = btrfs_block_group_root(info); 2158 struct btrfs_path *path; 2159 int ret; 2160 struct btrfs_block_group *cache; 2161 struct btrfs_space_info *space_info; 2162 struct btrfs_key key; 2163 int need_clear = 0; 2164 u64 cache_gen; 2165 2166 /* 2167 * Either no extent root (with ibadroots rescue option) or we have 2168 * unsupported RO options. The fs can never be mounted read-write, so no 2169 * need to waste time searching block group items. 
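 * In both cases we fall back to fill_dummy_bgs(), which builds in-memory
 * block groups directly from the chunk mappings.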
2170 * 2171 * This also allows new extent tree related changes to be RO compat, 2172 * no need for a full incompat flag. 2173 */ 2174 if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & 2175 ~BTRFS_FEATURE_COMPAT_RO_SUPP)) 2176 return fill_dummy_bgs(info); 2177 2178 key.objectid = 0; 2179 key.offset = 0; 2180 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2181 path = btrfs_alloc_path(); 2182 if (!path) 2183 return -ENOMEM; 2184 2185 cache_gen = btrfs_super_cache_generation(info->super_copy); 2186 if (btrfs_test_opt(info, SPACE_CACHE) && 2187 btrfs_super_generation(info->super_copy) != cache_gen) 2188 need_clear = 1; 2189 if (btrfs_test_opt(info, CLEAR_CACHE)) 2190 need_clear = 1; 2191 2192 while (1) { 2193 struct btrfs_block_group_item bgi; 2194 struct extent_buffer *leaf; 2195 int slot; 2196 2197 ret = find_first_block_group(info, path, &key); 2198 if (ret > 0) 2199 break; 2200 if (ret != 0) 2201 goto error; 2202 2203 leaf = path->nodes[0]; 2204 slot = path->slots[0]; 2205 2206 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 2207 sizeof(bgi)); 2208 2209 btrfs_item_key_to_cpu(leaf, &key, slot); 2210 btrfs_release_path(path); 2211 ret = read_one_block_group(info, &bgi, &key, need_clear); 2212 if (ret < 0) 2213 goto error; 2214 key.objectid += key.offset; 2215 key.offset = 0; 2216 } 2217 btrfs_release_path(path); 2218 2219 list_for_each_entry(space_info, &info->space_info, list) { 2220 int i; 2221 2222 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2223 if (list_empty(&space_info->block_groups[i])) 2224 continue; 2225 cache = list_first_entry(&space_info->block_groups[i], 2226 struct btrfs_block_group, 2227 list); 2228 btrfs_sysfs_add_block_group_type(cache); 2229 } 2230 2231 if (!(btrfs_get_alloc_profile(info, space_info->flags) & 2232 (BTRFS_BLOCK_GROUP_RAID10 | 2233 BTRFS_BLOCK_GROUP_RAID1_MASK | 2234 BTRFS_BLOCK_GROUP_RAID56_MASK | 2235 BTRFS_BLOCK_GROUP_DUP))) 2236 continue; 2237 /* 2238 * Avoid allocating from un-mirrored block group if there are 2239 * mirrored block groups. 2240 */ 2241 list_for_each_entry(cache, 2242 &space_info->block_groups[BTRFS_RAID_RAID0], 2243 list) 2244 inc_block_group_ro(cache, 1); 2245 list_for_each_entry(cache, 2246 &space_info->block_groups[BTRFS_RAID_SINGLE], 2247 list) 2248 inc_block_group_ro(cache, 1); 2249 } 2250 2251 btrfs_init_global_block_rsv(info); 2252 ret = check_chunk_block_group_mappings(info); 2253 error: 2254 btrfs_free_path(path); 2255 /* 2256 * We've hit some error while reading the extent tree, and have 2257 * rescue=ibadroots mount option. 2258 * Try to fill the tree using dummy block groups so that the user can 2259 * continue to mount and grab their data. 2260 */ 2261 if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) 2262 ret = fill_dummy_bgs(info); 2263 return ret; 2264 } 2265 2266 /* 2267 * This function, insert_block_group_item(), belongs to the phase 2 of chunk 2268 * allocation. 2269 * 2270 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2271 * phases. 
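 * The item is keyed by (block group start, BTRFS_BLOCK_GROUP_ITEM_KEY, length)
 * and records the used byte count, the global root id (stored in the
 * chunk_objectid field) and the block group type flags.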
2272 */ 2273 static int insert_block_group_item(struct btrfs_trans_handle *trans, 2274 struct btrfs_block_group *block_group) 2275 { 2276 struct btrfs_fs_info *fs_info = trans->fs_info; 2277 struct btrfs_block_group_item bgi; 2278 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2279 struct btrfs_key key; 2280 2281 spin_lock(&block_group->lock); 2282 btrfs_set_stack_block_group_used(&bgi, block_group->used); 2283 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2284 block_group->global_root_id); 2285 btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 2286 key.objectid = block_group->start; 2287 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2288 key.offset = block_group->length; 2289 spin_unlock(&block_group->lock); 2290 2291 return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); 2292 } 2293 2294 static int insert_dev_extent(struct btrfs_trans_handle *trans, 2295 struct btrfs_device *device, u64 chunk_offset, 2296 u64 start, u64 num_bytes) 2297 { 2298 struct btrfs_fs_info *fs_info = device->fs_info; 2299 struct btrfs_root *root = fs_info->dev_root; 2300 struct btrfs_path *path; 2301 struct btrfs_dev_extent *extent; 2302 struct extent_buffer *leaf; 2303 struct btrfs_key key; 2304 int ret; 2305 2306 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 2307 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 2308 path = btrfs_alloc_path(); 2309 if (!path) 2310 return -ENOMEM; 2311 2312 key.objectid = device->devid; 2313 key.type = BTRFS_DEV_EXTENT_KEY; 2314 key.offset = start; 2315 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); 2316 if (ret) 2317 goto out; 2318 2319 leaf = path->nodes[0]; 2320 extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 2321 btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); 2322 btrfs_set_dev_extent_chunk_objectid(leaf, extent, 2323 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2324 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 2325 2326 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 2327 btrfs_mark_buffer_dirty(leaf); 2328 out: 2329 btrfs_free_path(path); 2330 return ret; 2331 } 2332 2333 /* 2334 * This function belongs to phase 2. 2335 * 2336 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2337 * phases. 2338 */ 2339 static int insert_dev_extents(struct btrfs_trans_handle *trans, 2340 u64 chunk_offset, u64 chunk_size) 2341 { 2342 struct btrfs_fs_info *fs_info = trans->fs_info; 2343 struct btrfs_device *device; 2344 struct extent_map *em; 2345 struct map_lookup *map; 2346 u64 dev_offset; 2347 u64 stripe_size; 2348 int i; 2349 int ret = 0; 2350 2351 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 2352 if (IS_ERR(em)) 2353 return PTR_ERR(em); 2354 2355 map = em->map_lookup; 2356 stripe_size = em->orig_block_len; 2357 2358 /* 2359 * Take the device list mutex to prevent races with the final phase of 2360 * a device replace operation that replaces the device object associated 2361 * with the map's stripes, because the device object's id can change 2362 * at any time during that final phase of the device replace operation 2363 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 2364 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 2365 * resulting in persisting a device extent item with such ID. 
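	 * Holding the mutex across the whole loop below keeps the device
	 * objects and their ids stable for every insert_dev_extent() call.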
2366 */ 2367 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2368 for (i = 0; i < map->num_stripes; i++) { 2369 device = map->stripes[i].dev; 2370 dev_offset = map->stripes[i].physical; 2371 2372 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, 2373 stripe_size); 2374 if (ret) 2375 break; 2376 } 2377 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2378 2379 free_extent_map(em); 2380 return ret; 2381 } 2382 2383 /* 2384 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of 2385 * chunk allocation. 2386 * 2387 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2388 * phases. 2389 */ 2390 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 2391 { 2392 struct btrfs_fs_info *fs_info = trans->fs_info; 2393 struct btrfs_block_group *block_group; 2394 int ret = 0; 2395 2396 while (!list_empty(&trans->new_bgs)) { 2397 int index; 2398 2399 block_group = list_first_entry(&trans->new_bgs, 2400 struct btrfs_block_group, 2401 bg_list); 2402 if (ret) 2403 goto next; 2404 2405 index = btrfs_bg_flags_to_raid_index(block_group->flags); 2406 2407 ret = insert_block_group_item(trans, block_group); 2408 if (ret) 2409 btrfs_abort_transaction(trans, ret); 2410 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, 2411 &block_group->runtime_flags)) { 2412 mutex_lock(&fs_info->chunk_mutex); 2413 ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); 2414 mutex_unlock(&fs_info->chunk_mutex); 2415 if (ret) 2416 btrfs_abort_transaction(trans, ret); 2417 } 2418 ret = insert_dev_extents(trans, block_group->start, 2419 block_group->length); 2420 if (ret) 2421 btrfs_abort_transaction(trans, ret); 2422 add_block_group_free_space(trans, block_group); 2423 2424 /* 2425 * If we restriped during balance, we may have added a new raid 2426 * type, so now add the sysfs entries when it is safe to do so. 2427 * We don't have to worry about locking here as it's handled in 2428 * btrfs_sysfs_add_block_group_type. 2429 */ 2430 if (block_group->space_info->block_group_kobjs[index] == NULL) 2431 btrfs_sysfs_add_block_group_type(block_group); 2432 2433 /* Already aborted the transaction if it failed. */ 2434 next: 2435 btrfs_delayed_refs_rsv_release(fs_info, 1); 2436 list_del_init(&block_group->bg_list); 2437 } 2438 btrfs_trans_release_chunk_metadata(trans); 2439 } 2440 2441 /* 2442 * For extent tree v2 we use the block_group_item->chunk_offset to point at our 2443 * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. 2444 */ 2445 static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) 2446 { 2447 u64 div = SZ_1G; 2448 u64 index; 2449 2450 if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 2451 return BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2452 2453 /* If we have a smaller fs index based on 128MiB. 
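	 * The result is (offset / div) modulo nr_global_roots; e.g. assuming
	 * four global roots on a filesystem larger than 10G, a block group at
	 * offset 5G maps to global root id (5G / 1G) % 4 == 1 (the root count
	 * here is only an illustrative assumption).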
*/ 2454 if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) 2455 div = SZ_128M; 2456 2457 offset = div64_u64(offset, div); 2458 div64_u64_rem(offset, fs_info->nr_global_roots, &index); 2459 return index; 2460 } 2461 2462 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, 2463 u64 bytes_used, u64 type, 2464 u64 chunk_offset, u64 size) 2465 { 2466 struct btrfs_fs_info *fs_info = trans->fs_info; 2467 struct btrfs_block_group *cache; 2468 int ret; 2469 2470 btrfs_set_log_full_commit(trans); 2471 2472 cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 2473 if (!cache) 2474 return ERR_PTR(-ENOMEM); 2475 2476 cache->length = size; 2477 set_free_space_tree_thresholds(cache); 2478 cache->used = bytes_used; 2479 cache->flags = type; 2480 cache->cached = BTRFS_CACHE_FINISHED; 2481 cache->global_root_id = calculate_global_root_id(fs_info, cache->start); 2482 2483 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 2484 cache->needs_free_space = 1; 2485 2486 ret = btrfs_load_block_group_zone_info(cache, true); 2487 if (ret) { 2488 btrfs_put_block_group(cache); 2489 return ERR_PTR(ret); 2490 } 2491 2492 ret = exclude_super_stripes(cache); 2493 if (ret) { 2494 /* We may have excluded something, so call this just in case */ 2495 btrfs_free_excluded_extents(cache); 2496 btrfs_put_block_group(cache); 2497 return ERR_PTR(ret); 2498 } 2499 2500 add_new_free_space(cache, chunk_offset, chunk_offset + size); 2501 2502 btrfs_free_excluded_extents(cache); 2503 2504 /* 2505 * Ensure the corresponding space_info object is created and 2506 * assigned to our block group. We want our bg to be added to the rbtree 2507 * with its ->space_info set. 2508 */ 2509 cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 2510 ASSERT(cache->space_info); 2511 2512 ret = btrfs_add_block_group_cache(fs_info, cache); 2513 if (ret) { 2514 btrfs_remove_free_space_cache(cache); 2515 btrfs_put_block_group(cache); 2516 return ERR_PTR(ret); 2517 } 2518 2519 /* 2520 * Now that our block group has its ->space_info set and is inserted in 2521 * the rbtree, update the space info's counters. 2522 */ 2523 trace_btrfs_add_block_group(fs_info, cache, 1); 2524 btrfs_add_bg_to_space_info(fs_info, cache); 2525 btrfs_update_global_block_rsv(fs_info); 2526 2527 #ifdef CONFIG_BTRFS_DEBUG 2528 if (btrfs_should_fragment_free_space(cache)) { 2529 u64 new_bytes_used = size - bytes_used; 2530 2531 cache->space_info->bytes_used += new_bytes_used >> 1; 2532 fragment_free_space(cache); 2533 } 2534 #endif 2535 2536 list_add_tail(&cache->bg_list, &trans->new_bgs); 2537 trans->delayed_ref_updates++; 2538 btrfs_update_delayed_refs_rsv(trans); 2539 2540 set_avail_alloc_bits(fs_info, type); 2541 return cache; 2542 } 2543 2544 /* 2545 * Mark one block group RO, can be called several times for the same block 2546 * group. 2547 * 2548 * @cache: the destination block group 2549 * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to 2550 * ensure we still have some free space after marking this 2551 * block group RO. 2552 */ 2553 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, 2554 bool do_chunk_alloc) 2555 { 2556 struct btrfs_fs_info *fs_info = cache->fs_info; 2557 struct btrfs_trans_handle *trans; 2558 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2559 u64 alloc_flags; 2560 int ret; 2561 bool dirty_bg_running; 2562 2563 /* 2564 * This can only happen when we are doing read-only scrub on read-only 2565 * mount. 
2566 * In that case we should not start a new transaction on read-only fs. 2567 * Thus here we skip all chunk allocations. 2568 */ 2569 if (sb_rdonly(fs_info->sb)) { 2570 mutex_lock(&fs_info->ro_block_group_mutex); 2571 ret = inc_block_group_ro(cache, 0); 2572 mutex_unlock(&fs_info->ro_block_group_mutex); 2573 return ret; 2574 } 2575 2576 do { 2577 trans = btrfs_join_transaction(root); 2578 if (IS_ERR(trans)) 2579 return PTR_ERR(trans); 2580 2581 dirty_bg_running = false; 2582 2583 /* 2584 * We're not allowed to set block groups readonly after the dirty 2585 * block group cache has started writing. If it already started, 2586 * back off and let this transaction commit. 2587 */ 2588 mutex_lock(&fs_info->ro_block_group_mutex); 2589 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 2590 u64 transid = trans->transid; 2591 2592 mutex_unlock(&fs_info->ro_block_group_mutex); 2593 btrfs_end_transaction(trans); 2594 2595 ret = btrfs_wait_for_commit(fs_info, transid); 2596 if (ret) 2597 return ret; 2598 dirty_bg_running = true; 2599 } 2600 } while (dirty_bg_running); 2601 2602 if (do_chunk_alloc) { 2603 /* 2604 * If we are changing raid levels, try to allocate a 2605 * corresponding block group with the new raid level. 2606 */ 2607 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2608 if (alloc_flags != cache->flags) { 2609 ret = btrfs_chunk_alloc(trans, alloc_flags, 2610 CHUNK_ALLOC_FORCE); 2611 /* 2612 * ENOSPC is allowed here, we may have enough space 2613 * already allocated at the new raid level to carry on 2614 */ 2615 if (ret == -ENOSPC) 2616 ret = 0; 2617 if (ret < 0) 2618 goto out; 2619 } 2620 } 2621 2622 ret = inc_block_group_ro(cache, 0); 2623 if (!do_chunk_alloc || ret == -ETXTBSY) 2624 goto unlock_out; 2625 if (!ret) 2626 goto out; 2627 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); 2628 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 2629 if (ret < 0) 2630 goto out; 2631 /* 2632 * We have allocated a new chunk. We also need to activate that chunk to 2633 * grant metadata tickets for zoned filesystem. 
2634 */ 2635 ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); 2636 if (ret < 0) 2637 goto out; 2638 2639 ret = inc_block_group_ro(cache, 0); 2640 if (ret == -ETXTBSY) 2641 goto unlock_out; 2642 out: 2643 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 2644 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2645 mutex_lock(&fs_info->chunk_mutex); 2646 check_system_chunk(trans, alloc_flags); 2647 mutex_unlock(&fs_info->chunk_mutex); 2648 } 2649 unlock_out: 2650 mutex_unlock(&fs_info->ro_block_group_mutex); 2651 2652 btrfs_end_transaction(trans); 2653 return ret; 2654 } 2655 2656 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 2657 { 2658 struct btrfs_space_info *sinfo = cache->space_info; 2659 u64 num_bytes; 2660 2661 BUG_ON(!cache->ro); 2662 2663 spin_lock(&sinfo->lock); 2664 spin_lock(&cache->lock); 2665 if (!--cache->ro) { 2666 if (btrfs_is_zoned(cache->fs_info)) { 2667 /* Migrate zone_unusable bytes back */ 2668 cache->zone_unusable = 2669 (cache->alloc_offset - cache->used) + 2670 (cache->length - cache->zone_capacity); 2671 sinfo->bytes_zone_unusable += cache->zone_unusable; 2672 sinfo->bytes_readonly -= cache->zone_unusable; 2673 } 2674 num_bytes = cache->length - cache->reserved - 2675 cache->pinned - cache->bytes_super - 2676 cache->zone_unusable - cache->used; 2677 sinfo->bytes_readonly -= num_bytes; 2678 list_del_init(&cache->ro_list); 2679 } 2680 spin_unlock(&cache->lock); 2681 spin_unlock(&sinfo->lock); 2682 } 2683 2684 static int update_block_group_item(struct btrfs_trans_handle *trans, 2685 struct btrfs_path *path, 2686 struct btrfs_block_group *cache) 2687 { 2688 struct btrfs_fs_info *fs_info = trans->fs_info; 2689 int ret; 2690 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2691 unsigned long bi; 2692 struct extent_buffer *leaf; 2693 struct btrfs_block_group_item bgi; 2694 struct btrfs_key key; 2695 2696 key.objectid = cache->start; 2697 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2698 key.offset = cache->length; 2699 2700 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2701 if (ret) { 2702 if (ret > 0) 2703 ret = -ENOENT; 2704 goto fail; 2705 } 2706 2707 leaf = path->nodes[0]; 2708 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2709 btrfs_set_stack_block_group_used(&bgi, cache->used); 2710 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2711 cache->global_root_id); 2712 btrfs_set_stack_block_group_flags(&bgi, cache->flags); 2713 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 2714 btrfs_mark_buffer_dirty(leaf); 2715 fail: 2716 btrfs_release_path(path); 2717 return ret; 2718 2719 } 2720 2721 static int cache_save_setup(struct btrfs_block_group *block_group, 2722 struct btrfs_trans_handle *trans, 2723 struct btrfs_path *path) 2724 { 2725 struct btrfs_fs_info *fs_info = block_group->fs_info; 2726 struct btrfs_root *root = fs_info->tree_root; 2727 struct inode *inode = NULL; 2728 struct extent_changeset *data_reserved = NULL; 2729 u64 alloc_hint = 0; 2730 int dcs = BTRFS_DC_ERROR; 2731 u64 cache_size = 0; 2732 int retries = 0; 2733 int ret = 0; 2734 2735 if (!btrfs_test_opt(fs_info, SPACE_CACHE)) 2736 return 0; 2737 2738 /* 2739 * If this block group is smaller than 100 megs don't bother caching the 2740 * block group. 
2741 */ 2742 if (block_group->length < (100 * SZ_1M)) { 2743 spin_lock(&block_group->lock); 2744 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 2745 spin_unlock(&block_group->lock); 2746 return 0; 2747 } 2748 2749 if (TRANS_ABORTED(trans)) 2750 return 0; 2751 again: 2752 inode = lookup_free_space_inode(block_group, path); 2753 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 2754 ret = PTR_ERR(inode); 2755 btrfs_release_path(path); 2756 goto out; 2757 } 2758 2759 if (IS_ERR(inode)) { 2760 BUG_ON(retries); 2761 retries++; 2762 2763 if (block_group->ro) 2764 goto out_free; 2765 2766 ret = create_free_space_inode(trans, block_group, path); 2767 if (ret) 2768 goto out_free; 2769 goto again; 2770 } 2771 2772 /* 2773 * We want to set the generation to 0, that way if anything goes wrong 2774 * from here on out we know not to trust this cache when we load up next 2775 * time. 2776 */ 2777 BTRFS_I(inode)->generation = 0; 2778 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 2779 if (ret) { 2780 /* 2781 * So theoretically we could recover from this, simply set the 2782 * super cache generation to 0 so we know to invalidate the 2783 * cache, but then we'd have to keep track of the block groups 2784 * that fail this way so we know we _have_ to reset this cache 2785 * before the next commit or risk reading stale cache. So to 2786 * limit our exposure to horrible edge cases lets just abort the 2787 * transaction, this only happens in really bad situations 2788 * anyway. 2789 */ 2790 btrfs_abort_transaction(trans, ret); 2791 goto out_put; 2792 } 2793 WARN_ON(ret); 2794 2795 /* We've already setup this transaction, go ahead and exit */ 2796 if (block_group->cache_generation == trans->transid && 2797 i_size_read(inode)) { 2798 dcs = BTRFS_DC_SETUP; 2799 goto out_put; 2800 } 2801 2802 if (i_size_read(inode) > 0) { 2803 ret = btrfs_check_trunc_cache_free_space(fs_info, 2804 &fs_info->global_block_rsv); 2805 if (ret) 2806 goto out_put; 2807 2808 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 2809 if (ret) 2810 goto out_put; 2811 } 2812 2813 spin_lock(&block_group->lock); 2814 if (block_group->cached != BTRFS_CACHE_FINISHED || 2815 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 2816 /* 2817 * don't bother trying to write stuff out _if_ 2818 * a) we're not cached, 2819 * b) we're with nospace_cache mount option, 2820 * c) we're with v2 space_cache (FREE_SPACE_TREE). 2821 */ 2822 dcs = BTRFS_DC_WRITTEN; 2823 spin_unlock(&block_group->lock); 2824 goto out_put; 2825 } 2826 spin_unlock(&block_group->lock); 2827 2828 /* 2829 * We hit an ENOSPC when setting up the cache in this transaction, just 2830 * skip doing the setup, we've already cleared the cache so we're safe. 2831 */ 2832 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 2833 ret = -ENOSPC; 2834 goto out_put; 2835 } 2836 2837 /* 2838 * Try to preallocate enough space based on how big the block group is. 2839 * Keep in mind this has to include any pinned space which could end up 2840 * taking up quite a bit since it's not folded into the other space 2841 * cache. 
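	 * The preallocation below works out to 16 sectors per 256M of block
	 * group, e.g. a 1G block group with a 4K sectorsize preallocates
	 * 4 * 16 * 4K = 256K.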
2842 */ 2843 cache_size = div_u64(block_group->length, SZ_256M); 2844 if (!cache_size) 2845 cache_size = 1; 2846 2847 cache_size *= 16; 2848 cache_size *= fs_info->sectorsize; 2849 2850 ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, 2851 cache_size, false); 2852 if (ret) 2853 goto out_put; 2854 2855 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, 2856 cache_size, cache_size, 2857 &alloc_hint); 2858 /* 2859 * Our cache requires contiguous chunks so that we don't modify a bunch 2860 * of metadata or split extents when writing the cache out, which means 2861 * we can enospc if we are heavily fragmented in addition to just normal 2862 * out of space conditions. So if we hit this just skip setting up any 2863 * other block groups for this transaction, maybe we'll unpin enough 2864 * space the next time around. 2865 */ 2866 if (!ret) 2867 dcs = BTRFS_DC_SETUP; 2868 else if (ret == -ENOSPC) 2869 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 2870 2871 out_put: 2872 iput(inode); 2873 out_free: 2874 btrfs_release_path(path); 2875 out: 2876 spin_lock(&block_group->lock); 2877 if (!ret && dcs == BTRFS_DC_SETUP) 2878 block_group->cache_generation = trans->transid; 2879 block_group->disk_cache_state = dcs; 2880 spin_unlock(&block_group->lock); 2881 2882 extent_changeset_free(data_reserved); 2883 return ret; 2884 } 2885 2886 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 2887 { 2888 struct btrfs_fs_info *fs_info = trans->fs_info; 2889 struct btrfs_block_group *cache, *tmp; 2890 struct btrfs_transaction *cur_trans = trans->transaction; 2891 struct btrfs_path *path; 2892 2893 if (list_empty(&cur_trans->dirty_bgs) || 2894 !btrfs_test_opt(fs_info, SPACE_CACHE)) 2895 return 0; 2896 2897 path = btrfs_alloc_path(); 2898 if (!path) 2899 return -ENOMEM; 2900 2901 /* Could add new block groups, use _safe just in case */ 2902 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 2903 dirty_list) { 2904 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 2905 cache_save_setup(cache, trans, path); 2906 } 2907 2908 btrfs_free_path(path); 2909 return 0; 2910 } 2911 2912 /* 2913 * Transaction commit does final block group cache writeback during a critical 2914 * section where nothing is allowed to change the FS. This is required in 2915 * order for the cache to actually match the block group, but can introduce a 2916 * lot of latency into the commit. 2917 * 2918 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 2919 * There's a chance we'll have to redo some of it if the block group changes 2920 * again during the commit, but it greatly reduces the commit latency by 2921 * getting rid of the easy block groups while we're still allowing others to 2922 * join the commit. 
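 * Anything that gets re-dirtied while this runs is either picked up by the
 * single retry loop at the end of this function or left for
 * btrfs_write_dirty_block_groups() during the commit critical section.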
2923 */ 2924 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 2925 { 2926 struct btrfs_fs_info *fs_info = trans->fs_info; 2927 struct btrfs_block_group *cache; 2928 struct btrfs_transaction *cur_trans = trans->transaction; 2929 int ret = 0; 2930 int should_put; 2931 struct btrfs_path *path = NULL; 2932 LIST_HEAD(dirty); 2933 struct list_head *io = &cur_trans->io_bgs; 2934 int loops = 0; 2935 2936 spin_lock(&cur_trans->dirty_bgs_lock); 2937 if (list_empty(&cur_trans->dirty_bgs)) { 2938 spin_unlock(&cur_trans->dirty_bgs_lock); 2939 return 0; 2940 } 2941 list_splice_init(&cur_trans->dirty_bgs, &dirty); 2942 spin_unlock(&cur_trans->dirty_bgs_lock); 2943 2944 again: 2945 /* Make sure all the block groups on our dirty list actually exist */ 2946 btrfs_create_pending_block_groups(trans); 2947 2948 if (!path) { 2949 path = btrfs_alloc_path(); 2950 if (!path) { 2951 ret = -ENOMEM; 2952 goto out; 2953 } 2954 } 2955 2956 /* 2957 * cache_write_mutex is here only to save us from balance or automatic 2958 * removal of empty block groups deleting this block group while we are 2959 * writing out the cache 2960 */ 2961 mutex_lock(&trans->transaction->cache_write_mutex); 2962 while (!list_empty(&dirty)) { 2963 bool drop_reserve = true; 2964 2965 cache = list_first_entry(&dirty, struct btrfs_block_group, 2966 dirty_list); 2967 /* 2968 * This can happen if something re-dirties a block group that 2969 * is already under IO. Just wait for it to finish and then do 2970 * it all again 2971 */ 2972 if (!list_empty(&cache->io_list)) { 2973 list_del_init(&cache->io_list); 2974 btrfs_wait_cache_io(trans, cache, path); 2975 btrfs_put_block_group(cache); 2976 } 2977 2978 2979 /* 2980 * btrfs_wait_cache_io uses the cache->dirty_list to decide if 2981 * it should update the cache_state. Don't delete until after 2982 * we wait. 2983 * 2984 * Since we're not running in the commit critical section 2985 * we need the dirty_bgs_lock to protect from update_block_group 2986 */ 2987 spin_lock(&cur_trans->dirty_bgs_lock); 2988 list_del_init(&cache->dirty_list); 2989 spin_unlock(&cur_trans->dirty_bgs_lock); 2990 2991 should_put = 1; 2992 2993 cache_save_setup(cache, trans, path); 2994 2995 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 2996 cache->io_ctl.inode = NULL; 2997 ret = btrfs_write_out_cache(trans, cache, path); 2998 if (ret == 0 && cache->io_ctl.inode) { 2999 should_put = 0; 3000 3001 /* 3002 * The cache_write_mutex is protecting the 3003 * io_list, also refer to the definition of 3004 * btrfs_transaction::io_bgs for more details 3005 */ 3006 list_add_tail(&cache->io_list, io); 3007 } else { 3008 /* 3009 * If we failed to write the cache, the 3010 * generation will be bad and life goes on 3011 */ 3012 ret = 0; 3013 } 3014 } 3015 if (!ret) { 3016 ret = update_block_group_item(trans, path, cache); 3017 /* 3018 * Our block group might still be attached to the list 3019 * of new block groups in the transaction handle of some 3020 * other task (struct btrfs_trans_handle->new_bgs). This 3021 * means its block group item isn't yet in the extent 3022 * tree. If this happens ignore the error, as we will 3023 * try again later in the critical section of the 3024 * transaction commit. 
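			 * That retry is done by btrfs_write_dirty_block_groups()
			 * once the commit's critical section runs.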
3025 */ 3026 if (ret == -ENOENT) { 3027 ret = 0; 3028 spin_lock(&cur_trans->dirty_bgs_lock); 3029 if (list_empty(&cache->dirty_list)) { 3030 list_add_tail(&cache->dirty_list, 3031 &cur_trans->dirty_bgs); 3032 btrfs_get_block_group(cache); 3033 drop_reserve = false; 3034 } 3035 spin_unlock(&cur_trans->dirty_bgs_lock); 3036 } else if (ret) { 3037 btrfs_abort_transaction(trans, ret); 3038 } 3039 } 3040 3041 /* If it's not on the io list, we need to put the block group */ 3042 if (should_put) 3043 btrfs_put_block_group(cache); 3044 if (drop_reserve) 3045 btrfs_delayed_refs_rsv_release(fs_info, 1); 3046 /* 3047 * Avoid blocking other tasks for too long. It might even save 3048 * us from writing caches for block groups that are going to be 3049 * removed. 3050 */ 3051 mutex_unlock(&trans->transaction->cache_write_mutex); 3052 if (ret) 3053 goto out; 3054 mutex_lock(&trans->transaction->cache_write_mutex); 3055 } 3056 mutex_unlock(&trans->transaction->cache_write_mutex); 3057 3058 /* 3059 * Go through delayed refs for all the stuff we've just kicked off 3060 * and then loop back (just once) 3061 */ 3062 if (!ret) 3063 ret = btrfs_run_delayed_refs(trans, 0); 3064 if (!ret && loops == 0) { 3065 loops++; 3066 spin_lock(&cur_trans->dirty_bgs_lock); 3067 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3068 /* 3069 * dirty_bgs_lock protects us from concurrent block group 3070 * deletes too (not just cache_write_mutex). 3071 */ 3072 if (!list_empty(&dirty)) { 3073 spin_unlock(&cur_trans->dirty_bgs_lock); 3074 goto again; 3075 } 3076 spin_unlock(&cur_trans->dirty_bgs_lock); 3077 } 3078 out: 3079 if (ret < 0) { 3080 spin_lock(&cur_trans->dirty_bgs_lock); 3081 list_splice_init(&dirty, &cur_trans->dirty_bgs); 3082 spin_unlock(&cur_trans->dirty_bgs_lock); 3083 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3084 } 3085 3086 btrfs_free_path(path); 3087 return ret; 3088 } 3089 3090 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 3091 { 3092 struct btrfs_fs_info *fs_info = trans->fs_info; 3093 struct btrfs_block_group *cache; 3094 struct btrfs_transaction *cur_trans = trans->transaction; 3095 int ret = 0; 3096 int should_put; 3097 struct btrfs_path *path; 3098 struct list_head *io = &cur_trans->io_bgs; 3099 3100 path = btrfs_alloc_path(); 3101 if (!path) 3102 return -ENOMEM; 3103 3104 /* 3105 * Even though we are in the critical section of the transaction commit, 3106 * we can still have concurrent tasks adding elements to this 3107 * transaction's list of dirty block groups. These tasks correspond to 3108 * endio free space workers started when writeback finishes for a 3109 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3110 * allocate new block groups as a result of COWing nodes of the root 3111 * tree when updating the free space inode. The writeback for the space 3112 * caches is triggered by an earlier call to 3113 * btrfs_start_dirty_block_groups() and iterations of the following 3114 * loop. 3115 * Also we want to do the cache_save_setup first and then run the 3116 * delayed refs to make sure we have the best chance at doing this all 3117 * in one shot. 3118 */ 3119 spin_lock(&cur_trans->dirty_bgs_lock); 3120 while (!list_empty(&cur_trans->dirty_bgs)) { 3121 cache = list_first_entry(&cur_trans->dirty_bgs, 3122 struct btrfs_block_group, 3123 dirty_list); 3124 3125 /* 3126 * This can happen if cache_save_setup re-dirties a block group 3127 * that is already under IO. 
Just wait for it to finish and 3128 * then do it all again 3129 */ 3130 if (!list_empty(&cache->io_list)) { 3131 spin_unlock(&cur_trans->dirty_bgs_lock); 3132 list_del_init(&cache->io_list); 3133 btrfs_wait_cache_io(trans, cache, path); 3134 btrfs_put_block_group(cache); 3135 spin_lock(&cur_trans->dirty_bgs_lock); 3136 } 3137 3138 /* 3139 * Don't remove from the dirty list until after we've waited on 3140 * any pending IO 3141 */ 3142 list_del_init(&cache->dirty_list); 3143 spin_unlock(&cur_trans->dirty_bgs_lock); 3144 should_put = 1; 3145 3146 cache_save_setup(cache, trans, path); 3147 3148 if (!ret) 3149 ret = btrfs_run_delayed_refs(trans, 3150 (unsigned long) -1); 3151 3152 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3153 cache->io_ctl.inode = NULL; 3154 ret = btrfs_write_out_cache(trans, cache, path); 3155 if (ret == 0 && cache->io_ctl.inode) { 3156 should_put = 0; 3157 list_add_tail(&cache->io_list, io); 3158 } else { 3159 /* 3160 * If we failed to write the cache, the 3161 * generation will be bad and life goes on 3162 */ 3163 ret = 0; 3164 } 3165 } 3166 if (!ret) { 3167 ret = update_block_group_item(trans, path, cache); 3168 /* 3169 * One of the free space endio workers might have 3170 * created a new block group while updating a free space 3171 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3172 * and hasn't released its transaction handle yet, in 3173 * which case the new block group is still attached to 3174 * its transaction handle and its creation has not 3175 * finished yet (no block group item in the extent tree 3176 * yet, etc). If this is the case, wait for all free 3177 * space endio workers to finish and retry. This is a 3178 * very rare case so no need for a more efficient and 3179 * complex approach. 3180 */ 3181 if (ret == -ENOENT) { 3182 wait_event(cur_trans->writer_wait, 3183 atomic_read(&cur_trans->num_writers) == 1); 3184 ret = update_block_group_item(trans, path, cache); 3185 } 3186 if (ret) 3187 btrfs_abort_transaction(trans, ret); 3188 } 3189 3190 /* If its not on the io list, we need to put the block group */ 3191 if (should_put) 3192 btrfs_put_block_group(cache); 3193 btrfs_delayed_refs_rsv_release(fs_info, 1); 3194 spin_lock(&cur_trans->dirty_bgs_lock); 3195 } 3196 spin_unlock(&cur_trans->dirty_bgs_lock); 3197 3198 /* 3199 * Refer to the definition of io_bgs member for details why it's safe 3200 * to use it without any locking 3201 */ 3202 while (!list_empty(io)) { 3203 cache = list_first_entry(io, struct btrfs_block_group, 3204 io_list); 3205 list_del_init(&cache->io_list); 3206 btrfs_wait_cache_io(trans, cache, path); 3207 btrfs_put_block_group(cache); 3208 } 3209 3210 btrfs_free_path(path); 3211 return ret; 3212 } 3213 3214 static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, 3215 u64 bytes_freed) 3216 { 3217 const struct btrfs_space_info *space_info = bg->space_info; 3218 const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); 3219 const u64 new_val = bg->used; 3220 const u64 old_val = new_val + bytes_freed; 3221 u64 thresh; 3222 3223 if (reclaim_thresh == 0) 3224 return false; 3225 3226 thresh = div_factor_fine(bg->length, reclaim_thresh); 3227 3228 /* 3229 * If we were below the threshold before don't reclaim, we are likely a 3230 * brand new block group and we don't want to relocate new block groups. 
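	 * Reclaim is only triggered when this free crosses the threshold:
	 * usage was at or above it before the free (old_val) and is below it
	 * afterwards (new_val).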
3231 */ 3232 if (old_val < thresh) 3233 return false; 3234 if (new_val >= thresh) 3235 return false; 3236 return true; 3237 } 3238 3239 int btrfs_update_block_group(struct btrfs_trans_handle *trans, 3240 u64 bytenr, u64 num_bytes, bool alloc) 3241 { 3242 struct btrfs_fs_info *info = trans->fs_info; 3243 struct btrfs_block_group *cache = NULL; 3244 u64 total = num_bytes; 3245 u64 old_val; 3246 u64 byte_in_group; 3247 int factor; 3248 int ret = 0; 3249 3250 /* Block accounting for super block */ 3251 spin_lock(&info->delalloc_root_lock); 3252 old_val = btrfs_super_bytes_used(info->super_copy); 3253 if (alloc) 3254 old_val += num_bytes; 3255 else 3256 old_val -= num_bytes; 3257 btrfs_set_super_bytes_used(info->super_copy, old_val); 3258 spin_unlock(&info->delalloc_root_lock); 3259 3260 while (total) { 3261 bool reclaim; 3262 3263 cache = btrfs_lookup_block_group(info, bytenr); 3264 if (!cache) { 3265 ret = -ENOENT; 3266 break; 3267 } 3268 factor = btrfs_bg_type_to_factor(cache->flags); 3269 3270 /* 3271 * If this block group has free space cache written out, we 3272 * need to make sure to load it if we are removing space. This 3273 * is because we need the unpinning stage to actually add the 3274 * space back to the block group, otherwise we will leak space. 3275 */ 3276 if (!alloc && !btrfs_block_group_done(cache)) 3277 btrfs_cache_block_group(cache, true); 3278 3279 byte_in_group = bytenr - cache->start; 3280 WARN_ON(byte_in_group > cache->length); 3281 3282 spin_lock(&cache->space_info->lock); 3283 spin_lock(&cache->lock); 3284 3285 if (btrfs_test_opt(info, SPACE_CACHE) && 3286 cache->disk_cache_state < BTRFS_DC_CLEAR) 3287 cache->disk_cache_state = BTRFS_DC_CLEAR; 3288 3289 old_val = cache->used; 3290 num_bytes = min(total, cache->length - byte_in_group); 3291 if (alloc) { 3292 old_val += num_bytes; 3293 cache->used = old_val; 3294 cache->reserved -= num_bytes; 3295 cache->space_info->bytes_reserved -= num_bytes; 3296 cache->space_info->bytes_used += num_bytes; 3297 cache->space_info->disk_used += num_bytes * factor; 3298 spin_unlock(&cache->lock); 3299 spin_unlock(&cache->space_info->lock); 3300 } else { 3301 old_val -= num_bytes; 3302 cache->used = old_val; 3303 cache->pinned += num_bytes; 3304 btrfs_space_info_update_bytes_pinned(info, 3305 cache->space_info, num_bytes); 3306 cache->space_info->bytes_used -= num_bytes; 3307 cache->space_info->disk_used -= num_bytes * factor; 3308 3309 reclaim = should_reclaim_block_group(cache, num_bytes); 3310 spin_unlock(&cache->lock); 3311 spin_unlock(&cache->space_info->lock); 3312 3313 set_extent_dirty(&trans->transaction->pinned_extents, 3314 bytenr, bytenr + num_bytes - 1, 3315 GFP_NOFS | __GFP_NOFAIL); 3316 } 3317 3318 spin_lock(&trans->transaction->dirty_bgs_lock); 3319 if (list_empty(&cache->dirty_list)) { 3320 list_add_tail(&cache->dirty_list, 3321 &trans->transaction->dirty_bgs); 3322 trans->delayed_ref_updates++; 3323 btrfs_get_block_group(cache); 3324 } 3325 spin_unlock(&trans->transaction->dirty_bgs_lock); 3326 3327 /* 3328 * No longer have used bytes in this block group, queue it for 3329 * deletion. We do this after adding the block group to the 3330 * dirty list to avoid races between cleaner kthread and space 3331 * cache writeout. 
3332 */ 3333 if (!alloc && old_val == 0) { 3334 if (!btrfs_test_opt(info, DISCARD_ASYNC)) 3335 btrfs_mark_bg_unused(cache); 3336 } else if (!alloc && reclaim) { 3337 btrfs_mark_bg_to_reclaim(cache); 3338 } 3339 3340 btrfs_put_block_group(cache); 3341 total -= num_bytes; 3342 bytenr += num_bytes; 3343 } 3344 3345 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 3346 btrfs_update_delayed_refs_rsv(trans); 3347 return ret; 3348 } 3349 3350 /** 3351 * btrfs_add_reserved_bytes - update the block_group and space info counters 3352 * @cache: The cache we are manipulating 3353 * @ram_bytes: The number of bytes of file content, and will be same to 3354 * @num_bytes except for the compress path. 3355 * @num_bytes: The number of bytes in question 3356 * @delalloc: The blocks are allocated for the delalloc write 3357 * 3358 * This is called by the allocator when it reserves space. If this is a 3359 * reservation and the block group has become read only we cannot make the 3360 * reservation and return -EAGAIN, otherwise this function always succeeds. 3361 */ 3362 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, 3363 u64 ram_bytes, u64 num_bytes, int delalloc) 3364 { 3365 struct btrfs_space_info *space_info = cache->space_info; 3366 int ret = 0; 3367 3368 spin_lock(&space_info->lock); 3369 spin_lock(&cache->lock); 3370 if (cache->ro) { 3371 ret = -EAGAIN; 3372 } else { 3373 cache->reserved += num_bytes; 3374 space_info->bytes_reserved += num_bytes; 3375 trace_btrfs_space_reservation(cache->fs_info, "space_info", 3376 space_info->flags, num_bytes, 1); 3377 btrfs_space_info_update_bytes_may_use(cache->fs_info, 3378 space_info, -ram_bytes); 3379 if (delalloc) 3380 cache->delalloc_bytes += num_bytes; 3381 3382 /* 3383 * Compression can use less space than we reserved, so wake 3384 * tickets if that happens 3385 */ 3386 if (num_bytes < ram_bytes) 3387 btrfs_try_granting_tickets(cache->fs_info, space_info); 3388 } 3389 spin_unlock(&cache->lock); 3390 spin_unlock(&space_info->lock); 3391 return ret; 3392 } 3393 3394 /** 3395 * btrfs_free_reserved_bytes - update the block_group and space info counters 3396 * @cache: The cache we are manipulating 3397 * @num_bytes: The number of bytes in question 3398 * @delalloc: The blocks are allocated for the delalloc write 3399 * 3400 * This is called by somebody who is freeing space that was never actually used 3401 * on disk. For example if you reserve some space for a new leaf in transaction 3402 * A and before transaction A commits you free that leaf, you call this with 3403 * reserve set to 0 in order to clear the reservation. 
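 * Returning the reservation also gives btrfs_try_granting_tickets() a chance
 * to satisfy waiters with the released space.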
3404 */ 3405 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 3406 u64 num_bytes, int delalloc) 3407 { 3408 struct btrfs_space_info *space_info = cache->space_info; 3409 3410 spin_lock(&space_info->lock); 3411 spin_lock(&cache->lock); 3412 if (cache->ro) 3413 space_info->bytes_readonly += num_bytes; 3414 cache->reserved -= num_bytes; 3415 space_info->bytes_reserved -= num_bytes; 3416 space_info->max_extent_size = 0; 3417 3418 if (delalloc) 3419 cache->delalloc_bytes -= num_bytes; 3420 spin_unlock(&cache->lock); 3421 3422 btrfs_try_granting_tickets(cache->fs_info, space_info); 3423 spin_unlock(&space_info->lock); 3424 } 3425 3426 static void force_metadata_allocation(struct btrfs_fs_info *info) 3427 { 3428 struct list_head *head = &info->space_info; 3429 struct btrfs_space_info *found; 3430 3431 list_for_each_entry(found, head, list) { 3432 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3433 found->force_alloc = CHUNK_ALLOC_FORCE; 3434 } 3435 } 3436 3437 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 3438 struct btrfs_space_info *sinfo, int force) 3439 { 3440 u64 bytes_used = btrfs_space_info_used(sinfo, false); 3441 u64 thresh; 3442 3443 if (force == CHUNK_ALLOC_FORCE) 3444 return 1; 3445 3446 /* 3447 * in limited mode, we want to have some free space up to 3448 * about 1% of the FS size. 3449 */ 3450 if (force == CHUNK_ALLOC_LIMITED) { 3451 thresh = btrfs_super_total_bytes(fs_info->super_copy); 3452 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 3453 3454 if (sinfo->total_bytes - bytes_used < thresh) 3455 return 1; 3456 } 3457 3458 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 3459 return 0; 3460 return 1; 3461 } 3462 3463 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 3464 { 3465 u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); 3466 3467 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 3468 } 3469 3470 static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) 3471 { 3472 struct btrfs_block_group *bg; 3473 int ret; 3474 3475 /* 3476 * Check if we have enough space in the system space info because we 3477 * will need to update device items in the chunk btree and insert a new 3478 * chunk item in the chunk btree as well. This will allocate a new 3479 * system block group if needed. 3480 */ 3481 check_system_chunk(trans, flags); 3482 3483 bg = btrfs_create_chunk(trans, flags); 3484 if (IS_ERR(bg)) { 3485 ret = PTR_ERR(bg); 3486 goto out; 3487 } 3488 3489 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); 3490 /* 3491 * Normally we are not expected to fail with -ENOSPC here, since we have 3492 * previously reserved space in the system space_info and allocated one 3493 * new system chunk if necessary. However there are three exceptions: 3494 * 3495 * 1) We may have enough free space in the system space_info but all the 3496 * existing system block groups have a profile which can not be used 3497 * for extent allocation. 3498 * 3499 * This happens when mounting in degraded mode. For example we have a 3500 * RAID1 filesystem with 2 devices, lose one device and mount the fs 3501 * using the other device in degraded mode. 
If we then allocate a chunk, 3502 * we may have enough free space in the existing system space_info, but 3503 * none of the block groups can be used for extent allocation since they 3504 * have a RAID1 profile, and because we are in degraded mode with a 3505 * single device, we are forced to allocate a new system chunk with a 3506 * SINGLE profile. Making check_system_chunk() iterate over all system 3507 * block groups and check if they have a usable profile and enough space 3508 * can be slow on very large filesystems, so we tolerate the -ENOSPC and 3509 * try again after forcing allocation of a new system chunk. Like this 3510 * we avoid paying the cost of that search in normal circumstances, when 3511 * we were not mounted in degraded mode; 3512 * 3513 * 2) We had enough free space info the system space_info, and one suitable 3514 * block group to allocate from when we called check_system_chunk() 3515 * above. However right after we called it, the only system block group 3516 * with enough free space got turned into RO mode by a running scrub, 3517 * and in this case we have to allocate a new one and retry. We only 3518 * need do this allocate and retry once, since we have a transaction 3519 * handle and scrub uses the commit root to search for block groups; 3520 * 3521 * 3) We had one system block group with enough free space when we called 3522 * check_system_chunk(), but after that, right before we tried to 3523 * allocate the last extent buffer we needed, a discard operation came 3524 * in and it temporarily removed the last free space entry from the 3525 * block group (discard removes a free space entry, discards it, and 3526 * then adds back the entry to the block group cache). 3527 */ 3528 if (ret == -ENOSPC) { 3529 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); 3530 struct btrfs_block_group *sys_bg; 3531 3532 sys_bg = btrfs_create_chunk(trans, sys_flags); 3533 if (IS_ERR(sys_bg)) { 3534 ret = PTR_ERR(sys_bg); 3535 btrfs_abort_transaction(trans, ret); 3536 goto out; 3537 } 3538 3539 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3540 if (ret) { 3541 btrfs_abort_transaction(trans, ret); 3542 goto out; 3543 } 3544 3545 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); 3546 if (ret) { 3547 btrfs_abort_transaction(trans, ret); 3548 goto out; 3549 } 3550 } else if (ret) { 3551 btrfs_abort_transaction(trans, ret); 3552 goto out; 3553 } 3554 out: 3555 btrfs_trans_release_chunk_metadata(trans); 3556 3557 if (ret) 3558 return ERR_PTR(ret); 3559 3560 btrfs_get_block_group(bg); 3561 return bg; 3562 } 3563 3564 /* 3565 * Chunk allocation is done in 2 phases: 3566 * 3567 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for 3568 * the chunk, the chunk mapping, create its block group and add the items 3569 * that belong in the chunk btree to it - more specifically, we need to 3570 * update device items in the chunk btree and add a new chunk item to it. 3571 * 3572 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block 3573 * group item to the extent btree and the device extent items to the devices 3574 * btree. 3575 * 3576 * This is done to prevent deadlocks. For example when COWing a node from the 3577 * extent btree we are holding a write lock on the node's parent and if we 3578 * trigger chunk allocation and attempted to insert the new block group item 3579 * in the extent btree right way, we could deadlock because the path for the 3580 * insertion can include that parent node. 
At first glance it seems impossible 3581 * to trigger chunk allocation after starting a transaction since tasks should 3582 * reserve enough transaction units (metadata space), however while that is true 3583 * most of the time, chunk allocation may still be triggered for several reasons: 3584 * 3585 * 1) When reserving metadata, we check if there is enough free space in the 3586 * metadata space_info and therefore don't trigger allocation of a new chunk. 3587 * However later when the task actually tries to COW an extent buffer from 3588 * the extent btree or from the device btree for example, it is forced to 3589 * allocate a new block group (chunk) because the only one that had enough 3590 * free space was just turned to RO mode by a running scrub for example (or 3591 * device replace, block group reclaim thread, etc), so we can not use it 3592 * for allocating an extent and end up being forced to allocate a new one; 3593 * 3594 * 2) Because we only check that the metadata space_info has enough free bytes, 3595 * we end up not allocating a new metadata chunk in that case. However if 3596 * the filesystem was mounted in degraded mode, none of the existing block 3597 * groups might be suitable for extent allocation due to their incompatible 3598 * profile (for e.g. mounting a 2 devices filesystem, where all block groups 3599 * use a RAID1 profile, in degraded mode using a single device). In this case 3600 * when the task attempts to COW some extent buffer of the extent btree for 3601 * example, it will trigger allocation of a new metadata block group with a 3602 * suitable profile (SINGLE profile in the example of the degraded mount of 3603 * the RAID1 filesystem); 3604 * 3605 * 3) The task has reserved enough transaction units / metadata space, but when 3606 * it attempts to COW an extent buffer from the extent or device btree for 3607 * example, it does not find any free extent in any metadata block group, 3608 * therefore forced to try to allocate a new metadata block group. 3609 * This is because some other task allocated all available extents in the 3610 * meanwhile - this typically happens with tasks that don't reserve space 3611 * properly, either intentionally or as a bug. One example where this is 3612 * done intentionally is fsync, as it does not reserve any transaction units 3613 * and ends up allocating a variable number of metadata extents for log 3614 * tree extent buffers; 3615 * 3616 * 4) The task has reserved enough transaction units / metadata space, but right 3617 * before it tries to allocate the last extent buffer it needs, a discard 3618 * operation comes in and, temporarily, removes the last free space entry from 3619 * the only metadata block group that had free space (discard starts by 3620 * removing a free space entry from a block group, then does the discard 3621 * operation and, once it's done, it adds back the free space entry to the 3622 * block group). 3623 * 3624 * We also need this 2 phases setup when adding a device to a filesystem with 3625 * a seed device - we must create new metadata and system chunks without adding 3626 * any of the block group items to the chunk, extent and device btrees. If we 3627 * did not do it this way, we would get ENOSPC when attempting to update those 3628 * btrees, since all the chunks from the seed device are read-only. 
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we had
 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
 * parallel, we risk having too many system chunks allocated by many tasks if
 * many tasks reach phase 1 without the previous ones completing phase 2. In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * Allocation of system chunks does not happen through this function. A task that
 * needs to update the chunk btree (the only btree that uses system chunks) must
 * preallocate chunk space by calling either check_system_chunk() or
 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
 * metadata chunk or when removing a chunk, while the latter is used before doing
 * a modification to the chunk btree - use cases for the latter are adding,
 * removing and resizing a device as well as relocation of a system chunk.
 * See the comment below for more details.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
 * an extent buffer from the chunk btree we never trigger allocation of a new
 * system chunk, which would result in a deadlock (trying to lock an extent
 * buffer of the chunk btree twice, the first time before triggering the chunk
 * allocation and the second time during chunk allocation while attempting to
 * update the chunk btree). The system chunk array is also updated while holding
 * that mutex. The same logic applies to removing chunks - we must reserve system
 * space, update the chunk btree and the system chunk array in the superblock
 * while holding fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
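 *
 * A minimal call sketch, for illustration only - how the flags are obtained
 * and how the return value is consumed here are assumptions, not copied from
 * a specific caller:
 *
 *	u64 alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
 *	int ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_NO_FORCE);
 *
 *	if (ret == 1 || ret == -ENOSPC)
 *		ret = 0;
 *
 * where both a newly allocated chunk (1) and a full space_info (-ENOSPC) are
 * treated as success by such a caller.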
 */
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
		      enum btrfs_chunk_alloc_enum force)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_block_group *ret_bg;
	bool wait_for_alloc = false;
	bool should_alloc = false;
	bool from_extent_allocation = false;
	int ret = 0;

	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
		from_extent_allocation = true;
		force = CHUNK_ALLOC_FORCE;
	}

	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;
	/*
	 * Allocation of system chunks can not happen through this path, as we
	 * could end up in a deadlock if we are allocating a data or metadata
	 * chunk and there is another task modifying the chunk btree.
	 *
	 * This is because while we are holding the chunk mutex, we will attempt
	 * to add the new chunk item to the chunk btree or update an existing
	 * device item in the chunk btree, while the other task that is modifying
	 * the chunk btree is attempting to COW an extent buffer while holding a
	 * lock on it and on its parent - if the COW operation triggers a system
	 * chunk allocation, then we can deadlock because we are holding the
	 * chunk mutex and we may need to access that extent buffer or its parent
	 * in order to add the chunk item or update a device item.
	 *
	 * Tasks that want to modify the chunk tree should reserve system space
	 * before updating the chunk btree, by calling either
	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
	 * It's possible that after a task reserves the space, it still ends up
	 * here - this happens in the cases described above at do_chunk_alloc().
	 * The task will have to either retry or fail.
	 */
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return -ENOSPC;

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);

	do {
		spin_lock(&space_info->lock);
		if (force < space_info->force_alloc)
			force = space_info->force_alloc;
		should_alloc = should_alloc_chunk(fs_info, space_info, force);
		if (space_info->full) {
			/* No more free physical space */
			if (should_alloc)
				ret = -ENOSPC;
			else
				ret = 0;
			spin_unlock(&space_info->lock);
			return ret;
		} else if (!should_alloc) {
			spin_unlock(&space_info->lock);
			return 0;
		} else if (space_info->chunk_alloc) {
			/*
			 * Someone is already allocating, so we need to block
			 * until this someone is finished and then loop to
			 * recheck if we should continue with our allocation
			 * attempt.
			 */
			wait_for_alloc = true;
			force = CHUNK_ALLOC_NO_FORCE;
			spin_unlock(&space_info->lock);
			mutex_lock(&fs_info->chunk_mutex);
			mutex_unlock(&fs_info->chunk_mutex);
		} else {
			/* Proceed with allocation */
			space_info->chunk_alloc = 1;
			wait_for_alloc = false;
			spin_unlock(&space_info->lock);
		}

		cond_resched();
	} while (wait_for_alloc);

	mutex_lock(&fs_info->chunk_mutex);
	trans->allocating_chunk = true;

	/*
	 * If we have mixed data/metadata chunks we want to make sure we keep
	 * allocating mixed chunks instead of individual chunks.
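	 *
	 * (Note added for clarity: mixed block groups are an mkfs-time layout,
	 * typically used only for very small filesystems, so this branch is
	 * not taken on most setups.)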
3761 */ 3762 if (btrfs_mixed_space_info(space_info)) 3763 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3764 3765 /* 3766 * if we're doing a data chunk, go ahead and make sure that 3767 * we keep a reasonable number of metadata chunks allocated in the 3768 * FS as well. 3769 */ 3770 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3771 fs_info->data_chunk_allocations++; 3772 if (!(fs_info->data_chunk_allocations % 3773 fs_info->metadata_ratio)) 3774 force_metadata_allocation(fs_info); 3775 } 3776 3777 ret_bg = do_chunk_alloc(trans, flags); 3778 trans->allocating_chunk = false; 3779 3780 if (IS_ERR(ret_bg)) { 3781 ret = PTR_ERR(ret_bg); 3782 } else if (from_extent_allocation) { 3783 /* 3784 * New block group is likely to be used soon. Try to activate 3785 * it now. Failure is OK for now. 3786 */ 3787 btrfs_zone_activate(ret_bg); 3788 } 3789 3790 if (!ret) 3791 btrfs_put_block_group(ret_bg); 3792 3793 spin_lock(&space_info->lock); 3794 if (ret < 0) { 3795 if (ret == -ENOSPC) 3796 space_info->full = 1; 3797 else 3798 goto out; 3799 } else { 3800 ret = 1; 3801 space_info->max_extent_size = 0; 3802 } 3803 3804 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3805 out: 3806 space_info->chunk_alloc = 0; 3807 spin_unlock(&space_info->lock); 3808 mutex_unlock(&fs_info->chunk_mutex); 3809 3810 return ret; 3811 } 3812 3813 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 3814 { 3815 u64 num_dev; 3816 3817 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 3818 if (!num_dev) 3819 num_dev = fs_info->fs_devices->rw_devices; 3820 3821 return num_dev; 3822 } 3823 3824 static void reserve_chunk_space(struct btrfs_trans_handle *trans, 3825 u64 bytes, 3826 u64 type) 3827 { 3828 struct btrfs_fs_info *fs_info = trans->fs_info; 3829 struct btrfs_space_info *info; 3830 u64 left; 3831 int ret = 0; 3832 3833 /* 3834 * Needed because we can end up allocating a system chunk and for an 3835 * atomic and race free space reservation in the chunk block reserve. 3836 */ 3837 lockdep_assert_held(&fs_info->chunk_mutex); 3838 3839 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3840 spin_lock(&info->lock); 3841 left = info->total_bytes - btrfs_space_info_used(info, true); 3842 spin_unlock(&info->lock); 3843 3844 if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 3845 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 3846 left, bytes, type); 3847 btrfs_dump_space_info(fs_info, info, 0, 0); 3848 } 3849 3850 if (left < bytes) { 3851 u64 flags = btrfs_system_alloc_profile(fs_info); 3852 struct btrfs_block_group *bg; 3853 3854 /* 3855 * Ignore failure to create system chunk. We might end up not 3856 * needing it, as we might not need to COW all nodes/leafs from 3857 * the paths we visit in the chunk tree (they were already COWed 3858 * or created in the current transaction for example). 3859 */ 3860 bg = btrfs_create_chunk(trans, flags); 3861 if (IS_ERR(bg)) { 3862 ret = PTR_ERR(bg); 3863 } else { 3864 /* 3865 * We have a new chunk. We also need to activate it for 3866 * zoned filesystem. 3867 */ 3868 ret = btrfs_zoned_activate_one_bg(fs_info, info, true); 3869 if (ret < 0) 3870 return; 3871 3872 /* 3873 * If we fail to add the chunk item here, we end up 3874 * trying again at phase 2 of chunk allocation, at 3875 * btrfs_create_pending_block_groups(). So ignore 3876 * any error here. 
			 * An ENOSPC here could happen due to the cases
			 * described at do_chunk_alloc() - the system block
			 * group we just created was just turned into RO mode
			 * by a scrub for example, or a running discard
			 * temporarily removed its free space entries, etc.
			 */
			btrfs_chunk_alloc_add_chunk_item(trans, bg);
		}
	}

	if (!ret) {
		ret = btrfs_block_rsv_add(fs_info,
					  &fs_info->chunk_block_rsv,
					  bytes, BTRFS_RESERVE_NO_FLUSH);
		if (!ret)
			trans->chunk_bytes_reserved += bytes;
	}
}

/*
 * Reserve space in the system space for allocating or removing a chunk.
 * The caller must be holding fs_info->chunk_mutex.
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	const u64 num_devs = get_profile_num_devs(fs_info, type);
	u64 bytes;

	/* num_devs device items to update and 1 chunk item to add or remove. */
	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
		btrfs_calc_insert_metadata_size(fs_info, 1);

	reserve_chunk_space(trans, bytes, type);
}

/*
 * Reserve space in the system space, if needed, for doing a modification to the
 * chunk btree.
 *
 * @trans:		A transaction handle.
 * @is_item_insertion:	Indicate if the modification is for inserting a new item
 *			in the chunk btree or if it's for the deletion or update
 *			of an existing item.
 *
 * This is used in a context where we need to update the chunk btree outside
 * block group allocation and removal, to avoid a deadlock with a concurrent
 * task that is allocating a metadata or data block group and therefore needs to
 * update the chunk btree while holding the chunk mutex. After the update to the
 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
 */
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
				  bool is_item_insertion)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 bytes;

	if (is_item_insertion)
		bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	else
		bytes = btrfs_calc_metadata_size(fs_info, 1);

	mutex_lock(&fs_info->chunk_mutex);
	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
	mutex_unlock(&fs_info->chunk_mutex);
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *block_group;

	block_group = btrfs_lookup_first_block_group(info, 0);
	while (block_group) {
		btrfs_wait_block_group_cache_done(block_group);
		spin_lock(&block_group->lock);
		if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
				       &block_group->runtime_flags)) {
			struct inode *inode = block_group->inode;

			block_group->inode = NULL;
			spin_unlock(&block_group->lock);

			ASSERT(block_group->io_ctl.inode == NULL);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		block_group = btrfs_next_block_group(block_group);
	}
}

/*
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
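 *
 * (Note added for clarity: this is called during filesystem shutdown, from
 * close_ctree(), once all the worker threads have already been stopped.)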
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	write_lock(&info->block_group_cache_lock);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		btrfs_put_caching_control(caching_ctl);
	}
	write_unlock(&info->block_group_cache_lock);

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		block_group = list_first_entry(&info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}

	while (!list_empty(&info->reclaim_bgs)) {
		block_group = list_first_entry(&info->reclaim_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->zone_active_bgs_lock);
	while (!list_empty(&info->zone_active_bgs)) {
		block_group = list_first_entry(&info->zone_active_bgs,
					       struct btrfs_block_group,
					       active_bg_list);
		list_del_init(&block_group->active_bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->zone_active_bgs_lock);

	write_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group,
				       cache_node);
		rb_erase_cached(&block_group->cache_node,
				&info->block_group_cache_tree);
		RB_CLEAR_NODE(&block_group->cache_node);
		write_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO ||
		    block_group->cached == BTRFS_CACHE_ERROR)
			btrfs_free_excluded_extents(block_group);

		btrfs_remove_free_space_cache(block_group);
		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
		ASSERT(list_empty(&block_group->dirty_list));
		ASSERT(list_empty(&block_group->io_list));
		ASSERT(list_empty(&block_group->bg_list));
		ASSERT(refcount_read(&block_group->refs) == 1);
		ASSERT(block_group->swap_extents == 0);
		btrfs_put_block_group(block_group);

		write_lock(&info->block_group_cache_lock);
	}
	write_unlock(&info->block_group_cache_lock);

	btrfs_release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);

		/*
		 * Do not hide this behind enospc_debug, this is actually
		 * important and indicates a real bug if this happens.
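		 *
		 * (Note added for clarity: a non-zero bytes_pinned or
		 * bytes_may_use at this point means an earlier reservation or
		 * pinned extent was never released, i.e. a leak somewhere
		 * during the lifetime of the filesystem.)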
		 */
		if (WARN_ON(space_info->bytes_pinned > 0 ||
			    space_info->bytes_may_use > 0))
			btrfs_dump_space_info(info, space_info, 0, 0);

		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on bytes_reserved > 0 in
		 * that case.
		 */
		if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
			if (WARN_ON(space_info->bytes_reserved > 0))
				btrfs_dump_space_info(info, space_info, 0, 0);
		}

		WARN_ON(space_info->reclaim_size > 0);
		list_del(&space_info->list);
		btrfs_sysfs_remove_space_info(space_info);
	}
	return 0;
}

void btrfs_freeze_block_group(struct btrfs_block_group *cache)
{
	atomic_inc(&cache->frozen);
}

void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	bool cleanup;

	spin_lock(&block_group->lock);
	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
		   test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
	spin_unlock(&block_group->lock);

	if (cleanup) {
		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, block_group->start, 1);
		BUG_ON(!em); /* logic error, can't happen */
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);

		/* once for us and once for the tree */
		free_extent_map(em);
		free_extent_map(em);

		/*
		 * We may have left one free space entry and other tasks
		 * trimming this block group may have left one entry each.
		 * Free them if any.
		 */
		btrfs_remove_free_space_cache(block_group);
	}
}

bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
{
	bool ret = true;

	spin_lock(&bg->lock);
	if (bg->ro)
		ret = false;
	else
		bg->swap_extents++;
	spin_unlock(&bg->lock);

	return ret;
}

void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
{
	spin_lock(&bg->lock);
	ASSERT(!bg->ro);
	ASSERT(bg->swap_extents >= amount);
	bg->swap_extents -= amount;
	spin_unlock(&bg->lock);
}
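
/*
 * Illustrative usage sketch for the two swap extent helpers above. This is an
 * added documentation note and is not copied from the actual swapfile code:
 *
 *	if (!btrfs_inc_block_group_swap_extents(bg))
 *		return -EINVAL;		// block group is read-only, refuse it
 *	...
 *	btrfs_dec_block_group_swap_extents(bg, nr_extents);
 *
 * Holding a non-zero swap_extents count is meant to prevent the block group
 * from being turned read-only (and therefore relocated) while a swapfile is
 * still using it.
 */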