// SPDX-License-Identifier: GPL-2.0

#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format. If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		spin_unlock(&fs_info->balance_lock);
		return extended_to_chunk(target);
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, flags);
}

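/*
 * Block group reference counting helpers. Every btrfs_get_block_group() (and
 * every lookup helper that returns a block group with a reference held) must
 * be paired with a btrfs_put_block_group(); dropping the last reference frees
 * the structure.
 */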
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
	if (refcount_dec_and_test(&cache->refs)) {
		WARN_ON(cache->pinned > 0);
		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on reserved > 0 in that
		 * case.
		 */
		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
			WARN_ON(cache->reserved > 0);

		/*
		 * A block_group shouldn't be on the discard_list anymore.
		 * Remove the block_group from the discard_list to prevent us
		 * from causing a panic due to NULL pointer dereference.
		 */
		if (WARN_ON(!list_empty(&cache->discard_list)))
			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
						  cache);

		/*
		 * If not empty, someone is still holding the mutex of
		 * full_stripe_lock, which can only be released by the caller,
		 * and that will cause a use-after-free when the caller tries
		 * to release the full stripe lock.
		 *
		 * There is no better way to resolve this, so just warn.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache->physical_map);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				       struct btrfs_block_group *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group *cache;
	bool leftmost = true;

	ASSERT(block_group->length != 0);

	write_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_root.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
		if (block_group->start < cache->start) {
			p = &(*p)->rb_left;
		} else if (block_group->start > cache->start) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			write_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color_cached(&block_group->cache_node,
			       &info->block_group_cache_tree, leftmost);

	write_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
		struct btrfs_fs_info *info, u64 bytenr, int contains)
{
	struct btrfs_block_group *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	read_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_root.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group, cache_node);
		end = cache->start + cache->length - 1;
		start = cache->start;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->start))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	read_unlock(&info->block_group_cache_lock);

	return ret;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct rb_node *node;

	read_lock(&fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->start + cache->length;

		read_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
		cache = rb_entry(node, struct btrfs_block_group, cache_node);
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	read_unlock(&fs_info->block_group_cache_lock);
	return cache;
}

/**
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:	The filesystem information object.
 * @bytenr:	Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and increment the
 * number of NOCOW writers in the block group that contains the extent, as long
 * as the block group exists and it's currently not in read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 *          is responsible for calling btrfs_dec_nocow_writers() later.
 *
 *          Or NULL if we can not do a NOCOW write
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool can_nocow = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return NULL;

	spin_lock(&bg->lock);
	if (bg->ro)
		can_nocow = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	if (!can_nocow) {
		btrfs_put_block_group(bg);
		return NULL;
	}

	/* No put on block group, done by btrfs_dec_nocow_writers(). */
	return bg;
}

/**
 * Decrement the number of NOCOW writers in a block group.
 *
 * @bg: The block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. If it wants
 * to use it, then it should get a reference on it before calling this function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);

	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(bg);
}

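/*
 * Illustrative usage sketch (not a caller in this file): a NOCOW write path
 * is expected to pair the two helpers above roughly like this:
 *
 *	bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
 *	if (bg) {
 *		... create the ordered extent for the NOCOW write ...
 *		btrfs_dec_nocow_writers(bg);  (this also drops the lookup ref)
 *	} else {
 *		... fall back to a COW write ...
 *	}
 */
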
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}

void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, start);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->reservations))
		wake_up_var(&bg->reservations);
	btrfs_put_block_group(bg);
}

void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have allocated an extent from it already, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once. So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes. Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return;

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
		   (cache->free_space_ctl->free_space >= num_bytes));

	btrfs_put_caching_control(caching_ctl);
}

static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
				       struct btrfs_caching_control *caching_ctl)
{
	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
	return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
}

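/*
 * Illustrative sketch (hypothetical caller): per the comment on
 * btrfs_wait_block_group_cache_progress(), a caller is expected to re-check
 * the caching state after the wait before trusting the free space counters:
 *
 *	btrfs_wait_block_group_cache_progress(cache, num_bytes);
 *	if (cache->cached == BTRFS_CACHE_ERROR)
 *		... treat this block group as unusable ...
 */
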
static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	btrfs_put_caching_control(caching_ctl);
	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->start;
	u64 len = block_group->length;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * This is only called by btrfs_cache_block_group(). Since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't be
 * used yet, because their free space will be released as soon as the
 * transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(&info->excluded_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space_async_trimmed(block_group,
								 start, size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space_async_trimmed(block_group, start,
							 size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
	extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space. So we skip locking and search the commit
	 * root, since it's read-only
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->start) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->start + block_group->length)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, last,
				block_group->start + block_group->length);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		ret = load_free_space_cache(block_group);
		if (ret == 1) {
			ret = 0;
			goto done;
		}

		/*
		 * We failed to load the space cache, set ourselves to
		 * CACHE_STARTED and carry on.
		 */
		spin_lock(&block_group->lock);
		block_group->cached = BTRFS_CACHE_STARTED;
		spin_unlock(&block_group->lock);
		wake_up(&caching_ctl->wait);
	}

	/*
	 * If we are in the transaction that populated the free space tree we
	 * can't actually cache from the free space tree as our commit root and
	 * real root are the same, so we could change the contents of the blocks
	 * while caching. Instead do the slow caching in this case, and after
	 * the transaction has committed we will be safe.
	 */
	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);
done:
	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->length - block_group->used;
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	btrfs_put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl = NULL;
	int ret = 0;

	/* Allocator for zoned filesystems does not use the cache at all */
	if (btrfs_is_zoned(fs_info))
		return 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->start;
	refcount_set(&caching_ctl->count, 2);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		kfree(caching_ctl);

		caching_ctl = cache->caching_ctl;
		if (caching_ctl)
			refcount_inc(&caching_ctl->count);
		spin_unlock(&cache->lock);
		goto out;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	cache->has_caching_ctl = 1;
	spin_unlock(&cache->lock);

	write_lock(&fs_info->block_group_cache_lock);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	write_unlock(&fs_info->block_group_cache_lock);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
	if (wait && caching_ctl)
		ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	if (caching_ctl)
		btrfs_put_caching_control(caching_ctl);

	return ret;
}

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
			  BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

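/*
 * Note (added for clarity): the avail_*_alloc_bits fields touched above are
 * kept in the extended profile format, where the single profile is an explicit
 * bit (BTRFS_AVAIL_ALLOC_BIT_SINGLE) rather than the absence of all RAID bits
 * as in the on-disk chunk format; chunk_to_extended() and extended_to_chunk()
 * convert between the two representations.
 */
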
/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	bool found_raid56 = false;
	bool found_raid1c34 = false;

	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
		struct list_head *head = &fs_info->space_info;
		struct btrfs_space_info *sinfo;

		list_for_each_entry_rcu(sinfo, head, list) {
			down_read(&sinfo->groups_sem);
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				found_raid1c34 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				found_raid1c34 = true;
			up_read(&sinfo->groups_sem);
		}
		if (!found_raid56)
			btrfs_clear_fs_incompat(fs_info, RAID56);
		if (!found_raid1c34)
			btrfs_clear_fs_incompat(fs_info, RAID1C34);
	}
}

static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	int ret;

	root = btrfs_block_group_root(fs_info);
	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = block_group->length;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		return ret;

	ret = btrfs_del_item(trans, root, path);
	return ret;
}

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     u64 group_start, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *path;
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	btrfs_free_excluded_extents(block_group);
	btrfs_free_ref_tree_range(fs_info, block_group->start,
				  block_group->length);

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	factor = btrfs_bg_type_to_factor(block_group->flags);

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * Make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		remove_rsv = true;
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
	if (ret)
		goto out;

	write_lock(&fs_info->block_group_cache_lock);
	rb_erase_cached(&block_group->cache_node,
			&fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	write_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = btrfs_get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		write_lock(&fs_info->block_group_cache_lock);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
					    &fs_info->caching_block_groups, list)
				if (ctl->block_group == block_group) {
					caching_ctl = ctl;
					refcount_inc(&caching_ctl->count);
					break;
				}
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		write_unlock(&fs_info->block_group_cache_lock);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			btrfs_put_caching_control(caching_ctl);
			btrfs_put_caching_control(caching_ctl);
		}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length - block_group->zone_unusable);
		WARN_ON(block_group->space_info->bytes_zone_unusable
			< block_group->zone_unusable);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
		WARN_ON(block_group->zone_is_active &&
			block_group->space_info->active_total_bytes
			< block_group->length);
	}
	block_group->space_info->total_bytes -= block_group->length;
	if (block_group->zone_is_active)
		block_group->space_info->active_total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -=
		(block_group->length - block_group->zone_unusable);
	block_group->space_info->bytes_zone_unusable -=
		block_group->zone_unusable;
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	/*
	 * Remove the free space for the block group from the free space tree
	 * and the block group's item from the extent tree before marking the
	 * block group as removed. This is to prevent races with tasks that
	 * freeze and unfreeze a block group, this task and another task
	 * allocating a new block group - the unfreeze task ends up removing
	 * the block group's extent map before the task calling this function
	 * deletes the block group item from the extent tree, allowing for
	 * another task to attempt to create another block group with the same
	 * item key (and failing with -EEXIST and a transaction abort).
	 */
	ret = remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	ret = remove_block_group_item(trans, path, block_group);
	if (ret < 0)
		goto out;

	spin_lock(&block_group->lock);
	block_group->removed = 1;
	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
	 * fs_info->block_group_cache_tree so no one can find it anymore and
	 * even if someone already got this block group before we removed it
	 * from the rbtree, they have already incremented block_group->frozen -
	 * if they didn't, for the trimming case they won't find any free space
	 * entries because we already removed them all when we called
	 * btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is needed to
	 * avoid races with trimming and scrub.
	 *
	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_em = (atomic_read(&block_group->frozen) == 0);
	spin_unlock(&block_group->lock);

	if (remove_em) {
		struct extent_map_tree *em_tree;

		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		/* once for the tree */
		free_extent_map(em);
	}

out:
	/* Once for the lookup reference */
	btrfs_put_block_group(block_group);
	if (remove_rsv)
		btrfs_delayed_refs_rsv_release(fs_info, 1);
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	unsigned int num_items;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	ASSERT(em && em->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	map = em->map_lookup;
	num_items = 3 + map->num_stripes;
	free_extent_map(em);

	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}

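/*
 * Worked example for the reservation above (illustrative): removing a chunk
 * striped over two devices means N = 2 device extent items, so 3 + 2 = 5
 * metadata units are reserved here, while the chunk tree updates are reserved
 * separately from the system space info in btrfs_remove_chunk() via
 * check_system_chunk().
 */
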
/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	int ret = -ENOSPC;

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->swap_extents) {
		ret = -ETXTBSY;
		goto out;
	}

	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	num_bytes = cache->length - cache->reserved - cache->pinned -
		    cache->bytes_super - cache->zone_unusable - cache->used;

	/*
	 * Data never overcommits, even in mixed mode, so do just the straight
	 * check of left over space in how much we have allocated.
	 */
	if (force) {
		ret = 0;
	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

		/*
		 * Here we make sure if we mark this bg RO, we still have enough
		 * free space as buffer.
		 */
		if (sinfo_used + num_bytes <= sinfo->total_bytes)
			ret = 0;
	} else {
		/*
		 * We overcommit metadata, so we need to do the
		 * btrfs_can_overcommit check here, and we need to pass in
		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
		 * leeway to allow us to mark this block group as read only.
		 */
		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
					 BTRFS_RESERVE_NO_FLUSH))
			ret = 0;
	}

	if (!ret) {
		sinfo->bytes_readonly += num_bytes;
		if (btrfs_is_zoned(cache->fs_info)) {
			/* Migrate zone_unusable bytes to readonly */
			sinfo->bytes_readonly += cache->zone_unusable;
			sinfo->bytes_zone_unusable -= cache->zone_unusable;
			cache->zone_unusable = 0;
		}
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
		btrfs_info(cache->fs_info,
			"unable to make block group %llu ro", cache->start);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
}

static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_transaction *prev_trans = NULL;
	const u64 start = bg->start;
	const u64 end = start + bg->length - 1;
	int ret;

	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->list.prev != &fs_info->trans_list) {
		prev_trans = list_last_entry(&trans->transaction->list,
					     struct btrfs_transaction, list);
		refcount_inc(&prev_trans->use_count);
	}
	spin_unlock(&fs_info->trans_lock);

	/*
	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
	 * btrfs_finish_extent_commit(). If we are at transaction N, another
	 * task might be running finish_extent_commit() for the previous
	 * transaction N - 1, and have seen a range belonging to the block
	 * group in pinned_extents before we were able to clear the whole block
	 * group range from pinned_extents. This means that task can look up
	 * the block group after we unpinned it from pinned_extents and removed
	 * it, leading to a BUG_ON() at unpin_extent_range().
	 */
	mutex_lock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans) {
		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
					EXTENT_DIRTY);
		if (ret)
			goto out;
	}

	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
				EXTENT_DIRTY);
out:
	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans)
		btrfs_put_transaction(prev_trans);

	return ret == 0;
}

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		/*
		 * Async discard moves the final block group discard to be prior
		 * to the unused_bgs code path. Therefore, if it's not fully
		 * trimmed, punt it back to the async discard lists.
		 */
		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
		    !btrfs_is_free_space_trimmed(block_group)) {
			trace_btrfs_skip_unused_block_group(block_group);
			up_write(&space_info->groups_sem);
			/* Requeue if we failed because of async discard */
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto next;
		}

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    block_group->used || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group. We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
			goto next;
		}

		ret = btrfs_zone_finish(block_group);
		if (ret < 0) {
			btrfs_dec_block_group_ro(block_group);
			if (ret == -EAGAIN)
				ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
						     block_group->start);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		if (!clean_pinned_extents(trans, block_group)) {
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}

		/*
		 * At this point, the block_group is read only and should fail
		 * new allocations. However, btrfs_finish_extent_commit() can
		 * cause this block_group to be placed back on the discard
		 * lists because now the block_group isn't fully discarded.
		 * Bail here and try again later after discarding everything.
		 */
		spin_lock(&fs_info->discard_ctl.lock);
		if (!list_empty(&block_group->discard_list)) {
			spin_unlock(&fs_info->discard_ctl.lock);
			btrfs_dec_block_group_ro(block_group);
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto end_trans;
		}
		spin_unlock(&fs_info->discard_ctl.lock);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
						     -block_group->pinned);
		space_info->bytes_readonly += block_group->pinned;
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/*
		 * The normal path here is that an unused block group is passed
		 * in and trimming is then handled in the transaction commit
		 * path. Async discard interposes before this to do the trimming
		 * before coming down the unused block group path as trimming
		 * will no longer be done later in the transaction commit path.
		 */
		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
			goto flip_async;

		/*
		 * DISCARD can flip during remount. On zoned filesystems, we
		 * need to reset sequential-required zones.
		 */
		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
				btrfs_is_zoned(fs_info);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_freeze_block_group(block_group);

		/*
		 * btrfs_remove_chunk() will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->start);

		if (ret) {
			if (trimming)
				btrfs_unfreeze_block_group(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}

void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_unused_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * We want block groups with a low number of used bytes to be in the beginning
 * of the list, so they will get reclaimed first.
 */
static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
			   const struct list_head *b)
{
	const struct btrfs_block_group *bg1, *bg2;

	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
	bg2 = list_entry(b, struct btrfs_block_group, bg_list);

	return bg1->used > bg2->used;
}

static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
{
	if (btrfs_is_zoned(fs_info))
		return btrfs_zoned_should_reclaim(fs_info);
	return true;
}

void btrfs_reclaim_bgs_work(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info =
		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
	struct btrfs_block_group *bg;
	struct btrfs_space_info *space_info;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (!btrfs_should_reclaim(fs_info))
		return;

	sb_start_write(fs_info->sb);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		sb_end_write(fs_info->sb);
		return;
	}

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip reclaim if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
		btrfs_exclop_finish(fs_info);
		sb_end_write(fs_info->sb);
		return;
	}

	spin_lock(&fs_info->unused_bgs_lock);
	/*
	 * Sort happens under lock because we can't simply splice it and sort.
	 * The block groups might still be in use and reachable via bg_list,
	 * and their presence in the reclaim_bgs list must be preserved.
	 */
	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
	while (!list_empty(&fs_info->reclaim_bgs)) {
		u64 zone_unusable;
		int ret = 0;

		bg = list_first_entry(&fs_info->reclaim_bgs,
				      struct btrfs_block_group,
				      bg_list);
		list_del_init(&bg->bg_list);

		space_info = bg->space_info;
		spin_unlock(&fs_info->unused_bgs_lock);

		/* Don't race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		spin_lock(&bg->lock);
		if (bg->reserved || bg->pinned || bg->ro) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group. We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			spin_unlock(&bg->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&bg->lock);

		/* Get out fast, in case we're unmounting the filesystem */
		if (btrfs_fs_closing(fs_info)) {
			up_write(&space_info->groups_sem);
			goto next;
		}

		/*
		 * Cache the zone_unusable value before turning the block group
		 * read-only. As soon as the block group is read-only, its
		 * zone_unusable value gets moved to the block group's read-only
		 * bytes and isn't available for calculations anymore.
		 */
		zone_unusable = bg->zone_unusable;
		ret = inc_block_group_ro(bg, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0)
			goto next;

		btrfs_info(fs_info,
			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
				bg->start, div_u64(bg->used * 100, bg->length),
				div64_u64(zone_unusable * 100, bg->length));
		trace_btrfs_reclaim_block_group(bg);
		ret = btrfs_relocate_chunk(fs_info, bg->start);
		if (ret) {
			btrfs_dec_block_group_ro(bg);
			btrfs_err(fs_info, "error relocating chunk %llu",
				  bg->start);
		}

next:
		btrfs_put_block_group(bg);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);
	sb_end_write(fs_info->sb);
}

void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
	spin_lock(&fs_info->unused_bgs_lock);
	if (!list_empty(&fs_info->reclaim_bgs))
		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
	spin_unlock(&fs_info->unused_bgs_lock);
}

void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_reclaim_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			   struct btrfs_path *path)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_block_group_item bg;
	struct extent_buffer *leaf;
	int slot;
	u64 flags;
	int ret = 0;

	slot = path->slots[0];
	leaf = path->nodes[0];

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
	read_unlock(&em_tree->lock);
	if (!em) {
		btrfs_err(fs_info,
			  "logical %llu len %llu found bg but no related chunk",
			  key->objectid, key->offset);
		return -ENOENT;
	}

	if (em->start != key->objectid || em->len != key->offset) {
		btrfs_err(fs_info,
			  "block group %llu len %llu mismatch with chunk %llu len %llu",
			  key->objectid, key->offset, em->start, em->len);
		ret = -EUCLEAN;
		goto out_free_em;
	}

	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
			   sizeof(bg));
	flags = btrfs_stack_block_group_flags(&bg) &
		BTRFS_BLOCK_GROUP_TYPE_MASK;

	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
			  key->objectid, key->offset, flags,
			  (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
		ret = -EUCLEAN;
	}

out_free_em:
	free_extent_map(em);
	return ret;
}

static int find_first_block_group(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	int ret;
	struct btrfs_key found_key;

	btrfs_for_each_slot(root, key, &found_key, path, ret) {
		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			return read_bg_from_eb(fs_info, &found_key, path);
		}
	}
	return ret;
}

static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
			  BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits |= extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/**
 * Map a physical disk address to a list of logical addresses
 *
 * @fs_info:       the filesystem
 * @chunk_start:   logical address of block group
 * @bdev:          physical device to resolve, can be NULL to indicate any device
 * @physical:      physical address to map to logical addresses
 * @logical:       return array of logical addresses which map to @physical
 * @naddrs:        length of @logical
 * @stripe_len:    size of IO stripe for the given block group
 *
 * Maps a particular @physical disk address to a list of @logical addresses.
 * Used primarily to exclude those portions of a block group that contain super
 * block copies.
 */
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     struct block_device *bdev, u64 physical, u64 **logical,
		     int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 data_stripe_length;
	u64 io_stripe_size;
	int i, nr = 0;
	int ret = 0;

	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	data_stripe_length = em->orig_block_len;
	io_stripe_size = map->stripe_len;
	chunk_start = em->start;

	/* For RAID5/6 adjust to a full IO stripe length */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		io_stripe_size = map->stripe_len * nr_data_stripes(map);

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool already_inserted = false;
		u64 stripe_nr;
		u64 offset;
		int j;

		if (!in_range(physical, map->stripes[i].physical,
			      data_stripe_length))
			continue;

		if (bdev && map->stripes[i].dev->bdev != bdev)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		}
		/*
		 * The remaining case would be for RAID56, multiply by
		 * nr_data_stripes(). Alternatively, just use rmap_len below
		 * instead of map->stripe_len
		 */

		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;

		/* Ensure we don't add duplicate addresses */
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr) {
				already_inserted = true;
				break;
			}
		}

		if (!already_inserted)
			buf[nr++] = bytenr;
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = io_stripe_size;
out:
	free_extent_map(em);
	return ret;
}

static int exclude_super_stripes(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	const bool zoned = btrfs_is_zoned(fs_info);
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
		cache->bytes_super += stripe_len;
		ret = btrfs_add_excluded_extent(fs_info, cache->start,
						stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(fs_info, cache->start, NULL,
				       bytenr, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		/* Shouldn't have super stripes in sequential zones */
		if (zoned && nr) {
			btrfs_err(fs_info,
			"zoned: block group %llu must not contain super block",
				  cache->start);
			return -EUCLEAN;
		}

		while (nr--) {
			u64 len = min_t(u64, stripe_len,
				cache->start + cache->length - logical[nr]);

			cache->bytes_super += len;
			ret = btrfs_add_excluded_extent(fs_info, logical[nr],
							len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}

static void link_block_group(struct btrfs_block_group *cache)
btrfs_block_group *cache) 1894 { 1895 struct btrfs_space_info *space_info = cache->space_info; 1896 int index = btrfs_bg_flags_to_raid_index(cache->flags); 1897 1898 down_write(&space_info->groups_sem); 1899 list_add_tail(&cache->list, &space_info->block_groups[index]); 1900 up_write(&space_info->groups_sem); 1901 } 1902 1903 static struct btrfs_block_group *btrfs_create_block_group_cache( 1904 struct btrfs_fs_info *fs_info, u64 start) 1905 { 1906 struct btrfs_block_group *cache; 1907 1908 cache = kzalloc(sizeof(*cache), GFP_NOFS); 1909 if (!cache) 1910 return NULL; 1911 1912 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 1913 GFP_NOFS); 1914 if (!cache->free_space_ctl) { 1915 kfree(cache); 1916 return NULL; 1917 } 1918 1919 cache->start = start; 1920 1921 cache->fs_info = fs_info; 1922 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 1923 1924 cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; 1925 1926 refcount_set(&cache->refs, 1); 1927 spin_lock_init(&cache->lock); 1928 init_rwsem(&cache->data_rwsem); 1929 INIT_LIST_HEAD(&cache->list); 1930 INIT_LIST_HEAD(&cache->cluster_list); 1931 INIT_LIST_HEAD(&cache->bg_list); 1932 INIT_LIST_HEAD(&cache->ro_list); 1933 INIT_LIST_HEAD(&cache->discard_list); 1934 INIT_LIST_HEAD(&cache->dirty_list); 1935 INIT_LIST_HEAD(&cache->io_list); 1936 INIT_LIST_HEAD(&cache->active_bg_list); 1937 btrfs_init_free_space_ctl(cache, cache->free_space_ctl); 1938 atomic_set(&cache->frozen, 0); 1939 mutex_init(&cache->free_space_lock); 1940 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); 1941 1942 return cache; 1943 } 1944 1945 /* 1946 * Iterate all chunks and verify that each of them has the corresponding block 1947 * group 1948 */ 1949 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 1950 { 1951 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 1952 struct extent_map *em; 1953 struct btrfs_block_group *bg; 1954 u64 start = 0; 1955 int ret = 0; 1956 1957 while (1) { 1958 read_lock(&map_tree->lock); 1959 /* 1960 * lookup_extent_mapping will return the first extent map 1961 * intersecting the range, so setting @len to 1 is enough to 1962 * get the first chunk. 
1963 */ 1964 em = lookup_extent_mapping(map_tree, start, 1); 1965 read_unlock(&map_tree->lock); 1966 if (!em) 1967 break; 1968 1969 bg = btrfs_lookup_block_group(fs_info, em->start); 1970 if (!bg) { 1971 btrfs_err(fs_info, 1972 "chunk start=%llu len=%llu doesn't have corresponding block group", 1973 em->start, em->len); 1974 ret = -EUCLEAN; 1975 free_extent_map(em); 1976 break; 1977 } 1978 if (bg->start != em->start || bg->length != em->len || 1979 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 1980 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 1981 btrfs_err(fs_info, 1982 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 1983 em->start, em->len, 1984 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 1985 bg->start, bg->length, 1986 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 1987 ret = -EUCLEAN; 1988 free_extent_map(em); 1989 btrfs_put_block_group(bg); 1990 break; 1991 } 1992 start = em->start + em->len; 1993 free_extent_map(em); 1994 btrfs_put_block_group(bg); 1995 } 1996 return ret; 1997 } 1998 1999 static int read_one_block_group(struct btrfs_fs_info *info, 2000 struct btrfs_block_group_item *bgi, 2001 const struct btrfs_key *key, 2002 int need_clear) 2003 { 2004 struct btrfs_block_group *cache; 2005 struct btrfs_space_info *space_info; 2006 const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); 2007 int ret; 2008 2009 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); 2010 2011 cache = btrfs_create_block_group_cache(info, key->objectid); 2012 if (!cache) 2013 return -ENOMEM; 2014 2015 cache->length = key->offset; 2016 cache->used = btrfs_stack_block_group_used(bgi); 2017 cache->flags = btrfs_stack_block_group_flags(bgi); 2018 cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); 2019 2020 set_free_space_tree_thresholds(cache); 2021 2022 if (need_clear) { 2023 /* 2024 * When we mount with old space cache, we need to 2025 * set BTRFS_DC_CLEAR and set dirty flag. 2026 * 2027 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 2028 * truncate the old free space cache inode and 2029 * setup a new one. 2030 * b) Setting 'dirty flag' makes sure that we flush 2031 * the new space cache info onto disk. 2032 */ 2033 if (btrfs_test_opt(info, SPACE_CACHE)) 2034 cache->disk_cache_state = BTRFS_DC_CLEAR; 2035 } 2036 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 2037 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 2038 btrfs_err(info, 2039 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 2040 cache->start); 2041 ret = -EINVAL; 2042 goto error; 2043 } 2044 2045 ret = btrfs_load_block_group_zone_info(cache, false); 2046 if (ret) { 2047 btrfs_err(info, "zoned: failed to load zone info of bg %llu", 2048 cache->start); 2049 goto error; 2050 } 2051 2052 /* 2053 * We need to exclude the super stripes now so that the space info has 2054 * super bytes accounted for, otherwise we'll think we have more space 2055 * than we actually do. 2056 */ 2057 ret = exclude_super_stripes(cache); 2058 if (ret) { 2059 /* We may have excluded something, so call this just in case. */ 2060 btrfs_free_excluded_extents(cache); 2061 goto error; 2062 } 2063 2064 /* 2065 * For zoned filesystem, space after the allocation offset is the only 2066 * free space for a block group. So, we don't need any caching work. 2067 * btrfs_calc_zone_unusable() will set the amount of free space and 2068 * zone_unusable space. 
2069 * 2070 * For regular filesystem, check for two cases, either we are full, and 2071 * therefore don't need to bother with the caching work since we won't 2072 * find any space, or we are empty, and we can just add all the space 2073 * in and be done with it. This saves us _a_lot_ of time, particularly 2074 * in the full case. 2075 */ 2076 if (btrfs_is_zoned(info)) { 2077 btrfs_calc_zone_unusable(cache); 2078 /* Should not have any excluded extents. Just in case, though. */ 2079 btrfs_free_excluded_extents(cache); 2080 } else if (cache->length == cache->used) { 2081 cache->last_byte_to_unpin = (u64)-1; 2082 cache->cached = BTRFS_CACHE_FINISHED; 2083 btrfs_free_excluded_extents(cache); 2084 } else if (cache->used == 0) { 2085 cache->last_byte_to_unpin = (u64)-1; 2086 cache->cached = BTRFS_CACHE_FINISHED; 2087 add_new_free_space(cache, cache->start, 2088 cache->start + cache->length); 2089 btrfs_free_excluded_extents(cache); 2090 } 2091 2092 ret = btrfs_add_block_group_cache(info, cache); 2093 if (ret) { 2094 btrfs_remove_free_space_cache(cache); 2095 goto error; 2096 } 2097 trace_btrfs_add_block_group(info, cache, 0); 2098 btrfs_update_space_info(info, cache->flags, cache->length, 2099 cache->used, cache->bytes_super, 2100 cache->zone_unusable, cache->zone_is_active, 2101 &space_info); 2102 2103 cache->space_info = space_info; 2104 2105 link_block_group(cache); 2106 2107 set_avail_alloc_bits(info, cache->flags); 2108 if (btrfs_chunk_writeable(info, cache->start)) { 2109 if (cache->used == 0) { 2110 ASSERT(list_empty(&cache->bg_list)); 2111 if (btrfs_test_opt(info, DISCARD_ASYNC)) 2112 btrfs_discard_queue_work(&info->discard_ctl, cache); 2113 else 2114 btrfs_mark_bg_unused(cache); 2115 } 2116 } else { 2117 inc_block_group_ro(cache, 1); 2118 } 2119 2120 return 0; 2121 error: 2122 btrfs_put_block_group(cache); 2123 return ret; 2124 } 2125 2126 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) 2127 { 2128 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 2129 struct btrfs_space_info *space_info; 2130 struct rb_node *node; 2131 int ret = 0; 2132 2133 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 2134 struct extent_map *em; 2135 struct map_lookup *map; 2136 struct btrfs_block_group *bg; 2137 2138 em = rb_entry(node, struct extent_map, rb_node); 2139 map = em->map_lookup; 2140 bg = btrfs_create_block_group_cache(fs_info, em->start); 2141 if (!bg) { 2142 ret = -ENOMEM; 2143 break; 2144 } 2145 2146 /* Fill dummy cache as FULL */ 2147 bg->length = em->len; 2148 bg->flags = map->type; 2149 bg->last_byte_to_unpin = (u64)-1; 2150 bg->cached = BTRFS_CACHE_FINISHED; 2151 bg->used = em->len; 2153 ret = btrfs_add_block_group_cache(fs_info, bg); 2154 /* 2155 * We may have some valid block group cache added already, in 2156 * that case we skip to the next one.
2157 */ 2158 if (ret == -EEXIST) { 2159 ret = 0; 2160 btrfs_put_block_group(bg); 2161 continue; 2162 } 2163 2164 if (ret) { 2165 btrfs_remove_free_space_cache(bg); 2166 btrfs_put_block_group(bg); 2167 break; 2168 } 2169 2170 btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, 2171 0, 0, false, &space_info); 2172 bg->space_info = space_info; 2173 link_block_group(bg); 2174 2175 set_avail_alloc_bits(fs_info, bg->flags); 2176 } 2177 if (!ret) 2178 btrfs_init_global_block_rsv(fs_info); 2179 return ret; 2180 } 2181 2182 int btrfs_read_block_groups(struct btrfs_fs_info *info) 2183 { 2184 struct btrfs_root *root = btrfs_block_group_root(info); 2185 struct btrfs_path *path; 2186 int ret; 2187 struct btrfs_block_group *cache; 2188 struct btrfs_space_info *space_info; 2189 struct btrfs_key key; 2190 int need_clear = 0; 2191 u64 cache_gen; 2192 2193 if (!root) 2194 return fill_dummy_bgs(info); 2195 2196 key.objectid = 0; 2197 key.offset = 0; 2198 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2199 path = btrfs_alloc_path(); 2200 if (!path) 2201 return -ENOMEM; 2202 2203 cache_gen = btrfs_super_cache_generation(info->super_copy); 2204 if (btrfs_test_opt(info, SPACE_CACHE) && 2205 btrfs_super_generation(info->super_copy) != cache_gen) 2206 need_clear = 1; 2207 if (btrfs_test_opt(info, CLEAR_CACHE)) 2208 need_clear = 1; 2209 2210 while (1) { 2211 struct btrfs_block_group_item bgi; 2212 struct extent_buffer *leaf; 2213 int slot; 2214 2215 ret = find_first_block_group(info, path, &key); 2216 if (ret > 0) 2217 break; 2218 if (ret != 0) 2219 goto error; 2220 2221 leaf = path->nodes[0]; 2222 slot = path->slots[0]; 2223 2224 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 2225 sizeof(bgi)); 2226 2227 btrfs_item_key_to_cpu(leaf, &key, slot); 2228 btrfs_release_path(path); 2229 ret = read_one_block_group(info, &bgi, &key, need_clear); 2230 if (ret < 0) 2231 goto error; 2232 key.objectid += key.offset; 2233 key.offset = 0; 2234 } 2235 btrfs_release_path(path); 2236 2237 list_for_each_entry(space_info, &info->space_info, list) { 2238 int i; 2239 2240 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2241 if (list_empty(&space_info->block_groups[i])) 2242 continue; 2243 cache = list_first_entry(&space_info->block_groups[i], 2244 struct btrfs_block_group, 2245 list); 2246 btrfs_sysfs_add_block_group_type(cache); 2247 } 2248 2249 if (!(btrfs_get_alloc_profile(info, space_info->flags) & 2250 (BTRFS_BLOCK_GROUP_RAID10 | 2251 BTRFS_BLOCK_GROUP_RAID1_MASK | 2252 BTRFS_BLOCK_GROUP_RAID56_MASK | 2253 BTRFS_BLOCK_GROUP_DUP))) 2254 continue; 2255 /* 2256 * Avoid allocating from un-mirrored block group if there are 2257 * mirrored block groups. 2258 */ 2259 list_for_each_entry(cache, 2260 &space_info->block_groups[BTRFS_RAID_RAID0], 2261 list) 2262 inc_block_group_ro(cache, 1); 2263 list_for_each_entry(cache, 2264 &space_info->block_groups[BTRFS_RAID_SINGLE], 2265 list) 2266 inc_block_group_ro(cache, 1); 2267 } 2268 2269 btrfs_init_global_block_rsv(info); 2270 ret = check_chunk_block_group_mappings(info); 2271 error: 2272 btrfs_free_path(path); 2273 /* 2274 * We've hit some error while reading the extent tree, and have 2275 * rescue=ibadroots mount option. 2276 * Try to fill the tree using dummy block groups so that the user can 2277 * continue to mount and grab their data. 2278 */ 2279 if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) 2280 ret = fill_dummy_bgs(info); 2281 return ret; 2282 } 2283 2284 /* 2285 * This function, insert_block_group_item(), belongs to the phase 2 of chunk 2286 * allocation. 
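 *
 * As a quick reference, derived from the code below (not an addition to the
 * on-disk format): the item is keyed as
 *
 *   (objectid = block_group->start, type = BTRFS_BLOCK_GROUP_ITEM_KEY,
 *    offset = block_group->length)
 *
 * and its payload is a struct btrfs_block_group_item carrying the used
 * bytes, the chunk objectid / global root id and the block group flags.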
2287 * 2288 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2289 * phases. 2290 */ 2291 static int insert_block_group_item(struct btrfs_trans_handle *trans, 2292 struct btrfs_block_group *block_group) 2293 { 2294 struct btrfs_fs_info *fs_info = trans->fs_info; 2295 struct btrfs_block_group_item bgi; 2296 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2297 struct btrfs_key key; 2298 2299 spin_lock(&block_group->lock); 2300 btrfs_set_stack_block_group_used(&bgi, block_group->used); 2301 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2302 block_group->global_root_id); 2303 btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 2304 key.objectid = block_group->start; 2305 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2306 key.offset = block_group->length; 2307 spin_unlock(&block_group->lock); 2308 2309 return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); 2310 } 2311 2312 static int insert_dev_extent(struct btrfs_trans_handle *trans, 2313 struct btrfs_device *device, u64 chunk_offset, 2314 u64 start, u64 num_bytes) 2315 { 2316 struct btrfs_fs_info *fs_info = device->fs_info; 2317 struct btrfs_root *root = fs_info->dev_root; 2318 struct btrfs_path *path; 2319 struct btrfs_dev_extent *extent; 2320 struct extent_buffer *leaf; 2321 struct btrfs_key key; 2322 int ret; 2323 2324 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 2325 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 2326 path = btrfs_alloc_path(); 2327 if (!path) 2328 return -ENOMEM; 2329 2330 key.objectid = device->devid; 2331 key.type = BTRFS_DEV_EXTENT_KEY; 2332 key.offset = start; 2333 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); 2334 if (ret) 2335 goto out; 2336 2337 leaf = path->nodes[0]; 2338 extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 2339 btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); 2340 btrfs_set_dev_extent_chunk_objectid(leaf, extent, 2341 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2342 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 2343 2344 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 2345 btrfs_mark_buffer_dirty(leaf); 2346 out: 2347 btrfs_free_path(path); 2348 return ret; 2349 } 2350 2351 /* 2352 * This function belongs to phase 2. 2353 * 2354 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2355 * phases. 2356 */ 2357 static int insert_dev_extents(struct btrfs_trans_handle *trans, 2358 u64 chunk_offset, u64 chunk_size) 2359 { 2360 struct btrfs_fs_info *fs_info = trans->fs_info; 2361 struct btrfs_device *device; 2362 struct extent_map *em; 2363 struct map_lookup *map; 2364 u64 dev_offset; 2365 u64 stripe_size; 2366 int i; 2367 int ret = 0; 2368 2369 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 2370 if (IS_ERR(em)) 2371 return PTR_ERR(em); 2372 2373 map = em->map_lookup; 2374 stripe_size = em->orig_block_len; 2375 2376 /* 2377 * Take the device list mutex to prevent races with the final phase of 2378 * a device replace operation that replaces the device object associated 2379 * with the map's stripes, because the device object's id can change 2380 * at any time during that final phase of the device replace operation 2381 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 2382 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 2383 * resulting in persisting a device extent item with such ID. 
2384 */ 2385 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2386 for (i = 0; i < map->num_stripes; i++) { 2387 device = map->stripes[i].dev; 2388 dev_offset = map->stripes[i].physical; 2389 2390 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, 2391 stripe_size); 2392 if (ret) 2393 break; 2394 } 2395 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2396 2397 free_extent_map(em); 2398 return ret; 2399 } 2400 2401 /* 2402 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of 2403 * chunk allocation. 2404 * 2405 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2406 * phases. 2407 */ 2408 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 2409 { 2410 struct btrfs_fs_info *fs_info = trans->fs_info; 2411 struct btrfs_block_group *block_group; 2412 int ret = 0; 2413 2414 while (!list_empty(&trans->new_bgs)) { 2415 int index; 2416 2417 block_group = list_first_entry(&trans->new_bgs, 2418 struct btrfs_block_group, 2419 bg_list); 2420 if (ret) 2421 goto next; 2422 2423 index = btrfs_bg_flags_to_raid_index(block_group->flags); 2424 2425 ret = insert_block_group_item(trans, block_group); 2426 if (ret) 2427 btrfs_abort_transaction(trans, ret); 2428 if (!block_group->chunk_item_inserted) { 2429 mutex_lock(&fs_info->chunk_mutex); 2430 ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); 2431 mutex_unlock(&fs_info->chunk_mutex); 2432 if (ret) 2433 btrfs_abort_transaction(trans, ret); 2434 } 2435 ret = insert_dev_extents(trans, block_group->start, 2436 block_group->length); 2437 if (ret) 2438 btrfs_abort_transaction(trans, ret); 2439 add_block_group_free_space(trans, block_group); 2440 2441 /* 2442 * If we restriped during balance, we may have added a new raid 2443 * type, so now add the sysfs entries when it is safe to do so. 2444 * We don't have to worry about locking here as it's handled in 2445 * btrfs_sysfs_add_block_group_type. 2446 */ 2447 if (block_group->space_info->block_group_kobjs[index] == NULL) 2448 btrfs_sysfs_add_block_group_type(block_group); 2449 2450 /* Already aborted the transaction if it failed. */ 2451 next: 2452 btrfs_delayed_refs_rsv_release(fs_info, 1); 2453 list_del_init(&block_group->bg_list); 2454 } 2455 btrfs_trans_release_chunk_metadata(trans); 2456 } 2457 2458 /* 2459 * For extent tree v2 we use the block_group_item->chunk_offset to point at our 2460 * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. 2461 */ 2462 static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) 2463 { 2464 u64 div = SZ_1G; 2465 u64 index; 2466 2467 if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 2468 return BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2469 2470 /* If we have a smaller fs index based on 128MiB. 
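 * Worked example (illustrative values, not from the original source): on a
 * filesystem larger than 10GiB, div stays at 1GiB, so a block group at
 * offset 5GiB with nr_global_roots == 4 gets index (5GiB / 1GiB) % 4 == 1.
 * On a filesystem of 10GiB or less the granularity drops to 128MiB instead.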
*/ 2471 if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) 2472 div = SZ_128M; 2473 2474 offset = div64_u64(offset, div); 2475 div64_u64_rem(offset, fs_info->nr_global_roots, &index); 2476 return index; 2477 } 2478 2479 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, 2480 u64 bytes_used, u64 type, 2481 u64 chunk_offset, u64 size) 2482 { 2483 struct btrfs_fs_info *fs_info = trans->fs_info; 2484 struct btrfs_block_group *cache; 2485 int ret; 2486 2487 btrfs_set_log_full_commit(trans); 2488 2489 cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 2490 if (!cache) 2491 return ERR_PTR(-ENOMEM); 2492 2493 cache->length = size; 2494 set_free_space_tree_thresholds(cache); 2495 cache->used = bytes_used; 2496 cache->flags = type; 2497 cache->last_byte_to_unpin = (u64)-1; 2498 cache->cached = BTRFS_CACHE_FINISHED; 2499 cache->global_root_id = calculate_global_root_id(fs_info, cache->start); 2500 2501 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 2502 cache->needs_free_space = 1; 2503 2504 ret = btrfs_load_block_group_zone_info(cache, true); 2505 if (ret) { 2506 btrfs_put_block_group(cache); 2507 return ERR_PTR(ret); 2508 } 2509 2510 ret = exclude_super_stripes(cache); 2511 if (ret) { 2512 /* We may have excluded something, so call this just in case */ 2513 btrfs_free_excluded_extents(cache); 2514 btrfs_put_block_group(cache); 2515 return ERR_PTR(ret); 2516 } 2517 2518 add_new_free_space(cache, chunk_offset, chunk_offset + size); 2519 2520 btrfs_free_excluded_extents(cache); 2521 2522 #ifdef CONFIG_BTRFS_DEBUG 2523 if (btrfs_should_fragment_free_space(cache)) { 2524 u64 new_bytes_used = size - bytes_used; 2525 2526 bytes_used += new_bytes_used >> 1; 2527 fragment_free_space(cache); 2528 } 2529 #endif 2530 /* 2531 * Ensure the corresponding space_info object is created and 2532 * assigned to our block group. We want our bg to be added to the rbtree 2533 * with its ->space_info set. 2534 */ 2535 cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 2536 ASSERT(cache->space_info); 2537 2538 ret = btrfs_add_block_group_cache(fs_info, cache); 2539 if (ret) { 2540 btrfs_remove_free_space_cache(cache); 2541 btrfs_put_block_group(cache); 2542 return ERR_PTR(ret); 2543 } 2544 2545 /* 2546 * Now that our block group has its ->space_info set and is inserted in 2547 * the rbtree, update the space info's counters. 2548 */ 2549 trace_btrfs_add_block_group(fs_info, cache, 1); 2550 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, 2551 cache->bytes_super, cache->zone_unusable, 2552 cache->zone_is_active, &cache->space_info); 2553 btrfs_update_global_block_rsv(fs_info); 2554 2555 link_block_group(cache); 2556 2557 list_add_tail(&cache->bg_list, &trans->new_bgs); 2558 trans->delayed_ref_updates++; 2559 btrfs_update_delayed_refs_rsv(trans); 2560 2561 set_avail_alloc_bits(fs_info, type); 2562 return cache; 2563 } 2564 2565 /* 2566 * Mark one block group RO, can be called several times for the same block 2567 * group. 2568 * 2569 * @cache: the destination block group 2570 * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to 2571 * ensure we still have some free space after marking this 2572 * block group RO. 
2573 */ 2574 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, 2575 bool do_chunk_alloc) 2576 { 2577 struct btrfs_fs_info *fs_info = cache->fs_info; 2578 struct btrfs_trans_handle *trans; 2579 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2580 u64 alloc_flags; 2581 int ret; 2582 bool dirty_bg_running; 2583 2584 /* 2585 * This can only happen when we are doing read-only scrub on read-only 2586 * mount. 2587 * In that case we should not start a new transaction on read-only fs. 2588 * Thus here we skip all chunk allocations. 2589 */ 2590 if (sb_rdonly(fs_info->sb)) { 2591 mutex_lock(&fs_info->ro_block_group_mutex); 2592 ret = inc_block_group_ro(cache, 0); 2593 mutex_unlock(&fs_info->ro_block_group_mutex); 2594 return ret; 2595 } 2596 2597 do { 2598 trans = btrfs_join_transaction(root); 2599 if (IS_ERR(trans)) 2600 return PTR_ERR(trans); 2601 2602 dirty_bg_running = false; 2603 2604 /* 2605 * We're not allowed to set block groups readonly after the dirty 2606 * block group cache has started writing. If it already started, 2607 * back off and let this transaction commit. 2608 */ 2609 mutex_lock(&fs_info->ro_block_group_mutex); 2610 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 2611 u64 transid = trans->transid; 2612 2613 mutex_unlock(&fs_info->ro_block_group_mutex); 2614 btrfs_end_transaction(trans); 2615 2616 ret = btrfs_wait_for_commit(fs_info, transid); 2617 if (ret) 2618 return ret; 2619 dirty_bg_running = true; 2620 } 2621 } while (dirty_bg_running); 2622 2623 if (do_chunk_alloc) { 2624 /* 2625 * If we are changing raid levels, try to allocate a 2626 * corresponding block group with the new raid level. 2627 */ 2628 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2629 if (alloc_flags != cache->flags) { 2630 ret = btrfs_chunk_alloc(trans, alloc_flags, 2631 CHUNK_ALLOC_FORCE); 2632 /* 2633 * ENOSPC is allowed here, we may have enough space 2634 * already allocated at the new raid level to carry on 2635 */ 2636 if (ret == -ENOSPC) 2637 ret = 0; 2638 if (ret < 0) 2639 goto out; 2640 } 2641 } 2642 2643 ret = inc_block_group_ro(cache, 0); 2644 if (!do_chunk_alloc || ret == -ETXTBSY) 2645 goto unlock_out; 2646 if (!ret) 2647 goto out; 2648 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); 2649 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 2650 if (ret < 0) 2651 goto out; 2652 /* 2653 * We have allocated a new chunk. We also need to activate that chunk to 2654 * grant metadata tickets for zoned filesystem. 
2655 */ 2656 ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); 2657 if (ret < 0) 2658 goto out; 2659 2660 ret = inc_block_group_ro(cache, 0); 2661 if (ret == -ETXTBSY) 2662 goto unlock_out; 2663 out: 2664 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 2665 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2666 mutex_lock(&fs_info->chunk_mutex); 2667 check_system_chunk(trans, alloc_flags); 2668 mutex_unlock(&fs_info->chunk_mutex); 2669 } 2670 unlock_out: 2671 mutex_unlock(&fs_info->ro_block_group_mutex); 2672 2673 btrfs_end_transaction(trans); 2674 return ret; 2675 } 2676 2677 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 2678 { 2679 struct btrfs_space_info *sinfo = cache->space_info; 2680 u64 num_bytes; 2681 2682 BUG_ON(!cache->ro); 2683 2684 spin_lock(&sinfo->lock); 2685 spin_lock(&cache->lock); 2686 if (!--cache->ro) { 2687 if (btrfs_is_zoned(cache->fs_info)) { 2688 /* Migrate zone_unusable bytes back */ 2689 cache->zone_unusable = 2690 (cache->alloc_offset - cache->used) + 2691 (cache->length - cache->zone_capacity); 2692 sinfo->bytes_zone_unusable += cache->zone_unusable; 2693 sinfo->bytes_readonly -= cache->zone_unusable; 2694 } 2695 num_bytes = cache->length - cache->reserved - 2696 cache->pinned - cache->bytes_super - 2697 cache->zone_unusable - cache->used; 2698 sinfo->bytes_readonly -= num_bytes; 2699 list_del_init(&cache->ro_list); 2700 } 2701 spin_unlock(&cache->lock); 2702 spin_unlock(&sinfo->lock); 2703 } 2704 2705 static int update_block_group_item(struct btrfs_trans_handle *trans, 2706 struct btrfs_path *path, 2707 struct btrfs_block_group *cache) 2708 { 2709 struct btrfs_fs_info *fs_info = trans->fs_info; 2710 int ret; 2711 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2712 unsigned long bi; 2713 struct extent_buffer *leaf; 2714 struct btrfs_block_group_item bgi; 2715 struct btrfs_key key; 2716 2717 key.objectid = cache->start; 2718 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2719 key.offset = cache->length; 2720 2721 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2722 if (ret) { 2723 if (ret > 0) 2724 ret = -ENOENT; 2725 goto fail; 2726 } 2727 2728 leaf = path->nodes[0]; 2729 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2730 btrfs_set_stack_block_group_used(&bgi, cache->used); 2731 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2732 cache->global_root_id); 2733 btrfs_set_stack_block_group_flags(&bgi, cache->flags); 2734 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 2735 btrfs_mark_buffer_dirty(leaf); 2736 fail: 2737 btrfs_release_path(path); 2738 return ret; 2739 2740 } 2741 2742 static int cache_save_setup(struct btrfs_block_group *block_group, 2743 struct btrfs_trans_handle *trans, 2744 struct btrfs_path *path) 2745 { 2746 struct btrfs_fs_info *fs_info = block_group->fs_info; 2747 struct btrfs_root *root = fs_info->tree_root; 2748 struct inode *inode = NULL; 2749 struct extent_changeset *data_reserved = NULL; 2750 u64 alloc_hint = 0; 2751 int dcs = BTRFS_DC_ERROR; 2752 u64 cache_size = 0; 2753 int retries = 0; 2754 int ret = 0; 2755 2756 if (!btrfs_test_opt(fs_info, SPACE_CACHE)) 2757 return 0; 2758 2759 /* 2760 * If this block group is smaller than 100 megs don't bother caching the 2761 * block group. 
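 * (For example, a 64MiB block group is simply marked BTRFS_DC_WRITTEN below
 * and no free space cache inode is created for it.)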
2762 */ 2763 if (block_group->length < (100 * SZ_1M)) { 2764 spin_lock(&block_group->lock); 2765 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 2766 spin_unlock(&block_group->lock); 2767 return 0; 2768 } 2769 2770 if (TRANS_ABORTED(trans)) 2771 return 0; 2772 again: 2773 inode = lookup_free_space_inode(block_group, path); 2774 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 2775 ret = PTR_ERR(inode); 2776 btrfs_release_path(path); 2777 goto out; 2778 } 2779 2780 if (IS_ERR(inode)) { 2781 BUG_ON(retries); 2782 retries++; 2783 2784 if (block_group->ro) 2785 goto out_free; 2786 2787 ret = create_free_space_inode(trans, block_group, path); 2788 if (ret) 2789 goto out_free; 2790 goto again; 2791 } 2792 2793 /* 2794 * We want to set the generation to 0, that way if anything goes wrong 2795 * from here on out we know not to trust this cache when we load up next 2796 * time. 2797 */ 2798 BTRFS_I(inode)->generation = 0; 2799 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 2800 if (ret) { 2801 /* 2802 * So theoretically we could recover from this, simply set the 2803 * super cache generation to 0 so we know to invalidate the 2804 * cache, but then we'd have to keep track of the block groups 2805 * that fail this way so we know we _have_ to reset this cache 2806 * before the next commit or risk reading stale cache. So to 2807 * limit our exposure to horrible edge cases lets just abort the 2808 * transaction, this only happens in really bad situations 2809 * anyway. 2810 */ 2811 btrfs_abort_transaction(trans, ret); 2812 goto out_put; 2813 } 2814 WARN_ON(ret); 2815 2816 /* We've already setup this transaction, go ahead and exit */ 2817 if (block_group->cache_generation == trans->transid && 2818 i_size_read(inode)) { 2819 dcs = BTRFS_DC_SETUP; 2820 goto out_put; 2821 } 2822 2823 if (i_size_read(inode) > 0) { 2824 ret = btrfs_check_trunc_cache_free_space(fs_info, 2825 &fs_info->global_block_rsv); 2826 if (ret) 2827 goto out_put; 2828 2829 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 2830 if (ret) 2831 goto out_put; 2832 } 2833 2834 spin_lock(&block_group->lock); 2835 if (block_group->cached != BTRFS_CACHE_FINISHED || 2836 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 2837 /* 2838 * don't bother trying to write stuff out _if_ 2839 * a) we're not cached, 2840 * b) we're with nospace_cache mount option, 2841 * c) we're with v2 space_cache (FREE_SPACE_TREE). 2842 */ 2843 dcs = BTRFS_DC_WRITTEN; 2844 spin_unlock(&block_group->lock); 2845 goto out_put; 2846 } 2847 spin_unlock(&block_group->lock); 2848 2849 /* 2850 * We hit an ENOSPC when setting up the cache in this transaction, just 2851 * skip doing the setup, we've already cleared the cache so we're safe. 2852 */ 2853 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 2854 ret = -ENOSPC; 2855 goto out_put; 2856 } 2857 2858 /* 2859 * Try to preallocate enough space based on how big the block group is. 2860 * Keep in mind this has to include any pinned space which could end up 2861 * taking up quite a bit since it's not folded into the other space 2862 * cache. 
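 *
 * Worked example (illustrative, assuming a 4KiB sectorsize): a 1GiB block
 * group gives cache_size = (1GiB / 256MiB) * 16 * 4096 bytes = 256KiB of
 * preallocated space for its free space cache file.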
2863 */ 2864 cache_size = div_u64(block_group->length, SZ_256M); 2865 if (!cache_size) 2866 cache_size = 1; 2867 2868 cache_size *= 16; 2869 cache_size *= fs_info->sectorsize; 2870 2871 ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, 2872 cache_size); 2873 if (ret) 2874 goto out_put; 2875 2876 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, 2877 cache_size, cache_size, 2878 &alloc_hint); 2879 /* 2880 * Our cache requires contiguous chunks so that we don't modify a bunch 2881 * of metadata or split extents when writing the cache out, which means 2882 * we can enospc if we are heavily fragmented in addition to just normal 2883 * out of space conditions. So if we hit this just skip setting up any 2884 * other block groups for this transaction, maybe we'll unpin enough 2885 * space the next time around. 2886 */ 2887 if (!ret) 2888 dcs = BTRFS_DC_SETUP; 2889 else if (ret == -ENOSPC) 2890 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 2891 2892 out_put: 2893 iput(inode); 2894 out_free: 2895 btrfs_release_path(path); 2896 out: 2897 spin_lock(&block_group->lock); 2898 if (!ret && dcs == BTRFS_DC_SETUP) 2899 block_group->cache_generation = trans->transid; 2900 block_group->disk_cache_state = dcs; 2901 spin_unlock(&block_group->lock); 2902 2903 extent_changeset_free(data_reserved); 2904 return ret; 2905 } 2906 2907 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 2908 { 2909 struct btrfs_fs_info *fs_info = trans->fs_info; 2910 struct btrfs_block_group *cache, *tmp; 2911 struct btrfs_transaction *cur_trans = trans->transaction; 2912 struct btrfs_path *path; 2913 2914 if (list_empty(&cur_trans->dirty_bgs) || 2915 !btrfs_test_opt(fs_info, SPACE_CACHE)) 2916 return 0; 2917 2918 path = btrfs_alloc_path(); 2919 if (!path) 2920 return -ENOMEM; 2921 2922 /* Could add new block groups, use _safe just in case */ 2923 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 2924 dirty_list) { 2925 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 2926 cache_save_setup(cache, trans, path); 2927 } 2928 2929 btrfs_free_path(path); 2930 return 0; 2931 } 2932 2933 /* 2934 * Transaction commit does final block group cache writeback during a critical 2935 * section where nothing is allowed to change the FS. This is required in 2936 * order for the cache to actually match the block group, but can introduce a 2937 * lot of latency into the commit. 2938 * 2939 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 2940 * There's a chance we'll have to redo some of it if the block group changes 2941 * again during the commit, but it greatly reduces the commit latency by 2942 * getting rid of the easy block groups while we're still allowing others to 2943 * join the commit. 
2944 */ 2945 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 2946 { 2947 struct btrfs_fs_info *fs_info = trans->fs_info; 2948 struct btrfs_block_group *cache; 2949 struct btrfs_transaction *cur_trans = trans->transaction; 2950 int ret = 0; 2951 int should_put; 2952 struct btrfs_path *path = NULL; 2953 LIST_HEAD(dirty); 2954 struct list_head *io = &cur_trans->io_bgs; 2955 int loops = 0; 2956 2957 spin_lock(&cur_trans->dirty_bgs_lock); 2958 if (list_empty(&cur_trans->dirty_bgs)) { 2959 spin_unlock(&cur_trans->dirty_bgs_lock); 2960 return 0; 2961 } 2962 list_splice_init(&cur_trans->dirty_bgs, &dirty); 2963 spin_unlock(&cur_trans->dirty_bgs_lock); 2964 2965 again: 2966 /* Make sure all the block groups on our dirty list actually exist */ 2967 btrfs_create_pending_block_groups(trans); 2968 2969 if (!path) { 2970 path = btrfs_alloc_path(); 2971 if (!path) { 2972 ret = -ENOMEM; 2973 goto out; 2974 } 2975 } 2976 2977 /* 2978 * cache_write_mutex is here only to save us from balance or automatic 2979 * removal of empty block groups deleting this block group while we are 2980 * writing out the cache 2981 */ 2982 mutex_lock(&trans->transaction->cache_write_mutex); 2983 while (!list_empty(&dirty)) { 2984 bool drop_reserve = true; 2985 2986 cache = list_first_entry(&dirty, struct btrfs_block_group, 2987 dirty_list); 2988 /* 2989 * This can happen if something re-dirties a block group that 2990 * is already under IO. Just wait for it to finish and then do 2991 * it all again 2992 */ 2993 if (!list_empty(&cache->io_list)) { 2994 list_del_init(&cache->io_list); 2995 btrfs_wait_cache_io(trans, cache, path); 2996 btrfs_put_block_group(cache); 2997 } 2998 2999 3000 /* 3001 * btrfs_wait_cache_io uses the cache->dirty_list to decide if 3002 * it should update the cache_state. Don't delete until after 3003 * we wait. 3004 * 3005 * Since we're not running in the commit critical section 3006 * we need the dirty_bgs_lock to protect from update_block_group 3007 */ 3008 spin_lock(&cur_trans->dirty_bgs_lock); 3009 list_del_init(&cache->dirty_list); 3010 spin_unlock(&cur_trans->dirty_bgs_lock); 3011 3012 should_put = 1; 3013 3014 cache_save_setup(cache, trans, path); 3015 3016 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3017 cache->io_ctl.inode = NULL; 3018 ret = btrfs_write_out_cache(trans, cache, path); 3019 if (ret == 0 && cache->io_ctl.inode) { 3020 should_put = 0; 3021 3022 /* 3023 * The cache_write_mutex is protecting the 3024 * io_list, also refer to the definition of 3025 * btrfs_transaction::io_bgs for more details 3026 */ 3027 list_add_tail(&cache->io_list, io); 3028 } else { 3029 /* 3030 * If we failed to write the cache, the 3031 * generation will be bad and life goes on 3032 */ 3033 ret = 0; 3034 } 3035 } 3036 if (!ret) { 3037 ret = update_block_group_item(trans, path, cache); 3038 /* 3039 * Our block group might still be attached to the list 3040 * of new block groups in the transaction handle of some 3041 * other task (struct btrfs_trans_handle->new_bgs). This 3042 * means its block group item isn't yet in the extent 3043 * tree. If this happens ignore the error, as we will 3044 * try again later in the critical section of the 3045 * transaction commit. 
3046 */ 3047 if (ret == -ENOENT) { 3048 ret = 0; 3049 spin_lock(&cur_trans->dirty_bgs_lock); 3050 if (list_empty(&cache->dirty_list)) { 3051 list_add_tail(&cache->dirty_list, 3052 &cur_trans->dirty_bgs); 3053 btrfs_get_block_group(cache); 3054 drop_reserve = false; 3055 } 3056 spin_unlock(&cur_trans->dirty_bgs_lock); 3057 } else if (ret) { 3058 btrfs_abort_transaction(trans, ret); 3059 } 3060 } 3061 3062 /* If it's not on the io list, we need to put the block group */ 3063 if (should_put) 3064 btrfs_put_block_group(cache); 3065 if (drop_reserve) 3066 btrfs_delayed_refs_rsv_release(fs_info, 1); 3067 /* 3068 * Avoid blocking other tasks for too long. It might even save 3069 * us from writing caches for block groups that are going to be 3070 * removed. 3071 */ 3072 mutex_unlock(&trans->transaction->cache_write_mutex); 3073 if (ret) 3074 goto out; 3075 mutex_lock(&trans->transaction->cache_write_mutex); 3076 } 3077 mutex_unlock(&trans->transaction->cache_write_mutex); 3078 3079 /* 3080 * Go through delayed refs for all the stuff we've just kicked off 3081 * and then loop back (just once) 3082 */ 3083 if (!ret) 3084 ret = btrfs_run_delayed_refs(trans, 0); 3085 if (!ret && loops == 0) { 3086 loops++; 3087 spin_lock(&cur_trans->dirty_bgs_lock); 3088 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3089 /* 3090 * dirty_bgs_lock protects us from concurrent block group 3091 * deletes too (not just cache_write_mutex). 3092 */ 3093 if (!list_empty(&dirty)) { 3094 spin_unlock(&cur_trans->dirty_bgs_lock); 3095 goto again; 3096 } 3097 spin_unlock(&cur_trans->dirty_bgs_lock); 3098 } 3099 out: 3100 if (ret < 0) { 3101 spin_lock(&cur_trans->dirty_bgs_lock); 3102 list_splice_init(&dirty, &cur_trans->dirty_bgs); 3103 spin_unlock(&cur_trans->dirty_bgs_lock); 3104 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3105 } 3106 3107 btrfs_free_path(path); 3108 return ret; 3109 } 3110 3111 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 3112 { 3113 struct btrfs_fs_info *fs_info = trans->fs_info; 3114 struct btrfs_block_group *cache; 3115 struct btrfs_transaction *cur_trans = trans->transaction; 3116 int ret = 0; 3117 int should_put; 3118 struct btrfs_path *path; 3119 struct list_head *io = &cur_trans->io_bgs; 3120 3121 path = btrfs_alloc_path(); 3122 if (!path) 3123 return -ENOMEM; 3124 3125 /* 3126 * Even though we are in the critical section of the transaction commit, 3127 * we can still have concurrent tasks adding elements to this 3128 * transaction's list of dirty block groups. These tasks correspond to 3129 * endio free space workers started when writeback finishes for a 3130 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3131 * allocate new block groups as a result of COWing nodes of the root 3132 * tree when updating the free space inode. The writeback for the space 3133 * caches is triggered by an earlier call to 3134 * btrfs_start_dirty_block_groups() and iterations of the following 3135 * loop. 3136 * Also we want to do the cache_save_setup first and then run the 3137 * delayed refs to make sure we have the best chance at doing this all 3138 * in one shot. 3139 */ 3140 spin_lock(&cur_trans->dirty_bgs_lock); 3141 while (!list_empty(&cur_trans->dirty_bgs)) { 3142 cache = list_first_entry(&cur_trans->dirty_bgs, 3143 struct btrfs_block_group, 3144 dirty_list); 3145 3146 /* 3147 * This can happen if cache_save_setup re-dirties a block group 3148 * that is already under IO. 
Just wait for it to finish and 3149 * then do it all again 3150 */ 3151 if (!list_empty(&cache->io_list)) { 3152 spin_unlock(&cur_trans->dirty_bgs_lock); 3153 list_del_init(&cache->io_list); 3154 btrfs_wait_cache_io(trans, cache, path); 3155 btrfs_put_block_group(cache); 3156 spin_lock(&cur_trans->dirty_bgs_lock); 3157 } 3158 3159 /* 3160 * Don't remove from the dirty list until after we've waited on 3161 * any pending IO 3162 */ 3163 list_del_init(&cache->dirty_list); 3164 spin_unlock(&cur_trans->dirty_bgs_lock); 3165 should_put = 1; 3166 3167 cache_save_setup(cache, trans, path); 3168 3169 if (!ret) 3170 ret = btrfs_run_delayed_refs(trans, 3171 (unsigned long) -1); 3172 3173 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3174 cache->io_ctl.inode = NULL; 3175 ret = btrfs_write_out_cache(trans, cache, path); 3176 if (ret == 0 && cache->io_ctl.inode) { 3177 should_put = 0; 3178 list_add_tail(&cache->io_list, io); 3179 } else { 3180 /* 3181 * If we failed to write the cache, the 3182 * generation will be bad and life goes on 3183 */ 3184 ret = 0; 3185 } 3186 } 3187 if (!ret) { 3188 ret = update_block_group_item(trans, path, cache); 3189 /* 3190 * One of the free space endio workers might have 3191 * created a new block group while updating a free space 3192 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3193 * and hasn't released its transaction handle yet, in 3194 * which case the new block group is still attached to 3195 * its transaction handle and its creation has not 3196 * finished yet (no block group item in the extent tree 3197 * yet, etc). If this is the case, wait for all free 3198 * space endio workers to finish and retry. This is a 3199 * very rare case so no need for a more efficient and 3200 * complex approach. 3201 */ 3202 if (ret == -ENOENT) { 3203 wait_event(cur_trans->writer_wait, 3204 atomic_read(&cur_trans->num_writers) == 1); 3205 ret = update_block_group_item(trans, path, cache); 3206 } 3207 if (ret) 3208 btrfs_abort_transaction(trans, ret); 3209 } 3210 3211 /* If its not on the io list, we need to put the block group */ 3212 if (should_put) 3213 btrfs_put_block_group(cache); 3214 btrfs_delayed_refs_rsv_release(fs_info, 1); 3215 spin_lock(&cur_trans->dirty_bgs_lock); 3216 } 3217 spin_unlock(&cur_trans->dirty_bgs_lock); 3218 3219 /* 3220 * Refer to the definition of io_bgs member for details why it's safe 3221 * to use it without any locking 3222 */ 3223 while (!list_empty(io)) { 3224 cache = list_first_entry(io, struct btrfs_block_group, 3225 io_list); 3226 list_del_init(&cache->io_list); 3227 btrfs_wait_cache_io(trans, cache, path); 3228 btrfs_put_block_group(cache); 3229 } 3230 3231 btrfs_free_path(path); 3232 return ret; 3233 } 3234 3235 static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, 3236 u64 bytes_freed) 3237 { 3238 const struct btrfs_space_info *space_info = bg->space_info; 3239 const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); 3240 const u64 new_val = bg->used; 3241 const u64 old_val = new_val + bytes_freed; 3242 u64 thresh; 3243 3244 if (reclaim_thresh == 0) 3245 return false; 3246 3247 thresh = div_factor_fine(bg->length, reclaim_thresh); 3248 3249 /* 3250 * If we were below the threshold before don't reclaim, we are likely a 3251 * brand new block group and we don't want to relocate new block groups. 
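 *
 * Worked example (illustrative numbers): with bg->length == 1GiB and a
 * bg_reclaim_threshold of 75, thresh works out to 768MiB (75% of the
 * length). A free that takes the used bytes from 800MiB (>= thresh) down
 * to 700MiB (< thresh) makes this return true; a block group whose usage
 * never reached 768MiB keeps returning false.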
3252 */ 3253 if (old_val < thresh) 3254 return false; 3255 if (new_val >= thresh) 3256 return false; 3257 return true; 3258 } 3259 3260 int btrfs_update_block_group(struct btrfs_trans_handle *trans, 3261 u64 bytenr, u64 num_bytes, bool alloc) 3262 { 3263 struct btrfs_fs_info *info = trans->fs_info; 3264 struct btrfs_block_group *cache = NULL; 3265 u64 total = num_bytes; 3266 u64 old_val; 3267 u64 byte_in_group; 3268 int factor; 3269 int ret = 0; 3270 3271 /* Block accounting for super block */ 3272 spin_lock(&info->delalloc_root_lock); 3273 old_val = btrfs_super_bytes_used(info->super_copy); 3274 if (alloc) 3275 old_val += num_bytes; 3276 else 3277 old_val -= num_bytes; 3278 btrfs_set_super_bytes_used(info->super_copy, old_val); 3279 spin_unlock(&info->delalloc_root_lock); 3280 3281 while (total) { 3282 bool reclaim; 3283 3284 cache = btrfs_lookup_block_group(info, bytenr); 3285 if (!cache) { 3286 ret = -ENOENT; 3287 break; 3288 } 3289 factor = btrfs_bg_type_to_factor(cache->flags); 3290 3291 /* 3292 * If this block group has free space cache written out, we 3293 * need to make sure to load it if we are removing space. This 3294 * is because we need the unpinning stage to actually add the 3295 * space back to the block group, otherwise we will leak space. 3296 */ 3297 if (!alloc && !btrfs_block_group_done(cache)) 3298 btrfs_cache_block_group(cache, true); 3299 3300 byte_in_group = bytenr - cache->start; 3301 WARN_ON(byte_in_group > cache->length); 3302 3303 spin_lock(&cache->space_info->lock); 3304 spin_lock(&cache->lock); 3305 3306 if (btrfs_test_opt(info, SPACE_CACHE) && 3307 cache->disk_cache_state < BTRFS_DC_CLEAR) 3308 cache->disk_cache_state = BTRFS_DC_CLEAR; 3309 3310 old_val = cache->used; 3311 num_bytes = min(total, cache->length - byte_in_group); 3312 if (alloc) { 3313 old_val += num_bytes; 3314 cache->used = old_val; 3315 cache->reserved -= num_bytes; 3316 cache->space_info->bytes_reserved -= num_bytes; 3317 cache->space_info->bytes_used += num_bytes; 3318 cache->space_info->disk_used += num_bytes * factor; 3319 spin_unlock(&cache->lock); 3320 spin_unlock(&cache->space_info->lock); 3321 } else { 3322 old_val -= num_bytes; 3323 cache->used = old_val; 3324 cache->pinned += num_bytes; 3325 btrfs_space_info_update_bytes_pinned(info, 3326 cache->space_info, num_bytes); 3327 cache->space_info->bytes_used -= num_bytes; 3328 cache->space_info->disk_used -= num_bytes * factor; 3329 3330 reclaim = should_reclaim_block_group(cache, num_bytes); 3331 spin_unlock(&cache->lock); 3332 spin_unlock(&cache->space_info->lock); 3333 3334 set_extent_dirty(&trans->transaction->pinned_extents, 3335 bytenr, bytenr + num_bytes - 1, 3336 GFP_NOFS | __GFP_NOFAIL); 3337 } 3338 3339 spin_lock(&trans->transaction->dirty_bgs_lock); 3340 if (list_empty(&cache->dirty_list)) { 3341 list_add_tail(&cache->dirty_list, 3342 &trans->transaction->dirty_bgs); 3343 trans->delayed_ref_updates++; 3344 btrfs_get_block_group(cache); 3345 } 3346 spin_unlock(&trans->transaction->dirty_bgs_lock); 3347 3348 /* 3349 * No longer have used bytes in this block group, queue it for 3350 * deletion. We do this after adding the block group to the 3351 * dirty list to avoid races between cleaner kthread and space 3352 * cache writeout. 
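 *
 * (Summary of the branch below, for orientation: a free that brings the
 * used bytes down to zero marks the group unused for the cleaner thread,
 * unless async discard is enabled, in which case it is left to the discard
 * code; a free that merely crosses the reclaim threshold marks the group
 * for reclaim instead.)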
3353 */ 3354 if (!alloc && old_val == 0) { 3355 if (!btrfs_test_opt(info, DISCARD_ASYNC)) 3356 btrfs_mark_bg_unused(cache); 3357 } else if (!alloc && reclaim) { 3358 btrfs_mark_bg_to_reclaim(cache); 3359 } 3360 3361 btrfs_put_block_group(cache); 3362 total -= num_bytes; 3363 bytenr += num_bytes; 3364 } 3365 3366 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 3367 btrfs_update_delayed_refs_rsv(trans); 3368 return ret; 3369 } 3370 3371 /** 3372 * btrfs_add_reserved_bytes - update the block_group and space info counters 3373 * @cache: The cache we are manipulating 3374 * @ram_bytes: The number of bytes of file content, and will be same to 3375 * @num_bytes except for the compress path. 3376 * @num_bytes: The number of bytes in question 3377 * @delalloc: The blocks are allocated for the delalloc write 3378 * 3379 * This is called by the allocator when it reserves space. If this is a 3380 * reservation and the block group has become read only we cannot make the 3381 * reservation and return -EAGAIN, otherwise this function always succeeds. 3382 */ 3383 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, 3384 u64 ram_bytes, u64 num_bytes, int delalloc) 3385 { 3386 struct btrfs_space_info *space_info = cache->space_info; 3387 int ret = 0; 3388 3389 spin_lock(&space_info->lock); 3390 spin_lock(&cache->lock); 3391 if (cache->ro) { 3392 ret = -EAGAIN; 3393 } else { 3394 cache->reserved += num_bytes; 3395 space_info->bytes_reserved += num_bytes; 3396 trace_btrfs_space_reservation(cache->fs_info, "space_info", 3397 space_info->flags, num_bytes, 1); 3398 btrfs_space_info_update_bytes_may_use(cache->fs_info, 3399 space_info, -ram_bytes); 3400 if (delalloc) 3401 cache->delalloc_bytes += num_bytes; 3402 3403 /* 3404 * Compression can use less space than we reserved, so wake 3405 * tickets if that happens 3406 */ 3407 if (num_bytes < ram_bytes) 3408 btrfs_try_granting_tickets(cache->fs_info, space_info); 3409 } 3410 spin_unlock(&cache->lock); 3411 spin_unlock(&space_info->lock); 3412 return ret; 3413 } 3414 3415 /** 3416 * btrfs_free_reserved_bytes - update the block_group and space info counters 3417 * @cache: The cache we are manipulating 3418 * @num_bytes: The number of bytes in question 3419 * @delalloc: The blocks are allocated for the delalloc write 3420 * 3421 * This is called by somebody who is freeing space that was never actually used 3422 * on disk. For example if you reserve some space for a new leaf in transaction 3423 * A and before transaction A commits you free that leaf, you call this with 3424 * reserve set to 0 in order to clear the reservation. 
3425 */ 3426 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 3427 u64 num_bytes, int delalloc) 3428 { 3429 struct btrfs_space_info *space_info = cache->space_info; 3430 3431 spin_lock(&space_info->lock); 3432 spin_lock(&cache->lock); 3433 if (cache->ro) 3434 space_info->bytes_readonly += num_bytes; 3435 cache->reserved -= num_bytes; 3436 space_info->bytes_reserved -= num_bytes; 3437 space_info->max_extent_size = 0; 3438 3439 if (delalloc) 3440 cache->delalloc_bytes -= num_bytes; 3441 spin_unlock(&cache->lock); 3442 3443 btrfs_try_granting_tickets(cache->fs_info, space_info); 3444 spin_unlock(&space_info->lock); 3445 } 3446 3447 static void force_metadata_allocation(struct btrfs_fs_info *info) 3448 { 3449 struct list_head *head = &info->space_info; 3450 struct btrfs_space_info *found; 3451 3452 list_for_each_entry(found, head, list) { 3453 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3454 found->force_alloc = CHUNK_ALLOC_FORCE; 3455 } 3456 } 3457 3458 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 3459 struct btrfs_space_info *sinfo, int force) 3460 { 3461 u64 bytes_used = btrfs_space_info_used(sinfo, false); 3462 u64 thresh; 3463 3464 if (force == CHUNK_ALLOC_FORCE) 3465 return 1; 3466 3467 /* 3468 * in limited mode, we want to have some free space up to 3469 * about 1% of the FS size. 3470 */ 3471 if (force == CHUNK_ALLOC_LIMITED) { 3472 thresh = btrfs_super_total_bytes(fs_info->super_copy); 3473 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 3474 3475 if (sinfo->total_bytes - bytes_used < thresh) 3476 return 1; 3477 } 3478 3479 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 3480 return 0; 3481 return 1; 3482 } 3483 3484 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 3485 { 3486 u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); 3487 3488 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 3489 } 3490 3491 static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) 3492 { 3493 struct btrfs_block_group *bg; 3494 int ret; 3495 3496 /* 3497 * Check if we have enough space in the system space info because we 3498 * will need to update device items in the chunk btree and insert a new 3499 * chunk item in the chunk btree as well. This will allocate a new 3500 * system block group if needed. 3501 */ 3502 check_system_chunk(trans, flags); 3503 3504 bg = btrfs_create_chunk(trans, flags); 3505 if (IS_ERR(bg)) { 3506 ret = PTR_ERR(bg); 3507 goto out; 3508 } 3509 3510 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); 3511 /* 3512 * Normally we are not expected to fail with -ENOSPC here, since we have 3513 * previously reserved space in the system space_info and allocated one 3514 * new system chunk if necessary. However there are three exceptions: 3515 * 3516 * 1) We may have enough free space in the system space_info but all the 3517 * existing system block groups have a profile which can not be used 3518 * for extent allocation. 3519 * 3520 * This happens when mounting in degraded mode. For example we have a 3521 * RAID1 filesystem with 2 devices, lose one device and mount the fs 3522 * using the other device in degraded mode. 
If we then allocate a chunk, 3523 * we may have enough free space in the existing system space_info, but 3524 * none of the block groups can be used for extent allocation since they 3525 * have a RAID1 profile, and because we are in degraded mode with a 3526 * single device, we are forced to allocate a new system chunk with a 3527 * SINGLE profile. Making check_system_chunk() iterate over all system 3528 * block groups and check if they have a usable profile and enough space 3529 * can be slow on very large filesystems, so we tolerate the -ENOSPC and 3530 * try again after forcing allocation of a new system chunk. Like this 3531 * we avoid paying the cost of that search in normal circumstances, when 3532 * we were not mounted in degraded mode; 3533 * 3534 * 2) We had enough free space in the system space_info, and one suitable 3535 * block group to allocate from when we called check_system_chunk() 3536 * above. However right after we called it, the only system block group 3537 * with enough free space got turned into RO mode by a running scrub, 3538 * and in this case we have to allocate a new one and retry. We only 3539 * need to do this allocation and retry once, since we have a transaction 3540 * handle and scrub uses the commit root to search for block groups; 3541 * 3542 * 3) We had one system block group with enough free space when we called 3543 * check_system_chunk(), but after that, right before we tried to 3544 * allocate the last extent buffer we needed, a discard operation came 3545 * in and it temporarily removed the last free space entry from the 3546 * block group (discard removes a free space entry, discards it, and 3547 * then adds back the entry to the block group cache). 3548 */ 3549 if (ret == -ENOSPC) { 3550 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); 3551 struct btrfs_block_group *sys_bg; 3552 3553 sys_bg = btrfs_create_chunk(trans, sys_flags); 3554 if (IS_ERR(sys_bg)) { 3555 ret = PTR_ERR(sys_bg); 3556 btrfs_abort_transaction(trans, ret); 3557 goto out; 3558 } 3559 3560 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3561 if (ret) { 3562 btrfs_abort_transaction(trans, ret); 3563 goto out; 3564 } 3565 3566 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); 3567 if (ret) { 3568 btrfs_abort_transaction(trans, ret); 3569 goto out; 3570 } 3571 } else if (ret) { 3572 btrfs_abort_transaction(trans, ret); 3573 goto out; 3574 } 3575 out: 3576 btrfs_trans_release_chunk_metadata(trans); 3577 3578 if (ret) 3579 return ERR_PTR(ret); 3580 3581 btrfs_get_block_group(bg); 3582 return bg; 3583 } 3584 3585 /* 3586 * Chunk allocation is done in 2 phases: 3587 * 3588 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for 3589 * the chunk, the chunk mapping, create its block group and add the items 3590 * that belong in the chunk btree to it - more specifically, we need to 3591 * update device items in the chunk btree and add a new chunk item to it. 3592 * 3593 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block 3594 * group item to the extent btree and the device extent items to the devices 3595 * btree. 3596 * 3597 * This is done to prevent deadlocks. For example, when COWing a node from the 3598 * extent btree we are holding a write lock on the node's parent and if we 3599 * trigger chunk allocation and attempt to insert the new block group item 3600 * in the extent btree right away, we could deadlock because the path for the 3601 * insertion can include that parent node.
At first glance it seems impossible 3602 * to trigger chunk allocation after starting a transaction since tasks should 3603 * reserve enough transaction units (metadata space); however, while that is true 3604 * most of the time, chunk allocation may still be triggered for several reasons: 3605 * 3606 * 1) When reserving metadata, we check if there is enough free space in the 3607 * metadata space_info and therefore don't trigger allocation of a new chunk. 3608 * However, later when the task actually tries to COW an extent buffer from 3609 * the extent btree or from the device btree for example, it is forced to 3610 * allocate a new block group (chunk) because the only one that had enough 3611 * free space was just turned to RO mode by a running scrub for example (or 3612 * device replace, block group reclaim thread, etc), so we cannot use it 3613 * for allocating an extent and end up being forced to allocate a new one; 3614 * 3615 * 2) Because we only check that the metadata space_info has enough free bytes, 3616 * we end up not allocating a new metadata chunk in that case. However, if 3617 * the filesystem was mounted in degraded mode, none of the existing block 3618 * groups might be suitable for extent allocation due to their incompatible 3619 * profile (e.g. mounting a 2-device filesystem, where all block groups 3620 * use a RAID1 profile, in degraded mode using a single device). In this case, 3621 * when the task attempts to COW some extent buffer of the extent btree for 3622 * example, it will trigger allocation of a new metadata block group with a 3623 * suitable profile (SINGLE profile in the example of the degraded mount of 3624 * the RAID1 filesystem); 3625 * 3626 * 3) The task has reserved enough transaction units / metadata space, but when 3627 * it attempts to COW an extent buffer from the extent or device btree for 3628 * example, it does not find any free extent in any metadata block group, 3629 * and is therefore forced to try to allocate a new metadata block group. 3630 * This is because some other task allocated all available extents in the 3631 * meantime - this typically happens with tasks that don't reserve space 3632 * properly, either intentionally or as a bug. One example where this is 3633 * done intentionally is fsync, as it does not reserve any transaction units 3634 * and ends up allocating a variable number of metadata extents for log 3635 * tree extent buffers; 3636 * 3637 * 4) The task has reserved enough transaction units / metadata space, but right 3638 * before it tries to allocate the last extent buffer it needs, a discard 3639 * operation comes in and, temporarily, removes the last free space entry from 3640 * the only metadata block group that had free space (discard starts by 3641 * removing a free space entry from a block group, then does the discard 3642 * operation and, once it's done, it adds back the free space entry to the 3643 * block group). 3644 * 3645 * We also need this 2-phase setup when adding a device to a filesystem with 3646 * a seed device - we must create new metadata and system chunks without adding 3647 * any of the block group items to the chunk, extent and device btrees. If we 3648 * did not do it this way, we would get ENOSPC when attempting to update those 3649 * btrees, since all the chunks from the seed device are read-only.
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we
 * had it done in phase 2 and have a thundering herd of tasks allocating
 * chunks in parallel, we risk having too many system chunks allocated by
 * many tasks if many tasks reach phase 1 without the previous ones
 * completing phase 2. In the extreme case this leads to exhaustion of the
 * system chunk array in the superblock. This is easier to trigger if using
 * a btree node/leaf size of 64K and with RAID filesystems (so we have more
 * device items in the chunk btree). This has happened before and commit
 * eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array due to
 * concurrent allocations") provides more details.
 *
 * Allocation of system chunks does not happen through this function. A task
 * that needs to update the chunk btree (the only btree that uses system
 * chunks) must preallocate chunk space by calling either
 * check_system_chunk() or btrfs_reserve_chunk_metadata() - the former is
 * used when allocating a data or metadata chunk or when removing a chunk,
 * while the latter is used before doing a modification to the chunk btree -
 * use cases for the latter are adding, removing and resizing a device as
 * well as relocation of a system chunk. See the comment below for more
 * details.
 *
 * The reservation of system space, done through check_system_chunk(), as
 * well as all the updates and insertions into the chunk btree must be done
 * while holding fs_info->chunk_mutex. This is important to guarantee that
 * while COWing an extent buffer from the chunk btree we never trigger
 * allocation of a new system chunk, which would result in a deadlock
 * (trying to lock an extent buffer of the chunk btree twice, the first time
 * before triggering the chunk allocation and the second time during chunk
 * allocation while attempting to update the chunk btree). The system chunk
 * array is also updated while holding that mutex. The same logic applies to
 * removing chunks - we must reserve system space, update the chunk btree
 * and the system chunk array in the superblock while holding
 * fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
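 *
 * As an illustrative sketch (simplified, not a verbatim call site), a caller
 * that merely wants to make more space available, such as the space flushing
 * code, may therefore treat the result as:
 *
 *	ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_NO_FORCE);
 *	if (ret > 0 || ret == -ENOSPC)
 *		ret = 0;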
 */
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
		      enum btrfs_chunk_alloc_enum force)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_block_group *ret_bg;
	bool wait_for_alloc = false;
	bool should_alloc = false;
	bool from_extent_allocation = false;
	int ret = 0;

	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
		from_extent_allocation = true;
		force = CHUNK_ALLOC_FORCE;
	}

	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;
	/*
	 * Allocation of system chunks can not happen through this path, as we
	 * could end up in a deadlock if we are allocating a data or metadata
	 * chunk and there is another task modifying the chunk btree.
	 *
	 * This is because while we are holding the chunk mutex, we will attempt
	 * to add the new chunk item to the chunk btree or update an existing
	 * device item in the chunk btree, while the other task that is modifying
	 * the chunk btree is attempting to COW an extent buffer while holding a
	 * lock on it and on its parent - if the COW operation triggers a system
	 * chunk allocation, then we can deadlock because we are holding the
	 * chunk mutex and we may need to access that extent buffer or its parent
	 * in order to add the chunk item or update a device item.
	 *
	 * Tasks that want to modify the chunk tree should reserve system space
	 * before updating the chunk btree, by calling either
	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
	 * It's possible that after a task reserves the space, it still ends up
	 * here - this happens in the cases described above at do_chunk_alloc().
	 * The task will have to either retry or fail.
	 */
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return -ENOSPC;

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);

	do {
		spin_lock(&space_info->lock);
		if (force < space_info->force_alloc)
			force = space_info->force_alloc;
		should_alloc = should_alloc_chunk(fs_info, space_info, force);
		if (space_info->full) {
			/* No more free physical space */
			if (should_alloc)
				ret = -ENOSPC;
			else
				ret = 0;
			spin_unlock(&space_info->lock);
			return ret;
		} else if (!should_alloc) {
			spin_unlock(&space_info->lock);
			return 0;
		} else if (space_info->chunk_alloc) {
			/*
			 * Someone is already allocating, so we need to block
			 * until this someone is finished and then loop to
			 * recheck if we should continue with our allocation
			 * attempt.
			 */
			wait_for_alloc = true;
			force = CHUNK_ALLOC_NO_FORCE;
			spin_unlock(&space_info->lock);
			mutex_lock(&fs_info->chunk_mutex);
			mutex_unlock(&fs_info->chunk_mutex);
		} else {
			/* Proceed with allocation */
			space_info->chunk_alloc = 1;
			wait_for_alloc = false;
			spin_unlock(&space_info->lock);
		}

		cond_resched();
	} while (wait_for_alloc);

	mutex_lock(&fs_info->chunk_mutex);
	trans->allocating_chunk = true;

	/*
	 * If we have mixed data/metadata chunks we want to make sure we keep
	 * allocating mixed chunks instead of individual chunks.
	 */
	if (btrfs_mixed_space_info(space_info))
		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

	/*
	 * If we're doing a data chunk, go ahead and make sure that we keep a
	 * reasonable number of metadata chunks allocated in the FS as well.
	 */
	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
		fs_info->data_chunk_allocations++;
		if (!(fs_info->data_chunk_allocations %
		      fs_info->metadata_ratio))
			force_metadata_allocation(fs_info);
	}

	ret_bg = do_chunk_alloc(trans, flags);
	trans->allocating_chunk = false;

	if (IS_ERR(ret_bg)) {
		ret = PTR_ERR(ret_bg);
	} else if (from_extent_allocation) {
		/*
		 * New block group is likely to be used soon. Try to activate
		 * it now. Failure is OK for now.
		 */
		btrfs_zone_activate(ret_bg);
	}

	if (!ret)
		btrfs_put_block_group(ret_bg);

	spin_lock(&space_info->lock);
	if (ret < 0) {
		if (ret == -ENOSPC)
			space_info->full = 1;
		else
			goto out;
	} else {
		ret = 1;
		space_info->max_extent_size = 0;
	}

	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
	space_info->chunk_alloc = 0;
	spin_unlock(&space_info->lock);
	mutex_unlock(&fs_info->chunk_mutex);

	return ret;
}

static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
{
	u64 num_dev;

	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
	if (!num_dev)
		num_dev = fs_info->fs_devices->rw_devices;

	return num_dev;
}

static void reserve_chunk_space(struct btrfs_trans_handle *trans,
				u64 bytes,
				u64 type)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *info;
	u64 left;
	int ret = 0;

	/*
	 * Needed because we can end up allocating a system chunk and we need
	 * an atomic and race free space reservation in the chunk block
	 * reserve.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
	spin_lock(&info->lock);
	left = info->total_bytes - btrfs_space_info_used(info, true);
	spin_unlock(&info->lock);

	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
			   left, bytes, type);
		btrfs_dump_space_info(fs_info, info, 0, 0);
	}

	if (left < bytes) {
		u64 flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *bg;

		/*
		 * Ignore failure to create system chunk. We might end up not
		 * needing it, as we might not need to COW all nodes/leaves
		 * from the paths we visit in the chunk tree (they were already
		 * COWed or created in the current transaction for example).
		 */
		bg = btrfs_create_chunk(trans, flags);
		if (IS_ERR(bg)) {
			ret = PTR_ERR(bg);
		} else {
			/*
			 * We have a new chunk. We also need to activate it for
			 * a zoned filesystem.
			 */
			ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
			if (ret < 0)
				return;

			/*
			 * If we fail to add the chunk item here, we end up
			 * trying again at phase 2 of chunk allocation, at
			 * btrfs_create_pending_block_groups(). So ignore
			 * any error here.
			 * An ENOSPC here could happen due to the cases
			 * described at do_chunk_alloc() - the system block
			 * group we just created was just turned into RO mode
			 * by a scrub for example, or a running discard
			 * temporarily removed its free space entries, etc.
			 */
			btrfs_chunk_alloc_add_chunk_item(trans, bg);
		}
	}

	if (!ret) {
		ret = btrfs_block_rsv_add(fs_info,
					  &fs_info->chunk_block_rsv,
					  bytes, BTRFS_RESERVE_NO_FLUSH);
		if (!ret)
			trans->chunk_bytes_reserved += bytes;
	}
}

/*
 * Reserve space in the system space for allocating or removing a chunk.
 * The caller must be holding fs_info->chunk_mutex.
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	const u64 num_devs = get_profile_num_devs(fs_info, type);
	u64 bytes;

	/* num_devs device items to update and 1 chunk item to add or remove. */
	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
		btrfs_calc_insert_metadata_size(fs_info, 1);

	reserve_chunk_space(trans, bytes, type);
}

/*
 * Reserve space in the system space, if needed, for doing a modification to
 * the chunk btree.
 *
 * @trans:		A transaction handle.
 * @is_item_insertion:	Indicate if the modification is for inserting a new
 *			item in the chunk btree or if it's for the deletion or
 *			update of an existing item.
 *
 * This is used in a context where we need to update the chunk btree outside
 * block group allocation and removal, to avoid a deadlock with a concurrent
 * task that is allocating a metadata or data block group and therefore needs
 * to update the chunk btree while holding the chunk mutex. After the update
 * to the chunk btree is done, btrfs_trans_release_chunk_metadata() should be
 * called.
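 *
 * An illustrative usage pattern (simplified, not a verbatim call site, with
 * error handling elided):
 *
 *	btrfs_reserve_chunk_metadata(trans, true);
 *	... insert, update or delete items in the chunk btree ...
 *	btrfs_trans_release_chunk_metadata(trans);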
 */
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
				  bool is_item_insertion)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 bytes;

	if (is_item_insertion)
		bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	else
		bytes = btrfs_calc_metadata_size(fs_info, 1);

	mutex_lock(&fs_info->chunk_mutex);
	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
	mutex_unlock(&fs_info->chunk_mutex);
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
		while (block_group) {
			btrfs_wait_block_group_cache_done(block_group);
			spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = btrfs_next_block_group(block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}

		inode = block_group->inode;
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		ASSERT(block_group->io_ctl.inode == NULL);
		iput(inode);
		last = block_group->start + block_group->length;
		btrfs_put_block_group(block_group);
	}
}

/*
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	write_lock(&info->block_group_cache_lock);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		btrfs_put_caching_control(caching_ctl);
	}
	write_unlock(&info->block_group_cache_lock);

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		block_group = list_first_entry(&info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}

	while (!list_empty(&info->reclaim_bgs)) {
		block_group = list_first_entry(&info->reclaim_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->zone_active_bgs_lock);
	while (!list_empty(&info->zone_active_bgs)) {
		block_group = list_first_entry(&info->zone_active_bgs,
					       struct btrfs_block_group,
					       active_bg_list);
		list_del_init(&block_group->active_bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->zone_active_bgs_lock);

	write_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group,
				       cache_node);
		rb_erase_cached(&block_group->cache_node,
				&info->block_group_cache_tree);
		RB_CLEAR_NODE(&block_group->cache_node);
		write_unlock(&info->block_group_cache_lock);
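
		/*
		 * The block group was unlinked from the rbtree above, so it's
		 * safe to drop block_group_cache_lock for the remaining
		 * teardown, which may block.
		 */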
		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO ||
		    block_group->cached == BTRFS_CACHE_ERROR)
			btrfs_free_excluded_extents(block_group);

		btrfs_remove_free_space_cache(block_group);
		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
		ASSERT(list_empty(&block_group->dirty_list));
		ASSERT(list_empty(&block_group->io_list));
		ASSERT(list_empty(&block_group->bg_list));
		ASSERT(refcount_read(&block_group->refs) == 1);
		ASSERT(block_group->swap_extents == 0);
		btrfs_put_block_group(block_group);

		write_lock(&info->block_group_cache_lock);
	}
	write_unlock(&info->block_group_cache_lock);

	btrfs_release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);

		/*
		 * Do not hide this behind enospc_debug, this is actually
		 * important and indicates a real bug if this happens.
		 */
		if (WARN_ON(space_info->bytes_pinned > 0 ||
			    space_info->bytes_may_use > 0))
			btrfs_dump_space_info(info, space_info, 0, 0);

		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap)
		 * unaccounting of their reserved space, so don't warn on
		 * bytes_reserved > 0 in that case.
		 */
		if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
			if (WARN_ON(space_info->bytes_reserved > 0))
				btrfs_dump_space_info(info, space_info, 0, 0);
		}

		WARN_ON(space_info->reclaim_size > 0);
		list_del(&space_info->list);
		btrfs_sysfs_remove_space_info(space_info);
	}
	return 0;
}

void btrfs_freeze_block_group(struct btrfs_block_group *cache)
{
	atomic_inc(&cache->frozen);
}

void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	bool cleanup;

	spin_lock(&block_group->lock);
	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
		   block_group->removed);
	spin_unlock(&block_group->lock);

	if (cleanup) {
		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, block_group->start,
					   1);
		BUG_ON(!em); /* logic error, can't happen */
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);

		/* Once for us and once for the tree */
		free_extent_map(em);
		free_extent_map(em);

		/*
		 * We may have left one free space entry, and other tasks
		 * trimming this block group may have left one entry each.
		 * Free them if any.
		 */
		__btrfs_remove_free_space_cache(block_group->free_space_ctl);
	}
}

bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
{
	bool ret = true;

	spin_lock(&bg->lock);
	if (bg->ro)
		ret = false;
	else
		bg->swap_extents++;
	spin_unlock(&bg->lock);

	return ret;
}

void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
{
	spin_lock(&bg->lock);
	ASSERT(!bg->ro);
	ASSERT(bg->swap_extents >= amount);
	bg->swap_extents -= amount;
	spin_unlock(&bg->lock);
}
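
/*
 * Rough illustration (not a verbatim call site; "bg" and "nr_extents" are
 * placeholder names) of how the two swap extent helpers above are meant to
 * pair up, e.g. around swap file activation and deactivation:
 *
 *	if (!btrfs_inc_block_group_swap_extents(bg))
 *		... fail, the block group is (or became) read-only ...
 *	...
 *	btrfs_dec_block_group_swap_extents(bg, nr_extents);
 */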