// SPDX-License-Identifier: GPL-2.0

#include <linux/sizes.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
	       (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
		block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress.
 *
 * Should be called with balance_lock held.
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format. If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile.
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		spin_unlock(&fs_info->balance_lock);
		return extended_to_chunk(target);
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	/* Select the highest-redundancy RAID level. */
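	/*
	 * Illustrative example (values are made up, not from the original
	 * source): on a filesystem with enough rw devices, if @flags allows
	 * both RAID1 and RAID0 for metadata, the chain below keeps only
	 * RAID1, so the returned chunk-format profile is
	 * BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_RAID1.
	 */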
	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
		allowed = BTRFS_BLOCK_GROUP_DUP;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	return btrfs_reduce_alloc_profile(fs_info, flags);
}

void btrfs_get_block_group(struct btrfs_block_group *cache)
{
	refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
	if (refcount_dec_and_test(&cache->refs)) {
		WARN_ON(cache->pinned > 0);
		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on reserved > 0 in that
		 * case.
		 */
		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
			WARN_ON(cache->reserved > 0);

		/*
		 * A block_group shouldn't be on the discard_list anymore.
		 * Remove the block_group from the discard_list to prevent us
		 * from causing a panic due to NULL pointer dereference.
		 */
		if (WARN_ON(!list_empty(&cache->discard_list)))
			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
						  cache);

		kfree(cache->free_space_ctl);
		kfree(cache->physical_map);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				       struct btrfs_block_group *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group *cache;
	bool leftmost = true;

	ASSERT(block_group->length != 0);

	write_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_root.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
		if (block_group->start < cache->start) {
			p = &(*p)->rb_left;
		} else if (block_group->start > cache->start) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			write_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color_cached(&block_group->cache_node,
			       &info->block_group_cache_tree, leftmost);

	write_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
		struct btrfs_fs_info *info, u64 bytenr, int contains)
{
	struct btrfs_block_group *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	read_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_root.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group, cache_node);
		end = cache->start + cache->length - 1;
		start = cache->start;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->start))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret)
		btrfs_get_block_group(ret);
	read_unlock(&info->block_group_cache_lock);

	return ret;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
		struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

struct btrfs_block_group *btrfs_next_block_group(
		struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct rb_node *node;

	read_lock(&fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
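	/*
	 * A removed block group has had its cache_node erased from the rbtree
	 * and cleared with RB_CLEAR_NODE() (see btrfs_remove_block_group()),
	 * so rb_next() can't be used on it; restart the lookup at the next
	 * bytenr instead.
	 */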
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->start + cache->length;

		read_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
		cache = rb_entry(node, struct btrfs_block_group, cache_node);
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	read_unlock(&fs_info->block_group_cache_lock);
	return cache;
}

/*
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:	The filesystem information object.
 * @bytenr:	Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and increments the
 * number of NOCOW writers in the block group that contains the extent, as long
 * as the block group exists and it's currently not in read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 * is responsible for calling btrfs_dec_nocow_writers() later.
 *
 * Or NULL if we can not do a NOCOW write
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
						  u64 bytenr)
{
	struct btrfs_block_group *bg;
	bool can_nocow = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return NULL;

	spin_lock(&bg->lock);
	if (bg->ro)
		can_nocow = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	if (!can_nocow) {
		btrfs_put_block_group(bg);
		return NULL;
	}

	/* No put on block group, done by btrfs_dec_nocow_writers(). */
	return bg;
}
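
/*
 * Illustrative pairing of btrfs_inc_nocow_writers() and
 * btrfs_dec_nocow_writers() (a sketch, not code from this file); a NOCOW
 * write path would typically do something like:
 *
 *	bg = btrfs_inc_nocow_writers(fs_info, bytenr);
 *	if (bg) {
 *		... create the ordered extent for the NOCOW write ...
 *		btrfs_dec_nocow_writers(bg);
 *	}
 */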

/*
 * Decrement the number of NOCOW writers in a block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. If it
 * wants to use it, then it should get a reference on it before calling this
 * function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_var(&bg->nocow_writers);

	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
	btrfs_put_block_group(bg);
}

void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}

void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{
	struct btrfs_block_group *bg;

	bg = btrfs_lookup_block_group(fs_info, start);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->reservations))
		wake_up_var(&bg->reservations);
	btrfs_put_block_group(bg);
}

void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have already allocated an extent from it, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once. So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes. Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return;

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
		   (cache->free_space_ctl->free_space >= num_bytes));

	btrfs_put_caching_control(caching_ctl);
}

static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
				       struct btrfs_caching_control *caching_ctl)
{
	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
	return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
}

static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	btrfs_put_caching_control(caching_ctl);
	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->start;
	u64 len = block_group->length;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * This is only called by btrfs_cache_block_group(). Since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't be
 * used yet, as their free space will be released as soon as the transaction
 * commits.
 */
int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end,
		       u64 *total_added_ret)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size;
	int ret;

	if (total_added_ret)
		*total_added_ret = 0;

	while (start < end) {
		ret = find_first_extent_bit(&info->excluded_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			ret = btrfs_add_free_space_async_trimmed(block_group,
								 start, size);
			if (ret)
				return ret;
			if (total_added_ret)
				*total_added_ret += size;
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		ret = btrfs_add_free_space_async_trimmed(block_group, start,
							 size);
		if (ret)
			return ret;
		if (total_added_ret)
			*total_added_ret += size;
	}

	return 0;
}
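
/*
 * Illustrative example (values are made up): caching a block group range
 * [start, end) that contains one excluded range [ex_start, ex_end] (e.g. a
 * super block mirror or a pinned extent) adds [start, ex_start) and
 * [ex_end + 1, end) as free space, skipping the excluded bytes.
 */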

/*
 * Get an arbitrary extent item index / max_index through the block group.
 *
 * @block_group: the block group to sample from
 * @index:       the integral step through the block group to grab from
 * @max_index:   the granularity of the sampling
 * @found_key:   return value parameter for the item we find
 *
 * Pre-conditions on indices:
 * 0 <= index <= max_index
 * 0 < max_index
 *
 * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
 * error code on error.
 */
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
					  struct btrfs_block_group *block_group,
					  int index, int max_index,
					  struct btrfs_key *found_key)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	u64 search_offset;
	u64 search_end = block_group->start + block_group->length;
	struct btrfs_path *path;
	struct btrfs_key search_key;
	int ret = 0;

	ASSERT(index >= 0);
	ASSERT(index <= max_index);
	ASSERT(max_index > 0);
	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
						       BTRFS_SUPER_INFO_OFFSET));

	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	search_offset = index * div_u64(block_group->length, max_index);
	search_key.objectid = block_group->start + search_offset;
	search_key.type = BTRFS_EXTENT_ITEM_KEY;
	search_key.offset = 0;

	btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
		/* Success; sampled an extent item in the block group */
		if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
		    found_key->objectid >= block_group->start &&
		    found_key->objectid + found_key->offset <= search_end)
			break;

		/* We can't possibly find a valid extent item anymore */
		if (found_key->objectid >= search_end) {
			ret = 1;
			break;
		}
	}

	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);
	btrfs_free_path(path);
	return ret;
}
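
/*
 * Illustrative numbers (not from the original source): for a 1GiB block
 * group sampled with max_index == 5, the search offsets probed by
 * sample_block_group_extent_item() are roughly 0, 205MiB, 410MiB, 614MiB and
 * 819MiB into the block group, i.e. index * (length / max_index).
 */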

/*
 * Best effort attempt to compute a block group's size class while caching it.
 *
 * @block_group: the block group we are caching
 *
 * We cannot infer the size class while adding free space extents, because that
 * logic doesn't care about contiguous file extents (it doesn't differentiate
 * between a 100M extent and 100 contiguous 1M extents). So we need to read the
 * file extent items. Reading all of them is quite wasteful, because usually
 * only a handful are enough to give a good answer. Therefore, we just grab 5 of
 * them at even steps through the block group and pick the smallest size class
 * we see. Since size class is best effort, and not guaranteed in general,
 * inaccuracy is acceptable.
 *
 * To be more explicit about why this algorithm makes sense:
 *
 * If we are caching in a block group from disk, then there are three major cases
 * to consider:
 * 1. the block group is well behaved and all extents in it are the same size
 *    class.
 * 2. the block group is mostly one size class with rare exceptions for last
 *    ditch allocations
 * 3. the block group was populated before size classes and can have a totally
 *    arbitrary mix of size classes.
 *
 * In case 1, looking at any extent in the block group will yield the correct
 * result. For the mixed cases, taking the minimum size class seems like a good
 * approximation, since gaps from frees will be usable to the size class. For
 * 2., a small handful of file extents is likely to yield the right answer. For
 * 3, we can either read every file extent, or admit that this is best effort
 * anyway and try to stay fast.
 *
 * Returns: 0 on success, negative error code on error.
 */
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
				       struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_key key;
	int i;
	u64 min_size = block_group->length;
	enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
	int ret;

	if (!btrfs_block_group_should_use_size_class(block_group))
		return 0;

	lockdep_assert_held(&caching_ctl->mutex);
	lockdep_assert_held_read(&fs_info->commit_root_sem);
	for (i = 0; i < 5; ++i) {
		ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
		if (ret < 0)
			goto out;
		if (ret > 0)
			continue;
		min_size = min_t(u64, min_size, key.offset);
		size_class = btrfs_calc_block_group_size_class(min_size);
	}
	if (size_class != BTRFS_BG_SZ_NONE) {
		spin_lock(&block_group->lock);
		block_group->size_class = size_class;
		spin_unlock(&block_group->lock);
	}
out:
	return ret;
}

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
	extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space. So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->start) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->start + block_group->length)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			u64 space_added;

			ret = add_new_free_space(block_group, last, key.objectid,
						 &space_added);
			if (ret)
				goto out;
			total_found += space_added;
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid + fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}

	ret = add_new_free_space(block_group, last,
				 block_group->start + block_group->length,
				 NULL);
out:
	btrfs_free_path(path);
	return ret;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	load_block_group_size_class(caching_ctl, block_group);
	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		ret = load_free_space_cache(block_group);
		if (ret == 1) {
			ret = 0;
			goto done;
		}

		/*
		 * We failed to load the space cache, set ourselves to
		 * CACHE_STARTED and carry on.
		 */
		spin_lock(&block_group->lock);
		block_group->cached = BTRFS_CACHE_STARTED;
		spin_unlock(&block_group->lock);
		wake_up(&caching_ctl->wait);
	}

	/*
	 * If we are in the transaction that populated the free space tree we
	 * can't actually cache from the free space tree as our commit root and
	 * real root are the same, so we could change the contents of the blocks
	 * while caching.
	 * Instead do the slow caching in this case, and after
	 * the transaction has committed we will be safe.
	 */
	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);
done:
	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->length - block_group->used;
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	btrfs_put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl = NULL;
	int ret = 0;

	/* Allocator for zoned filesystems does not use the cache at all */
	if (btrfs_is_zoned(fs_info))
		return 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	refcount_set(&caching_ctl->count, 2);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	if (cache->cached != BTRFS_CACHE_NO) {
		kfree(caching_ctl);

		caching_ctl = cache->caching_ctl;
		if (caching_ctl)
			refcount_inc(&caching_ctl->count);
		spin_unlock(&cache->lock);
		goto out;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_STARTED;
	spin_unlock(&cache->lock);

	write_lock(&fs_info->block_group_cache_lock);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	write_unlock(&fs_info->block_group_cache_lock);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
	if (wait && caching_ctl)
		ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
	if (caching_ctl)
		btrfs_put_caching_control(caching_ctl);

	return ret;
}

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
			  BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor
 *   RAID6 profile block group in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	bool found_raid56 = false;
	bool found_raid1c34 = false;

	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
		struct list_head *head = &fs_info->space_info;
		struct btrfs_space_info *sinfo;

		list_for_each_entry_rcu(sinfo, head, list) {
			down_read(&sinfo->groups_sem);
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				found_raid1c34 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				found_raid1c34 = true;
			up_read(&sinfo->groups_sem);
		}
		if (!found_raid56)
			btrfs_clear_fs_incompat(fs_info, RAID56);
		if (!found_raid1c34)
			btrfs_clear_fs_incompat(fs_info, RAID1C34);
	}
}

static int remove_block_group_item(struct btrfs_trans_handle *trans,
				   struct btrfs_path *path,
				   struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	int ret;

	root = btrfs_block_group_root(fs_info);
	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = block_group->length;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		return ret;

	ret = btrfs_del_item(trans, root, path);
	return ret;
}
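
/*
 * Illustrative key layout (example values are made up): a block group item is
 * keyed as (objectid = block_group->start, BTRFS_BLOCK_GROUP_ITEM_KEY,
 * offset = block_group->length), so a 1GiB block group starting at logical
 * offset 1104150528 would be deleted through the key
 * (1104150528, BLOCK_GROUP_ITEM, 1073741824).
 */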

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     u64 group_start, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *path;
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	btrfs_free_excluded_extents(block_group);
	btrfs_free_ref_tree_range(fs_info, block_group->start,
				  block_group->length);

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	factor = btrfs_bg_type_to_factor(block_group->flags);

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * Make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		remove_rsv = true;
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
	if (ret)
		goto out;

	write_lock(&fs_info->block_group_cache_lock);
	rb_erase_cached(&block_group->cache_node,
			&fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	write_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);

	write_lock(&fs_info->block_group_cache_lock);
	caching_ctl = btrfs_get_caching_control(block_group);
	if (!caching_ctl) {
		struct btrfs_caching_control *ctl;

		list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
			if (ctl->block_group == block_group) {
				caching_ctl = ctl;
				refcount_inc(&caching_ctl->count);
				break;
			}
		}
	}
	if (caching_ctl)
		list_del_init(&caching_ctl->list);
	write_unlock(&fs_info->block_group_cache_lock);

	if (caching_ctl) {
		/* Once for the caching bgs list and once for us. */
		btrfs_put_caching_control(caching_ctl);
		btrfs_put_caching_control(caching_ctl);
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length - block_group->zone_unusable);
		WARN_ON(block_group->space_info->bytes_zone_unusable
			< block_group->zone_unusable);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
	}
	block_group->space_info->total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -=
		(block_group->length - block_group->zone_unusable);
	block_group->space_info->bytes_zone_unusable -=
		block_group->zone_unusable;
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	/*
	 * Remove the free space for the block group from the free space tree
	 * and the block group's item from the extent tree before marking the
	 * block group as removed. This is to prevent races with tasks that
	 * freeze and unfreeze a block group, this task and another task
	 * allocating a new block group - the unfreeze task ends up removing
	 * the block group's extent map before the task calling this function
	 * deletes the block group item from the extent tree, allowing for
	 * another task to attempt to create another block group with the same
	 * item key (and failing with -EEXIST and a transaction abort).
	 */
	ret = remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	ret = remove_block_group_item(trans, path, block_group);
	if (ret < 0)
		goto out;

	spin_lock(&block_group->lock);
	set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);

	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
	 * fs_info->block_group_cache_tree so no one can find it anymore and
	 * even if someone already got this block group before we removed it
	 * from the rbtree, they have already incremented block_group->frozen -
	 * if they didn't, for the trimming case they won't find any free space
	 * entries because we already removed them all when we called
	 * btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is needed to
	 * avoid races with trimming and scrub.
	 *
	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_em = (atomic_read(&block_group->frozen) == 0);
	spin_unlock(&block_group->lock);

	if (remove_em) {
		struct extent_map_tree *em_tree;

		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		/* once for the tree */
		free_extent_map(em);
	}

out:
	/* Once for the lookup reference */
	btrfs_put_block_group(block_group);
	if (remove_rsv)
		btrfs_delayed_refs_rsv_release(fs_info, 1);
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	unsigned int num_items;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	ASSERT(em && em->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	map = em->map_lookup;
	num_items = 3 + map->num_stripes;
	free_extent_map(em);

	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
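
/*
 * Illustrative example (not from the original source): for a RAID1 chunk
 * with two stripes, the reservation above is num_items = 3 + 2 = 5 metadata
 * units; a single-stripe SINGLE chunk would reserve 3 + 1 = 4.
 */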

/*
 * Mark block group @cache read-only, so later writes won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	int ret = -ENOSPC;

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->swap_extents) {
		ret = -ETXTBSY;
		goto out;
	}

	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	num_bytes = cache->length - cache->reserved - cache->pinned -
		    cache->bytes_super - cache->zone_unusable - cache->used;

	/*
	 * Data never overcommits, even in mixed mode, so do just the straight
	 * check of left over space in how much we have allocated.
	 */
	if (force) {
		ret = 0;
	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

		/*
		 * Here we make sure if we mark this bg RO, we still have enough
		 * free space as buffer.
		 */
		if (sinfo_used + num_bytes <= sinfo->total_bytes)
			ret = 0;
	} else {
		/*
		 * We overcommit metadata, so we need to do the
		 * btrfs_can_overcommit check here, and we need to pass in
		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
		 * leeway to allow us to mark this block group as read only.
		 */
		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
					 BTRFS_RESERVE_NO_FLUSH))
			ret = 0;
	}

	if (!ret) {
		sinfo->bytes_readonly += num_bytes;
		if (btrfs_is_zoned(cache->fs_info)) {
			/* Migrate zone_unusable bytes to readonly */
			sinfo->bytes_readonly += cache->zone_unusable;
			sinfo->bytes_zone_unusable -= cache->zone_unusable;
			cache->zone_unusable = 0;
		}
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
		btrfs_info(cache->fs_info,
			   "unable to make block group %llu ro", cache->start);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
}

static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_transaction *prev_trans = NULL;
	const u64 start = bg->start;
	const u64 end = start + bg->length - 1;
	int ret;

	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->list.prev != &fs_info->trans_list) {
		prev_trans = list_last_entry(&trans->transaction->list,
					     struct btrfs_transaction, list);
		refcount_inc(&prev_trans->use_count);
	}
	spin_unlock(&fs_info->trans_lock);

	/*
	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
	 * btrfs_finish_extent_commit(). If we are at transaction N, another
	 * task might be running finish_extent_commit() for the previous
	 * transaction N - 1, and have seen a range belonging to the block
	 * group in pinned_extents before we were able to clear the whole block
	 * group range from pinned_extents. This means that task can look up
	 * the block group after we unpinned it from pinned_extents and removed
	 * it, leading to a BUG_ON() at unpin_extent_range().
	 */
	mutex_lock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans) {
		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
					EXTENT_DIRTY);
		if (ret)
			goto out;
	}

	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
				EXTENT_DIRTY);
out:
	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
	if (prev_trans)
		btrfs_put_transaction(prev_trans);

	return ret == 0;
}

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (btrfs_fs_closing(fs_info))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		/*
		 * Async discard moves the final block group discard to be prior
		 * to the unused_bgs code path. Therefore, if it's not fully
		 * trimmed, punt it back to the async discard lists.
		 */
		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
		    !btrfs_is_free_space_trimmed(block_group)) {
			trace_btrfs_skip_unused_block_group(block_group);
			up_write(&space_info->groups_sem);
			/* Requeue if we failed because of async discard */
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto next;
		}

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    block_group->used || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group. We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
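		/*
		 * With force == 0, inc_block_group_ro() refuses (-ENOSPC)
		 * unless the space_info still has enough room (see that
		 * helper); a failure here just means we skip this block group
		 * for now.
		 */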
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
			goto next;
		}

		ret = btrfs_zone_finish(block_group);
		if (ret < 0) {
			btrfs_dec_block_group_ro(block_group);
			if (ret == -EAGAIN)
				ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
							     block_group->start);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		if (!clean_pinned_extents(trans, block_group)) {
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}

		/*
		 * At this point, the block_group is read only and should fail
		 * new allocations. However, btrfs_finish_extent_commit() can
		 * cause this block_group to be placed back on the discard
		 * lists because now the block_group isn't fully discarded.
		 * Bail here and try again later after discarding everything.
		 */
		spin_lock(&fs_info->discard_ctl.lock);
		if (!list_empty(&block_group->discard_list)) {
			spin_unlock(&fs_info->discard_ctl.lock);
			btrfs_dec_block_group_ro(block_group);
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto end_trans;
		}
		spin_unlock(&fs_info->discard_ctl.lock);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
						     -block_group->pinned);
		space_info->bytes_readonly += block_group->pinned;
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/*
		 * The normal path here is an unused block group is passed here,
		 * then trimming is handled in the transaction commit path.
		 * Async discard interposes before this to do the trimming
		 * before coming down the unused block group path as trimming
		 * will no longer be done later in the transaction commit path.
		 */
		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
			goto flip_async;

		/*
		 * DISCARD can flip during remount. On zoned filesystems, we
		 * need to reset sequential-required zones.
		 */
		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
			   btrfs_is_zoned(fs_info);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_freeze_block_group(block_group);

		/*
		 * btrfs_remove_chunk() will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->start);

		if (ret) {
			if (trimming)
				btrfs_unfreeze_block_group(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}

void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_unused_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
		/* Pull out the block group from the reclaim_bgs list. */
		trace_btrfs_add_unused_block_group(bg);
		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * We want block groups with a low number of used bytes to be at the beginning
 * of the list, so they will get reclaimed first.
 */
static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
			   const struct list_head *b)
{
	const struct btrfs_block_group *bg1, *bg2;

	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
	bg2 = list_entry(b, struct btrfs_block_group, bg_list);

	return bg1->used > bg2->used;
}

static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
{
	if (btrfs_is_zoned(fs_info))
		return btrfs_zoned_should_reclaim(fs_info);
	return true;
}

static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
{
	const struct btrfs_space_info *space_info = bg->space_info;
	const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
	const u64 new_val = bg->used;
	const u64 old_val = new_val + bytes_freed;
	u64 thresh;

	if (reclaim_thresh == 0)
		return false;

	thresh = mult_perc(bg->length, reclaim_thresh);

	/*
	 * If we were below the threshold before don't reclaim, we are likely a
	 * brand new block group and we don't want to relocate new block groups.
	 */
	if (old_val < thresh)
		return false;
	if (new_val >= thresh)
		return false;
	return true;
}
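
/*
 * Illustrative example (values are made up): with bg_reclaim_threshold == 30
 * on a 1GiB block group, thresh is ~307MiB; freeing bytes that drop ->used
 * from 400MiB (above thresh) to 200MiB (below thresh) makes the helper above
 * return true, while a group that was already below ~307MiB is left alone.
 */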
1740 */ 1741 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { 1742 btrfs_exclop_finish(fs_info); 1743 sb_end_write(fs_info->sb); 1744 return; 1745 } 1746 1747 spin_lock(&fs_info->unused_bgs_lock); 1748 /* 1749 * Sort happens under lock because we can't simply splice it and sort. 1750 * The block groups might still be in use and reachable via bg_list, 1751 * and their presence in the reclaim_bgs list must be preserved. 1752 */ 1753 list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); 1754 while (!list_empty(&fs_info->reclaim_bgs)) { 1755 u64 zone_unusable; 1756 int ret = 0; 1757 1758 bg = list_first_entry(&fs_info->reclaim_bgs, 1759 struct btrfs_block_group, 1760 bg_list); 1761 list_del_init(&bg->bg_list); 1762 1763 space_info = bg->space_info; 1764 spin_unlock(&fs_info->unused_bgs_lock); 1765 1766 /* Don't race with allocators so take the groups_sem */ 1767 down_write(&space_info->groups_sem); 1768 1769 spin_lock(&bg->lock); 1770 if (bg->reserved || bg->pinned || bg->ro) { 1771 /* 1772 * We want to bail if we made new allocations or have 1773 * outstanding allocations in this block group. We do 1774 * the ro check in case balance is currently acting on 1775 * this block group. 1776 */ 1777 spin_unlock(&bg->lock); 1778 up_write(&space_info->groups_sem); 1779 goto next; 1780 } 1781 if (bg->used == 0) { 1782 /* 1783 * It is possible that we trigger relocation on a block 1784 * group as its extents are deleted and it first goes 1785 * below the threshold, then shortly after goes empty. 1786 * 1787 * In this case, relocating it does delete it, but has 1788 * some overhead in relocation specific metadata, looking 1789 * for the non-existent extents and running some extra 1790 * transactions, which we can avoid by using one of the 1791 * other mechanisms for dealing with empty block groups. 1792 */ 1793 if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) 1794 btrfs_mark_bg_unused(bg); 1795 spin_unlock(&bg->lock); 1796 up_write(&space_info->groups_sem); 1797 goto next; 1798 1799 } 1800 /* 1801 * The block group might no longer meet the reclaim condition by 1802 * the time we get around to reclaiming it, so to avoid 1803 * reclaiming overly full block_groups, skip reclaiming them. 1804 * 1805 * Since the decision making process also depends on the amount 1806 * being freed, pass in a fake giant value to skip that extra 1807 * check, which is more meaningful when adding to the list in 1808 * the first place. 1809 */ 1810 if (!should_reclaim_block_group(bg, bg->length)) { 1811 spin_unlock(&bg->lock); 1812 up_write(&space_info->groups_sem); 1813 goto next; 1814 } 1815 spin_unlock(&bg->lock); 1816 1817 /* 1818 * Get out fast, in case we're read-only or unmounting the 1819 * filesystem. It is OK to drop block groups from the list even 1820 * for the read-only case. As we did sb_start_write(), 1821 * "mount -o remount,ro" won't happen and read-only filesystem 1822 * means it is forced read-only due to a fatal error. So, it 1823 * never gets back to read-write to let us reclaim again. 1824 */ 1825 if (btrfs_need_cleaner_sleep(fs_info)) { 1826 up_write(&space_info->groups_sem); 1827 goto next; 1828 } 1829 1830 /* 1831 * Cache the zone_unusable value before turning the block group 1832 * to read only. As soon as the blog group is read only it's 1833 * zone_unusable value gets moved to the block group's read-only 1834 * bytes and isn't available for calculations anymore. 
1835 */ 1836 zone_unusable = bg->zone_unusable; 1837 ret = inc_block_group_ro(bg, 0); 1838 up_write(&space_info->groups_sem); 1839 if (ret < 0) 1840 goto next; 1841 1842 btrfs_info(fs_info, 1843 "reclaiming chunk %llu with %llu%% used %llu%% unusable", 1844 bg->start, 1845 div64_u64(bg->used * 100, bg->length), 1846 div64_u64(zone_unusable * 100, bg->length)); 1847 trace_btrfs_reclaim_block_group(bg); 1848 ret = btrfs_relocate_chunk(fs_info, bg->start); 1849 if (ret) { 1850 btrfs_dec_block_group_ro(bg); 1851 btrfs_err(fs_info, "error relocating chunk %llu", 1852 bg->start); 1853 } 1854 1855 next: 1856 if (ret) 1857 btrfs_mark_bg_to_reclaim(bg); 1858 btrfs_put_block_group(bg); 1859 1860 mutex_unlock(&fs_info->reclaim_bgs_lock); 1861 /* 1862 * Reclaiming all the block groups in the list can take really 1863 * long. Prioritize cleaning up unused block groups. 1864 */ 1865 btrfs_delete_unused_bgs(fs_info); 1866 /* 1867 * If we are interrupted by a balance, we can just bail out. The 1868 * cleaner thread restart again if necessary. 1869 */ 1870 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) 1871 goto end; 1872 spin_lock(&fs_info->unused_bgs_lock); 1873 } 1874 spin_unlock(&fs_info->unused_bgs_lock); 1875 mutex_unlock(&fs_info->reclaim_bgs_lock); 1876 end: 1877 btrfs_exclop_finish(fs_info); 1878 sb_end_write(fs_info->sb); 1879 } 1880 1881 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) 1882 { 1883 spin_lock(&fs_info->unused_bgs_lock); 1884 if (!list_empty(&fs_info->reclaim_bgs)) 1885 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); 1886 spin_unlock(&fs_info->unused_bgs_lock); 1887 } 1888 1889 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) 1890 { 1891 struct btrfs_fs_info *fs_info = bg->fs_info; 1892 1893 spin_lock(&fs_info->unused_bgs_lock); 1894 if (list_empty(&bg->bg_list)) { 1895 btrfs_get_block_group(bg); 1896 trace_btrfs_add_reclaim_block_group(bg); 1897 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); 1898 } 1899 spin_unlock(&fs_info->unused_bgs_lock); 1900 } 1901 1902 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 1903 struct btrfs_path *path) 1904 { 1905 struct extent_map_tree *em_tree; 1906 struct extent_map *em; 1907 struct btrfs_block_group_item bg; 1908 struct extent_buffer *leaf; 1909 int slot; 1910 u64 flags; 1911 int ret = 0; 1912 1913 slot = path->slots[0]; 1914 leaf = path->nodes[0]; 1915 1916 em_tree = &fs_info->mapping_tree; 1917 read_lock(&em_tree->lock); 1918 em = lookup_extent_mapping(em_tree, key->objectid, key->offset); 1919 read_unlock(&em_tree->lock); 1920 if (!em) { 1921 btrfs_err(fs_info, 1922 "logical %llu len %llu found bg but no related chunk", 1923 key->objectid, key->offset); 1924 return -ENOENT; 1925 } 1926 1927 if (em->start != key->objectid || em->len != key->offset) { 1928 btrfs_err(fs_info, 1929 "block group %llu len %llu mismatch with chunk %llu len %llu", 1930 key->objectid, key->offset, em->start, em->len); 1931 ret = -EUCLEAN; 1932 goto out_free_em; 1933 } 1934 1935 read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), 1936 sizeof(bg)); 1937 flags = btrfs_stack_block_group_flags(&bg) & 1938 BTRFS_BLOCK_GROUP_TYPE_MASK; 1939 1940 if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 1941 btrfs_err(fs_info, 1942 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", 1943 key->objectid, key->offset, flags, 1944 (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); 1945 ret = -EUCLEAN; 1946 } 1947 1948 out_free_em: 1949 
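/* Drop the extent map reference taken by lookup_extent_mapping() above. */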
free_extent_map(em); 1950 return ret; 1951 } 1952 1953 static int find_first_block_group(struct btrfs_fs_info *fs_info, 1954 struct btrfs_path *path, 1955 struct btrfs_key *key) 1956 { 1957 struct btrfs_root *root = btrfs_block_group_root(fs_info); 1958 int ret; 1959 struct btrfs_key found_key; 1960 1961 btrfs_for_each_slot(root, key, &found_key, path, ret) { 1962 if (found_key.objectid >= key->objectid && 1963 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 1964 return read_bg_from_eb(fs_info, &found_key, path); 1965 } 1966 } 1967 return ret; 1968 } 1969 1970 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 1971 { 1972 u64 extra_flags = chunk_to_extended(flags) & 1973 BTRFS_EXTENDED_PROFILE_MASK; 1974 1975 write_seqlock(&fs_info->profiles_lock); 1976 if (flags & BTRFS_BLOCK_GROUP_DATA) 1977 fs_info->avail_data_alloc_bits |= extra_flags; 1978 if (flags & BTRFS_BLOCK_GROUP_METADATA) 1979 fs_info->avail_metadata_alloc_bits |= extra_flags; 1980 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 1981 fs_info->avail_system_alloc_bits |= extra_flags; 1982 write_sequnlock(&fs_info->profiles_lock); 1983 } 1984 1985 /* 1986 * Map a physical disk address to a list of logical addresses. 1987 * 1988 * @fs_info: the filesystem 1989 * @chunk_start: logical address of block group 1990 * @physical: physical address to map to logical addresses 1991 * @logical: return array of logical addresses which map to @physical 1992 * @naddrs: length of @logical 1993 * @stripe_len: size of IO stripe for the given block group 1994 * 1995 * Maps a particular @physical disk address to a list of @logical addresses. 1996 * Used primarily to exclude those portions of a block group that contain super 1997 * block copies. 1998 */ 1999 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 2000 u64 physical, u64 **logical, int *naddrs, int *stripe_len) 2001 { 2002 struct extent_map *em; 2003 struct map_lookup *map; 2004 u64 *buf; 2005 u64 bytenr; 2006 u64 data_stripe_length; 2007 u64 io_stripe_size; 2008 int i, nr = 0; 2009 int ret = 0; 2010 2011 em = btrfs_get_chunk_map(fs_info, chunk_start, 1); 2012 if (IS_ERR(em)) 2013 return -EIO; 2014 2015 map = em->map_lookup; 2016 data_stripe_length = em->orig_block_len; 2017 io_stripe_size = BTRFS_STRIPE_LEN; 2018 chunk_start = em->start; 2019 2020 /* For RAID5/6 adjust to a full IO stripe length */ 2021 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 2022 io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 2023 2024 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 2025 if (!buf) { 2026 ret = -ENOMEM; 2027 goto out; 2028 } 2029 2030 for (i = 0; i < map->num_stripes; i++) { 2031 bool already_inserted = false; 2032 u32 stripe_nr; 2033 u32 offset; 2034 int j; 2035 2036 if (!in_range(physical, map->stripes[i].physical, 2037 data_stripe_length)) 2038 continue; 2039 2040 stripe_nr = (physical - map->stripes[i].physical) >> 2041 BTRFS_STRIPE_LEN_SHIFT; 2042 offset = (physical - map->stripes[i].physical) & 2043 BTRFS_STRIPE_LEN_MASK; 2044 2045 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2046 BTRFS_BLOCK_GROUP_RAID10)) 2047 stripe_nr = div_u64(stripe_nr * map->num_stripes + i, 2048 map->sub_stripes); 2049 /* 2050 * The remaining case would be for RAID56, multiply by 2051 * nr_data_stripes(). 
Alternatively, just use rmap_len below 2052 * instead of map->stripe_len 2053 */ 2054 bytenr = chunk_start + stripe_nr * io_stripe_size + offset; 2055 2056 /* Ensure we don't add duplicate addresses */ 2057 for (j = 0; j < nr; j++) { 2058 if (buf[j] == bytenr) { 2059 already_inserted = true; 2060 break; 2061 } 2062 } 2063 2064 if (!already_inserted) 2065 buf[nr++] = bytenr; 2066 } 2067 2068 *logical = buf; 2069 *naddrs = nr; 2070 *stripe_len = io_stripe_size; 2071 out: 2072 free_extent_map(em); 2073 return ret; 2074 } 2075 2076 static int exclude_super_stripes(struct btrfs_block_group *cache) 2077 { 2078 struct btrfs_fs_info *fs_info = cache->fs_info; 2079 const bool zoned = btrfs_is_zoned(fs_info); 2080 u64 bytenr; 2081 u64 *logical; 2082 int stripe_len; 2083 int i, nr, ret; 2084 2085 if (cache->start < BTRFS_SUPER_INFO_OFFSET) { 2086 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; 2087 cache->bytes_super += stripe_len; 2088 ret = btrfs_add_excluded_extent(fs_info, cache->start, 2089 stripe_len); 2090 if (ret) 2091 return ret; 2092 } 2093 2094 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2095 bytenr = btrfs_sb_offset(i); 2096 ret = btrfs_rmap_block(fs_info, cache->start, 2097 bytenr, &logical, &nr, &stripe_len); 2098 if (ret) 2099 return ret; 2100 2101 /* Shouldn't have super stripes in sequential zones */ 2102 if (zoned && nr) { 2103 kfree(logical); 2104 btrfs_err(fs_info, 2105 "zoned: block group %llu must not contain super block", 2106 cache->start); 2107 return -EUCLEAN; 2108 } 2109 2110 while (nr--) { 2111 u64 len = min_t(u64, stripe_len, 2112 cache->start + cache->length - logical[nr]); 2113 2114 cache->bytes_super += len; 2115 ret = btrfs_add_excluded_extent(fs_info, logical[nr], 2116 len); 2117 if (ret) { 2118 kfree(logical); 2119 return ret; 2120 } 2121 } 2122 2123 kfree(logical); 2124 } 2125 return 0; 2126 } 2127 2128 static struct btrfs_block_group *btrfs_create_block_group_cache( 2129 struct btrfs_fs_info *fs_info, u64 start) 2130 { 2131 struct btrfs_block_group *cache; 2132 2133 cache = kzalloc(sizeof(*cache), GFP_NOFS); 2134 if (!cache) 2135 return NULL; 2136 2137 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 2138 GFP_NOFS); 2139 if (!cache->free_space_ctl) { 2140 kfree(cache); 2141 return NULL; 2142 } 2143 2144 cache->start = start; 2145 2146 cache->fs_info = fs_info; 2147 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 2148 2149 cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; 2150 2151 refcount_set(&cache->refs, 1); 2152 spin_lock_init(&cache->lock); 2153 init_rwsem(&cache->data_rwsem); 2154 INIT_LIST_HEAD(&cache->list); 2155 INIT_LIST_HEAD(&cache->cluster_list); 2156 INIT_LIST_HEAD(&cache->bg_list); 2157 INIT_LIST_HEAD(&cache->ro_list); 2158 INIT_LIST_HEAD(&cache->discard_list); 2159 INIT_LIST_HEAD(&cache->dirty_list); 2160 INIT_LIST_HEAD(&cache->io_list); 2161 INIT_LIST_HEAD(&cache->active_bg_list); 2162 btrfs_init_free_space_ctl(cache, cache->free_space_ctl); 2163 atomic_set(&cache->frozen, 0); 2164 mutex_init(&cache->free_space_lock); 2165 2166 return cache; 2167 } 2168 2169 /* 2170 * Iterate all chunks and verify that each of them has the corresponding block 2171 * group 2172 */ 2173 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 2174 { 2175 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 2176 struct extent_map *em; 2177 struct btrfs_block_group *bg; 2178 u64 start = 0; 2179 int ret = 0; 2180 2181 while (1) { 2182 read_lock(&map_tree->lock); 2183 /* 2184 * lookup_extent_mapping will 
return the first extent map 2185 * intersecting the range, so setting @len to 1 is enough to 2186 * get the first chunk. 2187 */ 2188 em = lookup_extent_mapping(map_tree, start, 1); 2189 read_unlock(&map_tree->lock); 2190 if (!em) 2191 break; 2192 2193 bg = btrfs_lookup_block_group(fs_info, em->start); 2194 if (!bg) { 2195 btrfs_err(fs_info, 2196 "chunk start=%llu len=%llu doesn't have corresponding block group", 2197 em->start, em->len); 2198 ret = -EUCLEAN; 2199 free_extent_map(em); 2200 break; 2201 } 2202 if (bg->start != em->start || bg->length != em->len || 2203 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 2204 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 2205 btrfs_err(fs_info, 2206 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 2207 em->start, em->len, 2208 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 2209 bg->start, bg->length, 2210 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 2211 ret = -EUCLEAN; 2212 free_extent_map(em); 2213 btrfs_put_block_group(bg); 2214 break; 2215 } 2216 start = em->start + em->len; 2217 free_extent_map(em); 2218 btrfs_put_block_group(bg); 2219 } 2220 return ret; 2221 } 2222 2223 static int read_one_block_group(struct btrfs_fs_info *info, 2224 struct btrfs_block_group_item *bgi, 2225 const struct btrfs_key *key, 2226 int need_clear) 2227 { 2228 struct btrfs_block_group *cache; 2229 const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); 2230 int ret; 2231 2232 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); 2233 2234 cache = btrfs_create_block_group_cache(info, key->objectid); 2235 if (!cache) 2236 return -ENOMEM; 2237 2238 cache->length = key->offset; 2239 cache->used = btrfs_stack_block_group_used(bgi); 2240 cache->commit_used = cache->used; 2241 cache->flags = btrfs_stack_block_group_flags(bgi); 2242 cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); 2243 2244 set_free_space_tree_thresholds(cache); 2245 2246 if (need_clear) { 2247 /* 2248 * When we mount with old space cache, we need to 2249 * set BTRFS_DC_CLEAR and set dirty flag. 2250 * 2251 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 2252 * truncate the old free space cache inode and 2253 * setup a new one. 2254 * b) Setting 'dirty flag' makes sure that we flush 2255 * the new space cache info onto disk. 2256 */ 2257 if (btrfs_test_opt(info, SPACE_CACHE)) 2258 cache->disk_cache_state = BTRFS_DC_CLEAR; 2259 } 2260 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 2261 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 2262 btrfs_err(info, 2263 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 2264 cache->start); 2265 ret = -EINVAL; 2266 goto error; 2267 } 2268 2269 ret = btrfs_load_block_group_zone_info(cache, false); 2270 if (ret) { 2271 btrfs_err(info, "zoned: failed to load zone info of bg %llu", 2272 cache->start); 2273 goto error; 2274 } 2275 2276 /* 2277 * We need to exclude the super stripes now so that the space info has 2278 * super bytes accounted for, otherwise we'll think we have more space 2279 * than we actually do. 2280 */ 2281 ret = exclude_super_stripes(cache); 2282 if (ret) { 2283 /* We may have excluded something, so call this just in case. */ 2284 btrfs_free_excluded_extents(cache); 2285 goto error; 2286 } 2287 2288 /* 2289 * For zoned filesystem, space after the allocation offset is the only 2290 * free space for a block group. So, we don't need any caching work. 
2291 * btrfs_calc_zone_unusable() will set the amount of free space and 2292 * zone_unusable space. 2293 * 2294 * For regular filesystem, check for two cases, either we are full, and 2295 * therefore don't need to bother with the caching work since we won't 2296 * find any space, or we are empty, and we can just add all the space 2297 * in and be done with it. This saves us _a_lot_ of time, particularly 2298 * in the full case. 2299 */ 2300 if (btrfs_is_zoned(info)) { 2301 btrfs_calc_zone_unusable(cache); 2302 /* Should not have any excluded extents. Just in case, though. */ 2303 btrfs_free_excluded_extents(cache); 2304 } else if (cache->length == cache->used) { 2305 cache->cached = BTRFS_CACHE_FINISHED; 2306 btrfs_free_excluded_extents(cache); 2307 } else if (cache->used == 0) { 2308 cache->cached = BTRFS_CACHE_FINISHED; 2309 ret = add_new_free_space(cache, cache->start, 2310 cache->start + cache->length, NULL); 2311 btrfs_free_excluded_extents(cache); 2312 if (ret) 2313 goto error; 2314 } 2315 2316 ret = btrfs_add_block_group_cache(info, cache); 2317 if (ret) { 2318 btrfs_remove_free_space_cache(cache); 2319 goto error; 2320 } 2321 trace_btrfs_add_block_group(info, cache, 0); 2322 btrfs_add_bg_to_space_info(info, cache); 2323 2324 set_avail_alloc_bits(info, cache->flags); 2325 if (btrfs_chunk_writeable(info, cache->start)) { 2326 if (cache->used == 0) { 2327 ASSERT(list_empty(&cache->bg_list)); 2328 if (btrfs_test_opt(info, DISCARD_ASYNC)) 2329 btrfs_discard_queue_work(&info->discard_ctl, cache); 2330 else 2331 btrfs_mark_bg_unused(cache); 2332 } 2333 } else { 2334 inc_block_group_ro(cache, 1); 2335 } 2336 2337 return 0; 2338 error: 2339 btrfs_put_block_group(cache); 2340 return ret; 2341 } 2342 2343 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) 2344 { 2345 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 2346 struct rb_node *node; 2347 int ret = 0; 2348 2349 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 2350 struct extent_map *em; 2351 struct map_lookup *map; 2352 struct btrfs_block_group *bg; 2353 2354 em = rb_entry(node, struct extent_map, rb_node); 2355 map = em->map_lookup; 2356 bg = btrfs_create_block_group_cache(fs_info, em->start); 2357 if (!bg) { 2358 ret = -ENOMEM; 2359 break; 2360 } 2361 2362 /* Fill dummy cache as FULL */ 2363 bg->length = em->len; 2364 bg->flags = map->type; 2365 bg->cached = BTRFS_CACHE_FINISHED; 2366 bg->used = em->len; 2367 bg->flags = map->type; 2368 ret = btrfs_add_block_group_cache(fs_info, bg); 2369 /* 2370 * We may have some valid block group cache added already, in 2371 * that case we skip to the next one. 2372 */ 2373 if (ret == -EEXIST) { 2374 ret = 0; 2375 btrfs_put_block_group(bg); 2376 continue; 2377 } 2378 2379 if (ret) { 2380 btrfs_remove_free_space_cache(bg); 2381 btrfs_put_block_group(bg); 2382 break; 2383 } 2384 2385 btrfs_add_bg_to_space_info(fs_info, bg); 2386 2387 set_avail_alloc_bits(fs_info, bg->flags); 2388 } 2389 if (!ret) 2390 btrfs_init_global_block_rsv(fs_info); 2391 return ret; 2392 } 2393 2394 int btrfs_read_block_groups(struct btrfs_fs_info *info) 2395 { 2396 struct btrfs_root *root = btrfs_block_group_root(info); 2397 struct btrfs_path *path; 2398 int ret; 2399 struct btrfs_block_group *cache; 2400 struct btrfs_space_info *space_info; 2401 struct btrfs_key key; 2402 int need_clear = 0; 2403 u64 cache_gen; 2404 2405 /* 2406 * Either no extent root (with ibadroots rescue option) or we have 2407 * unsupported RO options. 
The fs can never be mounted read-write, so no 2408 * need to waste time searching block group items. 2409 * 2410 * This also allows new extent tree related changes to be RO compat, 2411 * no need for a full incompat flag. 2412 */ 2413 if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & 2414 ~BTRFS_FEATURE_COMPAT_RO_SUPP)) 2415 return fill_dummy_bgs(info); 2416 2417 key.objectid = 0; 2418 key.offset = 0; 2419 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2420 path = btrfs_alloc_path(); 2421 if (!path) 2422 return -ENOMEM; 2423 2424 cache_gen = btrfs_super_cache_generation(info->super_copy); 2425 if (btrfs_test_opt(info, SPACE_CACHE) && 2426 btrfs_super_generation(info->super_copy) != cache_gen) 2427 need_clear = 1; 2428 if (btrfs_test_opt(info, CLEAR_CACHE)) 2429 need_clear = 1; 2430 2431 while (1) { 2432 struct btrfs_block_group_item bgi; 2433 struct extent_buffer *leaf; 2434 int slot; 2435 2436 ret = find_first_block_group(info, path, &key); 2437 if (ret > 0) 2438 break; 2439 if (ret != 0) 2440 goto error; 2441 2442 leaf = path->nodes[0]; 2443 slot = path->slots[0]; 2444 2445 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 2446 sizeof(bgi)); 2447 2448 btrfs_item_key_to_cpu(leaf, &key, slot); 2449 btrfs_release_path(path); 2450 ret = read_one_block_group(info, &bgi, &key, need_clear); 2451 if (ret < 0) 2452 goto error; 2453 key.objectid += key.offset; 2454 key.offset = 0; 2455 } 2456 btrfs_release_path(path); 2457 2458 list_for_each_entry(space_info, &info->space_info, list) { 2459 int i; 2460 2461 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2462 if (list_empty(&space_info->block_groups[i])) 2463 continue; 2464 cache = list_first_entry(&space_info->block_groups[i], 2465 struct btrfs_block_group, 2466 list); 2467 btrfs_sysfs_add_block_group_type(cache); 2468 } 2469 2470 if (!(btrfs_get_alloc_profile(info, space_info->flags) & 2471 (BTRFS_BLOCK_GROUP_RAID10 | 2472 BTRFS_BLOCK_GROUP_RAID1_MASK | 2473 BTRFS_BLOCK_GROUP_RAID56_MASK | 2474 BTRFS_BLOCK_GROUP_DUP))) 2475 continue; 2476 /* 2477 * Avoid allocating from un-mirrored block group if there are 2478 * mirrored block groups. 2479 */ 2480 list_for_each_entry(cache, 2481 &space_info->block_groups[BTRFS_RAID_RAID0], 2482 list) 2483 inc_block_group_ro(cache, 1); 2484 list_for_each_entry(cache, 2485 &space_info->block_groups[BTRFS_RAID_SINGLE], 2486 list) 2487 inc_block_group_ro(cache, 1); 2488 } 2489 2490 btrfs_init_global_block_rsv(info); 2491 ret = check_chunk_block_group_mappings(info); 2492 error: 2493 btrfs_free_path(path); 2494 /* 2495 * We've hit some error while reading the extent tree, and have 2496 * rescue=ibadroots mount option. 2497 * Try to fill the tree using dummy block groups so that the user can 2498 * continue to mount and grab their data. 2499 */ 2500 if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) 2501 ret = fill_dummy_bgs(info); 2502 return ret; 2503 } 2504 2505 /* 2506 * This function, insert_block_group_item(), belongs to the phase 2 of chunk 2507 * allocation. 2508 * 2509 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2510 * phases. 
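 *
 * The block group item inserted here is keyed as (block group start,
 * BTRFS_BLOCK_GROUP_ITEM_KEY, block group length), the same key layout
 * that btrfs_read_block_groups() looks up at mount time.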
2511 */ 2512 static int insert_block_group_item(struct btrfs_trans_handle *trans, 2513 struct btrfs_block_group *block_group) 2514 { 2515 struct btrfs_fs_info *fs_info = trans->fs_info; 2516 struct btrfs_block_group_item bgi; 2517 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2518 struct btrfs_key key; 2519 u64 old_commit_used; 2520 int ret; 2521 2522 spin_lock(&block_group->lock); 2523 btrfs_set_stack_block_group_used(&bgi, block_group->used); 2524 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2525 block_group->global_root_id); 2526 btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 2527 old_commit_used = block_group->commit_used; 2528 block_group->commit_used = block_group->used; 2529 key.objectid = block_group->start; 2530 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2531 key.offset = block_group->length; 2532 spin_unlock(&block_group->lock); 2533 2534 ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); 2535 if (ret < 0) { 2536 spin_lock(&block_group->lock); 2537 block_group->commit_used = old_commit_used; 2538 spin_unlock(&block_group->lock); 2539 } 2540 2541 return ret; 2542 } 2543 2544 static int insert_dev_extent(struct btrfs_trans_handle *trans, 2545 struct btrfs_device *device, u64 chunk_offset, 2546 u64 start, u64 num_bytes) 2547 { 2548 struct btrfs_fs_info *fs_info = device->fs_info; 2549 struct btrfs_root *root = fs_info->dev_root; 2550 struct btrfs_path *path; 2551 struct btrfs_dev_extent *extent; 2552 struct extent_buffer *leaf; 2553 struct btrfs_key key; 2554 int ret; 2555 2556 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 2557 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 2558 path = btrfs_alloc_path(); 2559 if (!path) 2560 return -ENOMEM; 2561 2562 key.objectid = device->devid; 2563 key.type = BTRFS_DEV_EXTENT_KEY; 2564 key.offset = start; 2565 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); 2566 if (ret) 2567 goto out; 2568 2569 leaf = path->nodes[0]; 2570 extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 2571 btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); 2572 btrfs_set_dev_extent_chunk_objectid(leaf, extent, 2573 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2574 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 2575 2576 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 2577 btrfs_mark_buffer_dirty(leaf); 2578 out: 2579 btrfs_free_path(path); 2580 return ret; 2581 } 2582 2583 /* 2584 * This function belongs to phase 2. 2585 * 2586 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2587 * phases. 
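 *
 * One device extent item is inserted per stripe of the chunk, keyed by
 * (devid, BTRFS_DEV_EXTENT_KEY, physical offset) as set up in
 * insert_dev_extent() above.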
2588 */ 2589 static int insert_dev_extents(struct btrfs_trans_handle *trans, 2590 u64 chunk_offset, u64 chunk_size) 2591 { 2592 struct btrfs_fs_info *fs_info = trans->fs_info; 2593 struct btrfs_device *device; 2594 struct extent_map *em; 2595 struct map_lookup *map; 2596 u64 dev_offset; 2597 u64 stripe_size; 2598 int i; 2599 int ret = 0; 2600 2601 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 2602 if (IS_ERR(em)) 2603 return PTR_ERR(em); 2604 2605 map = em->map_lookup; 2606 stripe_size = em->orig_block_len; 2607 2608 /* 2609 * Take the device list mutex to prevent races with the final phase of 2610 * a device replace operation that replaces the device object associated 2611 * with the map's stripes, because the device object's id can change 2612 * at any time during that final phase of the device replace operation 2613 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 2614 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 2615 * resulting in persisting a device extent item with such ID. 2616 */ 2617 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2618 for (i = 0; i < map->num_stripes; i++) { 2619 device = map->stripes[i].dev; 2620 dev_offset = map->stripes[i].physical; 2621 2622 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, 2623 stripe_size); 2624 if (ret) 2625 break; 2626 } 2627 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2628 2629 free_extent_map(em); 2630 return ret; 2631 } 2632 2633 /* 2634 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of 2635 * chunk allocation. 2636 * 2637 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2638 * phases. 2639 */ 2640 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 2641 { 2642 struct btrfs_fs_info *fs_info = trans->fs_info; 2643 struct btrfs_block_group *block_group; 2644 int ret = 0; 2645 2646 while (!list_empty(&trans->new_bgs)) { 2647 int index; 2648 2649 block_group = list_first_entry(&trans->new_bgs, 2650 struct btrfs_block_group, 2651 bg_list); 2652 if (ret) 2653 goto next; 2654 2655 index = btrfs_bg_flags_to_raid_index(block_group->flags); 2656 2657 ret = insert_block_group_item(trans, block_group); 2658 if (ret) 2659 btrfs_abort_transaction(trans, ret); 2660 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, 2661 &block_group->runtime_flags)) { 2662 mutex_lock(&fs_info->chunk_mutex); 2663 ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); 2664 mutex_unlock(&fs_info->chunk_mutex); 2665 if (ret) 2666 btrfs_abort_transaction(trans, ret); 2667 } 2668 ret = insert_dev_extents(trans, block_group->start, 2669 block_group->length); 2670 if (ret) 2671 btrfs_abort_transaction(trans, ret); 2672 add_block_group_free_space(trans, block_group); 2673 2674 /* 2675 * If we restriped during balance, we may have added a new raid 2676 * type, so now add the sysfs entries when it is safe to do so. 2677 * We don't have to worry about locking here as it's handled in 2678 * btrfs_sysfs_add_block_group_type. 2679 */ 2680 if (block_group->space_info->block_group_kobjs[index] == NULL) 2681 btrfs_sysfs_add_block_group_type(block_group); 2682 2683 /* Already aborted the transaction if it failed. 
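 * Either way, drop the delayed refs reservation and remove the block group
 * from the transaction's new_bgs list so the loop can make progress.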
*/ 2684 next: 2685 btrfs_delayed_refs_rsv_release(fs_info, 1); 2686 list_del_init(&block_group->bg_list); 2687 clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); 2688 } 2689 btrfs_trans_release_chunk_metadata(trans); 2690 } 2691 2692 /* 2693 * For extent tree v2 we use the block_group_item->chunk_offset to point at our 2694 * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. 2695 */ 2696 static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) 2697 { 2698 u64 div = SZ_1G; 2699 u64 index; 2700 2701 if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 2702 return BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2703 2704 /* If we have a smaller fs index based on 128MiB. */ 2705 if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) 2706 div = SZ_128M; 2707 2708 offset = div64_u64(offset, div); 2709 div64_u64_rem(offset, fs_info->nr_global_roots, &index); 2710 return index; 2711 } 2712 2713 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, 2714 u64 type, 2715 u64 chunk_offset, u64 size) 2716 { 2717 struct btrfs_fs_info *fs_info = trans->fs_info; 2718 struct btrfs_block_group *cache; 2719 int ret; 2720 2721 btrfs_set_log_full_commit(trans); 2722 2723 cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 2724 if (!cache) 2725 return ERR_PTR(-ENOMEM); 2726 2727 /* 2728 * Mark it as new before adding it to the rbtree of block groups or any 2729 * list, so that no other task finds it and calls btrfs_mark_bg_unused() 2730 * before the new flag is set. 2731 */ 2732 set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); 2733 2734 cache->length = size; 2735 set_free_space_tree_thresholds(cache); 2736 cache->flags = type; 2737 cache->cached = BTRFS_CACHE_FINISHED; 2738 cache->global_root_id = calculate_global_root_id(fs_info, cache->start); 2739 2740 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 2741 set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); 2742 2743 ret = btrfs_load_block_group_zone_info(cache, true); 2744 if (ret) { 2745 btrfs_put_block_group(cache); 2746 return ERR_PTR(ret); 2747 } 2748 2749 ret = exclude_super_stripes(cache); 2750 if (ret) { 2751 /* We may have excluded something, so call this just in case */ 2752 btrfs_free_excluded_extents(cache); 2753 btrfs_put_block_group(cache); 2754 return ERR_PTR(ret); 2755 } 2756 2757 ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); 2758 btrfs_free_excluded_extents(cache); 2759 if (ret) { 2760 btrfs_put_block_group(cache); 2761 return ERR_PTR(ret); 2762 } 2763 2764 /* 2765 * Ensure the corresponding space_info object is created and 2766 * assigned to our block group. We want our bg to be added to the rbtree 2767 * with its ->space_info set. 2768 */ 2769 cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 2770 ASSERT(cache->space_info); 2771 2772 ret = btrfs_add_block_group_cache(fs_info, cache); 2773 if (ret) { 2774 btrfs_remove_free_space_cache(cache); 2775 btrfs_put_block_group(cache); 2776 return ERR_PTR(ret); 2777 } 2778 2779 /* 2780 * Now that our block group has its ->space_info set and is inserted in 2781 * the rbtree, update the space info's counters. 
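 * btrfs_add_bg_to_space_info() below does that accounting, and the global
 * block reserve is refreshed right afterwards.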
2782 */ 2783 trace_btrfs_add_block_group(fs_info, cache, 1); 2784 btrfs_add_bg_to_space_info(fs_info, cache); 2785 btrfs_update_global_block_rsv(fs_info); 2786 2787 #ifdef CONFIG_BTRFS_DEBUG 2788 if (btrfs_should_fragment_free_space(cache)) { 2789 cache->space_info->bytes_used += size >> 1; 2790 fragment_free_space(cache); 2791 } 2792 #endif 2793 2794 list_add_tail(&cache->bg_list, &trans->new_bgs); 2795 trans->delayed_ref_updates++; 2796 btrfs_update_delayed_refs_rsv(trans); 2797 2798 set_avail_alloc_bits(fs_info, type); 2799 return cache; 2800 } 2801 2802 /* 2803 * Mark one block group RO, can be called several times for the same block 2804 * group. 2805 * 2806 * @cache: the destination block group 2807 * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to 2808 * ensure we still have some free space after marking this 2809 * block group RO. 2810 */ 2811 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, 2812 bool do_chunk_alloc) 2813 { 2814 struct btrfs_fs_info *fs_info = cache->fs_info; 2815 struct btrfs_trans_handle *trans; 2816 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2817 u64 alloc_flags; 2818 int ret; 2819 bool dirty_bg_running; 2820 2821 /* 2822 * This can only happen when we are doing read-only scrub on read-only 2823 * mount. 2824 * In that case we should not start a new transaction on read-only fs. 2825 * Thus here we skip all chunk allocations. 2826 */ 2827 if (sb_rdonly(fs_info->sb)) { 2828 mutex_lock(&fs_info->ro_block_group_mutex); 2829 ret = inc_block_group_ro(cache, 0); 2830 mutex_unlock(&fs_info->ro_block_group_mutex); 2831 return ret; 2832 } 2833 2834 do { 2835 trans = btrfs_join_transaction(root); 2836 if (IS_ERR(trans)) 2837 return PTR_ERR(trans); 2838 2839 dirty_bg_running = false; 2840 2841 /* 2842 * We're not allowed to set block groups readonly after the dirty 2843 * block group cache has started writing. If it already started, 2844 * back off and let this transaction commit. 2845 */ 2846 mutex_lock(&fs_info->ro_block_group_mutex); 2847 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 2848 u64 transid = trans->transid; 2849 2850 mutex_unlock(&fs_info->ro_block_group_mutex); 2851 btrfs_end_transaction(trans); 2852 2853 ret = btrfs_wait_for_commit(fs_info, transid); 2854 if (ret) 2855 return ret; 2856 dirty_bg_running = true; 2857 } 2858 } while (dirty_bg_running); 2859 2860 if (do_chunk_alloc) { 2861 /* 2862 * If we are changing raid levels, try to allocate a 2863 * corresponding block group with the new raid level. 2864 */ 2865 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2866 if (alloc_flags != cache->flags) { 2867 ret = btrfs_chunk_alloc(trans, alloc_flags, 2868 CHUNK_ALLOC_FORCE); 2869 /* 2870 * ENOSPC is allowed here, we may have enough space 2871 * already allocated at the new raid level to carry on 2872 */ 2873 if (ret == -ENOSPC) 2874 ret = 0; 2875 if (ret < 0) 2876 goto out; 2877 } 2878 } 2879 2880 ret = inc_block_group_ro(cache, 0); 2881 if (!ret) 2882 goto out; 2883 if (ret == -ETXTBSY) 2884 goto unlock_out; 2885 2886 /* 2887 * Skip chunk alloction if the bg is SYSTEM, this is to avoid system 2888 * chunk allocation storm to exhaust the system chunk array. Otherwise 2889 * we still want to try our best to mark the block group read-only. 
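 * That retry path force-allocates a new chunk below and then calls
 * inc_block_group_ro() a second time.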
2890 */ 2891 if (!do_chunk_alloc && ret == -ENOSPC && 2892 (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) 2893 goto unlock_out; 2894 2895 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); 2896 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 2897 if (ret < 0) 2898 goto out; 2899 /* 2900 * We have allocated a new chunk. We also need to activate that chunk to 2901 * grant metadata tickets for zoned filesystem. 2902 */ 2903 ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); 2904 if (ret < 0) 2905 goto out; 2906 2907 ret = inc_block_group_ro(cache, 0); 2908 if (ret == -ETXTBSY) 2909 goto unlock_out; 2910 out: 2911 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 2912 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2913 mutex_lock(&fs_info->chunk_mutex); 2914 check_system_chunk(trans, alloc_flags); 2915 mutex_unlock(&fs_info->chunk_mutex); 2916 } 2917 unlock_out: 2918 mutex_unlock(&fs_info->ro_block_group_mutex); 2919 2920 btrfs_end_transaction(trans); 2921 return ret; 2922 } 2923 2924 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 2925 { 2926 struct btrfs_space_info *sinfo = cache->space_info; 2927 u64 num_bytes; 2928 2929 BUG_ON(!cache->ro); 2930 2931 spin_lock(&sinfo->lock); 2932 spin_lock(&cache->lock); 2933 if (!--cache->ro) { 2934 if (btrfs_is_zoned(cache->fs_info)) { 2935 /* Migrate zone_unusable bytes back */ 2936 cache->zone_unusable = 2937 (cache->alloc_offset - cache->used) + 2938 (cache->length - cache->zone_capacity); 2939 sinfo->bytes_zone_unusable += cache->zone_unusable; 2940 sinfo->bytes_readonly -= cache->zone_unusable; 2941 } 2942 num_bytes = cache->length - cache->reserved - 2943 cache->pinned - cache->bytes_super - 2944 cache->zone_unusable - cache->used; 2945 sinfo->bytes_readonly -= num_bytes; 2946 list_del_init(&cache->ro_list); 2947 } 2948 spin_unlock(&cache->lock); 2949 spin_unlock(&sinfo->lock); 2950 } 2951 2952 static int update_block_group_item(struct btrfs_trans_handle *trans, 2953 struct btrfs_path *path, 2954 struct btrfs_block_group *cache) 2955 { 2956 struct btrfs_fs_info *fs_info = trans->fs_info; 2957 int ret; 2958 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2959 unsigned long bi; 2960 struct extent_buffer *leaf; 2961 struct btrfs_block_group_item bgi; 2962 struct btrfs_key key; 2963 u64 old_commit_used; 2964 u64 used; 2965 2966 /* 2967 * Block group items update can be triggered out of commit transaction 2968 * critical section, thus we need a consistent view of used bytes. 2969 * We cannot use cache->used directly outside of the spin lock, as it 2970 * may be changed. 2971 */ 2972 spin_lock(&cache->lock); 2973 old_commit_used = cache->commit_used; 2974 used = cache->used; 2975 /* No change in used bytes, can safely skip it. 
*/ 2976 if (cache->commit_used == used) { 2977 spin_unlock(&cache->lock); 2978 return 0; 2979 } 2980 cache->commit_used = used; 2981 spin_unlock(&cache->lock); 2982 2983 key.objectid = cache->start; 2984 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2985 key.offset = cache->length; 2986 2987 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2988 if (ret) { 2989 if (ret > 0) 2990 ret = -ENOENT; 2991 goto fail; 2992 } 2993 2994 leaf = path->nodes[0]; 2995 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2996 btrfs_set_stack_block_group_used(&bgi, used); 2997 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2998 cache->global_root_id); 2999 btrfs_set_stack_block_group_flags(&bgi, cache->flags); 3000 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 3001 btrfs_mark_buffer_dirty(leaf); 3002 fail: 3003 btrfs_release_path(path); 3004 /* We didn't update the block group item, need to revert @commit_used. */ 3005 if (ret < 0) { 3006 spin_lock(&cache->lock); 3007 cache->commit_used = old_commit_used; 3008 spin_unlock(&cache->lock); 3009 } 3010 return ret; 3011 3012 } 3013 3014 static int cache_save_setup(struct btrfs_block_group *block_group, 3015 struct btrfs_trans_handle *trans, 3016 struct btrfs_path *path) 3017 { 3018 struct btrfs_fs_info *fs_info = block_group->fs_info; 3019 struct btrfs_root *root = fs_info->tree_root; 3020 struct inode *inode = NULL; 3021 struct extent_changeset *data_reserved = NULL; 3022 u64 alloc_hint = 0; 3023 int dcs = BTRFS_DC_ERROR; 3024 u64 cache_size = 0; 3025 int retries = 0; 3026 int ret = 0; 3027 3028 if (!btrfs_test_opt(fs_info, SPACE_CACHE)) 3029 return 0; 3030 3031 /* 3032 * If this block group is smaller than 100 megs don't bother caching the 3033 * block group. 3034 */ 3035 if (block_group->length < (100 * SZ_1M)) { 3036 spin_lock(&block_group->lock); 3037 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3038 spin_unlock(&block_group->lock); 3039 return 0; 3040 } 3041 3042 if (TRANS_ABORTED(trans)) 3043 return 0; 3044 again: 3045 inode = lookup_free_space_inode(block_group, path); 3046 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3047 ret = PTR_ERR(inode); 3048 btrfs_release_path(path); 3049 goto out; 3050 } 3051 3052 if (IS_ERR(inode)) { 3053 BUG_ON(retries); 3054 retries++; 3055 3056 if (block_group->ro) 3057 goto out_free; 3058 3059 ret = create_free_space_inode(trans, block_group, path); 3060 if (ret) 3061 goto out_free; 3062 goto again; 3063 } 3064 3065 /* 3066 * We want to set the generation to 0, that way if anything goes wrong 3067 * from here on out we know not to trust this cache when we load up next 3068 * time. 3069 */ 3070 BTRFS_I(inode)->generation = 0; 3071 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 3072 if (ret) { 3073 /* 3074 * So theoretically we could recover from this, simply set the 3075 * super cache generation to 0 so we know to invalidate the 3076 * cache, but then we'd have to keep track of the block groups 3077 * that fail this way so we know we _have_ to reset this cache 3078 * before the next commit or risk reading stale cache. So to 3079 * limit our exposure to horrible edge cases lets just abort the 3080 * transaction, this only happens in really bad situations 3081 * anyway. 
3082 */ 3083 btrfs_abort_transaction(trans, ret); 3084 goto out_put; 3085 } 3086 WARN_ON(ret); 3087 3088 /* We've already setup this transaction, go ahead and exit */ 3089 if (block_group->cache_generation == trans->transid && 3090 i_size_read(inode)) { 3091 dcs = BTRFS_DC_SETUP; 3092 goto out_put; 3093 } 3094 3095 if (i_size_read(inode) > 0) { 3096 ret = btrfs_check_trunc_cache_free_space(fs_info, 3097 &fs_info->global_block_rsv); 3098 if (ret) 3099 goto out_put; 3100 3101 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3102 if (ret) 3103 goto out_put; 3104 } 3105 3106 spin_lock(&block_group->lock); 3107 if (block_group->cached != BTRFS_CACHE_FINISHED || 3108 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3109 /* 3110 * don't bother trying to write stuff out _if_ 3111 * a) we're not cached, 3112 * b) we're with nospace_cache mount option, 3113 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3114 */ 3115 dcs = BTRFS_DC_WRITTEN; 3116 spin_unlock(&block_group->lock); 3117 goto out_put; 3118 } 3119 spin_unlock(&block_group->lock); 3120 3121 /* 3122 * We hit an ENOSPC when setting up the cache in this transaction, just 3123 * skip doing the setup, we've already cleared the cache so we're safe. 3124 */ 3125 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3126 ret = -ENOSPC; 3127 goto out_put; 3128 } 3129 3130 /* 3131 * Try to preallocate enough space based on how big the block group is. 3132 * Keep in mind this has to include any pinned space which could end up 3133 * taking up quite a bit since it's not folded into the other space 3134 * cache. 3135 */ 3136 cache_size = div_u64(block_group->length, SZ_256M); 3137 if (!cache_size) 3138 cache_size = 1; 3139 3140 cache_size *= 16; 3141 cache_size *= fs_info->sectorsize; 3142 3143 ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, 3144 cache_size, false); 3145 if (ret) 3146 goto out_put; 3147 3148 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, 3149 cache_size, cache_size, 3150 &alloc_hint); 3151 /* 3152 * Our cache requires contiguous chunks so that we don't modify a bunch 3153 * of metadata or split extents when writing the cache out, which means 3154 * we can enospc if we are heavily fragmented in addition to just normal 3155 * out of space conditions. So if we hit this just skip setting up any 3156 * other block groups for this transaction, maybe we'll unpin enough 3157 * space the next time around. 
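 *
 * As a rough illustration of the sizing above: a 1GiB block group with a
 * 4KiB sectorsize preallocates (1GiB / 256MiB) * 16 * 4KiB = 256KiB for
 * the free space cache.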
3158 */ 3159 if (!ret) 3160 dcs = BTRFS_DC_SETUP; 3161 else if (ret == -ENOSPC) 3162 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3163 3164 out_put: 3165 iput(inode); 3166 out_free: 3167 btrfs_release_path(path); 3168 out: 3169 spin_lock(&block_group->lock); 3170 if (!ret && dcs == BTRFS_DC_SETUP) 3171 block_group->cache_generation = trans->transid; 3172 block_group->disk_cache_state = dcs; 3173 spin_unlock(&block_group->lock); 3174 3175 extent_changeset_free(data_reserved); 3176 return ret; 3177 } 3178 3179 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 3180 { 3181 struct btrfs_fs_info *fs_info = trans->fs_info; 3182 struct btrfs_block_group *cache, *tmp; 3183 struct btrfs_transaction *cur_trans = trans->transaction; 3184 struct btrfs_path *path; 3185 3186 if (list_empty(&cur_trans->dirty_bgs) || 3187 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3188 return 0; 3189 3190 path = btrfs_alloc_path(); 3191 if (!path) 3192 return -ENOMEM; 3193 3194 /* Could add new block groups, use _safe just in case */ 3195 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3196 dirty_list) { 3197 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3198 cache_save_setup(cache, trans, path); 3199 } 3200 3201 btrfs_free_path(path); 3202 return 0; 3203 } 3204 3205 /* 3206 * Transaction commit does final block group cache writeback during a critical 3207 * section where nothing is allowed to change the FS. This is required in 3208 * order for the cache to actually match the block group, but can introduce a 3209 * lot of latency into the commit. 3210 * 3211 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 3212 * There's a chance we'll have to redo some of it if the block group changes 3213 * again during the commit, but it greatly reduces the commit latency by 3214 * getting rid of the easy block groups while we're still allowing others to 3215 * join the commit. 3216 */ 3217 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 3218 { 3219 struct btrfs_fs_info *fs_info = trans->fs_info; 3220 struct btrfs_block_group *cache; 3221 struct btrfs_transaction *cur_trans = trans->transaction; 3222 int ret = 0; 3223 int should_put; 3224 struct btrfs_path *path = NULL; 3225 LIST_HEAD(dirty); 3226 struct list_head *io = &cur_trans->io_bgs; 3227 int loops = 0; 3228 3229 spin_lock(&cur_trans->dirty_bgs_lock); 3230 if (list_empty(&cur_trans->dirty_bgs)) { 3231 spin_unlock(&cur_trans->dirty_bgs_lock); 3232 return 0; 3233 } 3234 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3235 spin_unlock(&cur_trans->dirty_bgs_lock); 3236 3237 again: 3238 /* Make sure all the block groups on our dirty list actually exist */ 3239 btrfs_create_pending_block_groups(trans); 3240 3241 if (!path) { 3242 path = btrfs_alloc_path(); 3243 if (!path) { 3244 ret = -ENOMEM; 3245 goto out; 3246 } 3247 } 3248 3249 /* 3250 * cache_write_mutex is here only to save us from balance or automatic 3251 * removal of empty block groups deleting this block group while we are 3252 * writing out the cache 3253 */ 3254 mutex_lock(&trans->transaction->cache_write_mutex); 3255 while (!list_empty(&dirty)) { 3256 bool drop_reserve = true; 3257 3258 cache = list_first_entry(&dirty, struct btrfs_block_group, 3259 dirty_list); 3260 /* 3261 * This can happen if something re-dirties a block group that 3262 * is already under IO. 
Just wait for it to finish and then do 3263 * it all again 3264 */ 3265 if (!list_empty(&cache->io_list)) { 3266 list_del_init(&cache->io_list); 3267 btrfs_wait_cache_io(trans, cache, path); 3268 btrfs_put_block_group(cache); 3269 } 3270 3271 3272 /* 3273 * btrfs_wait_cache_io uses the cache->dirty_list to decide if 3274 * it should update the cache_state. Don't delete until after 3275 * we wait. 3276 * 3277 * Since we're not running in the commit critical section 3278 * we need the dirty_bgs_lock to protect from update_block_group 3279 */ 3280 spin_lock(&cur_trans->dirty_bgs_lock); 3281 list_del_init(&cache->dirty_list); 3282 spin_unlock(&cur_trans->dirty_bgs_lock); 3283 3284 should_put = 1; 3285 3286 cache_save_setup(cache, trans, path); 3287 3288 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3289 cache->io_ctl.inode = NULL; 3290 ret = btrfs_write_out_cache(trans, cache, path); 3291 if (ret == 0 && cache->io_ctl.inode) { 3292 should_put = 0; 3293 3294 /* 3295 * The cache_write_mutex is protecting the 3296 * io_list, also refer to the definition of 3297 * btrfs_transaction::io_bgs for more details 3298 */ 3299 list_add_tail(&cache->io_list, io); 3300 } else { 3301 /* 3302 * If we failed to write the cache, the 3303 * generation will be bad and life goes on 3304 */ 3305 ret = 0; 3306 } 3307 } 3308 if (!ret) { 3309 ret = update_block_group_item(trans, path, cache); 3310 /* 3311 * Our block group might still be attached to the list 3312 * of new block groups in the transaction handle of some 3313 * other task (struct btrfs_trans_handle->new_bgs). This 3314 * means its block group item isn't yet in the extent 3315 * tree. If this happens ignore the error, as we will 3316 * try again later in the critical section of the 3317 * transaction commit. 3318 */ 3319 if (ret == -ENOENT) { 3320 ret = 0; 3321 spin_lock(&cur_trans->dirty_bgs_lock); 3322 if (list_empty(&cache->dirty_list)) { 3323 list_add_tail(&cache->dirty_list, 3324 &cur_trans->dirty_bgs); 3325 btrfs_get_block_group(cache); 3326 drop_reserve = false; 3327 } 3328 spin_unlock(&cur_trans->dirty_bgs_lock); 3329 } else if (ret) { 3330 btrfs_abort_transaction(trans, ret); 3331 } 3332 } 3333 3334 /* If it's not on the io list, we need to put the block group */ 3335 if (should_put) 3336 btrfs_put_block_group(cache); 3337 if (drop_reserve) 3338 btrfs_delayed_refs_rsv_release(fs_info, 1); 3339 /* 3340 * Avoid blocking other tasks for too long. It might even save 3341 * us from writing caches for block groups that are going to be 3342 * removed. 3343 */ 3344 mutex_unlock(&trans->transaction->cache_write_mutex); 3345 if (ret) 3346 goto out; 3347 mutex_lock(&trans->transaction->cache_write_mutex); 3348 } 3349 mutex_unlock(&trans->transaction->cache_write_mutex); 3350 3351 /* 3352 * Go through delayed refs for all the stuff we've just kicked off 3353 * and then loop back (just once) 3354 */ 3355 if (!ret) 3356 ret = btrfs_run_delayed_refs(trans, 0); 3357 if (!ret && loops == 0) { 3358 loops++; 3359 spin_lock(&cur_trans->dirty_bgs_lock); 3360 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3361 /* 3362 * dirty_bgs_lock protects us from concurrent block group 3363 * deletes too (not just cache_write_mutex). 
3364 */ 3365 if (!list_empty(&dirty)) { 3366 spin_unlock(&cur_trans->dirty_bgs_lock); 3367 goto again; 3368 } 3369 spin_unlock(&cur_trans->dirty_bgs_lock); 3370 } 3371 out: 3372 if (ret < 0) { 3373 spin_lock(&cur_trans->dirty_bgs_lock); 3374 list_splice_init(&dirty, &cur_trans->dirty_bgs); 3375 spin_unlock(&cur_trans->dirty_bgs_lock); 3376 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3377 } 3378 3379 btrfs_free_path(path); 3380 return ret; 3381 } 3382 3383 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 3384 { 3385 struct btrfs_fs_info *fs_info = trans->fs_info; 3386 struct btrfs_block_group *cache; 3387 struct btrfs_transaction *cur_trans = trans->transaction; 3388 int ret = 0; 3389 int should_put; 3390 struct btrfs_path *path; 3391 struct list_head *io = &cur_trans->io_bgs; 3392 3393 path = btrfs_alloc_path(); 3394 if (!path) 3395 return -ENOMEM; 3396 3397 /* 3398 * Even though we are in the critical section of the transaction commit, 3399 * we can still have concurrent tasks adding elements to this 3400 * transaction's list of dirty block groups. These tasks correspond to 3401 * endio free space workers started when writeback finishes for a 3402 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3403 * allocate new block groups as a result of COWing nodes of the root 3404 * tree when updating the free space inode. The writeback for the space 3405 * caches is triggered by an earlier call to 3406 * btrfs_start_dirty_block_groups() and iterations of the following 3407 * loop. 3408 * Also we want to do the cache_save_setup first and then run the 3409 * delayed refs to make sure we have the best chance at doing this all 3410 * in one shot. 3411 */ 3412 spin_lock(&cur_trans->dirty_bgs_lock); 3413 while (!list_empty(&cur_trans->dirty_bgs)) { 3414 cache = list_first_entry(&cur_trans->dirty_bgs, 3415 struct btrfs_block_group, 3416 dirty_list); 3417 3418 /* 3419 * This can happen if cache_save_setup re-dirties a block group 3420 * that is already under IO. 
Just wait for it to finish and 3421 * then do it all again 3422 */ 3423 if (!list_empty(&cache->io_list)) { 3424 spin_unlock(&cur_trans->dirty_bgs_lock); 3425 list_del_init(&cache->io_list); 3426 btrfs_wait_cache_io(trans, cache, path); 3427 btrfs_put_block_group(cache); 3428 spin_lock(&cur_trans->dirty_bgs_lock); 3429 } 3430 3431 /* 3432 * Don't remove from the dirty list until after we've waited on 3433 * any pending IO 3434 */ 3435 list_del_init(&cache->dirty_list); 3436 spin_unlock(&cur_trans->dirty_bgs_lock); 3437 should_put = 1; 3438 3439 cache_save_setup(cache, trans, path); 3440 3441 if (!ret) 3442 ret = btrfs_run_delayed_refs(trans, 3443 (unsigned long) -1); 3444 3445 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3446 cache->io_ctl.inode = NULL; 3447 ret = btrfs_write_out_cache(trans, cache, path); 3448 if (ret == 0 && cache->io_ctl.inode) { 3449 should_put = 0; 3450 list_add_tail(&cache->io_list, io); 3451 } else { 3452 /* 3453 * If we failed to write the cache, the 3454 * generation will be bad and life goes on 3455 */ 3456 ret = 0; 3457 } 3458 } 3459 if (!ret) { 3460 ret = update_block_group_item(trans, path, cache); 3461 /* 3462 * One of the free space endio workers might have 3463 * created a new block group while updating a free space 3464 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3465 * and hasn't released its transaction handle yet, in 3466 * which case the new block group is still attached to 3467 * its transaction handle and its creation has not 3468 * finished yet (no block group item in the extent tree 3469 * yet, etc). If this is the case, wait for all free 3470 * space endio workers to finish and retry. This is a 3471 * very rare case so no need for a more efficient and 3472 * complex approach. 3473 */ 3474 if (ret == -ENOENT) { 3475 wait_event(cur_trans->writer_wait, 3476 atomic_read(&cur_trans->num_writers) == 1); 3477 ret = update_block_group_item(trans, path, cache); 3478 } 3479 if (ret) 3480 btrfs_abort_transaction(trans, ret); 3481 } 3482 3483 /* If its not on the io list, we need to put the block group */ 3484 if (should_put) 3485 btrfs_put_block_group(cache); 3486 btrfs_delayed_refs_rsv_release(fs_info, 1); 3487 spin_lock(&cur_trans->dirty_bgs_lock); 3488 } 3489 spin_unlock(&cur_trans->dirty_bgs_lock); 3490 3491 /* 3492 * Refer to the definition of io_bgs member for details why it's safe 3493 * to use it without any locking 3494 */ 3495 while (!list_empty(io)) { 3496 cache = list_first_entry(io, struct btrfs_block_group, 3497 io_list); 3498 list_del_init(&cache->io_list); 3499 btrfs_wait_cache_io(trans, cache, path); 3500 btrfs_put_block_group(cache); 3501 } 3502 3503 btrfs_free_path(path); 3504 return ret; 3505 } 3506 3507 int btrfs_update_block_group(struct btrfs_trans_handle *trans, 3508 u64 bytenr, u64 num_bytes, bool alloc) 3509 { 3510 struct btrfs_fs_info *info = trans->fs_info; 3511 struct btrfs_block_group *cache = NULL; 3512 u64 total = num_bytes; 3513 u64 old_val; 3514 u64 byte_in_group; 3515 int factor; 3516 int ret = 0; 3517 3518 /* Block accounting for super block */ 3519 spin_lock(&info->delalloc_root_lock); 3520 old_val = btrfs_super_bytes_used(info->super_copy); 3521 if (alloc) 3522 old_val += num_bytes; 3523 else 3524 old_val -= num_bytes; 3525 btrfs_set_super_bytes_used(info->super_copy, old_val); 3526 spin_unlock(&info->delalloc_root_lock); 3527 3528 while (total) { 3529 struct btrfs_space_info *space_info; 3530 bool reclaim = false; 3531 3532 cache = btrfs_lookup_block_group(info, bytenr); 3533 if (!cache) { 3534 
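/* No block group owns this byte range; bail out of the loop with -ENOENT. */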
ret = -ENOENT; 3535 break; 3536 } 3537 space_info = cache->space_info; 3538 factor = btrfs_bg_type_to_factor(cache->flags); 3539 3540 /* 3541 * If this block group has free space cache written out, we 3542 * need to make sure to load it if we are removing space. This 3543 * is because we need the unpinning stage to actually add the 3544 * space back to the block group, otherwise we will leak space. 3545 */ 3546 if (!alloc && !btrfs_block_group_done(cache)) 3547 btrfs_cache_block_group(cache, true); 3548 3549 byte_in_group = bytenr - cache->start; 3550 WARN_ON(byte_in_group > cache->length); 3551 3552 spin_lock(&space_info->lock); 3553 spin_lock(&cache->lock); 3554 3555 if (btrfs_test_opt(info, SPACE_CACHE) && 3556 cache->disk_cache_state < BTRFS_DC_CLEAR) 3557 cache->disk_cache_state = BTRFS_DC_CLEAR; 3558 3559 old_val = cache->used; 3560 num_bytes = min(total, cache->length - byte_in_group); 3561 if (alloc) { 3562 old_val += num_bytes; 3563 cache->used = old_val; 3564 cache->reserved -= num_bytes; 3565 space_info->bytes_reserved -= num_bytes; 3566 space_info->bytes_used += num_bytes; 3567 space_info->disk_used += num_bytes * factor; 3568 spin_unlock(&cache->lock); 3569 spin_unlock(&space_info->lock); 3570 } else { 3571 old_val -= num_bytes; 3572 cache->used = old_val; 3573 cache->pinned += num_bytes; 3574 btrfs_space_info_update_bytes_pinned(info, space_info, 3575 num_bytes); 3576 space_info->bytes_used -= num_bytes; 3577 space_info->disk_used -= num_bytes * factor; 3578 3579 reclaim = should_reclaim_block_group(cache, num_bytes); 3580 3581 spin_unlock(&cache->lock); 3582 spin_unlock(&space_info->lock); 3583 3584 set_extent_bit(&trans->transaction->pinned_extents, 3585 bytenr, bytenr + num_bytes - 1, 3586 EXTENT_DIRTY, NULL); 3587 } 3588 3589 spin_lock(&trans->transaction->dirty_bgs_lock); 3590 if (list_empty(&cache->dirty_list)) { 3591 list_add_tail(&cache->dirty_list, 3592 &trans->transaction->dirty_bgs); 3593 trans->delayed_ref_updates++; 3594 btrfs_get_block_group(cache); 3595 } 3596 spin_unlock(&trans->transaction->dirty_bgs_lock); 3597 3598 /* 3599 * No longer have used bytes in this block group, queue it for 3600 * deletion. We do this after adding the block group to the 3601 * dirty list to avoid races between cleaner kthread and space 3602 * cache writeout. 3603 */ 3604 if (!alloc && old_val == 0) { 3605 if (!btrfs_test_opt(info, DISCARD_ASYNC)) 3606 btrfs_mark_bg_unused(cache); 3607 } else if (!alloc && reclaim) { 3608 btrfs_mark_bg_to_reclaim(cache); 3609 } 3610 3611 btrfs_put_block_group(cache); 3612 total -= num_bytes; 3613 bytenr += num_bytes; 3614 } 3615 3616 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 3617 btrfs_update_delayed_refs_rsv(trans); 3618 return ret; 3619 } 3620 3621 /* 3622 * Update the block_group and space info counters. 3623 * 3624 * @cache: The cache we are manipulating 3625 * @ram_bytes: The number of bytes of file content, and will be same to 3626 * @num_bytes except for the compress path. 3627 * @num_bytes: The number of bytes in question 3628 * @delalloc: The blocks are allocated for the delalloc write 3629 * 3630 * This is called by the allocator when it reserves space. If this is a 3631 * reservation and the block group has become read only we cannot make the 3632 * reservation and return -EAGAIN, otherwise this function always succeeds. 
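 *
 * Space reserved here that ends up not being used on disk is returned with
 * btrfs_free_reserved_bytes() below.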
3633 */ 3634 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, 3635 u64 ram_bytes, u64 num_bytes, int delalloc, 3636 bool force_wrong_size_class) 3637 { 3638 struct btrfs_space_info *space_info = cache->space_info; 3639 enum btrfs_block_group_size_class size_class; 3640 int ret = 0; 3641 3642 spin_lock(&space_info->lock); 3643 spin_lock(&cache->lock); 3644 if (cache->ro) { 3645 ret = -EAGAIN; 3646 goto out; 3647 } 3648 3649 if (btrfs_block_group_should_use_size_class(cache)) { 3650 size_class = btrfs_calc_block_group_size_class(num_bytes); 3651 ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); 3652 if (ret) 3653 goto out; 3654 } 3655 cache->reserved += num_bytes; 3656 space_info->bytes_reserved += num_bytes; 3657 trace_btrfs_space_reservation(cache->fs_info, "space_info", 3658 space_info->flags, num_bytes, 1); 3659 btrfs_space_info_update_bytes_may_use(cache->fs_info, 3660 space_info, -ram_bytes); 3661 if (delalloc) 3662 cache->delalloc_bytes += num_bytes; 3663 3664 /* 3665 * Compression can use less space than we reserved, so wake tickets if 3666 * that happens. 3667 */ 3668 if (num_bytes < ram_bytes) 3669 btrfs_try_granting_tickets(cache->fs_info, space_info); 3670 out: 3671 spin_unlock(&cache->lock); 3672 spin_unlock(&space_info->lock); 3673 return ret; 3674 } 3675 3676 /* 3677 * Update the block_group and space info counters. 3678 * 3679 * @cache: The cache we are manipulating 3680 * @num_bytes: The number of bytes in question 3681 * @delalloc: The blocks are allocated for the delalloc write 3682 * 3683 * This is called by somebody who is freeing space that was never actually used 3684 * on disk. For example if you reserve some space for a new leaf in transaction 3685 * A and before transaction A commits you free that leaf, you call this with 3686 * reserve set to 0 in order to clear the reservation. 3687 */ 3688 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 3689 u64 num_bytes, int delalloc) 3690 { 3691 struct btrfs_space_info *space_info = cache->space_info; 3692 3693 spin_lock(&space_info->lock); 3694 spin_lock(&cache->lock); 3695 if (cache->ro) 3696 space_info->bytes_readonly += num_bytes; 3697 cache->reserved -= num_bytes; 3698 space_info->bytes_reserved -= num_bytes; 3699 space_info->max_extent_size = 0; 3700 3701 if (delalloc) 3702 cache->delalloc_bytes -= num_bytes; 3703 spin_unlock(&cache->lock); 3704 3705 btrfs_try_granting_tickets(cache->fs_info, space_info); 3706 spin_unlock(&space_info->lock); 3707 } 3708 3709 static void force_metadata_allocation(struct btrfs_fs_info *info) 3710 { 3711 struct list_head *head = &info->space_info; 3712 struct btrfs_space_info *found; 3713 3714 list_for_each_entry(found, head, list) { 3715 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3716 found->force_alloc = CHUNK_ALLOC_FORCE; 3717 } 3718 } 3719 3720 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 3721 struct btrfs_space_info *sinfo, int force) 3722 { 3723 u64 bytes_used = btrfs_space_info_used(sinfo, false); 3724 u64 thresh; 3725 3726 if (force == CHUNK_ALLOC_FORCE) 3727 return 1; 3728 3729 /* 3730 * in limited mode, we want to have some free space up to 3731 * about 1% of the FS size. 
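	 * For example, on a hypothetical 1 TiB filesystem the limited-mode
	 * threshold is max(64M, 1% of 1 TiB), i.e. roughly 10 GiB, so a new
	 * chunk is allocated once the free space left in this space_info
	 * drops below that amount.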
	 */
	if (force == CHUNK_ALLOC_LIMITED) {
		thresh = btrfs_super_total_bytes(fs_info->super_copy);
		thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));

		if (sinfo->total_bytes - bytes_used < thresh)
			return 1;
	}

	if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
		return 0;
	return 1;
}

int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);

	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}

static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
	struct btrfs_block_group *bg;
	int ret;

	/*
	 * Check if we have enough space in the system space info because we
	 * will need to update device items in the chunk btree and insert a new
	 * chunk item in the chunk btree as well. This will allocate a new
	 * system block group if needed.
	 */
	check_system_chunk(trans, flags);

	bg = btrfs_create_chunk(trans, flags);
	if (IS_ERR(bg)) {
		ret = PTR_ERR(bg);
		goto out;
	}

	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
	/*
	 * Normally we are not expected to fail with -ENOSPC here, since we have
	 * previously reserved space in the system space_info and allocated one
	 * new system chunk if necessary. However there are three exceptions:
	 *
	 * 1) We may have enough free space in the system space_info but all the
	 *    existing system block groups have a profile which can not be used
	 *    for extent allocation.
	 *
	 *    This happens when mounting in degraded mode. For example we have a
	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
	 *    using the other device in degraded mode. If we then allocate a chunk,
	 *    we may have enough free space in the existing system space_info, but
	 *    none of the block groups can be used for extent allocation since they
	 *    have a RAID1 profile, and because we are in degraded mode with a
	 *    single device, we are forced to allocate a new system chunk with a
	 *    SINGLE profile. Making check_system_chunk() iterate over all system
	 *    block groups and check if they have a usable profile and enough space
	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
	 *    try again after forcing allocation of a new system chunk. Like this
	 *    we avoid paying the cost of that search in normal circumstances, when
	 *    we were not mounted in degraded mode;
	 *
	 * 2) We had enough free space in the system space_info, and one suitable
	 *    block group to allocate from when we called check_system_chunk()
	 *    above. However right after we called it, the only system block group
	 *    with enough free space got turned into RO mode by a running scrub,
	 *    and in this case we have to allocate a new one and retry.
	 *    We only need to do this allocation and retry once, since we have a
	 *    transaction handle and scrub uses the commit root to search for
	 *    block groups;
	 *
	 * 3) We had one system block group with enough free space when we called
	 *    check_system_chunk(), but after that, right before we tried to
	 *    allocate the last extent buffer we needed, a discard operation came
	 *    in and it temporarily removed the last free space entry from the
	 *    block group (discard removes a free space entry, discards it, and
	 *    then adds back the entry to the block group cache).
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_create_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
out:
	btrfs_trans_release_chunk_metadata(trans);

	if (ret)
		return ERR_PTR(ret);

	btrfs_get_block_group(bg);
	return bg;
}

/*
 * Chunk allocation is done in 2 phases:
 *
 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
 *    the chunk, the chunk mapping, create its block group and add the items
 *    that belong in the chunk btree to it - more specifically, we need to
 *    update device items in the chunk btree and add a new chunk item to it.
 *
 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
 *    group item to the extent btree and the device extent items to the devices
 *    btree.
 *
 * This is done to prevent deadlocks. For example when COWing a node from the
 * extent btree we are holding a write lock on the node's parent and if we
 * trigger chunk allocation and attempt to insert the new block group item
 * in the extent btree right away, we could deadlock because the path for the
 * insertion can include that parent node. At first glance it seems impossible
 * to trigger chunk allocation after starting a transaction since tasks should
 * reserve enough transaction units (metadata space), however while that is true
 * most of the time, chunk allocation may still be triggered for several reasons:
 *
 * 1) When reserving metadata, we check if there is enough free space in the
 *    metadata space_info and therefore don't trigger allocation of a new chunk.
 *    However later when the task actually tries to COW an extent buffer from
 *    the extent btree or from the device btree for example, it is forced to
 *    allocate a new block group (chunk) because the only one that had enough
 *    free space was just turned to RO mode by a running scrub for example (or
 *    device replace, block group reclaim thread, etc), so we can not use it
 *    for allocating an extent and end up being forced to allocate a new one;
 *
 * 2) Because we only check that the metadata space_info has enough free bytes,
 *    we end up not allocating a new metadata chunk in that case.
 *    However, if the filesystem was mounted in degraded mode, none of the
 *    existing block groups might be suitable for extent allocation due to
 *    their incompatible profile (e.g. mounting a 2 device filesystem, where
 *    all block groups use a RAID1 profile, in degraded mode using a single
 *    device). In this case when the task attempts to COW some extent buffer
 *    of the extent btree for example, it will trigger allocation of a new
 *    metadata block group with a suitable profile (SINGLE profile in the
 *    example of the degraded mount of the RAID1 filesystem);
 *
 * 3) The task has reserved enough transaction units / metadata space, but when
 *    it attempts to COW an extent buffer from the extent or device btree for
 *    example, it does not find any free extent in any metadata block group and
 *    is therefore forced to try to allocate a new metadata block group.
 *    This is because some other task allocated all the available extents in the
 *    meantime - this typically happens with tasks that don't reserve space
 *    properly, either intentionally or as a bug. One example where this is
 *    done intentionally is fsync, as it does not reserve any transaction units
 *    and ends up allocating a variable number of metadata extents for log
 *    tree extent buffers;
 *
 * 4) The task has reserved enough transaction units / metadata space, but right
 *    before it tries to allocate the last extent buffer it needs, a discard
 *    operation comes in and, temporarily, removes the last free space entry from
 *    the only metadata block group that had free space (discard starts by
 *    removing a free space entry from a block group, then does the discard
 *    operation and, once it's done, it adds back the free space entry to the
 *    block group).
 *
 * We also need this two-phase setup when adding a device to a filesystem with
 * a seed device - we must create new metadata and system chunks without adding
 * any of the block group items to the chunk, extent and device btrees. If we
 * did not do it this way, we would get ENOSPC when attempting to update those
 * btrees, since all the chunks from the seed device are read-only.
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we had
 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
 * parallel, we risk having too many system chunks allocated by many tasks if
 * many tasks reach phase 1 without the previous ones completing phase 2. In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * Allocation of system chunks does not happen through this function.
 * A task that needs to update the chunk btree (the only btree that uses system
 * chunks), must preallocate chunk space by calling either check_system_chunk()
 * or btrfs_reserve_chunk_metadata() - the former is used when allocating a
 * data or metadata chunk or when removing a chunk, while the latter is used
 * before doing a modification to the chunk btree - use cases for the latter are
 * adding, removing and resizing a device as well as relocation of a system
 * chunk. See the comment below for more details.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
 * an extent buffer from the chunks btree we never trigger allocation of a new
 * system chunk, which would result in a deadlock (trying to lock twice an
 * extent buffer of the chunk btree, first time before triggering the chunk
 * allocation and the second time during chunk allocation while attempting to
 * update the chunks btree). The system chunk array is also updated while holding
 * that mutex. The same logic applies to removing chunks - we must reserve system
 * space, update the chunk btree and the system chunk array in the superblock
 * while holding fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 */
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
		      enum btrfs_chunk_alloc_enum force)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_block_group *ret_bg;
	bool wait_for_alloc = false;
	bool should_alloc = false;
	bool from_extent_allocation = false;
	int ret = 0;

	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
		from_extent_allocation = true;
		force = CHUNK_ALLOC_FORCE;
	}

	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;
	/*
	 * Allocation of system chunks can not happen through this path, as we
	 * could end up in a deadlock if we are allocating a data or metadata
	 * chunk and there is another task modifying the chunk btree.
	 *
	 * This is because while we are holding the chunk mutex, we will attempt
	 * to add the new chunk item to the chunk btree or update an existing
	 * device item in the chunk btree, while the other task that is modifying
	 * the chunk btree is attempting to COW an extent buffer while holding a
	 * lock on it and on its parent - if the COW operation triggers a system
	 * chunk allocation, then we can deadlock because we are holding the
	 * chunk mutex and we may need to access that extent buffer or its parent
	 * in order to add the chunk item or update a device item.
	 *
	 * Tasks that want to modify the chunk tree should reserve system space
	 * before updating the chunk btree, by calling either
	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
3990 * It's possible that after a task reserves the space, it still ends up 3991 * here - this happens in the cases described above at do_chunk_alloc(). 3992 * The task will have to either retry or fail. 3993 */ 3994 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3995 return -ENOSPC; 3996 3997 space_info = btrfs_find_space_info(fs_info, flags); 3998 ASSERT(space_info); 3999 4000 do { 4001 spin_lock(&space_info->lock); 4002 if (force < space_info->force_alloc) 4003 force = space_info->force_alloc; 4004 should_alloc = should_alloc_chunk(fs_info, space_info, force); 4005 if (space_info->full) { 4006 /* No more free physical space */ 4007 if (should_alloc) 4008 ret = -ENOSPC; 4009 else 4010 ret = 0; 4011 spin_unlock(&space_info->lock); 4012 return ret; 4013 } else if (!should_alloc) { 4014 spin_unlock(&space_info->lock); 4015 return 0; 4016 } else if (space_info->chunk_alloc) { 4017 /* 4018 * Someone is already allocating, so we need to block 4019 * until this someone is finished and then loop to 4020 * recheck if we should continue with our allocation 4021 * attempt. 4022 */ 4023 wait_for_alloc = true; 4024 force = CHUNK_ALLOC_NO_FORCE; 4025 spin_unlock(&space_info->lock); 4026 mutex_lock(&fs_info->chunk_mutex); 4027 mutex_unlock(&fs_info->chunk_mutex); 4028 } else { 4029 /* Proceed with allocation */ 4030 space_info->chunk_alloc = 1; 4031 wait_for_alloc = false; 4032 spin_unlock(&space_info->lock); 4033 } 4034 4035 cond_resched(); 4036 } while (wait_for_alloc); 4037 4038 mutex_lock(&fs_info->chunk_mutex); 4039 trans->allocating_chunk = true; 4040 4041 /* 4042 * If we have mixed data/metadata chunks we want to make sure we keep 4043 * allocating mixed chunks instead of individual chunks. 4044 */ 4045 if (btrfs_mixed_space_info(space_info)) 4046 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4047 4048 /* 4049 * if we're doing a data chunk, go ahead and make sure that 4050 * we keep a reasonable number of metadata chunks allocated in the 4051 * FS as well. 4052 */ 4053 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4054 fs_info->data_chunk_allocations++; 4055 if (!(fs_info->data_chunk_allocations % 4056 fs_info->metadata_ratio)) 4057 force_metadata_allocation(fs_info); 4058 } 4059 4060 ret_bg = do_chunk_alloc(trans, flags); 4061 trans->allocating_chunk = false; 4062 4063 if (IS_ERR(ret_bg)) { 4064 ret = PTR_ERR(ret_bg); 4065 } else if (from_extent_allocation) { 4066 /* 4067 * New block group is likely to be used soon. Try to activate 4068 * it now. Failure is OK for now. 
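		 * (Activation is only meaningful for zoned filesystems; on a
		 * regular filesystem btrfs_zone_activate() is expected to be
		 * a no-op that reports success.)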
4069 */ 4070 btrfs_zone_activate(ret_bg); 4071 } 4072 4073 if (!ret) 4074 btrfs_put_block_group(ret_bg); 4075 4076 spin_lock(&space_info->lock); 4077 if (ret < 0) { 4078 if (ret == -ENOSPC) 4079 space_info->full = 1; 4080 else 4081 goto out; 4082 } else { 4083 ret = 1; 4084 space_info->max_extent_size = 0; 4085 } 4086 4087 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4088 out: 4089 space_info->chunk_alloc = 0; 4090 spin_unlock(&space_info->lock); 4091 mutex_unlock(&fs_info->chunk_mutex); 4092 4093 return ret; 4094 } 4095 4096 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4097 { 4098 u64 num_dev; 4099 4100 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 4101 if (!num_dev) 4102 num_dev = fs_info->fs_devices->rw_devices; 4103 4104 return num_dev; 4105 } 4106 4107 static void reserve_chunk_space(struct btrfs_trans_handle *trans, 4108 u64 bytes, 4109 u64 type) 4110 { 4111 struct btrfs_fs_info *fs_info = trans->fs_info; 4112 struct btrfs_space_info *info; 4113 u64 left; 4114 int ret = 0; 4115 4116 /* 4117 * Needed because we can end up allocating a system chunk and for an 4118 * atomic and race free space reservation in the chunk block reserve. 4119 */ 4120 lockdep_assert_held(&fs_info->chunk_mutex); 4121 4122 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4123 spin_lock(&info->lock); 4124 left = info->total_bytes - btrfs_space_info_used(info, true); 4125 spin_unlock(&info->lock); 4126 4127 if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4128 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4129 left, bytes, type); 4130 btrfs_dump_space_info(fs_info, info, 0, 0); 4131 } 4132 4133 if (left < bytes) { 4134 u64 flags = btrfs_system_alloc_profile(fs_info); 4135 struct btrfs_block_group *bg; 4136 4137 /* 4138 * Ignore failure to create system chunk. We might end up not 4139 * needing it, as we might not need to COW all nodes/leafs from 4140 * the paths we visit in the chunk tree (they were already COWed 4141 * or created in the current transaction for example). 4142 */ 4143 bg = btrfs_create_chunk(trans, flags); 4144 if (IS_ERR(bg)) { 4145 ret = PTR_ERR(bg); 4146 } else { 4147 /* 4148 * We have a new chunk. We also need to activate it for 4149 * zoned filesystem. 4150 */ 4151 ret = btrfs_zoned_activate_one_bg(fs_info, info, true); 4152 if (ret < 0) 4153 return; 4154 4155 /* 4156 * If we fail to add the chunk item here, we end up 4157 * trying again at phase 2 of chunk allocation, at 4158 * btrfs_create_pending_block_groups(). So ignore 4159 * any error here. An ENOSPC here could happen, due to 4160 * the cases described at do_chunk_alloc() - the system 4161 * block group we just created was just turned into RO 4162 * mode by a scrub for example, or a running discard 4163 * temporarily removed its free space entries, etc. 4164 */ 4165 btrfs_chunk_alloc_add_chunk_item(trans, bg); 4166 } 4167 } 4168 4169 if (!ret) { 4170 ret = btrfs_block_rsv_add(fs_info, 4171 &fs_info->chunk_block_rsv, 4172 bytes, BTRFS_RESERVE_NO_FLUSH); 4173 if (!ret) 4174 trans->chunk_bytes_reserved += bytes; 4175 } 4176 } 4177 4178 /* 4179 * Reserve space in the system space for allocating or removing a chunk. 4180 * The caller must be holding fs_info->chunk_mutex. 
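 *
 * The amount reserved is a worst case estimate: metadata space for updating
 * one device item per device that the chunk profile may span (see
 * get_profile_num_devs()) plus space for inserting or deleting a single chunk
 * item, mirroring the calculation done in check_system_chunk() below.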
4181 */ 4182 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 4183 { 4184 struct btrfs_fs_info *fs_info = trans->fs_info; 4185 const u64 num_devs = get_profile_num_devs(fs_info, type); 4186 u64 bytes; 4187 4188 /* num_devs device items to update and 1 chunk item to add or remove. */ 4189 bytes = btrfs_calc_metadata_size(fs_info, num_devs) + 4190 btrfs_calc_insert_metadata_size(fs_info, 1); 4191 4192 reserve_chunk_space(trans, bytes, type); 4193 } 4194 4195 /* 4196 * Reserve space in the system space, if needed, for doing a modification to the 4197 * chunk btree. 4198 * 4199 * @trans: A transaction handle. 4200 * @is_item_insertion: Indicate if the modification is for inserting a new item 4201 * in the chunk btree or if it's for the deletion or update 4202 * of an existing item. 4203 * 4204 * This is used in a context where we need to update the chunk btree outside 4205 * block group allocation and removal, to avoid a deadlock with a concurrent 4206 * task that is allocating a metadata or data block group and therefore needs to 4207 * update the chunk btree while holding the chunk mutex. After the update to the 4208 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. 4209 * 4210 */ 4211 void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, 4212 bool is_item_insertion) 4213 { 4214 struct btrfs_fs_info *fs_info = trans->fs_info; 4215 u64 bytes; 4216 4217 if (is_item_insertion) 4218 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 4219 else 4220 bytes = btrfs_calc_metadata_size(fs_info, 1); 4221 4222 mutex_lock(&fs_info->chunk_mutex); 4223 reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); 4224 mutex_unlock(&fs_info->chunk_mutex); 4225 } 4226 4227 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 4228 { 4229 struct btrfs_block_group *block_group; 4230 4231 block_group = btrfs_lookup_first_block_group(info, 0); 4232 while (block_group) { 4233 btrfs_wait_block_group_cache_done(block_group); 4234 spin_lock(&block_group->lock); 4235 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, 4236 &block_group->runtime_flags)) { 4237 struct inode *inode = block_group->inode; 4238 4239 block_group->inode = NULL; 4240 spin_unlock(&block_group->lock); 4241 4242 ASSERT(block_group->io_ctl.inode == NULL); 4243 iput(inode); 4244 } else { 4245 spin_unlock(&block_group->lock); 4246 } 4247 block_group = btrfs_next_block_group(block_group); 4248 } 4249 } 4250 4251 /* 4252 * Must be called only after stopping all workers, since we could have block 4253 * group caching kthreads running, and therefore they could race with us if we 4254 * freed the block groups before stopping them. 
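 *
 * This tears down, in order: the pending caching controls, the unused, reclaim
 * and zone-active block group lists, every block group left in the rb-tree
 * cache, and finally the per-type space_info structures, warning if any of
 * them still has pinned, reserved or may_use bytes accounted.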
4255 */ 4256 int btrfs_free_block_groups(struct btrfs_fs_info *info) 4257 { 4258 struct btrfs_block_group *block_group; 4259 struct btrfs_space_info *space_info; 4260 struct btrfs_caching_control *caching_ctl; 4261 struct rb_node *n; 4262 4263 write_lock(&info->block_group_cache_lock); 4264 while (!list_empty(&info->caching_block_groups)) { 4265 caching_ctl = list_entry(info->caching_block_groups.next, 4266 struct btrfs_caching_control, list); 4267 list_del(&caching_ctl->list); 4268 btrfs_put_caching_control(caching_ctl); 4269 } 4270 write_unlock(&info->block_group_cache_lock); 4271 4272 spin_lock(&info->unused_bgs_lock); 4273 while (!list_empty(&info->unused_bgs)) { 4274 block_group = list_first_entry(&info->unused_bgs, 4275 struct btrfs_block_group, 4276 bg_list); 4277 list_del_init(&block_group->bg_list); 4278 btrfs_put_block_group(block_group); 4279 } 4280 4281 while (!list_empty(&info->reclaim_bgs)) { 4282 block_group = list_first_entry(&info->reclaim_bgs, 4283 struct btrfs_block_group, 4284 bg_list); 4285 list_del_init(&block_group->bg_list); 4286 btrfs_put_block_group(block_group); 4287 } 4288 spin_unlock(&info->unused_bgs_lock); 4289 4290 spin_lock(&info->zone_active_bgs_lock); 4291 while (!list_empty(&info->zone_active_bgs)) { 4292 block_group = list_first_entry(&info->zone_active_bgs, 4293 struct btrfs_block_group, 4294 active_bg_list); 4295 list_del_init(&block_group->active_bg_list); 4296 btrfs_put_block_group(block_group); 4297 } 4298 spin_unlock(&info->zone_active_bgs_lock); 4299 4300 write_lock(&info->block_group_cache_lock); 4301 while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { 4302 block_group = rb_entry(n, struct btrfs_block_group, 4303 cache_node); 4304 rb_erase_cached(&block_group->cache_node, 4305 &info->block_group_cache_tree); 4306 RB_CLEAR_NODE(&block_group->cache_node); 4307 write_unlock(&info->block_group_cache_lock); 4308 4309 down_write(&block_group->space_info->groups_sem); 4310 list_del(&block_group->list); 4311 up_write(&block_group->space_info->groups_sem); 4312 4313 /* 4314 * We haven't cached this block group, which means we could 4315 * possibly have excluded extents on this block group. 4316 */ 4317 if (block_group->cached == BTRFS_CACHE_NO || 4318 block_group->cached == BTRFS_CACHE_ERROR) 4319 btrfs_free_excluded_extents(block_group); 4320 4321 btrfs_remove_free_space_cache(block_group); 4322 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 4323 ASSERT(list_empty(&block_group->dirty_list)); 4324 ASSERT(list_empty(&block_group->io_list)); 4325 ASSERT(list_empty(&block_group->bg_list)); 4326 ASSERT(refcount_read(&block_group->refs) == 1); 4327 ASSERT(block_group->swap_extents == 0); 4328 btrfs_put_block_group(block_group); 4329 4330 write_lock(&info->block_group_cache_lock); 4331 } 4332 write_unlock(&info->block_group_cache_lock); 4333 4334 btrfs_release_global_block_rsv(info); 4335 4336 while (!list_empty(&info->space_info)) { 4337 space_info = list_entry(info->space_info.next, 4338 struct btrfs_space_info, 4339 list); 4340 4341 /* 4342 * Do not hide this behind enospc_debug, this is actually 4343 * important and indicates a real bug if this happens. 
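		 * Leftover bytes_pinned or bytes_may_use at unmount time means
		 * an extent was never unpinned or a reservation was never
		 * released, i.e. an accounting leak earlier in the code.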
4344 */ 4345 if (WARN_ON(space_info->bytes_pinned > 0 || 4346 space_info->bytes_may_use > 0)) 4347 btrfs_dump_space_info(info, space_info, 0, 0); 4348 4349 /* 4350 * If there was a failure to cleanup a log tree, very likely due 4351 * to an IO failure on a writeback attempt of one or more of its 4352 * extent buffers, we could not do proper (and cheap) unaccounting 4353 * of their reserved space, so don't warn on bytes_reserved > 0 in 4354 * that case. 4355 */ 4356 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || 4357 !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { 4358 if (WARN_ON(space_info->bytes_reserved > 0)) 4359 btrfs_dump_space_info(info, space_info, 0, 0); 4360 } 4361 4362 WARN_ON(space_info->reclaim_size > 0); 4363 list_del(&space_info->list); 4364 btrfs_sysfs_remove_space_info(space_info); 4365 } 4366 return 0; 4367 } 4368 4369 void btrfs_freeze_block_group(struct btrfs_block_group *cache) 4370 { 4371 atomic_inc(&cache->frozen); 4372 } 4373 4374 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) 4375 { 4376 struct btrfs_fs_info *fs_info = block_group->fs_info; 4377 struct extent_map_tree *em_tree; 4378 struct extent_map *em; 4379 bool cleanup; 4380 4381 spin_lock(&block_group->lock); 4382 cleanup = (atomic_dec_and_test(&block_group->frozen) && 4383 test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); 4384 spin_unlock(&block_group->lock); 4385 4386 if (cleanup) { 4387 em_tree = &fs_info->mapping_tree; 4388 write_lock(&em_tree->lock); 4389 em = lookup_extent_mapping(em_tree, block_group->start, 4390 1); 4391 BUG_ON(!em); /* logic error, can't happen */ 4392 remove_extent_mapping(em_tree, em); 4393 write_unlock(&em_tree->lock); 4394 4395 /* once for us and once for the tree */ 4396 free_extent_map(em); 4397 free_extent_map(em); 4398 4399 /* 4400 * We may have left one free space entry and other possible 4401 * tasks trimming this block group have left 1 entry each one. 4402 * Free them if any. 4403 */ 4404 btrfs_remove_free_space_cache(block_group); 4405 } 4406 } 4407 4408 bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) 4409 { 4410 bool ret = true; 4411 4412 spin_lock(&bg->lock); 4413 if (bg->ro) 4414 ret = false; 4415 else 4416 bg->swap_extents++; 4417 spin_unlock(&bg->lock); 4418 4419 return ret; 4420 } 4421 4422 void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) 4423 { 4424 spin_lock(&bg->lock); 4425 ASSERT(!bg->ro); 4426 ASSERT(bg->swap_extents >= amount); 4427 bg->swap_extents -= amount; 4428 spin_unlock(&bg->lock); 4429 } 4430 4431 enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) 4432 { 4433 if (size <= SZ_128K) 4434 return BTRFS_BG_SZ_SMALL; 4435 if (size <= SZ_8M) 4436 return BTRFS_BG_SZ_MEDIUM; 4437 return BTRFS_BG_SZ_LARGE; 4438 } 4439 4440 /* 4441 * Handle a block group allocating an extent in a size class 4442 * 4443 * @bg: The block group we allocated in. 4444 * @size_class: The size class of the allocation. 4445 * @force_wrong_size_class: Whether we are desperate enough to allow 4446 * mismatched size classes. 4447 * 4448 * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the 4449 * case of a race that leads to the wrong size class without 4450 * force_wrong_size_class set. 4451 * 4452 * find_free_extent will skip block groups with a mismatched size class until 4453 * it really needs to avoid ENOSPC. In that case it will set 4454 * force_wrong_size_class. 
 * However, if a block group is newly allocated and doesn't yet have a size
 * class, then it is possible for two allocations of different sizes to race
 * and both try to use it. The loser is caught here and has to retry.
 */
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
				     enum btrfs_block_group_size_class size_class,
				     bool force_wrong_size_class)
{
	ASSERT(size_class != BTRFS_BG_SZ_NONE);

	/* The new allocation is in the right size class, do nothing */
	if (bg->size_class == size_class)
		return 0;
	/*
	 * The new allocation is in a mismatched size class.
	 * This means one of two things:
	 *
	 * 1. Two tasks in find_free_extent for different size_classes raced
	 *    and hit the same empty block_group. Make the loser try again.
	 * 2. A call to find_free_extent got desperate enough to set
	 *    'force_wrong_size_class'. Don't change the size_class, but allow
	 *    the allocation.
	 */
	if (bg->size_class != BTRFS_BG_SZ_NONE) {
		if (force_wrong_size_class)
			return 0;
		return -EAGAIN;
	}
	/*
	 * The happy new block group case: the new allocation is the first
	 * one in the block_group so we set size_class.
	 */
	bg->size_class = size_class;

	return 0;
}

bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
{
	if (btrfs_is_zoned(bg->fs_info))
		return false;
	if (!btrfs_is_block_group_data_only(bg))
		return false;
	return true;
}