1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/sizes.h> 4 #include <linux/list_sort.h> 5 #include "misc.h" 6 #include "ctree.h" 7 #include "block-group.h" 8 #include "space-info.h" 9 #include "disk-io.h" 10 #include "free-space-cache.h" 11 #include "free-space-tree.h" 12 #include "volumes.h" 13 #include "transaction.h" 14 #include "ref-verify.h" 15 #include "sysfs.h" 16 #include "tree-log.h" 17 #include "delalloc-space.h" 18 #include "discard.h" 19 #include "raid56.h" 20 #include "zoned.h" 21 #include "fs.h" 22 #include "accessors.h" 23 #include "extent-tree.h" 24 25 #ifdef CONFIG_BTRFS_DEBUG 26 int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) 27 { 28 struct btrfs_fs_info *fs_info = block_group->fs_info; 29 30 return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && 31 block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || 32 (btrfs_test_opt(fs_info, FRAGMENT_DATA) && 33 block_group->flags & BTRFS_BLOCK_GROUP_DATA); 34 } 35 #endif 36 37 /* 38 * Return target flags in extended format or 0 if restripe for this chunk_type 39 * is not in progress 40 * 41 * Should be called with balance_lock held 42 */ 43 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 44 { 45 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 46 u64 target = 0; 47 48 if (!bctl) 49 return 0; 50 51 if (flags & BTRFS_BLOCK_GROUP_DATA && 52 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 53 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 54 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 55 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 56 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 57 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 58 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 59 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 60 } 61 62 return target; 63 } 64 65 /* 66 * @flags: available profiles in extended format (see ctree.h) 67 * 68 * Return reduced profile in chunk format. If profile changing is in progress 69 * (either running or paused) picks the target profile (if it's already 70 * available), otherwise falls back to plain reducing. 71 */ 72 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 73 { 74 u64 num_devices = fs_info->fs_devices->rw_devices; 75 u64 target; 76 u64 raid_type; 77 u64 allowed = 0; 78 79 /* 80 * See if restripe for this chunk_type is in progress, if so try to 81 * reduce to the target profile 82 */ 83 spin_lock(&fs_info->balance_lock); 84 target = get_restripe_target(fs_info, flags); 85 if (target) { 86 spin_unlock(&fs_info->balance_lock); 87 return extended_to_chunk(target); 88 } 89 spin_unlock(&fs_info->balance_lock); 90 91 /* First, mask out the RAID levels which aren't possible */ 92 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 93 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 94 allowed |= btrfs_raid_array[raid_type].bg_flag; 95 } 96 allowed &= flags; 97 98 /* Select the highest-redundancy RAID level. 
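 *
 * For example (illustrative values only): with two writable devices and
 * extended flags of BTRFS_BLOCK_GROUP_DATA plus the RAID0 and RAID1
 * profile bits, both profiles survive the devs_min mask above, RAID1 wins
 * the priority chain below, and the function returns
 * extended_to_chunk(BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1).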
*/ 99 if (allowed & BTRFS_BLOCK_GROUP_RAID1C4) 100 allowed = BTRFS_BLOCK_GROUP_RAID1C4; 101 else if (allowed & BTRFS_BLOCK_GROUP_RAID6) 102 allowed = BTRFS_BLOCK_GROUP_RAID6; 103 else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3) 104 allowed = BTRFS_BLOCK_GROUP_RAID1C3; 105 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 106 allowed = BTRFS_BLOCK_GROUP_RAID5; 107 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 108 allowed = BTRFS_BLOCK_GROUP_RAID10; 109 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 110 allowed = BTRFS_BLOCK_GROUP_RAID1; 111 else if (allowed & BTRFS_BLOCK_GROUP_DUP) 112 allowed = BTRFS_BLOCK_GROUP_DUP; 113 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 114 allowed = BTRFS_BLOCK_GROUP_RAID0; 115 116 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 117 118 return extended_to_chunk(flags | allowed); 119 } 120 121 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 122 { 123 unsigned seq; 124 u64 flags; 125 126 do { 127 flags = orig_flags; 128 seq = read_seqbegin(&fs_info->profiles_lock); 129 130 if (flags & BTRFS_BLOCK_GROUP_DATA) 131 flags |= fs_info->avail_data_alloc_bits; 132 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 133 flags |= fs_info->avail_system_alloc_bits; 134 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 135 flags |= fs_info->avail_metadata_alloc_bits; 136 } while (read_seqretry(&fs_info->profiles_lock, seq)); 137 138 return btrfs_reduce_alloc_profile(fs_info, flags); 139 } 140 141 void btrfs_get_block_group(struct btrfs_block_group *cache) 142 { 143 refcount_inc(&cache->refs); 144 } 145 146 void btrfs_put_block_group(struct btrfs_block_group *cache) 147 { 148 if (refcount_dec_and_test(&cache->refs)) { 149 WARN_ON(cache->pinned > 0); 150 /* 151 * If there was a failure to cleanup a log tree, very likely due 152 * to an IO failure on a writeback attempt of one or more of its 153 * extent buffers, we could not do proper (and cheap) unaccounting 154 * of their reserved space, so don't warn on reserved > 0 in that 155 * case. 156 */ 157 if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || 158 !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) 159 WARN_ON(cache->reserved > 0); 160 161 /* 162 * A block_group shouldn't be on the discard_list anymore. 163 * Remove the block_group from the discard_list to prevent us 164 * from causing a panic due to NULL pointer dereference. 
165 */ 166 if (WARN_ON(!list_empty(&cache->discard_list))) 167 btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, 168 cache); 169 170 kfree(cache->free_space_ctl); 171 kfree(cache->physical_map); 172 kfree(cache); 173 } 174 } 175 176 /* 177 * This adds the block group to the fs_info rb tree for the block group cache 178 */ 179 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 180 struct btrfs_block_group *block_group) 181 { 182 struct rb_node **p; 183 struct rb_node *parent = NULL; 184 struct btrfs_block_group *cache; 185 bool leftmost = true; 186 187 ASSERT(block_group->length != 0); 188 189 write_lock(&info->block_group_cache_lock); 190 p = &info->block_group_cache_tree.rb_root.rb_node; 191 192 while (*p) { 193 parent = *p; 194 cache = rb_entry(parent, struct btrfs_block_group, cache_node); 195 if (block_group->start < cache->start) { 196 p = &(*p)->rb_left; 197 } else if (block_group->start > cache->start) { 198 p = &(*p)->rb_right; 199 leftmost = false; 200 } else { 201 write_unlock(&info->block_group_cache_lock); 202 return -EEXIST; 203 } 204 } 205 206 rb_link_node(&block_group->cache_node, parent, p); 207 rb_insert_color_cached(&block_group->cache_node, 208 &info->block_group_cache_tree, leftmost); 209 210 write_unlock(&info->block_group_cache_lock); 211 212 return 0; 213 } 214 215 /* 216 * This will return the block group at or after bytenr if contains is 0, else 217 * it will return the block group that contains the bytenr 218 */ 219 static struct btrfs_block_group *block_group_cache_tree_search( 220 struct btrfs_fs_info *info, u64 bytenr, int contains) 221 { 222 struct btrfs_block_group *cache, *ret = NULL; 223 struct rb_node *n; 224 u64 end, start; 225 226 read_lock(&info->block_group_cache_lock); 227 n = info->block_group_cache_tree.rb_root.rb_node; 228 229 while (n) { 230 cache = rb_entry(n, struct btrfs_block_group, cache_node); 231 end = cache->start + cache->length - 1; 232 start = cache->start; 233 234 if (bytenr < start) { 235 if (!contains && (!ret || start < ret->start)) 236 ret = cache; 237 n = n->rb_left; 238 } else if (bytenr > start) { 239 if (contains && bytenr <= end) { 240 ret = cache; 241 break; 242 } 243 n = n->rb_right; 244 } else { 245 ret = cache; 246 break; 247 } 248 } 249 if (ret) 250 btrfs_get_block_group(ret); 251 read_unlock(&info->block_group_cache_lock); 252 253 return ret; 254 } 255 256 /* 257 * Return the block group that starts at or after bytenr 258 */ 259 struct btrfs_block_group *btrfs_lookup_first_block_group( 260 struct btrfs_fs_info *info, u64 bytenr) 261 { 262 return block_group_cache_tree_search(info, bytenr, 0); 263 } 264 265 /* 266 * Return the block group that contains the given bytenr 267 */ 268 struct btrfs_block_group *btrfs_lookup_block_group( 269 struct btrfs_fs_info *info, u64 bytenr) 270 { 271 return block_group_cache_tree_search(info, bytenr, 1); 272 } 273 274 struct btrfs_block_group *btrfs_next_block_group( 275 struct btrfs_block_group *cache) 276 { 277 struct btrfs_fs_info *fs_info = cache->fs_info; 278 struct rb_node *node; 279 280 read_lock(&fs_info->block_group_cache_lock); 281 282 /* If our block group was removed, we need a full search. 
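 *
 * A minimal caller sketch (illustrative only, not taken from this file):
 * walking every block group pairs the two lookup helpers, relying on
 * btrfs_next_block_group() dropping the reference passed in and taking a
 * new one on the group it returns:
 *
 *	for (cache = btrfs_lookup_first_block_group(fs_info, 0); cache;
 *	     cache = btrfs_next_block_group(cache)) {
 *		... use cache ...
 *	}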
*/ 283 if (RB_EMPTY_NODE(&cache->cache_node)) { 284 const u64 next_bytenr = cache->start + cache->length; 285 286 read_unlock(&fs_info->block_group_cache_lock); 287 btrfs_put_block_group(cache); 288 return btrfs_lookup_first_block_group(fs_info, next_bytenr); 289 } 290 node = rb_next(&cache->cache_node); 291 btrfs_put_block_group(cache); 292 if (node) { 293 cache = rb_entry(node, struct btrfs_block_group, cache_node); 294 btrfs_get_block_group(cache); 295 } else 296 cache = NULL; 297 read_unlock(&fs_info->block_group_cache_lock); 298 return cache; 299 } 300 301 /* 302 * Check if we can do a NOCOW write for a given extent. 303 * 304 * @fs_info: The filesystem information object. 305 * @bytenr: Logical start address of the extent. 306 * 307 * Check if we can do a NOCOW write for the given extent, and increment the 308 * number of NOCOW writers in the block group that contains the extent, as long 309 * as the block group exists and it's currently not in read-only mode. 310 * 311 * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller 312 * is responsible for calling btrfs_dec_nocow_writers() later. 313 * 314 * Or NULL if we cannot do a NOCOW write 315 */ 316 struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, 317 u64 bytenr) 318 { 319 struct btrfs_block_group *bg; 320 bool can_nocow = true; 321 322 bg = btrfs_lookup_block_group(fs_info, bytenr); 323 if (!bg) 324 return NULL; 325 326 spin_lock(&bg->lock); 327 if (bg->ro) 328 can_nocow = false; 329 else 330 atomic_inc(&bg->nocow_writers); 331 spin_unlock(&bg->lock); 332 333 if (!can_nocow) { 334 btrfs_put_block_group(bg); 335 return NULL; 336 } 337 338 /* No put on block group, done by btrfs_dec_nocow_writers(). */ 339 return bg; 340 } 341 342 /* 343 * Decrement the number of NOCOW writers in a block group. 344 * 345 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), 346 * and on the block group returned by that call. Typically this is called after 347 * creating an ordered extent for a NOCOW write, to prevent races with scrub and 348 * relocation. 349 * 350 * After this call, the caller should not use the block group anymore. If it wants 351 * to use it, then it should get a reference on it before calling this function. 352 */ 353 void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) 354 { 355 if (atomic_dec_and_test(&bg->nocow_writers)) 356 wake_up_var(&bg->nocow_writers); 357 358 /* For the lookup done by a previous call to btrfs_inc_nocow_writers().
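 *
 * A minimal pairing sketch (illustrative only, not a caller in this file):
 *
 *	bg = btrfs_inc_nocow_writers(fs_info, bytenr);
 *	if (bg) {
 *		... create the ordered extent for the NOCOW write ...
 *		btrfs_dec_nocow_writers(bg);
 *	}
 *
 * If NULL is returned the write must fall back to COW instead.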
*/ 359 btrfs_put_block_group(bg); 360 } 361 362 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg) 363 { 364 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 365 } 366 367 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 368 const u64 start) 369 { 370 struct btrfs_block_group *bg; 371 372 bg = btrfs_lookup_block_group(fs_info, start); 373 ASSERT(bg); 374 if (atomic_dec_and_test(&bg->reservations)) 375 wake_up_var(&bg->reservations); 376 btrfs_put_block_group(bg); 377 } 378 379 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) 380 { 381 struct btrfs_space_info *space_info = bg->space_info; 382 383 ASSERT(bg->ro); 384 385 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 386 return; 387 388 /* 389 * Our block group is read only but before we set it to read only, 390 * some task might have had allocated an extent from it already, but it 391 * has not yet created a respective ordered extent (and added it to a 392 * root's list of ordered extents). 393 * Therefore wait for any task currently allocating extents, since the 394 * block group's reservations counter is incremented while a read lock 395 * on the groups' semaphore is held and decremented after releasing 396 * the read access on that semaphore and creating the ordered extent. 397 */ 398 down_write(&space_info->groups_sem); 399 up_write(&space_info->groups_sem); 400 401 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); 402 } 403 404 struct btrfs_caching_control *btrfs_get_caching_control( 405 struct btrfs_block_group *cache) 406 { 407 struct btrfs_caching_control *ctl; 408 409 spin_lock(&cache->lock); 410 if (!cache->caching_ctl) { 411 spin_unlock(&cache->lock); 412 return NULL; 413 } 414 415 ctl = cache->caching_ctl; 416 refcount_inc(&ctl->count); 417 spin_unlock(&cache->lock); 418 return ctl; 419 } 420 421 void btrfs_put_caching_control(struct btrfs_caching_control *ctl) 422 { 423 if (refcount_dec_and_test(&ctl->count)) 424 kfree(ctl); 425 } 426 427 /* 428 * When we wait for progress in the block group caching, its because our 429 * allocation attempt failed at least once. So, we must sleep and let some 430 * progress happen before we try again. 431 * 432 * This function will sleep at least once waiting for new free space to show 433 * up, and then it will check the block group free space numbers for our min 434 * num_bytes. Another option is to have it go ahead and look in the rbtree for 435 * a free extent of a given size, but this is a good start. 436 * 437 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 438 * any of the information in this block group. 439 */ 440 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, 441 u64 num_bytes) 442 { 443 struct btrfs_caching_control *caching_ctl; 444 int progress; 445 446 caching_ctl = btrfs_get_caching_control(cache); 447 if (!caching_ctl) 448 return; 449 450 /* 451 * We've already failed to allocate from this block group, so even if 452 * there's enough space in the block group it isn't contiguous enough to 453 * allow for an allocation, so wait for at least the next wakeup tick, 454 * or for the thing to be done. 
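 *
 * (The "wakeup tick" here is the caching thread bumping
 * caching_ctl->progress, which it does roughly every CACHING_CTL_WAKE_UP
 * bytes of free space it adds. So the wait below finishes once the block
 * group is fully cached (or errored), or once at least one tick has
 * happened and free_space has grown to cover num_bytes.)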
455 */ 456 progress = atomic_read(&caching_ctl->progress); 457 458 wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || 459 (progress != atomic_read(&caching_ctl->progress) && 460 (cache->free_space_ctl->free_space >= num_bytes))); 461 462 btrfs_put_caching_control(caching_ctl); 463 } 464 465 static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache, 466 struct btrfs_caching_control *caching_ctl) 467 { 468 wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); 469 return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0; 470 } 471 472 static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) 473 { 474 struct btrfs_caching_control *caching_ctl; 475 int ret; 476 477 caching_ctl = btrfs_get_caching_control(cache); 478 if (!caching_ctl) 479 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; 480 ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); 481 btrfs_put_caching_control(caching_ctl); 482 return ret; 483 } 484 485 #ifdef CONFIG_BTRFS_DEBUG 486 static void fragment_free_space(struct btrfs_block_group *block_group) 487 { 488 struct btrfs_fs_info *fs_info = block_group->fs_info; 489 u64 start = block_group->start; 490 u64 len = block_group->length; 491 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 492 fs_info->nodesize : fs_info->sectorsize; 493 u64 step = chunk << 1; 494 495 while (len > chunk) { 496 btrfs_remove_free_space(block_group, start, chunk); 497 start += step; 498 if (len < step) 499 len = 0; 500 else 501 len -= step; 502 } 503 } 504 #endif 505 506 /* 507 * This is only called by btrfs_cache_block_group, since we could have freed 508 * extents we need to check the pinned_extents for any extents that can't be 509 * used yet since their free space will be released as soon as the transaction 510 * commits. 511 */ 512 int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, 513 u64 *total_added_ret) 514 { 515 struct btrfs_fs_info *info = block_group->fs_info; 516 u64 extent_start, extent_end, size; 517 int ret; 518 519 if (total_added_ret) 520 *total_added_ret = 0; 521 522 while (start < end) { 523 ret = find_first_extent_bit(&info->excluded_extents, start, 524 &extent_start, &extent_end, 525 EXTENT_DIRTY | EXTENT_UPTODATE, 526 NULL); 527 if (ret) 528 break; 529 530 if (extent_start <= start) { 531 start = extent_end + 1; 532 } else if (extent_start > start && extent_start < end) { 533 size = extent_start - start; 534 ret = btrfs_add_free_space_async_trimmed(block_group, 535 start, size); 536 if (ret) 537 return ret; 538 if (total_added_ret) 539 *total_added_ret += size; 540 start = extent_end + 1; 541 } else { 542 break; 543 } 544 } 545 546 if (start < end) { 547 size = end - start; 548 ret = btrfs_add_free_space_async_trimmed(block_group, start, 549 size); 550 if (ret) 551 return ret; 552 if (total_added_ret) 553 *total_added_ret += size; 554 } 555 556 return 0; 557 } 558 559 /* 560 * Get an arbitrary extent item index / max_index through the block group 561 * 562 * @block_group the block group to sample from 563 * @index: the integral step through the block group to grab from 564 * @max_index: the granularity of the sampling 565 * @key: return value parameter for the item we find 566 * 567 * Pre-conditions on indices: 568 * 0 <= index <= max_index 569 * 0 < max_index 570 * 571 * Returns: 0 on success, 1 if the search didn't yield a useful item, negative 572 * error code on error. 
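 *
 * For example (illustrative numbers): with max_index == 5 and a 1 GiB block
 * group, the searches for index 0..4 start at block_group->start plus
 * 0, 1, 2, 3 and 4 times div_u64(SZ_1G, 5), i.e. samples spaced roughly
 * 205 MiB apart across the block group.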
573 */ 574 static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, 575 struct btrfs_block_group *block_group, 576 int index, int max_index, 577 struct btrfs_key *found_key) 578 { 579 struct btrfs_fs_info *fs_info = block_group->fs_info; 580 struct btrfs_root *extent_root; 581 u64 search_offset; 582 u64 search_end = block_group->start + block_group->length; 583 struct btrfs_path *path; 584 struct btrfs_key search_key; 585 int ret = 0; 586 587 ASSERT(index >= 0); 588 ASSERT(index <= max_index); 589 ASSERT(max_index > 0); 590 lockdep_assert_held(&caching_ctl->mutex); 591 lockdep_assert_held_read(&fs_info->commit_root_sem); 592 593 path = btrfs_alloc_path(); 594 if (!path) 595 return -ENOMEM; 596 597 extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, 598 BTRFS_SUPER_INFO_OFFSET)); 599 600 path->skip_locking = 1; 601 path->search_commit_root = 1; 602 path->reada = READA_FORWARD; 603 604 search_offset = index * div_u64(block_group->length, max_index); 605 search_key.objectid = block_group->start + search_offset; 606 search_key.type = BTRFS_EXTENT_ITEM_KEY; 607 search_key.offset = 0; 608 609 btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) { 610 /* Success; sampled an extent item in the block group */ 611 if (found_key->type == BTRFS_EXTENT_ITEM_KEY && 612 found_key->objectid >= block_group->start && 613 found_key->objectid + found_key->offset <= search_end) 614 break; 615 616 /* We can't possibly find a valid extent item anymore */ 617 if (found_key->objectid >= search_end) { 618 ret = 1; 619 break; 620 } 621 } 622 623 lockdep_assert_held(&caching_ctl->mutex); 624 lockdep_assert_held_read(&fs_info->commit_root_sem); 625 btrfs_free_path(path); 626 return ret; 627 } 628 629 /* 630 * Best effort attempt to compute a block group's size class while caching it. 631 * 632 * @block_group: the block group we are caching 633 * 634 * We cannot infer the size class while adding free space extents, because that 635 * logic doesn't care about contiguous file extents (it doesn't differentiate 636 * between a 100M extent and 100 contiguous 1M extents). So we need to read the 637 * file extent items. Reading all of them is quite wasteful, because usually 638 * only a handful are enough to give a good answer. Therefore, we just grab 5 of 639 * them at even steps through the block group and pick the smallest size class 640 * we see. Since size class is best effort, and not guaranteed in general, 641 * inaccuracy is acceptable. 642 * 643 * To be more explicit about why this algorithm makes sense: 644 * 645 * If we are caching in a block group from disk, then there are three major cases 646 * to consider: 647 * 1. the block group is well behaved and all extents in it are the same size 648 * class. 649 * 2. the block group is mostly one size class with rare exceptions for last 650 * ditch allocations 651 * 3. the block group was populated before size classes and can have a totally 652 * arbitrary mix of size classes. 653 * 654 * In case 1, looking at any extent in the block group will yield the correct 655 * result. For the mixed cases, taking the minimum size class seems like a good 656 * approximation, since gaps from frees will be usable to the size class. For 657 * 2., a small handful of file extents is likely to yield the right answer. For 658 * 3, we can either read every file extent, or admit that this is best effort 659 * anyway and try to stay fast. 660 * 661 * Returns: 0 on success, negative error code on error. 
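 *
 * For example (illustrative sizes): if the five sampled extent items have
 * lengths 64M, 32M, 512K, 48M and 16M, min_size ends up at 512K and the
 * block group gets the size class of btrfs_calc_block_group_size_class(512K),
 * i.e. the classification follows the smallest extent seen, not the most
 * common one.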
662 */ 663 static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, 664 struct btrfs_block_group *block_group) 665 { 666 struct btrfs_fs_info *fs_info = block_group->fs_info; 667 struct btrfs_key key; 668 int i; 669 u64 min_size = block_group->length; 670 enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; 671 int ret; 672 673 if (!btrfs_block_group_should_use_size_class(block_group)) 674 return 0; 675 676 lockdep_assert_held(&caching_ctl->mutex); 677 lockdep_assert_held_read(&fs_info->commit_root_sem); 678 for (i = 0; i < 5; ++i) { 679 ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); 680 if (ret < 0) 681 goto out; 682 if (ret > 0) 683 continue; 684 min_size = min_t(u64, min_size, key.offset); 685 size_class = btrfs_calc_block_group_size_class(min_size); 686 } 687 if (size_class != BTRFS_BG_SZ_NONE) { 688 spin_lock(&block_group->lock); 689 block_group->size_class = size_class; 690 spin_unlock(&block_group->lock); 691 } 692 out: 693 return ret; 694 } 695 696 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 697 { 698 struct btrfs_block_group *block_group = caching_ctl->block_group; 699 struct btrfs_fs_info *fs_info = block_group->fs_info; 700 struct btrfs_root *extent_root; 701 struct btrfs_path *path; 702 struct extent_buffer *leaf; 703 struct btrfs_key key; 704 u64 total_found = 0; 705 u64 last = 0; 706 u32 nritems; 707 int ret; 708 bool wakeup = true; 709 710 path = btrfs_alloc_path(); 711 if (!path) 712 return -ENOMEM; 713 714 last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); 715 extent_root = btrfs_extent_root(fs_info, last); 716 717 #ifdef CONFIG_BTRFS_DEBUG 718 /* 719 * If we're fragmenting we don't want to make anybody think we can 720 * allocate from this block group until we've had a chance to fragment 721 * the free space. 722 */ 723 if (btrfs_should_fragment_free_space(block_group)) 724 wakeup = false; 725 #endif 726 /* 727 * We don't want to deadlock with somebody trying to allocate a new 728 * extent for the extent root while also trying to search the extent 729 * root to add free space. 
So we skip locking and search the commit 730 * root, since its read-only 731 */ 732 path->skip_locking = 1; 733 path->search_commit_root = 1; 734 path->reada = READA_FORWARD; 735 736 key.objectid = last; 737 key.offset = 0; 738 key.type = BTRFS_EXTENT_ITEM_KEY; 739 740 next: 741 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 742 if (ret < 0) 743 goto out; 744 745 leaf = path->nodes[0]; 746 nritems = btrfs_header_nritems(leaf); 747 748 while (1) { 749 if (btrfs_fs_closing(fs_info) > 1) { 750 last = (u64)-1; 751 break; 752 } 753 754 if (path->slots[0] < nritems) { 755 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 756 } else { 757 ret = btrfs_find_next_key(extent_root, path, &key, 0, 0); 758 if (ret) 759 break; 760 761 if (need_resched() || 762 rwsem_is_contended(&fs_info->commit_root_sem)) { 763 btrfs_release_path(path); 764 up_read(&fs_info->commit_root_sem); 765 mutex_unlock(&caching_ctl->mutex); 766 cond_resched(); 767 mutex_lock(&caching_ctl->mutex); 768 down_read(&fs_info->commit_root_sem); 769 goto next; 770 } 771 772 ret = btrfs_next_leaf(extent_root, path); 773 if (ret < 0) 774 goto out; 775 if (ret) 776 break; 777 leaf = path->nodes[0]; 778 nritems = btrfs_header_nritems(leaf); 779 continue; 780 } 781 782 if (key.objectid < last) { 783 key.objectid = last; 784 key.offset = 0; 785 key.type = BTRFS_EXTENT_ITEM_KEY; 786 btrfs_release_path(path); 787 goto next; 788 } 789 790 if (key.objectid < block_group->start) { 791 path->slots[0]++; 792 continue; 793 } 794 795 if (key.objectid >= block_group->start + block_group->length) 796 break; 797 798 if (key.type == BTRFS_EXTENT_ITEM_KEY || 799 key.type == BTRFS_METADATA_ITEM_KEY) { 800 u64 space_added; 801 802 ret = add_new_free_space(block_group, last, key.objectid, 803 &space_added); 804 if (ret) 805 goto out; 806 total_found += space_added; 807 if (key.type == BTRFS_METADATA_ITEM_KEY) 808 last = key.objectid + 809 fs_info->nodesize; 810 else 811 last = key.objectid + key.offset; 812 813 if (total_found > CACHING_CTL_WAKE_UP) { 814 total_found = 0; 815 if (wakeup) { 816 atomic_inc(&caching_ctl->progress); 817 wake_up(&caching_ctl->wait); 818 } 819 } 820 } 821 path->slots[0]++; 822 } 823 824 ret = add_new_free_space(block_group, last, 825 block_group->start + block_group->length, 826 NULL); 827 out: 828 btrfs_free_path(path); 829 return ret; 830 } 831 832 static noinline void caching_thread(struct btrfs_work *work) 833 { 834 struct btrfs_block_group *block_group; 835 struct btrfs_fs_info *fs_info; 836 struct btrfs_caching_control *caching_ctl; 837 int ret; 838 839 caching_ctl = container_of(work, struct btrfs_caching_control, work); 840 block_group = caching_ctl->block_group; 841 fs_info = block_group->fs_info; 842 843 mutex_lock(&caching_ctl->mutex); 844 down_read(&fs_info->commit_root_sem); 845 846 load_block_group_size_class(caching_ctl, block_group); 847 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 848 ret = load_free_space_cache(block_group); 849 if (ret == 1) { 850 ret = 0; 851 goto done; 852 } 853 854 /* 855 * We failed to load the space cache, set ourselves to 856 * CACHE_STARTED and carry on. 857 */ 858 spin_lock(&block_group->lock); 859 block_group->cached = BTRFS_CACHE_STARTED; 860 spin_unlock(&block_group->lock); 861 wake_up(&caching_ctl->wait); 862 } 863 864 /* 865 * If we are in the transaction that populated the free space tree we 866 * can't actually cache from the free space tree as our commit root and 867 * real root are the same, so we could change the contents of the blocks 868 * while caching. 
Instead do the slow caching in this case, and after 869 * the transaction has committed we will be safe. 870 */ 871 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && 872 !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) 873 ret = load_free_space_tree(caching_ctl); 874 else 875 ret = load_extent_tree_free(caching_ctl); 876 done: 877 spin_lock(&block_group->lock); 878 block_group->caching_ctl = NULL; 879 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; 880 spin_unlock(&block_group->lock); 881 882 #ifdef CONFIG_BTRFS_DEBUG 883 if (btrfs_should_fragment_free_space(block_group)) { 884 u64 bytes_used; 885 886 spin_lock(&block_group->space_info->lock); 887 spin_lock(&block_group->lock); 888 bytes_used = block_group->length - block_group->used; 889 block_group->space_info->bytes_used += bytes_used >> 1; 890 spin_unlock(&block_group->lock); 891 spin_unlock(&block_group->space_info->lock); 892 fragment_free_space(block_group); 893 } 894 #endif 895 896 up_read(&fs_info->commit_root_sem); 897 btrfs_free_excluded_extents(block_group); 898 mutex_unlock(&caching_ctl->mutex); 899 900 wake_up(&caching_ctl->wait); 901 902 btrfs_put_caching_control(caching_ctl); 903 btrfs_put_block_group(block_group); 904 } 905 906 int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) 907 { 908 struct btrfs_fs_info *fs_info = cache->fs_info; 909 struct btrfs_caching_control *caching_ctl = NULL; 910 int ret = 0; 911 912 /* Allocator for zoned filesystems does not use the cache at all */ 913 if (btrfs_is_zoned(fs_info)) 914 return 0; 915 916 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 917 if (!caching_ctl) 918 return -ENOMEM; 919 920 INIT_LIST_HEAD(&caching_ctl->list); 921 mutex_init(&caching_ctl->mutex); 922 init_waitqueue_head(&caching_ctl->wait); 923 caching_ctl->block_group = cache; 924 refcount_set(&caching_ctl->count, 2); 925 atomic_set(&caching_ctl->progress, 0); 926 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); 927 928 spin_lock(&cache->lock); 929 if (cache->cached != BTRFS_CACHE_NO) { 930 kfree(caching_ctl); 931 932 caching_ctl = cache->caching_ctl; 933 if (caching_ctl) 934 refcount_inc(&caching_ctl->count); 935 spin_unlock(&cache->lock); 936 goto out; 937 } 938 WARN_ON(cache->caching_ctl); 939 cache->caching_ctl = caching_ctl; 940 cache->cached = BTRFS_CACHE_STARTED; 941 spin_unlock(&cache->lock); 942 943 write_lock(&fs_info->block_group_cache_lock); 944 refcount_inc(&caching_ctl->count); 945 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 946 write_unlock(&fs_info->block_group_cache_lock); 947 948 btrfs_get_block_group(cache); 949 950 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 951 out: 952 if (wait && caching_ctl) 953 ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); 954 if (caching_ctl) 955 btrfs_put_caching_control(caching_ctl); 956 957 return ret; 958 } 959 960 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 961 { 962 u64 extra_flags = chunk_to_extended(flags) & 963 BTRFS_EXTENDED_PROFILE_MASK; 964 965 write_seqlock(&fs_info->profiles_lock); 966 if (flags & BTRFS_BLOCK_GROUP_DATA) 967 fs_info->avail_data_alloc_bits &= ~extra_flags; 968 if (flags & BTRFS_BLOCK_GROUP_METADATA) 969 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 970 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 971 fs_info->avail_system_alloc_bits &= ~extra_flags; 972 write_sequnlock(&fs_info->profiles_lock); 973 } 974 975 /* 976 * Clear incompat bits for the following feature(s): 977 * 978 * - 
RAID56 - in case there's neither RAID5 nor RAID6 profile block group 979 * in the whole filesystem 980 * 981 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups 982 */ 983 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) 984 { 985 bool found_raid56 = false; 986 bool found_raid1c34 = false; 987 988 if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) || 989 (flags & BTRFS_BLOCK_GROUP_RAID1C3) || 990 (flags & BTRFS_BLOCK_GROUP_RAID1C4)) { 991 struct list_head *head = &fs_info->space_info; 992 struct btrfs_space_info *sinfo; 993 994 list_for_each_entry_rcu(sinfo, head, list) { 995 down_read(&sinfo->groups_sem); 996 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) 997 found_raid56 = true; 998 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) 999 found_raid56 = true; 1000 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3])) 1001 found_raid1c34 = true; 1002 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4])) 1003 found_raid1c34 = true; 1004 up_read(&sinfo->groups_sem); 1005 } 1006 if (!found_raid56) 1007 btrfs_clear_fs_incompat(fs_info, RAID56); 1008 if (!found_raid1c34) 1009 btrfs_clear_fs_incompat(fs_info, RAID1C34); 1010 } 1011 } 1012 1013 static int remove_block_group_item(struct btrfs_trans_handle *trans, 1014 struct btrfs_path *path, 1015 struct btrfs_block_group *block_group) 1016 { 1017 struct btrfs_fs_info *fs_info = trans->fs_info; 1018 struct btrfs_root *root; 1019 struct btrfs_key key; 1020 int ret; 1021 1022 root = btrfs_block_group_root(fs_info); 1023 key.objectid = block_group->start; 1024 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 1025 key.offset = block_group->length; 1026 1027 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1028 if (ret > 0) 1029 ret = -ENOENT; 1030 if (ret < 0) 1031 return ret; 1032 1033 ret = btrfs_del_item(trans, root, path); 1034 return ret; 1035 } 1036 1037 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 1038 u64 group_start, struct extent_map *em) 1039 { 1040 struct btrfs_fs_info *fs_info = trans->fs_info; 1041 struct btrfs_path *path; 1042 struct btrfs_block_group *block_group; 1043 struct btrfs_free_cluster *cluster; 1044 struct inode *inode; 1045 struct kobject *kobj = NULL; 1046 int ret; 1047 int index; 1048 int factor; 1049 struct btrfs_caching_control *caching_ctl = NULL; 1050 bool remove_em; 1051 bool remove_rsv = false; 1052 1053 block_group = btrfs_lookup_block_group(fs_info, group_start); 1054 BUG_ON(!block_group); 1055 BUG_ON(!block_group->ro); 1056 1057 trace_btrfs_remove_block_group(block_group); 1058 /* 1059 * Free the reserved super bytes from this block group before 1060 * remove it. 
1061 */ 1062 btrfs_free_excluded_extents(block_group); 1063 btrfs_free_ref_tree_range(fs_info, block_group->start, 1064 block_group->length); 1065 1066 index = btrfs_bg_flags_to_raid_index(block_group->flags); 1067 factor = btrfs_bg_type_to_factor(block_group->flags); 1068 1069 /* make sure this block group isn't part of an allocation cluster */ 1070 cluster = &fs_info->data_alloc_cluster; 1071 spin_lock(&cluster->refill_lock); 1072 btrfs_return_cluster_to_free_space(block_group, cluster); 1073 spin_unlock(&cluster->refill_lock); 1074 1075 /* 1076 * make sure this block group isn't part of a metadata 1077 * allocation cluster 1078 */ 1079 cluster = &fs_info->meta_alloc_cluster; 1080 spin_lock(&cluster->refill_lock); 1081 btrfs_return_cluster_to_free_space(block_group, cluster); 1082 spin_unlock(&cluster->refill_lock); 1083 1084 btrfs_clear_treelog_bg(block_group); 1085 btrfs_clear_data_reloc_bg(block_group); 1086 1087 path = btrfs_alloc_path(); 1088 if (!path) { 1089 ret = -ENOMEM; 1090 goto out; 1091 } 1092 1093 /* 1094 * get the inode first so any iput calls done for the io_list 1095 * aren't the final iput (no unlinks allowed now) 1096 */ 1097 inode = lookup_free_space_inode(block_group, path); 1098 1099 mutex_lock(&trans->transaction->cache_write_mutex); 1100 /* 1101 * Make sure our free space cache IO is done before removing the 1102 * free space inode 1103 */ 1104 spin_lock(&trans->transaction->dirty_bgs_lock); 1105 if (!list_empty(&block_group->io_list)) { 1106 list_del_init(&block_group->io_list); 1107 1108 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 1109 1110 spin_unlock(&trans->transaction->dirty_bgs_lock); 1111 btrfs_wait_cache_io(trans, block_group, path); 1112 btrfs_put_block_group(block_group); 1113 spin_lock(&trans->transaction->dirty_bgs_lock); 1114 } 1115 1116 if (!list_empty(&block_group->dirty_list)) { 1117 list_del_init(&block_group->dirty_list); 1118 remove_rsv = true; 1119 btrfs_put_block_group(block_group); 1120 } 1121 spin_unlock(&trans->transaction->dirty_bgs_lock); 1122 mutex_unlock(&trans->transaction->cache_write_mutex); 1123 1124 ret = btrfs_remove_free_space_inode(trans, inode, block_group); 1125 if (ret) 1126 goto out; 1127 1128 write_lock(&fs_info->block_group_cache_lock); 1129 rb_erase_cached(&block_group->cache_node, 1130 &fs_info->block_group_cache_tree); 1131 RB_CLEAR_NODE(&block_group->cache_node); 1132 1133 /* Once for the block groups rbtree */ 1134 btrfs_put_block_group(block_group); 1135 1136 write_unlock(&fs_info->block_group_cache_lock); 1137 1138 down_write(&block_group->space_info->groups_sem); 1139 /* 1140 * we must use list_del_init so people can check to see if they 1141 * are still on the list after taking the semaphore 1142 */ 1143 list_del_init(&block_group->list); 1144 if (list_empty(&block_group->space_info->block_groups[index])) { 1145 kobj = block_group->space_info->block_group_kobjs[index]; 1146 block_group->space_info->block_group_kobjs[index] = NULL; 1147 clear_avail_alloc_bits(fs_info, block_group->flags); 1148 } 1149 up_write(&block_group->space_info->groups_sem); 1150 clear_incompat_bg_bits(fs_info, block_group->flags); 1151 if (kobj) { 1152 kobject_del(kobj); 1153 kobject_put(kobj); 1154 } 1155 1156 if (block_group->cached == BTRFS_CACHE_STARTED) 1157 btrfs_wait_block_group_cache_done(block_group); 1158 1159 write_lock(&fs_info->block_group_cache_lock); 1160 caching_ctl = btrfs_get_caching_control(block_group); 1161 if (!caching_ctl) { 1162 struct btrfs_caching_control *ctl; 1163 1164 list_for_each_entry(ctl, 
&fs_info->caching_block_groups, list) { 1165 if (ctl->block_group == block_group) { 1166 caching_ctl = ctl; 1167 refcount_inc(&caching_ctl->count); 1168 break; 1169 } 1170 } 1171 } 1172 if (caching_ctl) 1173 list_del_init(&caching_ctl->list); 1174 write_unlock(&fs_info->block_group_cache_lock); 1175 1176 if (caching_ctl) { 1177 /* Once for the caching bgs list and once for us. */ 1178 btrfs_put_caching_control(caching_ctl); 1179 btrfs_put_caching_control(caching_ctl); 1180 } 1181 1182 spin_lock(&trans->transaction->dirty_bgs_lock); 1183 WARN_ON(!list_empty(&block_group->dirty_list)); 1184 WARN_ON(!list_empty(&block_group->io_list)); 1185 spin_unlock(&trans->transaction->dirty_bgs_lock); 1186 1187 btrfs_remove_free_space_cache(block_group); 1188 1189 spin_lock(&block_group->space_info->lock); 1190 list_del_init(&block_group->ro_list); 1191 1192 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 1193 WARN_ON(block_group->space_info->total_bytes 1194 < block_group->length); 1195 WARN_ON(block_group->space_info->bytes_readonly 1196 < block_group->length - block_group->zone_unusable); 1197 WARN_ON(block_group->space_info->bytes_zone_unusable 1198 < block_group->zone_unusable); 1199 WARN_ON(block_group->space_info->disk_total 1200 < block_group->length * factor); 1201 } 1202 block_group->space_info->total_bytes -= block_group->length; 1203 block_group->space_info->bytes_readonly -= 1204 (block_group->length - block_group->zone_unusable); 1205 block_group->space_info->bytes_zone_unusable -= 1206 block_group->zone_unusable; 1207 block_group->space_info->disk_total -= block_group->length * factor; 1208 1209 spin_unlock(&block_group->space_info->lock); 1210 1211 /* 1212 * Remove the free space for the block group from the free space tree 1213 * and the block group's item from the extent tree before marking the 1214 * block group as removed. This is to prevent races between a task that 1215 * freezes and unfreezes a block group, this task and another task 1216 * allocating a new block group - the unfreeze task ends up removing 1217 * the block group's extent map before the task calling this function 1218 * deletes the block group item from the extent tree, allowing for 1219 * another task to attempt to create another block group with the same 1220 * item key (and failing with -EEXIST and a transaction abort). 1221 */ 1222 ret = remove_block_group_free_space(trans, block_group); 1223 if (ret) 1224 goto out; 1225 1226 ret = remove_block_group_item(trans, path, block_group); 1227 if (ret < 0) 1228 goto out; 1229 1230 spin_lock(&block_group->lock); 1231 set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); 1232 1233 /* 1234 * At this point trimming or scrub can't start on this block group, 1235 * because we removed the block group from the rbtree 1236 * fs_info->block_group_cache_tree so no one can find it anymore and 1237 * even if someone already got this block group before we removed it 1238 * from the rbtree, they have already incremented block_group->frozen - 1239 * if they didn't, for the trimming case they won't find any free space 1240 * entries because we already removed them all when we called 1241 * btrfs_remove_free_space_cache(). 1242 * 1243 * And we must not remove the extent map from the fs_info->mapping_tree 1244 * to prevent the same logical address range and physical device space 1245 * ranges from being reused for a new block group. This is needed to 1246 * avoid races with trimming and scrub.
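 *
 * Concretely, remove_em below stays false while block_group->frozen is
 * still elevated; in that case the final btrfs_unfreeze_block_group()
 * call is what ends up removing the extent map once the block group has
 * been marked BLOCK_GROUP_FLAG_REMOVED.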
1247 * 1248 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 1249 * completely transactionless, so while it is trimming a range the 1250 * currently running transaction might finish and a new one start, 1251 * allowing for new block groups to be created that can reuse the same 1252 * physical device locations unless we take this special care. 1253 * 1254 * There may also be an implicit trim operation if the file system 1255 * is mounted with -odiscard. The same protections must remain 1256 * in place until the extents have been discarded completely when 1257 * the transaction commit has completed. 1258 */ 1259 remove_em = (atomic_read(&block_group->frozen) == 0); 1260 spin_unlock(&block_group->lock); 1261 1262 if (remove_em) { 1263 struct extent_map_tree *em_tree; 1264 1265 em_tree = &fs_info->mapping_tree; 1266 write_lock(&em_tree->lock); 1267 remove_extent_mapping(em_tree, em); 1268 write_unlock(&em_tree->lock); 1269 /* once for the tree */ 1270 free_extent_map(em); 1271 } 1272 1273 out: 1274 /* Once for the lookup reference */ 1275 btrfs_put_block_group(block_group); 1276 if (remove_rsv) 1277 btrfs_delayed_refs_rsv_release(fs_info, 1); 1278 btrfs_free_path(path); 1279 return ret; 1280 } 1281 1282 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( 1283 struct btrfs_fs_info *fs_info, const u64 chunk_offset) 1284 { 1285 struct btrfs_root *root = btrfs_block_group_root(fs_info); 1286 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 1287 struct extent_map *em; 1288 struct map_lookup *map; 1289 unsigned int num_items; 1290 1291 read_lock(&em_tree->lock); 1292 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 1293 read_unlock(&em_tree->lock); 1294 ASSERT(em && em->start == chunk_offset); 1295 1296 /* 1297 * We need to reserve 3 + N units from the metadata space info in order 1298 * to remove a block group (done at btrfs_remove_chunk() and at 1299 * btrfs_remove_block_group()), which are used for: 1300 * 1301 * 1 unit for adding the free space inode's orphan (located in the tree 1302 * of tree roots). 1303 * 1 unit for deleting the block group item (located in the extent 1304 * tree). 1305 * 1 unit for deleting the free space item (located in tree of tree 1306 * roots). 1307 * N units for deleting N device extent items corresponding to each 1308 * stripe (located in the device tree). 1309 * 1310 * In order to remove a block group we also need to reserve units in the 1311 * system space info in order to update the chunk tree (update one or 1312 * more device items and remove one chunk item), but this is done at 1313 * btrfs_remove_chunk() through a call to check_system_chunk(). 1314 */ 1315 map = em->map_lookup; 1316 num_items = 3 + map->num_stripes; 1317 free_extent_map(em); 1318 1319 return btrfs_start_transaction_fallback_global_rsv(root, num_items); 1320 } 1321 1322 /* 1323 * Mark block group @cache read-only, so later writes won't happen to block 1324 * group @cache. 1325 * 1326 * If @force is not set, this function will only mark the block group readonly 1327 * if we have enough free space (1M) in other metadata/system block groups. 1328 * If @force is set, this function will mark the block group readonly 1329 * without checking free space. 1330 * 1331 * NOTE: This function doesn't care if other block groups can contain all the 1332 * data in this block group. That check should be done by the relocation routine, 1333 * not this function.
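 *
 * Worked example (illustrative numbers): for a 1 GiB data block group with
 * 600 MiB used and nothing reserved, pinned or lost to bytes_super /
 * zone_unusable, num_bytes below is 424 MiB of yet-unallocated space; the
 * group is flipped read-only only if the data space_info can absorb it,
 * i.e. if btrfs_space_info_used() + 424 MiB <= sinfo->total_bytes.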
1334 */ 1335 static int inc_block_group_ro(struct btrfs_block_group *cache, int force) 1336 { 1337 struct btrfs_space_info *sinfo = cache->space_info; 1338 u64 num_bytes; 1339 int ret = -ENOSPC; 1340 1341 spin_lock(&sinfo->lock); 1342 spin_lock(&cache->lock); 1343 1344 if (cache->swap_extents) { 1345 ret = -ETXTBSY; 1346 goto out; 1347 } 1348 1349 if (cache->ro) { 1350 cache->ro++; 1351 ret = 0; 1352 goto out; 1353 } 1354 1355 num_bytes = cache->length - cache->reserved - cache->pinned - 1356 cache->bytes_super - cache->zone_unusable - cache->used; 1357 1358 /* 1359 * Data never overcommits, even in mixed mode, so do just the straight 1360 * check of left over space in how much we have allocated. 1361 */ 1362 if (force) { 1363 ret = 0; 1364 } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { 1365 u64 sinfo_used = btrfs_space_info_used(sinfo, true); 1366 1367 /* 1368 * Here we make sure if we mark this bg RO, we still have enough 1369 * free space as buffer. 1370 */ 1371 if (sinfo_used + num_bytes <= sinfo->total_bytes) 1372 ret = 0; 1373 } else { 1374 /* 1375 * We overcommit metadata, so we need to do the 1376 * btrfs_can_overcommit check here, and we need to pass in 1377 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of 1378 * leeway to allow us to mark this block group as read only. 1379 */ 1380 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, 1381 BTRFS_RESERVE_NO_FLUSH)) 1382 ret = 0; 1383 } 1384 1385 if (!ret) { 1386 sinfo->bytes_readonly += num_bytes; 1387 if (btrfs_is_zoned(cache->fs_info)) { 1388 /* Migrate zone_unusable bytes to readonly */ 1389 sinfo->bytes_readonly += cache->zone_unusable; 1390 sinfo->bytes_zone_unusable -= cache->zone_unusable; 1391 cache->zone_unusable = 0; 1392 } 1393 cache->ro++; 1394 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 1395 } 1396 out: 1397 spin_unlock(&cache->lock); 1398 spin_unlock(&sinfo->lock); 1399 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { 1400 btrfs_info(cache->fs_info, 1401 "unable to make block group %llu ro", cache->start); 1402 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); 1403 } 1404 return ret; 1405 } 1406 1407 static bool clean_pinned_extents(struct btrfs_trans_handle *trans, 1408 struct btrfs_block_group *bg) 1409 { 1410 struct btrfs_fs_info *fs_info = bg->fs_info; 1411 struct btrfs_transaction *prev_trans = NULL; 1412 const u64 start = bg->start; 1413 const u64 end = start + bg->length - 1; 1414 int ret; 1415 1416 spin_lock(&fs_info->trans_lock); 1417 if (trans->transaction->list.prev != &fs_info->trans_list) { 1418 prev_trans = list_last_entry(&trans->transaction->list, 1419 struct btrfs_transaction, list); 1420 refcount_inc(&prev_trans->use_count); 1421 } 1422 spin_unlock(&fs_info->trans_lock); 1423 1424 /* 1425 * Hold the unused_bg_unpin_mutex lock to avoid racing with 1426 * btrfs_finish_extent_commit(). If we are at transaction N, another 1427 * task might be running finish_extent_commit() for the previous 1428 * transaction N - 1, and have seen a range belonging to the block 1429 * group in pinned_extents before we were able to clear the whole block 1430 * group range from pinned_extents. This means that task can lookup for 1431 * the block group after we unpinned it from pinned_extents and removed 1432 * it, leading to a BUG_ON() at unpin_extent_range(). 
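 *
 * An illustrative interleaving of the race being closed (assuming
 * btrfs_finish_extent_commit() takes the same mutex around its unpin):
 *
 *	transaction N - 1 commit                this function
 *	sees bg range in pinned_extents
 *	                                        clears bg range, bg removed
 *	unpin_extent_range() -> BUG_ON()
 *
 * Serializing both sides on unused_bg_unpin_mutex prevents the clear from
 * sliding in between the lookup and the unpin.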
1433 */ 1434 mutex_lock(&fs_info->unused_bg_unpin_mutex); 1435 if (prev_trans) { 1436 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, 1437 EXTENT_DIRTY); 1438 if (ret) 1439 goto out; 1440 } 1441 1442 ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, 1443 EXTENT_DIRTY); 1444 out: 1445 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 1446 if (prev_trans) 1447 btrfs_put_transaction(prev_trans); 1448 1449 return ret == 0; 1450 } 1451 1452 /* 1453 * Process the unused_bgs list and remove any that don't have any allocated 1454 * space inside of them. 1455 */ 1456 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 1457 { 1458 struct btrfs_block_group *block_group; 1459 struct btrfs_space_info *space_info; 1460 struct btrfs_trans_handle *trans; 1461 const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC); 1462 int ret = 0; 1463 1464 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1465 return; 1466 1467 if (btrfs_fs_closing(fs_info)) 1468 return; 1469 1470 /* 1471 * Long running balances can keep us blocked here for eternity, so 1472 * simply skip deletion if we're unable to get the mutex. 1473 */ 1474 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) 1475 return; 1476 1477 spin_lock(&fs_info->unused_bgs_lock); 1478 while (!list_empty(&fs_info->unused_bgs)) { 1479 int trimming; 1480 1481 block_group = list_first_entry(&fs_info->unused_bgs, 1482 struct btrfs_block_group, 1483 bg_list); 1484 list_del_init(&block_group->bg_list); 1485 1486 space_info = block_group->space_info; 1487 1488 if (ret || btrfs_mixed_space_info(space_info)) { 1489 btrfs_put_block_group(block_group); 1490 continue; 1491 } 1492 spin_unlock(&fs_info->unused_bgs_lock); 1493 1494 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 1495 1496 /* Don't want to race with allocators so take the groups_sem */ 1497 down_write(&space_info->groups_sem); 1498 1499 /* 1500 * Async discard moves the final block group discard to be prior 1501 * to the unused_bgs code path. Therefore, if it's not fully 1502 * trimmed, punt it back to the async discard lists. 1503 */ 1504 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) && 1505 !btrfs_is_free_space_trimmed(block_group)) { 1506 trace_btrfs_skip_unused_block_group(block_group); 1507 up_write(&space_info->groups_sem); 1508 /* Requeue if we failed because of async discard */ 1509 btrfs_discard_queue_work(&fs_info->discard_ctl, 1510 block_group); 1511 goto next; 1512 } 1513 1514 spin_lock(&block_group->lock); 1515 if (block_group->reserved || block_group->pinned || 1516 block_group->used || block_group->ro || 1517 list_is_singular(&block_group->list)) { 1518 /* 1519 * We want to bail if we made new allocations or have 1520 * outstanding allocations in this block group. We do 1521 * the ro check in case balance is currently acting on 1522 * this block group. 1523 */ 1524 trace_btrfs_skip_unused_block_group(block_group); 1525 spin_unlock(&block_group->lock); 1526 up_write(&space_info->groups_sem); 1527 goto next; 1528 } 1529 spin_unlock(&block_group->lock); 1530 1531 /* We don't want to force the issue, only flip if it's ok. 
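 *
 * ("Not forcing" means inc_block_group_ro() below is called with force == 0,
 * so the unused block group is only flipped read-only when its space_info
 * can absorb the unallocated bytes, per the checks documented above
 * inc_block_group_ro().)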
*/ 1532 ret = inc_block_group_ro(block_group, 0); 1533 up_write(&space_info->groups_sem); 1534 if (ret < 0) { 1535 ret = 0; 1536 goto next; 1537 } 1538 1539 ret = btrfs_zone_finish(block_group); 1540 if (ret < 0) { 1541 btrfs_dec_block_group_ro(block_group); 1542 if (ret == -EAGAIN) 1543 ret = 0; 1544 goto next; 1545 } 1546 1547 /* 1548 * Want to do this before we do anything else so we can recover 1549 * properly if we fail to join the transaction. 1550 */ 1551 trans = btrfs_start_trans_remove_block_group(fs_info, 1552 block_group->start); 1553 if (IS_ERR(trans)) { 1554 btrfs_dec_block_group_ro(block_group); 1555 ret = PTR_ERR(trans); 1556 goto next; 1557 } 1558 1559 /* 1560 * We could have pending pinned extents for this block group, 1561 * just delete them, we don't care about them anymore. 1562 */ 1563 if (!clean_pinned_extents(trans, block_group)) { 1564 btrfs_dec_block_group_ro(block_group); 1565 goto end_trans; 1566 } 1567 1568 /* 1569 * At this point, the block_group is read only and should fail 1570 * new allocations. However, btrfs_finish_extent_commit() can 1571 * cause this block_group to be placed back on the discard 1572 * lists because now the block_group isn't fully discarded. 1573 * Bail here and try again later after discarding everything. 1574 */ 1575 spin_lock(&fs_info->discard_ctl.lock); 1576 if (!list_empty(&block_group->discard_list)) { 1577 spin_unlock(&fs_info->discard_ctl.lock); 1578 btrfs_dec_block_group_ro(block_group); 1579 btrfs_discard_queue_work(&fs_info->discard_ctl, 1580 block_group); 1581 goto end_trans; 1582 } 1583 spin_unlock(&fs_info->discard_ctl.lock); 1584 1585 /* Reset pinned so btrfs_put_block_group doesn't complain */ 1586 spin_lock(&space_info->lock); 1587 spin_lock(&block_group->lock); 1588 1589 btrfs_space_info_update_bytes_pinned(fs_info, space_info, 1590 -block_group->pinned); 1591 space_info->bytes_readonly += block_group->pinned; 1592 block_group->pinned = 0; 1593 1594 spin_unlock(&block_group->lock); 1595 spin_unlock(&space_info->lock); 1596 1597 /* 1598 * The normal path here is an unused block group is passed here, 1599 * then trimming is handled in the transaction commit path. 1600 * Async discard interposes before this to do the trimming 1601 * before coming down the unused block group path as trimming 1602 * will no longer be done later in the transaction commit path. 1603 */ 1604 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) 1605 goto flip_async; 1606 1607 /* 1608 * DISCARD can flip during remount. On zoned filesystems, we 1609 * need to reset sequential-required zones. 1610 */ 1611 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || 1612 btrfs_is_zoned(fs_info); 1613 1614 /* Implicit trim during transaction commit. */ 1615 if (trimming) 1616 btrfs_freeze_block_group(block_group); 1617 1618 /* 1619 * Btrfs_remove_chunk will abort the transaction if things go 1620 * horribly wrong. 1621 */ 1622 ret = btrfs_remove_chunk(trans, block_group->start); 1623 1624 if (ret) { 1625 if (trimming) 1626 btrfs_unfreeze_block_group(block_group); 1627 goto end_trans; 1628 } 1629 1630 /* 1631 * If we're not mounted with -odiscard, we can just forget 1632 * about this block group. Otherwise we'll need to wait 1633 * until transaction commit to do the actual discard. 1634 */ 1635 if (trimming) { 1636 spin_lock(&fs_info->unused_bgs_lock); 1637 /* 1638 * A concurrent scrub might have added us to the list 1639 * fs_info->unused_bgs, so use a list_move operation 1640 * to add the block group to the deleted_bgs list. 
1641 */ 1642 list_move(&block_group->bg_list, 1643 &trans->transaction->deleted_bgs); 1644 spin_unlock(&fs_info->unused_bgs_lock); 1645 btrfs_get_block_group(block_group); 1646 } 1647 end_trans: 1648 btrfs_end_transaction(trans); 1649 next: 1650 btrfs_put_block_group(block_group); 1651 spin_lock(&fs_info->unused_bgs_lock); 1652 } 1653 spin_unlock(&fs_info->unused_bgs_lock); 1654 mutex_unlock(&fs_info->reclaim_bgs_lock); 1655 return; 1656 1657 flip_async: 1658 btrfs_end_transaction(trans); 1659 mutex_unlock(&fs_info->reclaim_bgs_lock); 1660 btrfs_put_block_group(block_group); 1661 btrfs_discard_punt_unused_bgs_list(fs_info); 1662 } 1663 1664 void btrfs_mark_bg_unused(struct btrfs_block_group *bg) 1665 { 1666 struct btrfs_fs_info *fs_info = bg->fs_info; 1667 1668 spin_lock(&fs_info->unused_bgs_lock); 1669 if (list_empty(&bg->bg_list)) { 1670 btrfs_get_block_group(bg); 1671 trace_btrfs_add_unused_block_group(bg); 1672 list_add_tail(&bg->bg_list, &fs_info->unused_bgs); 1673 } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { 1674 /* Pull out the block group from the reclaim_bgs list. */ 1675 trace_btrfs_add_unused_block_group(bg); 1676 list_move_tail(&bg->bg_list, &fs_info->unused_bgs); 1677 } 1678 spin_unlock(&fs_info->unused_bgs_lock); 1679 } 1680 1681 /* 1682 * We want block groups with a low number of used bytes to be in the beginning 1683 * of the list, so they will get reclaimed first. 1684 */ 1685 static int reclaim_bgs_cmp(void *unused, const struct list_head *a, 1686 const struct list_head *b) 1687 { 1688 const struct btrfs_block_group *bg1, *bg2; 1689 1690 bg1 = list_entry(a, struct btrfs_block_group, bg_list); 1691 bg2 = list_entry(b, struct btrfs_block_group, bg_list); 1692 1693 return bg1->used > bg2->used; 1694 } 1695 1696 static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) 1697 { 1698 if (btrfs_is_zoned(fs_info)) 1699 return btrfs_zoned_should_reclaim(fs_info); 1700 return true; 1701 } 1702 1703 static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) 1704 { 1705 const struct btrfs_space_info *space_info = bg->space_info; 1706 const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); 1707 const u64 new_val = bg->used; 1708 const u64 old_val = new_val + bytes_freed; 1709 u64 thresh; 1710 1711 if (reclaim_thresh == 0) 1712 return false; 1713 1714 thresh = mult_perc(bg->length, reclaim_thresh); 1715 1716 /* 1717 * If we were below the threshold before don't reclaim, we are likely a 1718 * brand new block group and we don't want to relocate new block groups. 1719 */ 1720 if (old_val < thresh) 1721 return false; 1722 if (new_val >= thresh) 1723 return false; 1724 return true; 1725 } 1726 1727 void btrfs_reclaim_bgs_work(struct work_struct *work) 1728 { 1729 struct btrfs_fs_info *fs_info = 1730 container_of(work, struct btrfs_fs_info, reclaim_bgs_work); 1731 struct btrfs_block_group *bg; 1732 struct btrfs_space_info *space_info; 1733 1734 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1735 return; 1736 1737 if (btrfs_fs_closing(fs_info)) 1738 return; 1739 1740 if (!btrfs_should_reclaim(fs_info)) 1741 return; 1742 1743 sb_start_write(fs_info->sb); 1744 1745 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 1746 sb_end_write(fs_info->sb); 1747 return; 1748 } 1749 1750 /* 1751 * Long running balances can keep us blocked here for eternity, so 1752 * simply skip reclaim if we're unable to get the mutex. 
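 *
 * (By this point the work item has already checked BTRFS_FS_OPEN and
 * btrfs_fs_closing(), consulted btrfs_should_reclaim(), and taken
 * sb_start_write() plus the BTRFS_EXCLOP_BALANCE exclusive op; the trylock
 * below is the last gate before sorting and walking fs_info->reclaim_bgs.)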
1753 */
1754 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1755 btrfs_exclop_finish(fs_info);
1756 sb_end_write(fs_info->sb);
1757 return;
1758 }
1759
1760 spin_lock(&fs_info->unused_bgs_lock);
1761 /*
1762 * Sort happens under lock because we can't simply splice it and sort.
1763 * The block groups might still be in use and reachable via bg_list,
1764 * and their presence in the reclaim_bgs list must be preserved.
1765 */
1766 list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1767 while (!list_empty(&fs_info->reclaim_bgs)) {
1768 u64 zone_unusable;
1769 int ret = 0;
1770
1771 bg = list_first_entry(&fs_info->reclaim_bgs,
1772 struct btrfs_block_group,
1773 bg_list);
1774 list_del_init(&bg->bg_list);
1775
1776 space_info = bg->space_info;
1777 spin_unlock(&fs_info->unused_bgs_lock);
1778
1779 /* Don't race with allocators so take the groups_sem */
1780 down_write(&space_info->groups_sem);
1781
1782 spin_lock(&bg->lock);
1783 if (bg->reserved || bg->pinned || bg->ro) {
1784 /*
1785 * We want to bail if we made new allocations or have
1786 * outstanding allocations in this block group. We do
1787 * the ro check in case balance is currently acting on
1788 * this block group.
1789 */
1790 spin_unlock(&bg->lock);
1791 up_write(&space_info->groups_sem);
1792 goto next;
1793 }
1794 if (bg->used == 0) {
1795 /*
1796 * It is possible that we trigger relocation on a block
1797 * group as its extents are deleted and it first goes
1798 * below the threshold, then shortly after goes empty.
1799 *
1800 * In this case, relocating it does delete it, but has
1801 * some overhead in relocation specific metadata, looking
1802 * for the non-existent extents and running some extra
1803 * transactions, which we can avoid by using one of the
1804 * other mechanisms for dealing with empty block groups.
1805 */
1806 if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1807 btrfs_mark_bg_unused(bg);
1808 spin_unlock(&bg->lock);
1809 up_write(&space_info->groups_sem);
1810 goto next;
1811
1812 }
1813 /*
1814 * The block group might no longer meet the reclaim condition by
1815 * the time we get around to reclaiming it, so to avoid
1816 * reclaiming overly full block_groups, skip reclaiming them.
1817 *
1818 * Since the decision making process also depends on the amount
1819 * being freed, pass in a fake giant value to skip that extra
1820 * check, which is more meaningful when adding to the list in
1821 * the first place.
1822 */
1823 if (!should_reclaim_block_group(bg, bg->length)) {
1824 spin_unlock(&bg->lock);
1825 up_write(&space_info->groups_sem);
1826 goto next;
1827 }
1828 spin_unlock(&bg->lock);
1829
1830 /*
1831 * Get out fast, in case we're read-only or unmounting the
1832 * filesystem. It is OK to drop block groups from the list even
1833 * for the read-only case. As we did sb_start_write(),
1834 * "mount -o remount,ro" won't happen and read-only filesystem
1835 * means it is forced read-only due to a fatal error. So, it
1836 * never gets back to read-write to let us reclaim again.
1837 */
1838 if (btrfs_need_cleaner_sleep(fs_info)) {
1839 up_write(&space_info->groups_sem);
1840 goto next;
1841 }
1842
1843 /*
1844 * Cache the zone_unusable value before turning the block group
1845 * to read only. As soon as the block group is read only its
1846 * zone_unusable value gets moved to the block group's read-only
1847 * bytes and isn't available for calculations anymore.
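 * (btrfs_dec_block_group_ro() migrates the value back if the
 * relocation below fails and the block group is flipped to
 * read-write again.)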
1848 */ 1849 zone_unusable = bg->zone_unusable; 1850 ret = inc_block_group_ro(bg, 0); 1851 up_write(&space_info->groups_sem); 1852 if (ret < 0) 1853 goto next; 1854 1855 btrfs_info(fs_info, 1856 "reclaiming chunk %llu with %llu%% used %llu%% unusable", 1857 bg->start, 1858 div64_u64(bg->used * 100, bg->length), 1859 div64_u64(zone_unusable * 100, bg->length)); 1860 trace_btrfs_reclaim_block_group(bg); 1861 ret = btrfs_relocate_chunk(fs_info, bg->start); 1862 if (ret) { 1863 btrfs_dec_block_group_ro(bg); 1864 btrfs_err(fs_info, "error relocating chunk %llu", 1865 bg->start); 1866 } 1867 1868 next: 1869 if (ret) 1870 btrfs_mark_bg_to_reclaim(bg); 1871 btrfs_put_block_group(bg); 1872 1873 mutex_unlock(&fs_info->reclaim_bgs_lock); 1874 /* 1875 * Reclaiming all the block groups in the list can take really 1876 * long. Prioritize cleaning up unused block groups. 1877 */ 1878 btrfs_delete_unused_bgs(fs_info); 1879 /* 1880 * If we are interrupted by a balance, we can just bail out. The 1881 * cleaner thread restart again if necessary. 1882 */ 1883 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) 1884 goto end; 1885 spin_lock(&fs_info->unused_bgs_lock); 1886 } 1887 spin_unlock(&fs_info->unused_bgs_lock); 1888 mutex_unlock(&fs_info->reclaim_bgs_lock); 1889 end: 1890 btrfs_exclop_finish(fs_info); 1891 sb_end_write(fs_info->sb); 1892 } 1893 1894 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) 1895 { 1896 spin_lock(&fs_info->unused_bgs_lock); 1897 if (!list_empty(&fs_info->reclaim_bgs)) 1898 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); 1899 spin_unlock(&fs_info->unused_bgs_lock); 1900 } 1901 1902 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) 1903 { 1904 struct btrfs_fs_info *fs_info = bg->fs_info; 1905 1906 spin_lock(&fs_info->unused_bgs_lock); 1907 if (list_empty(&bg->bg_list)) { 1908 btrfs_get_block_group(bg); 1909 trace_btrfs_add_reclaim_block_group(bg); 1910 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); 1911 } 1912 spin_unlock(&fs_info->unused_bgs_lock); 1913 } 1914 1915 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, 1916 struct btrfs_path *path) 1917 { 1918 struct extent_map_tree *em_tree; 1919 struct extent_map *em; 1920 struct btrfs_block_group_item bg; 1921 struct extent_buffer *leaf; 1922 int slot; 1923 u64 flags; 1924 int ret = 0; 1925 1926 slot = path->slots[0]; 1927 leaf = path->nodes[0]; 1928 1929 em_tree = &fs_info->mapping_tree; 1930 read_lock(&em_tree->lock); 1931 em = lookup_extent_mapping(em_tree, key->objectid, key->offset); 1932 read_unlock(&em_tree->lock); 1933 if (!em) { 1934 btrfs_err(fs_info, 1935 "logical %llu len %llu found bg but no related chunk", 1936 key->objectid, key->offset); 1937 return -ENOENT; 1938 } 1939 1940 if (em->start != key->objectid || em->len != key->offset) { 1941 btrfs_err(fs_info, 1942 "block group %llu len %llu mismatch with chunk %llu len %llu", 1943 key->objectid, key->offset, em->start, em->len); 1944 ret = -EUCLEAN; 1945 goto out_free_em; 1946 } 1947 1948 read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), 1949 sizeof(bg)); 1950 flags = btrfs_stack_block_group_flags(&bg) & 1951 BTRFS_BLOCK_GROUP_TYPE_MASK; 1952 1953 if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 1954 btrfs_err(fs_info, 1955 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", 1956 key->objectid, key->offset, flags, 1957 (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); 1958 ret = -EUCLEAN; 1959 } 1960 1961 out_free_em: 1962 
free_extent_map(em); 1963 return ret; 1964 } 1965 1966 static int find_first_block_group(struct btrfs_fs_info *fs_info, 1967 struct btrfs_path *path, 1968 struct btrfs_key *key) 1969 { 1970 struct btrfs_root *root = btrfs_block_group_root(fs_info); 1971 int ret; 1972 struct btrfs_key found_key; 1973 1974 btrfs_for_each_slot(root, key, &found_key, path, ret) { 1975 if (found_key.objectid >= key->objectid && 1976 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 1977 return read_bg_from_eb(fs_info, &found_key, path); 1978 } 1979 } 1980 return ret; 1981 } 1982 1983 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 1984 { 1985 u64 extra_flags = chunk_to_extended(flags) & 1986 BTRFS_EXTENDED_PROFILE_MASK; 1987 1988 write_seqlock(&fs_info->profiles_lock); 1989 if (flags & BTRFS_BLOCK_GROUP_DATA) 1990 fs_info->avail_data_alloc_bits |= extra_flags; 1991 if (flags & BTRFS_BLOCK_GROUP_METADATA) 1992 fs_info->avail_metadata_alloc_bits |= extra_flags; 1993 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 1994 fs_info->avail_system_alloc_bits |= extra_flags; 1995 write_sequnlock(&fs_info->profiles_lock); 1996 } 1997 1998 /* 1999 * Map a physical disk address to a list of logical addresses. 2000 * 2001 * @fs_info: the filesystem 2002 * @chunk_start: logical address of block group 2003 * @physical: physical address to map to logical addresses 2004 * @logical: return array of logical addresses which map to @physical 2005 * @naddrs: length of @logical 2006 * @stripe_len: size of IO stripe for the given block group 2007 * 2008 * Maps a particular @physical disk address to a list of @logical addresses. 2009 * Used primarily to exclude those portions of a block group that contain super 2010 * block copies. 2011 */ 2012 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 2013 u64 physical, u64 **logical, int *naddrs, int *stripe_len) 2014 { 2015 struct extent_map *em; 2016 struct map_lookup *map; 2017 u64 *buf; 2018 u64 bytenr; 2019 u64 data_stripe_length; 2020 u64 io_stripe_size; 2021 int i, nr = 0; 2022 int ret = 0; 2023 2024 em = btrfs_get_chunk_map(fs_info, chunk_start, 1); 2025 if (IS_ERR(em)) 2026 return -EIO; 2027 2028 map = em->map_lookup; 2029 data_stripe_length = em->orig_block_len; 2030 io_stripe_size = BTRFS_STRIPE_LEN; 2031 chunk_start = em->start; 2032 2033 /* For RAID5/6 adjust to a full IO stripe length */ 2034 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 2035 io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 2036 2037 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 2038 if (!buf) { 2039 ret = -ENOMEM; 2040 goto out; 2041 } 2042 2043 for (i = 0; i < map->num_stripes; i++) { 2044 bool already_inserted = false; 2045 u32 stripe_nr; 2046 u32 offset; 2047 int j; 2048 2049 if (!in_range(physical, map->stripes[i].physical, 2050 data_stripe_length)) 2051 continue; 2052 2053 stripe_nr = (physical - map->stripes[i].physical) >> 2054 BTRFS_STRIPE_LEN_SHIFT; 2055 offset = (physical - map->stripes[i].physical) & 2056 BTRFS_STRIPE_LEN_MASK; 2057 2058 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 2059 BTRFS_BLOCK_GROUP_RAID10)) 2060 stripe_nr = div_u64(stripe_nr * map->num_stripes + i, 2061 map->sub_stripes); 2062 /* 2063 * The remaining case would be for RAID56, multiply by 2064 * nr_data_stripes(). 
Alternatively, just use rmap_len below 2065 * instead of map->stripe_len 2066 */ 2067 bytenr = chunk_start + stripe_nr * io_stripe_size + offset; 2068 2069 /* Ensure we don't add duplicate addresses */ 2070 for (j = 0; j < nr; j++) { 2071 if (buf[j] == bytenr) { 2072 already_inserted = true; 2073 break; 2074 } 2075 } 2076 2077 if (!already_inserted) 2078 buf[nr++] = bytenr; 2079 } 2080 2081 *logical = buf; 2082 *naddrs = nr; 2083 *stripe_len = io_stripe_size; 2084 out: 2085 free_extent_map(em); 2086 return ret; 2087 } 2088 2089 static int exclude_super_stripes(struct btrfs_block_group *cache) 2090 { 2091 struct btrfs_fs_info *fs_info = cache->fs_info; 2092 const bool zoned = btrfs_is_zoned(fs_info); 2093 u64 bytenr; 2094 u64 *logical; 2095 int stripe_len; 2096 int i, nr, ret; 2097 2098 if (cache->start < BTRFS_SUPER_INFO_OFFSET) { 2099 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; 2100 cache->bytes_super += stripe_len; 2101 ret = btrfs_add_excluded_extent(fs_info, cache->start, 2102 stripe_len); 2103 if (ret) 2104 return ret; 2105 } 2106 2107 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2108 bytenr = btrfs_sb_offset(i); 2109 ret = btrfs_rmap_block(fs_info, cache->start, 2110 bytenr, &logical, &nr, &stripe_len); 2111 if (ret) 2112 return ret; 2113 2114 /* Shouldn't have super stripes in sequential zones */ 2115 if (zoned && nr) { 2116 kfree(logical); 2117 btrfs_err(fs_info, 2118 "zoned: block group %llu must not contain super block", 2119 cache->start); 2120 return -EUCLEAN; 2121 } 2122 2123 while (nr--) { 2124 u64 len = min_t(u64, stripe_len, 2125 cache->start + cache->length - logical[nr]); 2126 2127 cache->bytes_super += len; 2128 ret = btrfs_add_excluded_extent(fs_info, logical[nr], 2129 len); 2130 if (ret) { 2131 kfree(logical); 2132 return ret; 2133 } 2134 } 2135 2136 kfree(logical); 2137 } 2138 return 0; 2139 } 2140 2141 static struct btrfs_block_group *btrfs_create_block_group_cache( 2142 struct btrfs_fs_info *fs_info, u64 start) 2143 { 2144 struct btrfs_block_group *cache; 2145 2146 cache = kzalloc(sizeof(*cache), GFP_NOFS); 2147 if (!cache) 2148 return NULL; 2149 2150 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 2151 GFP_NOFS); 2152 if (!cache->free_space_ctl) { 2153 kfree(cache); 2154 return NULL; 2155 } 2156 2157 cache->start = start; 2158 2159 cache->fs_info = fs_info; 2160 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 2161 2162 cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; 2163 2164 refcount_set(&cache->refs, 1); 2165 spin_lock_init(&cache->lock); 2166 init_rwsem(&cache->data_rwsem); 2167 INIT_LIST_HEAD(&cache->list); 2168 INIT_LIST_HEAD(&cache->cluster_list); 2169 INIT_LIST_HEAD(&cache->bg_list); 2170 INIT_LIST_HEAD(&cache->ro_list); 2171 INIT_LIST_HEAD(&cache->discard_list); 2172 INIT_LIST_HEAD(&cache->dirty_list); 2173 INIT_LIST_HEAD(&cache->io_list); 2174 INIT_LIST_HEAD(&cache->active_bg_list); 2175 btrfs_init_free_space_ctl(cache, cache->free_space_ctl); 2176 atomic_set(&cache->frozen, 0); 2177 mutex_init(&cache->free_space_lock); 2178 2179 return cache; 2180 } 2181 2182 /* 2183 * Iterate all chunks and verify that each of them has the corresponding block 2184 * group 2185 */ 2186 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 2187 { 2188 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 2189 struct extent_map *em; 2190 struct btrfs_block_group *bg; 2191 u64 start = 0; 2192 int ret = 0; 2193 2194 while (1) { 2195 read_lock(&map_tree->lock); 2196 /* 2197 * lookup_extent_mapping will 
return the first extent map 2198 * intersecting the range, so setting @len to 1 is enough to 2199 * get the first chunk. 2200 */ 2201 em = lookup_extent_mapping(map_tree, start, 1); 2202 read_unlock(&map_tree->lock); 2203 if (!em) 2204 break; 2205 2206 bg = btrfs_lookup_block_group(fs_info, em->start); 2207 if (!bg) { 2208 btrfs_err(fs_info, 2209 "chunk start=%llu len=%llu doesn't have corresponding block group", 2210 em->start, em->len); 2211 ret = -EUCLEAN; 2212 free_extent_map(em); 2213 break; 2214 } 2215 if (bg->start != em->start || bg->length != em->len || 2216 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 2217 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 2218 btrfs_err(fs_info, 2219 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 2220 em->start, em->len, 2221 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 2222 bg->start, bg->length, 2223 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 2224 ret = -EUCLEAN; 2225 free_extent_map(em); 2226 btrfs_put_block_group(bg); 2227 break; 2228 } 2229 start = em->start + em->len; 2230 free_extent_map(em); 2231 btrfs_put_block_group(bg); 2232 } 2233 return ret; 2234 } 2235 2236 static int read_one_block_group(struct btrfs_fs_info *info, 2237 struct btrfs_block_group_item *bgi, 2238 const struct btrfs_key *key, 2239 int need_clear) 2240 { 2241 struct btrfs_block_group *cache; 2242 const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); 2243 int ret; 2244 2245 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); 2246 2247 cache = btrfs_create_block_group_cache(info, key->objectid); 2248 if (!cache) 2249 return -ENOMEM; 2250 2251 cache->length = key->offset; 2252 cache->used = btrfs_stack_block_group_used(bgi); 2253 cache->commit_used = cache->used; 2254 cache->flags = btrfs_stack_block_group_flags(bgi); 2255 cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); 2256 2257 set_free_space_tree_thresholds(cache); 2258 2259 if (need_clear) { 2260 /* 2261 * When we mount with old space cache, we need to 2262 * set BTRFS_DC_CLEAR and set dirty flag. 2263 * 2264 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 2265 * truncate the old free space cache inode and 2266 * setup a new one. 2267 * b) Setting 'dirty flag' makes sure that we flush 2268 * the new space cache info onto disk. 2269 */ 2270 if (btrfs_test_opt(info, SPACE_CACHE)) 2271 cache->disk_cache_state = BTRFS_DC_CLEAR; 2272 } 2273 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 2274 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 2275 btrfs_err(info, 2276 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 2277 cache->start); 2278 ret = -EINVAL; 2279 goto error; 2280 } 2281 2282 ret = btrfs_load_block_group_zone_info(cache, false); 2283 if (ret) { 2284 btrfs_err(info, "zoned: failed to load zone info of bg %llu", 2285 cache->start); 2286 goto error; 2287 } 2288 2289 /* 2290 * We need to exclude the super stripes now so that the space info has 2291 * super bytes accounted for, otherwise we'll think we have more space 2292 * than we actually do. 2293 */ 2294 ret = exclude_super_stripes(cache); 2295 if (ret) { 2296 /* We may have excluded something, so call this just in case. */ 2297 btrfs_free_excluded_extents(cache); 2298 goto error; 2299 } 2300 2301 /* 2302 * For zoned filesystem, space after the allocation offset is the only 2303 * free space for a block group. So, we don't need any caching work. 
2304 * btrfs_calc_zone_unusable() will set the amount of free space and 2305 * zone_unusable space. 2306 * 2307 * For regular filesystem, check for two cases, either we are full, and 2308 * therefore don't need to bother with the caching work since we won't 2309 * find any space, or we are empty, and we can just add all the space 2310 * in and be done with it. This saves us _a_lot_ of time, particularly 2311 * in the full case. 2312 */ 2313 if (btrfs_is_zoned(info)) { 2314 btrfs_calc_zone_unusable(cache); 2315 /* Should not have any excluded extents. Just in case, though. */ 2316 btrfs_free_excluded_extents(cache); 2317 } else if (cache->length == cache->used) { 2318 cache->cached = BTRFS_CACHE_FINISHED; 2319 btrfs_free_excluded_extents(cache); 2320 } else if (cache->used == 0) { 2321 cache->cached = BTRFS_CACHE_FINISHED; 2322 ret = add_new_free_space(cache, cache->start, 2323 cache->start + cache->length, NULL); 2324 btrfs_free_excluded_extents(cache); 2325 if (ret) 2326 goto error; 2327 } 2328 2329 ret = btrfs_add_block_group_cache(info, cache); 2330 if (ret) { 2331 btrfs_remove_free_space_cache(cache); 2332 goto error; 2333 } 2334 trace_btrfs_add_block_group(info, cache, 0); 2335 btrfs_add_bg_to_space_info(info, cache); 2336 2337 set_avail_alloc_bits(info, cache->flags); 2338 if (btrfs_chunk_writeable(info, cache->start)) { 2339 if (cache->used == 0) { 2340 ASSERT(list_empty(&cache->bg_list)); 2341 if (btrfs_test_opt(info, DISCARD_ASYNC)) 2342 btrfs_discard_queue_work(&info->discard_ctl, cache); 2343 else 2344 btrfs_mark_bg_unused(cache); 2345 } 2346 } else { 2347 inc_block_group_ro(cache, 1); 2348 } 2349 2350 return 0; 2351 error: 2352 btrfs_put_block_group(cache); 2353 return ret; 2354 } 2355 2356 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) 2357 { 2358 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 2359 struct rb_node *node; 2360 int ret = 0; 2361 2362 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 2363 struct extent_map *em; 2364 struct map_lookup *map; 2365 struct btrfs_block_group *bg; 2366 2367 em = rb_entry(node, struct extent_map, rb_node); 2368 map = em->map_lookup; 2369 bg = btrfs_create_block_group_cache(fs_info, em->start); 2370 if (!bg) { 2371 ret = -ENOMEM; 2372 break; 2373 } 2374 2375 /* Fill dummy cache as FULL */ 2376 bg->length = em->len; 2377 bg->flags = map->type; 2378 bg->cached = BTRFS_CACHE_FINISHED; 2379 bg->used = em->len; 2380 bg->flags = map->type; 2381 ret = btrfs_add_block_group_cache(fs_info, bg); 2382 /* 2383 * We may have some valid block group cache added already, in 2384 * that case we skip to the next one. 2385 */ 2386 if (ret == -EEXIST) { 2387 ret = 0; 2388 btrfs_put_block_group(bg); 2389 continue; 2390 } 2391 2392 if (ret) { 2393 btrfs_remove_free_space_cache(bg); 2394 btrfs_put_block_group(bg); 2395 break; 2396 } 2397 2398 btrfs_add_bg_to_space_info(fs_info, bg); 2399 2400 set_avail_alloc_bits(fs_info, bg->flags); 2401 } 2402 if (!ret) 2403 btrfs_init_global_block_rsv(fs_info); 2404 return ret; 2405 } 2406 2407 int btrfs_read_block_groups(struct btrfs_fs_info *info) 2408 { 2409 struct btrfs_root *root = btrfs_block_group_root(info); 2410 struct btrfs_path *path; 2411 int ret; 2412 struct btrfs_block_group *cache; 2413 struct btrfs_space_info *space_info; 2414 struct btrfs_key key; 2415 int need_clear = 0; 2416 u64 cache_gen; 2417 2418 /* 2419 * Either no extent root (with ibadroots rescue option) or we have 2420 * unsupported RO options. 
The fs can never be mounted read-write, so no 2421 * need to waste time searching block group items. 2422 * 2423 * This also allows new extent tree related changes to be RO compat, 2424 * no need for a full incompat flag. 2425 */ 2426 if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & 2427 ~BTRFS_FEATURE_COMPAT_RO_SUPP)) 2428 return fill_dummy_bgs(info); 2429 2430 key.objectid = 0; 2431 key.offset = 0; 2432 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2433 path = btrfs_alloc_path(); 2434 if (!path) 2435 return -ENOMEM; 2436 2437 cache_gen = btrfs_super_cache_generation(info->super_copy); 2438 if (btrfs_test_opt(info, SPACE_CACHE) && 2439 btrfs_super_generation(info->super_copy) != cache_gen) 2440 need_clear = 1; 2441 if (btrfs_test_opt(info, CLEAR_CACHE)) 2442 need_clear = 1; 2443 2444 while (1) { 2445 struct btrfs_block_group_item bgi; 2446 struct extent_buffer *leaf; 2447 int slot; 2448 2449 ret = find_first_block_group(info, path, &key); 2450 if (ret > 0) 2451 break; 2452 if (ret != 0) 2453 goto error; 2454 2455 leaf = path->nodes[0]; 2456 slot = path->slots[0]; 2457 2458 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 2459 sizeof(bgi)); 2460 2461 btrfs_item_key_to_cpu(leaf, &key, slot); 2462 btrfs_release_path(path); 2463 ret = read_one_block_group(info, &bgi, &key, need_clear); 2464 if (ret < 0) 2465 goto error; 2466 key.objectid += key.offset; 2467 key.offset = 0; 2468 } 2469 btrfs_release_path(path); 2470 2471 list_for_each_entry(space_info, &info->space_info, list) { 2472 int i; 2473 2474 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2475 if (list_empty(&space_info->block_groups[i])) 2476 continue; 2477 cache = list_first_entry(&space_info->block_groups[i], 2478 struct btrfs_block_group, 2479 list); 2480 btrfs_sysfs_add_block_group_type(cache); 2481 } 2482 2483 if (!(btrfs_get_alloc_profile(info, space_info->flags) & 2484 (BTRFS_BLOCK_GROUP_RAID10 | 2485 BTRFS_BLOCK_GROUP_RAID1_MASK | 2486 BTRFS_BLOCK_GROUP_RAID56_MASK | 2487 BTRFS_BLOCK_GROUP_DUP))) 2488 continue; 2489 /* 2490 * Avoid allocating from un-mirrored block group if there are 2491 * mirrored block groups. 2492 */ 2493 list_for_each_entry(cache, 2494 &space_info->block_groups[BTRFS_RAID_RAID0], 2495 list) 2496 inc_block_group_ro(cache, 1); 2497 list_for_each_entry(cache, 2498 &space_info->block_groups[BTRFS_RAID_SINGLE], 2499 list) 2500 inc_block_group_ro(cache, 1); 2501 } 2502 2503 btrfs_init_global_block_rsv(info); 2504 ret = check_chunk_block_group_mappings(info); 2505 error: 2506 btrfs_free_path(path); 2507 /* 2508 * We've hit some error while reading the extent tree, and have 2509 * rescue=ibadroots mount option. 2510 * Try to fill the tree using dummy block groups so that the user can 2511 * continue to mount and grab their data. 2512 */ 2513 if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) 2514 ret = fill_dummy_bgs(info); 2515 return ret; 2516 } 2517 2518 /* 2519 * This function, insert_block_group_item(), belongs to the phase 2 of chunk 2520 * allocation. 2521 * 2522 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2523 * phases. 
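 *
 * block_group->commit_used is updated to the current used bytes before
 * the item is inserted and rolled back if the insertion fails, so a
 * later update_block_group_item() works against a consistent value.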
2524 */ 2525 static int insert_block_group_item(struct btrfs_trans_handle *trans, 2526 struct btrfs_block_group *block_group) 2527 { 2528 struct btrfs_fs_info *fs_info = trans->fs_info; 2529 struct btrfs_block_group_item bgi; 2530 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2531 struct btrfs_key key; 2532 u64 old_commit_used; 2533 int ret; 2534 2535 spin_lock(&block_group->lock); 2536 btrfs_set_stack_block_group_used(&bgi, block_group->used); 2537 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2538 block_group->global_root_id); 2539 btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 2540 old_commit_used = block_group->commit_used; 2541 block_group->commit_used = block_group->used; 2542 key.objectid = block_group->start; 2543 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2544 key.offset = block_group->length; 2545 spin_unlock(&block_group->lock); 2546 2547 ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); 2548 if (ret < 0) { 2549 spin_lock(&block_group->lock); 2550 block_group->commit_used = old_commit_used; 2551 spin_unlock(&block_group->lock); 2552 } 2553 2554 return ret; 2555 } 2556 2557 static int insert_dev_extent(struct btrfs_trans_handle *trans, 2558 struct btrfs_device *device, u64 chunk_offset, 2559 u64 start, u64 num_bytes) 2560 { 2561 struct btrfs_fs_info *fs_info = device->fs_info; 2562 struct btrfs_root *root = fs_info->dev_root; 2563 struct btrfs_path *path; 2564 struct btrfs_dev_extent *extent; 2565 struct extent_buffer *leaf; 2566 struct btrfs_key key; 2567 int ret; 2568 2569 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); 2570 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 2571 path = btrfs_alloc_path(); 2572 if (!path) 2573 return -ENOMEM; 2574 2575 key.objectid = device->devid; 2576 key.type = BTRFS_DEV_EXTENT_KEY; 2577 key.offset = start; 2578 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); 2579 if (ret) 2580 goto out; 2581 2582 leaf = path->nodes[0]; 2583 extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 2584 btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); 2585 btrfs_set_dev_extent_chunk_objectid(leaf, extent, 2586 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2587 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); 2588 2589 btrfs_set_dev_extent_length(leaf, extent, num_bytes); 2590 btrfs_mark_buffer_dirty(leaf); 2591 out: 2592 btrfs_free_path(path); 2593 return ret; 2594 } 2595 2596 /* 2597 * This function belongs to phase 2. 2598 * 2599 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2600 * phases. 
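 *
 * One device extent item is inserted per stripe of the chunk's mapping,
 * all while holding the device_list_mutex (see the comment inside the
 * function for why that lock is required).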
2601 */ 2602 static int insert_dev_extents(struct btrfs_trans_handle *trans, 2603 u64 chunk_offset, u64 chunk_size) 2604 { 2605 struct btrfs_fs_info *fs_info = trans->fs_info; 2606 struct btrfs_device *device; 2607 struct extent_map *em; 2608 struct map_lookup *map; 2609 u64 dev_offset; 2610 u64 stripe_size; 2611 int i; 2612 int ret = 0; 2613 2614 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); 2615 if (IS_ERR(em)) 2616 return PTR_ERR(em); 2617 2618 map = em->map_lookup; 2619 stripe_size = em->orig_block_len; 2620 2621 /* 2622 * Take the device list mutex to prevent races with the final phase of 2623 * a device replace operation that replaces the device object associated 2624 * with the map's stripes, because the device object's id can change 2625 * at any time during that final phase of the device replace operation 2626 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 2627 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 2628 * resulting in persisting a device extent item with such ID. 2629 */ 2630 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2631 for (i = 0; i < map->num_stripes; i++) { 2632 device = map->stripes[i].dev; 2633 dev_offset = map->stripes[i].physical; 2634 2635 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, 2636 stripe_size); 2637 if (ret) 2638 break; 2639 } 2640 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2641 2642 free_extent_map(em); 2643 return ret; 2644 } 2645 2646 /* 2647 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of 2648 * chunk allocation. 2649 * 2650 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 2651 * phases. 2652 */ 2653 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 2654 { 2655 struct btrfs_fs_info *fs_info = trans->fs_info; 2656 struct btrfs_block_group *block_group; 2657 int ret = 0; 2658 2659 while (!list_empty(&trans->new_bgs)) { 2660 int index; 2661 2662 block_group = list_first_entry(&trans->new_bgs, 2663 struct btrfs_block_group, 2664 bg_list); 2665 if (ret) 2666 goto next; 2667 2668 index = btrfs_bg_flags_to_raid_index(block_group->flags); 2669 2670 ret = insert_block_group_item(trans, block_group); 2671 if (ret) 2672 btrfs_abort_transaction(trans, ret); 2673 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, 2674 &block_group->runtime_flags)) { 2675 mutex_lock(&fs_info->chunk_mutex); 2676 ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); 2677 mutex_unlock(&fs_info->chunk_mutex); 2678 if (ret) 2679 btrfs_abort_transaction(trans, ret); 2680 } 2681 ret = insert_dev_extents(trans, block_group->start, 2682 block_group->length); 2683 if (ret) 2684 btrfs_abort_transaction(trans, ret); 2685 add_block_group_free_space(trans, block_group); 2686 2687 /* 2688 * If we restriped during balance, we may have added a new raid 2689 * type, so now add the sysfs entries when it is safe to do so. 2690 * We don't have to worry about locking here as it's handled in 2691 * btrfs_sysfs_add_block_group_type. 2692 */ 2693 if (block_group->space_info->block_group_kobjs[index] == NULL) 2694 btrfs_sysfs_add_block_group_type(block_group); 2695 2696 /* Already aborted the transaction if it failed. 
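 * The delayed refs rsv unit reserved when the block group was created
 * (see btrfs_make_block_group()) is released under the next: label
 * below for every block group, whether or not the insertions succeeded.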
*/ 2697 next: 2698 btrfs_delayed_refs_rsv_release(fs_info, 1); 2699 list_del_init(&block_group->bg_list); 2700 clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); 2701 } 2702 btrfs_trans_release_chunk_metadata(trans); 2703 } 2704 2705 /* 2706 * For extent tree v2 we use the block_group_item->chunk_offset to point at our 2707 * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. 2708 */ 2709 static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) 2710 { 2711 u64 div = SZ_1G; 2712 u64 index; 2713 2714 if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 2715 return BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2716 2717 /* If we have a smaller fs index based on 128MiB. */ 2718 if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) 2719 div = SZ_128M; 2720 2721 offset = div64_u64(offset, div); 2722 div64_u64_rem(offset, fs_info->nr_global_roots, &index); 2723 return index; 2724 } 2725 2726 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, 2727 u64 type, 2728 u64 chunk_offset, u64 size) 2729 { 2730 struct btrfs_fs_info *fs_info = trans->fs_info; 2731 struct btrfs_block_group *cache; 2732 int ret; 2733 2734 btrfs_set_log_full_commit(trans); 2735 2736 cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 2737 if (!cache) 2738 return ERR_PTR(-ENOMEM); 2739 2740 /* 2741 * Mark it as new before adding it to the rbtree of block groups or any 2742 * list, so that no other task finds it and calls btrfs_mark_bg_unused() 2743 * before the new flag is set. 2744 */ 2745 set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); 2746 2747 cache->length = size; 2748 set_free_space_tree_thresholds(cache); 2749 cache->flags = type; 2750 cache->cached = BTRFS_CACHE_FINISHED; 2751 cache->global_root_id = calculate_global_root_id(fs_info, cache->start); 2752 2753 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 2754 set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags); 2755 2756 ret = btrfs_load_block_group_zone_info(cache, true); 2757 if (ret) { 2758 btrfs_put_block_group(cache); 2759 return ERR_PTR(ret); 2760 } 2761 2762 ret = exclude_super_stripes(cache); 2763 if (ret) { 2764 /* We may have excluded something, so call this just in case */ 2765 btrfs_free_excluded_extents(cache); 2766 btrfs_put_block_group(cache); 2767 return ERR_PTR(ret); 2768 } 2769 2770 ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); 2771 btrfs_free_excluded_extents(cache); 2772 if (ret) { 2773 btrfs_put_block_group(cache); 2774 return ERR_PTR(ret); 2775 } 2776 2777 /* 2778 * Ensure the corresponding space_info object is created and 2779 * assigned to our block group. We want our bg to be added to the rbtree 2780 * with its ->space_info set. 2781 */ 2782 cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 2783 ASSERT(cache->space_info); 2784 2785 ret = btrfs_add_block_group_cache(fs_info, cache); 2786 if (ret) { 2787 btrfs_remove_free_space_cache(cache); 2788 btrfs_put_block_group(cache); 2789 return ERR_PTR(ret); 2790 } 2791 2792 /* 2793 * Now that our block group has its ->space_info set and is inserted in 2794 * the rbtree, update the space info's counters. 
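 * The global block reserve is then resized to account for the newly
 * added capacity.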
2795 */
2796 trace_btrfs_add_block_group(fs_info, cache, 1);
2797 btrfs_add_bg_to_space_info(fs_info, cache);
2798 btrfs_update_global_block_rsv(fs_info);
2799
2800 #ifdef CONFIG_BTRFS_DEBUG
2801 if (btrfs_should_fragment_free_space(cache)) {
2802 cache->space_info->bytes_used += size >> 1;
2803 fragment_free_space(cache);
2804 }
2805 #endif
2806
2807 list_add_tail(&cache->bg_list, &trans->new_bgs);
2808 trans->delayed_ref_updates++;
2809 btrfs_update_delayed_refs_rsv(trans);
2810
2811 set_avail_alloc_bits(fs_info, type);
2812 return cache;
2813 }
2814
2815 /*
2816 * Mark one block group RO, can be called several times for the same block
2817 * group.
2818 *
2819 * @cache: the destination block group
2820 * @do_chunk_alloc: whether we need to do chunk pre-allocation, this is to
2821 * ensure we still have some free space after marking this
2822 * block group RO.
2823 */
2824 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2825 bool do_chunk_alloc)
2826 {
2827 struct btrfs_fs_info *fs_info = cache->fs_info;
2828 struct btrfs_trans_handle *trans;
2829 struct btrfs_root *root = btrfs_block_group_root(fs_info);
2830 u64 alloc_flags;
2831 int ret;
2832 bool dirty_bg_running;
2833
2834 /*
2835 * This can only happen when we are doing read-only scrub on read-only
2836 * mount.
2837 * In that case we should not start a new transaction on read-only fs.
2838 * Thus here we skip all chunk allocations.
2839 */
2840 if (sb_rdonly(fs_info->sb)) {
2841 mutex_lock(&fs_info->ro_block_group_mutex);
2842 ret = inc_block_group_ro(cache, 0);
2843 mutex_unlock(&fs_info->ro_block_group_mutex);
2844 return ret;
2845 }
2846
2847 do {
2848 trans = btrfs_join_transaction(root);
2849 if (IS_ERR(trans))
2850 return PTR_ERR(trans);
2851
2852 dirty_bg_running = false;
2853
2854 /*
2855 * We're not allowed to set block groups readonly after the dirty
2856 * block group cache has started writing. If it already started,
2857 * back off and let this transaction commit.
2858 */
2859 mutex_lock(&fs_info->ro_block_group_mutex);
2860 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2861 u64 transid = trans->transid;
2862
2863 mutex_unlock(&fs_info->ro_block_group_mutex);
2864 btrfs_end_transaction(trans);
2865
2866 ret = btrfs_wait_for_commit(fs_info, transid);
2867 if (ret)
2868 return ret;
2869 dirty_bg_running = true;
2870 }
2871 } while (dirty_bg_running);
2872
2873 if (do_chunk_alloc) {
2874 /*
2875 * If we are changing raid levels, try to allocate a
2876 * corresponding block group with the new raid level.
2877 */
2878 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2879 if (alloc_flags != cache->flags) {
2880 ret = btrfs_chunk_alloc(trans, alloc_flags,
2881 CHUNK_ALLOC_FORCE);
2882 /*
2883 * ENOSPC is allowed here, we may have enough space
2884 * already allocated at the new raid level to carry on
2885 */
2886 if (ret == -ENOSPC)
2887 ret = 0;
2888 if (ret < 0)
2889 goto out;
2890 }
2891 }
2892
2893 ret = inc_block_group_ro(cache, 0);
2894 if (!ret)
2895 goto out;
2896 if (ret == -ETXTBSY)
2897 goto unlock_out;
2898
2899 /*
2900 * Skip chunk allocation if the bg is SYSTEM, this is to avoid a system
2901 * chunk allocation storm exhausting the system chunk array. Otherwise
2902 * we still want to try our best to mark the block group read-only.
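 *
 * For data and metadata block groups a new chunk is allocated below
 * and, on zoned filesystems, activated before inc_block_group_ro() is
 * retried.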
2903 */ 2904 if (!do_chunk_alloc && ret == -ENOSPC && 2905 (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) 2906 goto unlock_out; 2907 2908 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); 2909 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 2910 if (ret < 0) 2911 goto out; 2912 /* 2913 * We have allocated a new chunk. We also need to activate that chunk to 2914 * grant metadata tickets for zoned filesystem. 2915 */ 2916 ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); 2917 if (ret < 0) 2918 goto out; 2919 2920 ret = inc_block_group_ro(cache, 0); 2921 if (ret == -ETXTBSY) 2922 goto unlock_out; 2923 out: 2924 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 2925 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); 2926 mutex_lock(&fs_info->chunk_mutex); 2927 check_system_chunk(trans, alloc_flags); 2928 mutex_unlock(&fs_info->chunk_mutex); 2929 } 2930 unlock_out: 2931 mutex_unlock(&fs_info->ro_block_group_mutex); 2932 2933 btrfs_end_transaction(trans); 2934 return ret; 2935 } 2936 2937 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 2938 { 2939 struct btrfs_space_info *sinfo = cache->space_info; 2940 u64 num_bytes; 2941 2942 BUG_ON(!cache->ro); 2943 2944 spin_lock(&sinfo->lock); 2945 spin_lock(&cache->lock); 2946 if (!--cache->ro) { 2947 if (btrfs_is_zoned(cache->fs_info)) { 2948 /* Migrate zone_unusable bytes back */ 2949 cache->zone_unusable = 2950 (cache->alloc_offset - cache->used) + 2951 (cache->length - cache->zone_capacity); 2952 sinfo->bytes_zone_unusable += cache->zone_unusable; 2953 sinfo->bytes_readonly -= cache->zone_unusable; 2954 } 2955 num_bytes = cache->length - cache->reserved - 2956 cache->pinned - cache->bytes_super - 2957 cache->zone_unusable - cache->used; 2958 sinfo->bytes_readonly -= num_bytes; 2959 list_del_init(&cache->ro_list); 2960 } 2961 spin_unlock(&cache->lock); 2962 spin_unlock(&sinfo->lock); 2963 } 2964 2965 static int update_block_group_item(struct btrfs_trans_handle *trans, 2966 struct btrfs_path *path, 2967 struct btrfs_block_group *cache) 2968 { 2969 struct btrfs_fs_info *fs_info = trans->fs_info; 2970 int ret; 2971 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2972 unsigned long bi; 2973 struct extent_buffer *leaf; 2974 struct btrfs_block_group_item bgi; 2975 struct btrfs_key key; 2976 u64 old_commit_used; 2977 u64 used; 2978 2979 /* 2980 * Block group items update can be triggered out of commit transaction 2981 * critical section, thus we need a consistent view of used bytes. 2982 * We cannot use cache->used directly outside of the spin lock, as it 2983 * may be changed. 2984 */ 2985 spin_lock(&cache->lock); 2986 old_commit_used = cache->commit_used; 2987 used = cache->used; 2988 /* No change in used bytes, can safely skip it. 
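 * (cache->commit_used tracks the value that was last written to the
 * block group item, see insert_block_group_item() and the rollback at
 * the end of this function.)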
*/ 2989 if (cache->commit_used == used) { 2990 spin_unlock(&cache->lock); 2991 return 0; 2992 } 2993 cache->commit_used = used; 2994 spin_unlock(&cache->lock); 2995 2996 key.objectid = cache->start; 2997 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2998 key.offset = cache->length; 2999 3000 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 3001 if (ret) { 3002 if (ret > 0) 3003 ret = -ENOENT; 3004 goto fail; 3005 } 3006 3007 leaf = path->nodes[0]; 3008 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3009 btrfs_set_stack_block_group_used(&bgi, used); 3010 btrfs_set_stack_block_group_chunk_objectid(&bgi, 3011 cache->global_root_id); 3012 btrfs_set_stack_block_group_flags(&bgi, cache->flags); 3013 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 3014 btrfs_mark_buffer_dirty(leaf); 3015 fail: 3016 btrfs_release_path(path); 3017 /* We didn't update the block group item, need to revert @commit_used. */ 3018 if (ret < 0) { 3019 spin_lock(&cache->lock); 3020 cache->commit_used = old_commit_used; 3021 spin_unlock(&cache->lock); 3022 } 3023 return ret; 3024 3025 } 3026 3027 static int cache_save_setup(struct btrfs_block_group *block_group, 3028 struct btrfs_trans_handle *trans, 3029 struct btrfs_path *path) 3030 { 3031 struct btrfs_fs_info *fs_info = block_group->fs_info; 3032 struct btrfs_root *root = fs_info->tree_root; 3033 struct inode *inode = NULL; 3034 struct extent_changeset *data_reserved = NULL; 3035 u64 alloc_hint = 0; 3036 int dcs = BTRFS_DC_ERROR; 3037 u64 cache_size = 0; 3038 int retries = 0; 3039 int ret = 0; 3040 3041 if (!btrfs_test_opt(fs_info, SPACE_CACHE)) 3042 return 0; 3043 3044 /* 3045 * If this block group is smaller than 100 megs don't bother caching the 3046 * block group. 3047 */ 3048 if (block_group->length < (100 * SZ_1M)) { 3049 spin_lock(&block_group->lock); 3050 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3051 spin_unlock(&block_group->lock); 3052 return 0; 3053 } 3054 3055 if (TRANS_ABORTED(trans)) 3056 return 0; 3057 again: 3058 inode = lookup_free_space_inode(block_group, path); 3059 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3060 ret = PTR_ERR(inode); 3061 btrfs_release_path(path); 3062 goto out; 3063 } 3064 3065 if (IS_ERR(inode)) { 3066 BUG_ON(retries); 3067 retries++; 3068 3069 if (block_group->ro) 3070 goto out_free; 3071 3072 ret = create_free_space_inode(trans, block_group, path); 3073 if (ret) 3074 goto out_free; 3075 goto again; 3076 } 3077 3078 /* 3079 * We want to set the generation to 0, that way if anything goes wrong 3080 * from here on out we know not to trust this cache when we load up next 3081 * time. 3082 */ 3083 BTRFS_I(inode)->generation = 0; 3084 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 3085 if (ret) { 3086 /* 3087 * So theoretically we could recover from this, simply set the 3088 * super cache generation to 0 so we know to invalidate the 3089 * cache, but then we'd have to keep track of the block groups 3090 * that fail this way so we know we _have_ to reset this cache 3091 * before the next commit or risk reading stale cache. So to 3092 * limit our exposure to horrible edge cases lets just abort the 3093 * transaction, this only happens in really bad situations 3094 * anyway. 
3095 */ 3096 btrfs_abort_transaction(trans, ret); 3097 goto out_put; 3098 } 3099 WARN_ON(ret); 3100 3101 /* We've already setup this transaction, go ahead and exit */ 3102 if (block_group->cache_generation == trans->transid && 3103 i_size_read(inode)) { 3104 dcs = BTRFS_DC_SETUP; 3105 goto out_put; 3106 } 3107 3108 if (i_size_read(inode) > 0) { 3109 ret = btrfs_check_trunc_cache_free_space(fs_info, 3110 &fs_info->global_block_rsv); 3111 if (ret) 3112 goto out_put; 3113 3114 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3115 if (ret) 3116 goto out_put; 3117 } 3118 3119 spin_lock(&block_group->lock); 3120 if (block_group->cached != BTRFS_CACHE_FINISHED || 3121 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3122 /* 3123 * don't bother trying to write stuff out _if_ 3124 * a) we're not cached, 3125 * b) we're with nospace_cache mount option, 3126 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3127 */ 3128 dcs = BTRFS_DC_WRITTEN; 3129 spin_unlock(&block_group->lock); 3130 goto out_put; 3131 } 3132 spin_unlock(&block_group->lock); 3133 3134 /* 3135 * We hit an ENOSPC when setting up the cache in this transaction, just 3136 * skip doing the setup, we've already cleared the cache so we're safe. 3137 */ 3138 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3139 ret = -ENOSPC; 3140 goto out_put; 3141 } 3142 3143 /* 3144 * Try to preallocate enough space based on how big the block group is. 3145 * Keep in mind this has to include any pinned space which could end up 3146 * taking up quite a bit since it's not folded into the other space 3147 * cache. 3148 */ 3149 cache_size = div_u64(block_group->length, SZ_256M); 3150 if (!cache_size) 3151 cache_size = 1; 3152 3153 cache_size *= 16; 3154 cache_size *= fs_info->sectorsize; 3155 3156 ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, 3157 cache_size, false); 3158 if (ret) 3159 goto out_put; 3160 3161 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, 3162 cache_size, cache_size, 3163 &alloc_hint); 3164 /* 3165 * Our cache requires contiguous chunks so that we don't modify a bunch 3166 * of metadata or split extents when writing the cache out, which means 3167 * we can enospc if we are heavily fragmented in addition to just normal 3168 * out of space conditions. So if we hit this just skip setting up any 3169 * other block groups for this transaction, maybe we'll unpin enough 3170 * space the next time around. 
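 *
 * The BTRFS_TRANS_CACHE_ENOSPC bit set below is what makes the check
 * earlier in this function skip cache setup for the remaining block
 * groups of this transaction.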
3171 */ 3172 if (!ret) 3173 dcs = BTRFS_DC_SETUP; 3174 else if (ret == -ENOSPC) 3175 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3176 3177 out_put: 3178 iput(inode); 3179 out_free: 3180 btrfs_release_path(path); 3181 out: 3182 spin_lock(&block_group->lock); 3183 if (!ret && dcs == BTRFS_DC_SETUP) 3184 block_group->cache_generation = trans->transid; 3185 block_group->disk_cache_state = dcs; 3186 spin_unlock(&block_group->lock); 3187 3188 extent_changeset_free(data_reserved); 3189 return ret; 3190 } 3191 3192 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 3193 { 3194 struct btrfs_fs_info *fs_info = trans->fs_info; 3195 struct btrfs_block_group *cache, *tmp; 3196 struct btrfs_transaction *cur_trans = trans->transaction; 3197 struct btrfs_path *path; 3198 3199 if (list_empty(&cur_trans->dirty_bgs) || 3200 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3201 return 0; 3202 3203 path = btrfs_alloc_path(); 3204 if (!path) 3205 return -ENOMEM; 3206 3207 /* Could add new block groups, use _safe just in case */ 3208 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3209 dirty_list) { 3210 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3211 cache_save_setup(cache, trans, path); 3212 } 3213 3214 btrfs_free_path(path); 3215 return 0; 3216 } 3217 3218 /* 3219 * Transaction commit does final block group cache writeback during a critical 3220 * section where nothing is allowed to change the FS. This is required in 3221 * order for the cache to actually match the block group, but can introduce a 3222 * lot of latency into the commit. 3223 * 3224 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 3225 * There's a chance we'll have to redo some of it if the block group changes 3226 * again during the commit, but it greatly reduces the commit latency by 3227 * getting rid of the easy block groups while we're still allowing others to 3228 * join the commit. 3229 */ 3230 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 3231 { 3232 struct btrfs_fs_info *fs_info = trans->fs_info; 3233 struct btrfs_block_group *cache; 3234 struct btrfs_transaction *cur_trans = trans->transaction; 3235 int ret = 0; 3236 int should_put; 3237 struct btrfs_path *path = NULL; 3238 LIST_HEAD(dirty); 3239 struct list_head *io = &cur_trans->io_bgs; 3240 int loops = 0; 3241 3242 spin_lock(&cur_trans->dirty_bgs_lock); 3243 if (list_empty(&cur_trans->dirty_bgs)) { 3244 spin_unlock(&cur_trans->dirty_bgs_lock); 3245 return 0; 3246 } 3247 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3248 spin_unlock(&cur_trans->dirty_bgs_lock); 3249 3250 again: 3251 /* Make sure all the block groups on our dirty list actually exist */ 3252 btrfs_create_pending_block_groups(trans); 3253 3254 if (!path) { 3255 path = btrfs_alloc_path(); 3256 if (!path) { 3257 ret = -ENOMEM; 3258 goto out; 3259 } 3260 } 3261 3262 /* 3263 * cache_write_mutex is here only to save us from balance or automatic 3264 * removal of empty block groups deleting this block group while we are 3265 * writing out the cache 3266 */ 3267 mutex_lock(&trans->transaction->cache_write_mutex); 3268 while (!list_empty(&dirty)) { 3269 bool drop_reserve = true; 3270 3271 cache = list_first_entry(&dirty, struct btrfs_block_group, 3272 dirty_list); 3273 /* 3274 * This can happen if something re-dirties a block group that 3275 * is already under IO. 
Just wait for it to finish and then do 3276 * it all again 3277 */ 3278 if (!list_empty(&cache->io_list)) { 3279 list_del_init(&cache->io_list); 3280 btrfs_wait_cache_io(trans, cache, path); 3281 btrfs_put_block_group(cache); 3282 } 3283 3284 3285 /* 3286 * btrfs_wait_cache_io uses the cache->dirty_list to decide if 3287 * it should update the cache_state. Don't delete until after 3288 * we wait. 3289 * 3290 * Since we're not running in the commit critical section 3291 * we need the dirty_bgs_lock to protect from update_block_group 3292 */ 3293 spin_lock(&cur_trans->dirty_bgs_lock); 3294 list_del_init(&cache->dirty_list); 3295 spin_unlock(&cur_trans->dirty_bgs_lock); 3296 3297 should_put = 1; 3298 3299 cache_save_setup(cache, trans, path); 3300 3301 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3302 cache->io_ctl.inode = NULL; 3303 ret = btrfs_write_out_cache(trans, cache, path); 3304 if (ret == 0 && cache->io_ctl.inode) { 3305 should_put = 0; 3306 3307 /* 3308 * The cache_write_mutex is protecting the 3309 * io_list, also refer to the definition of 3310 * btrfs_transaction::io_bgs for more details 3311 */ 3312 list_add_tail(&cache->io_list, io); 3313 } else { 3314 /* 3315 * If we failed to write the cache, the 3316 * generation will be bad and life goes on 3317 */ 3318 ret = 0; 3319 } 3320 } 3321 if (!ret) { 3322 ret = update_block_group_item(trans, path, cache); 3323 /* 3324 * Our block group might still be attached to the list 3325 * of new block groups in the transaction handle of some 3326 * other task (struct btrfs_trans_handle->new_bgs). This 3327 * means its block group item isn't yet in the extent 3328 * tree. If this happens ignore the error, as we will 3329 * try again later in the critical section of the 3330 * transaction commit. 3331 */ 3332 if (ret == -ENOENT) { 3333 ret = 0; 3334 spin_lock(&cur_trans->dirty_bgs_lock); 3335 if (list_empty(&cache->dirty_list)) { 3336 list_add_tail(&cache->dirty_list, 3337 &cur_trans->dirty_bgs); 3338 btrfs_get_block_group(cache); 3339 drop_reserve = false; 3340 } 3341 spin_unlock(&cur_trans->dirty_bgs_lock); 3342 } else if (ret) { 3343 btrfs_abort_transaction(trans, ret); 3344 } 3345 } 3346 3347 /* If it's not on the io list, we need to put the block group */ 3348 if (should_put) 3349 btrfs_put_block_group(cache); 3350 if (drop_reserve) 3351 btrfs_delayed_refs_rsv_release(fs_info, 1); 3352 /* 3353 * Avoid blocking other tasks for too long. It might even save 3354 * us from writing caches for block groups that are going to be 3355 * removed. 3356 */ 3357 mutex_unlock(&trans->transaction->cache_write_mutex); 3358 if (ret) 3359 goto out; 3360 mutex_lock(&trans->transaction->cache_write_mutex); 3361 } 3362 mutex_unlock(&trans->transaction->cache_write_mutex); 3363 3364 /* 3365 * Go through delayed refs for all the stuff we've just kicked off 3366 * and then loop back (just once) 3367 */ 3368 if (!ret) 3369 ret = btrfs_run_delayed_refs(trans, 0); 3370 if (!ret && loops == 0) { 3371 loops++; 3372 spin_lock(&cur_trans->dirty_bgs_lock); 3373 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3374 /* 3375 * dirty_bgs_lock protects us from concurrent block group 3376 * deletes too (not just cache_write_mutex). 
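 *
 * If more block groups were dirtied while the lock was dropped, loop
 * back for one more pass; the loops counter above caps this at a
 * single retry.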
3377 */ 3378 if (!list_empty(&dirty)) { 3379 spin_unlock(&cur_trans->dirty_bgs_lock); 3380 goto again; 3381 } 3382 spin_unlock(&cur_trans->dirty_bgs_lock); 3383 } 3384 out: 3385 if (ret < 0) { 3386 spin_lock(&cur_trans->dirty_bgs_lock); 3387 list_splice_init(&dirty, &cur_trans->dirty_bgs); 3388 spin_unlock(&cur_trans->dirty_bgs_lock); 3389 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3390 } 3391 3392 btrfs_free_path(path); 3393 return ret; 3394 } 3395 3396 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 3397 { 3398 struct btrfs_fs_info *fs_info = trans->fs_info; 3399 struct btrfs_block_group *cache; 3400 struct btrfs_transaction *cur_trans = trans->transaction; 3401 int ret = 0; 3402 int should_put; 3403 struct btrfs_path *path; 3404 struct list_head *io = &cur_trans->io_bgs; 3405 3406 path = btrfs_alloc_path(); 3407 if (!path) 3408 return -ENOMEM; 3409 3410 /* 3411 * Even though we are in the critical section of the transaction commit, 3412 * we can still have concurrent tasks adding elements to this 3413 * transaction's list of dirty block groups. These tasks correspond to 3414 * endio free space workers started when writeback finishes for a 3415 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3416 * allocate new block groups as a result of COWing nodes of the root 3417 * tree when updating the free space inode. The writeback for the space 3418 * caches is triggered by an earlier call to 3419 * btrfs_start_dirty_block_groups() and iterations of the following 3420 * loop. 3421 * Also we want to do the cache_save_setup first and then run the 3422 * delayed refs to make sure we have the best chance at doing this all 3423 * in one shot. 3424 */ 3425 spin_lock(&cur_trans->dirty_bgs_lock); 3426 while (!list_empty(&cur_trans->dirty_bgs)) { 3427 cache = list_first_entry(&cur_trans->dirty_bgs, 3428 struct btrfs_block_group, 3429 dirty_list); 3430 3431 /* 3432 * This can happen if cache_save_setup re-dirties a block group 3433 * that is already under IO. 
Just wait for it to finish and 3434 * then do it all again 3435 */ 3436 if (!list_empty(&cache->io_list)) { 3437 spin_unlock(&cur_trans->dirty_bgs_lock); 3438 list_del_init(&cache->io_list); 3439 btrfs_wait_cache_io(trans, cache, path); 3440 btrfs_put_block_group(cache); 3441 spin_lock(&cur_trans->dirty_bgs_lock); 3442 } 3443 3444 /* 3445 * Don't remove from the dirty list until after we've waited on 3446 * any pending IO 3447 */ 3448 list_del_init(&cache->dirty_list); 3449 spin_unlock(&cur_trans->dirty_bgs_lock); 3450 should_put = 1; 3451 3452 cache_save_setup(cache, trans, path); 3453 3454 if (!ret) 3455 ret = btrfs_run_delayed_refs(trans, 3456 (unsigned long) -1); 3457 3458 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3459 cache->io_ctl.inode = NULL; 3460 ret = btrfs_write_out_cache(trans, cache, path); 3461 if (ret == 0 && cache->io_ctl.inode) { 3462 should_put = 0; 3463 list_add_tail(&cache->io_list, io); 3464 } else { 3465 /* 3466 * If we failed to write the cache, the 3467 * generation will be bad and life goes on 3468 */ 3469 ret = 0; 3470 } 3471 } 3472 if (!ret) { 3473 ret = update_block_group_item(trans, path, cache); 3474 /* 3475 * One of the free space endio workers might have 3476 * created a new block group while updating a free space 3477 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3478 * and hasn't released its transaction handle yet, in 3479 * which case the new block group is still attached to 3480 * its transaction handle and its creation has not 3481 * finished yet (no block group item in the extent tree 3482 * yet, etc). If this is the case, wait for all free 3483 * space endio workers to finish and retry. This is a 3484 * very rare case so no need for a more efficient and 3485 * complex approach. 3486 */ 3487 if (ret == -ENOENT) { 3488 wait_event(cur_trans->writer_wait, 3489 atomic_read(&cur_trans->num_writers) == 1); 3490 ret = update_block_group_item(trans, path, cache); 3491 } 3492 if (ret) 3493 btrfs_abort_transaction(trans, ret); 3494 } 3495 3496 /* If its not on the io list, we need to put the block group */ 3497 if (should_put) 3498 btrfs_put_block_group(cache); 3499 btrfs_delayed_refs_rsv_release(fs_info, 1); 3500 spin_lock(&cur_trans->dirty_bgs_lock); 3501 } 3502 spin_unlock(&cur_trans->dirty_bgs_lock); 3503 3504 /* 3505 * Refer to the definition of io_bgs member for details why it's safe 3506 * to use it without any locking 3507 */ 3508 while (!list_empty(io)) { 3509 cache = list_first_entry(io, struct btrfs_block_group, 3510 io_list); 3511 list_del_init(&cache->io_list); 3512 btrfs_wait_cache_io(trans, cache, path); 3513 btrfs_put_block_group(cache); 3514 } 3515 3516 btrfs_free_path(path); 3517 return ret; 3518 } 3519 3520 int btrfs_update_block_group(struct btrfs_trans_handle *trans, 3521 u64 bytenr, u64 num_bytes, bool alloc) 3522 { 3523 struct btrfs_fs_info *info = trans->fs_info; 3524 struct btrfs_block_group *cache = NULL; 3525 u64 total = num_bytes; 3526 u64 old_val; 3527 u64 byte_in_group; 3528 int factor; 3529 int ret = 0; 3530 3531 /* Block accounting for super block */ 3532 spin_lock(&info->delalloc_root_lock); 3533 old_val = btrfs_super_bytes_used(info->super_copy); 3534 if (alloc) 3535 old_val += num_bytes; 3536 else 3537 old_val -= num_bytes; 3538 btrfs_set_super_bytes_used(info->super_copy, old_val); 3539 spin_unlock(&info->delalloc_root_lock); 3540 3541 while (total) { 3542 struct btrfs_space_info *space_info; 3543 bool reclaim = false; 3544 3545 cache = btrfs_lookup_block_group(info, bytenr); 3546 if (!cache) { 3547 
ret = -ENOENT; 3548 break; 3549 } 3550 space_info = cache->space_info; 3551 factor = btrfs_bg_type_to_factor(cache->flags); 3552 3553 /* 3554 * If this block group has free space cache written out, we 3555 * need to make sure to load it if we are removing space. This 3556 * is because we need the unpinning stage to actually add the 3557 * space back to the block group, otherwise we will leak space. 3558 */ 3559 if (!alloc && !btrfs_block_group_done(cache)) 3560 btrfs_cache_block_group(cache, true); 3561 3562 byte_in_group = bytenr - cache->start; 3563 WARN_ON(byte_in_group > cache->length); 3564 3565 spin_lock(&space_info->lock); 3566 spin_lock(&cache->lock); 3567 3568 if (btrfs_test_opt(info, SPACE_CACHE) && 3569 cache->disk_cache_state < BTRFS_DC_CLEAR) 3570 cache->disk_cache_state = BTRFS_DC_CLEAR; 3571 3572 old_val = cache->used; 3573 num_bytes = min(total, cache->length - byte_in_group); 3574 if (alloc) { 3575 old_val += num_bytes; 3576 cache->used = old_val; 3577 cache->reserved -= num_bytes; 3578 space_info->bytes_reserved -= num_bytes; 3579 space_info->bytes_used += num_bytes; 3580 space_info->disk_used += num_bytes * factor; 3581 spin_unlock(&cache->lock); 3582 spin_unlock(&space_info->lock); 3583 } else { 3584 old_val -= num_bytes; 3585 cache->used = old_val; 3586 cache->pinned += num_bytes; 3587 btrfs_space_info_update_bytes_pinned(info, space_info, 3588 num_bytes); 3589 space_info->bytes_used -= num_bytes; 3590 space_info->disk_used -= num_bytes * factor; 3591 3592 reclaim = should_reclaim_block_group(cache, num_bytes); 3593 3594 spin_unlock(&cache->lock); 3595 spin_unlock(&space_info->lock); 3596 3597 set_extent_bit(&trans->transaction->pinned_extents, 3598 bytenr, bytenr + num_bytes - 1, 3599 EXTENT_DIRTY, NULL); 3600 } 3601 3602 spin_lock(&trans->transaction->dirty_bgs_lock); 3603 if (list_empty(&cache->dirty_list)) { 3604 list_add_tail(&cache->dirty_list, 3605 &trans->transaction->dirty_bgs); 3606 trans->delayed_ref_updates++; 3607 btrfs_get_block_group(cache); 3608 } 3609 spin_unlock(&trans->transaction->dirty_bgs_lock); 3610 3611 /* 3612 * No longer have used bytes in this block group, queue it for 3613 * deletion. We do this after adding the block group to the 3614 * dirty list to avoid races between cleaner kthread and space 3615 * cache writeout. 3616 */ 3617 if (!alloc && old_val == 0) { 3618 if (!btrfs_test_opt(info, DISCARD_ASYNC)) 3619 btrfs_mark_bg_unused(cache); 3620 } else if (!alloc && reclaim) { 3621 btrfs_mark_bg_to_reclaim(cache); 3622 } 3623 3624 btrfs_put_block_group(cache); 3625 total -= num_bytes; 3626 bytenr += num_bytes; 3627 } 3628 3629 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 3630 btrfs_update_delayed_refs_rsv(trans); 3631 return ret; 3632 } 3633 3634 /* 3635 * Update the block_group and space info counters. 3636 * 3637 * @cache: The cache we are manipulating 3638 * @ram_bytes: The number of bytes of file content, and will be same to 3639 * @num_bytes except for the compress path. 3640 * @num_bytes: The number of bytes in question 3641 * @delalloc: The blocks are allocated for the delalloc write 3642 * 3643 * This is called by the allocator when it reserves space. If this is a 3644 * reservation and the block group has become read only we cannot make the 3645 * reservation and return -EAGAIN, otherwise this function always succeeds. 
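*
* A minimal caller sketch (illustrative only; the local variable names are
* hypothetical and not taken from this file):
*
*	ret = btrfs_add_reserved_bytes(bg, ram_bytes, num_bytes, 0, false);
*	if (ret == -EAGAIN)
*		... the block group went read only, pick another one ...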
3646 */ 3647 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, 3648 u64 ram_bytes, u64 num_bytes, int delalloc, 3649 bool force_wrong_size_class) 3650 { 3651 struct btrfs_space_info *space_info = cache->space_info; 3652 enum btrfs_block_group_size_class size_class; 3653 int ret = 0; 3654 3655 spin_lock(&space_info->lock); 3656 spin_lock(&cache->lock); 3657 if (cache->ro) { 3658 ret = -EAGAIN; 3659 goto out; 3660 } 3661 3662 if (btrfs_block_group_should_use_size_class(cache)) { 3663 size_class = btrfs_calc_block_group_size_class(num_bytes); 3664 ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); 3665 if (ret) 3666 goto out; 3667 } 3668 cache->reserved += num_bytes; 3669 space_info->bytes_reserved += num_bytes; 3670 trace_btrfs_space_reservation(cache->fs_info, "space_info", 3671 space_info->flags, num_bytes, 1); 3672 btrfs_space_info_update_bytes_may_use(cache->fs_info, 3673 space_info, -ram_bytes); 3674 if (delalloc) 3675 cache->delalloc_bytes += num_bytes; 3676 3677 /* 3678 * Compression can use less space than we reserved, so wake tickets if 3679 * that happens. 3680 */ 3681 if (num_bytes < ram_bytes) 3682 btrfs_try_granting_tickets(cache->fs_info, space_info); 3683 out: 3684 spin_unlock(&cache->lock); 3685 spin_unlock(&space_info->lock); 3686 return ret; 3687 } 3688 3689 /* 3690 * Update the block_group and space info counters. 3691 * 3692 * @cache: The cache we are manipulating 3693 * @num_bytes: The number of bytes in question 3694 * @delalloc: The blocks are allocated for the delalloc write 3695 * 3696 * This is called by somebody who is freeing space that was never actually used 3697 * on disk. For example if you reserve some space for a new leaf in transaction 3698 * A and before transaction A commits you free that leaf, you call this with 3699 * reserve set to 0 in order to clear the reservation. 3700 */ 3701 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 3702 u64 num_bytes, int delalloc) 3703 { 3704 struct btrfs_space_info *space_info = cache->space_info; 3705 3706 spin_lock(&space_info->lock); 3707 spin_lock(&cache->lock); 3708 if (cache->ro) 3709 space_info->bytes_readonly += num_bytes; 3710 cache->reserved -= num_bytes; 3711 space_info->bytes_reserved -= num_bytes; 3712 space_info->max_extent_size = 0; 3713 3714 if (delalloc) 3715 cache->delalloc_bytes -= num_bytes; 3716 spin_unlock(&cache->lock); 3717 3718 btrfs_try_granting_tickets(cache->fs_info, space_info); 3719 spin_unlock(&space_info->lock); 3720 } 3721 3722 static void force_metadata_allocation(struct btrfs_fs_info *info) 3723 { 3724 struct list_head *head = &info->space_info; 3725 struct btrfs_space_info *found; 3726 3727 list_for_each_entry(found, head, list) { 3728 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3729 found->force_alloc = CHUNK_ALLOC_FORCE; 3730 } 3731 } 3732 3733 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 3734 struct btrfs_space_info *sinfo, int force) 3735 { 3736 u64 bytes_used = btrfs_space_info_used(sinfo, false); 3737 u64 thresh; 3738 3739 if (force == CHUNK_ALLOC_FORCE) 3740 return 1; 3741 3742 /* 3743 * in limited mode, we want to have some free space up to 3744 * about 1% of the FS size. 
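* For example, with 1 TiB of total bytes the threshold below works out to roughly 10 GiB, while the SZ_64M floor only matters for very small filesystems.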
3745 */
3746 if (force == CHUNK_ALLOC_LIMITED) {
3747 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3748 thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
3749
3750 if (sinfo->total_bytes - bytes_used < thresh)
3751 return 1;
3752 }
3753
3754 if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
3755 return 0;
3756 return 1;
3757 }
3758
3759 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3760 {
3761 u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3762
3763 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3764 }
3765
3766 static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
3767 {
3768 struct btrfs_block_group *bg;
3769 int ret;
3770
3771 /*
3772 * Check if we have enough space in the system space info because we
3773 * will need to update device items in the chunk btree and insert a new
3774 * chunk item in the chunk btree as well. This will allocate a new
3775 * system block group if needed.
3776 */
3777 check_system_chunk(trans, flags);
3778
3779 bg = btrfs_create_chunk(trans, flags);
3780 if (IS_ERR(bg)) {
3781 ret = PTR_ERR(bg);
3782 goto out;
3783 }
3784
3785 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3786 /*
3787 * Normally we are not expected to fail with -ENOSPC here, since we have
3788 * previously reserved space in the system space_info and allocated one
3789 * new system chunk if necessary. However there are three exceptions:
3790 *
3791 * 1) We may have enough free space in the system space_info but all the
3792 * existing system block groups have a profile which cannot be used
3793 * for extent allocation.
3794 *
3795 * This happens when mounting in degraded mode. For example we have a
3796 * RAID1 filesystem with 2 devices, lose one device and mount the fs
3797 * using the other device in degraded mode. If we then allocate a chunk,
3798 * we may have enough free space in the existing system space_info, but
3799 * none of the block groups can be used for extent allocation since they
3800 * have a RAID1 profile, and because we are in degraded mode with a
3801 * single device, we are forced to allocate a new system chunk with a
3802 * SINGLE profile. Making check_system_chunk() iterate over all system
3803 * block groups and check if they have a usable profile and enough space
3804 * can be slow on very large filesystems, so we tolerate the -ENOSPC and
3805 * try again after forcing allocation of a new system chunk. Like this
3806 * we avoid paying the cost of that search in normal circumstances, when
3807 * we were not mounted in degraded mode;
3808 *
3809 * 2) We had enough free space in the system space_info, and one suitable
3810 * block group to allocate from when we called check_system_chunk()
3811 * above. However right after we called it, the only system block group
3812 * with enough free space got turned into RO mode by a running scrub,
3813 * and in this case we have to allocate a new one and retry. We only
3814 * need to do this allocation and retry once, since we have a transaction
3815 * handle and scrub uses the commit root to search for block groups;
3816 *
3817 * 3) We had one system block group with enough free space when we called
3818 * check_system_chunk(), but after that, right before we tried to
3819 * allocate the last extent buffer we needed, a discard operation came
3820 * in and it temporarily removed the last free space entry from the
3821 * block group (discard removes a free space entry, discards it, and
3822 * then adds back the entry to the block group cache).
3823 */
3824 if (ret == -ENOSPC) {
3825 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3826 struct btrfs_block_group *sys_bg;
3827
3828 sys_bg = btrfs_create_chunk(trans, sys_flags);
3829 if (IS_ERR(sys_bg)) {
3830 ret = PTR_ERR(sys_bg);
3831 btrfs_abort_transaction(trans, ret);
3832 goto out;
3833 }
3834
3835 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3836 if (ret) {
3837 btrfs_abort_transaction(trans, ret);
3838 goto out;
3839 }
3840
3841 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3842 if (ret) {
3843 btrfs_abort_transaction(trans, ret);
3844 goto out;
3845 }
3846 } else if (ret) {
3847 btrfs_abort_transaction(trans, ret);
3848 goto out;
3849 }
3850 out:
3851 btrfs_trans_release_chunk_metadata(trans);
3852
3853 if (ret)
3854 return ERR_PTR(ret);
3855
3856 btrfs_get_block_group(bg);
3857 return bg;
3858 }
3859
3860 /*
3861 * Chunk allocation is done in 2 phases:
3862 *
3863 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3864 * the chunk, the chunk mapping, create its block group and add the items
3865 * that belong in the chunk btree to it - more specifically, we need to
3866 * update device items in the chunk btree and add a new chunk item to it.
3867 *
3868 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3869 * group item to the extent btree and the device extent items to the devices
3870 * btree.
3871 *
3872 * This is done to prevent deadlocks. For example when COWing a node from the
3873 * extent btree we are holding a write lock on the node's parent and if we
3874 * trigger chunk allocation and attempt to insert the new block group item
3875 * in the extent btree right away, we could deadlock because the path for the
3876 * insertion can include that parent node. At first glance it seems impossible
3877 * to trigger chunk allocation after starting a transaction since tasks should
3878 * reserve enough transaction units (metadata space); however, while that is true
3879 * most of the time, chunk allocation may still be triggered for several reasons:
3880 *
3881 * 1) When reserving metadata, we check if there is enough free space in the
3882 * metadata space_info and therefore don't trigger allocation of a new chunk.
3883 * However later when the task actually tries to COW an extent buffer from
3884 * the extent btree or from the device btree for example, it is forced to
3885 * allocate a new block group (chunk) because the only one that had enough
3886 * free space was just turned to RO mode by a running scrub for example (or
3887 * device replace, block group reclaim thread, etc), so we cannot use it
3888 * for allocating an extent and end up being forced to allocate a new one;
3889 *
3890 * 2) Because we only check that the metadata space_info has enough free bytes,
3891 * we end up not allocating a new metadata chunk in that case. However if
3892 * the filesystem was mounted in degraded mode, none of the existing block
3893 * groups might be suitable for extent allocation due to their incompatible
3894 * profile (e.g. mounting a filesystem with 2 devices, where all block groups
3895 * use a RAID1 profile, in degraded mode using a single device). In this case
3896 * when the task attempts to COW some extent buffer of the extent btree for
3897 * example, it will trigger allocation of a new metadata block group with a
3898 * suitable profile (SINGLE profile in the example of the degraded mount of
3899 * the RAID1 filesystem);
3900 *
3901 * 3) The task has reserved enough transaction units / metadata space, but when
3902 * it attempts to COW an extent buffer from the extent or device btree for
3903 * example, it does not find any free extent in any metadata block group and
3904 * is therefore forced to try to allocate a new metadata block group.
3905 * This is because some other task allocated all available extents in the
3906 * meantime - this typically happens with tasks that don't reserve space
3907 * properly, either intentionally or as a bug. One example where this is
3908 * done intentionally is fsync, as it does not reserve any transaction units
3909 * and ends up allocating a variable number of metadata extents for log
3910 * tree extent buffers;
3911 *
3912 * 4) The task has reserved enough transaction units / metadata space, but right
3913 * before it tries to allocate the last extent buffer it needs, a discard
3914 * operation comes in and, temporarily, removes the last free space entry from
3915 * the only metadata block group that had free space (discard starts by
3916 * removing a free space entry from a block group, then does the discard
3917 * operation and, once it's done, it adds back the free space entry to the
3918 * block group).
3919 *
3920 * We also need this 2 phase setup when adding a device to a filesystem with
3921 * a seed device - we must create new metadata and system chunks without adding
3922 * any of the block group items to the chunk, extent and device btrees. If we
3923 * did not do it this way, we would get ENOSPC when attempting to update those
3924 * btrees, since all the chunks from the seed device are read-only.
3925 *
3926 * Phase 1 does the updates and insertions to the chunk btree because if we had
3927 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
3928 * parallel, we risk having too many system chunks allocated by many tasks if
3929 * many tasks reach phase 1 without the previous ones completing phase 2. In the
3930 * extreme case this leads to exhaustion of the system chunk array in the
3931 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
3932 * and with RAID filesystems (so we have more device items in the chunk btree).
3933 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
3934 * the system chunk array due to concurrent allocations") provides more details.
3935 *
3936 * Allocation of system chunks does not happen through this function.
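* (btrfs_chunk_alloc() below rejects BTRFS_BLOCK_GROUP_SYSTEM requests with -ENOSPC right at the top of the function.)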
3937 * A task that needs to update the chunk btree (the only btree that uses system
3938 * chunks) must preallocate chunk space by calling either check_system_chunk() or
3939 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
3940 * metadata chunk or when removing a chunk, while the latter is used before doing
3941 * a modification to the chunk btree - use cases for the latter are adding,
3942 * removing and resizing a device as well as relocation of a system chunk.
3943 * See the comment below for more details.
3944 *
3945 * The reservation of system space, done through check_system_chunk(), as well
3946 * as all the updates and insertions into the chunk btree must be done while
3947 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
3948 * an extent buffer from the chunk btree we never trigger allocation of a new
3949 * system chunk, which would result in a deadlock (trying to lock an extent
3950 * buffer of the chunk btree twice, first time before triggering the chunk
3951 * allocation and the second time during chunk allocation while attempting to
3952 * update the chunk btree). The system chunk array is also updated while holding
3953 * that mutex. The same logic applies to removing chunks - we must reserve system
3954 * space, update the chunk btree and the system chunk array in the superblock
3955 * while holding fs_info->chunk_mutex.
3956 *
3957 * This function, btrfs_chunk_alloc(), belongs to phase 1.
3958 *
3959 * If @force is CHUNK_ALLOC_FORCE:
3960 * - return 1 if it successfully allocates a chunk,
3961 * - return errors including -ENOSPC otherwise.
3962 * If @force is NOT CHUNK_ALLOC_FORCE:
3963 * - return 0 if it doesn't need to allocate a new chunk,
3964 * - return 1 if it successfully allocates a chunk,
3965 * - return errors including -ENOSPC otherwise.
3966 */
3967 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3968 enum btrfs_chunk_alloc_enum force)
3969 {
3970 struct btrfs_fs_info *fs_info = trans->fs_info;
3971 struct btrfs_space_info *space_info;
3972 struct btrfs_block_group *ret_bg;
3973 bool wait_for_alloc = false;
3974 bool should_alloc = false;
3975 bool from_extent_allocation = false;
3976 int ret = 0;
3977
3978 if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
3979 from_extent_allocation = true;
3980 force = CHUNK_ALLOC_FORCE;
3981 }
3982
3983 /* Don't re-enter if we're already allocating a chunk */
3984 if (trans->allocating_chunk)
3985 return -ENOSPC;
3986 /*
3987 * Allocation of system chunks cannot happen through this path, as we
3988 * could end up in a deadlock if we are allocating a data or metadata
3989 * chunk and there is another task modifying the chunk btree.
3990 *
3991 * This is because while we are holding the chunk mutex, we will attempt
3992 * to add the new chunk item to the chunk btree or update an existing
3993 * device item in the chunk btree, while the other task that is modifying
3994 * the chunk btree is attempting to COW an extent buffer while holding a
3995 * lock on it and on its parent - if the COW operation triggers a system
3996 * chunk allocation, then we can deadlock because we are holding the
3997 * chunk mutex and we may need to access that extent buffer or its parent
3998 * in order to add the chunk item or update a device item.
3999 *
4000 * Tasks that want to modify the chunk tree should reserve system space
4001 * before updating the chunk btree, by calling either
4002 * btrfs_reserve_chunk_metadata() or check_system_chunk().
4003 * It's possible that after a task reserves the space, it still ends up 4004 * here - this happens in the cases described above at do_chunk_alloc(). 4005 * The task will have to either retry or fail. 4006 */ 4007 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4008 return -ENOSPC; 4009 4010 space_info = btrfs_find_space_info(fs_info, flags); 4011 ASSERT(space_info); 4012 4013 do { 4014 spin_lock(&space_info->lock); 4015 if (force < space_info->force_alloc) 4016 force = space_info->force_alloc; 4017 should_alloc = should_alloc_chunk(fs_info, space_info, force); 4018 if (space_info->full) { 4019 /* No more free physical space */ 4020 if (should_alloc) 4021 ret = -ENOSPC; 4022 else 4023 ret = 0; 4024 spin_unlock(&space_info->lock); 4025 return ret; 4026 } else if (!should_alloc) { 4027 spin_unlock(&space_info->lock); 4028 return 0; 4029 } else if (space_info->chunk_alloc) { 4030 /* 4031 * Someone is already allocating, so we need to block 4032 * until this someone is finished and then loop to 4033 * recheck if we should continue with our allocation 4034 * attempt. 4035 */ 4036 wait_for_alloc = true; 4037 force = CHUNK_ALLOC_NO_FORCE; 4038 spin_unlock(&space_info->lock); 4039 mutex_lock(&fs_info->chunk_mutex); 4040 mutex_unlock(&fs_info->chunk_mutex); 4041 } else { 4042 /* Proceed with allocation */ 4043 space_info->chunk_alloc = 1; 4044 wait_for_alloc = false; 4045 spin_unlock(&space_info->lock); 4046 } 4047 4048 cond_resched(); 4049 } while (wait_for_alloc); 4050 4051 mutex_lock(&fs_info->chunk_mutex); 4052 trans->allocating_chunk = true; 4053 4054 /* 4055 * If we have mixed data/metadata chunks we want to make sure we keep 4056 * allocating mixed chunks instead of individual chunks. 4057 */ 4058 if (btrfs_mixed_space_info(space_info)) 4059 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4060 4061 /* 4062 * if we're doing a data chunk, go ahead and make sure that 4063 * we keep a reasonable number of metadata chunks allocated in the 4064 * FS as well. 4065 */ 4066 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4067 fs_info->data_chunk_allocations++; 4068 if (!(fs_info->data_chunk_allocations % 4069 fs_info->metadata_ratio)) 4070 force_metadata_allocation(fs_info); 4071 } 4072 4073 ret_bg = do_chunk_alloc(trans, flags); 4074 trans->allocating_chunk = false; 4075 4076 if (IS_ERR(ret_bg)) { 4077 ret = PTR_ERR(ret_bg); 4078 } else if (from_extent_allocation) { 4079 /* 4080 * New block group is likely to be used soon. Try to activate 4081 * it now. Failure is OK for now. 
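* (On regular, non-zoned filesystems btrfs_zone_activate() returns true without doing anything, so this is effectively a no-op there.)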
4082 */ 4083 btrfs_zone_activate(ret_bg); 4084 } 4085 4086 if (!ret) 4087 btrfs_put_block_group(ret_bg); 4088 4089 spin_lock(&space_info->lock); 4090 if (ret < 0) { 4091 if (ret == -ENOSPC) 4092 space_info->full = 1; 4093 else 4094 goto out; 4095 } else { 4096 ret = 1; 4097 space_info->max_extent_size = 0; 4098 } 4099 4100 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4101 out: 4102 space_info->chunk_alloc = 0; 4103 spin_unlock(&space_info->lock); 4104 mutex_unlock(&fs_info->chunk_mutex); 4105 4106 return ret; 4107 } 4108 4109 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4110 { 4111 u64 num_dev; 4112 4113 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 4114 if (!num_dev) 4115 num_dev = fs_info->fs_devices->rw_devices; 4116 4117 return num_dev; 4118 } 4119 4120 static void reserve_chunk_space(struct btrfs_trans_handle *trans, 4121 u64 bytes, 4122 u64 type) 4123 { 4124 struct btrfs_fs_info *fs_info = trans->fs_info; 4125 struct btrfs_space_info *info; 4126 u64 left; 4127 int ret = 0; 4128 4129 /* 4130 * Needed because we can end up allocating a system chunk and for an 4131 * atomic and race free space reservation in the chunk block reserve. 4132 */ 4133 lockdep_assert_held(&fs_info->chunk_mutex); 4134 4135 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4136 spin_lock(&info->lock); 4137 left = info->total_bytes - btrfs_space_info_used(info, true); 4138 spin_unlock(&info->lock); 4139 4140 if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4141 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4142 left, bytes, type); 4143 btrfs_dump_space_info(fs_info, info, 0, 0); 4144 } 4145 4146 if (left < bytes) { 4147 u64 flags = btrfs_system_alloc_profile(fs_info); 4148 struct btrfs_block_group *bg; 4149 4150 /* 4151 * Ignore failure to create system chunk. We might end up not 4152 * needing it, as we might not need to COW all nodes/leafs from 4153 * the paths we visit in the chunk tree (they were already COWed 4154 * or created in the current transaction for example). 4155 */ 4156 bg = btrfs_create_chunk(trans, flags); 4157 if (IS_ERR(bg)) { 4158 ret = PTR_ERR(bg); 4159 } else { 4160 /* 4161 * We have a new chunk. We also need to activate it for 4162 * zoned filesystem. 4163 */ 4164 ret = btrfs_zoned_activate_one_bg(fs_info, info, true); 4165 if (ret < 0) 4166 return; 4167 4168 /* 4169 * If we fail to add the chunk item here, we end up 4170 * trying again at phase 2 of chunk allocation, at 4171 * btrfs_create_pending_block_groups(). So ignore 4172 * any error here. An ENOSPC here could happen, due to 4173 * the cases described at do_chunk_alloc() - the system 4174 * block group we just created was just turned into RO 4175 * mode by a scrub for example, or a running discard 4176 * temporarily removed its free space entries, etc. 4177 */ 4178 btrfs_chunk_alloc_add_chunk_item(trans, bg); 4179 } 4180 } 4181 4182 if (!ret) { 4183 ret = btrfs_block_rsv_add(fs_info, 4184 &fs_info->chunk_block_rsv, 4185 bytes, BTRFS_RESERVE_NO_FLUSH); 4186 if (!ret) 4187 trans->chunk_bytes_reserved += bytes; 4188 } 4189 } 4190 4191 /* 4192 * Reserve space in the system space for allocating or removing a chunk. 4193 * The caller must be holding fs_info->chunk_mutex. 
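*
* A typical call site looks roughly like this (illustrative sketch only, the
* real callers are do_chunk_alloc() and the chunk removal path):
*
*	mutex_lock(&fs_info->chunk_mutex);
*	check_system_chunk(trans, flags);
*	... update device items and insert or delete the chunk item ...
*	mutex_unlock(&fs_info->chunk_mutex);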
4194 */ 4195 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 4196 { 4197 struct btrfs_fs_info *fs_info = trans->fs_info; 4198 const u64 num_devs = get_profile_num_devs(fs_info, type); 4199 u64 bytes; 4200 4201 /* num_devs device items to update and 1 chunk item to add or remove. */ 4202 bytes = btrfs_calc_metadata_size(fs_info, num_devs) + 4203 btrfs_calc_insert_metadata_size(fs_info, 1); 4204 4205 reserve_chunk_space(trans, bytes, type); 4206 } 4207 4208 /* 4209 * Reserve space in the system space, if needed, for doing a modification to the 4210 * chunk btree. 4211 * 4212 * @trans: A transaction handle. 4213 * @is_item_insertion: Indicate if the modification is for inserting a new item 4214 * in the chunk btree or if it's for the deletion or update 4215 * of an existing item. 4216 * 4217 * This is used in a context where we need to update the chunk btree outside 4218 * block group allocation and removal, to avoid a deadlock with a concurrent 4219 * task that is allocating a metadata or data block group and therefore needs to 4220 * update the chunk btree while holding the chunk mutex. After the update to the 4221 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. 4222 * 4223 */ 4224 void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, 4225 bool is_item_insertion) 4226 { 4227 struct btrfs_fs_info *fs_info = trans->fs_info; 4228 u64 bytes; 4229 4230 if (is_item_insertion) 4231 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 4232 else 4233 bytes = btrfs_calc_metadata_size(fs_info, 1); 4234 4235 mutex_lock(&fs_info->chunk_mutex); 4236 reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); 4237 mutex_unlock(&fs_info->chunk_mutex); 4238 } 4239 4240 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 4241 { 4242 struct btrfs_block_group *block_group; 4243 4244 block_group = btrfs_lookup_first_block_group(info, 0); 4245 while (block_group) { 4246 btrfs_wait_block_group_cache_done(block_group); 4247 spin_lock(&block_group->lock); 4248 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, 4249 &block_group->runtime_flags)) { 4250 struct inode *inode = block_group->inode; 4251 4252 block_group->inode = NULL; 4253 spin_unlock(&block_group->lock); 4254 4255 ASSERT(block_group->io_ctl.inode == NULL); 4256 iput(inode); 4257 } else { 4258 spin_unlock(&block_group->lock); 4259 } 4260 block_group = btrfs_next_block_group(block_group); 4261 } 4262 } 4263 4264 /* 4265 * Must be called only after stopping all workers, since we could have block 4266 * group caching kthreads running, and therefore they could race with us if we 4267 * freed the block groups before stopping them. 
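* The main caller is close_ctree() during unmount, which only gets here after all worker threads have been stopped.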
4268 */ 4269 int btrfs_free_block_groups(struct btrfs_fs_info *info) 4270 { 4271 struct btrfs_block_group *block_group; 4272 struct btrfs_space_info *space_info; 4273 struct btrfs_caching_control *caching_ctl; 4274 struct rb_node *n; 4275 4276 write_lock(&info->block_group_cache_lock); 4277 while (!list_empty(&info->caching_block_groups)) { 4278 caching_ctl = list_entry(info->caching_block_groups.next, 4279 struct btrfs_caching_control, list); 4280 list_del(&caching_ctl->list); 4281 btrfs_put_caching_control(caching_ctl); 4282 } 4283 write_unlock(&info->block_group_cache_lock); 4284 4285 spin_lock(&info->unused_bgs_lock); 4286 while (!list_empty(&info->unused_bgs)) { 4287 block_group = list_first_entry(&info->unused_bgs, 4288 struct btrfs_block_group, 4289 bg_list); 4290 list_del_init(&block_group->bg_list); 4291 btrfs_put_block_group(block_group); 4292 } 4293 4294 while (!list_empty(&info->reclaim_bgs)) { 4295 block_group = list_first_entry(&info->reclaim_bgs, 4296 struct btrfs_block_group, 4297 bg_list); 4298 list_del_init(&block_group->bg_list); 4299 btrfs_put_block_group(block_group); 4300 } 4301 spin_unlock(&info->unused_bgs_lock); 4302 4303 spin_lock(&info->zone_active_bgs_lock); 4304 while (!list_empty(&info->zone_active_bgs)) { 4305 block_group = list_first_entry(&info->zone_active_bgs, 4306 struct btrfs_block_group, 4307 active_bg_list); 4308 list_del_init(&block_group->active_bg_list); 4309 btrfs_put_block_group(block_group); 4310 } 4311 spin_unlock(&info->zone_active_bgs_lock); 4312 4313 write_lock(&info->block_group_cache_lock); 4314 while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { 4315 block_group = rb_entry(n, struct btrfs_block_group, 4316 cache_node); 4317 rb_erase_cached(&block_group->cache_node, 4318 &info->block_group_cache_tree); 4319 RB_CLEAR_NODE(&block_group->cache_node); 4320 write_unlock(&info->block_group_cache_lock); 4321 4322 down_write(&block_group->space_info->groups_sem); 4323 list_del(&block_group->list); 4324 up_write(&block_group->space_info->groups_sem); 4325 4326 /* 4327 * We haven't cached this block group, which means we could 4328 * possibly have excluded extents on this block group. 4329 */ 4330 if (block_group->cached == BTRFS_CACHE_NO || 4331 block_group->cached == BTRFS_CACHE_ERROR) 4332 btrfs_free_excluded_extents(block_group); 4333 4334 btrfs_remove_free_space_cache(block_group); 4335 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 4336 ASSERT(list_empty(&block_group->dirty_list)); 4337 ASSERT(list_empty(&block_group->io_list)); 4338 ASSERT(list_empty(&block_group->bg_list)); 4339 ASSERT(refcount_read(&block_group->refs) == 1); 4340 ASSERT(block_group->swap_extents == 0); 4341 btrfs_put_block_group(block_group); 4342 4343 write_lock(&info->block_group_cache_lock); 4344 } 4345 write_unlock(&info->block_group_cache_lock); 4346 4347 btrfs_release_global_block_rsv(info); 4348 4349 while (!list_empty(&info->space_info)) { 4350 space_info = list_entry(info->space_info.next, 4351 struct btrfs_space_info, 4352 list); 4353 4354 /* 4355 * Do not hide this behind enospc_debug, this is actually 4356 * important and indicates a real bug if this happens. 
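* Leftover bytes_pinned or bytes_may_use at this point means some reservation was never released or some extent was never unpinned before unmount.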
4357 */ 4358 if (WARN_ON(space_info->bytes_pinned > 0 || 4359 space_info->bytes_may_use > 0)) 4360 btrfs_dump_space_info(info, space_info, 0, 0); 4361 4362 /* 4363 * If there was a failure to cleanup a log tree, very likely due 4364 * to an IO failure on a writeback attempt of one or more of its 4365 * extent buffers, we could not do proper (and cheap) unaccounting 4366 * of their reserved space, so don't warn on bytes_reserved > 0 in 4367 * that case. 4368 */ 4369 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || 4370 !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { 4371 if (WARN_ON(space_info->bytes_reserved > 0)) 4372 btrfs_dump_space_info(info, space_info, 0, 0); 4373 } 4374 4375 WARN_ON(space_info->reclaim_size > 0); 4376 list_del(&space_info->list); 4377 btrfs_sysfs_remove_space_info(space_info); 4378 } 4379 return 0; 4380 } 4381 4382 void btrfs_freeze_block_group(struct btrfs_block_group *cache) 4383 { 4384 atomic_inc(&cache->frozen); 4385 } 4386 4387 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) 4388 { 4389 struct btrfs_fs_info *fs_info = block_group->fs_info; 4390 struct extent_map_tree *em_tree; 4391 struct extent_map *em; 4392 bool cleanup; 4393 4394 spin_lock(&block_group->lock); 4395 cleanup = (atomic_dec_and_test(&block_group->frozen) && 4396 test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); 4397 spin_unlock(&block_group->lock); 4398 4399 if (cleanup) { 4400 em_tree = &fs_info->mapping_tree; 4401 write_lock(&em_tree->lock); 4402 em = lookup_extent_mapping(em_tree, block_group->start, 4403 1); 4404 BUG_ON(!em); /* logic error, can't happen */ 4405 remove_extent_mapping(em_tree, em); 4406 write_unlock(&em_tree->lock); 4407 4408 /* once for us and once for the tree */ 4409 free_extent_map(em); 4410 free_extent_map(em); 4411 4412 /* 4413 * We may have left one free space entry and other possible 4414 * tasks trimming this block group have left 1 entry each one. 4415 * Free them if any. 4416 */ 4417 btrfs_remove_free_space_cache(block_group); 4418 } 4419 } 4420 4421 bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) 4422 { 4423 bool ret = true; 4424 4425 spin_lock(&bg->lock); 4426 if (bg->ro) 4427 ret = false; 4428 else 4429 bg->swap_extents++; 4430 spin_unlock(&bg->lock); 4431 4432 return ret; 4433 } 4434 4435 void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) 4436 { 4437 spin_lock(&bg->lock); 4438 ASSERT(!bg->ro); 4439 ASSERT(bg->swap_extents >= amount); 4440 bg->swap_extents -= amount; 4441 spin_unlock(&bg->lock); 4442 } 4443 4444 enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size) 4445 { 4446 if (size <= SZ_128K) 4447 return BTRFS_BG_SZ_SMALL; 4448 if (size <= SZ_8M) 4449 return BTRFS_BG_SZ_MEDIUM; 4450 return BTRFS_BG_SZ_LARGE; 4451 } 4452 4453 /* 4454 * Handle a block group allocating an extent in a size class 4455 * 4456 * @bg: The block group we allocated in. 4457 * @size_class: The size class of the allocation. 4458 * @force_wrong_size_class: Whether we are desperate enough to allow 4459 * mismatched size classes. 4460 * 4461 * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the 4462 * case of a race that leads to the wrong size class without 4463 * force_wrong_size_class set. 4464 * 4465 * find_free_extent will skip block groups with a mismatched size class until 4466 * it really needs to avoid ENOSPC. In that case it will set 4467 * force_wrong_size_class. 
However, if a block group is newly allocated and
4468 * doesn't yet have a size class, then it is possible for two allocations of
4469 * different sizes to race and both try to use it. The loser is caught here and
4470 * has to retry.
4471 */
4472 int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4473 enum btrfs_block_group_size_class size_class,
4474 bool force_wrong_size_class)
4475 {
4476 ASSERT(size_class != BTRFS_BG_SZ_NONE);
4477
4478 /* The new allocation is in the right size class, do nothing */
4479 if (bg->size_class == size_class)
4480 return 0;
4481 /*
4482 * The new allocation is in a mismatched size class.
4483 * This means one of two things:
4484 *
4485 * 1. Two tasks in find_free_extent for different size_classes raced
4486 * and hit the same empty block_group. Make the loser try again.
4487 * 2. A call to find_free_extent got desperate enough to set
4488 * 'force_wrong_size_class'. Don't change the size_class, but allow the
4489 * allocation.
4490 */
4491 if (bg->size_class != BTRFS_BG_SZ_NONE) {
4492 if (force_wrong_size_class)
4493 return 0;
4494 return -EAGAIN;
4495 }
4496 /*
4497 * The happy new block group case: the new allocation is the first
4498 * one in the block_group so we set size_class.
4499 */
4500 bg->size_class = size_class;
4501
4502 return 0;
4503 }
4504
4505 bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
4506 {
4507 if (btrfs_is_zoned(bg->fs_info))
4508 return false;
4509 if (!btrfs_is_block_group_data_only(bg))
4510 return false;
4511 return true;
4512 }
4513
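/*
 * How the size class helpers above fit together, as a rough sketch of what a
 * caller such as find_free_extent() ends up doing (illustrative only, the
 * real logic lives in btrfs_add_reserved_bytes() and the extent allocator):
 *
 *	if (btrfs_block_group_should_use_size_class(bg)) {
 *		size_class = btrfs_calc_block_group_size_class(num_bytes);
 *		ret = btrfs_use_block_group_size_class(bg, size_class, false);
 *		if (ret == -EAGAIN)
 *			... lost the race for an empty block group, try another ...
 *	}
 */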