// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format. If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;

	/*
	 * See if restripe for this chunk_type is in progress, if so try to
	 * reduce to the target profile
	 */
	spin_lock(&fs_info->balance_lock);
	target = get_restripe_target(fs_info, flags);
	if (target) {
		/* Pick target profile only if it's already available */
		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
			spin_unlock(&fs_info->balance_lock);
			return extended_to_chunk(target);
		}
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}

static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	unsigned seq;
	u64 flags;

	do {
		flags = orig_flags;
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= fs_info->avail_metadata_alloc_bits;
	} while
(read_seqretry(&fs_info->profiles_lock, seq)); 114 115 return btrfs_reduce_alloc_profile(fs_info, flags); 116 } 117 118 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 119 { 120 return get_alloc_profile(fs_info, orig_flags); 121 } 122 123 void btrfs_get_block_group(struct btrfs_block_group *cache) 124 { 125 atomic_inc(&cache->count); 126 } 127 128 void btrfs_put_block_group(struct btrfs_block_group *cache) 129 { 130 if (atomic_dec_and_test(&cache->count)) { 131 WARN_ON(cache->pinned > 0); 132 WARN_ON(cache->reserved > 0); 133 134 /* 135 * If not empty, someone is still holding mutex of 136 * full_stripe_lock, which can only be released by caller. 137 * And it will definitely cause use-after-free when caller 138 * tries to release full stripe lock. 139 * 140 * No better way to resolve, but only to warn. 141 */ 142 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); 143 kfree(cache->free_space_ctl); 144 kfree(cache); 145 } 146 } 147 148 /* 149 * This adds the block group to the fs_info rb tree for the block group cache 150 */ 151 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 152 struct btrfs_block_group *block_group) 153 { 154 struct rb_node **p; 155 struct rb_node *parent = NULL; 156 struct btrfs_block_group *cache; 157 158 spin_lock(&info->block_group_cache_lock); 159 p = &info->block_group_cache_tree.rb_node; 160 161 while (*p) { 162 parent = *p; 163 cache = rb_entry(parent, struct btrfs_block_group, cache_node); 164 if (block_group->start < cache->start) { 165 p = &(*p)->rb_left; 166 } else if (block_group->start > cache->start) { 167 p = &(*p)->rb_right; 168 } else { 169 spin_unlock(&info->block_group_cache_lock); 170 return -EEXIST; 171 } 172 } 173 174 rb_link_node(&block_group->cache_node, parent, p); 175 rb_insert_color(&block_group->cache_node, 176 &info->block_group_cache_tree); 177 178 if (info->first_logical_byte > block_group->start) 179 info->first_logical_byte = block_group->start; 180 181 spin_unlock(&info->block_group_cache_lock); 182 183 return 0; 184 } 185 186 /* 187 * This will return the block group at or after bytenr if contains is 0, else 188 * it will return the block group that contains the bytenr 189 */ 190 static struct btrfs_block_group *block_group_cache_tree_search( 191 struct btrfs_fs_info *info, u64 bytenr, int contains) 192 { 193 struct btrfs_block_group *cache, *ret = NULL; 194 struct rb_node *n; 195 u64 end, start; 196 197 spin_lock(&info->block_group_cache_lock); 198 n = info->block_group_cache_tree.rb_node; 199 200 while (n) { 201 cache = rb_entry(n, struct btrfs_block_group, cache_node); 202 end = cache->start + cache->length - 1; 203 start = cache->start; 204 205 if (bytenr < start) { 206 if (!contains && (!ret || start < ret->start)) 207 ret = cache; 208 n = n->rb_left; 209 } else if (bytenr > start) { 210 if (contains && bytenr <= end) { 211 ret = cache; 212 break; 213 } 214 n = n->rb_right; 215 } else { 216 ret = cache; 217 break; 218 } 219 } 220 if (ret) { 221 btrfs_get_block_group(ret); 222 if (bytenr == 0 && info->first_logical_byte > ret->start) 223 info->first_logical_byte = ret->start; 224 } 225 spin_unlock(&info->block_group_cache_lock); 226 227 return ret; 228 } 229 230 /* 231 * Return the block group that starts at or after bytenr 232 */ 233 struct btrfs_block_group *btrfs_lookup_first_block_group( 234 struct btrfs_fs_info *info, u64 bytenr) 235 { 236 return block_group_cache_tree_search(info, bytenr, 0); 237 } 238 239 /* 240 * Return the block group that contains the given bytenr 241 
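 *
 * The returned group (if any) carries an extra reference taken via
 * btrfs_get_block_group() inside block_group_cache_tree_search(), so the
 * caller is expected to drop it. A minimal usage sketch (illustrative, not a
 * call site copied from this file):
 *
 *	bg = btrfs_lookup_block_group(fs_info, bytenr);
 *	if (bg) {
 *		(use bg->start, bg->length, ...)
 *		btrfs_put_block_group(bg);
 *	}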
*/ 242 struct btrfs_block_group *btrfs_lookup_block_group( 243 struct btrfs_fs_info *info, u64 bytenr) 244 { 245 return block_group_cache_tree_search(info, bytenr, 1); 246 } 247 248 struct btrfs_block_group *btrfs_next_block_group( 249 struct btrfs_block_group *cache) 250 { 251 struct btrfs_fs_info *fs_info = cache->fs_info; 252 struct rb_node *node; 253 254 spin_lock(&fs_info->block_group_cache_lock); 255 256 /* If our block group was removed, we need a full search. */ 257 if (RB_EMPTY_NODE(&cache->cache_node)) { 258 const u64 next_bytenr = cache->start + cache->length; 259 260 spin_unlock(&fs_info->block_group_cache_lock); 261 btrfs_put_block_group(cache); 262 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 263 } 264 node = rb_next(&cache->cache_node); 265 btrfs_put_block_group(cache); 266 if (node) { 267 cache = rb_entry(node, struct btrfs_block_group, cache_node); 268 btrfs_get_block_group(cache); 269 } else 270 cache = NULL; 271 spin_unlock(&fs_info->block_group_cache_lock); 272 return cache; 273 } 274 275 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 276 { 277 struct btrfs_block_group *bg; 278 bool ret = true; 279 280 bg = btrfs_lookup_block_group(fs_info, bytenr); 281 if (!bg) 282 return false; 283 284 spin_lock(&bg->lock); 285 if (bg->ro) 286 ret = false; 287 else 288 atomic_inc(&bg->nocow_writers); 289 spin_unlock(&bg->lock); 290 291 /* No put on block group, done by btrfs_dec_nocow_writers */ 292 if (!ret) 293 btrfs_put_block_group(bg); 294 295 return ret; 296 } 297 298 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 299 { 300 struct btrfs_block_group *bg; 301 302 bg = btrfs_lookup_block_group(fs_info, bytenr); 303 ASSERT(bg); 304 if (atomic_dec_and_test(&bg->nocow_writers)) 305 wake_up_var(&bg->nocow_writers); 306 /* 307 * Once for our lookup and once for the lookup done by a previous call 308 * to btrfs_inc_nocow_writers() 309 */ 310 btrfs_put_block_group(bg); 311 btrfs_put_block_group(bg); 312 } 313 314 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg) 315 { 316 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 317 } 318 319 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 320 const u64 start) 321 { 322 struct btrfs_block_group *bg; 323 324 bg = btrfs_lookup_block_group(fs_info, start); 325 ASSERT(bg); 326 if (atomic_dec_and_test(&bg->reservations)) 327 wake_up_var(&bg->reservations); 328 btrfs_put_block_group(bg); 329 } 330 331 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) 332 { 333 struct btrfs_space_info *space_info = bg->space_info; 334 335 ASSERT(bg->ro); 336 337 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 338 return; 339 340 /* 341 * Our block group is read only but before we set it to read only, 342 * some task might have had allocated an extent from it already, but it 343 * has not yet created a respective ordered extent (and added it to a 344 * root's list of ordered extents). 345 * Therefore wait for any task currently allocating extents, since the 346 * block group's reservations counter is incremented while a read lock 347 * on the groups' semaphore is held and decremented after releasing 348 * the read access on that semaphore and creating the ordered extent. 
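	 *
	 * Cycling the semaphore in write mode right below (down_write()
	 * immediately followed by up_write()) therefore acts as a barrier:
	 * the write lock cannot be taken until every such reader has released
	 * the read side, at which point any reservation they made is already
	 * reflected in bg->reservations and is waited out just after.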
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

struct btrfs_caching_control *btrfs_get_caching_control(
		struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once. So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes. Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
					   u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return;

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
		   (cache->free_space_ctl->free_space >= num_bytes));

	btrfs_put_caching_control(caching_ctl);
}

int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = btrfs_get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;

	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
	if (cache->cached == BTRFS_CACHE_ERROR)
		ret = -EIO;
	btrfs_put_caching_control(caching_ctl);
	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->start;
	u64 len = block_group->length;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * This is only called by btrfs_cache_block_group, since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't be
 * used yet since their free space will be released as soon as the transaction
 * commits.
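 *
 * Worked example with made-up numbers: if [start, end) is [1M, 2M) and the
 * only pinned range inside it is [1M + 256K, 1M + 512K), the loop below adds
 * [1M, 1M + 256K) as free space, skips the pinned range, and the tail check
 * after the loop adds [1M + 512K, 2M), so 768K is added in total and returned.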
449 */ 450 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end) 451 { 452 struct btrfs_fs_info *info = block_group->fs_info; 453 u64 extent_start, extent_end, size, total_added = 0; 454 int ret; 455 456 while (start < end) { 457 ret = find_first_extent_bit(info->pinned_extents, start, 458 &extent_start, &extent_end, 459 EXTENT_DIRTY | EXTENT_UPTODATE, 460 NULL); 461 if (ret) 462 break; 463 464 if (extent_start <= start) { 465 start = extent_end + 1; 466 } else if (extent_start > start && extent_start < end) { 467 size = extent_start - start; 468 total_added += size; 469 ret = btrfs_add_free_space(block_group, start, 470 size); 471 BUG_ON(ret); /* -ENOMEM or logic error */ 472 start = extent_end + 1; 473 } else { 474 break; 475 } 476 } 477 478 if (start < end) { 479 size = end - start; 480 total_added += size; 481 ret = btrfs_add_free_space(block_group, start, size); 482 BUG_ON(ret); /* -ENOMEM or logic error */ 483 } 484 485 return total_added; 486 } 487 488 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 489 { 490 struct btrfs_block_group *block_group = caching_ctl->block_group; 491 struct btrfs_fs_info *fs_info = block_group->fs_info; 492 struct btrfs_root *extent_root = fs_info->extent_root; 493 struct btrfs_path *path; 494 struct extent_buffer *leaf; 495 struct btrfs_key key; 496 u64 total_found = 0; 497 u64 last = 0; 498 u32 nritems; 499 int ret; 500 bool wakeup = true; 501 502 path = btrfs_alloc_path(); 503 if (!path) 504 return -ENOMEM; 505 506 last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); 507 508 #ifdef CONFIG_BTRFS_DEBUG 509 /* 510 * If we're fragmenting we don't want to make anybody think we can 511 * allocate from this block group until we've had a chance to fragment 512 * the free space. 513 */ 514 if (btrfs_should_fragment_free_space(block_group)) 515 wakeup = false; 516 #endif 517 /* 518 * We don't want to deadlock with somebody trying to allocate a new 519 * extent for the extent root while also trying to search the extent 520 * root to add free space. 
So we skip locking and search the commit
	 * root, since it's read-only
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->start) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->start + block_group->length)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, last,
				block_group->start + block_group->length);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR :
				    BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->length - block_group->used;
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	btrfs_free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	btrfs_put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->start;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info. The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
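	 *
	 * Note that the wait loop below drops cache->lock before sleeping and
	 * re-takes it afterwards, so cache->cached is re-read under the lock
	 * on every iteration before we decide to keep waiting.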
704 */ 705 while (cache->cached == BTRFS_CACHE_FAST) { 706 struct btrfs_caching_control *ctl; 707 708 ctl = cache->caching_ctl; 709 refcount_inc(&ctl->count); 710 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 711 spin_unlock(&cache->lock); 712 713 schedule(); 714 715 finish_wait(&ctl->wait, &wait); 716 btrfs_put_caching_control(ctl); 717 spin_lock(&cache->lock); 718 } 719 720 if (cache->cached != BTRFS_CACHE_NO) { 721 spin_unlock(&cache->lock); 722 kfree(caching_ctl); 723 return 0; 724 } 725 WARN_ON(cache->caching_ctl); 726 cache->caching_ctl = caching_ctl; 727 cache->cached = BTRFS_CACHE_FAST; 728 spin_unlock(&cache->lock); 729 730 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 731 mutex_lock(&caching_ctl->mutex); 732 ret = load_free_space_cache(cache); 733 734 spin_lock(&cache->lock); 735 if (ret == 1) { 736 cache->caching_ctl = NULL; 737 cache->cached = BTRFS_CACHE_FINISHED; 738 cache->last_byte_to_unpin = (u64)-1; 739 caching_ctl->progress = (u64)-1; 740 } else { 741 if (load_cache_only) { 742 cache->caching_ctl = NULL; 743 cache->cached = BTRFS_CACHE_NO; 744 } else { 745 cache->cached = BTRFS_CACHE_STARTED; 746 cache->has_caching_ctl = 1; 747 } 748 } 749 spin_unlock(&cache->lock); 750 #ifdef CONFIG_BTRFS_DEBUG 751 if (ret == 1 && 752 btrfs_should_fragment_free_space(cache)) { 753 u64 bytes_used; 754 755 spin_lock(&cache->space_info->lock); 756 spin_lock(&cache->lock); 757 bytes_used = cache->length - cache->used; 758 cache->space_info->bytes_used += bytes_used >> 1; 759 spin_unlock(&cache->lock); 760 spin_unlock(&cache->space_info->lock); 761 fragment_free_space(cache); 762 } 763 #endif 764 mutex_unlock(&caching_ctl->mutex); 765 766 wake_up(&caching_ctl->wait); 767 if (ret == 1) { 768 btrfs_put_caching_control(caching_ctl); 769 btrfs_free_excluded_extents(cache); 770 return 0; 771 } 772 } else { 773 /* 774 * We're either using the free space tree or no caching at all. 775 * Set cached to the appropriate value and wakeup any waiters. 
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		btrfs_put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->commit_root_sem);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	bool found_raid56 = false;
	bool found_raid1c34 = false;

	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
		struct list_head *head = &fs_info->space_info;
		struct btrfs_space_info *sinfo;

		list_for_each_entry_rcu(sinfo, head, list) {
			down_read(&sinfo->groups_sem);
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
				found_raid56 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				found_raid1c34 = true;
			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				found_raid1c34 = true;
			up_read(&sinfo->groups_sem);
		}
		if (!found_raid56)
			btrfs_clear_fs_incompat(fs_info, RAID56);
		if (!found_raid1c34)
			btrfs_clear_fs_incompat(fs_info, RAID1C34);
	}
}

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     u64 group_start, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_key key;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
886 */ 887 btrfs_free_excluded_extents(block_group); 888 btrfs_free_ref_tree_range(fs_info, block_group->start, 889 block_group->length); 890 891 index = btrfs_bg_flags_to_raid_index(block_group->flags); 892 factor = btrfs_bg_type_to_factor(block_group->flags); 893 894 /* make sure this block group isn't part of an allocation cluster */ 895 cluster = &fs_info->data_alloc_cluster; 896 spin_lock(&cluster->refill_lock); 897 btrfs_return_cluster_to_free_space(block_group, cluster); 898 spin_unlock(&cluster->refill_lock); 899 900 /* 901 * make sure this block group isn't part of a metadata 902 * allocation cluster 903 */ 904 cluster = &fs_info->meta_alloc_cluster; 905 spin_lock(&cluster->refill_lock); 906 btrfs_return_cluster_to_free_space(block_group, cluster); 907 spin_unlock(&cluster->refill_lock); 908 909 path = btrfs_alloc_path(); 910 if (!path) { 911 ret = -ENOMEM; 912 goto out; 913 } 914 915 /* 916 * get the inode first so any iput calls done for the io_list 917 * aren't the final iput (no unlinks allowed now) 918 */ 919 inode = lookup_free_space_inode(block_group, path); 920 921 mutex_lock(&trans->transaction->cache_write_mutex); 922 /* 923 * Make sure our free space cache IO is done before removing the 924 * free space inode 925 */ 926 spin_lock(&trans->transaction->dirty_bgs_lock); 927 if (!list_empty(&block_group->io_list)) { 928 list_del_init(&block_group->io_list); 929 930 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 931 932 spin_unlock(&trans->transaction->dirty_bgs_lock); 933 btrfs_wait_cache_io(trans, block_group, path); 934 btrfs_put_block_group(block_group); 935 spin_lock(&trans->transaction->dirty_bgs_lock); 936 } 937 938 if (!list_empty(&block_group->dirty_list)) { 939 list_del_init(&block_group->dirty_list); 940 remove_rsv = true; 941 btrfs_put_block_group(block_group); 942 } 943 spin_unlock(&trans->transaction->dirty_bgs_lock); 944 mutex_unlock(&trans->transaction->cache_write_mutex); 945 946 if (!IS_ERR(inode)) { 947 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 948 if (ret) { 949 btrfs_add_delayed_iput(inode); 950 goto out; 951 } 952 clear_nlink(inode); 953 /* One for the block groups ref */ 954 spin_lock(&block_group->lock); 955 if (block_group->iref) { 956 block_group->iref = 0; 957 block_group->inode = NULL; 958 spin_unlock(&block_group->lock); 959 iput(inode); 960 } else { 961 spin_unlock(&block_group->lock); 962 } 963 /* One for our lookup ref */ 964 btrfs_add_delayed_iput(inode); 965 } 966 967 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 968 key.type = 0; 969 key.offset = block_group->start; 970 971 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 972 if (ret < 0) 973 goto out; 974 if (ret > 0) 975 btrfs_release_path(path); 976 if (ret == 0) { 977 ret = btrfs_del_item(trans, tree_root, path); 978 if (ret) 979 goto out; 980 btrfs_release_path(path); 981 } 982 983 spin_lock(&fs_info->block_group_cache_lock); 984 rb_erase(&block_group->cache_node, 985 &fs_info->block_group_cache_tree); 986 RB_CLEAR_NODE(&block_group->cache_node); 987 988 if (fs_info->first_logical_byte == block_group->start) 989 fs_info->first_logical_byte = (u64)-1; 990 spin_unlock(&fs_info->block_group_cache_lock); 991 992 down_write(&block_group->space_info->groups_sem); 993 /* 994 * we must use list_del_init so people can check to see if they 995 * are still on the list after taking the semaphore 996 */ 997 list_del_init(&block_group->list); 998 if (list_empty(&block_group->space_info->block_groups[index])) { 999 kobj = 
block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = btrfs_get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		down_write(&fs_info->commit_root_sem);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
				    &fs_info->caching_block_groups, list)
				if (ctl->block_group == block_group) {
					caching_ctl = ctl;
					refcount_inc(&caching_ctl->count);
					break;
				}
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		up_write(&fs_info->commit_root_sem);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			btrfs_put_caching_control(caching_ctl);
			btrfs_put_caching_control(caching_ctl);
		}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
	}
	block_group->space_info->total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -= block_group->length;
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	key.objectid = block_group->start;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key.offset = block_group->length;

	mutex_lock(&fs_info->chunk_mutex);
	spin_lock(&block_group->lock);
	block_group->removed = 1;
	/*
	 * At this point trimming can't start on this block group, because we
	 * removed the block group from the tree fs_info->block_group_cache_tree
	 * so no one can find it anymore and even if someone already got this
	 * block group before we removed it from the rbtree, they have already
	 * incremented block_group->trimming - if they didn't, they won't find
	 * any free space entries because we already removed them all when we
	 * called btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is because our
	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_em = (atomic_read(&block_group->trimming) == 0);
	spin_unlock(&block_group->lock);

	mutex_unlock(&fs_info->chunk_mutex);

	ret = remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -EIO;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;

	if (remove_em) {
		struct extent_map_tree *em_tree;

		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		/* once for the tree */
		free_extent_map(em);
	}
out:
	if (remove_rsv)
		btrfs_delayed_refs_rsv_release(fs_info, 1);
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	unsigned int num_items;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	ASSERT(em && em->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	map = em->map_lookup;
	num_items = 3 + map->num_stripes;
	free_extent_map(em);

	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
							   num_items, 1);
}

/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
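 *
 * Illustrative callers from this file: btrfs_delete_unused_bgs() passes
 * force == 0 so it backs off with -ENOSPC instead of consuming the last bit
 * of free metadata space, while btrfs_read_block_groups() passes force == 1
 * for chunks that are read-only or un-mirrored, where the extra 1M cushion
 * is not required.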
1182 */ 1183 static int inc_block_group_ro(struct btrfs_block_group *cache, int force) 1184 { 1185 struct btrfs_space_info *sinfo = cache->space_info; 1186 u64 num_bytes; 1187 u64 sinfo_used; 1188 u64 min_allocable_bytes; 1189 int ret = -ENOSPC; 1190 1191 /* 1192 * We need some metadata space and system metadata space for 1193 * allocating chunks in some corner cases until we force to set 1194 * it to be readonly. 1195 */ 1196 if ((sinfo->flags & 1197 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 1198 !force) 1199 min_allocable_bytes = SZ_1M; 1200 else 1201 min_allocable_bytes = 0; 1202 1203 spin_lock(&sinfo->lock); 1204 spin_lock(&cache->lock); 1205 1206 if (cache->ro) { 1207 cache->ro++; 1208 ret = 0; 1209 goto out; 1210 } 1211 1212 num_bytes = cache->length - cache->reserved - cache->pinned - 1213 cache->bytes_super - cache->used; 1214 sinfo_used = btrfs_space_info_used(sinfo, true); 1215 1216 /* 1217 * sinfo_used + num_bytes should always <= sinfo->total_bytes. 1218 * 1219 * Here we make sure if we mark this bg RO, we still have enough 1220 * free space as buffer (if min_allocable_bytes is not 0). 1221 */ 1222 if (sinfo_used + num_bytes + min_allocable_bytes <= 1223 sinfo->total_bytes) { 1224 sinfo->bytes_readonly += num_bytes; 1225 cache->ro++; 1226 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 1227 ret = 0; 1228 } 1229 out: 1230 spin_unlock(&cache->lock); 1231 spin_unlock(&sinfo->lock); 1232 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { 1233 btrfs_info(cache->fs_info, 1234 "unable to make block group %llu ro", cache->start); 1235 btrfs_info(cache->fs_info, 1236 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", 1237 sinfo_used, num_bytes, min_allocable_bytes); 1238 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); 1239 } 1240 return ret; 1241 } 1242 1243 /* 1244 * Process the unused_bgs list and remove any that don't have any allocated 1245 * space inside of them. 1246 */ 1247 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 1248 { 1249 struct btrfs_block_group *block_group; 1250 struct btrfs_space_info *space_info; 1251 struct btrfs_trans_handle *trans; 1252 int ret = 0; 1253 1254 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1255 return; 1256 1257 spin_lock(&fs_info->unused_bgs_lock); 1258 while (!list_empty(&fs_info->unused_bgs)) { 1259 u64 start, end; 1260 int trimming; 1261 1262 block_group = list_first_entry(&fs_info->unused_bgs, 1263 struct btrfs_block_group, 1264 bg_list); 1265 list_del_init(&block_group->bg_list); 1266 1267 space_info = block_group->space_info; 1268 1269 if (ret || btrfs_mixed_space_info(space_info)) { 1270 btrfs_put_block_group(block_group); 1271 continue; 1272 } 1273 spin_unlock(&fs_info->unused_bgs_lock); 1274 1275 mutex_lock(&fs_info->delete_unused_bgs_mutex); 1276 1277 /* Don't want to race with allocators so take the groups_sem */ 1278 down_write(&space_info->groups_sem); 1279 spin_lock(&block_group->lock); 1280 if (block_group->reserved || block_group->pinned || 1281 block_group->used || block_group->ro || 1282 list_is_singular(&block_group->list)) { 1283 /* 1284 * We want to bail if we made new allocations or have 1285 * outstanding allocations in this block group. We do 1286 * the ro check in case balance is currently acting on 1287 * this block group. 
1288 */ 1289 trace_btrfs_skip_unused_block_group(block_group); 1290 spin_unlock(&block_group->lock); 1291 up_write(&space_info->groups_sem); 1292 goto next; 1293 } 1294 spin_unlock(&block_group->lock); 1295 1296 /* We don't want to force the issue, only flip if it's ok. */ 1297 ret = inc_block_group_ro(block_group, 0); 1298 up_write(&space_info->groups_sem); 1299 if (ret < 0) { 1300 ret = 0; 1301 goto next; 1302 } 1303 1304 /* 1305 * Want to do this before we do anything else so we can recover 1306 * properly if we fail to join the transaction. 1307 */ 1308 trans = btrfs_start_trans_remove_block_group(fs_info, 1309 block_group->start); 1310 if (IS_ERR(trans)) { 1311 btrfs_dec_block_group_ro(block_group); 1312 ret = PTR_ERR(trans); 1313 goto next; 1314 } 1315 1316 /* 1317 * We could have pending pinned extents for this block group, 1318 * just delete them, we don't care about them anymore. 1319 */ 1320 start = block_group->start; 1321 end = start + block_group->length - 1; 1322 /* 1323 * Hold the unused_bg_unpin_mutex lock to avoid racing with 1324 * btrfs_finish_extent_commit(). If we are at transaction N, 1325 * another task might be running finish_extent_commit() for the 1326 * previous transaction N - 1, and have seen a range belonging 1327 * to the block group in freed_extents[] before we were able to 1328 * clear the whole block group range from freed_extents[]. This 1329 * means that task can lookup for the block group after we 1330 * unpinned it from freed_extents[] and removed it, leading to 1331 * a BUG_ON() at btrfs_unpin_extent_range(). 1332 */ 1333 mutex_lock(&fs_info->unused_bg_unpin_mutex); 1334 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 1335 EXTENT_DIRTY); 1336 if (ret) { 1337 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 1338 btrfs_dec_block_group_ro(block_group); 1339 goto end_trans; 1340 } 1341 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 1342 EXTENT_DIRTY); 1343 if (ret) { 1344 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 1345 btrfs_dec_block_group_ro(block_group); 1346 goto end_trans; 1347 } 1348 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 1349 1350 /* Reset pinned so btrfs_put_block_group doesn't complain */ 1351 spin_lock(&space_info->lock); 1352 spin_lock(&block_group->lock); 1353 1354 btrfs_space_info_update_bytes_pinned(fs_info, space_info, 1355 -block_group->pinned); 1356 space_info->bytes_readonly += block_group->pinned; 1357 percpu_counter_add_batch(&space_info->total_bytes_pinned, 1358 -block_group->pinned, 1359 BTRFS_TOTAL_BYTES_PINNED_BATCH); 1360 block_group->pinned = 0; 1361 1362 spin_unlock(&block_group->lock); 1363 spin_unlock(&space_info->lock); 1364 1365 /* DISCARD can flip during remount */ 1366 trimming = btrfs_test_opt(fs_info, DISCARD); 1367 1368 /* Implicit trim during transaction commit. */ 1369 if (trimming) 1370 btrfs_get_block_group_trimming(block_group); 1371 1372 /* 1373 * Btrfs_remove_chunk will abort the transaction if things go 1374 * horribly wrong. 1375 */ 1376 ret = btrfs_remove_chunk(trans, block_group->start); 1377 1378 if (ret) { 1379 if (trimming) 1380 btrfs_put_block_group_trimming(block_group); 1381 goto end_trans; 1382 } 1383 1384 /* 1385 * If we're not mounted with -odiscard, we can just forget 1386 * about this block group. Otherwise we'll need to wait 1387 * until transaction commit to do the actual discard. 
1388 */ 1389 if (trimming) { 1390 spin_lock(&fs_info->unused_bgs_lock); 1391 /* 1392 * A concurrent scrub might have added us to the list 1393 * fs_info->unused_bgs, so use a list_move operation 1394 * to add the block group to the deleted_bgs list. 1395 */ 1396 list_move(&block_group->bg_list, 1397 &trans->transaction->deleted_bgs); 1398 spin_unlock(&fs_info->unused_bgs_lock); 1399 btrfs_get_block_group(block_group); 1400 } 1401 end_trans: 1402 btrfs_end_transaction(trans); 1403 next: 1404 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 1405 btrfs_put_block_group(block_group); 1406 spin_lock(&fs_info->unused_bgs_lock); 1407 } 1408 spin_unlock(&fs_info->unused_bgs_lock); 1409 } 1410 1411 void btrfs_mark_bg_unused(struct btrfs_block_group *bg) 1412 { 1413 struct btrfs_fs_info *fs_info = bg->fs_info; 1414 1415 spin_lock(&fs_info->unused_bgs_lock); 1416 if (list_empty(&bg->bg_list)) { 1417 btrfs_get_block_group(bg); 1418 trace_btrfs_add_unused_block_group(bg); 1419 list_add_tail(&bg->bg_list, &fs_info->unused_bgs); 1420 } 1421 spin_unlock(&fs_info->unused_bgs_lock); 1422 } 1423 1424 static int find_first_block_group(struct btrfs_fs_info *fs_info, 1425 struct btrfs_path *path, 1426 struct btrfs_key *key) 1427 { 1428 struct btrfs_root *root = fs_info->extent_root; 1429 int ret = 0; 1430 struct btrfs_key found_key; 1431 struct extent_buffer *leaf; 1432 struct btrfs_block_group_item bg; 1433 u64 flags; 1434 int slot; 1435 1436 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1437 if (ret < 0) 1438 goto out; 1439 1440 while (1) { 1441 slot = path->slots[0]; 1442 leaf = path->nodes[0]; 1443 if (slot >= btrfs_header_nritems(leaf)) { 1444 ret = btrfs_next_leaf(root, path); 1445 if (ret == 0) 1446 continue; 1447 if (ret < 0) 1448 goto out; 1449 break; 1450 } 1451 btrfs_item_key_to_cpu(leaf, &found_key, slot); 1452 1453 if (found_key.objectid >= key->objectid && 1454 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 1455 struct extent_map_tree *em_tree; 1456 struct extent_map *em; 1457 1458 em_tree = &root->fs_info->mapping_tree; 1459 read_lock(&em_tree->lock); 1460 em = lookup_extent_mapping(em_tree, found_key.objectid, 1461 found_key.offset); 1462 read_unlock(&em_tree->lock); 1463 if (!em) { 1464 btrfs_err(fs_info, 1465 "logical %llu len %llu found bg but no related chunk", 1466 found_key.objectid, found_key.offset); 1467 ret = -ENOENT; 1468 } else if (em->start != found_key.objectid || 1469 em->len != found_key.offset) { 1470 btrfs_err(fs_info, 1471 "block group %llu len %llu mismatch with chunk %llu len %llu", 1472 found_key.objectid, found_key.offset, 1473 em->start, em->len); 1474 ret = -EUCLEAN; 1475 } else { 1476 read_extent_buffer(leaf, &bg, 1477 btrfs_item_ptr_offset(leaf, slot), 1478 sizeof(bg)); 1479 flags = btrfs_stack_block_group_flags(&bg) & 1480 BTRFS_BLOCK_GROUP_TYPE_MASK; 1481 1482 if (flags != (em->map_lookup->type & 1483 BTRFS_BLOCK_GROUP_TYPE_MASK)) { 1484 btrfs_err(fs_info, 1485 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", 1486 found_key.objectid, 1487 found_key.offset, flags, 1488 (BTRFS_BLOCK_GROUP_TYPE_MASK & 1489 em->map_lookup->type)); 1490 ret = -EUCLEAN; 1491 } else { 1492 ret = 0; 1493 } 1494 } 1495 free_extent_map(em); 1496 goto out; 1497 } 1498 path->slots[0]++; 1499 } 1500 out: 1501 return ret; 1502 } 1503 1504 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 1505 { 1506 u64 extra_flags = chunk_to_extended(flags) & 1507 BTRFS_EXTENDED_PROFILE_MASK; 1508 1509 write_seqlock(&fs_info->profiles_lock); 
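	/*
	 * Writers flip the avail_*_alloc_bits under this seqlock; lockless
	 * readers such as get_alloc_profile() pair it with a
	 * read_seqbegin()/read_seqretry() loop and simply retry if a writer
	 * raced with them.
	 */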
1510 if (flags & BTRFS_BLOCK_GROUP_DATA) 1511 fs_info->avail_data_alloc_bits |= extra_flags; 1512 if (flags & BTRFS_BLOCK_GROUP_METADATA) 1513 fs_info->avail_metadata_alloc_bits |= extra_flags; 1514 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 1515 fs_info->avail_system_alloc_bits |= extra_flags; 1516 write_sequnlock(&fs_info->profiles_lock); 1517 } 1518 1519 static int exclude_super_stripes(struct btrfs_block_group *cache) 1520 { 1521 struct btrfs_fs_info *fs_info = cache->fs_info; 1522 u64 bytenr; 1523 u64 *logical; 1524 int stripe_len; 1525 int i, nr, ret; 1526 1527 if (cache->start < BTRFS_SUPER_INFO_OFFSET) { 1528 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; 1529 cache->bytes_super += stripe_len; 1530 ret = btrfs_add_excluded_extent(fs_info, cache->start, 1531 stripe_len); 1532 if (ret) 1533 return ret; 1534 } 1535 1536 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 1537 bytenr = btrfs_sb_offset(i); 1538 ret = btrfs_rmap_block(fs_info, cache->start, 1539 bytenr, &logical, &nr, &stripe_len); 1540 if (ret) 1541 return ret; 1542 1543 while (nr--) { 1544 u64 start, len; 1545 1546 if (logical[nr] > cache->start + cache->length) 1547 continue; 1548 1549 if (logical[nr] + stripe_len <= cache->start) 1550 continue; 1551 1552 start = logical[nr]; 1553 if (start < cache->start) { 1554 start = cache->start; 1555 len = (logical[nr] + stripe_len) - start; 1556 } else { 1557 len = min_t(u64, stripe_len, 1558 cache->start + cache->length - start); 1559 } 1560 1561 cache->bytes_super += len; 1562 ret = btrfs_add_excluded_extent(fs_info, start, len); 1563 if (ret) { 1564 kfree(logical); 1565 return ret; 1566 } 1567 } 1568 1569 kfree(logical); 1570 } 1571 return 0; 1572 } 1573 1574 static void link_block_group(struct btrfs_block_group *cache) 1575 { 1576 struct btrfs_space_info *space_info = cache->space_info; 1577 int index = btrfs_bg_flags_to_raid_index(cache->flags); 1578 bool first = false; 1579 1580 down_write(&space_info->groups_sem); 1581 if (list_empty(&space_info->block_groups[index])) 1582 first = true; 1583 list_add_tail(&cache->list, &space_info->block_groups[index]); 1584 up_write(&space_info->groups_sem); 1585 1586 if (first) 1587 btrfs_sysfs_add_block_group_type(cache); 1588 } 1589 1590 static struct btrfs_block_group *btrfs_create_block_group_cache( 1591 struct btrfs_fs_info *fs_info, u64 start, u64 size) 1592 { 1593 struct btrfs_block_group *cache; 1594 1595 cache = kzalloc(sizeof(*cache), GFP_NOFS); 1596 if (!cache) 1597 return NULL; 1598 1599 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 1600 GFP_NOFS); 1601 if (!cache->free_space_ctl) { 1602 kfree(cache); 1603 return NULL; 1604 } 1605 1606 cache->start = start; 1607 cache->length = size; 1608 1609 cache->fs_info = fs_info; 1610 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 1611 set_free_space_tree_thresholds(cache); 1612 1613 atomic_set(&cache->count, 1); 1614 spin_lock_init(&cache->lock); 1615 init_rwsem(&cache->data_rwsem); 1616 INIT_LIST_HEAD(&cache->list); 1617 INIT_LIST_HEAD(&cache->cluster_list); 1618 INIT_LIST_HEAD(&cache->bg_list); 1619 INIT_LIST_HEAD(&cache->ro_list); 1620 INIT_LIST_HEAD(&cache->dirty_list); 1621 INIT_LIST_HEAD(&cache->io_list); 1622 btrfs_init_free_space_ctl(cache); 1623 atomic_set(&cache->trimming, 0); 1624 mutex_init(&cache->free_space_lock); 1625 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); 1626 1627 return cache; 1628 } 1629 1630 /* 1631 * Iterate all chunks and verify that each of them has the corresponding block 1632 * group 1633 */ 1634 
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 1635 { 1636 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 1637 struct extent_map *em; 1638 struct btrfs_block_group *bg; 1639 u64 start = 0; 1640 int ret = 0; 1641 1642 while (1) { 1643 read_lock(&map_tree->lock); 1644 /* 1645 * lookup_extent_mapping will return the first extent map 1646 * intersecting the range, so setting @len to 1 is enough to 1647 * get the first chunk. 1648 */ 1649 em = lookup_extent_mapping(map_tree, start, 1); 1650 read_unlock(&map_tree->lock); 1651 if (!em) 1652 break; 1653 1654 bg = btrfs_lookup_block_group(fs_info, em->start); 1655 if (!bg) { 1656 btrfs_err(fs_info, 1657 "chunk start=%llu len=%llu doesn't have corresponding block group", 1658 em->start, em->len); 1659 ret = -EUCLEAN; 1660 free_extent_map(em); 1661 break; 1662 } 1663 if (bg->start != em->start || bg->length != em->len || 1664 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 1665 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 1666 btrfs_err(fs_info, 1667 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 1668 em->start, em->len, 1669 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 1670 bg->start, bg->length, 1671 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 1672 ret = -EUCLEAN; 1673 free_extent_map(em); 1674 btrfs_put_block_group(bg); 1675 break; 1676 } 1677 start = em->start + em->len; 1678 free_extent_map(em); 1679 btrfs_put_block_group(bg); 1680 } 1681 return ret; 1682 } 1683 1684 static int read_one_block_group(struct btrfs_fs_info *info, 1685 struct btrfs_path *path, 1686 const struct btrfs_key *key, 1687 int need_clear) 1688 { 1689 struct extent_buffer *leaf = path->nodes[0]; 1690 struct btrfs_block_group *cache; 1691 struct btrfs_space_info *space_info; 1692 struct btrfs_block_group_item bgi; 1693 const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); 1694 int slot = path->slots[0]; 1695 int ret; 1696 1697 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); 1698 1699 cache = btrfs_create_block_group_cache(info, key->objectid, key->offset); 1700 if (!cache) 1701 return -ENOMEM; 1702 1703 if (need_clear) { 1704 /* 1705 * When we mount with old space cache, we need to 1706 * set BTRFS_DC_CLEAR and set dirty flag. 1707 * 1708 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 1709 * truncate the old free space cache inode and 1710 * setup a new one. 1711 * b) Setting 'dirty flag' makes sure that we flush 1712 * the new space cache info onto disk. 1713 */ 1714 if (btrfs_test_opt(info, SPACE_CACHE)) 1715 cache->disk_cache_state = BTRFS_DC_CLEAR; 1716 } 1717 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 1718 sizeof(bgi)); 1719 cache->used = btrfs_stack_block_group_used(&bgi); 1720 cache->flags = btrfs_stack_block_group_flags(&bgi); 1721 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 1722 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 1723 btrfs_err(info, 1724 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 1725 cache->start); 1726 ret = -EINVAL; 1727 goto error; 1728 } 1729 1730 /* 1731 * We need to exclude the super stripes now so that the space info has 1732 * super bytes accounted for, otherwise we'll think we have more space 1733 * than we actually do. 1734 */ 1735 ret = exclude_super_stripes(cache); 1736 if (ret) { 1737 /* We may have excluded something, so call this just in case. 
*/ 1738 btrfs_free_excluded_extents(cache); 1739 goto error; 1740 } 1741 1742 /* 1743 * Check for two cases, either we are full, and therefore don't need 1744 * to bother with the caching work since we won't find any space, or we 1745 * are empty, and we can just add all the space in and be done with it. 1746 * This saves us _a_lot_ of time, particularly in the full case. 1747 */ 1748 if (key->offset == cache->used) { 1749 cache->last_byte_to_unpin = (u64)-1; 1750 cache->cached = BTRFS_CACHE_FINISHED; 1751 btrfs_free_excluded_extents(cache); 1752 } else if (cache->used == 0) { 1753 cache->last_byte_to_unpin = (u64)-1; 1754 cache->cached = BTRFS_CACHE_FINISHED; 1755 add_new_free_space(cache, key->objectid, 1756 key->objectid + key->offset); 1757 btrfs_free_excluded_extents(cache); 1758 } 1759 1760 ret = btrfs_add_block_group_cache(info, cache); 1761 if (ret) { 1762 btrfs_remove_free_space_cache(cache); 1763 goto error; 1764 } 1765 trace_btrfs_add_block_group(info, cache, 0); 1766 btrfs_update_space_info(info, cache->flags, key->offset, 1767 cache->used, cache->bytes_super, &space_info); 1768 1769 cache->space_info = space_info; 1770 1771 link_block_group(cache); 1772 1773 set_avail_alloc_bits(info, cache->flags); 1774 if (btrfs_chunk_readonly(info, cache->start)) { 1775 inc_block_group_ro(cache, 1); 1776 } else if (cache->used == 0) { 1777 ASSERT(list_empty(&cache->bg_list)); 1778 btrfs_mark_bg_unused(cache); 1779 } 1780 return 0; 1781 error: 1782 btrfs_put_block_group(cache); 1783 return ret; 1784 } 1785 1786 int btrfs_read_block_groups(struct btrfs_fs_info *info) 1787 { 1788 struct btrfs_path *path; 1789 int ret; 1790 struct btrfs_block_group *cache; 1791 struct btrfs_space_info *space_info; 1792 struct btrfs_key key; 1793 int need_clear = 0; 1794 u64 cache_gen; 1795 1796 key.objectid = 0; 1797 key.offset = 0; 1798 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 1799 path = btrfs_alloc_path(); 1800 if (!path) 1801 return -ENOMEM; 1802 path->reada = READA_FORWARD; 1803 1804 cache_gen = btrfs_super_cache_generation(info->super_copy); 1805 if (btrfs_test_opt(info, SPACE_CACHE) && 1806 btrfs_super_generation(info->super_copy) != cache_gen) 1807 need_clear = 1; 1808 if (btrfs_test_opt(info, CLEAR_CACHE)) 1809 need_clear = 1; 1810 1811 while (1) { 1812 ret = find_first_block_group(info, path, &key); 1813 if (ret > 0) 1814 break; 1815 if (ret != 0) 1816 goto error; 1817 1818 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1819 ret = read_one_block_group(info, path, &key, need_clear); 1820 if (ret < 0) 1821 goto error; 1822 key.objectid += key.offset; 1823 key.offset = 0; 1824 btrfs_release_path(path); 1825 } 1826 1827 list_for_each_entry_rcu(space_info, &info->space_info, list) { 1828 if (!(btrfs_get_alloc_profile(info, space_info->flags) & 1829 (BTRFS_BLOCK_GROUP_RAID10 | 1830 BTRFS_BLOCK_GROUP_RAID1_MASK | 1831 BTRFS_BLOCK_GROUP_RAID56_MASK | 1832 BTRFS_BLOCK_GROUP_DUP))) 1833 continue; 1834 /* 1835 * Avoid allocating from un-mirrored block group if there are 1836 * mirrored block groups. 
1837 */ 1838 list_for_each_entry(cache, 1839 &space_info->block_groups[BTRFS_RAID_RAID0], 1840 list) 1841 inc_block_group_ro(cache, 1); 1842 list_for_each_entry(cache, 1843 &space_info->block_groups[BTRFS_RAID_SINGLE], 1844 list) 1845 inc_block_group_ro(cache, 1); 1846 } 1847 1848 btrfs_init_global_block_rsv(info); 1849 ret = check_chunk_block_group_mappings(info); 1850 error: 1851 btrfs_free_path(path); 1852 return ret; 1853 } 1854 1855 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 1856 { 1857 struct btrfs_fs_info *fs_info = trans->fs_info; 1858 struct btrfs_block_group *block_group; 1859 struct btrfs_root *extent_root = fs_info->extent_root; 1860 struct btrfs_block_group_item item; 1861 struct btrfs_key key; 1862 int ret = 0; 1863 1864 if (!trans->can_flush_pending_bgs) 1865 return; 1866 1867 while (!list_empty(&trans->new_bgs)) { 1868 block_group = list_first_entry(&trans->new_bgs, 1869 struct btrfs_block_group, 1870 bg_list); 1871 if (ret) 1872 goto next; 1873 1874 spin_lock(&block_group->lock); 1875 btrfs_set_stack_block_group_used(&item, block_group->used); 1876 btrfs_set_stack_block_group_chunk_objectid(&item, 1877 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 1878 btrfs_set_stack_block_group_flags(&item, block_group->flags); 1879 key.objectid = block_group->start; 1880 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 1881 key.offset = block_group->length; 1882 spin_unlock(&block_group->lock); 1883 1884 ret = btrfs_insert_item(trans, extent_root, &key, &item, 1885 sizeof(item)); 1886 if (ret) 1887 btrfs_abort_transaction(trans, ret); 1888 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); 1889 if (ret) 1890 btrfs_abort_transaction(trans, ret); 1891 add_block_group_free_space(trans, block_group); 1892 /* Already aborted the transaction if it failed. */ 1893 next: 1894 btrfs_delayed_refs_rsv_release(fs_info, 1); 1895 list_del_init(&block_group->bg_list); 1896 } 1897 btrfs_trans_release_chunk_metadata(trans); 1898 } 1899 1900 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, 1901 u64 type, u64 chunk_offset, u64 size) 1902 { 1903 struct btrfs_fs_info *fs_info = trans->fs_info; 1904 struct btrfs_block_group *cache; 1905 int ret; 1906 1907 btrfs_set_log_full_commit(trans); 1908 1909 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 1910 if (!cache) 1911 return -ENOMEM; 1912 1913 cache->used = bytes_used; 1914 cache->flags = type; 1915 cache->last_byte_to_unpin = (u64)-1; 1916 cache->cached = BTRFS_CACHE_FINISHED; 1917 cache->needs_free_space = 1; 1918 ret = exclude_super_stripes(cache); 1919 if (ret) { 1920 /* We may have excluded something, so call this just in case */ 1921 btrfs_free_excluded_extents(cache); 1922 btrfs_put_block_group(cache); 1923 return ret; 1924 } 1925 1926 add_new_free_space(cache, chunk_offset, chunk_offset + size); 1927 1928 btrfs_free_excluded_extents(cache); 1929 1930 #ifdef CONFIG_BTRFS_DEBUG 1931 if (btrfs_should_fragment_free_space(cache)) { 1932 u64 new_bytes_used = size - bytes_used; 1933 1934 bytes_used += new_bytes_used >> 1; 1935 fragment_free_space(cache); 1936 } 1937 #endif 1938 /* 1939 * Ensure the corresponding space_info object is created and 1940 * assigned to our block group. We want our bg to be added to the rbtree 1941 * with its ->space_info set. 
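/*
 * Illustrative sketch, not part of the original file: filling a block group
 * item and its key the same way btrfs_create_pending_block_groups() does
 * above.  The caller is expected to hold block_group->lock; the helper name
 * fill_block_group_item() is hypothetical.
 */
static void fill_block_group_item(struct btrfs_block_group *block_group,
				  struct btrfs_block_group_item *item,
				  struct btrfs_key *key)
{
	btrfs_set_stack_block_group_used(item, block_group->used);
	btrfs_set_stack_block_group_chunk_objectid(item,
					BTRFS_FIRST_CHUNK_TREE_OBJECTID);
	btrfs_set_stack_block_group_flags(item, block_group->flags);
	key->objectid = block_group->start;
	key->type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	key->offset = block_group->length;
}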
1942 */ 1943 cache->space_info = btrfs_find_space_info(fs_info, cache->flags); 1944 ASSERT(cache->space_info); 1945 1946 ret = btrfs_add_block_group_cache(fs_info, cache); 1947 if (ret) { 1948 btrfs_remove_free_space_cache(cache); 1949 btrfs_put_block_group(cache); 1950 return ret; 1951 } 1952 1953 /* 1954 * Now that our block group has its ->space_info set and is inserted in 1955 * the rbtree, update the space info's counters. 1956 */ 1957 trace_btrfs_add_block_group(fs_info, cache, 1); 1958 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, 1959 cache->bytes_super, &cache->space_info); 1960 btrfs_update_global_block_rsv(fs_info); 1961 1962 link_block_group(cache); 1963 1964 list_add_tail(&cache->bg_list, &trans->new_bgs); 1965 trans->delayed_ref_updates++; 1966 btrfs_update_delayed_refs_rsv(trans); 1967 1968 set_avail_alloc_bits(fs_info, type); 1969 return 0; 1970 } 1971 1972 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 1973 { 1974 u64 num_devices; 1975 u64 stripped; 1976 1977 /* 1978 * if restripe for this chunk_type is on pick target profile and 1979 * return, otherwise do the usual balance 1980 */ 1981 stripped = get_restripe_target(fs_info, flags); 1982 if (stripped) 1983 return extended_to_chunk(stripped); 1984 1985 num_devices = fs_info->fs_devices->rw_devices; 1986 1987 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | 1988 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; 1989 1990 if (num_devices == 1) { 1991 stripped |= BTRFS_BLOCK_GROUP_DUP; 1992 stripped = flags & ~stripped; 1993 1994 /* turn raid0 into single device chunks */ 1995 if (flags & BTRFS_BLOCK_GROUP_RAID0) 1996 return stripped; 1997 1998 /* turn mirroring into duplication */ 1999 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | 2000 BTRFS_BLOCK_GROUP_RAID10)) 2001 return stripped | BTRFS_BLOCK_GROUP_DUP; 2002 } else { 2003 /* they already had raid on here, just return */ 2004 if (flags & stripped) 2005 return flags; 2006 2007 stripped |= BTRFS_BLOCK_GROUP_DUP; 2008 stripped = flags & ~stripped; 2009 2010 /* switch duplicated blocks with raid1 */ 2011 if (flags & BTRFS_BLOCK_GROUP_DUP) 2012 return stripped | BTRFS_BLOCK_GROUP_RAID1; 2013 2014 /* this is drive concat, leave it alone */ 2015 } 2016 2017 return flags; 2018 } 2019 2020 /* 2021 * Mark one block group RO, can be called several times for the same block 2022 * group. 2023 * 2024 * @cache: the destination block group 2025 * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to 2026 * ensure we still have some free space after marking this 2027 * block group RO. 2028 */ 2029 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, 2030 bool do_chunk_alloc) 2031 { 2032 struct btrfs_fs_info *fs_info = cache->fs_info; 2033 struct btrfs_trans_handle *trans; 2034 u64 alloc_flags; 2035 int ret; 2036 2037 again: 2038 trans = btrfs_join_transaction(fs_info->extent_root); 2039 if (IS_ERR(trans)) 2040 return PTR_ERR(trans); 2041 2042 /* 2043 * we're not allowed to set block groups readonly after the dirty 2044 * block groups cache has started writing. 
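/*
 * Illustrative sketch, not part of the original file: the mark-RO, do work,
 * clear-RO pattern a caller of btrfs_inc_block_group_ro() would follow.
 * ro_protected_work() and the work_fn callback are hypothetical.
 */
static int ro_protected_work(struct btrfs_block_group *bg,
			     int (*work_fn)(struct btrfs_block_group *bg))
{
	int ret;

	/* Pre-allocate a replacement chunk so some writable space remains */
	ret = btrfs_inc_block_group_ro(bg, true);
	if (ret)
		return ret;

	ret = work_fn(bg);

	btrfs_dec_block_group_ro(bg);
	return ret;
}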
If it already started, 2045 * back off and let this transaction commit 2046 */ 2047 mutex_lock(&fs_info->ro_block_group_mutex); 2048 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 2049 u64 transid = trans->transid; 2050 2051 mutex_unlock(&fs_info->ro_block_group_mutex); 2052 btrfs_end_transaction(trans); 2053 2054 ret = btrfs_wait_for_commit(fs_info, transid); 2055 if (ret) 2056 return ret; 2057 goto again; 2058 } 2059 2060 if (do_chunk_alloc) { 2061 /* 2062 * If we are changing raid levels, try to allocate a 2063 * corresponding block group with the new raid level. 2064 */ 2065 alloc_flags = update_block_group_flags(fs_info, cache->flags); 2066 if (alloc_flags != cache->flags) { 2067 ret = btrfs_chunk_alloc(trans, alloc_flags, 2068 CHUNK_ALLOC_FORCE); 2069 /* 2070 * ENOSPC is allowed here, we may have enough space 2071 * already allocated at the new raid level to carry on 2072 */ 2073 if (ret == -ENOSPC) 2074 ret = 0; 2075 if (ret < 0) 2076 goto out; 2077 } 2078 } 2079 2080 ret = inc_block_group_ro(cache, !do_chunk_alloc); 2081 if (!do_chunk_alloc) 2082 goto unlock_out; 2083 if (!ret) 2084 goto out; 2085 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); 2086 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 2087 if (ret < 0) 2088 goto out; 2089 ret = inc_block_group_ro(cache, 0); 2090 out: 2091 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 2092 alloc_flags = update_block_group_flags(fs_info, cache->flags); 2093 mutex_lock(&fs_info->chunk_mutex); 2094 check_system_chunk(trans, alloc_flags); 2095 mutex_unlock(&fs_info->chunk_mutex); 2096 } 2097 unlock_out: 2098 mutex_unlock(&fs_info->ro_block_group_mutex); 2099 2100 btrfs_end_transaction(trans); 2101 return ret; 2102 } 2103 2104 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 2105 { 2106 struct btrfs_space_info *sinfo = cache->space_info; 2107 u64 num_bytes; 2108 2109 BUG_ON(!cache->ro); 2110 2111 spin_lock(&sinfo->lock); 2112 spin_lock(&cache->lock); 2113 if (!--cache->ro) { 2114 num_bytes = cache->length - cache->reserved - 2115 cache->pinned - cache->bytes_super - cache->used; 2116 sinfo->bytes_readonly -= num_bytes; 2117 list_del_init(&cache->ro_list); 2118 } 2119 spin_unlock(&cache->lock); 2120 spin_unlock(&sinfo->lock); 2121 } 2122 2123 static int write_one_cache_group(struct btrfs_trans_handle *trans, 2124 struct btrfs_path *path, 2125 struct btrfs_block_group *cache) 2126 { 2127 struct btrfs_fs_info *fs_info = trans->fs_info; 2128 int ret; 2129 struct btrfs_root *extent_root = fs_info->extent_root; 2130 unsigned long bi; 2131 struct extent_buffer *leaf; 2132 struct btrfs_block_group_item bgi; 2133 struct btrfs_key key; 2134 2135 key.objectid = cache->start; 2136 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2137 key.offset = cache->length; 2138 2139 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); 2140 if (ret) { 2141 if (ret > 0) 2142 ret = -ENOENT; 2143 goto fail; 2144 } 2145 2146 leaf = path->nodes[0]; 2147 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2148 btrfs_set_stack_block_group_used(&bgi, cache->used); 2149 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2150 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2151 btrfs_set_stack_block_group_flags(&bgi, cache->flags); 2152 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 2153 btrfs_mark_buffer_dirty(leaf); 2154 fail: 2155 btrfs_release_path(path); 2156 return ret; 2157 2158 } 2159 2160 static int cache_save_setup(struct btrfs_block_group *block_group, 2161 struct btrfs_trans_handle *trans, 2162 struct 
btrfs_path *path) 2163 { 2164 struct btrfs_fs_info *fs_info = block_group->fs_info; 2165 struct btrfs_root *root = fs_info->tree_root; 2166 struct inode *inode = NULL; 2167 struct extent_changeset *data_reserved = NULL; 2168 u64 alloc_hint = 0; 2169 int dcs = BTRFS_DC_ERROR; 2170 u64 num_pages = 0; 2171 int retries = 0; 2172 int ret = 0; 2173 2174 /* 2175 * If this block group is smaller than 100 megs don't bother caching the 2176 * block group. 2177 */ 2178 if (block_group->length < (100 * SZ_1M)) { 2179 spin_lock(&block_group->lock); 2180 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 2181 spin_unlock(&block_group->lock); 2182 return 0; 2183 } 2184 2185 if (trans->aborted) 2186 return 0; 2187 again: 2188 inode = lookup_free_space_inode(block_group, path); 2189 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 2190 ret = PTR_ERR(inode); 2191 btrfs_release_path(path); 2192 goto out; 2193 } 2194 2195 if (IS_ERR(inode)) { 2196 BUG_ON(retries); 2197 retries++; 2198 2199 if (block_group->ro) 2200 goto out_free; 2201 2202 ret = create_free_space_inode(trans, block_group, path); 2203 if (ret) 2204 goto out_free; 2205 goto again; 2206 } 2207 2208 /* 2209 * We want to set the generation to 0, that way if anything goes wrong 2210 * from here on out we know not to trust this cache when we load up next 2211 * time. 2212 */ 2213 BTRFS_I(inode)->generation = 0; 2214 ret = btrfs_update_inode(trans, root, inode); 2215 if (ret) { 2216 /* 2217 * So theoretically we could recover from this, simply set the 2218 * super cache generation to 0 so we know to invalidate the 2219 * cache, but then we'd have to keep track of the block groups 2220 * that fail this way so we know we _have_ to reset this cache 2221 * before the next commit or risk reading stale cache. So to 2222 * limit our exposure to horrible edge cases lets just abort the 2223 * transaction, this only happens in really bad situations 2224 * anyway. 2225 */ 2226 btrfs_abort_transaction(trans, ret); 2227 goto out_put; 2228 } 2229 WARN_ON(ret); 2230 2231 /* We've already setup this transaction, go ahead and exit */ 2232 if (block_group->cache_generation == trans->transid && 2233 i_size_read(inode)) { 2234 dcs = BTRFS_DC_SETUP; 2235 goto out_put; 2236 } 2237 2238 if (i_size_read(inode) > 0) { 2239 ret = btrfs_check_trunc_cache_free_space(fs_info, 2240 &fs_info->global_block_rsv); 2241 if (ret) 2242 goto out_put; 2243 2244 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 2245 if (ret) 2246 goto out_put; 2247 } 2248 2249 spin_lock(&block_group->lock); 2250 if (block_group->cached != BTRFS_CACHE_FINISHED || 2251 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 2252 /* 2253 * don't bother trying to write stuff out _if_ 2254 * a) we're not cached, 2255 * b) we're with nospace_cache mount option, 2256 * c) we're with v2 space_cache (FREE_SPACE_TREE). 2257 */ 2258 dcs = BTRFS_DC_WRITTEN; 2259 spin_unlock(&block_group->lock); 2260 goto out_put; 2261 } 2262 spin_unlock(&block_group->lock); 2263 2264 /* 2265 * We hit an ENOSPC when setting up the cache in this transaction, just 2266 * skip doing the setup, we've already cleared the cache so we're safe. 2267 */ 2268 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 2269 ret = -ENOSPC; 2270 goto out_put; 2271 } 2272 2273 /* 2274 * Try to preallocate enough space based on how big the block group is. 2275 * Keep in mind this has to include any pinned space which could end up 2276 * taking up quite a bit since it's not folded into the other space 2277 * cache. 
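/*
 * Illustrative sketch, not part of the original file: the preallocation size
 * computed below (16 pages per 256M of block group length, with a minimum of
 * 16 pages), expressed as a helper.  space_cache_prealloc_bytes() is a
 * hypothetical name.
 */
static u64 space_cache_prealloc_bytes(const struct btrfs_block_group *block_group)
{
	u64 num_pages;

	num_pages = div_u64(block_group->length, SZ_256M);
	if (!num_pages)
		num_pages = 1;
	num_pages *= 16;

	return num_pages * PAGE_SIZE;
}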
2278 */ 2279 num_pages = div_u64(block_group->length, SZ_256M); 2280 if (!num_pages) 2281 num_pages = 1; 2282 2283 num_pages *= 16; 2284 num_pages *= PAGE_SIZE; 2285 2286 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); 2287 if (ret) 2288 goto out_put; 2289 2290 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 2291 num_pages, num_pages, 2292 &alloc_hint); 2293 /* 2294 * Our cache requires contiguous chunks so that we don't modify a bunch 2295 * of metadata or split extents when writing the cache out, which means 2296 * we can enospc if we are heavily fragmented in addition to just normal 2297 * out of space conditions. So if we hit this just skip setting up any 2298 * other block groups for this transaction, maybe we'll unpin enough 2299 * space the next time around. 2300 */ 2301 if (!ret) 2302 dcs = BTRFS_DC_SETUP; 2303 else if (ret == -ENOSPC) 2304 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 2305 2306 out_put: 2307 iput(inode); 2308 out_free: 2309 btrfs_release_path(path); 2310 out: 2311 spin_lock(&block_group->lock); 2312 if (!ret && dcs == BTRFS_DC_SETUP) 2313 block_group->cache_generation = trans->transid; 2314 block_group->disk_cache_state = dcs; 2315 spin_unlock(&block_group->lock); 2316 2317 extent_changeset_free(data_reserved); 2318 return ret; 2319 } 2320 2321 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) 2322 { 2323 struct btrfs_fs_info *fs_info = trans->fs_info; 2324 struct btrfs_block_group *cache, *tmp; 2325 struct btrfs_transaction *cur_trans = trans->transaction; 2326 struct btrfs_path *path; 2327 2328 if (list_empty(&cur_trans->dirty_bgs) || 2329 !btrfs_test_opt(fs_info, SPACE_CACHE)) 2330 return 0; 2331 2332 path = btrfs_alloc_path(); 2333 if (!path) 2334 return -ENOMEM; 2335 2336 /* Could add new block groups, use _safe just in case */ 2337 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 2338 dirty_list) { 2339 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 2340 cache_save_setup(cache, trans, path); 2341 } 2342 2343 btrfs_free_path(path); 2344 return 0; 2345 } 2346 2347 /* 2348 * Transaction commit does final block group cache writeback during a critical 2349 * section where nothing is allowed to change the FS. This is required in 2350 * order for the cache to actually match the block group, but can introduce a 2351 * lot of latency into the commit. 2352 * 2353 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO. 2354 * There's a chance we'll have to redo some of it if the block group changes 2355 * again during the commit, but it greatly reduces the commit latency by 2356 * getting rid of the easy block groups while we're still allowing others to 2357 * join the commit. 
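/*
 * Illustrative sketch, not part of the original file: the two-phase ordering
 * described above.  A heavily simplified, hypothetical commit path calls
 * btrfs_start_dirty_block_groups() early, while other tasks may still join
 * the transaction, and btrfs_write_dirty_block_groups() later, from the
 * commit critical section.
 */
static int commit_block_group_caches_example(struct btrfs_trans_handle *trans)
{
	int ret;

	/* Outside the critical section: kick off the easy cache IO */
	ret = btrfs_start_dirty_block_groups(trans);
	if (ret)
		return ret;

	/* ... later, inside the critical section: finish the rest */
	return btrfs_write_dirty_block_groups(trans);
}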
2358 */ 2359 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 2360 { 2361 struct btrfs_fs_info *fs_info = trans->fs_info; 2362 struct btrfs_block_group *cache; 2363 struct btrfs_transaction *cur_trans = trans->transaction; 2364 int ret = 0; 2365 int should_put; 2366 struct btrfs_path *path = NULL; 2367 LIST_HEAD(dirty); 2368 struct list_head *io = &cur_trans->io_bgs; 2369 int num_started = 0; 2370 int loops = 0; 2371 2372 spin_lock(&cur_trans->dirty_bgs_lock); 2373 if (list_empty(&cur_trans->dirty_bgs)) { 2374 spin_unlock(&cur_trans->dirty_bgs_lock); 2375 return 0; 2376 } 2377 list_splice_init(&cur_trans->dirty_bgs, &dirty); 2378 spin_unlock(&cur_trans->dirty_bgs_lock); 2379 2380 again: 2381 /* Make sure all the block groups on our dirty list actually exist */ 2382 btrfs_create_pending_block_groups(trans); 2383 2384 if (!path) { 2385 path = btrfs_alloc_path(); 2386 if (!path) 2387 return -ENOMEM; 2388 } 2389 2390 /* 2391 * cache_write_mutex is here only to save us from balance or automatic 2392 * removal of empty block groups deleting this block group while we are 2393 * writing out the cache 2394 */ 2395 mutex_lock(&trans->transaction->cache_write_mutex); 2396 while (!list_empty(&dirty)) { 2397 bool drop_reserve = true; 2398 2399 cache = list_first_entry(&dirty, struct btrfs_block_group, 2400 dirty_list); 2401 /* 2402 * This can happen if something re-dirties a block group that 2403 * is already under IO. Just wait for it to finish and then do 2404 * it all again 2405 */ 2406 if (!list_empty(&cache->io_list)) { 2407 list_del_init(&cache->io_list); 2408 btrfs_wait_cache_io(trans, cache, path); 2409 btrfs_put_block_group(cache); 2410 } 2411 2412 2413 /* 2414 * btrfs_wait_cache_io uses the cache->dirty_list to decide if 2415 * it should update the cache_state. Don't delete until after 2416 * we wait. 2417 * 2418 * Since we're not running in the commit critical section 2419 * we need the dirty_bgs_lock to protect from update_block_group 2420 */ 2421 spin_lock(&cur_trans->dirty_bgs_lock); 2422 list_del_init(&cache->dirty_list); 2423 spin_unlock(&cur_trans->dirty_bgs_lock); 2424 2425 should_put = 1; 2426 2427 cache_save_setup(cache, trans, path); 2428 2429 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 2430 cache->io_ctl.inode = NULL; 2431 ret = btrfs_write_out_cache(trans, cache, path); 2432 if (ret == 0 && cache->io_ctl.inode) { 2433 num_started++; 2434 should_put = 0; 2435 2436 /* 2437 * The cache_write_mutex is protecting the 2438 * io_list, also refer to the definition of 2439 * btrfs_transaction::io_bgs for more details 2440 */ 2441 list_add_tail(&cache->io_list, io); 2442 } else { 2443 /* 2444 * If we failed to write the cache, the 2445 * generation will be bad and life goes on 2446 */ 2447 ret = 0; 2448 } 2449 } 2450 if (!ret) { 2451 ret = write_one_cache_group(trans, path, cache); 2452 /* 2453 * Our block group might still be attached to the list 2454 * of new block groups in the transaction handle of some 2455 * other task (struct btrfs_trans_handle->new_bgs). This 2456 * means its block group item isn't yet in the extent 2457 * tree. If this happens ignore the error, as we will 2458 * try again later in the critical section of the 2459 * transaction commit. 
2460 */ 2461 if (ret == -ENOENT) { 2462 ret = 0; 2463 spin_lock(&cur_trans->dirty_bgs_lock); 2464 if (list_empty(&cache->dirty_list)) { 2465 list_add_tail(&cache->dirty_list, 2466 &cur_trans->dirty_bgs); 2467 btrfs_get_block_group(cache); 2468 drop_reserve = false; 2469 } 2470 spin_unlock(&cur_trans->dirty_bgs_lock); 2471 } else if (ret) { 2472 btrfs_abort_transaction(trans, ret); 2473 } 2474 } 2475 2476 /* If it's not on the io list, we need to put the block group */ 2477 if (should_put) 2478 btrfs_put_block_group(cache); 2479 if (drop_reserve) 2480 btrfs_delayed_refs_rsv_release(fs_info, 1); 2481 2482 if (ret) 2483 break; 2484 2485 /* 2486 * Avoid blocking other tasks for too long. It might even save 2487 * us from writing caches for block groups that are going to be 2488 * removed. 2489 */ 2490 mutex_unlock(&trans->transaction->cache_write_mutex); 2491 mutex_lock(&trans->transaction->cache_write_mutex); 2492 } 2493 mutex_unlock(&trans->transaction->cache_write_mutex); 2494 2495 /* 2496 * Go through delayed refs for all the stuff we've just kicked off 2497 * and then loop back (just once) 2498 */ 2499 ret = btrfs_run_delayed_refs(trans, 0); 2500 if (!ret && loops == 0) { 2501 loops++; 2502 spin_lock(&cur_trans->dirty_bgs_lock); 2503 list_splice_init(&cur_trans->dirty_bgs, &dirty); 2504 /* 2505 * dirty_bgs_lock protects us from concurrent block group 2506 * deletes too (not just cache_write_mutex). 2507 */ 2508 if (!list_empty(&dirty)) { 2509 spin_unlock(&cur_trans->dirty_bgs_lock); 2510 goto again; 2511 } 2512 spin_unlock(&cur_trans->dirty_bgs_lock); 2513 } else if (ret < 0) { 2514 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 2515 } 2516 2517 btrfs_free_path(path); 2518 return ret; 2519 } 2520 2521 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) 2522 { 2523 struct btrfs_fs_info *fs_info = trans->fs_info; 2524 struct btrfs_block_group *cache; 2525 struct btrfs_transaction *cur_trans = trans->transaction; 2526 int ret = 0; 2527 int should_put; 2528 struct btrfs_path *path; 2529 struct list_head *io = &cur_trans->io_bgs; 2530 int num_started = 0; 2531 2532 path = btrfs_alloc_path(); 2533 if (!path) 2534 return -ENOMEM; 2535 2536 /* 2537 * Even though we are in the critical section of the transaction commit, 2538 * we can still have concurrent tasks adding elements to this 2539 * transaction's list of dirty block groups. These tasks correspond to 2540 * endio free space workers started when writeback finishes for a 2541 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 2542 * allocate new block groups as a result of COWing nodes of the root 2543 * tree when updating the free space inode. The writeback for the space 2544 * caches is triggered by an earlier call to 2545 * btrfs_start_dirty_block_groups() and iterations of the following 2546 * loop. 2547 * Also we want to do the cache_save_setup first and then run the 2548 * delayed refs to make sure we have the best chance at doing this all 2549 * in one shot. 2550 */ 2551 spin_lock(&cur_trans->dirty_bgs_lock); 2552 while (!list_empty(&cur_trans->dirty_bgs)) { 2553 cache = list_first_entry(&cur_trans->dirty_bgs, 2554 struct btrfs_block_group, 2555 dirty_list); 2556 2557 /* 2558 * This can happen if cache_save_setup re-dirties a block group 2559 * that is already under IO. 
Just wait for it to finish and 2560 * then do it all again 2561 */ 2562 if (!list_empty(&cache->io_list)) { 2563 spin_unlock(&cur_trans->dirty_bgs_lock); 2564 list_del_init(&cache->io_list); 2565 btrfs_wait_cache_io(trans, cache, path); 2566 btrfs_put_block_group(cache); 2567 spin_lock(&cur_trans->dirty_bgs_lock); 2568 } 2569 2570 /* 2571 * Don't remove from the dirty list until after we've waited on 2572 * any pending IO 2573 */ 2574 list_del_init(&cache->dirty_list); 2575 spin_unlock(&cur_trans->dirty_bgs_lock); 2576 should_put = 1; 2577 2578 cache_save_setup(cache, trans, path); 2579 2580 if (!ret) 2581 ret = btrfs_run_delayed_refs(trans, 2582 (unsigned long) -1); 2583 2584 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 2585 cache->io_ctl.inode = NULL; 2586 ret = btrfs_write_out_cache(trans, cache, path); 2587 if (ret == 0 && cache->io_ctl.inode) { 2588 num_started++; 2589 should_put = 0; 2590 list_add_tail(&cache->io_list, io); 2591 } else { 2592 /* 2593 * If we failed to write the cache, the 2594 * generation will be bad and life goes on 2595 */ 2596 ret = 0; 2597 } 2598 } 2599 if (!ret) { 2600 ret = write_one_cache_group(trans, path, cache); 2601 /* 2602 * One of the free space endio workers might have 2603 * created a new block group while updating a free space 2604 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 2605 * and hasn't released its transaction handle yet, in 2606 * which case the new block group is still attached to 2607 * its transaction handle and its creation has not 2608 * finished yet (no block group item in the extent tree 2609 * yet, etc). If this is the case, wait for all free 2610 * space endio workers to finish and retry. This is a 2611 * a very rare case so no need for a more efficient and 2612 * complex approach. 
2613 */ 2614 if (ret == -ENOENT) { 2615 wait_event(cur_trans->writer_wait, 2616 atomic_read(&cur_trans->num_writers) == 1); 2617 ret = write_one_cache_group(trans, path, cache); 2618 } 2619 if (ret) 2620 btrfs_abort_transaction(trans, ret); 2621 } 2622 2623 /* If its not on the io list, we need to put the block group */ 2624 if (should_put) 2625 btrfs_put_block_group(cache); 2626 btrfs_delayed_refs_rsv_release(fs_info, 1); 2627 spin_lock(&cur_trans->dirty_bgs_lock); 2628 } 2629 spin_unlock(&cur_trans->dirty_bgs_lock); 2630 2631 /* 2632 * Refer to the definition of io_bgs member for details why it's safe 2633 * to use it without any locking 2634 */ 2635 while (!list_empty(io)) { 2636 cache = list_first_entry(io, struct btrfs_block_group, 2637 io_list); 2638 list_del_init(&cache->io_list); 2639 btrfs_wait_cache_io(trans, cache, path); 2640 btrfs_put_block_group(cache); 2641 } 2642 2643 btrfs_free_path(path); 2644 return ret; 2645 } 2646 2647 int btrfs_update_block_group(struct btrfs_trans_handle *trans, 2648 u64 bytenr, u64 num_bytes, int alloc) 2649 { 2650 struct btrfs_fs_info *info = trans->fs_info; 2651 struct btrfs_block_group *cache = NULL; 2652 u64 total = num_bytes; 2653 u64 old_val; 2654 u64 byte_in_group; 2655 int factor; 2656 int ret = 0; 2657 2658 /* Block accounting for super block */ 2659 spin_lock(&info->delalloc_root_lock); 2660 old_val = btrfs_super_bytes_used(info->super_copy); 2661 if (alloc) 2662 old_val += num_bytes; 2663 else 2664 old_val -= num_bytes; 2665 btrfs_set_super_bytes_used(info->super_copy, old_val); 2666 spin_unlock(&info->delalloc_root_lock); 2667 2668 while (total) { 2669 cache = btrfs_lookup_block_group(info, bytenr); 2670 if (!cache) { 2671 ret = -ENOENT; 2672 break; 2673 } 2674 factor = btrfs_bg_type_to_factor(cache->flags); 2675 2676 /* 2677 * If this block group has free space cache written out, we 2678 * need to make sure to load it if we are removing space. This 2679 * is because we need the unpinning stage to actually add the 2680 * space back to the block group, otherwise we will leak space. 
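/*
 * Illustrative sketch, not part of the original file: how allocation and free
 * paths account their bytes through btrfs_update_block_group().  The wrapper
 * names are hypothetical.
 */
static int account_extent_alloc(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes)
{
	/* alloc == 1: move the bytes from reserved to used */
	return btrfs_update_block_group(trans, bytenr, num_bytes, 1);
}

static int account_extent_free(struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes)
{
	/* alloc == 0: move the bytes from used to pinned until unpin time */
	return btrfs_update_block_group(trans, bytenr, num_bytes, 0);
}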
2681 */ 2682 if (!alloc && !btrfs_block_group_done(cache)) 2683 btrfs_cache_block_group(cache, 1); 2684 2685 byte_in_group = bytenr - cache->start; 2686 WARN_ON(byte_in_group > cache->length); 2687 2688 spin_lock(&cache->space_info->lock); 2689 spin_lock(&cache->lock); 2690 2691 if (btrfs_test_opt(info, SPACE_CACHE) && 2692 cache->disk_cache_state < BTRFS_DC_CLEAR) 2693 cache->disk_cache_state = BTRFS_DC_CLEAR; 2694 2695 old_val = cache->used; 2696 num_bytes = min(total, cache->length - byte_in_group); 2697 if (alloc) { 2698 old_val += num_bytes; 2699 cache->used = old_val; 2700 cache->reserved -= num_bytes; 2701 cache->space_info->bytes_reserved -= num_bytes; 2702 cache->space_info->bytes_used += num_bytes; 2703 cache->space_info->disk_used += num_bytes * factor; 2704 spin_unlock(&cache->lock); 2705 spin_unlock(&cache->space_info->lock); 2706 } else { 2707 old_val -= num_bytes; 2708 cache->used = old_val; 2709 cache->pinned += num_bytes; 2710 btrfs_space_info_update_bytes_pinned(info, 2711 cache->space_info, num_bytes); 2712 cache->space_info->bytes_used -= num_bytes; 2713 cache->space_info->disk_used -= num_bytes * factor; 2714 spin_unlock(&cache->lock); 2715 spin_unlock(&cache->space_info->lock); 2716 2717 percpu_counter_add_batch( 2718 &cache->space_info->total_bytes_pinned, 2719 num_bytes, 2720 BTRFS_TOTAL_BYTES_PINNED_BATCH); 2721 set_extent_dirty(info->pinned_extents, 2722 bytenr, bytenr + num_bytes - 1, 2723 GFP_NOFS | __GFP_NOFAIL); 2724 } 2725 2726 spin_lock(&trans->transaction->dirty_bgs_lock); 2727 if (list_empty(&cache->dirty_list)) { 2728 list_add_tail(&cache->dirty_list, 2729 &trans->transaction->dirty_bgs); 2730 trans->delayed_ref_updates++; 2731 btrfs_get_block_group(cache); 2732 } 2733 spin_unlock(&trans->transaction->dirty_bgs_lock); 2734 2735 /* 2736 * No longer have used bytes in this block group, queue it for 2737 * deletion. We do this after adding the block group to the 2738 * dirty list to avoid races between cleaner kthread and space 2739 * cache writeout. 2740 */ 2741 if (!alloc && old_val == 0) 2742 btrfs_mark_bg_unused(cache); 2743 2744 btrfs_put_block_group(cache); 2745 total -= num_bytes; 2746 bytenr += num_bytes; 2747 } 2748 2749 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 2750 btrfs_update_delayed_refs_rsv(trans); 2751 return ret; 2752 } 2753 2754 /** 2755 * btrfs_add_reserved_bytes - update the block_group and space info counters 2756 * @cache: The cache we are manipulating 2757 * @ram_bytes: The number of bytes of file content, and will be same to 2758 * @num_bytes except for the compress path. 2759 * @num_bytes: The number of bytes in question 2760 * @delalloc: The blocks are allocated for the delalloc write 2761 * 2762 * This is called by the allocator when it reserves space. If this is a 2763 * reservation and the block group has become read only we cannot make the 2764 * reservation and return -EAGAIN, otherwise this function always succeeds. 
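/*
 * Illustrative sketch, not part of the original file: pairing
 * btrfs_add_reserved_bytes() with btrfs_free_reserved_bytes() (both below)
 * for a reservation that is given back unused.  The helper name is
 * hypothetical.
 */
static int reserve_then_release_example(struct btrfs_block_group *cache,
					u64 num_bytes, int delalloc)
{
	int ret;

	ret = btrfs_add_reserved_bytes(cache, num_bytes, num_bytes, delalloc);
	if (ret)
		return ret;	/* -EAGAIN: the block group went read only */

	/* The caller changed its mind, give the reservation back */
	btrfs_free_reserved_bytes(cache, num_bytes, delalloc);
	return 0;
}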
2765 */ 2766 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, 2767 u64 ram_bytes, u64 num_bytes, int delalloc) 2768 { 2769 struct btrfs_space_info *space_info = cache->space_info; 2770 int ret = 0; 2771 2772 spin_lock(&space_info->lock); 2773 spin_lock(&cache->lock); 2774 if (cache->ro) { 2775 ret = -EAGAIN; 2776 } else { 2777 cache->reserved += num_bytes; 2778 space_info->bytes_reserved += num_bytes; 2779 trace_btrfs_space_reservation(cache->fs_info, "space_info", 2780 space_info->flags, num_bytes, 1); 2781 btrfs_space_info_update_bytes_may_use(cache->fs_info, 2782 space_info, -ram_bytes); 2783 if (delalloc) 2784 cache->delalloc_bytes += num_bytes; 2785 } 2786 spin_unlock(&cache->lock); 2787 spin_unlock(&space_info->lock); 2788 return ret; 2789 } 2790 2791 /** 2792 * btrfs_free_reserved_bytes - update the block_group and space info counters 2793 * @cache: The cache we are manipulating 2794 * @num_bytes: The number of bytes in question 2795 * @delalloc: The blocks are allocated for the delalloc write 2796 * 2797 * This is called by somebody who is freeing space that was never actually used 2798 * on disk. For example if you reserve some space for a new leaf in transaction 2799 * A and before transaction A commits you free that leaf, you call this with 2800 * reserve set to 0 in order to clear the reservation. 2801 */ 2802 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, 2803 u64 num_bytes, int delalloc) 2804 { 2805 struct btrfs_space_info *space_info = cache->space_info; 2806 2807 spin_lock(&space_info->lock); 2808 spin_lock(&cache->lock); 2809 if (cache->ro) 2810 space_info->bytes_readonly += num_bytes; 2811 cache->reserved -= num_bytes; 2812 space_info->bytes_reserved -= num_bytes; 2813 space_info->max_extent_size = 0; 2814 2815 if (delalloc) 2816 cache->delalloc_bytes -= num_bytes; 2817 spin_unlock(&cache->lock); 2818 spin_unlock(&space_info->lock); 2819 } 2820 2821 static void force_metadata_allocation(struct btrfs_fs_info *info) 2822 { 2823 struct list_head *head = &info->space_info; 2824 struct btrfs_space_info *found; 2825 2826 rcu_read_lock(); 2827 list_for_each_entry_rcu(found, head, list) { 2828 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 2829 found->force_alloc = CHUNK_ALLOC_FORCE; 2830 } 2831 rcu_read_unlock(); 2832 } 2833 2834 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 2835 struct btrfs_space_info *sinfo, int force) 2836 { 2837 u64 bytes_used = btrfs_space_info_used(sinfo, false); 2838 u64 thresh; 2839 2840 if (force == CHUNK_ALLOC_FORCE) 2841 return 1; 2842 2843 /* 2844 * in limited mode, we want to have some free space up to 2845 * about 1% of the FS size. 2846 */ 2847 if (force == CHUNK_ALLOC_LIMITED) { 2848 thresh = btrfs_super_total_bytes(fs_info->super_copy); 2849 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 2850 2851 if (sinfo->total_bytes - bytes_used < thresh) 2852 return 1; 2853 } 2854 2855 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 2856 return 0; 2857 return 1; 2858 } 2859 2860 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 2861 { 2862 u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); 2863 2864 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 2865 } 2866 2867 /* 2868 * If force is CHUNK_ALLOC_FORCE: 2869 * - return 1 if it successfully allocates a chunk, 2870 * - return errors including -ENOSPC otherwise. 
2871 * If force is NOT CHUNK_ALLOC_FORCE: 2872 * - return 0 if it doesn't need to allocate a new chunk, 2873 * - return 1 if it successfully allocates a chunk, 2874 * - return errors including -ENOSPC otherwise. 2875 */ 2876 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 2877 enum btrfs_chunk_alloc_enum force) 2878 { 2879 struct btrfs_fs_info *fs_info = trans->fs_info; 2880 struct btrfs_space_info *space_info; 2881 bool wait_for_alloc = false; 2882 bool should_alloc = false; 2883 int ret = 0; 2884 2885 /* Don't re-enter if we're already allocating a chunk */ 2886 if (trans->allocating_chunk) 2887 return -ENOSPC; 2888 2889 space_info = btrfs_find_space_info(fs_info, flags); 2890 ASSERT(space_info); 2891 2892 do { 2893 spin_lock(&space_info->lock); 2894 if (force < space_info->force_alloc) 2895 force = space_info->force_alloc; 2896 should_alloc = should_alloc_chunk(fs_info, space_info, force); 2897 if (space_info->full) { 2898 /* No more free physical space */ 2899 if (should_alloc) 2900 ret = -ENOSPC; 2901 else 2902 ret = 0; 2903 spin_unlock(&space_info->lock); 2904 return ret; 2905 } else if (!should_alloc) { 2906 spin_unlock(&space_info->lock); 2907 return 0; 2908 } else if (space_info->chunk_alloc) { 2909 /* 2910 * Someone is already allocating, so we need to block 2911 * until this someone is finished and then loop to 2912 * recheck if we should continue with our allocation 2913 * attempt. 2914 */ 2915 wait_for_alloc = true; 2916 spin_unlock(&space_info->lock); 2917 mutex_lock(&fs_info->chunk_mutex); 2918 mutex_unlock(&fs_info->chunk_mutex); 2919 } else { 2920 /* Proceed with allocation */ 2921 space_info->chunk_alloc = 1; 2922 wait_for_alloc = false; 2923 spin_unlock(&space_info->lock); 2924 } 2925 2926 cond_resched(); 2927 } while (wait_for_alloc); 2928 2929 mutex_lock(&fs_info->chunk_mutex); 2930 trans->allocating_chunk = true; 2931 2932 /* 2933 * If we have mixed data/metadata chunks we want to make sure we keep 2934 * allocating mixed chunks instead of individual chunks. 2935 */ 2936 if (btrfs_mixed_space_info(space_info)) 2937 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 2938 2939 /* 2940 * if we're doing a data chunk, go ahead and make sure that 2941 * we keep a reasonable number of metadata chunks allocated in the 2942 * FS as well. 2943 */ 2944 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 2945 fs_info->data_chunk_allocations++; 2946 if (!(fs_info->data_chunk_allocations % 2947 fs_info->metadata_ratio)) 2948 force_metadata_allocation(fs_info); 2949 } 2950 2951 /* 2952 * Check if we have enough space in SYSTEM chunk because we may need 2953 * to update devices. 
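/*
 * Illustrative sketch, not part of the original file: interpreting the return
 * convention documented above for a non-forced allocation attempt.
 * ensure_data_chunk() is a hypothetical caller.
 */
static int ensure_data_chunk(struct btrfs_trans_handle *trans)
{
	u64 flags = btrfs_get_alloc_profile(trans->fs_info,
					    BTRFS_BLOCK_GROUP_DATA);
	int ret;

	ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_NO_FORCE);
	if (ret < 0)
		return ret;	/* -ENOSPC or another error */

	/* ret == 0: no new chunk was needed, ret == 1: a chunk was allocated */
	return 0;
}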
2954 */ 2955 check_system_chunk(trans, flags); 2956 2957 ret = btrfs_alloc_chunk(trans, flags); 2958 trans->allocating_chunk = false; 2959 2960 spin_lock(&space_info->lock); 2961 if (ret < 0) { 2962 if (ret == -ENOSPC) 2963 space_info->full = 1; 2964 else 2965 goto out; 2966 } else { 2967 ret = 1; 2968 space_info->max_extent_size = 0; 2969 } 2970 2971 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 2972 out: 2973 space_info->chunk_alloc = 0; 2974 spin_unlock(&space_info->lock); 2975 mutex_unlock(&fs_info->chunk_mutex); 2976 /* 2977 * When we allocate a new chunk we reserve space in the chunk block 2978 * reserve to make sure we can COW nodes/leafs in the chunk tree or 2979 * add new nodes/leafs to it if we end up needing to do it when 2980 * inserting the chunk item and updating device items as part of the 2981 * second phase of chunk allocation, performed by 2982 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 2983 * large number of new block groups to create in our transaction 2984 * handle's new_bgs list to avoid exhausting the chunk block reserve 2985 * in extreme cases - like having a single transaction create many new 2986 * block groups when starting to write out the free space caches of all 2987 * the block groups that were made dirty during the lifetime of the 2988 * transaction. 2989 */ 2990 if (trans->chunk_bytes_reserved >= (u64)SZ_2M) 2991 btrfs_create_pending_block_groups(trans); 2992 2993 return ret; 2994 } 2995 2996 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 2997 { 2998 u64 num_dev; 2999 3000 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; 3001 if (!num_dev) 3002 num_dev = fs_info->fs_devices->rw_devices; 3003 3004 return num_dev; 3005 } 3006 3007 /* 3008 * Reserve space in the system space for allocating or removing a chunk 3009 */ 3010 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 3011 { 3012 struct btrfs_fs_info *fs_info = trans->fs_info; 3013 struct btrfs_space_info *info; 3014 u64 left; 3015 u64 thresh; 3016 int ret = 0; 3017 u64 num_devs; 3018 3019 /* 3020 * Needed because we can end up allocating a system chunk and for an 3021 * atomic and race free space reservation in the chunk block reserve. 3022 */ 3023 lockdep_assert_held(&fs_info->chunk_mutex); 3024 3025 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3026 spin_lock(&info->lock); 3027 left = info->total_bytes - btrfs_space_info_used(info, true); 3028 spin_unlock(&info->lock); 3029 3030 num_devs = get_profile_num_devs(fs_info, type); 3031 3032 /* num_devs device items to update and 1 chunk item to add or remove */ 3033 thresh = btrfs_calc_metadata_size(fs_info, num_devs) + 3034 btrfs_calc_insert_metadata_size(fs_info, 1); 3035 3036 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 3037 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 3038 left, thresh, type); 3039 btrfs_dump_space_info(fs_info, info, 0, 0); 3040 } 3041 3042 if (left < thresh) { 3043 u64 flags = btrfs_system_alloc_profile(fs_info); 3044 3045 /* 3046 * Ignore failure to create system chunk. We might end up not 3047 * needing it, as we might not need to COW all nodes/leafs from 3048 * the paths we visit in the chunk tree (they were already COWed 3049 * or created in the current transaction for example). 
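/*
 * Illustrative sketch, not part of the original file: how a chunk-modifying
 * path reserves system space with check_system_chunk(), which must run under
 * chunk_mutex (see the lockdep assertion above).  The wrapper name is
 * hypothetical.
 */
static void reserve_system_space_for(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, type);
	mutex_unlock(&fs_info->chunk_mutex);
}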
3050 */ 3051 ret = btrfs_alloc_chunk(trans, flags); 3052 } 3053 3054 if (!ret) { 3055 ret = btrfs_block_rsv_add(fs_info->chunk_root, 3056 &fs_info->chunk_block_rsv, 3057 thresh, BTRFS_RESERVE_NO_FLUSH); 3058 if (!ret) 3059 trans->chunk_bytes_reserved += thresh; 3060 } 3061 } 3062 3063 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 3064 { 3065 struct btrfs_block_group *block_group; 3066 u64 last = 0; 3067 3068 while (1) { 3069 struct inode *inode; 3070 3071 block_group = btrfs_lookup_first_block_group(info, last); 3072 while (block_group) { 3073 btrfs_wait_block_group_cache_done(block_group); 3074 spin_lock(&block_group->lock); 3075 if (block_group->iref) 3076 break; 3077 spin_unlock(&block_group->lock); 3078 block_group = btrfs_next_block_group(block_group); 3079 } 3080 if (!block_group) { 3081 if (last == 0) 3082 break; 3083 last = 0; 3084 continue; 3085 } 3086 3087 inode = block_group->inode; 3088 block_group->iref = 0; 3089 block_group->inode = NULL; 3090 spin_unlock(&block_group->lock); 3091 ASSERT(block_group->io_ctl.inode == NULL); 3092 iput(inode); 3093 last = block_group->start + block_group->length; 3094 btrfs_put_block_group(block_group); 3095 } 3096 } 3097 3098 /* 3099 * Must be called only after stopping all workers, since we could have block 3100 * group caching kthreads running, and therefore they could race with us if we 3101 * freed the block groups before stopping them. 3102 */ 3103 int btrfs_free_block_groups(struct btrfs_fs_info *info) 3104 { 3105 struct btrfs_block_group *block_group; 3106 struct btrfs_space_info *space_info; 3107 struct btrfs_caching_control *caching_ctl; 3108 struct rb_node *n; 3109 3110 down_write(&info->commit_root_sem); 3111 while (!list_empty(&info->caching_block_groups)) { 3112 caching_ctl = list_entry(info->caching_block_groups.next, 3113 struct btrfs_caching_control, list); 3114 list_del(&caching_ctl->list); 3115 btrfs_put_caching_control(caching_ctl); 3116 } 3117 up_write(&info->commit_root_sem); 3118 3119 spin_lock(&info->unused_bgs_lock); 3120 while (!list_empty(&info->unused_bgs)) { 3121 block_group = list_first_entry(&info->unused_bgs, 3122 struct btrfs_block_group, 3123 bg_list); 3124 list_del_init(&block_group->bg_list); 3125 btrfs_put_block_group(block_group); 3126 } 3127 spin_unlock(&info->unused_bgs_lock); 3128 3129 spin_lock(&info->block_group_cache_lock); 3130 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 3131 block_group = rb_entry(n, struct btrfs_block_group, 3132 cache_node); 3133 rb_erase(&block_group->cache_node, 3134 &info->block_group_cache_tree); 3135 RB_CLEAR_NODE(&block_group->cache_node); 3136 spin_unlock(&info->block_group_cache_lock); 3137 3138 down_write(&block_group->space_info->groups_sem); 3139 list_del(&block_group->list); 3140 up_write(&block_group->space_info->groups_sem); 3141 3142 /* 3143 * We haven't cached this block group, which means we could 3144 * possibly have excluded extents on this block group. 
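/*
 * Illustrative sketch, not part of the original file: walking every block
 * group with btrfs_lookup_first_block_group() and btrfs_next_block_group(),
 * the same pattern btrfs_put_block_group_cache() uses above.  The helper name
 * is hypothetical; btrfs_next_block_group() drops the reference on the block
 * group it was given.
 */
static u64 count_block_groups_example(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *bg;
	u64 nr = 0;

	bg = btrfs_lookup_first_block_group(info, 0);
	while (bg) {
		nr++;
		bg = btrfs_next_block_group(bg);
	}
	return nr;
}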
3145 */ 3146 if (block_group->cached == BTRFS_CACHE_NO || 3147 block_group->cached == BTRFS_CACHE_ERROR) 3148 btrfs_free_excluded_extents(block_group); 3149 3150 btrfs_remove_free_space_cache(block_group); 3151 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 3152 ASSERT(list_empty(&block_group->dirty_list)); 3153 ASSERT(list_empty(&block_group->io_list)); 3154 ASSERT(list_empty(&block_group->bg_list)); 3155 ASSERT(atomic_read(&block_group->count) == 1); 3156 btrfs_put_block_group(block_group); 3157 3158 spin_lock(&info->block_group_cache_lock); 3159 } 3160 spin_unlock(&info->block_group_cache_lock); 3161 3162 /* 3163 * Now that all the block groups are freed, go through and free all the 3164 * space_info structs. This is only called during the final stages of 3165 * unmount, and so we know nobody is using them. We call 3166 * synchronize_rcu() once before we start, just to be on the safe side. 3167 */ 3168 synchronize_rcu(); 3169 3170 btrfs_release_global_block_rsv(info); 3171 3172 while (!list_empty(&info->space_info)) { 3173 space_info = list_entry(info->space_info.next, 3174 struct btrfs_space_info, 3175 list); 3176 3177 /* 3178 * Do not hide this behind enospc_debug, this is actually 3179 * important and indicates a real bug if this happens. 3180 */ 3181 if (WARN_ON(space_info->bytes_pinned > 0 || 3182 space_info->bytes_reserved > 0 || 3183 space_info->bytes_may_use > 0)) 3184 btrfs_dump_space_info(info, space_info, 0, 0); 3185 list_del(&space_info->list); 3186 btrfs_sysfs_remove_space_info(space_info); 3187 } 3188 return 0; 3189 } 3190
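/*
 * Illustrative sketch, not part of the original file: the lookup/put pairing
 * used throughout this file when resolving a logical address to its block
 * group.  block_group_length_at() is a hypothetical helper.
 */
static u64 block_group_length_at(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group *bg;
	u64 len = 0;

	bg = btrfs_lookup_block_group(info, bytenr);
	if (bg) {
		len = bg->length;
		btrfs_put_block_group(bg);
	}
	return len;
}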