// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
				 bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * After adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}
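
/*
 * btrfs_update_space_info - adjust in-memory accounting for added space
 * @info - the fs_info for this fs
 * @flags - the block group type flags, used to look up the space_info
 * @total_bytes - bytes of logical space being added
 * @bytes_used - bytes of that space that are already in use
 * @bytes_readonly - bytes of that space that are read only
 * @space_info - out parameter, set to the space_info that was updated
 *
 * Update the counters of the space_info matching @flags when block group
 * space is added.  The disk_* counters are scaled by the raid factor, since
 * e.g. RAID1 consumes twice the raw bytes, and any growth in total_bytes
 * clears the full flag so allocations are retried.
 */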
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}
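
/*
 * can_overcommit - check if a metadata reservation may exceed current space
 * @fs_info - the fs_info for this fs
 * @space_info - the space info we're trying to reserve from
 * @bytes - the number of bytes we want to reserve
 * @flush - how hard we're willing to flush if this reservation fails
 * @system_chunk - whether this is for the system chunk space info
 *
 * Metadata reservations are pessimistic worst-case estimates, so we may
 * promise more space than ends up being consumed.  Allow a reservation to
 * overcommit against the unallocated device space, scaled down by the raid
 * factor and capped at 1/2 of that space (1/8 when we can flush aggressively,
 * since flushing should reclaim what was promised).  Data reservations never
 * overcommit.  Returns 1 if the reservation may proceed, 0 otherwise.
 */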
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	u64 profile;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, true);
	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space.  If we can flush, don't overcommit as
	 * much, only let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
				   false)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{
	lockdep_assert_held(&info->lock);

	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		   info->total_bytes, info->bytes_used, info->bytes_pinned,
		   info->bytes_reserved, info->bytes_may_use,
		   info->bytes_readonly);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	__btrfs_dump_space_info(fs_info, info);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->start, cache->length, cache->used, cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire the ->s_umount semaphore,
		 * because the filesystem should guarantee that the delalloc
		 * inodes list is empty once it is read only (all dirty pages
		 * have been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}
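
/*
 * Convert @to_reclaim bytes into a number of metadata items to flush, using
 * the worst-case cost of inserting one item as the unit.  Always returns at
 * least 1 so callers make some forward progress.
 */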
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * Shrink metadata reservations by flushing delalloc.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calculate the number of items we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * the ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue, i.e. if there are more async pages than
		 * we require, wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the fs_info for this fs
 * @space_info - the space info we're checking the commit for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;
	u64 cur_free_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;

	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
					btrfs_metadata_alloc_profile(fs_info),
					(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
						CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}
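
/*
 * btrfs_calc_reclaim_metadata_size - how many bytes we need to reclaim
 * @fs_info - the fs_info for this fs
 * @space_info - the space info we're flushing for
 * @system_chunk - whether this is for the system chunk space info
 *
 * If there are outstanding tickets their total is what we must reclaim.
 * Otherwise estimate a preemptive flush target: enough to bring usage back
 * under ~90-95% of the space, capped by what is actually reclaimable
 * (bytes_may_use + bytes_reserved).
 */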
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}
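
/*
 * Decide whether the async reclaim worker should run: usage must be over the
 * 98% threshold (but not because the space is simply full of allocated
 * extents), there must be something to reclaim, and the fs must not be
 * closing or remounting.
 */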
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets. The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets. This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding. However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisfied by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create an
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
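
/*
 * Reduced lists of flush states for reservations that cannot wait on the full
 * async machinery: priority reservations only try the cheaper states, while
 * eviction may additionally flush delayed refs and delalloc, and commit the
 * transaction.
 */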
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};
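
/*
 * priority_reclaim_metadata_space - flush synchronously for a priority ticket
 * @fs_info - the fs_info for this fs
 * @space_info - the space info we're flushing for
 * @ticket - the ticket we're waiting on
 * @states - the list of flush states to walk
 * @states_nr - the number of states in @states
 *
 * Priority tickets don't wait on the async reclaim worker; instead we walk
 * the given flush states directly, stopping as soon as the ticket is
 * satisfied (ticket->bytes == 0) or the states are exhausted.
 */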
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket,
					    const enum btrfs_flush_state *states,
					    int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}
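
/*
 * wait_reserve_ticket - sleep until a ticket is satisfied or fails
 * @fs_info - the fs_info for this fs
 * @space_info - the space info the ticket belongs to
 * @ticket - the ticket we're waiting on
 *
 * Sleeps in TASK_KILLABLE until the flushing code grants the reservation or
 * sets an error.  If we're killed, remove the ticket ourselves and set -EINTR
 * so no space is handed to a task that is no longer waiting.
 */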
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list. After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket. If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			list_del_init(&ticket->list);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		/*
		 * Need to delete here for priority tickets. For regular tickets
		 * either the async reclaim job deletes the ticket from the list
		 * or we delete it ourselves at wait_reserve_ticket().
		 */
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket->list));
	/*
	 * Check that we can't have an error set if the reservation succeeded,
	 * as that would confuse tasks and lead them to error out without
	 * releasing reserved space (if an error happens the expectation is that
	 * space wasn't reserved at all).
	 */
	ASSERT(!(ticket->bytes == 0 && ticket->error));
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from a space_info
 * @fs_info - the fs_info for this fs
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether this reservation is for the system chunk space info
 *
 * This will reserve orig_bytes number of bytes from @space_info. If there is
 * not enough space it will make an attempt to flush out space to make room.
 * It will do this by flushing delalloc if possible or committing the
 * transaction. If @flush is BTRFS_RESERVE_NO_FLUSH then no attempt to regain
 * reservations will be made and this will fail if there is not enough space
 * already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush,
			    system_chunk))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If @flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempt to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}