// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

/*
 * Sum of all space currently accounted against this space_info:
 * used + reserved + pinned + readonly, optionally including bytes_may_use
 * (the speculative reservations) when @may_use_included is true.
 */
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	/* Readers of the space_info list walk it under RCU. */
	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/*
 * Allocate and initialize a btrfs_space_info for the block group type bits
 * in @flags, register it in sysfs and publish it on info->space_info.
 * Returns 0 on success or a negative errno.
 */
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{

	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	/* Only the type bits (DATA/METADATA/SYSTEM) identify a space_info. */
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		/*
		 * NOTE(review): space_info is not freed here on sysfs
		 * failure — presumably the kobject release path owns the
		 * memory once registration was attempted; verify against
		 * btrfs_sysfs_add_space_info_type().
		 */
		return ret;

	/* Publish for the RCU list walkers (e.g. btrfs_find_space_info). */
	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	/* ret is 0 here (sysfs registration succeeded). */
	return ret;
}

/*
 * Create the space_infos needed at mount: SYSTEM always, plus either a
 * combined METADATA|DATA one (mixed block groups) or separate METADATA
 * and DATA ones.
 */
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

/*
 * Account a block group's bytes into the matching space_info and return it
 * via @space_info.  @total_bytes/@bytes_used are logical; the disk_* fields
 * are scaled by the raid factor.  Adding space clears ->full and may allow
 * waiting reservation tickets to be granted.
 */
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

/*
 * Look up the space_info whose type bits overlap @flags, or NULL if none
 * exists.  Walks the list under RCU, so the returned pointer is only safe
 * because space_infos are never freed while the fs is mounted.
 */
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/* Space the global block reserve wants to keep available: 2 * size. */
static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

/*
 * Decide whether a reservation of @bytes may be granted beyond
 * space_info->total_bytes, based on unallocated chunk space scaled by the
 * raid factor of the current allocation profile.  Returns 1 if the
 * overcommit is allowed, 0 otherwise.  Data space is never overcommitted.
 */
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	u64 profile;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, true);
	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2th of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	/*
	 * Priority tickets are served first with NO_FLUSH overcommit
	 * semantics, then the normal queue with FLUSH_ALL semantics.
	 */
	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisified now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
				   false)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			/* bytes == 0 is how waiters detect success. */
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			/* Tickets are served strictly in order; stop here. */
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

/* Log one block reserve's size/reserved under its own lock. */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

/*
 * Dump the accounting state of @info (and, if @dump_block_groups, of each
 * block group in every raid index) to the kernel log.  Used for ENOSPC
 * debugging.
 */
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

/*
 * Kick writeback of up to @nr_pages of dirty pages.  If s_umount cannot be
 * taken (e.g. a remount is in progress), fall back to starting delalloc on
 * up to @nr_items inodes directly, and wait for ordered extents unless we
 * hold a transaction.
 */
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry the filesystem going from r/w to r/o though
		 * we don't acquire ->s_umount mutex, because the filesystem
		 * should guarantee the delalloc inodes list be empty after
		 * the filesystem is readonly(all dirty pages are written to
		 * the disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

/*
 * Convert a byte amount to a number of metadata items to reclaim, based on
 * the worst-case insertion size of one item.  Always at least 1.
 */
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calc the number of the pages we need flush for space reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		/* Nothing to flush; can't wait for ordered inside a trans. */
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue. I.e if there are more async pages than we
		 * require wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		/* No tickets waiting any more — flushed enough. */
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * maybe_commit_transaction - possibly commit the transaction if its ok to
 * @root - the root we're allocating for
 * @bytes - the number of bytes we want to reserve
 * @force - force the commit
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;
	u64 cur_free_bytes = 0;

	/* Can't commit from within a running transaction. */
	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	/* cur_free_bytes = total - used, clamped to 0. */
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;

	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				   bytes_needed,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
		        struct btrfs_space_info *space_info, u64 num_bytes,
		        int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		/* >0 (no chunk allocated) and -ENOSPC are not failures here. */
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

/*
 * Compute how many bytes of metadata space we should try to reclaim.
 * If tickets are queued, their total is the answer; otherwise estimate
 * from overcommit headroom and how close usage is to total_bytes.
 * Returns 0 when no reclaim is needed.
 */
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	/* Target 95% or 90% utilization depending on remaining headroom. */
	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	/* Only may_use/reserved bytes can actually be reclaimed by flushing. */
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

/*
 * Decide whether the background (preemptive) reclaim worker should run:
 * usage is near total_bytes, there is something reclaimable, and the fs is
 * not closing or remounting.
 */
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding.  However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisified by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		/*
		 * tickets_id unchanged means no ticket was granted since the
		 * last flush: escalate to the next flush state.  Otherwise we
		 * made progress, so restart the state machine.
		 */
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create a
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

/* Flush states tried by priority (LIMIT) reservations. */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

/* Flush states tried when reserving for inode eviction. */
static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

/*
 * Synchronously run the given flush-state list in the caller's context until
 * @ticket is satisfied (ticket->bytes == 0) or the states are exhausted.
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

/*
 * Sleep (killably) until @ticket is granted (bytes == 0) or failed
 * (error set).  A fatal signal marks the ticket -EINTR and stops waiting.
 */
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		/* Async worker flushes; we just wait on the ticket. */
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		/* Ticket was not fully granted; pull it off the queue. */
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket->list));
	return ret;
}

/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	/* FLUSH_ALL may commit a transaction; forbidden while holding one. */
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush,
			    system_chunk))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		/* Orphan cleanup may dip into the global reserve. */
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}