// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
				 bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * After adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
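	/*
	 * The space we just added may be enough to satisfy reservation
	 * tickets already waiting on this space_info, so try to grant them
	 * while we still hold the lock.
	 */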
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	u64 profile;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, true);
	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable. For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}
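
/*
 * Worked example of the overcommit headroom computed above, with purely
 * illustrative numbers: for RAID1 metadata (factor 2) and 8GiB of
 * unallocated device space in free_chunk_space, only 4GiB is usable.  A
 * BTRFS_RESERVE_FLUSH_ALL reservation may then push bytes_may_use up to
 * 4GiB >> 3 = 512MiB beyond total_bytes, while the weaker flush modes are
 * allowed 4GiB >> 1 = 2GiB of headroom.
 */
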
/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
				   false)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{
	lockdep_assert_held(&info->lock);

	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group *cache;
	int index = 0;

	spin_lock(&info->lock);
	__btrfs_dump_space_info(fs_info, info);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->start, cache->length, cache->used, cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire the ->s_umount mutex, because
		 * the filesystem should guarantee that the delalloc inodes
		 * list is empty once the filesystem is read-only (all dirty
		 * pages have been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}
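
/*
 * Note on how the delalloc flush target is sized below:
 * calc_reclaim_items_nr() converts the bytes we need into a number of
 * metadata "items" (insertions), and shrink_delalloc() then assumes each
 * item is backed by roughly EXTENT_SIZE_PER_ITEM (256K) of dirty data, so
 * the writeback target becomes items * 256K rather than the raw byte count.
 */
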
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * Shrink metadata reservation for delalloc.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calculate the number of items we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * the ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue, i.e. if there are more async pages than
		 * we require wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it will free
 * enough space for the reservation
 * @fs_info - the fs_info for this filesystem
 * @space_info - the space_info we're allocating for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;
	u64 cur_free_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;

	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}
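
/*
 * Context for the commit check above: total_bytes_pinned roughly tracks
 * space that should become available again once the running transaction
 * commits (extents freed in it stay pinned until then), which is why a
 * commit can satisfy a reservation that flushing alone cannot.
 */
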
/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}
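
/*
 * How much metadata space should the flushing machinery try to reclaim?
 * If there are tickets queued, the answer is simply the sum of their sizes.
 * Otherwise this falls back to a heuristic: aim to get usage back under
 * roughly 90-95% of total_bytes, clamped to what is actually reclaimable
 * (bytes_may_use + bytes_reserved).
 */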
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets. The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets. This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding. However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisfied by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space. Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim. We would rather use that than possibly create an
		 * underutilized metadata chunk. So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction. If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}
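
/*
 * Priority (BTRFS_RESERVE_FLUSH_LIMIT/FLUSH_EVICT) reservations flush inline
 * in the reserving task using the short state tables above, instead of
 * queueing behind the async reclaim worker; their tickets sit on
 * space_info->priority_tickets and are granted ahead of normal tickets by
 * btrfs_try_granting_tickets().
 */
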
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list. After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket. If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			list_del_init(&ticket->list);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		/*
		 * Need to delete here for priority tickets. For regular tickets
		 * either the async reclaim job deletes the ticket from the list
		 * or we delete it ourselves at wait_reserve_ticket().
		 */
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket->list));
	/*
	 * Check that we can't have an error set if the reservation succeeded,
	 * as that would confuse tasks and lead them to error out without
	 * releasing reserved space (if an error happens the expectation is that
	 * space wasn't reserved at all).
	 */
	ASSERT(!(ticket->bytes == 0 && ticket->error));
	return ret;
}
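
/*
 * Summary of the reserve_ticket protocol used below: a ticket starts with
 * ->bytes set to the full request; whoever grants it (usually
 * btrfs_try_granting_tickets()) charges bytes_may_use, sets ->bytes to 0 and
 * wakes the waiter, while the failure paths set ->error and remove it from
 * the list.  A ticket is therefore finished exactly when ->bytes is 0 or
 * ->error is set.
 */
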
/**
 * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
 * @fs_info - the filesystem
 * @space_info - the space_info we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether we are reserving for a system chunk allocation
 *
 * This will reserve orig_bytes number of bytes from the given space_info.
 * If there is not enough space it will make an attempt to flush out space to
 * make room. It will do this by flushing delalloc if possible or committing
 * the transaction. If @flush is BTRFS_RESERVE_NO_FLUSH then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush,
			    system_chunk))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}
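
/*
 * The wrapper below is the public entry point: it derives the space_info
 * from the caller's block_rsv, treats reservations against the chunk root as
 * system chunk reservations, and may fall back to using the global block
 * reserve while orphan cleanup is running.
 */
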
/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If @flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}