// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

/*
 * Sum the byte counters of @s_info that count as consumed: bytes_used,
 * bytes_reserved, bytes_pinned and bytes_readonly, plus bytes_may_use when
 * @may_use_included is true.
 */
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	/* The space_info list is walked under the RCU read lock. */
	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/*
 * Allocate and initialise one space_info for the block group type bits in
 * @flags, register it with sysfs and publish it on info->space_info.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error
 * returned by percpu_counter_init() / btrfs_sysfs_add_space_info_type().
 */
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				 GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	/* Only the type bits (data/metadata/system) are kept. */
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	/*
	 * NOTE(review): space_info and its percpu counter are not released
	 * here on failure; presumably the sysfs helper drops the kobject
	 * reference and the release callback frees them - confirm.
	 */
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

/*
 * Create the initial space_infos for @fs_info: always SYSTEM, then either a
 * single combined METADATA|DATA one (mixed block groups feature set in the
 * super block) or separate METADATA and DATA ones.
 *
 * Returns -EINVAL if the super block has no root, otherwise 0 or the first
 * error returned by create_space_info().
 */
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

/*
 * Add @total_bytes, @bytes_used and @bytes_readonly to the counters of the
 * space_info matching @flags, clear its full flag when space was added, give
 * waiting tickets a chance to be granted, and hand the space_info back
 * through @space_info.
 */
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	/* Multiplier used below for the disk_total / disk_used counters. */
	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	/* Must be called with found->lock held (it asserts so). */
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

/*
 * Return the first space_info on info->space_info whose flags share at least
 * one block group type bit with @flags, or NULL if there is none.
 *
 * The list is walked under rcu_read_lock(); the pointer is returned after
 * the read lock has been dropped.
 */
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * Return twice the size of @global.
 * NOTE(review): no caller is visible in this file - confirm it is still
 * needed.
 */
static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

/*
 * Decide whether reserving @bytes more in @space_info may exceed its
 * total_bytes, relying on unallocated chunk space
 * (fs_info->free_chunk_space).
 *
 * Returns 1 if overcommitting is allowed, 0 otherwise.  Any space_info with
 * the DATA bit set is never overcommitted.
 */
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush)
{
	u64 profile;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, true);
	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
210 */ 211 void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, 212 struct btrfs_space_info *space_info) 213 { 214 struct list_head *head; 215 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 216 217 lockdep_assert_held(&space_info->lock); 218 219 head = &space_info->priority_tickets; 220 again: 221 while (!list_empty(head)) { 222 struct reserve_ticket *ticket; 223 u64 used = btrfs_space_info_used(space_info, true); 224 225 ticket = list_first_entry(head, struct reserve_ticket, list); 226 227 /* Check and see if our ticket can be satisified now. */ 228 if ((used + ticket->bytes <= space_info->total_bytes) || 229 can_overcommit(fs_info, space_info, ticket->bytes, flush)) { 230 btrfs_space_info_update_bytes_may_use(fs_info, 231 space_info, 232 ticket->bytes); 233 list_del_init(&ticket->list); 234 ticket->bytes = 0; 235 space_info->tickets_id++; 236 wake_up(&ticket->wait); 237 } else { 238 break; 239 } 240 } 241 242 if (head == &space_info->priority_tickets) { 243 head = &space_info->tickets; 244 flush = BTRFS_RESERVE_FLUSH_ALL; 245 goto again; 246 } 247 } 248 249 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 250 do { \ 251 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 252 spin_lock(&__rsv->lock); \ 253 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 254 __rsv->size, __rsv->reserved); \ 255 spin_unlock(&__rsv->lock); \ 256 } while (0) 257 258 static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 259 struct btrfs_space_info *info) 260 { 261 lockdep_assert_held(&info->lock); 262 263 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 264 info->flags, 265 info->total_bytes - btrfs_space_info_used(info, true), 266 info->full ? 
"" : "not "); 267 btrfs_info(fs_info, 268 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 269 info->total_bytes, info->bytes_used, info->bytes_pinned, 270 info->bytes_reserved, info->bytes_may_use, 271 info->bytes_readonly); 272 273 DUMP_BLOCK_RSV(fs_info, global_block_rsv); 274 DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 275 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 276 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 277 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 278 279 } 280 281 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 282 struct btrfs_space_info *info, u64 bytes, 283 int dump_block_groups) 284 { 285 struct btrfs_block_group *cache; 286 int index = 0; 287 288 spin_lock(&info->lock); 289 __btrfs_dump_space_info(fs_info, info); 290 spin_unlock(&info->lock); 291 292 if (!dump_block_groups) 293 return; 294 295 down_read(&info->groups_sem); 296 again: 297 list_for_each_entry(cache, &info->block_groups[index], list) { 298 spin_lock(&cache->lock); 299 btrfs_info(fs_info, 300 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 301 cache->start, cache->length, cache->used, cache->pinned, 302 cache->reserved, cache->ro ? 
"[readonly]" : ""); 303 btrfs_dump_free_space(cache, bytes); 304 spin_unlock(&cache->lock); 305 } 306 if (++index < BTRFS_NR_RAID_TYPES) 307 goto again; 308 up_read(&info->groups_sem); 309 } 310 311 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 312 unsigned long nr_pages, int nr_items) 313 { 314 struct super_block *sb = fs_info->sb; 315 316 if (down_read_trylock(&sb->s_umount)) { 317 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 318 up_read(&sb->s_umount); 319 } else { 320 /* 321 * We needn't worry the filesystem going from r/w to r/o though 322 * we don't acquire ->s_umount mutex, because the filesystem 323 * should guarantee the delalloc inodes list be empty after 324 * the filesystem is readonly(all dirty pages are written to 325 * the disk). 326 */ 327 btrfs_start_delalloc_roots(fs_info, nr_items); 328 if (!current->journal_info) 329 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 330 } 331 } 332 333 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 334 u64 to_reclaim) 335 { 336 u64 bytes; 337 u64 nr; 338 339 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 340 nr = div64_u64(to_reclaim, bytes); 341 if (!nr) 342 nr = 1; 343 return nr; 344 } 345 346 #define EXTENT_SIZE_PER_ITEM SZ_256K 347 348 /* 349 * shrink metadata reservation for delalloc 350 */ 351 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 352 u64 orig, bool wait_ordered) 353 { 354 struct btrfs_space_info *space_info; 355 struct btrfs_trans_handle *trans; 356 u64 delalloc_bytes; 357 u64 dio_bytes; 358 u64 async_pages; 359 u64 items; 360 long time_left; 361 unsigned long nr_pages; 362 int loops; 363 364 /* Calc the number of the pages we need flush for space reservation */ 365 items = calc_reclaim_items_nr(fs_info, to_reclaim); 366 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 367 368 trans = (struct btrfs_trans_handle *)current->journal_info; 369 space_info = btrfs_find_space_info(fs_info, 
BTRFS_BLOCK_GROUP_METADATA); 370 371 delalloc_bytes = percpu_counter_sum_positive( 372 &fs_info->delalloc_bytes); 373 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 374 if (delalloc_bytes == 0 && dio_bytes == 0) { 375 if (trans) 376 return; 377 if (wait_ordered) 378 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 379 return; 380 } 381 382 /* 383 * If we are doing more ordered than delalloc we need to just wait on 384 * ordered extents, otherwise we'll waste time trying to flush delalloc 385 * that likely won't give us the space back we need. 386 */ 387 if (dio_bytes > delalloc_bytes) 388 wait_ordered = true; 389 390 loops = 0; 391 while ((delalloc_bytes || dio_bytes) && loops < 3) { 392 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 393 394 /* 395 * Triggers inode writeback for up to nr_pages. This will invoke 396 * ->writepages callback and trigger delalloc filling 397 * (btrfs_run_delalloc_range()). 398 */ 399 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 400 401 /* 402 * We need to wait for the compressed pages to start before 403 * we continue. 404 */ 405 async_pages = atomic_read(&fs_info->async_delalloc_pages); 406 if (!async_pages) 407 goto skip_async; 408 409 /* 410 * Calculate how many compressed pages we want to be written 411 * before we continue. I.e if there are more async pages than we 412 * require wait_event will wait until nr_pages are written. 
413 */ 414 if (async_pages <= nr_pages) 415 async_pages = 0; 416 else 417 async_pages -= nr_pages; 418 419 wait_event(fs_info->async_submit_wait, 420 atomic_read(&fs_info->async_delalloc_pages) <= 421 (int)async_pages); 422 skip_async: 423 spin_lock(&space_info->lock); 424 if (list_empty(&space_info->tickets) && 425 list_empty(&space_info->priority_tickets)) { 426 spin_unlock(&space_info->lock); 427 break; 428 } 429 spin_unlock(&space_info->lock); 430 431 loops++; 432 if (wait_ordered && !trans) { 433 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 434 } else { 435 time_left = schedule_timeout_killable(1); 436 if (time_left) 437 break; 438 } 439 delalloc_bytes = percpu_counter_sum_positive( 440 &fs_info->delalloc_bytes); 441 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 442 } 443 } 444 445 /** 446 * maybe_commit_transaction - possibly commit the transaction if its ok to 447 * @root - the root we're allocating for 448 * @bytes - the number of bytes we want to reserve 449 * @force - force the commit 450 * 451 * This will check to make sure that committing the transaction will actually 452 * get us somewhere and then commit the transaction if it does. Otherwise it 453 * will return -ENOSPC. 
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;
	u64 cur_free_bytes = 0;

	/* Bail out with -EAGAIN if the task already holds a trans handle. */
	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	/* cur_free_bytes first holds the used total, then the free remainder. */
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	/* Only the first ticket is considered, priority tickets first. */
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;

	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	/* No ticket, or the free space already covers it: nothing to do. */
	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	/* Re-check pinned bytes against what the reserves cannot cover. */
	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				   bytes_needed,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 *
 * The result of the chosen step is only reported through the
 * btrfs_flush_space tracepoint; nothing is returned to the caller.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
		       struct btrfs_space_info *space_info, u64 num_bytes,
		       int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		/* nr == -1 is what the unbounded FLUSH_DELAYED_ITEMS passes. */
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		/* nr == 0 is what the unbounded FLUSH_DELAYED_REFS passes. */
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		/* A positive return or -ENOSPC is not reported as an error. */
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

/*
 * Work out how many bytes the flushers should try to reclaim for
 * @space_info.
 *
 * If any tickets are queued, this is the sum of their sizes.  Otherwise it
 * is 0 when a speculative reservation could still be overcommitted, or else
 * the amount by which usage exceeds 90% or 95% of total_bytes, capped at
 * bytes_may_use + bytes_reserved.
 *
 * NOTE(review): the ticket lists are walked without taking a lock here;
 * presumably callers hold space_info->lock - the callers visible in this
 * file do.
 */
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	/* Probe with 1MiB per online CPU, at most 16MiB. */
	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

/*
 * Decide whether preemptive async reclaim should be started for
 * @space_info, given @used bytes.  Returns non-zero when @used has reached
 * the 98% threshold, there is something to reclaim, and the filesystem is
 * neither closing nor remounting.
 */
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	/* Snapshot: a change below means some ticket was granted meanwhile. */
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	/* Only the regular ticket list is failed here. */
	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding.  However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisfied by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		/* Fail the ticket: unlink it, set the error, wake the waiter. */
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		/* flush_space() is called without space_info->lock held. */
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info);
		/*
		 * An unchanged tickets_id means no ticket was granted, so move
		 * on to the next flush state; otherwise restart from the first
		 * state.
		 */
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create a
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					/*
					 * flush_state stays past COMMIT_TRANS,
					 * which ends the loop.
					 */
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

/* Bind @work to the async metadata reclaim worker. */
void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

/* Flush states tried, in order, for BTRFS_RESERVE_FLUSH_LIMIT tickets. */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

/* Flush states tried, in order, for BTRFS_RESERVE_FLUSH_EVICT tickets. */
static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

/*
 * Run the flush states in @states (@states_nr entries) one after another in
 * the calling task, stopping as soon as @ticket has been granted
 * (ticket->bytes == 0).  Returns without flushing if there is nothing to
 * reclaim.
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		/* ticket->bytes is read under space_info->lock. */
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

/*
 * Sleep (killable) until @ticket is either granted (bytes == 0) or failed
 * (error != 0).  If the wait is interrupted, the ticket is removed from its
 * list and marked with -EINTR.
 */
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list. After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket. If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			list_del_init(&ticket->list);
			ticket->error = -EINTR;
			break;
		}
		/* The lock is dropped while sleeping and retaken afterwards. */
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
917 */ 918 static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, 919 struct btrfs_space_info *space_info, 920 struct reserve_ticket *ticket, 921 enum btrfs_reserve_flush_enum flush) 922 { 923 int ret; 924 925 switch (flush) { 926 case BTRFS_RESERVE_FLUSH_ALL: 927 wait_reserve_ticket(fs_info, space_info, ticket); 928 break; 929 case BTRFS_RESERVE_FLUSH_LIMIT: 930 priority_reclaim_metadata_space(fs_info, space_info, ticket, 931 priority_flush_states, 932 ARRAY_SIZE(priority_flush_states)); 933 break; 934 case BTRFS_RESERVE_FLUSH_EVICT: 935 priority_reclaim_metadata_space(fs_info, space_info, ticket, 936 evict_flush_states, 937 ARRAY_SIZE(evict_flush_states)); 938 break; 939 default: 940 ASSERT(0); 941 break; 942 } 943 944 spin_lock(&space_info->lock); 945 ret = ticket->error; 946 if (ticket->bytes || ticket->error) { 947 /* 948 * Need to delete here for priority tickets. For regular tickets 949 * either the async reclaim job deletes the ticket from the list 950 * or we delete it ourselves at wait_reserve_ticket(). 951 */ 952 list_del_init(&ticket->list); 953 if (!ret) 954 ret = -ENOSPC; 955 } 956 spin_unlock(&space_info->lock); 957 ASSERT(list_empty(&ticket->list)); 958 /* 959 * Check that we can't have an error set if the reservation succeeded, 960 * as that would confuse tasks and lead them to error out without 961 * releasing reserved space (if an error happens the expectation is that 962 * space wasn't reserved at all). 963 */ 964 ASSERT(!(ticket->bytes == 0 && ticket->error)); 965 return ret; 966 } 967 968 /** 969 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 970 * @root - the root we're allocating for 971 * @space_info - the space info we want to allocate from 972 * @orig_bytes - the number of bytes we want 973 * @flush - whether or not we can flush to make our reservation 974 * 975 * This will reserve orig_bytes number of bytes from the space info associated 976 * with the block_rsv. 
 * If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	/* FLUSH_ALL must not be requested while holding a trans handle. */
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		/* The ticket lives on this stack frame until we return. */
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info, used) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	/* Success, or failure with no ticket queued: nothing to wait for. */
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.
It will do this by flushing delalloc if 1072 * possible or committing the transaction. If flush is 0 then no attempts to 1073 * regain reservations will be made and this will fail if there is not enough 1074 * space already. 1075 */ 1076 int btrfs_reserve_metadata_bytes(struct btrfs_root *root, 1077 struct btrfs_block_rsv *block_rsv, 1078 u64 orig_bytes, 1079 enum btrfs_reserve_flush_enum flush) 1080 { 1081 struct btrfs_fs_info *fs_info = root->fs_info; 1082 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 1083 int ret; 1084 1085 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, 1086 orig_bytes, flush); 1087 if (ret == -ENOSPC && 1088 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 1089 if (block_rsv != global_rsv && 1090 !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) 1091 ret = 0; 1092 } 1093 if (ret == -ENOSPC) { 1094 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 1095 block_rsv->space_info->flags, 1096 orig_bytes, 1); 1097 1098 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 1099 btrfs_dump_space_info(fs_info, block_rsv->space_info, 1100 orig_bytes, 0); 1101 } 1102 return ret; 1103 } 1104