// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

/*
 * Return the number of bytes currently accounted against @s_info
 * (used + reserved + pinned + readonly).
 *
 * @may_use_included: when true, also count bytes_may_use, i.e. outstanding
 * reservations that have not yet turned into real allocations.
 */
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	/* The space_info list is only ever appended to, so RCU suffices. */
	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/*
 * Allocate and initialize the space_info for the block group type bits in
 * @flags, publish it through sysfs and add it to fs_info's space_info list.
 *
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): on btrfs_sysfs_add_space_info_type() failure we return
 * without freeing space_info here -- presumably the kobject release path
 * inside that helper owns the cleanup (percpu counter + kfree); verify
 * against btrfs_sysfs_add_space_info_type()/space_info_release().
 */
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{

	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	/* One block group list per raid index (single/dup/raid*). */
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	/* ret is 0 at this point. */
	return ret;
}

/*
 * Create the initial set of space_infos at mount time: SYSTEM, plus either
 * a single mixed METADATA|DATA space_info or separate METADATA and DATA
 * ones, depending on the MIXED_GROUPS incompat feature.
 */
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

/*
 * Fold a block group's byte counts into its space_info and wake any
 * reservation waiters that can now be satisfied.  @total_bytes and
 * @bytes_used are logical sizes; the disk_* counters additionally account
 * the raid redundancy factor.
 */
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	/* New space arrived, so the space_info may no longer be full. */
	if (total_bytes > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}

/*
 * Look up the space_info matching the block group type bits in @flags.
 * Returns NULL if no such space_info exists.
 */
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * Space we must keep free to service the global block reserve: twice its
 * size, for safety margin.
 */
static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

/*
 * Decide whether we may promise @bytes of metadata reservation beyond what
 * is currently allocated, on the assumption that flushing/allocating will
 * get the space back later.  Returns 1 if the overcommit is allowed,
 * 0 otherwise.
 */
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow over committing if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2th of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 *
 * Called with space_info->lock held.  Walks the priority tickets first and
 * then the normal tickets, granting each ticket that now fits (either
 * within total_bytes or via overcommit) and waking its waiter.  Stops at
 * the first ticket that cannot be granted so ordering is preserved.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
				   false)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	/* Second pass: the normal tickets, with FLUSH_ALL overcommit rules. */
	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

/* Log size/reserved of one named block reserve, under its lock. */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

/*
 * Log the state of @info (and optionally every block group in it) for
 * ENOSPC debugging.  @bytes is the allocation size that failed; it is
 * passed on to btrfs_dump_free_space().
 */
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	/* Dump the block groups of every raid index in turn. */
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

/*
 * Kick writeback for up to @nr_pages dirty pages; if we cannot take
 * s_umount (e.g. umount in progress), start delalloc on @nr_items inodes
 * directly instead.
 */
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry the filesystem going from r/w to r/o though
		 * we don't acquire ->s_umount mutex, because the filesystem
		 * should guarantee the delalloc inodes list be empty after
		 * the filesystem is readonly(all dirty pages are written to
		 * the disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		/* Inside a transaction we must not wait on ordered extents. */
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

/*
 * Convert @to_reclaim bytes into a number of metadata items to flush,
 * using the worst-case insertion cost of a single item.  Always at
 * least 1.
 */
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

/* Assumed average extent size per metadata item when sizing delalloc flush. */
#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calc the number of the pages we need flush for space reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	/* Nothing outstanding to flush; at most wait for ordered extents. */
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
*/
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue. I.e if there are more async pages than we
		 * require wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		/* Nobody is waiting for space anymore: we can stop flushing. */
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			/* Interrupted by a fatal signal: give up. */
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if its ok to
 * @fs_info - the fs_info for the filesystem
 * @space_info - the space_info we are trying to make a reservation in
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere (enough pinned or reclaimable reserve space to satisfy
 * the first waiting ticket) and then commit the transaction if it does.
 * Otherwise it will return -ENOSPC, or -EAGAIN if we are already inside a
 * transaction.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;

	/* Already inside a transaction: cannot commit from here. */
	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	/* Committing only helps if somebody is actually waiting for space. */
	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	/* Pinned space plus the reserves still cannot cover the ticket. */
	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		/* nr == -1 means "run all delayed items". */
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info,
num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		/* "no chunk needed" (>0) and ENOSPC are not errors here. */
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

/*
 * Estimate how many bytes of metadata to reclaim for @space_info.  If
 * there are outstanding tickets their sum is the answer; otherwise make a
 * heuristic guess from how far usage is above the 90/95% watermark.
 * Both visible callers hold space_info->lock while calling this (the
 * ticket lists are walked unlocked otherwise).
 */
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	/* No tickets: reclaim speculatively, 1M per cpu capped at 16M. */
	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	/* Never try to reclaim more than could actually be released. */
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

/*
 * Decide whether the background reclaim worker should run: usage must be
 * at/above 98% of total (but not solidly full with used+reserved), there
 * must be something to reclaim, and the fs must be neither closing nor
 * remounting.
 */
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

/*
 * Fail tickets on @head with -ENOSPC.  Returns true as soon as a ticket is
 * found that had made partial progress (bytes != orig_bytes) -- leaving
 * the rest queued -- which tells the caller the flush loop was still
 * making headway and should restart instead of giving up.
 */
static bool wake_all_tickets(struct list_head *head)
{
	struct reserve_ticket *ticket;

	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
		if (ticket->bytes != ticket->orig_bytes)
			return true;
	}
	return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		/* No ticket granted since the last flush: escalate the state. */
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create a
		 * underutilized metadata chunk.  So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
		 * commit the transaction.  If nothing has changed the next go
		 * around then we can force a chunk allocation.
*/
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				/*
				 * Fail the waiters unless some ticket made
				 * partial progress; then keep flushing.
				 */
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

/* Initialize @work to run the async metadata reclaim state machine. */
void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

/* Flush states for a priority (in-caller-context) reservation. */
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

/* Flush states for evictions; these may also commit the transaction. */
static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

/*
 * Flush synchronously in the caller's context, walking @states in order
 * until @ticket is satisfied (bytes == 0) or the states are exhausted.
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

/*
 * Sleep until @ticket is satisfied (bytes == 0) or has an error set.  A
 * fatal signal marks the ticket -EINTR.  Drops and retakes
 * space_info->lock around each schedule().
 */
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)

{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	u64 reclaim_bytes = 0;
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		/* The async worker does the flushing; we only wait. */
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		/* Any partially-granted bytes go back to the pool below. */
		if (ticket->bytes < ticket->orig_bytes)
			reclaim_bytes = ticket->orig_bytes - ticket->bytes;
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	ASSERT(list_empty(&ticket->list));
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @fs_info - the fs_info for the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether we are reserving for the system chunk root
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	/* FLUSH_ALL may commit a transaction: forbidden while inside one. */
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 * Earlier waiters (pending_tickets) must be served first.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush,
			    system_chunk))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		/* The ticket lives on our stack until it is granted or failed. */
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		/* During orphan cleanup, fall back to the global reserve. */
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}