// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);
	*space_info = found;
}
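
/*
 * Worked example for the factor accounting above (illustrative numbers,
 * not from the original source): on a RAID1 metadata space_info,
 * btrfs_bg_type_to_factor() returns 2, so adding a 1GiB block group that
 * already has 256MiB used would do:
 *
 *	found->total_bytes += SZ_1G;		// logical space
 *	found->disk_total  += SZ_1G * 2;	// raw disk space
 *	found->bytes_used  += 256 * SZ_1M;
 *	found->disk_used   += 256 * SZ_1M * 2;
 */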

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow overcommitting if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If the caller cannot flush (anything short of
	 * BTRFS_RESERVE_FLUSH_ALL), allow overcommitting up to 1/2 of the
	 * available space.  If the caller can flush everything, be more
	 * conservative and only allow overcommitting up to 1/8 of it.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}
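
/*
 * Illustrative numbers for can_overcommit() (a sketch, not from the
 * original source): with a RAID1 metadata profile (factor == 2), 16GiB of
 * free_chunk_space and a BTRFS_RESERVE_FLUSH_ALL reservation:
 *
 *	avail = div_u64(16GiB, 2);	-> 8GiB actually usable
 *	avail >>= 3;			-> 1GiB of allowed overcommit
 *
 * so the reservation succeeds as long as used + bytes stays below
 * total_bytes + 1GiB.
 */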

/*
 * This is for space we already have accounted in space_info->bytes_may_use,
 * so basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
				   false)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}
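
/*
 * Illustrative scenario for btrfs_try_granting_tickets() (a sketch, not
 * from the original source): with total_bytes = 10GiB, used = 9GiB and
 * two queued tickets A (priority, 512MiB) and B (normal, 2GiB):
 *
 *	- A is checked first with BTRFS_RESERVE_NO_FLUSH and fits outright
 *	  (9GiB + 512MiB <= 10GiB), so it is granted and woken up;
 *	- B does not fit outright, so it only succeeds if can_overcommit()
 *	  allows it under BTRFS_RESERVE_FLUSH_ALL, otherwise the loop
 *	  stops and B keeps waiting.
 */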

static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire the ->s_umount mutex, because
		 * the filesystem should guarantee that the delalloc inodes
		 * list is empty after the filesystem is read-only (all dirty
		 * pages have been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K
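
/*
 * Worked example for calc_reclaim_items_nr() (illustrative; the exact
 * constant depends on the nodesize): btrfs_calc_insert_metadata_size()
 * gives the worst-case metadata footprint of inserting one item, which
 * with the default 16KiB nodesize works out to 256KiB, so asking to
 * reclaim 1MiB yields
 *
 *	nr = div64_u64(SZ_1M, SZ_256K);	-> 4 items
 */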

/*
 * Shrink metadata reservations by flushing delalloc.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calculate the number of items we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages.  This will
		 * invoke the ->writepages callback and trigger delalloc
		 * filling (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue, i.e. if there are more async pages than
		 * we require, wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}
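
/*
 * Illustrative flow for shrink_delalloc() (a sketch, not from the
 * original source): a request to reclaim 1MiB becomes items = 4 and
 * to_reclaim = 4 * 256KiB = 1MiB of delalloc to write back.  Each of the
 * (at most three) loop iterations writes back up to
 * min(delalloc_bytes, to_reclaim) worth of pages, waits for compressed
 * writeback to catch up, and bails out early once no tickets remain.
 */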

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the fs_info for this fs
 * @space_info - the space_info we want space from
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;
	u64 cur_free_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	cur_free_bytes = btrfs_space_info_used(space_info, true);
	if (cur_free_bytes < space_info->total_bytes)
		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
	else
		cur_free_bytes = 0;

	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;

	if (bytes_needed > cur_free_bytes)
		bytes_needed -= cur_free_bytes;
	else
		bytes_needed = 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}
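
/*
 * Illustrative decision for may_commit_transaction() (hypothetical
 * numbers): the first ticket needs 8MiB and the space_info still has
 * 2MiB free, so bytes_needed = 6MiB.  If total_bytes_pinned >= 6MiB the
 * commit is worth it, since pinned space is returned on commit;
 * otherwise the delayed item/ref reservations are counted in, and only
 * if pinned space still cannot cover the remainder does this return
 * -ENOSPC.
 */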

/*
 * Try to flush some data based on policy set by @state.  This is only
 * advisory and may fail for various reasons.  The caller is supposed to
 * examine the state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
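
/*
 * Worked example for the reclaim sizing above (hypothetical numbers):
 * with no queued tickets, total_bytes = 10GiB, used = 9.6GiB and an
 * overcommit of even 1MiB failing, expected = 90% of 10GiB = 9GiB and
 * to_reclaim = 0.6GiB (capped by bytes_may_use + bytes_reserved).
 * Preemptive async reclaim then only kicks in while used >= 98% of
 * total_bytes, per need_do_async_reclaim().
 */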

/*
 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	u64 first_ticket_bytes = 0;

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		/*
		 * may_commit_transaction will avoid committing the transaction
		 * if it doesn't feel like the space reclaimed by the commit
		 * would result in the ticket succeeding.  However if we have a
		 * smaller ticket in the queue it may be small enough to be
		 * satisfied by committing the transaction, so if any
		 * subsequent ticket is smaller than the first ticket go ahead
		 * and send us back for another loop through the enospc flushing
		 * code.
		 */
		if (first_ticket_bytes == 0)
			first_ticket_bytes = ticket->bytes;
		else if (first_ticket_bytes > ticket->bytes)
			return true;

		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}
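
/*
 * Illustrative scenario (not from the original source): tickets queued
 * as [16MiB, 4MiB, 32MiB].  The 16MiB head is failed with -ENOSPC, then
 * the 4MiB ticket is seen to be smaller than the first one, so the
 * function returns true and the flushing state machine restarts rather
 * than failing the remaining tickets too.
 */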

/*
 * This is for normal flushers, we can wait all goddamned day if we want to.
 * We will loop and continuously try to flush as long as we are making
 * progress.  We count progress as clearing off tickets each time we have to
 * loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create
		 * an underutilized metadata chunk.  So if this is our first
		 * run through the flushing state machine skip
		 * ALLOC_CHUNK_FORCE and commit the transaction.  If nothing
		 * has changed the next go around then we can force a chunk
		 * allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};
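
/*
 * For reference, the full async worker escalates by incrementing the
 * flush state in enum order (a summary assuming the btrfs_flush_state
 * layout of this era in space-info.h, not spelled out in this file):
 *
 *	FLUSH_DELAYED_ITEMS_NR -> FLUSH_DELAYED_ITEMS ->
 *	FLUSH_DELAYED_REFS_NR -> FLUSH_DELAYED_REFS ->
 *	FLUSH_DELALLOC -> FLUSH_DELALLOC_WAIT ->
 *	ALLOC_CHUNK -> ALLOC_CHUNK_FORCE ->
 *	RUN_DELAYED_IPUTS -> COMMIT_TRANS
 *
 * The two tables above are the shorter, synchronous subsets used by
 * priority and eviction reservations.
 */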

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	u64 reclaim_bytes = 0;
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		if (ticket->bytes < ticket->orig_bytes)
			reclaim_bytes = ticket->orig_bytes - ticket->bytes;
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	ASSERT(list_empty(&ticket->list));
	return ret;
}
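
/*
 * Caller-visible behaviour in brief (a summary of the switch above):
 * BTRFS_RESERVE_FLUSH_ALL parks the task until the async worker grants
 * or fails the ticket, while BTRFS_RESERVE_FLUSH_LIMIT and _EVICT flush
 * inline through the short priority_flush_states or the longer
 * evict_flush_states table and give up after one pass.  A partially
 * filled ticket returns its remainder via
 * btrfs_space_info_add_old_bytes().
 */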

/**
 * __reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @fs_info - the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - true if this reservation is on behalf of the chunk root
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If @flush is
 * BTRFS_RESERVE_NO_FLUSH then no attempt to regain reservations will be made
 * and this will fail if there is not enough space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush,
			    system_chunk))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If @flush is
 * BTRFS_RESERVE_NO_FLUSH then no attempt to regain reservations will be made
 * and this will fail if there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}
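
/*
 * Typical usage sketch (hypothetical caller, not part of this file):
 * refilling a block reservation before a tree operation.
 *
 *	u64 bytes = btrfs_calc_insert_metadata_size(fs_info, nr_items);
 *	int ret = btrfs_reserve_metadata_bytes(root, rsv, bytes,
 *					       BTRFS_RESERVE_FLUSH_ALL);
 *	if (ret)
 *		return ret;	// -ENOSPC after all flushing failed
 *	btrfs_block_rsv_add_bytes(rsv, bytes, true);
 *
 * btrfs_block_rsv_add() in block-rsv.c wraps this reserve-then-account
 * pattern.
 */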