1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "misc.h" 4 #include "ctree.h" 5 #include "space-info.h" 6 #include "sysfs.h" 7 #include "volumes.h" 8 #include "free-space-cache.h" 9 #include "ordered-data.h" 10 #include "transaction.h" 11 #include "block-group.h" 12 13 u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 14 bool may_use_included) 15 { 16 ASSERT(s_info); 17 return s_info->bytes_used + s_info->bytes_reserved + 18 s_info->bytes_pinned + s_info->bytes_readonly + 19 (may_use_included ? s_info->bytes_may_use : 0); 20 } 21 22 /* 23 * after adding space to the filesystem, we need to clear the full flags 24 * on all the space infos. 25 */ 26 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 27 { 28 struct list_head *head = &info->space_info; 29 struct btrfs_space_info *found; 30 31 rcu_read_lock(); 32 list_for_each_entry_rcu(found, head, list) 33 found->full = 0; 34 rcu_read_unlock(); 35 } 36 37 static int create_space_info(struct btrfs_fs_info *info, u64 flags) 38 { 39 40 struct btrfs_space_info *space_info; 41 int i; 42 int ret; 43 44 space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 45 if (!space_info) 46 return -ENOMEM; 47 48 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, 49 GFP_KERNEL); 50 if (ret) { 51 kfree(space_info); 52 return ret; 53 } 54 55 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 56 INIT_LIST_HEAD(&space_info->block_groups[i]); 57 init_rwsem(&space_info->groups_sem); 58 spin_lock_init(&space_info->lock); 59 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 60 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 61 init_waitqueue_head(&space_info->wait); 62 INIT_LIST_HEAD(&space_info->ro_bgs); 63 INIT_LIST_HEAD(&space_info->tickets); 64 INIT_LIST_HEAD(&space_info->priority_tickets); 65 66 ret = btrfs_sysfs_add_space_info_type(info, space_info); 67 if (ret) 68 return ret; 69 70 list_add_rcu(&space_info->list, &info->space_info); 71 if (flags & BTRFS_BLOCK_GROUP_DATA) 72 info->data_sinfo = space_info; 73 74 return ret; 75 } 76 77 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 78 { 79 struct btrfs_super_block *disk_super; 80 u64 features; 81 u64 flags; 82 int mixed = 0; 83 int ret; 84 85 disk_super = fs_info->super_copy; 86 if (!btrfs_super_root(disk_super)) 87 return -EINVAL; 88 89 features = btrfs_super_incompat_flags(disk_super); 90 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 91 mixed = 1; 92 93 flags = BTRFS_BLOCK_GROUP_SYSTEM; 94 ret = create_space_info(fs_info, flags); 95 if (ret) 96 goto out; 97 98 if (mixed) { 99 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 100 ret = create_space_info(fs_info, flags); 101 } else { 102 flags = BTRFS_BLOCK_GROUP_METADATA; 103 ret = create_space_info(fs_info, flags); 104 if (ret) 105 goto out; 106 107 flags = BTRFS_BLOCK_GROUP_DATA; 108 ret = create_space_info(fs_info, flags); 109 } 110 out: 111 return ret; 112 } 113 114 void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, 115 u64 total_bytes, u64 bytes_used, 116 u64 bytes_readonly, 117 struct btrfs_space_info **space_info) 118 { 119 struct btrfs_space_info *found; 120 int factor; 121 122 factor = btrfs_bg_type_to_factor(flags); 123 124 found = btrfs_find_space_info(info, flags); 125 ASSERT(found); 126 spin_lock(&found->lock); 127 found->total_bytes += total_bytes; 128 found->disk_total += total_bytes * factor; 129 found->bytes_used += bytes_used; 130 found->disk_used += bytes_used * factor; 131 found->bytes_readonly += bytes_readonly; 132 if (total_bytes > 0) 133 found->full = 0; 134 btrfs_space_info_add_new_bytes(info, found, 135 total_bytes - bytes_used - 136 bytes_readonly); 137 spin_unlock(&found->lock); 138 *space_info = found; 139 } 140 141 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, 142 u64 flags) 143 { 144 struct list_head *head = &info->space_info; 145 struct btrfs_space_info *found; 146 147 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 148 149 rcu_read_lock(); 150 list_for_each_entry_rcu(found, head, list) { 151 if (found->flags & flags) { 152 rcu_read_unlock(); 153 return found; 154 } 155 } 156 rcu_read_unlock(); 157 return NULL; 158 } 159 160 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 161 { 162 return (global->size << 1); 163 } 164 165 static int can_overcommit(struct btrfs_fs_info *fs_info, 166 struct btrfs_space_info *space_info, u64 bytes, 167 enum btrfs_reserve_flush_enum flush, 168 bool system_chunk) 169 { 170 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 171 u64 profile; 172 u64 space_size; 173 u64 avail; 174 u64 used; 175 int factor; 176 177 /* Don't overcommit when in mixed mode. */ 178 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 179 return 0; 180 181 if (system_chunk) 182 profile = btrfs_system_alloc_profile(fs_info); 183 else 184 profile = btrfs_metadata_alloc_profile(fs_info); 185 186 used = btrfs_space_info_used(space_info, false); 187 188 /* 189 * We only want to allow over committing if we have lots of actual space 190 * free, but if we don't have enough space to handle the global reserve 191 * space then we could end up having a real enospc problem when trying 192 * to allocate a chunk or some other such important allocation. 193 */ 194 spin_lock(&global_rsv->lock); 195 space_size = calc_global_rsv_need_space(global_rsv); 196 spin_unlock(&global_rsv->lock); 197 if (used + space_size >= space_info->total_bytes) 198 return 0; 199 200 used += space_info->bytes_may_use; 201 202 avail = atomic64_read(&fs_info->free_chunk_space); 203 204 /* 205 * If we have dup, raid1 or raid10 then only half of the free 206 * space is actually usable. For raid56, the space info used 207 * doesn't include the parity drive, so we don't have to 208 * change the math 209 */ 210 factor = btrfs_bg_type_to_factor(profile); 211 avail = div_u64(avail, factor); 212 213 /* 214 * If we aren't flushing all things, let us overcommit up to 215 * 1/2th of the space. If we can flush, don't let us overcommit 216 * too much, let it overcommit up to 1/8 of the space. 217 */ 218 if (flush == BTRFS_RESERVE_FLUSH_ALL) 219 avail >>= 3; 220 else 221 avail >>= 1; 222 223 if (used + bytes < space_info->total_bytes + avail) 224 return 1; 225 return 0; 226 } 227 228 /* 229 * This is for space we already have accounted in space_info->bytes_may_use, so 230 * basically when we're returning space from block_rsv's. 231 */ 232 void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 233 struct btrfs_space_info *space_info, 234 u64 num_bytes) 235 { 236 struct list_head *head; 237 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 238 239 spin_lock(&space_info->lock); 240 head = &space_info->priority_tickets; 241 btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); 242 243 again: 244 while (!list_empty(head)) { 245 struct reserve_ticket *ticket; 246 u64 used = btrfs_space_info_used(space_info, true); 247 248 ticket = list_first_entry(head, struct reserve_ticket, list); 249 250 /* Check and see if our ticket can be satisified now. */ 251 if ((used + ticket->bytes <= space_info->total_bytes) || 252 can_overcommit(fs_info, space_info, ticket->bytes, flush, 253 false)) { 254 btrfs_space_info_update_bytes_may_use(fs_info, 255 space_info, 256 ticket->bytes); 257 list_del_init(&ticket->list); 258 ticket->bytes = 0; 259 space_info->tickets_id++; 260 wake_up(&ticket->wait); 261 } else { 262 break; 263 } 264 } 265 266 if (head == &space_info->priority_tickets) { 267 head = &space_info->tickets; 268 flush = BTRFS_RESERVE_FLUSH_ALL; 269 goto again; 270 } 271 spin_unlock(&space_info->lock); 272 } 273 274 /* 275 * This is for newly allocated space that isn't accounted in 276 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 277 * we use this helper. 278 */ 279 void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 280 struct btrfs_space_info *space_info, 281 u64 num_bytes) 282 { 283 struct reserve_ticket *ticket; 284 struct list_head *head = &space_info->priority_tickets; 285 286 again: 287 while (!list_empty(head) && num_bytes) { 288 ticket = list_first_entry(head, struct reserve_ticket, 289 list); 290 if (num_bytes >= ticket->bytes) { 291 list_del_init(&ticket->list); 292 num_bytes -= ticket->bytes; 293 btrfs_space_info_update_bytes_may_use(fs_info, 294 space_info, 295 ticket->bytes); 296 ticket->bytes = 0; 297 space_info->tickets_id++; 298 wake_up(&ticket->wait); 299 } else { 300 btrfs_space_info_update_bytes_may_use(fs_info, 301 space_info, 302 num_bytes); 303 ticket->bytes -= num_bytes; 304 num_bytes = 0; 305 } 306 } 307 308 if (num_bytes && head == &space_info->priority_tickets) { 309 head = &space_info->tickets; 310 goto again; 311 } 312 } 313 314 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 315 do { \ 316 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 317 spin_lock(&__rsv->lock); \ 318 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 319 __rsv->size, __rsv->reserved); \ 320 spin_unlock(&__rsv->lock); \ 321 } while (0) 322 323 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, 324 struct btrfs_space_info *info, u64 bytes, 325 int dump_block_groups) 326 { 327 struct btrfs_block_group_cache *cache; 328 int index = 0; 329 330 spin_lock(&info->lock); 331 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 332 info->flags, 333 info->total_bytes - btrfs_space_info_used(info, true), 334 info->full ? "" : "not "); 335 btrfs_info(fs_info, 336 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 337 info->total_bytes, info->bytes_used, info->bytes_pinned, 338 info->bytes_reserved, info->bytes_may_use, 339 info->bytes_readonly); 340 spin_unlock(&info->lock); 341 342 DUMP_BLOCK_RSV(fs_info, global_block_rsv); 343 DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 344 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 345 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 346 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 347 348 if (!dump_block_groups) 349 return; 350 351 down_read(&info->groups_sem); 352 again: 353 list_for_each_entry(cache, &info->block_groups[index], list) { 354 spin_lock(&cache->lock); 355 btrfs_info(fs_info, 356 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 357 cache->key.objectid, cache->key.offset, 358 btrfs_block_group_used(&cache->item), cache->pinned, 359 cache->reserved, cache->ro ? "[readonly]" : ""); 360 btrfs_dump_free_space(cache, bytes); 361 spin_unlock(&cache->lock); 362 } 363 if (++index < BTRFS_NR_RAID_TYPES) 364 goto again; 365 up_read(&info->groups_sem); 366 } 367 368 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 369 unsigned long nr_pages, int nr_items) 370 { 371 struct super_block *sb = fs_info->sb; 372 373 if (down_read_trylock(&sb->s_umount)) { 374 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 375 up_read(&sb->s_umount); 376 } else { 377 /* 378 * We needn't worry the filesystem going from r/w to r/o though 379 * we don't acquire ->s_umount mutex, because the filesystem 380 * should guarantee the delalloc inodes list be empty after 381 * the filesystem is readonly(all dirty pages are written to 382 * the disk). 383 */ 384 btrfs_start_delalloc_roots(fs_info, nr_items); 385 if (!current->journal_info) 386 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 387 } 388 } 389 390 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 391 u64 to_reclaim) 392 { 393 u64 bytes; 394 u64 nr; 395 396 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 397 nr = div64_u64(to_reclaim, bytes); 398 if (!nr) 399 nr = 1; 400 return nr; 401 } 402 403 #define EXTENT_SIZE_PER_ITEM SZ_256K 404 405 /* 406 * shrink metadata reservation for delalloc 407 */ 408 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 409 u64 orig, bool wait_ordered) 410 { 411 struct btrfs_space_info *space_info; 412 struct btrfs_trans_handle *trans; 413 u64 delalloc_bytes; 414 u64 dio_bytes; 415 u64 async_pages; 416 u64 items; 417 long time_left; 418 unsigned long nr_pages; 419 int loops; 420 421 /* Calc the number of the pages we need flush for space reservation */ 422 items = calc_reclaim_items_nr(fs_info, to_reclaim); 423 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 424 425 trans = (struct btrfs_trans_handle *)current->journal_info; 426 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 427 428 delalloc_bytes = percpu_counter_sum_positive( 429 &fs_info->delalloc_bytes); 430 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 431 if (delalloc_bytes == 0 && dio_bytes == 0) { 432 if (trans) 433 return; 434 if (wait_ordered) 435 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 436 return; 437 } 438 439 /* 440 * If we are doing more ordered than delalloc we need to just wait on 441 * ordered extents, otherwise we'll waste time trying to flush delalloc 442 * that likely won't give us the space back we need. 443 */ 444 if (dio_bytes > delalloc_bytes) 445 wait_ordered = true; 446 447 loops = 0; 448 while ((delalloc_bytes || dio_bytes) && loops < 3) { 449 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 450 451 /* 452 * Triggers inode writeback for up to nr_pages. This will invoke 453 * ->writepages callback and trigger delalloc filling 454 * (btrfs_run_delalloc_range()). 455 */ 456 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 457 458 /* 459 * We need to wait for the compressed pages to start before 460 * we continue. 461 */ 462 async_pages = atomic_read(&fs_info->async_delalloc_pages); 463 if (!async_pages) 464 goto skip_async; 465 466 /* 467 * Calculate how many compressed pages we want to be written 468 * before we continue. I.e if there are more async pages than we 469 * require wait_event will wait until nr_pages are written. 470 */ 471 if (async_pages <= nr_pages) 472 async_pages = 0; 473 else 474 async_pages -= nr_pages; 475 476 wait_event(fs_info->async_submit_wait, 477 atomic_read(&fs_info->async_delalloc_pages) <= 478 (int)async_pages); 479 skip_async: 480 spin_lock(&space_info->lock); 481 if (list_empty(&space_info->tickets) && 482 list_empty(&space_info->priority_tickets)) { 483 spin_unlock(&space_info->lock); 484 break; 485 } 486 spin_unlock(&space_info->lock); 487 488 loops++; 489 if (wait_ordered && !trans) { 490 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 491 } else { 492 time_left = schedule_timeout_killable(1); 493 if (time_left) 494 break; 495 } 496 delalloc_bytes = percpu_counter_sum_positive( 497 &fs_info->delalloc_bytes); 498 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 499 } 500 } 501 502 /** 503 * maybe_commit_transaction - possibly commit the transaction if its ok to 504 * @root - the root we're allocating for 505 * @bytes - the number of bytes we want to reserve 506 * @force - force the commit 507 * 508 * This will check to make sure that committing the transaction will actually 509 * get us somewhere and then commit the transaction if it does. Otherwise it 510 * will return -ENOSPC. 511 */ 512 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 513 struct btrfs_space_info *space_info) 514 { 515 struct reserve_ticket *ticket = NULL; 516 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 517 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 518 struct btrfs_trans_handle *trans; 519 u64 bytes_needed; 520 u64 reclaim_bytes = 0; 521 522 trans = (struct btrfs_trans_handle *)current->journal_info; 523 if (trans) 524 return -EAGAIN; 525 526 spin_lock(&space_info->lock); 527 if (!list_empty(&space_info->priority_tickets)) 528 ticket = list_first_entry(&space_info->priority_tickets, 529 struct reserve_ticket, list); 530 else if (!list_empty(&space_info->tickets)) 531 ticket = list_first_entry(&space_info->tickets, 532 struct reserve_ticket, list); 533 bytes_needed = (ticket) ? ticket->bytes : 0; 534 spin_unlock(&space_info->lock); 535 536 if (!bytes_needed) 537 return 0; 538 539 trans = btrfs_join_transaction(fs_info->extent_root); 540 if (IS_ERR(trans)) 541 return PTR_ERR(trans); 542 543 /* 544 * See if there is enough pinned space to make this reservation, or if 545 * we have block groups that are going to be freed, allowing us to 546 * possibly do a chunk allocation the next loop through. 547 */ 548 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || 549 __percpu_counter_compare(&space_info->total_bytes_pinned, 550 bytes_needed, 551 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 552 goto commit; 553 554 /* 555 * See if there is some space in the delayed insertion reservation for 556 * this reservation. 557 */ 558 if (space_info != delayed_rsv->space_info) 559 goto enospc; 560 561 spin_lock(&delayed_rsv->lock); 562 reclaim_bytes += delayed_rsv->reserved; 563 spin_unlock(&delayed_rsv->lock); 564 565 spin_lock(&delayed_refs_rsv->lock); 566 reclaim_bytes += delayed_refs_rsv->reserved; 567 spin_unlock(&delayed_refs_rsv->lock); 568 if (reclaim_bytes >= bytes_needed) 569 goto commit; 570 bytes_needed -= reclaim_bytes; 571 572 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 573 bytes_needed, 574 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) 575 goto enospc; 576 577 commit: 578 return btrfs_commit_transaction(trans); 579 enospc: 580 btrfs_end_transaction(trans); 581 return -ENOSPC; 582 } 583 584 /* 585 * Try to flush some data based on policy set by @state. This is only advisory 586 * and may fail for various reasons. The caller is supposed to examine the 587 * state of @space_info to detect the outcome. 588 */ 589 static void flush_space(struct btrfs_fs_info *fs_info, 590 struct btrfs_space_info *space_info, u64 num_bytes, 591 int state) 592 { 593 struct btrfs_root *root = fs_info->extent_root; 594 struct btrfs_trans_handle *trans; 595 int nr; 596 int ret = 0; 597 598 switch (state) { 599 case FLUSH_DELAYED_ITEMS_NR: 600 case FLUSH_DELAYED_ITEMS: 601 if (state == FLUSH_DELAYED_ITEMS_NR) 602 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 603 else 604 nr = -1; 605 606 trans = btrfs_join_transaction(root); 607 if (IS_ERR(trans)) { 608 ret = PTR_ERR(trans); 609 break; 610 } 611 ret = btrfs_run_delayed_items_nr(trans, nr); 612 btrfs_end_transaction(trans); 613 break; 614 case FLUSH_DELALLOC: 615 case FLUSH_DELALLOC_WAIT: 616 shrink_delalloc(fs_info, num_bytes * 2, num_bytes, 617 state == FLUSH_DELALLOC_WAIT); 618 break; 619 case FLUSH_DELAYED_REFS_NR: 620 case FLUSH_DELAYED_REFS: 621 trans = btrfs_join_transaction(root); 622 if (IS_ERR(trans)) { 623 ret = PTR_ERR(trans); 624 break; 625 } 626 if (state == FLUSH_DELAYED_REFS_NR) 627 nr = calc_reclaim_items_nr(fs_info, num_bytes); 628 else 629 nr = 0; 630 btrfs_run_delayed_refs(trans, nr); 631 btrfs_end_transaction(trans); 632 break; 633 case ALLOC_CHUNK: 634 case ALLOC_CHUNK_FORCE: 635 trans = btrfs_join_transaction(root); 636 if (IS_ERR(trans)) { 637 ret = PTR_ERR(trans); 638 break; 639 } 640 ret = btrfs_chunk_alloc(trans, 641 btrfs_metadata_alloc_profile(fs_info), 642 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : 643 CHUNK_ALLOC_FORCE); 644 btrfs_end_transaction(trans); 645 if (ret > 0 || ret == -ENOSPC) 646 ret = 0; 647 break; 648 case RUN_DELAYED_IPUTS: 649 /* 650 * If we have pending delayed iputs then we could free up a 651 * bunch of pinned space, so make sure we run the iputs before 652 * we do our pinned bytes check below. 653 */ 654 btrfs_run_delayed_iputs(fs_info); 655 btrfs_wait_on_delayed_iputs(fs_info); 656 break; 657 case COMMIT_TRANS: 658 ret = may_commit_transaction(fs_info, space_info); 659 break; 660 default: 661 ret = -ENOSPC; 662 break; 663 } 664 665 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 666 ret); 667 return; 668 } 669 670 static inline u64 671 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 672 struct btrfs_space_info *space_info, 673 bool system_chunk) 674 { 675 struct reserve_ticket *ticket; 676 u64 used; 677 u64 expected; 678 u64 to_reclaim = 0; 679 680 list_for_each_entry(ticket, &space_info->tickets, list) 681 to_reclaim += ticket->bytes; 682 list_for_each_entry(ticket, &space_info->priority_tickets, list) 683 to_reclaim += ticket->bytes; 684 if (to_reclaim) 685 return to_reclaim; 686 687 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 688 if (can_overcommit(fs_info, space_info, to_reclaim, 689 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 690 return 0; 691 692 used = btrfs_space_info_used(space_info, true); 693 694 if (can_overcommit(fs_info, space_info, SZ_1M, 695 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 696 expected = div_factor_fine(space_info->total_bytes, 95); 697 else 698 expected = div_factor_fine(space_info->total_bytes, 90); 699 700 if (used > expected) 701 to_reclaim = used - expected; 702 else 703 to_reclaim = 0; 704 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 705 space_info->bytes_reserved); 706 return to_reclaim; 707 } 708 709 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, 710 struct btrfs_space_info *space_info, 711 u64 used, bool system_chunk) 712 { 713 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 714 715 /* If we're just plain full then async reclaim just slows us down. */ 716 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 717 return 0; 718 719 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, 720 system_chunk)) 721 return 0; 722 723 return (used >= thresh && !btrfs_fs_closing(fs_info) && 724 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 725 } 726 727 static bool wake_all_tickets(struct list_head *head) 728 { 729 struct reserve_ticket *ticket; 730 731 while (!list_empty(head)) { 732 ticket = list_first_entry(head, struct reserve_ticket, list); 733 list_del_init(&ticket->list); 734 ticket->error = -ENOSPC; 735 wake_up(&ticket->wait); 736 if (ticket->bytes != ticket->orig_bytes) 737 return true; 738 } 739 return false; 740 } 741 742 /* 743 * This is for normal flushers, we can wait all goddamned day if we want to. We 744 * will loop and continuously try to flush as long as we are making progress. 745 * We count progress as clearing off tickets each time we have to loop. 746 */ 747 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 748 { 749 struct btrfs_fs_info *fs_info; 750 struct btrfs_space_info *space_info; 751 u64 to_reclaim; 752 int flush_state; 753 int commit_cycles = 0; 754 u64 last_tickets_id; 755 756 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 757 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 758 759 spin_lock(&space_info->lock); 760 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 761 false); 762 if (!to_reclaim) { 763 space_info->flush = 0; 764 spin_unlock(&space_info->lock); 765 return; 766 } 767 last_tickets_id = space_info->tickets_id; 768 spin_unlock(&space_info->lock); 769 770 flush_state = FLUSH_DELAYED_ITEMS_NR; 771 do { 772 flush_space(fs_info, space_info, to_reclaim, flush_state); 773 spin_lock(&space_info->lock); 774 if (list_empty(&space_info->tickets)) { 775 space_info->flush = 0; 776 spin_unlock(&space_info->lock); 777 return; 778 } 779 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, 780 space_info, 781 false); 782 if (last_tickets_id == space_info->tickets_id) { 783 flush_state++; 784 } else { 785 last_tickets_id = space_info->tickets_id; 786 flush_state = FLUSH_DELAYED_ITEMS_NR; 787 if (commit_cycles) 788 commit_cycles--; 789 } 790 791 /* 792 * We don't want to force a chunk allocation until we've tried 793 * pretty hard to reclaim space. Think of the case where we 794 * freed up a bunch of space and so have a lot of pinned space 795 * to reclaim. We would rather use that than possibly create a 796 * underutilized metadata chunk. So if this is our first run 797 * through the flushing state machine skip ALLOC_CHUNK_FORCE and 798 * commit the transaction. If nothing has changed the next go 799 * around then we can force a chunk allocation. 800 */ 801 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) 802 flush_state++; 803 804 if (flush_state > COMMIT_TRANS) { 805 commit_cycles++; 806 if (commit_cycles > 2) { 807 if (wake_all_tickets(&space_info->tickets)) { 808 flush_state = FLUSH_DELAYED_ITEMS_NR; 809 commit_cycles--; 810 } else { 811 space_info->flush = 0; 812 } 813 } else { 814 flush_state = FLUSH_DELAYED_ITEMS_NR; 815 } 816 } 817 spin_unlock(&space_info->lock); 818 } while (flush_state <= COMMIT_TRANS); 819 } 820 821 void btrfs_init_async_reclaim_work(struct work_struct *work) 822 { 823 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 824 } 825 826 static const enum btrfs_flush_state priority_flush_states[] = { 827 FLUSH_DELAYED_ITEMS_NR, 828 FLUSH_DELAYED_ITEMS, 829 ALLOC_CHUNK, 830 }; 831 832 static const enum btrfs_flush_state evict_flush_states[] = { 833 FLUSH_DELAYED_ITEMS_NR, 834 FLUSH_DELAYED_ITEMS, 835 FLUSH_DELAYED_REFS_NR, 836 FLUSH_DELAYED_REFS, 837 FLUSH_DELALLOC, 838 FLUSH_DELALLOC_WAIT, 839 ALLOC_CHUNK, 840 COMMIT_TRANS, 841 }; 842 843 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 844 struct btrfs_space_info *space_info, 845 struct reserve_ticket *ticket, 846 const enum btrfs_flush_state *states, 847 int states_nr) 848 { 849 u64 to_reclaim; 850 int flush_state; 851 852 spin_lock(&space_info->lock); 853 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 854 false); 855 if (!to_reclaim) { 856 spin_unlock(&space_info->lock); 857 return; 858 } 859 spin_unlock(&space_info->lock); 860 861 flush_state = 0; 862 do { 863 flush_space(fs_info, space_info, to_reclaim, states[flush_state]); 864 flush_state++; 865 spin_lock(&space_info->lock); 866 if (ticket->bytes == 0) { 867 spin_unlock(&space_info->lock); 868 return; 869 } 870 spin_unlock(&space_info->lock); 871 } while (flush_state < states_nr); 872 } 873 874 static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, 875 struct btrfs_space_info *space_info, 876 struct reserve_ticket *ticket) 877 878 { 879 DEFINE_WAIT(wait); 880 int ret = 0; 881 882 spin_lock(&space_info->lock); 883 while (ticket->bytes > 0 && ticket->error == 0) { 884 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 885 if (ret) { 886 ticket->error = -EINTR; 887 break; 888 } 889 spin_unlock(&space_info->lock); 890 891 schedule(); 892 893 finish_wait(&ticket->wait, &wait); 894 spin_lock(&space_info->lock); 895 } 896 spin_unlock(&space_info->lock); 897 } 898 899 /** 900 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket 901 * @fs_info - the fs 902 * @space_info - the space_info for the reservation 903 * @ticket - the ticket for the reservation 904 * @flush - how much we can flush 905 * 906 * This does the work of figuring out how to flush for the ticket, waiting for 907 * the reservation, and returning the appropriate error if there is one. 908 */ 909 static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, 910 struct btrfs_space_info *space_info, 911 struct reserve_ticket *ticket, 912 enum btrfs_reserve_flush_enum flush) 913 { 914 u64 reclaim_bytes = 0; 915 int ret; 916 917 switch (flush) { 918 case BTRFS_RESERVE_FLUSH_ALL: 919 wait_reserve_ticket(fs_info, space_info, ticket); 920 break; 921 case BTRFS_RESERVE_FLUSH_LIMIT: 922 priority_reclaim_metadata_space(fs_info, space_info, ticket, 923 priority_flush_states, 924 ARRAY_SIZE(priority_flush_states)); 925 break; 926 case BTRFS_RESERVE_FLUSH_EVICT: 927 priority_reclaim_metadata_space(fs_info, space_info, ticket, 928 evict_flush_states, 929 ARRAY_SIZE(evict_flush_states)); 930 break; 931 default: 932 ASSERT(0); 933 break; 934 } 935 936 spin_lock(&space_info->lock); 937 ret = ticket->error; 938 if (ticket->bytes || ticket->error) { 939 if (ticket->bytes < ticket->orig_bytes) 940 reclaim_bytes = ticket->orig_bytes - ticket->bytes; 941 list_del_init(&ticket->list); 942 if (!ret) 943 ret = -ENOSPC; 944 } 945 spin_unlock(&space_info->lock); 946 947 if (reclaim_bytes) 948 btrfs_space_info_add_old_bytes(fs_info, space_info, 949 reclaim_bytes); 950 ASSERT(list_empty(&ticket->list)); 951 return ret; 952 } 953 954 /** 955 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 956 * @root - the root we're allocating for 957 * @space_info - the space info we want to allocate from 958 * @orig_bytes - the number of bytes we want 959 * @flush - whether or not we can flush to make our reservation 960 * 961 * This will reserve orig_bytes number of bytes from the space info associated 962 * with the block_rsv. If there is not enough space it will make an attempt to 963 * flush out space to make room. It will do this by flushing delalloc if 964 * possible or committing the transaction. If flush is 0 then no attempts to 965 * regain reservations will be made and this will fail if there is not enough 966 * space already. 967 */ 968 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 969 struct btrfs_space_info *space_info, 970 u64 orig_bytes, 971 enum btrfs_reserve_flush_enum flush, 972 bool system_chunk) 973 { 974 struct reserve_ticket ticket; 975 u64 used; 976 int ret = 0; 977 bool pending_tickets; 978 979 ASSERT(orig_bytes); 980 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 981 982 spin_lock(&space_info->lock); 983 ret = -ENOSPC; 984 used = btrfs_space_info_used(space_info, true); 985 pending_tickets = !list_empty(&space_info->tickets) || 986 !list_empty(&space_info->priority_tickets); 987 988 /* 989 * Carry on if we have enough space (short-circuit) OR call 990 * can_overcommit() to ensure we can overcommit to continue. 991 */ 992 if (!pending_tickets && 993 ((used + orig_bytes <= space_info->total_bytes) || 994 can_overcommit(fs_info, space_info, orig_bytes, flush, 995 system_chunk))) { 996 btrfs_space_info_update_bytes_may_use(fs_info, space_info, 997 orig_bytes); 998 ret = 0; 999 } 1000 1001 /* 1002 * If we couldn't make a reservation then setup our reservation ticket 1003 * and kick the async worker if it's not already running. 1004 * 1005 * If we are a priority flusher then we just need to add our ticket to 1006 * the list and we will do our own flushing further down. 1007 */ 1008 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 1009 ticket.orig_bytes = orig_bytes; 1010 ticket.bytes = orig_bytes; 1011 ticket.error = 0; 1012 init_waitqueue_head(&ticket.wait); 1013 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 1014 list_add_tail(&ticket.list, &space_info->tickets); 1015 if (!space_info->flush) { 1016 space_info->flush = 1; 1017 trace_btrfs_trigger_flush(fs_info, 1018 space_info->flags, 1019 orig_bytes, flush, 1020 "enospc"); 1021 queue_work(system_unbound_wq, 1022 &fs_info->async_reclaim_work); 1023 } 1024 } else { 1025 list_add_tail(&ticket.list, 1026 &space_info->priority_tickets); 1027 } 1028 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 1029 used += orig_bytes; 1030 /* 1031 * We will do the space reservation dance during log replay, 1032 * which means we won't have fs_info->fs_root set, so don't do 1033 * the async reclaim as we will panic. 1034 */ 1035 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 1036 need_do_async_reclaim(fs_info, space_info, 1037 used, system_chunk) && 1038 !work_busy(&fs_info->async_reclaim_work)) { 1039 trace_btrfs_trigger_flush(fs_info, space_info->flags, 1040 orig_bytes, flush, "preempt"); 1041 queue_work(system_unbound_wq, 1042 &fs_info->async_reclaim_work); 1043 } 1044 } 1045 spin_unlock(&space_info->lock); 1046 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 1047 return ret; 1048 1049 return handle_reserve_ticket(fs_info, space_info, &ticket, flush); 1050 } 1051 1052 /** 1053 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 1054 * @root - the root we're allocating for 1055 * @block_rsv - the block_rsv we're allocating for 1056 * @orig_bytes - the number of bytes we want 1057 * @flush - whether or not we can flush to make our reservation 1058 * 1059 * This will reserve orig_bytes number of bytes from the space info associated 1060 * with the block_rsv. If there is not enough space it will make an attempt to 1061 * flush out space to make room. It will do this by flushing delalloc if 1062 * possible or committing the transaction. If flush is 0 then no attempts to 1063 * regain reservations will be made and this will fail if there is not enough 1064 * space already. 1065 */ 1066 int btrfs_reserve_metadata_bytes(struct btrfs_root *root, 1067 struct btrfs_block_rsv *block_rsv, 1068 u64 orig_bytes, 1069 enum btrfs_reserve_flush_enum flush) 1070 { 1071 struct btrfs_fs_info *fs_info = root->fs_info; 1072 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 1073 int ret; 1074 bool system_chunk = (root == fs_info->chunk_root); 1075 1076 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, 1077 orig_bytes, flush, system_chunk); 1078 if (ret == -ENOSPC && 1079 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 1080 if (block_rsv != global_rsv && 1081 !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) 1082 ret = 0; 1083 } 1084 if (ret == -ENOSPC) { 1085 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 1086 block_rsv->space_info->flags, 1087 orig_bytes, 1); 1088 1089 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 1090 btrfs_dump_space_info(fs_info, block_rsv->space_info, 1091 orig_bytes, 0); 1092 } 1093 return ret; 1094 } 1095