// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "math.h"
#include "block-group.h"

u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
                          bool may_use_included)
{
        ASSERT(s_info);
        return s_info->bytes_used + s_info->bytes_reserved +
                s_info->bytes_pinned + s_info->bytes_readonly +
                (may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * After adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

static const char *alloc_name(u64 flags)
{
        switch (flags) {
        case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
                return "mixed";
        case BTRFS_BLOCK_GROUP_METADATA:
                return "metadata";
        case BTRFS_BLOCK_GROUP_DATA:
                return "data";
        case BTRFS_BLOCK_GROUP_SYSTEM:
                return "system";
        default:
                WARN_ON(1);
                return "invalid-combination";
        }
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
        struct btrfs_space_info *space_info;
        int i;
        int ret;

        space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
        if (!space_info)
                return -ENOMEM;

        ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
                                  GFP_KERNEL);
        if (ret) {
                kfree(space_info);
                return ret;
        }

        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                INIT_LIST_HEAD(&space_info->block_groups[i]);
        init_rwsem(&space_info->groups_sem);
        spin_lock_init(&space_info->lock);
        space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
        init_waitqueue_head(&space_info->wait);
        INIT_LIST_HEAD(&space_info->ro_bgs);
        INIT_LIST_HEAD(&space_info->tickets);
        INIT_LIST_HEAD(&space_info->priority_tickets);

        ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
                                   info->space_info_kobj, "%s",
                                   alloc_name(space_info->flags));
        if (ret) {
                kobject_put(&space_info->kobj);
                return ret;
        }

        list_add_rcu(&space_info->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = space_info;

        return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
        int mixed = 0;
        int ret;

        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return -EINVAL;

        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        flags = BTRFS_BLOCK_GROUP_SYSTEM;
        ret = create_space_info(fs_info, flags);
        if (ret)
                goto out;

        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
                ret = create_space_info(fs_info, flags);
                if (ret)
                        goto out;

                flags = BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        }
out:
        return ret;
}
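/*
 * Fold a newly added block group's space into the space_info that matches
 * @flags. disk_total and disk_used are scaled by the raid factor, since
 * e.g. DUP and RAID1 consume two bytes of raw disk space for every logical
 * byte. Whatever the new block group leaves unused is handed to any
 * waiting tickets via btrfs_space_info_add_new_bytes().
 */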
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
                             u64 total_bytes, u64 bytes_used,
                             u64 bytes_readonly,
                             struct btrfs_space_info **space_info)
{
        struct btrfs_space_info *found;
        int factor;

        factor = btrfs_bg_type_to_factor(flags);

        found = btrfs_find_space_info(info, flags);
        ASSERT(found);
        spin_lock(&found->lock);
        found->total_bytes += total_bytes;
        found->disk_total += total_bytes * factor;
        found->bytes_used += bytes_used;
        found->disk_used += bytes_used * factor;
        found->bytes_readonly += bytes_readonly;
        if (total_bytes > 0)
                found->full = 0;
        btrfs_space_info_add_new_bytes(info, found,
                                       total_bytes - bytes_used -
                                       bytes_readonly);
        spin_unlock(&found->lock);
        *space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
                                               u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
        return (global->size << 1);
}

static int can_overcommit(struct btrfs_fs_info *fs_info,
                          struct btrfs_space_info *space_info, u64 bytes,
                          enum btrfs_reserve_flush_enum flush,
                          bool system_chunk)
{
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 profile;
        u64 space_size;
        u64 avail;
        u64 used;
        int factor;

        /* Don't overcommit when in mixed mode. */
        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                return 0;

        if (system_chunk)
                profile = btrfs_system_alloc_profile(fs_info);
        else
                profile = btrfs_metadata_alloc_profile(fs_info);

        used = btrfs_space_info_used(space_info, false);

        /*
         * We only want to allow overcommitting if we have lots of actual
         * space free, but if we don't have enough space to handle the global
         * reserve space then we could end up having a real enospc problem
         * when trying to allocate a chunk or some other such important
         * allocation.
         */
        spin_lock(&global_rsv->lock);
        space_size = calc_global_rsv_need_space(global_rsv);
        spin_unlock(&global_rsv->lock);
        if (used + space_size >= space_info->total_bytes)
                return 0;

        used += space_info->bytes_may_use;

        avail = atomic64_read(&fs_info->free_chunk_space);

        /*
         * If we have dup, raid1 or raid10 then only half of the free
         * space is actually usable. For raid56, the space info used
         * doesn't include the parity drive, so we don't have to
         * change the math.
         */
        factor = btrfs_bg_type_to_factor(profile);
        avail = div_u64(avail, factor);

        /*
         * If we aren't flushing all things, let us overcommit up to
         * 1/2 of the space. If we can flush, don't let us overcommit
         * too much, let it overcommit up to 1/8 of the space.
         */
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;

        if (used + bytes < space_info->total_bytes + avail)
                return 1;
        return 0;
}
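/*
 * A worked overcommit example with illustrative numbers: on a RAID1
 * metadata profile the factor is 2, so 10GiB of free_chunk_space yields
 * avail = 5GiB of usable metadata space. A BTRFS_RESERVE_FLUSH_ALL
 * reservation may then overcommit by at most avail >> 3 = 640MiB beyond
 * total_bytes, while the weaker flush levels get avail >> 1 = 2.5GiB of
 * headroom.
 */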
/*
 * This is for space we already have accounted in space_info->bytes_may_use,
 * so basically when we're returning space from block_rsv's.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head;
        u64 used;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
        bool check_overcommit = false;

        spin_lock(&space_info->lock);
        head = &space_info->priority_tickets;

        /*
         * If we are over our limit then we need to check and see if we can
         * overcommit, and if we can't then we just need to free up our space
         * and not satisfy any requests.
         */
        used = btrfs_space_info_used(space_info, true);
        if (used - num_bytes >= space_info->total_bytes)
                check_overcommit = true;
again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                /*
                 * We use 0 bytes because this space is already reserved, so
                 * adding the ticket space would be a double count.
                 */
                if (check_overcommit &&
                    !can_overcommit(fs_info, space_info, 0, flush, false))
                        break;
                if (num_bytes >= ticket->bytes) {
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                flush = BTRFS_RESERVE_FLUSH_ALL;
                goto again;
        }
        btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
        trace_btrfs_space_reservation(fs_info, "space_info",
                                      space_info->flags, num_bytes, 0);
        spin_unlock(&space_info->lock);
}
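/*
 * Note the two-pass structure above: priority tickets are served first
 * with no flushing assumed (BTRFS_RESERVE_NO_FLUSH), then the normal
 * ticket list is walked with BTRFS_RESERVE_FLUSH_ALL, mirroring the way
 * __reserve_metadata_bytes() queues the tickets.
 */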
/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head = &space_info->priority_tickets;

again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                if (num_bytes >= ticket->bytes) {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      ticket->bytes, 1);
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              ticket->bytes);
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      num_bytes, 1);
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              num_bytes);
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                goto again;
        }
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
do {                                                                    \
        struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
        spin_lock(&__rsv->lock);                                        \
        btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
                   __rsv->size, __rsv->reserved);                       \
        spin_unlock(&__rsv->lock);                                      \
} while (0)
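/*
 * For example, DUMP_BLOCK_RSV(fs_info, global_block_rsv) takes that rsv's
 * spinlock and prints a line of the form (values illustrative):
 *
 *   BTRFS info (device sda): global_block_rsv: size 536870912 reserved 524288
 */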
"[readonly]" : ""); 400 btrfs_dump_free_space(cache, bytes); 401 spin_unlock(&cache->lock); 402 } 403 if (++index < BTRFS_NR_RAID_TYPES) 404 goto again; 405 up_read(&info->groups_sem); 406 } 407 408 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 409 unsigned long nr_pages, int nr_items) 410 { 411 struct super_block *sb = fs_info->sb; 412 413 if (down_read_trylock(&sb->s_umount)) { 414 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 415 up_read(&sb->s_umount); 416 } else { 417 /* 418 * We needn't worry the filesystem going from r/w to r/o though 419 * we don't acquire ->s_umount mutex, because the filesystem 420 * should guarantee the delalloc inodes list be empty after 421 * the filesystem is readonly(all dirty pages are written to 422 * the disk). 423 */ 424 btrfs_start_delalloc_roots(fs_info, nr_items); 425 if (!current->journal_info) 426 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 427 } 428 } 429 430 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 431 u64 to_reclaim) 432 { 433 u64 bytes; 434 u64 nr; 435 436 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 437 nr = div64_u64(to_reclaim, bytes); 438 if (!nr) 439 nr = 1; 440 return nr; 441 } 442 443 #define EXTENT_SIZE_PER_ITEM SZ_256K 444 445 /* 446 * shrink metadata reservation for delalloc 447 */ 448 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 449 u64 orig, bool wait_ordered) 450 { 451 struct btrfs_space_info *space_info; 452 struct btrfs_trans_handle *trans; 453 u64 delalloc_bytes; 454 u64 dio_bytes; 455 u64 async_pages; 456 u64 items; 457 long time_left; 458 unsigned long nr_pages; 459 int loops; 460 461 /* Calc the number of the pages we need flush for space reservation */ 462 items = calc_reclaim_items_nr(fs_info, to_reclaim); 463 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 464 465 trans = (struct btrfs_trans_handle *)current->journal_info; 466 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 467 468 delalloc_bytes = percpu_counter_sum_positive( 469 &fs_info->delalloc_bytes); 470 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); 471 if (delalloc_bytes == 0 && dio_bytes == 0) { 472 if (trans) 473 return; 474 if (wait_ordered) 475 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 476 return; 477 } 478 479 /* 480 * If we are doing more ordered than delalloc we need to just wait on 481 * ordered extents, otherwise we'll waste time trying to flush delalloc 482 * that likely won't give us the space back we need. 483 */ 484 if (dio_bytes > delalloc_bytes) 485 wait_ordered = true; 486 487 loops = 0; 488 while ((delalloc_bytes || dio_bytes) && loops < 3) { 489 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 490 491 /* 492 * Triggers inode writeback for up to nr_pages. This will invoke 493 * ->writepages callback and trigger delalloc filling 494 * (btrfs_run_delalloc_range()). 495 */ 496 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 497 498 /* 499 * We need to wait for the compressed pages to start before 500 * we continue. 501 */ 502 async_pages = atomic_read(&fs_info->async_delalloc_pages); 503 if (!async_pages) 504 goto skip_async; 505 506 /* 507 * Calculate how many compressed pages we want to be written 508 * before we continue. I.e if there are more async pages than we 509 * require wait_event will wait until nr_pages are written. 
/*
 * Shrink metadata reservations by flushing delalloc.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                            u64 orig, bool wait_ordered)
{
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
        u64 dio_bytes;
        u64 async_pages;
        u64 items;
        long time_left;
        unsigned long nr_pages;
        int loops;

        /* Calculate the number of items we need to flush for this reservation */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
        to_reclaim = items * EXTENT_SIZE_PER_ITEM;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
        dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        if (delalloc_bytes == 0 && dio_bytes == 0) {
                if (trans)
                        return;
                if (wait_ordered)
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                return;
        }

        /*
         * If we are doing more ordered than delalloc we need to just wait on
         * ordered extents, otherwise we'll waste time trying to flush delalloc
         * that likely won't give us the space back we need.
         */
        if (dio_bytes > delalloc_bytes)
                wait_ordered = true;

        loops = 0;
        while ((delalloc_bytes || dio_bytes) && loops < 3) {
                nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

                /*
                 * Trigger inode writeback for up to nr_pages. This will
                 * invoke the ->writepages callback and trigger delalloc
                 * filling (btrfs_run_delalloc_range()).
                 */
                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

                /*
                 * We need to wait for the compressed pages to start before
                 * we continue.
                 */
                async_pages = atomic_read(&fs_info->async_delalloc_pages);
                if (!async_pages)
                        goto skip_async;

                /*
                 * Calculate how many compressed pages we want to be written
                 * before we continue. I.e. if there are more async pages than
                 * we require, wait_event will wait until nr_pages are written.
                 */
                if (async_pages <= nr_pages)
                        async_pages = 0;
                else
                        async_pages -= nr_pages;

                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)async_pages);
skip_async:
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
                spin_unlock(&space_info->lock);

                loops++;
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
                delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
                dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        }
}
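/*
 * Note that when the caller already holds a transaction handle
 * (current->journal_info is set), shrink_delalloc() never waits on ordered
 * extents directly: completing an ordered extent may itself need to join
 * the running transaction, so blocking here could deadlock. It falls back
 * to a short killable sleep instead.
 */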
/**
 * may_commit_transaction - commit the transaction if it may free some space
 * @fs_info - the filesystem
 * @space_info - the space_info we are allocating for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
                                  struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket = NULL;
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_trans_handle *trans;
        u64 bytes_needed;
        u64 reclaim_bytes = 0;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        if (trans)
                return -EAGAIN;

        spin_lock(&space_info->lock);
        if (!list_empty(&space_info->priority_tickets))
                ticket = list_first_entry(&space_info->priority_tickets,
                                          struct reserve_ticket, list);
        else if (!list_empty(&space_info->tickets))
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
        bytes_needed = (ticket) ? ticket->bytes : 0;
        spin_unlock(&space_info->lock);

        if (!bytes_needed)
                return 0;

        trans = btrfs_join_transaction(fs_info->extent_root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        /*
         * See if there is enough pinned space to make this reservation, or if
         * we have block groups that are going to be freed, allowing us to
         * possibly do a chunk allocation the next loop through.
         */
        if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
            __percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;

        /*
         * See if there is some space in the delayed insertion reservation for
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
                goto enospc;

        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
        spin_unlock(&delayed_rsv->lock);

        spin_lock(&delayed_refs_rsv->lock);
        reclaim_bytes += delayed_refs_rsv->reserved;
        spin_unlock(&delayed_refs_rsv->lock);
        if (reclaim_bytes >= bytes_needed)
                goto commit;
        bytes_needed -= reclaim_bytes;

        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
                goto enospc;

commit:
        return btrfs_commit_transaction(trans);
enospc:
        btrfs_end_transaction(trans);
        return -ENOSPC;
}
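/*
 * The flush states handled below are tried roughly in order of increasing
 * cost: running delayed items, flushing (and optionally waiting on)
 * delalloc, running delayed refs, allocating a new chunk, and finally
 * committing the transaction. The async reclaim worker steps through them
 * one at a time and restarts from the cheapest state whenever a ticket is
 * satisfied.
 */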
/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
                        struct btrfs_space_info *space_info, u64 num_bytes,
                        int state)
{
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_trans_handle *trans;
        int nr;
        int ret = 0;

        switch (state) {
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                if (state == FLUSH_DELAYED_ITEMS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
                else
                        nr = -1;

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
                shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
                                state == FLUSH_DELALLOC_WAIT);
                break;
        case FLUSH_DELAYED_REFS_NR:
        case FLUSH_DELAYED_REFS:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                if (state == FLUSH_DELAYED_REFS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes);
                else
                        nr = 0;
                btrfs_run_delayed_refs(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
        case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_chunk_alloc(trans,
                                btrfs_metadata_alloc_profile(fs_info),
                                (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
                                        CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
                break;
        case COMMIT_TRANS:
                /*
                 * If we have pending delayed iputs then we could free up a
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
                btrfs_run_delayed_iputs(fs_info);
                btrfs_wait_on_delayed_iputs(fs_info);

                ret = may_commit_transaction(fs_info, space_info);
                break;
        default:
                ret = -ENOSPC;
                break;
        }

        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
                                ret);
}

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 bool system_chunk)
{
        struct reserve_ticket *ticket;
        u64 used;
        u64 expected;
        u64 to_reclaim = 0;

        list_for_each_entry(ticket, &space_info->tickets, list)
                to_reclaim += ticket->bytes;
        list_for_each_entry(ticket, &space_info->priority_tickets, list)
                to_reclaim += ticket->bytes;
        if (to_reclaim)
                return to_reclaim;

        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
        if (can_overcommit(fs_info, space_info, to_reclaim,
                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                return 0;

        used = btrfs_space_info_used(space_info, true);

        if (can_overcommit(fs_info, space_info, SZ_1M,
                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                expected = div_factor_fine(space_info->total_bytes, 95);
        else
                expected = div_factor_fine(space_info->total_bytes, 90);

        if (used > expected)
                to_reclaim = used - expected;
        else
                to_reclaim = 0;
        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
                                     space_info->bytes_reserved);
        return to_reclaim;
}

static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
                                        struct btrfs_space_info *space_info,
                                        u64 used, bool system_chunk)
{
        u64 thresh = div_factor_fine(space_info->total_bytes, 98);

        /* If we're just plain full then async reclaim just slows us down. */
        if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
                return 0;

        if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                              system_chunk))
                return 0;

        return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
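/*
 * Fail every ticket on @head with -ENOSPC and wake its waiter. Returns
 * true if any ticket had already been partially filled (bytes !=
 * orig_bytes), which the caller treats as progress worth another pass
 * through the flush state machine.
 */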
static bool wake_all_tickets(struct list_head *head)
{
        struct reserve_ticket *ticket;

        while (!list_empty(head)) {
                ticket = list_first_entry(head, struct reserve_ticket, list);
                list_del_init(&ticket->list);
                ticket->error = -ENOSPC;
                wake_up(&ticket->wait);
                if (ticket->bytes != ticket->orig_bytes)
                        return true;
        }
        return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 to_reclaim;
        int flush_state;
        int commit_cycles = 0;
        u64 last_tickets_id;

        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                space_info->flush = 0;
                spin_unlock(&space_info->lock);
                return;
        }
        last_tickets_id = space_info->tickets_id;
        spin_unlock(&space_info->lock);

        flush_state = FLUSH_DELAYED_ITEMS_NR;
        do {
                flush_space(fs_info, space_info, to_reclaim, flush_state);
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }
                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
                                                              space_info,
                                                              false);
                if (last_tickets_id == space_info->tickets_id) {
                        flush_state++;
                } else {
                        last_tickets_id = space_info->tickets_id;
                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                        if (commit_cycles)
                                commit_cycles--;
                }

                /*
                 * We don't want to force a chunk allocation until we've tried
                 * pretty hard to reclaim space. Think of the case where we
                 * freed up a bunch of space and so have a lot of pinned space
                 * to reclaim. We would rather use that than possibly create
                 * an underutilized metadata chunk. So if this is our first
                 * run through the flushing state machine skip
                 * ALLOC_CHUNK_FORCE and commit the transaction. If nothing
                 * has changed the next go around then we can force a chunk
                 * allocation.
                 */
                if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
                        flush_state++;

                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                if (wake_all_tickets(&space_info->tickets)) {
                                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                                        commit_cycles--;
                                } else {
                                        space_info->flush = 0;
                                }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
                }
                spin_unlock(&space_info->lock);
        } while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
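/*
 * Priority flushers run a shortened list of flush states in the reserving
 * task's own context, so the heavyweight states (waiting on delalloc,
 * running delayed refs, committing the transaction) are left to the async
 * reclaim worker.
 */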
static const enum btrfs_flush_state priority_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        ALLOC_CHUNK,
};

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                            struct btrfs_space_info *space_info,
                                            struct reserve_ticket *ticket)
{
        u64 to_reclaim;
        int flush_state;

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                spin_unlock(&space_info->lock);
                return;
        }
        spin_unlock(&space_info->lock);

        flush_state = 0;
        do {
                flush_space(fs_info, space_info, to_reclaim,
                            priority_flush_states[flush_state]);
                flush_state++;
                spin_lock(&space_info->lock);
                if (ticket->bytes == 0) {
                        spin_unlock(&space_info->lock);
                        return;
                }
                spin_unlock(&space_info->lock);
        } while (flush_state < ARRAY_SIZE(priority_flush_states));
}
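/*
 * Sleep until the ticket is either satisfied (bytes reaches 0) or failed
 * (error is set), dropping the space_info lock across each wait. If we are
 * interrupted or fail after partial progress, the bytes we did receive are
 * returned to the pool via btrfs_space_info_add_old_bytes().
 */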
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                               struct btrfs_space_info *space_info,
                               struct reserve_ticket *ticket)
{
        DEFINE_WAIT(wait);
        u64 reclaim_bytes = 0;
        int ret = 0;

        spin_lock(&space_info->lock);
        while (ticket->bytes > 0 && ticket->error == 0) {
                ret = prepare_to_wait_event(&ticket->wait, &wait,
                                            TASK_KILLABLE);
                if (ret) {
                        ret = -EINTR;
                        break;
                }
                spin_unlock(&space_info->lock);

                schedule();

                finish_wait(&ticket->wait, &wait);
                spin_lock(&space_info->lock);
        }
        if (!ret)
                ret = ticket->error;
        if (!list_empty(&ticket->list))
                list_del_init(&ticket->list);
        if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
                reclaim_bytes = ticket->orig_bytes - ticket->bytes;
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from a space_info
 * @fs_info - the filesystem
 * @space_info - the space_info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether we are allocating for the system chunk root
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If @flush is
 * BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations will be made
 * and this will fail if there is not enough space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 orig_bytes,
                                    enum btrfs_reserve_flush_enum flush,
                                    bool system_chunk)
{
        struct reserve_ticket ticket;
        u64 used;
        u64 reclaim_bytes = 0;
        int ret = 0;

        ASSERT(orig_bytes);
        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

        spin_lock(&space_info->lock);
        ret = -ENOSPC;
        used = btrfs_space_info_used(space_info, true);

        /*
         * Carry on if we have enough space (short-circuit) or call
         * can_overcommit() to ensure we can overcommit to continue.
         */
        if ((used + orig_bytes <= space_info->total_bytes) ||
            can_overcommit(fs_info, space_info, orig_bytes, flush,
                           system_chunk)) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes,
                                              1);
                ret = 0;
        }

        /*
         * If we couldn't make a reservation then setup our reservation ticket
         * and kick the async worker if it's not already running.
         *
         * If we are a priority flusher then we just need to add our ticket to
         * the list and we will do our own flushing further down.
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                ticket.orig_bytes = orig_bytes;
                ticket.bytes = orig_bytes;
                ticket.error = 0;
                init_waitqueue_head(&ticket.wait);
                if (flush == BTRFS_RESERVE_FLUSH_ALL) {
                        list_add_tail(&ticket.list, &space_info->tickets);
                        if (!space_info->flush) {
                                space_info->flush = 1;
                                trace_btrfs_trigger_flush(fs_info,
                                                          space_info->flags,
                                                          orig_bytes, flush,
                                                          "enospc");
                                queue_work(system_unbound_wq,
                                           &fs_info->async_reclaim_work);
                        }
                } else {
                        list_add_tail(&ticket.list,
                                      &space_info->priority_tickets);
                }
        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
                used += orig_bytes;
                /*
                 * We will do the space reservation dance during log replay,
                 * which means we won't have fs_info->fs_root set, so don't do
                 * the async reclaim as we will panic.
                 */
                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
                    need_do_async_reclaim(fs_info, space_info,
                                          used, system_chunk) &&
                    !work_busy(&fs_info->async_reclaim_work)) {
                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
                                                  orig_bytes, flush,
                                                  "preempt");
                        queue_work(system_unbound_wq,
                                   &fs_info->async_reclaim_work);
                }
        }
        spin_unlock(&space_info->lock);
        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                return ret;

        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                return wait_reserve_ticket(fs_info, space_info, &ticket);

        ret = 0;
        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
        spin_lock(&space_info->lock);
        if (ticket.bytes) {
                if (ticket.bytes < orig_bytes)
                        reclaim_bytes = orig_bytes - ticket.bytes;
                list_del_init(&ticket.list);
                ret = -ENOSPC;
        }
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        ASSERT(list_empty(&ticket.list));
        return ret;
}
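/*
 * Illustrative call chain (not a caller in this file): a task needing
 * metadata space for one tree item would do roughly
 *
 *      u64 bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 *      ret = btrfs_reserve_metadata_bytes(root, block_rsv, bytes,
 *                                         BTRFS_RESERVE_FLUSH_ALL);
 *
 * and on success account the reservation to its block_rsv.
 */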
/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's
 *                                space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If @flush is
 * BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations will be made
 * and this will fail if there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
                                 struct btrfs_block_rsv *block_rsv,
                                 u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        int ret;
        bool system_chunk = (root == fs_info->chunk_root);

        ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                       orig_bytes, flush, system_chunk);
        if (ret == -ENOSPC &&
            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
                if (block_rsv != global_rsv &&
                    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
        if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);

                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_dump_space_info(fs_info, block_rsv->space_info,
                                              orig_bytes, 0);
        }
        return ret;
}