1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/signal.h> 8 #include <linux/pagemap.h> 9 #include <linux/writeback.h> 10 #include <linux/blkdev.h> 11 #include <linux/sort.h> 12 #include <linux/rcupdate.h> 13 #include <linux/kthread.h> 14 #include <linux/slab.h> 15 #include <linux/ratelimit.h> 16 #include <linux/percpu_counter.h> 17 #include <linux/lockdep.h> 18 #include <linux/crc32c.h> 19 #include "tree-log.h" 20 #include "disk-io.h" 21 #include "print-tree.h" 22 #include "volumes.h" 23 #include "raid56.h" 24 #include "locking.h" 25 #include "free-space-cache.h" 26 #include "free-space-tree.h" 27 #include "math.h" 28 #include "sysfs.h" 29 #include "qgroup.h" 30 #include "ref-verify.h" 31 32 #undef SCRAMBLE_DELAYED_REFS 33 34 /* 35 * control flags for do_chunk_alloc's force field 36 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 37 * if we really need one. 38 * 39 * CHUNK_ALLOC_LIMITED means to only try and allocate one 40 * if we have very few chunks already allocated. This is 41 * used as part of the clustering code to help make sure 42 * we have a good pool of storage to cluster in, without 43 * filling the FS with empty chunks 44 * 45 * CHUNK_ALLOC_FORCE means it must try to allocate one 46 * 47 */ 48 enum { 49 CHUNK_ALLOC_NO_FORCE = 0, 50 CHUNK_ALLOC_LIMITED = 1, 51 CHUNK_ALLOC_FORCE = 2, 52 }; 53 54 /* 55 * Declare a helper function to detect underflow of various space info members 56 */ 57 #define DECLARE_SPACE_INFO_UPDATE(name) \ 58 static inline void update_##name(struct btrfs_space_info *sinfo, \ 59 s64 bytes) \ 60 { \ 61 if (bytes < 0 && sinfo->name < -bytes) { \ 62 WARN_ON(1); \ 63 sinfo->name = 0; \ 64 return; \ 65 } \ 66 sinfo->name += bytes; \ 67 } 68 69 DECLARE_SPACE_INFO_UPDATE(bytes_may_use); 70 DECLARE_SPACE_INFO_UPDATE(bytes_pinned); 71 72 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 73 struct btrfs_delayed_ref_node *node, u64 parent, 74 u64 root_objectid, u64 owner_objectid, 75 u64 owner_offset, int refs_to_drop, 76 struct btrfs_delayed_extent_op *extra_op); 77 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 78 struct extent_buffer *leaf, 79 struct btrfs_extent_item *ei); 80 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 81 u64 parent, u64 root_objectid, 82 u64 flags, u64 owner, u64 offset, 83 struct btrfs_key *ins, int ref_mod); 84 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 85 struct btrfs_delayed_ref_node *node, 86 struct btrfs_delayed_extent_op *extent_op); 87 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 88 int force); 89 static int find_next_key(struct btrfs_path *path, int level, 90 struct btrfs_key *key); 91 static void dump_space_info(struct btrfs_fs_info *fs_info, 92 struct btrfs_space_info *info, u64 bytes, 93 int dump_block_groups); 94 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 95 u64 num_bytes); 96 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 97 struct btrfs_space_info *space_info, 98 u64 num_bytes); 99 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 100 struct btrfs_space_info *space_info, 101 u64 num_bytes); 102 103 static noinline int 104 block_group_cache_done(struct btrfs_block_group_cache *cache) 105 { 106 smp_mb(); 107 return cache->cached == BTRFS_CACHE_FINISHED || 108 cache->cached == BTRFS_CACHE_ERROR; 109 } 110 111 static int 
block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 112 { 113 return (cache->flags & bits) == bits; 114 } 115 116 void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 117 { 118 atomic_inc(&cache->count); 119 } 120 121 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 122 { 123 if (atomic_dec_and_test(&cache->count)) { 124 WARN_ON(cache->pinned > 0); 125 WARN_ON(cache->reserved > 0); 126 127 /* 128 * If not empty, someone is still holding mutex of 129 * full_stripe_lock, which can only be released by caller. 130 * And it will definitely cause use-after-free when caller 131 * tries to release full stripe lock. 132 * 133 * No better way to resolve, but only to warn. 134 */ 135 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); 136 kfree(cache->free_space_ctl); 137 kfree(cache); 138 } 139 } 140 141 /* 142 * this adds the block group to the fs_info rb tree for the block group 143 * cache 144 */ 145 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 146 struct btrfs_block_group_cache *block_group) 147 { 148 struct rb_node **p; 149 struct rb_node *parent = NULL; 150 struct btrfs_block_group_cache *cache; 151 152 spin_lock(&info->block_group_cache_lock); 153 p = &info->block_group_cache_tree.rb_node; 154 155 while (*p) { 156 parent = *p; 157 cache = rb_entry(parent, struct btrfs_block_group_cache, 158 cache_node); 159 if (block_group->key.objectid < cache->key.objectid) { 160 p = &(*p)->rb_left; 161 } else if (block_group->key.objectid > cache->key.objectid) { 162 p = &(*p)->rb_right; 163 } else { 164 spin_unlock(&info->block_group_cache_lock); 165 return -EEXIST; 166 } 167 } 168 169 rb_link_node(&block_group->cache_node, parent, p); 170 rb_insert_color(&block_group->cache_node, 171 &info->block_group_cache_tree); 172 173 if (info->first_logical_byte > block_group->key.objectid) 174 info->first_logical_byte = block_group->key.objectid; 175 176 spin_unlock(&info->block_group_cache_lock); 177 178 return 0; 179 } 180 181 /* 182 * This will return the block group at or after bytenr if contains is 0, else 183 * it will return the block group that contains the bytenr 184 */ 185 static struct btrfs_block_group_cache * 186 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 187 int contains) 188 { 189 struct btrfs_block_group_cache *cache, *ret = NULL; 190 struct rb_node *n; 191 u64 end, start; 192 193 spin_lock(&info->block_group_cache_lock); 194 n = info->block_group_cache_tree.rb_node; 195 196 while (n) { 197 cache = rb_entry(n, struct btrfs_block_group_cache, 198 cache_node); 199 end = cache->key.objectid + cache->key.offset - 1; 200 start = cache->key.objectid; 201 202 if (bytenr < start) { 203 if (!contains && (!ret || start < ret->key.objectid)) 204 ret = cache; 205 n = n->rb_left; 206 } else if (bytenr > start) { 207 if (contains && bytenr <= end) { 208 ret = cache; 209 break; 210 } 211 n = n->rb_right; 212 } else { 213 ret = cache; 214 break; 215 } 216 } 217 if (ret) { 218 btrfs_get_block_group(ret); 219 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 220 info->first_logical_byte = ret->key.objectid; 221 } 222 spin_unlock(&info->block_group_cache_lock); 223 224 return ret; 225 } 226 227 static int add_excluded_extent(struct btrfs_fs_info *fs_info, 228 u64 start, u64 num_bytes) 229 { 230 u64 end = start + num_bytes - 1; 231 set_extent_bits(&fs_info->freed_extents[0], 232 start, end, EXTENT_UPTODATE); 233 set_extent_bits(&fs_info->freed_extents[1], 234 start, end, EXTENT_UPTODATE); 235 return 0; 
236 } 237 238 static void free_excluded_extents(struct btrfs_block_group_cache *cache) 239 { 240 struct btrfs_fs_info *fs_info = cache->fs_info; 241 u64 start, end; 242 243 start = cache->key.objectid; 244 end = start + cache->key.offset - 1; 245 246 clear_extent_bits(&fs_info->freed_extents[0], 247 start, end, EXTENT_UPTODATE); 248 clear_extent_bits(&fs_info->freed_extents[1], 249 start, end, EXTENT_UPTODATE); 250 } 251 252 static int exclude_super_stripes(struct btrfs_block_group_cache *cache) 253 { 254 struct btrfs_fs_info *fs_info = cache->fs_info; 255 u64 bytenr; 256 u64 *logical; 257 int stripe_len; 258 int i, nr, ret; 259 260 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 261 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 262 cache->bytes_super += stripe_len; 263 ret = add_excluded_extent(fs_info, cache->key.objectid, 264 stripe_len); 265 if (ret) 266 return ret; 267 } 268 269 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 270 bytenr = btrfs_sb_offset(i); 271 ret = btrfs_rmap_block(fs_info, cache->key.objectid, 272 bytenr, &logical, &nr, &stripe_len); 273 if (ret) 274 return ret; 275 276 while (nr--) { 277 u64 start, len; 278 279 if (logical[nr] > cache->key.objectid + 280 cache->key.offset) 281 continue; 282 283 if (logical[nr] + stripe_len <= cache->key.objectid) 284 continue; 285 286 start = logical[nr]; 287 if (start < cache->key.objectid) { 288 start = cache->key.objectid; 289 len = (logical[nr] + stripe_len) - start; 290 } else { 291 len = min_t(u64, stripe_len, 292 cache->key.objectid + 293 cache->key.offset - start); 294 } 295 296 cache->bytes_super += len; 297 ret = add_excluded_extent(fs_info, start, len); 298 if (ret) { 299 kfree(logical); 300 return ret; 301 } 302 } 303 304 kfree(logical); 305 } 306 return 0; 307 } 308 309 static struct btrfs_caching_control * 310 get_caching_control(struct btrfs_block_group_cache *cache) 311 { 312 struct btrfs_caching_control *ctl; 313 314 spin_lock(&cache->lock); 315 if (!cache->caching_ctl) { 316 spin_unlock(&cache->lock); 317 return NULL; 318 } 319 320 ctl = cache->caching_ctl; 321 refcount_inc(&ctl->count); 322 spin_unlock(&cache->lock); 323 return ctl; 324 } 325 326 static void put_caching_control(struct btrfs_caching_control *ctl) 327 { 328 if (refcount_dec_and_test(&ctl->count)) 329 kfree(ctl); 330 } 331 332 #ifdef CONFIG_BTRFS_DEBUG 333 static void fragment_free_space(struct btrfs_block_group_cache *block_group) 334 { 335 struct btrfs_fs_info *fs_info = block_group->fs_info; 336 u64 start = block_group->key.objectid; 337 u64 len = block_group->key.offset; 338 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 339 fs_info->nodesize : fs_info->sectorsize; 340 u64 step = chunk << 1; 341 342 while (len > chunk) { 343 btrfs_remove_free_space(block_group, start, chunk); 344 start += step; 345 if (len < step) 346 len = 0; 347 else 348 len -= step; 349 } 350 } 351 #endif 352 353 /* 354 * this is only called by cache_block_group, since we could have freed extents 355 * we need to check the pinned_extents for any extents that can't be used yet 356 * since their free space will be released as soon as the transaction commits. 
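 * The return value is the total amount of free space added, in bytes.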
357 */ 358 u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 359 u64 start, u64 end) 360 { 361 struct btrfs_fs_info *info = block_group->fs_info; 362 u64 extent_start, extent_end, size, total_added = 0; 363 int ret; 364 365 while (start < end) { 366 ret = find_first_extent_bit(info->pinned_extents, start, 367 &extent_start, &extent_end, 368 EXTENT_DIRTY | EXTENT_UPTODATE, 369 NULL); 370 if (ret) 371 break; 372 373 if (extent_start <= start) { 374 start = extent_end + 1; 375 } else if (extent_start > start && extent_start < end) { 376 size = extent_start - start; 377 total_added += size; 378 ret = btrfs_add_free_space(block_group, start, 379 size); 380 BUG_ON(ret); /* -ENOMEM or logic error */ 381 start = extent_end + 1; 382 } else { 383 break; 384 } 385 } 386 387 if (start < end) { 388 size = end - start; 389 total_added += size; 390 ret = btrfs_add_free_space(block_group, start, size); 391 BUG_ON(ret); /* -ENOMEM or logic error */ 392 } 393 394 return total_added; 395 } 396 397 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 398 { 399 struct btrfs_block_group_cache *block_group = caching_ctl->block_group; 400 struct btrfs_fs_info *fs_info = block_group->fs_info; 401 struct btrfs_root *extent_root = fs_info->extent_root; 402 struct btrfs_path *path; 403 struct extent_buffer *leaf; 404 struct btrfs_key key; 405 u64 total_found = 0; 406 u64 last = 0; 407 u32 nritems; 408 int ret; 409 bool wakeup = true; 410 411 path = btrfs_alloc_path(); 412 if (!path) 413 return -ENOMEM; 414 415 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 416 417 #ifdef CONFIG_BTRFS_DEBUG 418 /* 419 * If we're fragmenting we don't want to make anybody think we can 420 * allocate from this block group until we've had a chance to fragment 421 * the free space. 422 */ 423 if (btrfs_should_fragment_free_space(block_group)) 424 wakeup = false; 425 #endif 426 /* 427 * We don't want to deadlock with somebody trying to allocate a new 428 * extent for the extent root while also trying to search the extent 429 * root to add free space. 
So we skip locking and search the commit 430 * root, since its read-only 431 */ 432 path->skip_locking = 1; 433 path->search_commit_root = 1; 434 path->reada = READA_FORWARD; 435 436 key.objectid = last; 437 key.offset = 0; 438 key.type = BTRFS_EXTENT_ITEM_KEY; 439 440 next: 441 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 442 if (ret < 0) 443 goto out; 444 445 leaf = path->nodes[0]; 446 nritems = btrfs_header_nritems(leaf); 447 448 while (1) { 449 if (btrfs_fs_closing(fs_info) > 1) { 450 last = (u64)-1; 451 break; 452 } 453 454 if (path->slots[0] < nritems) { 455 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 456 } else { 457 ret = find_next_key(path, 0, &key); 458 if (ret) 459 break; 460 461 if (need_resched() || 462 rwsem_is_contended(&fs_info->commit_root_sem)) { 463 if (wakeup) 464 caching_ctl->progress = last; 465 btrfs_release_path(path); 466 up_read(&fs_info->commit_root_sem); 467 mutex_unlock(&caching_ctl->mutex); 468 cond_resched(); 469 mutex_lock(&caching_ctl->mutex); 470 down_read(&fs_info->commit_root_sem); 471 goto next; 472 } 473 474 ret = btrfs_next_leaf(extent_root, path); 475 if (ret < 0) 476 goto out; 477 if (ret) 478 break; 479 leaf = path->nodes[0]; 480 nritems = btrfs_header_nritems(leaf); 481 continue; 482 } 483 484 if (key.objectid < last) { 485 key.objectid = last; 486 key.offset = 0; 487 key.type = BTRFS_EXTENT_ITEM_KEY; 488 489 if (wakeup) 490 caching_ctl->progress = last; 491 btrfs_release_path(path); 492 goto next; 493 } 494 495 if (key.objectid < block_group->key.objectid) { 496 path->slots[0]++; 497 continue; 498 } 499 500 if (key.objectid >= block_group->key.objectid + 501 block_group->key.offset) 502 break; 503 504 if (key.type == BTRFS_EXTENT_ITEM_KEY || 505 key.type == BTRFS_METADATA_ITEM_KEY) { 506 total_found += add_new_free_space(block_group, last, 507 key.objectid); 508 if (key.type == BTRFS_METADATA_ITEM_KEY) 509 last = key.objectid + 510 fs_info->nodesize; 511 else 512 last = key.objectid + key.offset; 513 514 if (total_found > CACHING_CTL_WAKE_UP) { 515 total_found = 0; 516 if (wakeup) 517 wake_up(&caching_ctl->wait); 518 } 519 } 520 path->slots[0]++; 521 } 522 ret = 0; 523 524 total_found += add_new_free_space(block_group, last, 525 block_group->key.objectid + 526 block_group->key.offset); 527 caching_ctl->progress = (u64)-1; 528 529 out: 530 btrfs_free_path(path); 531 return ret; 532 } 533 534 static noinline void caching_thread(struct btrfs_work *work) 535 { 536 struct btrfs_block_group_cache *block_group; 537 struct btrfs_fs_info *fs_info; 538 struct btrfs_caching_control *caching_ctl; 539 int ret; 540 541 caching_ctl = container_of(work, struct btrfs_caching_control, work); 542 block_group = caching_ctl->block_group; 543 fs_info = block_group->fs_info; 544 545 mutex_lock(&caching_ctl->mutex); 546 down_read(&fs_info->commit_root_sem); 547 548 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 549 ret = load_free_space_tree(caching_ctl); 550 else 551 ret = load_extent_tree_free(caching_ctl); 552 553 spin_lock(&block_group->lock); 554 block_group->caching_ctl = NULL; 555 block_group->cached = ret ? 
				BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info and then some other thread
	 * starts a transaction commit which tries to do an allocation while
	 * the first thread is still loading the space cache info. The
	 * previous loop should have kept us from choosing this block group,
	 * but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load
	 * here, so we can wait for it to finish; otherwise we could end up
	 * allocating from a block group whose cache gets evicted for one
	 * reason or another.
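	 * The wait loop below sits out any fast load already in flight and
	 * then re-checks the cached state under cache->lock.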
618 */ 619 while (cache->cached == BTRFS_CACHE_FAST) { 620 struct btrfs_caching_control *ctl; 621 622 ctl = cache->caching_ctl; 623 refcount_inc(&ctl->count); 624 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 625 spin_unlock(&cache->lock); 626 627 schedule(); 628 629 finish_wait(&ctl->wait, &wait); 630 put_caching_control(ctl); 631 spin_lock(&cache->lock); 632 } 633 634 if (cache->cached != BTRFS_CACHE_NO) { 635 spin_unlock(&cache->lock); 636 kfree(caching_ctl); 637 return 0; 638 } 639 WARN_ON(cache->caching_ctl); 640 cache->caching_ctl = caching_ctl; 641 cache->cached = BTRFS_CACHE_FAST; 642 spin_unlock(&cache->lock); 643 644 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 645 mutex_lock(&caching_ctl->mutex); 646 ret = load_free_space_cache(fs_info, cache); 647 648 spin_lock(&cache->lock); 649 if (ret == 1) { 650 cache->caching_ctl = NULL; 651 cache->cached = BTRFS_CACHE_FINISHED; 652 cache->last_byte_to_unpin = (u64)-1; 653 caching_ctl->progress = (u64)-1; 654 } else { 655 if (load_cache_only) { 656 cache->caching_ctl = NULL; 657 cache->cached = BTRFS_CACHE_NO; 658 } else { 659 cache->cached = BTRFS_CACHE_STARTED; 660 cache->has_caching_ctl = 1; 661 } 662 } 663 spin_unlock(&cache->lock); 664 #ifdef CONFIG_BTRFS_DEBUG 665 if (ret == 1 && 666 btrfs_should_fragment_free_space(cache)) { 667 u64 bytes_used; 668 669 spin_lock(&cache->space_info->lock); 670 spin_lock(&cache->lock); 671 bytes_used = cache->key.offset - 672 btrfs_block_group_used(&cache->item); 673 cache->space_info->bytes_used += bytes_used >> 1; 674 spin_unlock(&cache->lock); 675 spin_unlock(&cache->space_info->lock); 676 fragment_free_space(cache); 677 } 678 #endif 679 mutex_unlock(&caching_ctl->mutex); 680 681 wake_up(&caching_ctl->wait); 682 if (ret == 1) { 683 put_caching_control(caching_ctl); 684 free_excluded_extents(cache); 685 return 0; 686 } 687 } else { 688 /* 689 * We're either using the free space tree or no caching at all. 690 * Set cached to the appropriate value and wakeup any waiters. 
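		 * (With the free space tree, the caching thread itself loads
		 * the free space, so there is no separate fast-load step to
		 * wait for here.)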
691 */ 692 spin_lock(&cache->lock); 693 if (load_cache_only) { 694 cache->caching_ctl = NULL; 695 cache->cached = BTRFS_CACHE_NO; 696 } else { 697 cache->cached = BTRFS_CACHE_STARTED; 698 cache->has_caching_ctl = 1; 699 } 700 spin_unlock(&cache->lock); 701 wake_up(&caching_ctl->wait); 702 } 703 704 if (load_cache_only) { 705 put_caching_control(caching_ctl); 706 return 0; 707 } 708 709 down_write(&fs_info->commit_root_sem); 710 refcount_inc(&caching_ctl->count); 711 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 712 up_write(&fs_info->commit_root_sem); 713 714 btrfs_get_block_group(cache); 715 716 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 717 718 return ret; 719 } 720 721 /* 722 * return the block group that starts at or after bytenr 723 */ 724 static struct btrfs_block_group_cache * 725 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 726 { 727 return block_group_cache_tree_search(info, bytenr, 0); 728 } 729 730 /* 731 * return the block group that contains the given bytenr 732 */ 733 struct btrfs_block_group_cache *btrfs_lookup_block_group( 734 struct btrfs_fs_info *info, 735 u64 bytenr) 736 { 737 return block_group_cache_tree_search(info, bytenr, 1); 738 } 739 740 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 741 u64 flags) 742 { 743 struct list_head *head = &info->space_info; 744 struct btrfs_space_info *found; 745 746 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 747 748 rcu_read_lock(); 749 list_for_each_entry_rcu(found, head, list) { 750 if (found->flags & flags) { 751 rcu_read_unlock(); 752 return found; 753 } 754 } 755 rcu_read_unlock(); 756 return NULL; 757 } 758 759 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, 760 bool metadata, u64 root_objectid) 761 { 762 struct btrfs_space_info *space_info; 763 u64 flags; 764 765 if (metadata) { 766 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 767 flags = BTRFS_BLOCK_GROUP_SYSTEM; 768 else 769 flags = BTRFS_BLOCK_GROUP_METADATA; 770 } else { 771 flags = BTRFS_BLOCK_GROUP_DATA; 772 } 773 774 space_info = __find_space_info(fs_info, flags); 775 ASSERT(space_info); 776 percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, 777 BTRFS_TOTAL_BYTES_PINNED_BATCH); 778 } 779 780 /* 781 * after adding space to the filesystem, we need to clear the full flags 782 * on all the space infos. 783 */ 784 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 785 { 786 struct list_head *head = &info->space_info; 787 struct btrfs_space_info *found; 788 789 rcu_read_lock(); 790 list_for_each_entry_rcu(found, head, list) 791 found->full = 0; 792 rcu_read_unlock(); 793 } 794 795 /* simple helper to search for an existing data extent at a given offset */ 796 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) 797 { 798 int ret; 799 struct btrfs_key key; 800 struct btrfs_path *path; 801 802 path = btrfs_alloc_path(); 803 if (!path) 804 return -ENOMEM; 805 806 key.objectid = start; 807 key.offset = len; 808 key.type = BTRFS_EXTENT_ITEM_KEY; 809 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 810 btrfs_free_path(path); 811 return ret; 812 } 813 814 /* 815 * helper function to lookup reference count and flags of a tree block. 816 * 817 * the head node for delayed ref is used to store the sum of all the 818 * reference count modifications queued up in the rbtree. the head 819 * node may also store the extent flags to set. 
This way you can check 820 * to see what the reference count and extent flags would be if all of 821 * the delayed refs are not processed. 822 */ 823 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 824 struct btrfs_fs_info *fs_info, u64 bytenr, 825 u64 offset, int metadata, u64 *refs, u64 *flags) 826 { 827 struct btrfs_delayed_ref_head *head; 828 struct btrfs_delayed_ref_root *delayed_refs; 829 struct btrfs_path *path; 830 struct btrfs_extent_item *ei; 831 struct extent_buffer *leaf; 832 struct btrfs_key key; 833 u32 item_size; 834 u64 num_refs; 835 u64 extent_flags; 836 int ret; 837 838 /* 839 * If we don't have skinny metadata, don't bother doing anything 840 * different 841 */ 842 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 843 offset = fs_info->nodesize; 844 metadata = 0; 845 } 846 847 path = btrfs_alloc_path(); 848 if (!path) 849 return -ENOMEM; 850 851 if (!trans) { 852 path->skip_locking = 1; 853 path->search_commit_root = 1; 854 } 855 856 search_again: 857 key.objectid = bytenr; 858 key.offset = offset; 859 if (metadata) 860 key.type = BTRFS_METADATA_ITEM_KEY; 861 else 862 key.type = BTRFS_EXTENT_ITEM_KEY; 863 864 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); 865 if (ret < 0) 866 goto out_free; 867 868 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 869 if (path->slots[0]) { 870 path->slots[0]--; 871 btrfs_item_key_to_cpu(path->nodes[0], &key, 872 path->slots[0]); 873 if (key.objectid == bytenr && 874 key.type == BTRFS_EXTENT_ITEM_KEY && 875 key.offset == fs_info->nodesize) 876 ret = 0; 877 } 878 } 879 880 if (ret == 0) { 881 leaf = path->nodes[0]; 882 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 883 if (item_size >= sizeof(*ei)) { 884 ei = btrfs_item_ptr(leaf, path->slots[0], 885 struct btrfs_extent_item); 886 num_refs = btrfs_extent_refs(leaf, ei); 887 extent_flags = btrfs_extent_flags(leaf, ei); 888 } else { 889 ret = -EINVAL; 890 btrfs_print_v0_err(fs_info); 891 if (trans) 892 btrfs_abort_transaction(trans, ret); 893 else 894 btrfs_handle_fs_error(fs_info, ret, NULL); 895 896 goto out_free; 897 } 898 899 BUG_ON(num_refs == 0); 900 } else { 901 num_refs = 0; 902 extent_flags = 0; 903 ret = 0; 904 } 905 906 if (!trans) 907 goto out; 908 909 delayed_refs = &trans->transaction->delayed_refs; 910 spin_lock(&delayed_refs->lock); 911 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 912 if (head) { 913 if (!mutex_trylock(&head->mutex)) { 914 refcount_inc(&head->refs); 915 spin_unlock(&delayed_refs->lock); 916 917 btrfs_release_path(path); 918 919 /* 920 * Mutex was contended, block until it's released and try 921 * again 922 */ 923 mutex_lock(&head->mutex); 924 mutex_unlock(&head->mutex); 925 btrfs_put_delayed_ref_head(head); 926 goto search_again; 927 } 928 spin_lock(&head->lock); 929 if (head->extent_op && head->extent_op->update_flags) 930 extent_flags |= head->extent_op->flags_to_set; 931 else 932 BUG_ON(num_refs == 0); 933 934 num_refs += head->ref_mod; 935 spin_unlock(&head->lock); 936 mutex_unlock(&head->mutex); 937 } 938 spin_unlock(&delayed_refs->lock); 939 out: 940 WARN_ON(num_refs == 0); 941 if (refs) 942 *refs = num_refs; 943 if (flags) 944 *flags = extent_flags; 945 out_free: 946 btrfs_free_path(path); 947 return ret; 948 } 949 950 /* 951 * Back reference rules. 
Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs is generic, and
 * can be used in all cases where the implicit back refs is used. The major
 * shortcoming of the full back refs is its overhead. Every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 *   owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 *   block's owner tree. In this case, full back refs is used for pointers
 *   in the block. Remove these full back refs, add implicit back refs for
 *   every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 *   the block's owner tree. In this case, implicit back refs is used for
 *   pointers in the block. Add full back refs for every pointer in the
 *   block, increase lower level extents' reference counts. The original
 *   implicit back refs are inherited by the new block.
 *
 * The reference count of the block is greater than one and the tree is
 *   not the block's owner tree. Add implicit back refs for every pointer in
 *   the new block, increase lower level extents' reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs is used
 * and the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
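				 * A zero or misaligned offset is rejected and
				 * reported as BTRFS_REF_TYPE_INVALID below.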
1095 */ 1096 if (offset && 1097 IS_ALIGNED(offset, eb->fs_info->nodesize)) 1098 return type; 1099 } 1100 } else { 1101 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 1102 return type; 1103 } 1104 } 1105 1106 btrfs_print_leaf((struct extent_buffer *)eb); 1107 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", 1108 eb->start, type); 1109 WARN_ON(1); 1110 1111 return BTRFS_REF_TYPE_INVALID; 1112 } 1113 1114 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1115 { 1116 u32 high_crc = ~(u32)0; 1117 u32 low_crc = ~(u32)0; 1118 __le64 lenum; 1119 1120 lenum = cpu_to_le64(root_objectid); 1121 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 1122 lenum = cpu_to_le64(owner); 1123 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1124 lenum = cpu_to_le64(offset); 1125 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1126 1127 return ((u64)high_crc << 31) ^ (u64)low_crc; 1128 } 1129 1130 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1131 struct btrfs_extent_data_ref *ref) 1132 { 1133 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1134 btrfs_extent_data_ref_objectid(leaf, ref), 1135 btrfs_extent_data_ref_offset(leaf, ref)); 1136 } 1137 1138 static int match_extent_data_ref(struct extent_buffer *leaf, 1139 struct btrfs_extent_data_ref *ref, 1140 u64 root_objectid, u64 owner, u64 offset) 1141 { 1142 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1143 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1144 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1145 return 0; 1146 return 1; 1147 } 1148 1149 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1150 struct btrfs_path *path, 1151 u64 bytenr, u64 parent, 1152 u64 root_objectid, 1153 u64 owner, u64 offset) 1154 { 1155 struct btrfs_root *root = trans->fs_info->extent_root; 1156 struct btrfs_key key; 1157 struct btrfs_extent_data_ref *ref; 1158 struct extent_buffer *leaf; 1159 u32 nritems; 1160 int ret; 1161 int recow; 1162 int err = -ENOENT; 1163 1164 key.objectid = bytenr; 1165 if (parent) { 1166 key.type = BTRFS_SHARED_DATA_REF_KEY; 1167 key.offset = parent; 1168 } else { 1169 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1170 key.offset = hash_extent_data_ref(root_objectid, 1171 owner, offset); 1172 } 1173 again: 1174 recow = 0; 1175 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1176 if (ret < 0) { 1177 err = ret; 1178 goto fail; 1179 } 1180 1181 if (parent) { 1182 if (!ret) 1183 return 0; 1184 goto fail; 1185 } 1186 1187 leaf = path->nodes[0]; 1188 nritems = btrfs_header_nritems(leaf); 1189 while (1) { 1190 if (path->slots[0] >= nritems) { 1191 ret = btrfs_next_leaf(root, path); 1192 if (ret < 0) 1193 err = ret; 1194 if (ret) 1195 goto fail; 1196 1197 leaf = path->nodes[0]; 1198 nritems = btrfs_header_nritems(leaf); 1199 recow = 1; 1200 } 1201 1202 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1203 if (key.objectid != bytenr || 1204 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1205 goto fail; 1206 1207 ref = btrfs_item_ptr(leaf, path->slots[0], 1208 struct btrfs_extent_data_ref); 1209 1210 if (match_extent_data_ref(leaf, ref, root_objectid, 1211 owner, offset)) { 1212 if (recow) { 1213 btrfs_release_path(path); 1214 goto again; 1215 } 1216 err = 0; 1217 break; 1218 } 1219 path->slots[0]++; 1220 } 1221 fail: 1222 return err; 1223 } 1224 1225 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1226 struct btrfs_path *path, 1227 u64 bytenr, u64 parent, 1228 u64 root_objectid, u64 owner, 1229 u64 offset, 
int refs_to_add) 1230 { 1231 struct btrfs_root *root = trans->fs_info->extent_root; 1232 struct btrfs_key key; 1233 struct extent_buffer *leaf; 1234 u32 size; 1235 u32 num_refs; 1236 int ret; 1237 1238 key.objectid = bytenr; 1239 if (parent) { 1240 key.type = BTRFS_SHARED_DATA_REF_KEY; 1241 key.offset = parent; 1242 size = sizeof(struct btrfs_shared_data_ref); 1243 } else { 1244 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1245 key.offset = hash_extent_data_ref(root_objectid, 1246 owner, offset); 1247 size = sizeof(struct btrfs_extent_data_ref); 1248 } 1249 1250 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1251 if (ret && ret != -EEXIST) 1252 goto fail; 1253 1254 leaf = path->nodes[0]; 1255 if (parent) { 1256 struct btrfs_shared_data_ref *ref; 1257 ref = btrfs_item_ptr(leaf, path->slots[0], 1258 struct btrfs_shared_data_ref); 1259 if (ret == 0) { 1260 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1261 } else { 1262 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1263 num_refs += refs_to_add; 1264 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1265 } 1266 } else { 1267 struct btrfs_extent_data_ref *ref; 1268 while (ret == -EEXIST) { 1269 ref = btrfs_item_ptr(leaf, path->slots[0], 1270 struct btrfs_extent_data_ref); 1271 if (match_extent_data_ref(leaf, ref, root_objectid, 1272 owner, offset)) 1273 break; 1274 btrfs_release_path(path); 1275 key.offset++; 1276 ret = btrfs_insert_empty_item(trans, root, path, &key, 1277 size); 1278 if (ret && ret != -EEXIST) 1279 goto fail; 1280 1281 leaf = path->nodes[0]; 1282 } 1283 ref = btrfs_item_ptr(leaf, path->slots[0], 1284 struct btrfs_extent_data_ref); 1285 if (ret == 0) { 1286 btrfs_set_extent_data_ref_root(leaf, ref, 1287 root_objectid); 1288 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1289 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1290 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1291 } else { 1292 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1293 num_refs += refs_to_add; 1294 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1295 } 1296 } 1297 btrfs_mark_buffer_dirty(leaf); 1298 ret = 0; 1299 fail: 1300 btrfs_release_path(path); 1301 return ret; 1302 } 1303 1304 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1305 struct btrfs_path *path, 1306 int refs_to_drop, int *last_ref) 1307 { 1308 struct btrfs_key key; 1309 struct btrfs_extent_data_ref *ref1 = NULL; 1310 struct btrfs_shared_data_ref *ref2 = NULL; 1311 struct extent_buffer *leaf; 1312 u32 num_refs = 0; 1313 int ret = 0; 1314 1315 leaf = path->nodes[0]; 1316 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1317 1318 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1319 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1320 struct btrfs_extent_data_ref); 1321 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1322 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1323 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1324 struct btrfs_shared_data_ref); 1325 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1326 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 1327 btrfs_print_v0_err(trans->fs_info); 1328 btrfs_abort_transaction(trans, -EINVAL); 1329 return -EINVAL; 1330 } else { 1331 BUG(); 1332 } 1333 1334 BUG_ON(num_refs < refs_to_drop); 1335 num_refs -= refs_to_drop; 1336 1337 if (num_refs == 0) { 1338 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1339 *last_ref = 1; 1340 } else { 1341 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1342 btrfs_set_extent_data_ref_count(leaf, 
ref1, num_refs); 1343 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1344 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1345 btrfs_mark_buffer_dirty(leaf); 1346 } 1347 return ret; 1348 } 1349 1350 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 1351 struct btrfs_extent_inline_ref *iref) 1352 { 1353 struct btrfs_key key; 1354 struct extent_buffer *leaf; 1355 struct btrfs_extent_data_ref *ref1; 1356 struct btrfs_shared_data_ref *ref2; 1357 u32 num_refs = 0; 1358 int type; 1359 1360 leaf = path->nodes[0]; 1361 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1362 1363 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 1364 if (iref) { 1365 /* 1366 * If type is invalid, we should have bailed out earlier than 1367 * this call. 1368 */ 1369 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 1370 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1371 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1372 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1373 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1374 } else { 1375 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1376 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1377 } 1378 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1379 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1380 struct btrfs_extent_data_ref); 1381 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1382 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1383 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1384 struct btrfs_shared_data_ref); 1385 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1386 } else { 1387 WARN_ON(1); 1388 } 1389 return num_refs; 1390 } 1391 1392 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1393 struct btrfs_path *path, 1394 u64 bytenr, u64 parent, 1395 u64 root_objectid) 1396 { 1397 struct btrfs_root *root = trans->fs_info->extent_root; 1398 struct btrfs_key key; 1399 int ret; 1400 1401 key.objectid = bytenr; 1402 if (parent) { 1403 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1404 key.offset = parent; 1405 } else { 1406 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1407 key.offset = root_objectid; 1408 } 1409 1410 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1411 if (ret > 0) 1412 ret = -ENOENT; 1413 return ret; 1414 } 1415 1416 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1417 struct btrfs_path *path, 1418 u64 bytenr, u64 parent, 1419 u64 root_objectid) 1420 { 1421 struct btrfs_key key; 1422 int ret; 1423 1424 key.objectid = bytenr; 1425 if (parent) { 1426 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1427 key.offset = parent; 1428 } else { 1429 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1430 key.offset = root_objectid; 1431 } 1432 1433 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, 1434 path, &key, 0); 1435 btrfs_release_path(path); 1436 return ret; 1437 } 1438 1439 static inline int extent_ref_type(u64 parent, u64 owner) 1440 { 1441 int type; 1442 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1443 if (parent > 0) 1444 type = BTRFS_SHARED_BLOCK_REF_KEY; 1445 else 1446 type = BTRFS_TREE_BLOCK_REF_KEY; 1447 } else { 1448 if (parent > 0) 1449 type = BTRFS_SHARED_DATA_REF_KEY; 1450 else 1451 type = BTRFS_EXTENT_DATA_REF_KEY; 1452 } 1453 return type; 1454 } 1455 1456 static int find_next_key(struct btrfs_path *path, int level, 1457 struct btrfs_key *key) 1458 1459 { 1460 for (; level < BTRFS_MAX_LEVEL; level++) { 1461 if (!path->nodes[level]) 1462 break; 1463 if (path->slots[level] + 1 >= 1464 btrfs_header_nritems(path->nodes[level])) 1465 
continue; 1466 if (level == 0) 1467 btrfs_item_key_to_cpu(path->nodes[level], key, 1468 path->slots[level] + 1); 1469 else 1470 btrfs_node_key_to_cpu(path->nodes[level], key, 1471 path->slots[level] + 1); 1472 return 0; 1473 } 1474 return 1; 1475 } 1476 1477 /* 1478 * look for inline back ref. if back ref is found, *ref_ret is set 1479 * to the address of inline back ref, and 0 is returned. 1480 * 1481 * if back ref isn't found, *ref_ret is set to the address where it 1482 * should be inserted, and -ENOENT is returned. 1483 * 1484 * if insert is true and there are too many inline back refs, the path 1485 * points to the extent item, and -EAGAIN is returned. 1486 * 1487 * NOTE: inline back refs are ordered in the same way that back ref 1488 * items in the tree are ordered. 1489 */ 1490 static noinline_for_stack 1491 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1492 struct btrfs_path *path, 1493 struct btrfs_extent_inline_ref **ref_ret, 1494 u64 bytenr, u64 num_bytes, 1495 u64 parent, u64 root_objectid, 1496 u64 owner, u64 offset, int insert) 1497 { 1498 struct btrfs_fs_info *fs_info = trans->fs_info; 1499 struct btrfs_root *root = fs_info->extent_root; 1500 struct btrfs_key key; 1501 struct extent_buffer *leaf; 1502 struct btrfs_extent_item *ei; 1503 struct btrfs_extent_inline_ref *iref; 1504 u64 flags; 1505 u64 item_size; 1506 unsigned long ptr; 1507 unsigned long end; 1508 int extra_size; 1509 int type; 1510 int want; 1511 int ret; 1512 int err = 0; 1513 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 1514 int needed; 1515 1516 key.objectid = bytenr; 1517 key.type = BTRFS_EXTENT_ITEM_KEY; 1518 key.offset = num_bytes; 1519 1520 want = extent_ref_type(parent, owner); 1521 if (insert) { 1522 extra_size = btrfs_extent_inline_ref_size(want); 1523 path->keep_locks = 1; 1524 } else 1525 extra_size = -1; 1526 1527 /* 1528 * Owner is our level, so we can just add one to get the level for the 1529 * block we are interested in. 1530 */ 1531 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1532 key.type = BTRFS_METADATA_ITEM_KEY; 1533 key.offset = owner; 1534 } 1535 1536 again: 1537 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1538 if (ret < 0) { 1539 err = ret; 1540 goto out; 1541 } 1542 1543 /* 1544 * We may be a newly converted file system which still has the old fat 1545 * extent entries for metadata, so try and see if we have one of those. 
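	 * (If not found, the search is retried with a regular
	 * BTRFS_EXTENT_ITEM_KEY.)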
1546 */ 1547 if (ret > 0 && skinny_metadata) { 1548 skinny_metadata = false; 1549 if (path->slots[0]) { 1550 path->slots[0]--; 1551 btrfs_item_key_to_cpu(path->nodes[0], &key, 1552 path->slots[0]); 1553 if (key.objectid == bytenr && 1554 key.type == BTRFS_EXTENT_ITEM_KEY && 1555 key.offset == num_bytes) 1556 ret = 0; 1557 } 1558 if (ret) { 1559 key.objectid = bytenr; 1560 key.type = BTRFS_EXTENT_ITEM_KEY; 1561 key.offset = num_bytes; 1562 btrfs_release_path(path); 1563 goto again; 1564 } 1565 } 1566 1567 if (ret && !insert) { 1568 err = -ENOENT; 1569 goto out; 1570 } else if (WARN_ON(ret)) { 1571 err = -EIO; 1572 goto out; 1573 } 1574 1575 leaf = path->nodes[0]; 1576 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1577 if (unlikely(item_size < sizeof(*ei))) { 1578 err = -EINVAL; 1579 btrfs_print_v0_err(fs_info); 1580 btrfs_abort_transaction(trans, err); 1581 goto out; 1582 } 1583 1584 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1585 flags = btrfs_extent_flags(leaf, ei); 1586 1587 ptr = (unsigned long)(ei + 1); 1588 end = (unsigned long)ei + item_size; 1589 1590 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1591 ptr += sizeof(struct btrfs_tree_block_info); 1592 BUG_ON(ptr > end); 1593 } 1594 1595 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 1596 needed = BTRFS_REF_TYPE_DATA; 1597 else 1598 needed = BTRFS_REF_TYPE_BLOCK; 1599 1600 err = -ENOENT; 1601 while (1) { 1602 if (ptr >= end) { 1603 WARN_ON(ptr > end); 1604 break; 1605 } 1606 iref = (struct btrfs_extent_inline_ref *)ptr; 1607 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 1608 if (type == BTRFS_REF_TYPE_INVALID) { 1609 err = -EUCLEAN; 1610 goto out; 1611 } 1612 1613 if (want < type) 1614 break; 1615 if (want > type) { 1616 ptr += btrfs_extent_inline_ref_size(type); 1617 continue; 1618 } 1619 1620 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1621 struct btrfs_extent_data_ref *dref; 1622 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1623 if (match_extent_data_ref(leaf, dref, root_objectid, 1624 owner, offset)) { 1625 err = 0; 1626 break; 1627 } 1628 if (hash_extent_data_ref_item(leaf, dref) < 1629 hash_extent_data_ref(root_objectid, owner, offset)) 1630 break; 1631 } else { 1632 u64 ref_offset; 1633 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1634 if (parent > 0) { 1635 if (parent == ref_offset) { 1636 err = 0; 1637 break; 1638 } 1639 if (ref_offset < parent) 1640 break; 1641 } else { 1642 if (root_objectid == ref_offset) { 1643 err = 0; 1644 break; 1645 } 1646 if (ref_offset < root_objectid) 1647 break; 1648 } 1649 } 1650 ptr += btrfs_extent_inline_ref_size(type); 1651 } 1652 if (err == -ENOENT && insert) { 1653 if (item_size + extra_size >= 1654 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1655 err = -EAGAIN; 1656 goto out; 1657 } 1658 /* 1659 * To add new inline back ref, we have to make sure 1660 * there is no corresponding back ref item. 
1661 * For simplicity, we just do not add new inline back 1662 * ref if there is any kind of item for this block 1663 */ 1664 if (find_next_key(path, 0, &key) == 0 && 1665 key.objectid == bytenr && 1666 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1667 err = -EAGAIN; 1668 goto out; 1669 } 1670 } 1671 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1672 out: 1673 if (insert) { 1674 path->keep_locks = 0; 1675 btrfs_unlock_up_safe(path, 1); 1676 } 1677 return err; 1678 } 1679 1680 /* 1681 * helper to add new inline back ref 1682 */ 1683 static noinline_for_stack 1684 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 1685 struct btrfs_path *path, 1686 struct btrfs_extent_inline_ref *iref, 1687 u64 parent, u64 root_objectid, 1688 u64 owner, u64 offset, int refs_to_add, 1689 struct btrfs_delayed_extent_op *extent_op) 1690 { 1691 struct extent_buffer *leaf; 1692 struct btrfs_extent_item *ei; 1693 unsigned long ptr; 1694 unsigned long end; 1695 unsigned long item_offset; 1696 u64 refs; 1697 int size; 1698 int type; 1699 1700 leaf = path->nodes[0]; 1701 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1702 item_offset = (unsigned long)iref - (unsigned long)ei; 1703 1704 type = extent_ref_type(parent, owner); 1705 size = btrfs_extent_inline_ref_size(type); 1706 1707 btrfs_extend_item(fs_info, path, size); 1708 1709 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1710 refs = btrfs_extent_refs(leaf, ei); 1711 refs += refs_to_add; 1712 btrfs_set_extent_refs(leaf, ei, refs); 1713 if (extent_op) 1714 __run_delayed_extent_op(extent_op, leaf, ei); 1715 1716 ptr = (unsigned long)ei + item_offset; 1717 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1718 if (ptr < end - size) 1719 memmove_extent_buffer(leaf, ptr + size, ptr, 1720 end - size - ptr); 1721 1722 iref = (struct btrfs_extent_inline_ref *)ptr; 1723 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1724 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1725 struct btrfs_extent_data_ref *dref; 1726 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1727 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1728 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1729 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1730 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1731 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1732 struct btrfs_shared_data_ref *sref; 1733 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1734 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1735 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1736 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1737 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1738 } else { 1739 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1740 } 1741 btrfs_mark_buffer_dirty(leaf); 1742 } 1743 1744 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1745 struct btrfs_path *path, 1746 struct btrfs_extent_inline_ref **ref_ret, 1747 u64 bytenr, u64 num_bytes, u64 parent, 1748 u64 root_objectid, u64 owner, u64 offset) 1749 { 1750 int ret; 1751 1752 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1753 num_bytes, parent, root_objectid, 1754 owner, offset, 0); 1755 if (ret != -ENOENT) 1756 return ret; 1757 1758 btrfs_release_path(path); 1759 *ref_ret = NULL; 1760 1761 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1762 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1763 root_objectid); 1764 } else { 1765 ret = lookup_extent_data_ref(trans, 
path, bytenr, parent, 1766 root_objectid, owner, offset); 1767 } 1768 return ret; 1769 } 1770 1771 /* 1772 * helper to update/remove inline back ref 1773 */ 1774 static noinline_for_stack 1775 void update_inline_extent_backref(struct btrfs_path *path, 1776 struct btrfs_extent_inline_ref *iref, 1777 int refs_to_mod, 1778 struct btrfs_delayed_extent_op *extent_op, 1779 int *last_ref) 1780 { 1781 struct extent_buffer *leaf = path->nodes[0]; 1782 struct btrfs_fs_info *fs_info = leaf->fs_info; 1783 struct btrfs_extent_item *ei; 1784 struct btrfs_extent_data_ref *dref = NULL; 1785 struct btrfs_shared_data_ref *sref = NULL; 1786 unsigned long ptr; 1787 unsigned long end; 1788 u32 item_size; 1789 int size; 1790 int type; 1791 u64 refs; 1792 1793 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1794 refs = btrfs_extent_refs(leaf, ei); 1795 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1796 refs += refs_to_mod; 1797 btrfs_set_extent_refs(leaf, ei, refs); 1798 if (extent_op) 1799 __run_delayed_extent_op(extent_op, leaf, ei); 1800 1801 /* 1802 * If type is invalid, we should have bailed out after 1803 * lookup_inline_extent_backref(). 1804 */ 1805 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1806 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1807 1808 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1809 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1810 refs = btrfs_extent_data_ref_count(leaf, dref); 1811 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1812 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1813 refs = btrfs_shared_data_ref_count(leaf, sref); 1814 } else { 1815 refs = 1; 1816 BUG_ON(refs_to_mod != -1); 1817 } 1818 1819 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1820 refs += refs_to_mod; 1821 1822 if (refs > 0) { 1823 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1824 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1825 else 1826 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1827 } else { 1828 *last_ref = 1; 1829 size = btrfs_extent_inline_ref_size(type); 1830 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1831 ptr = (unsigned long)iref; 1832 end = (unsigned long)ei + item_size; 1833 if (ptr + size < end) 1834 memmove_extent_buffer(leaf, ptr, ptr + size, 1835 end - ptr - size); 1836 item_size -= size; 1837 btrfs_truncate_item(fs_info, path, item_size, 1); 1838 } 1839 btrfs_mark_buffer_dirty(leaf); 1840 } 1841 1842 static noinline_for_stack 1843 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1844 struct btrfs_path *path, 1845 u64 bytenr, u64 num_bytes, u64 parent, 1846 u64 root_objectid, u64 owner, 1847 u64 offset, int refs_to_add, 1848 struct btrfs_delayed_extent_op *extent_op) 1849 { 1850 struct btrfs_extent_inline_ref *iref; 1851 int ret; 1852 1853 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1854 num_bytes, parent, root_objectid, 1855 owner, offset, 1); 1856 if (ret == 0) { 1857 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1858 update_inline_extent_backref(path, iref, refs_to_add, 1859 extent_op, NULL); 1860 } else if (ret == -ENOENT) { 1861 setup_inline_extent_backref(trans->fs_info, path, iref, parent, 1862 root_objectid, owner, offset, 1863 refs_to_add, extent_op); 1864 ret = 0; 1865 } 1866 return ret; 1867 } 1868 1869 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1870 struct btrfs_path *path, 1871 u64 bytenr, u64 parent, u64 root_objectid, 1872 u64 owner, u64 offset, int refs_to_add) 1873 { 1874 int ret; 1875 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1876 
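		/* Tree block references are only ever added one at a time. */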
BUG_ON(refs_to_add != 1); 1877 ret = insert_tree_block_ref(trans, path, bytenr, parent, 1878 root_objectid); 1879 } else { 1880 ret = insert_extent_data_ref(trans, path, bytenr, parent, 1881 root_objectid, owner, offset, 1882 refs_to_add); 1883 } 1884 return ret; 1885 } 1886 1887 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1888 struct btrfs_path *path, 1889 struct btrfs_extent_inline_ref *iref, 1890 int refs_to_drop, int is_data, int *last_ref) 1891 { 1892 int ret = 0; 1893 1894 BUG_ON(!is_data && refs_to_drop != 1); 1895 if (iref) { 1896 update_inline_extent_backref(path, iref, -refs_to_drop, NULL, 1897 last_ref); 1898 } else if (is_data) { 1899 ret = remove_extent_data_ref(trans, path, refs_to_drop, 1900 last_ref); 1901 } else { 1902 *last_ref = 1; 1903 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1904 } 1905 return ret; 1906 } 1907 1908 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1909 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1910 u64 *discarded_bytes) 1911 { 1912 int j, ret = 0; 1913 u64 bytes_left, end; 1914 u64 aligned_start = ALIGN(start, 1 << 9); 1915 1916 if (WARN_ON(start != aligned_start)) { 1917 len -= aligned_start - start; 1918 len = round_down(len, 1 << 9); 1919 start = aligned_start; 1920 } 1921 1922 *discarded_bytes = 0; 1923 1924 if (!len) 1925 return 0; 1926 1927 end = start + len; 1928 bytes_left = len; 1929 1930 /* Skip any superblocks on this device. */ 1931 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1932 u64 sb_start = btrfs_sb_offset(j); 1933 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1934 u64 size = sb_start - start; 1935 1936 if (!in_range(sb_start, start, bytes_left) && 1937 !in_range(sb_end, start, bytes_left) && 1938 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1939 continue; 1940 1941 /* 1942 * Superblock spans beginning of range. Adjust start and 1943 * try again. 1944 */ 1945 if (sb_start <= start) { 1946 start += sb_end - start; 1947 if (start > end) { 1948 bytes_left = 0; 1949 break; 1950 } 1951 bytes_left = end - start; 1952 continue; 1953 } 1954 1955 if (size) { 1956 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 1957 GFP_NOFS, 0); 1958 if (!ret) 1959 *discarded_bytes += size; 1960 else if (ret != -EOPNOTSUPP) 1961 return ret; 1962 } 1963 1964 start = sb_end; 1965 if (start > end) { 1966 bytes_left = 0; 1967 break; 1968 } 1969 bytes_left = end - start; 1970 } 1971 1972 if (bytes_left) { 1973 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 1974 GFP_NOFS, 0); 1975 if (!ret) 1976 *discarded_bytes += bytes_left; 1977 } 1978 return ret; 1979 } 1980 1981 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1982 u64 num_bytes, u64 *actual_bytes) 1983 { 1984 int ret; 1985 u64 discarded_bytes = 0; 1986 struct btrfs_bio *bbio = NULL; 1987 1988 1989 /* 1990 * Avoid races with device replace and make sure our bbio has devices 1991 * associated to its stripes that don't go away while we are discarding. 
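	 * The counter taken here is dropped again once all stripes have been
	 * processed.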
1992 */ 1993 btrfs_bio_counter_inc_blocked(fs_info); 1994 /* Tell the block device(s) that the sectors can be discarded */ 1995 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 1996 &bbio, 0); 1997 /* Error condition is -ENOMEM */ 1998 if (!ret) { 1999 struct btrfs_bio_stripe *stripe = bbio->stripes; 2000 int i; 2001 2002 2003 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2004 u64 bytes; 2005 struct request_queue *req_q; 2006 2007 if (!stripe->dev->bdev) { 2008 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 2009 continue; 2010 } 2011 req_q = bdev_get_queue(stripe->dev->bdev); 2012 if (!blk_queue_discard(req_q)) 2013 continue; 2014 2015 ret = btrfs_issue_discard(stripe->dev->bdev, 2016 stripe->physical, 2017 stripe->length, 2018 &bytes); 2019 if (!ret) 2020 discarded_bytes += bytes; 2021 else if (ret != -EOPNOTSUPP) 2022 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 2023 2024 /* 2025 * Just in case we get back EOPNOTSUPP for some reason, 2026 * just ignore the return value so we don't screw up 2027 * people calling discard_extent. 2028 */ 2029 ret = 0; 2030 } 2031 btrfs_put_bbio(bbio); 2032 } 2033 btrfs_bio_counter_dec(fs_info); 2034 2035 if (actual_bytes) 2036 *actual_bytes = discarded_bytes; 2037 2038 2039 if (ret == -EOPNOTSUPP) 2040 ret = 0; 2041 return ret; 2042 } 2043 2044 /* Can return -ENOMEM */ 2045 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2046 struct btrfs_root *root, 2047 u64 bytenr, u64 num_bytes, u64 parent, 2048 u64 root_objectid, u64 owner, u64 offset) 2049 { 2050 struct btrfs_fs_info *fs_info = root->fs_info; 2051 int old_ref_mod, new_ref_mod; 2052 int ret; 2053 2054 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2055 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2056 2057 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, 2058 owner, offset, BTRFS_ADD_DELAYED_REF); 2059 2060 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2061 ret = btrfs_add_delayed_tree_ref(trans, bytenr, 2062 num_bytes, parent, 2063 root_objectid, (int)owner, 2064 BTRFS_ADD_DELAYED_REF, NULL, 2065 &old_ref_mod, &new_ref_mod); 2066 } else { 2067 ret = btrfs_add_delayed_data_ref(trans, bytenr, 2068 num_bytes, parent, 2069 root_objectid, owner, offset, 2070 0, BTRFS_ADD_DELAYED_REF, 2071 &old_ref_mod, &new_ref_mod); 2072 } 2073 2074 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) { 2075 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; 2076 2077 add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid); 2078 } 2079 2080 return ret; 2081 } 2082 2083 /* 2084 * __btrfs_inc_extent_ref - insert backreference for a given extent 2085 * 2086 * @trans: Handle of transaction 2087 * 2088 * @node: The delayed ref node used to get the bytenr/length for 2089 * extent whose references are incremented. 2090 * 2091 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ 2092 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical 2093 * bytenr of the parent block. Since new extents are always 2094 * created with indirect references, this will only be the case 2095 * when relocating a shared extent. In that case, root_objectid 2096 * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must 2097 * be 0 2098 * 2099 * @root_objectid: The id of the root where this modification has originated, 2100 * this can be either one of the well-known metadata trees or 2101 * the subvolume id which references this extent. 2102 * 2103 * @owner: For data extents it is the inode number of the owning file. 
2104 * For metadata extents this parameter holds the level in the 2105 * tree of the extent. 2106 * 2107 * @offset: For metadata extents the offset is ignored and is currently 2108 * always passed as 0. For data extents it is the fileoffset 2109 * this extent belongs to. 2110 * 2111 * @refs_to_add Number of references to add 2112 * 2113 * @extent_op Pointer to a structure, holding information necessary when 2114 * updating a tree block's flags 2115 * 2116 */ 2117 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2118 struct btrfs_delayed_ref_node *node, 2119 u64 parent, u64 root_objectid, 2120 u64 owner, u64 offset, int refs_to_add, 2121 struct btrfs_delayed_extent_op *extent_op) 2122 { 2123 struct btrfs_path *path; 2124 struct extent_buffer *leaf; 2125 struct btrfs_extent_item *item; 2126 struct btrfs_key key; 2127 u64 bytenr = node->bytenr; 2128 u64 num_bytes = node->num_bytes; 2129 u64 refs; 2130 int ret; 2131 2132 path = btrfs_alloc_path(); 2133 if (!path) 2134 return -ENOMEM; 2135 2136 path->reada = READA_FORWARD; 2137 path->leave_spinning = 1; 2138 /* this will setup the path even if it fails to insert the back ref */ 2139 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, 2140 parent, root_objectid, owner, 2141 offset, refs_to_add, extent_op); 2142 if ((ret < 0 && ret != -EAGAIN) || !ret) 2143 goto out; 2144 2145 /* 2146 * Ok we had -EAGAIN which means we didn't have space to insert and 2147 * inline extent ref, so just update the reference count and add a 2148 * normal backref. 2149 */ 2150 leaf = path->nodes[0]; 2151 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2152 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2153 refs = btrfs_extent_refs(leaf, item); 2154 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2155 if (extent_op) 2156 __run_delayed_extent_op(extent_op, leaf, item); 2157 2158 btrfs_mark_buffer_dirty(leaf); 2159 btrfs_release_path(path); 2160 2161 path->reada = READA_FORWARD; 2162 path->leave_spinning = 1; 2163 /* now insert the actual backref */ 2164 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, 2165 owner, offset, refs_to_add); 2166 if (ret) 2167 btrfs_abort_transaction(trans, ret); 2168 out: 2169 btrfs_free_path(path); 2170 return ret; 2171 } 2172 2173 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2174 struct btrfs_delayed_ref_node *node, 2175 struct btrfs_delayed_extent_op *extent_op, 2176 int insert_reserved) 2177 { 2178 int ret = 0; 2179 struct btrfs_delayed_data_ref *ref; 2180 struct btrfs_key ins; 2181 u64 parent = 0; 2182 u64 ref_root = 0; 2183 u64 flags = 0; 2184 2185 ins.objectid = node->bytenr; 2186 ins.offset = node->num_bytes; 2187 ins.type = BTRFS_EXTENT_ITEM_KEY; 2188 2189 ref = btrfs_delayed_node_to_data_ref(node); 2190 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); 2191 2192 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2193 parent = ref->parent; 2194 ref_root = ref->root; 2195 2196 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2197 if (extent_op) 2198 flags |= extent_op->flags_to_set; 2199 ret = alloc_reserved_file_extent(trans, parent, ref_root, 2200 flags, ref->objectid, 2201 ref->offset, &ins, 2202 node->ref_mod); 2203 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2204 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2205 ref->objectid, ref->offset, 2206 node->ref_mod, extent_op); 2207 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2208 ret = __btrfs_free_extent(trans, 
node, parent, 2209 ref_root, ref->objectid, 2210 ref->offset, node->ref_mod, 2211 extent_op); 2212 } else { 2213 BUG(); 2214 } 2215 return ret; 2216 } 2217 2218 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2219 struct extent_buffer *leaf, 2220 struct btrfs_extent_item *ei) 2221 { 2222 u64 flags = btrfs_extent_flags(leaf, ei); 2223 if (extent_op->update_flags) { 2224 flags |= extent_op->flags_to_set; 2225 btrfs_set_extent_flags(leaf, ei, flags); 2226 } 2227 2228 if (extent_op->update_key) { 2229 struct btrfs_tree_block_info *bi; 2230 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2231 bi = (struct btrfs_tree_block_info *)(ei + 1); 2232 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2233 } 2234 } 2235 2236 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2237 struct btrfs_delayed_ref_head *head, 2238 struct btrfs_delayed_extent_op *extent_op) 2239 { 2240 struct btrfs_fs_info *fs_info = trans->fs_info; 2241 struct btrfs_key key; 2242 struct btrfs_path *path; 2243 struct btrfs_extent_item *ei; 2244 struct extent_buffer *leaf; 2245 u32 item_size; 2246 int ret; 2247 int err = 0; 2248 int metadata = !extent_op->is_data; 2249 2250 if (trans->aborted) 2251 return 0; 2252 2253 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2254 metadata = 0; 2255 2256 path = btrfs_alloc_path(); 2257 if (!path) 2258 return -ENOMEM; 2259 2260 key.objectid = head->bytenr; 2261 2262 if (metadata) { 2263 key.type = BTRFS_METADATA_ITEM_KEY; 2264 key.offset = extent_op->level; 2265 } else { 2266 key.type = BTRFS_EXTENT_ITEM_KEY; 2267 key.offset = head->num_bytes; 2268 } 2269 2270 again: 2271 path->reada = READA_FORWARD; 2272 path->leave_spinning = 1; 2273 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2274 if (ret < 0) { 2275 err = ret; 2276 goto out; 2277 } 2278 if (ret > 0) { 2279 if (metadata) { 2280 if (path->slots[0] > 0) { 2281 path->slots[0]--; 2282 btrfs_item_key_to_cpu(path->nodes[0], &key, 2283 path->slots[0]); 2284 if (key.objectid == head->bytenr && 2285 key.type == BTRFS_EXTENT_ITEM_KEY && 2286 key.offset == head->num_bytes) 2287 ret = 0; 2288 } 2289 if (ret > 0) { 2290 btrfs_release_path(path); 2291 metadata = 0; 2292 2293 key.objectid = head->bytenr; 2294 key.offset = head->num_bytes; 2295 key.type = BTRFS_EXTENT_ITEM_KEY; 2296 goto again; 2297 } 2298 } else { 2299 err = -EIO; 2300 goto out; 2301 } 2302 } 2303 2304 leaf = path->nodes[0]; 2305 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2306 2307 if (unlikely(item_size < sizeof(*ei))) { 2308 err = -EINVAL; 2309 btrfs_print_v0_err(fs_info); 2310 btrfs_abort_transaction(trans, err); 2311 goto out; 2312 } 2313 2314 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2315 __run_delayed_extent_op(extent_op, leaf, ei); 2316 2317 btrfs_mark_buffer_dirty(leaf); 2318 out: 2319 btrfs_free_path(path); 2320 return err; 2321 } 2322 2323 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2324 struct btrfs_delayed_ref_node *node, 2325 struct btrfs_delayed_extent_op *extent_op, 2326 int insert_reserved) 2327 { 2328 int ret = 0; 2329 struct btrfs_delayed_tree_ref *ref; 2330 u64 parent = 0; 2331 u64 ref_root = 0; 2332 2333 ref = btrfs_delayed_node_to_tree_ref(node); 2334 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); 2335 2336 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2337 parent = ref->parent; 2338 ref_root = ref->root; 2339 2340 if (node->ref_mod != 1) { 2341 btrfs_err(trans->fs_info, 2342 "btree block(%llu) 
has %d references rather than 1: action %d ref_root %llu parent %llu", 2343 node->bytenr, node->ref_mod, node->action, ref_root, 2344 parent); 2345 return -EIO; 2346 } 2347 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2348 BUG_ON(!extent_op || !extent_op->update_flags); 2349 ret = alloc_reserved_tree_block(trans, node, extent_op); 2350 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2351 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2352 ref->level, 0, 1, extent_op); 2353 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2354 ret = __btrfs_free_extent(trans, node, parent, ref_root, 2355 ref->level, 0, 1, extent_op); 2356 } else { 2357 BUG(); 2358 } 2359 return ret; 2360 } 2361 2362 /* helper function to actually process a single delayed ref entry */ 2363 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2364 struct btrfs_delayed_ref_node *node, 2365 struct btrfs_delayed_extent_op *extent_op, 2366 int insert_reserved) 2367 { 2368 int ret = 0; 2369 2370 if (trans->aborted) { 2371 if (insert_reserved) 2372 btrfs_pin_extent(trans->fs_info, node->bytenr, 2373 node->num_bytes, 1); 2374 return 0; 2375 } 2376 2377 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2378 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2379 ret = run_delayed_tree_ref(trans, node, extent_op, 2380 insert_reserved); 2381 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2382 node->type == BTRFS_SHARED_DATA_REF_KEY) 2383 ret = run_delayed_data_ref(trans, node, extent_op, 2384 insert_reserved); 2385 else 2386 BUG(); 2387 if (ret && insert_reserved) 2388 btrfs_pin_extent(trans->fs_info, node->bytenr, 2389 node->num_bytes, 1); 2390 return ret; 2391 } 2392 2393 static inline struct btrfs_delayed_ref_node * 2394 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2395 { 2396 struct btrfs_delayed_ref_node *ref; 2397 2398 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 2399 return NULL; 2400 2401 /* 2402 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2403 * This is to prevent a ref count from going down to zero, which deletes 2404 * the extent item from the extent tree, when there still are references 2405 * to add, which would fail because they would not find the extent item. 
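	 * The ->ref_add_list is expected to hold only refs with an ADD action,
	 * so it is safe to pick from it first.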
2406 */ 2407 if (!list_empty(&head->ref_add_list)) 2408 return list_first_entry(&head->ref_add_list, 2409 struct btrfs_delayed_ref_node, add_list); 2410 2411 ref = rb_entry(rb_first_cached(&head->ref_tree), 2412 struct btrfs_delayed_ref_node, ref_node); 2413 ASSERT(list_empty(&ref->add_list)); 2414 return ref; 2415 } 2416 2417 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 2418 struct btrfs_delayed_ref_head *head) 2419 { 2420 spin_lock(&delayed_refs->lock); 2421 head->processing = 0; 2422 delayed_refs->num_heads_ready++; 2423 spin_unlock(&delayed_refs->lock); 2424 btrfs_delayed_ref_unlock(head); 2425 } 2426 2427 static struct btrfs_delayed_extent_op *cleanup_extent_op( 2428 struct btrfs_delayed_ref_head *head) 2429 { 2430 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 2431 2432 if (!extent_op) 2433 return NULL; 2434 2435 if (head->must_insert_reserved) { 2436 head->extent_op = NULL; 2437 btrfs_free_delayed_extent_op(extent_op); 2438 return NULL; 2439 } 2440 return extent_op; 2441 } 2442 2443 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 2444 struct btrfs_delayed_ref_head *head) 2445 { 2446 struct btrfs_delayed_extent_op *extent_op; 2447 int ret; 2448 2449 extent_op = cleanup_extent_op(head); 2450 if (!extent_op) 2451 return 0; 2452 head->extent_op = NULL; 2453 spin_unlock(&head->lock); 2454 ret = run_delayed_extent_op(trans, head, extent_op); 2455 btrfs_free_delayed_extent_op(extent_op); 2456 return ret ? ret : 1; 2457 } 2458 2459 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, 2460 struct btrfs_delayed_ref_root *delayed_refs, 2461 struct btrfs_delayed_ref_head *head) 2462 { 2463 int nr_items = 1; /* Dropping this ref head update. */ 2464 2465 if (head->total_ref_mod < 0) { 2466 struct btrfs_space_info *space_info; 2467 u64 flags; 2468 2469 if (head->is_data) 2470 flags = BTRFS_BLOCK_GROUP_DATA; 2471 else if (head->is_system) 2472 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2473 else 2474 flags = BTRFS_BLOCK_GROUP_METADATA; 2475 space_info = __find_space_info(fs_info, flags); 2476 ASSERT(space_info); 2477 percpu_counter_add_batch(&space_info->total_bytes_pinned, 2478 -head->num_bytes, 2479 BTRFS_TOTAL_BYTES_PINNED_BATCH); 2480 2481 /* 2482 * We had csum deletions accounted for in our delayed refs rsv, 2483 * we need to drop the csum leaves for this update from our 2484 * delayed_refs_rsv. 2485 */ 2486 if (head->is_data) { 2487 spin_lock(&delayed_refs->lock); 2488 delayed_refs->pending_csums -= head->num_bytes; 2489 spin_unlock(&delayed_refs->lock); 2490 nr_items += btrfs_csum_bytes_to_leaves(fs_info, 2491 head->num_bytes); 2492 } 2493 } 2494 2495 btrfs_delayed_refs_rsv_release(fs_info, nr_items); 2496 } 2497 2498 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 2499 struct btrfs_delayed_ref_head *head) 2500 { 2501 2502 struct btrfs_fs_info *fs_info = trans->fs_info; 2503 struct btrfs_delayed_ref_root *delayed_refs; 2504 int ret; 2505 2506 delayed_refs = &trans->transaction->delayed_refs; 2507 2508 ret = run_and_cleanup_extent_op(trans, head); 2509 if (ret < 0) { 2510 unselect_delayed_ref_head(delayed_refs, head); 2511 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2512 return ret; 2513 } else if (ret) { 2514 return ret; 2515 } 2516 2517 /* 2518 * Need to drop our head ref lock and re-acquire the delayed ref lock 2519 * and then re-check to make sure nobody got added. 
2520 */ 2521 spin_unlock(&head->lock); 2522 spin_lock(&delayed_refs->lock); 2523 spin_lock(&head->lock); 2524 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 2525 spin_unlock(&head->lock); 2526 spin_unlock(&delayed_refs->lock); 2527 return 1; 2528 } 2529 btrfs_delete_ref_head(delayed_refs, head); 2530 spin_unlock(&head->lock); 2531 spin_unlock(&delayed_refs->lock); 2532 2533 if (head->must_insert_reserved) { 2534 btrfs_pin_extent(fs_info, head->bytenr, 2535 head->num_bytes, 1); 2536 if (head->is_data) { 2537 ret = btrfs_del_csums(trans, fs_info, head->bytenr, 2538 head->num_bytes); 2539 } 2540 } 2541 2542 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 2543 2544 trace_run_delayed_ref_head(fs_info, head, 0); 2545 btrfs_delayed_ref_unlock(head); 2546 btrfs_put_delayed_ref_head(head); 2547 return 0; 2548 } 2549 2550 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( 2551 struct btrfs_trans_handle *trans) 2552 { 2553 struct btrfs_delayed_ref_root *delayed_refs = 2554 &trans->transaction->delayed_refs; 2555 struct btrfs_delayed_ref_head *head = NULL; 2556 int ret; 2557 2558 spin_lock(&delayed_refs->lock); 2559 head = btrfs_select_ref_head(delayed_refs); 2560 if (!head) { 2561 spin_unlock(&delayed_refs->lock); 2562 return head; 2563 } 2564 2565 /* 2566 * Grab the lock that says we are going to process all the refs for 2567 * this head 2568 */ 2569 ret = btrfs_delayed_ref_lock(delayed_refs, head); 2570 spin_unlock(&delayed_refs->lock); 2571 2572 /* 2573 * We may have dropped the spin lock to get the head mutex lock, and 2574 * that might have given someone else time to free the head. If that's 2575 * true, it has been removed from our list and we can move on. 2576 */ 2577 if (ret == -EAGAIN) 2578 head = ERR_PTR(-EAGAIN); 2579 2580 return head; 2581 } 2582 2583 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 2584 struct btrfs_delayed_ref_head *locked_ref, 2585 unsigned long *run_refs) 2586 { 2587 struct btrfs_fs_info *fs_info = trans->fs_info; 2588 struct btrfs_delayed_ref_root *delayed_refs; 2589 struct btrfs_delayed_extent_op *extent_op; 2590 struct btrfs_delayed_ref_node *ref; 2591 int must_insert_reserved = 0; 2592 int ret; 2593 2594 delayed_refs = &trans->transaction->delayed_refs; 2595 2596 lockdep_assert_held(&locked_ref->mutex); 2597 lockdep_assert_held(&locked_ref->lock); 2598 2599 while ((ref = select_delayed_ref(locked_ref))) { 2600 if (ref->seq && 2601 btrfs_check_delayed_seq(fs_info, ref->seq)) { 2602 spin_unlock(&locked_ref->lock); 2603 unselect_delayed_ref_head(delayed_refs, locked_ref); 2604 return -EAGAIN; 2605 } 2606 2607 (*run_refs)++; 2608 ref->in_tree = 0; 2609 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 2610 RB_CLEAR_NODE(&ref->ref_node); 2611 if (!list_empty(&ref->add_list)) 2612 list_del(&ref->add_list); 2613 /* 2614 * When we play the delayed ref, also correct the ref_mod on 2615 * head 2616 */ 2617 switch (ref->action) { 2618 case BTRFS_ADD_DELAYED_REF: 2619 case BTRFS_ADD_DELAYED_EXTENT: 2620 locked_ref->ref_mod -= ref->ref_mod; 2621 break; 2622 case BTRFS_DROP_DELAYED_REF: 2623 locked_ref->ref_mod += ref->ref_mod; 2624 break; 2625 default: 2626 WARN_ON(1); 2627 } 2628 atomic_dec(&delayed_refs->num_entries); 2629 2630 /* 2631 * Record the must_insert_reserved flag before we drop the 2632 * spin lock. 
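		 * run_one_delayed_ref() uses the flag to decide whether a still
		 * reserved extent has to be pinned should running the ref fail.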
2633 */ 2634 must_insert_reserved = locked_ref->must_insert_reserved; 2635 locked_ref->must_insert_reserved = 0; 2636 2637 extent_op = locked_ref->extent_op; 2638 locked_ref->extent_op = NULL; 2639 spin_unlock(&locked_ref->lock); 2640 2641 ret = run_one_delayed_ref(trans, ref, extent_op, 2642 must_insert_reserved); 2643 2644 btrfs_free_delayed_extent_op(extent_op); 2645 if (ret) { 2646 unselect_delayed_ref_head(delayed_refs, locked_ref); 2647 btrfs_put_delayed_ref(ref); 2648 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2649 ret); 2650 return ret; 2651 } 2652 2653 btrfs_put_delayed_ref(ref); 2654 cond_resched(); 2655 2656 spin_lock(&locked_ref->lock); 2657 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2658 } 2659 2660 return 0; 2661 } 2662 2663 /* 2664 * Returns 0 on success or if called with an already aborted transaction. 2665 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2666 */ 2667 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2668 unsigned long nr) 2669 { 2670 struct btrfs_fs_info *fs_info = trans->fs_info; 2671 struct btrfs_delayed_ref_root *delayed_refs; 2672 struct btrfs_delayed_ref_head *locked_ref = NULL; 2673 ktime_t start = ktime_get(); 2674 int ret; 2675 unsigned long count = 0; 2676 unsigned long actual_count = 0; 2677 2678 delayed_refs = &trans->transaction->delayed_refs; 2679 do { 2680 if (!locked_ref) { 2681 locked_ref = btrfs_obtain_ref_head(trans); 2682 if (IS_ERR_OR_NULL(locked_ref)) { 2683 if (PTR_ERR(locked_ref) == -EAGAIN) { 2684 continue; 2685 } else { 2686 break; 2687 } 2688 } 2689 count++; 2690 } 2691 /* 2692 * We need to try and merge add/drops of the same ref since we 2693 * can run into issues with relocate dropping the implicit ref 2694 * and then it being added back again before the drop can 2695 * finish. If we merged anything we need to re-loop so we can 2696 * get a good ref. 2697 * Or we can get node references of the same type that weren't 2698 * merged when created due to bumps in the tree mod seq, and 2699 * we need to merge them to prevent adding an inline extent 2700 * backref before dropping it (triggering a BUG_ON at 2701 * insert_inline_extent_backref()). 2702 */ 2703 spin_lock(&locked_ref->lock); 2704 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2705 2706 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, 2707 &actual_count); 2708 if (ret < 0 && ret != -EAGAIN) { 2709 /* 2710 * Error, btrfs_run_delayed_refs_for_head already 2711 * unlocked everything so just bail out 2712 */ 2713 return ret; 2714 } else if (!ret) { 2715 /* 2716 * Success, perform the usual cleanup of a processed 2717 * head 2718 */ 2719 ret = cleanup_ref_head(trans, locked_ref); 2720 if (ret > 0 ) { 2721 /* We dropped our lock, we need to loop. */ 2722 ret = 0; 2723 continue; 2724 } else if (ret) { 2725 return ret; 2726 } 2727 } 2728 2729 /* 2730 * Either success case or btrfs_run_delayed_refs_for_head 2731 * returned -EAGAIN, meaning we need to select another head 2732 */ 2733 2734 locked_ref = NULL; 2735 cond_resched(); 2736 } while ((nr != -1 && count < nr) || locked_ref); 2737 2738 /* 2739 * We don't want to include ref heads since we can have empty ref heads 2740 * and those will drastically skew our runtime down since we just do 2741 * accounting, no actual extent tree updates. 
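	 * actual_count only counts refs handed to run_one_delayed_ref(), and
	 * the average below is weighted: avg = (3 * old_avg + runtime) / 4.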
2742 */ 2743 if (actual_count > 0) { 2744 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2745 u64 avg; 2746 2747 /* 2748 * We weigh the current average higher than our current runtime 2749 * to avoid large swings in the average. 2750 */ 2751 spin_lock(&delayed_refs->lock); 2752 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2753 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2754 spin_unlock(&delayed_refs->lock); 2755 } 2756 return 0; 2757 } 2758 2759 #ifdef SCRAMBLE_DELAYED_REFS 2760 /* 2761 * Normally delayed refs get processed in ascending bytenr order. This 2762 * correlates in most cases to the order added. To expose dependencies on this 2763 * order, we start to process the tree in the middle instead of the beginning 2764 */ 2765 static u64 find_middle(struct rb_root *root) 2766 { 2767 struct rb_node *n = root->rb_node; 2768 struct btrfs_delayed_ref_node *entry; 2769 int alt = 1; 2770 u64 middle; 2771 u64 first = 0, last = 0; 2772 2773 n = rb_first(root); 2774 if (n) { 2775 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2776 first = entry->bytenr; 2777 } 2778 n = rb_last(root); 2779 if (n) { 2780 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2781 last = entry->bytenr; 2782 } 2783 n = root->rb_node; 2784 2785 while (n) { 2786 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2787 WARN_ON(!entry->in_tree); 2788 2789 middle = entry->bytenr; 2790 2791 if (alt) 2792 n = n->rb_left; 2793 else 2794 n = n->rb_right; 2795 2796 alt = 1 - alt; 2797 } 2798 return middle; 2799 } 2800 #endif 2801 2802 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2803 { 2804 u64 num_bytes; 2805 2806 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2807 sizeof(struct btrfs_extent_inline_ref)); 2808 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2809 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2810 2811 /* 2812 * We don't ever fill up leaves all the way so multiply by 2 just to be 2813 * closer to what we're really going to want to use. 2814 */ 2815 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2816 } 2817 2818 /* 2819 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2820 * would require to store the csums for that many bytes. 2821 */ 2822 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2823 { 2824 u64 csum_size; 2825 u64 num_csums_per_leaf; 2826 u64 num_csums; 2827 2828 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2829 num_csums_per_leaf = div64_u64(csum_size, 2830 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2831 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2832 num_csums += num_csums_per_leaf - 1; 2833 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2834 return num_csums; 2835 } 2836 2837 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) 2838 { 2839 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 2840 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2841 bool ret = false; 2842 u64 reserved; 2843 2844 spin_lock(&global_rsv->lock); 2845 reserved = global_rsv->reserved; 2846 spin_unlock(&global_rsv->lock); 2847 2848 /* 2849 * Since the global reserve is just kind of magic we don't really want 2850 * to rely on it to save our bacon, so if our size is more than the 2851 * delayed_refs_rsv and the global rsv then it's time to think about 2852 * bailing. 
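	 * I.e. return true once delayed_refs_rsv->size reaches the bytes
	 * actually reserved in both rsvs combined.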
2853 */ 2854 spin_lock(&delayed_refs_rsv->lock); 2855 reserved += delayed_refs_rsv->reserved; 2856 if (delayed_refs_rsv->size >= reserved) 2857 ret = true; 2858 spin_unlock(&delayed_refs_rsv->lock); 2859 return ret; 2860 } 2861 2862 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) 2863 { 2864 u64 num_entries = 2865 atomic_read(&trans->transaction->delayed_refs.num_entries); 2866 u64 avg_runtime; 2867 u64 val; 2868 2869 smp_mb(); 2870 avg_runtime = trans->fs_info->avg_delayed_ref_runtime; 2871 val = num_entries * avg_runtime; 2872 if (val >= NSEC_PER_SEC) 2873 return 1; 2874 if (val >= NSEC_PER_SEC / 2) 2875 return 2; 2876 2877 return btrfs_check_space_for_delayed_refs(trans->fs_info); 2878 } 2879 2880 struct async_delayed_refs { 2881 struct btrfs_root *root; 2882 u64 transid; 2883 int count; 2884 int error; 2885 int sync; 2886 struct completion wait; 2887 struct btrfs_work work; 2888 }; 2889 2890 static inline struct async_delayed_refs * 2891 to_async_delayed_refs(struct btrfs_work *work) 2892 { 2893 return container_of(work, struct async_delayed_refs, work); 2894 } 2895 2896 static void delayed_ref_async_start(struct btrfs_work *work) 2897 { 2898 struct async_delayed_refs *async = to_async_delayed_refs(work); 2899 struct btrfs_trans_handle *trans; 2900 struct btrfs_fs_info *fs_info = async->root->fs_info; 2901 int ret; 2902 2903 /* if the commit is already started, we don't need to wait here */ 2904 if (btrfs_transaction_blocked(fs_info)) 2905 goto done; 2906 2907 trans = btrfs_join_transaction(async->root); 2908 if (IS_ERR(trans)) { 2909 async->error = PTR_ERR(trans); 2910 goto done; 2911 } 2912 2913 /* 2914 * trans->sync means that when we call end_transaction, we won't 2915 * wait on delayed refs 2916 */ 2917 trans->sync = true; 2918 2919 /* Don't bother flushing if we got into a different transaction */ 2920 if (trans->transid > async->transid) 2921 goto end; 2922 2923 ret = btrfs_run_delayed_refs(trans, async->count); 2924 if (ret) 2925 async->error = ret; 2926 end: 2927 ret = btrfs_end_transaction(trans); 2928 if (ret && !async->error) 2929 async->error = ret; 2930 done: 2931 if (async->sync) 2932 complete(&async->wait); 2933 else 2934 kfree(async); 2935 } 2936 2937 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, 2938 unsigned long count, u64 transid, int wait) 2939 { 2940 struct async_delayed_refs *async; 2941 int ret; 2942 2943 async = kmalloc(sizeof(*async), GFP_NOFS); 2944 if (!async) 2945 return -ENOMEM; 2946 2947 async->root = fs_info->tree_root; 2948 async->count = count; 2949 async->error = 0; 2950 async->transid = transid; 2951 if (wait) 2952 async->sync = 1; 2953 else 2954 async->sync = 0; 2955 init_completion(&async->wait); 2956 2957 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2958 delayed_ref_async_start, NULL, NULL); 2959 2960 btrfs_queue_work(fs_info->extent_workers, &async->work); 2961 2962 if (wait) { 2963 wait_for_completion(&async->wait); 2964 ret = async->error; 2965 kfree(async); 2966 return ret; 2967 } 2968 return 0; 2969 } 2970 2971 /* 2972 * this starts processing the delayed reference count updates and 2973 * extent insertions we have queued up so far. count can be 2974 * 0, which means to process everything in the tree at the start 2975 * of the run (but not newly added entries), or it can be some target 2976 * number you'd like to process. 
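 * Passing (unsigned long)-1 runs and waits for everything, including
 * refs that are queued while this call is already processing.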
2977 * 2978 * Returns 0 on success or if called with an aborted transaction 2979 * Returns <0 on error and aborts the transaction 2980 */ 2981 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2982 unsigned long count) 2983 { 2984 struct btrfs_fs_info *fs_info = trans->fs_info; 2985 struct rb_node *node; 2986 struct btrfs_delayed_ref_root *delayed_refs; 2987 struct btrfs_delayed_ref_head *head; 2988 int ret; 2989 int run_all = count == (unsigned long)-1; 2990 2991 /* We'll clean this up in btrfs_cleanup_transaction */ 2992 if (trans->aborted) 2993 return 0; 2994 2995 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2996 return 0; 2997 2998 delayed_refs = &trans->transaction->delayed_refs; 2999 if (count == 0) 3000 count = atomic_read(&delayed_refs->num_entries) * 2; 3001 3002 again: 3003 #ifdef SCRAMBLE_DELAYED_REFS 3004 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 3005 #endif 3006 ret = __btrfs_run_delayed_refs(trans, count); 3007 if (ret < 0) { 3008 btrfs_abort_transaction(trans, ret); 3009 return ret; 3010 } 3011 3012 if (run_all) { 3013 btrfs_create_pending_block_groups(trans); 3014 3015 spin_lock(&delayed_refs->lock); 3016 node = rb_first_cached(&delayed_refs->href_root); 3017 if (!node) { 3018 spin_unlock(&delayed_refs->lock); 3019 goto out; 3020 } 3021 head = rb_entry(node, struct btrfs_delayed_ref_head, 3022 href_node); 3023 refcount_inc(&head->refs); 3024 spin_unlock(&delayed_refs->lock); 3025 3026 /* Mutex was contended, block until it's released and retry. */ 3027 mutex_lock(&head->mutex); 3028 mutex_unlock(&head->mutex); 3029 3030 btrfs_put_delayed_ref_head(head); 3031 cond_resched(); 3032 goto again; 3033 } 3034 out: 3035 return 0; 3036 } 3037 3038 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3039 struct btrfs_fs_info *fs_info, 3040 u64 bytenr, u64 num_bytes, u64 flags, 3041 int level, int is_data) 3042 { 3043 struct btrfs_delayed_extent_op *extent_op; 3044 int ret; 3045 3046 extent_op = btrfs_alloc_delayed_extent_op(); 3047 if (!extent_op) 3048 return -ENOMEM; 3049 3050 extent_op->flags_to_set = flags; 3051 extent_op->update_flags = true; 3052 extent_op->update_key = false; 3053 extent_op->is_data = is_data ? 
true : false; 3054 extent_op->level = level; 3055 3056 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, 3057 num_bytes, extent_op); 3058 if (ret) 3059 btrfs_free_delayed_extent_op(extent_op); 3060 return ret; 3061 } 3062 3063 static noinline int check_delayed_ref(struct btrfs_root *root, 3064 struct btrfs_path *path, 3065 u64 objectid, u64 offset, u64 bytenr) 3066 { 3067 struct btrfs_delayed_ref_head *head; 3068 struct btrfs_delayed_ref_node *ref; 3069 struct btrfs_delayed_data_ref *data_ref; 3070 struct btrfs_delayed_ref_root *delayed_refs; 3071 struct btrfs_transaction *cur_trans; 3072 struct rb_node *node; 3073 int ret = 0; 3074 3075 spin_lock(&root->fs_info->trans_lock); 3076 cur_trans = root->fs_info->running_transaction; 3077 if (cur_trans) 3078 refcount_inc(&cur_trans->use_count); 3079 spin_unlock(&root->fs_info->trans_lock); 3080 if (!cur_trans) 3081 return 0; 3082 3083 delayed_refs = &cur_trans->delayed_refs; 3084 spin_lock(&delayed_refs->lock); 3085 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3086 if (!head) { 3087 spin_unlock(&delayed_refs->lock); 3088 btrfs_put_transaction(cur_trans); 3089 return 0; 3090 } 3091 3092 if (!mutex_trylock(&head->mutex)) { 3093 refcount_inc(&head->refs); 3094 spin_unlock(&delayed_refs->lock); 3095 3096 btrfs_release_path(path); 3097 3098 /* 3099 * Mutex was contended, block until it's released and let 3100 * caller try again 3101 */ 3102 mutex_lock(&head->mutex); 3103 mutex_unlock(&head->mutex); 3104 btrfs_put_delayed_ref_head(head); 3105 btrfs_put_transaction(cur_trans); 3106 return -EAGAIN; 3107 } 3108 spin_unlock(&delayed_refs->lock); 3109 3110 spin_lock(&head->lock); 3111 /* 3112 * XXX: We should replace this with a proper search function in the 3113 * future. 3114 */ 3115 for (node = rb_first_cached(&head->ref_tree); node; 3116 node = rb_next(node)) { 3117 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 3118 /* If it's a shared ref we know a cross reference exists */ 3119 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3120 ret = 1; 3121 break; 3122 } 3123 3124 data_ref = btrfs_delayed_node_to_data_ref(ref); 3125 3126 /* 3127 * If our ref doesn't match the one we're currently looking at 3128 * then we have a cross reference. 
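		 * I.e. some other root, inode or offset also references this
		 * extent, so the caller must not treat it as exclusively owned.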
3129 */ 3130 if (data_ref->root != root->root_key.objectid || 3131 data_ref->objectid != objectid || 3132 data_ref->offset != offset) { 3133 ret = 1; 3134 break; 3135 } 3136 } 3137 spin_unlock(&head->lock); 3138 mutex_unlock(&head->mutex); 3139 btrfs_put_transaction(cur_trans); 3140 return ret; 3141 } 3142 3143 static noinline int check_committed_ref(struct btrfs_root *root, 3144 struct btrfs_path *path, 3145 u64 objectid, u64 offset, u64 bytenr) 3146 { 3147 struct btrfs_fs_info *fs_info = root->fs_info; 3148 struct btrfs_root *extent_root = fs_info->extent_root; 3149 struct extent_buffer *leaf; 3150 struct btrfs_extent_data_ref *ref; 3151 struct btrfs_extent_inline_ref *iref; 3152 struct btrfs_extent_item *ei; 3153 struct btrfs_key key; 3154 u32 item_size; 3155 int type; 3156 int ret; 3157 3158 key.objectid = bytenr; 3159 key.offset = (u64)-1; 3160 key.type = BTRFS_EXTENT_ITEM_KEY; 3161 3162 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3163 if (ret < 0) 3164 goto out; 3165 BUG_ON(ret == 0); /* Corruption */ 3166 3167 ret = -ENOENT; 3168 if (path->slots[0] == 0) 3169 goto out; 3170 3171 path->slots[0]--; 3172 leaf = path->nodes[0]; 3173 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3174 3175 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3176 goto out; 3177 3178 ret = 1; 3179 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3180 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3181 3182 if (item_size != sizeof(*ei) + 3183 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3184 goto out; 3185 3186 if (btrfs_extent_generation(leaf, ei) <= 3187 btrfs_root_last_snapshot(&root->root_item)) 3188 goto out; 3189 3190 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3191 3192 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 3193 if (type != BTRFS_EXTENT_DATA_REF_KEY) 3194 goto out; 3195 3196 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3197 if (btrfs_extent_refs(leaf, ei) != 3198 btrfs_extent_data_ref_count(leaf, ref) || 3199 btrfs_extent_data_ref_root(leaf, ref) != 3200 root->root_key.objectid || 3201 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3202 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3203 goto out; 3204 3205 ret = 0; 3206 out: 3207 return ret; 3208 } 3209 3210 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 3211 u64 bytenr) 3212 { 3213 struct btrfs_path *path; 3214 int ret; 3215 3216 path = btrfs_alloc_path(); 3217 if (!path) 3218 return -ENOMEM; 3219 3220 do { 3221 ret = check_committed_ref(root, path, objectid, 3222 offset, bytenr); 3223 if (ret && ret != -ENOENT) 3224 goto out; 3225 3226 ret = check_delayed_ref(root, path, objectid, offset, bytenr); 3227 } while (ret == -EAGAIN); 3228 3229 out: 3230 btrfs_free_path(path); 3231 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3232 WARN_ON(ret > 0); 3233 return ret; 3234 } 3235 3236 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3237 struct btrfs_root *root, 3238 struct extent_buffer *buf, 3239 int full_backref, int inc) 3240 { 3241 struct btrfs_fs_info *fs_info = root->fs_info; 3242 u64 bytenr; 3243 u64 num_bytes; 3244 u64 parent; 3245 u64 ref_root; 3246 u32 nritems; 3247 struct btrfs_key key; 3248 struct btrfs_file_extent_item *fi; 3249 int i; 3250 int level; 3251 int ret = 0; 3252 int (*process_func)(struct btrfs_trans_handle *, 3253 struct btrfs_root *, 3254 u64, u64, u64, u64, u64, u64); 3255 3256 3257 if (btrfs_is_testing(fs_info)) 3258 return 0; 
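	/*
	 * Walk every item in @buf: for a leaf that is each regular (non
	 * inline) file extent it points to, for a node each child block,
	 * and add or drop one reference for each of them.
	 */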
3259 3260 ref_root = btrfs_header_owner(buf); 3261 nritems = btrfs_header_nritems(buf); 3262 level = btrfs_header_level(buf); 3263 3264 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3265 return 0; 3266 3267 if (inc) 3268 process_func = btrfs_inc_extent_ref; 3269 else 3270 process_func = btrfs_free_extent; 3271 3272 if (full_backref) 3273 parent = buf->start; 3274 else 3275 parent = 0; 3276 3277 for (i = 0; i < nritems; i++) { 3278 if (level == 0) { 3279 btrfs_item_key_to_cpu(buf, &key, i); 3280 if (key.type != BTRFS_EXTENT_DATA_KEY) 3281 continue; 3282 fi = btrfs_item_ptr(buf, i, 3283 struct btrfs_file_extent_item); 3284 if (btrfs_file_extent_type(buf, fi) == 3285 BTRFS_FILE_EXTENT_INLINE) 3286 continue; 3287 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3288 if (bytenr == 0) 3289 continue; 3290 3291 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3292 key.offset -= btrfs_file_extent_offset(buf, fi); 3293 ret = process_func(trans, root, bytenr, num_bytes, 3294 parent, ref_root, key.objectid, 3295 key.offset); 3296 if (ret) 3297 goto fail; 3298 } else { 3299 bytenr = btrfs_node_blockptr(buf, i); 3300 num_bytes = fs_info->nodesize; 3301 ret = process_func(trans, root, bytenr, num_bytes, 3302 parent, ref_root, level - 1, 0); 3303 if (ret) 3304 goto fail; 3305 } 3306 } 3307 return 0; 3308 fail: 3309 return ret; 3310 } 3311 3312 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3313 struct extent_buffer *buf, int full_backref) 3314 { 3315 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3316 } 3317 3318 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3319 struct extent_buffer *buf, int full_backref) 3320 { 3321 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3322 } 3323 3324 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3325 struct btrfs_fs_info *fs_info, 3326 struct btrfs_path *path, 3327 struct btrfs_block_group_cache *cache) 3328 { 3329 int ret; 3330 struct btrfs_root *extent_root = fs_info->extent_root; 3331 unsigned long bi; 3332 struct extent_buffer *leaf; 3333 3334 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3335 if (ret) { 3336 if (ret > 0) 3337 ret = -ENOENT; 3338 goto fail; 3339 } 3340 3341 leaf = path->nodes[0]; 3342 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3343 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3344 btrfs_mark_buffer_dirty(leaf); 3345 fail: 3346 btrfs_release_path(path); 3347 return ret; 3348 3349 } 3350 3351 static struct btrfs_block_group_cache * 3352 next_block_group(struct btrfs_fs_info *fs_info, 3353 struct btrfs_block_group_cache *cache) 3354 { 3355 struct rb_node *node; 3356 3357 spin_lock(&fs_info->block_group_cache_lock); 3358 3359 /* If our block group was removed, we need a full search. 
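	 * The removal path is expected to rb_erase() the node and then
	 * RB_CLEAR_NODE() it, which is what RB_EMPTY_NODE() detects here.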
*/ 3360 if (RB_EMPTY_NODE(&cache->cache_node)) { 3361 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3362 3363 spin_unlock(&fs_info->block_group_cache_lock); 3364 btrfs_put_block_group(cache); 3365 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3366 } 3367 node = rb_next(&cache->cache_node); 3368 btrfs_put_block_group(cache); 3369 if (node) { 3370 cache = rb_entry(node, struct btrfs_block_group_cache, 3371 cache_node); 3372 btrfs_get_block_group(cache); 3373 } else 3374 cache = NULL; 3375 spin_unlock(&fs_info->block_group_cache_lock); 3376 return cache; 3377 } 3378 3379 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3380 struct btrfs_trans_handle *trans, 3381 struct btrfs_path *path) 3382 { 3383 struct btrfs_fs_info *fs_info = block_group->fs_info; 3384 struct btrfs_root *root = fs_info->tree_root; 3385 struct inode *inode = NULL; 3386 struct extent_changeset *data_reserved = NULL; 3387 u64 alloc_hint = 0; 3388 int dcs = BTRFS_DC_ERROR; 3389 u64 num_pages = 0; 3390 int retries = 0; 3391 int ret = 0; 3392 3393 /* 3394 * If this block group is smaller than 100 megs don't bother caching the 3395 * block group. 3396 */ 3397 if (block_group->key.offset < (100 * SZ_1M)) { 3398 spin_lock(&block_group->lock); 3399 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3400 spin_unlock(&block_group->lock); 3401 return 0; 3402 } 3403 3404 if (trans->aborted) 3405 return 0; 3406 again: 3407 inode = lookup_free_space_inode(fs_info, block_group, path); 3408 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3409 ret = PTR_ERR(inode); 3410 btrfs_release_path(path); 3411 goto out; 3412 } 3413 3414 if (IS_ERR(inode)) { 3415 BUG_ON(retries); 3416 retries++; 3417 3418 if (block_group->ro) 3419 goto out_free; 3420 3421 ret = create_free_space_inode(fs_info, trans, block_group, 3422 path); 3423 if (ret) 3424 goto out_free; 3425 goto again; 3426 } 3427 3428 /* 3429 * We want to set the generation to 0, that way if anything goes wrong 3430 * from here on out we know not to trust this cache when we load up next 3431 * time. 3432 */ 3433 BTRFS_I(inode)->generation = 0; 3434 ret = btrfs_update_inode(trans, root, inode); 3435 if (ret) { 3436 /* 3437 * So theoretically we could recover from this, simply set the 3438 * super cache generation to 0 so we know to invalidate the 3439 * cache, but then we'd have to keep track of the block groups 3440 * that fail this way so we know we _have_ to reset this cache 3441 * before the next commit or risk reading stale cache. So to 3442 * limit our exposure to horrible edge cases lets just abort the 3443 * transaction, this only happens in really bad situations 3444 * anyway. 
3445 */ 3446 btrfs_abort_transaction(trans, ret); 3447 goto out_put; 3448 } 3449 WARN_ON(ret); 3450 3451 /* We've already setup this transaction, go ahead and exit */ 3452 if (block_group->cache_generation == trans->transid && 3453 i_size_read(inode)) { 3454 dcs = BTRFS_DC_SETUP; 3455 goto out_put; 3456 } 3457 3458 if (i_size_read(inode) > 0) { 3459 ret = btrfs_check_trunc_cache_free_space(fs_info, 3460 &fs_info->global_block_rsv); 3461 if (ret) 3462 goto out_put; 3463 3464 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3465 if (ret) 3466 goto out_put; 3467 } 3468 3469 spin_lock(&block_group->lock); 3470 if (block_group->cached != BTRFS_CACHE_FINISHED || 3471 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3472 /* 3473 * don't bother trying to write stuff out _if_ 3474 * a) we're not cached, 3475 * b) we're with nospace_cache mount option, 3476 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3477 */ 3478 dcs = BTRFS_DC_WRITTEN; 3479 spin_unlock(&block_group->lock); 3480 goto out_put; 3481 } 3482 spin_unlock(&block_group->lock); 3483 3484 /* 3485 * We hit an ENOSPC when setting up the cache in this transaction, just 3486 * skip doing the setup, we've already cleared the cache so we're safe. 3487 */ 3488 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3489 ret = -ENOSPC; 3490 goto out_put; 3491 } 3492 3493 /* 3494 * Try to preallocate enough space based on how big the block group is. 3495 * Keep in mind this has to include any pinned space which could end up 3496 * taking up quite a bit since it's not folded into the other space 3497 * cache. 3498 */ 3499 num_pages = div_u64(block_group->key.offset, SZ_256M); 3500 if (!num_pages) 3501 num_pages = 1; 3502 3503 num_pages *= 16; 3504 num_pages *= PAGE_SIZE; 3505 3506 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); 3507 if (ret) 3508 goto out_put; 3509 3510 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3511 num_pages, num_pages, 3512 &alloc_hint); 3513 /* 3514 * Our cache requires contiguous chunks so that we don't modify a bunch 3515 * of metadata or split extents when writing the cache out, which means 3516 * we can enospc if we are heavily fragmented in addition to just normal 3517 * out of space conditions. So if we hit this just skip setting up any 3518 * other block groups for this transaction, maybe we'll unpin enough 3519 * space the next time around. 
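	 * (For reference, the sizing above amounts to 16 pages of cache per
	 * 256MiB of block group, e.g. 256KiB for a 1GiB block group with
	 * 4KiB pages.)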
3520 */ 3521 if (!ret) 3522 dcs = BTRFS_DC_SETUP; 3523 else if (ret == -ENOSPC) 3524 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3525 3526 out_put: 3527 iput(inode); 3528 out_free: 3529 btrfs_release_path(path); 3530 out: 3531 spin_lock(&block_group->lock); 3532 if (!ret && dcs == BTRFS_DC_SETUP) 3533 block_group->cache_generation = trans->transid; 3534 block_group->disk_cache_state = dcs; 3535 spin_unlock(&block_group->lock); 3536 3537 extent_changeset_free(data_reserved); 3538 return ret; 3539 } 3540 3541 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3542 struct btrfs_fs_info *fs_info) 3543 { 3544 struct btrfs_block_group_cache *cache, *tmp; 3545 struct btrfs_transaction *cur_trans = trans->transaction; 3546 struct btrfs_path *path; 3547 3548 if (list_empty(&cur_trans->dirty_bgs) || 3549 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3550 return 0; 3551 3552 path = btrfs_alloc_path(); 3553 if (!path) 3554 return -ENOMEM; 3555 3556 /* Could add new block groups, use _safe just in case */ 3557 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3558 dirty_list) { 3559 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3560 cache_save_setup(cache, trans, path); 3561 } 3562 3563 btrfs_free_path(path); 3564 return 0; 3565 } 3566 3567 /* 3568 * transaction commit does final block group cache writeback during a 3569 * critical section where nothing is allowed to change the FS. This is 3570 * required in order for the cache to actually match the block group, 3571 * but can introduce a lot of latency into the commit. 3572 * 3573 * So, btrfs_start_dirty_block_groups is here to kick off block group 3574 * cache IO. There's a chance we'll have to redo some of it if the 3575 * block group changes again during the commit, but it greatly reduces 3576 * the commit latency by getting rid of the easy block groups while 3577 * we're still allowing others to join the commit. 3578 */ 3579 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 3580 { 3581 struct btrfs_fs_info *fs_info = trans->fs_info; 3582 struct btrfs_block_group_cache *cache; 3583 struct btrfs_transaction *cur_trans = trans->transaction; 3584 int ret = 0; 3585 int should_put; 3586 struct btrfs_path *path = NULL; 3587 LIST_HEAD(dirty); 3588 struct list_head *io = &cur_trans->io_bgs; 3589 int num_started = 0; 3590 int loops = 0; 3591 3592 spin_lock(&cur_trans->dirty_bgs_lock); 3593 if (list_empty(&cur_trans->dirty_bgs)) { 3594 spin_unlock(&cur_trans->dirty_bgs_lock); 3595 return 0; 3596 } 3597 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3598 spin_unlock(&cur_trans->dirty_bgs_lock); 3599 3600 again: 3601 /* 3602 * make sure all the block groups on our dirty list actually 3603 * exist 3604 */ 3605 btrfs_create_pending_block_groups(trans); 3606 3607 if (!path) { 3608 path = btrfs_alloc_path(); 3609 if (!path) 3610 return -ENOMEM; 3611 } 3612 3613 /* 3614 * cache_write_mutex is here only to save us from balance or automatic 3615 * removal of empty block groups deleting this block group while we are 3616 * writing out the cache 3617 */ 3618 mutex_lock(&trans->transaction->cache_write_mutex); 3619 while (!list_empty(&dirty)) { 3620 bool drop_reserve = true; 3621 3622 cache = list_first_entry(&dirty, 3623 struct btrfs_block_group_cache, 3624 dirty_list); 3625 /* 3626 * this can happen if something re-dirties a block 3627 * group that is already under IO. 
Just wait for it to 3628 * finish and then do it all again 3629 */ 3630 if (!list_empty(&cache->io_list)) { 3631 list_del_init(&cache->io_list); 3632 btrfs_wait_cache_io(trans, cache, path); 3633 btrfs_put_block_group(cache); 3634 } 3635 3636 3637 /* 3638 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3639 * if it should update the cache_state. Don't delete 3640 * until after we wait. 3641 * 3642 * Since we're not running in the commit critical section 3643 * we need the dirty_bgs_lock to protect from update_block_group 3644 */ 3645 spin_lock(&cur_trans->dirty_bgs_lock); 3646 list_del_init(&cache->dirty_list); 3647 spin_unlock(&cur_trans->dirty_bgs_lock); 3648 3649 should_put = 1; 3650 3651 cache_save_setup(cache, trans, path); 3652 3653 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3654 cache->io_ctl.inode = NULL; 3655 ret = btrfs_write_out_cache(fs_info, trans, 3656 cache, path); 3657 if (ret == 0 && cache->io_ctl.inode) { 3658 num_started++; 3659 should_put = 0; 3660 3661 /* 3662 * The cache_write_mutex is protecting the 3663 * io_list, also refer to the definition of 3664 * btrfs_transaction::io_bgs for more details 3665 */ 3666 list_add_tail(&cache->io_list, io); 3667 } else { 3668 /* 3669 * if we failed to write the cache, the 3670 * generation will be bad and life goes on 3671 */ 3672 ret = 0; 3673 } 3674 } 3675 if (!ret) { 3676 ret = write_one_cache_group(trans, fs_info, 3677 path, cache); 3678 /* 3679 * Our block group might still be attached to the list 3680 * of new block groups in the transaction handle of some 3681 * other task (struct btrfs_trans_handle->new_bgs). This 3682 * means its block group item isn't yet in the extent 3683 * tree. If this happens ignore the error, as we will 3684 * try again later in the critical section of the 3685 * transaction commit. 3686 */ 3687 if (ret == -ENOENT) { 3688 ret = 0; 3689 spin_lock(&cur_trans->dirty_bgs_lock); 3690 if (list_empty(&cache->dirty_list)) { 3691 list_add_tail(&cache->dirty_list, 3692 &cur_trans->dirty_bgs); 3693 btrfs_get_block_group(cache); 3694 drop_reserve = false; 3695 } 3696 spin_unlock(&cur_trans->dirty_bgs_lock); 3697 } else if (ret) { 3698 btrfs_abort_transaction(trans, ret); 3699 } 3700 } 3701 3702 /* if it's not on the io list, we need to put the block group */ 3703 if (should_put) 3704 btrfs_put_block_group(cache); 3705 if (drop_reserve) 3706 btrfs_delayed_refs_rsv_release(fs_info, 1); 3707 3708 if (ret) 3709 break; 3710 3711 /* 3712 * Avoid blocking other tasks for too long. It might even save 3713 * us from writing caches for block groups that are going to be 3714 * removed. 3715 */ 3716 mutex_unlock(&trans->transaction->cache_write_mutex); 3717 mutex_lock(&trans->transaction->cache_write_mutex); 3718 } 3719 mutex_unlock(&trans->transaction->cache_write_mutex); 3720 3721 /* 3722 * go through delayed refs for all the stuff we've just kicked off 3723 * and then loop back (just once) 3724 */ 3725 ret = btrfs_run_delayed_refs(trans, 0); 3726 if (!ret && loops == 0) { 3727 loops++; 3728 spin_lock(&cur_trans->dirty_bgs_lock); 3729 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3730 /* 3731 * dirty_bgs_lock protects us from concurrent block group 3732 * deletes too (not just cache_write_mutex). 
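		 * Only one extra pass is made here (loops is capped at 1);
		 * anything re-dirtied after that is left for the commit
		 * critical section.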
3733 */ 3734 if (!list_empty(&dirty)) { 3735 spin_unlock(&cur_trans->dirty_bgs_lock); 3736 goto again; 3737 } 3738 spin_unlock(&cur_trans->dirty_bgs_lock); 3739 } else if (ret < 0) { 3740 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3741 } 3742 3743 btrfs_free_path(path); 3744 return ret; 3745 } 3746 3747 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3748 struct btrfs_fs_info *fs_info) 3749 { 3750 struct btrfs_block_group_cache *cache; 3751 struct btrfs_transaction *cur_trans = trans->transaction; 3752 int ret = 0; 3753 int should_put; 3754 struct btrfs_path *path; 3755 struct list_head *io = &cur_trans->io_bgs; 3756 int num_started = 0; 3757 3758 path = btrfs_alloc_path(); 3759 if (!path) 3760 return -ENOMEM; 3761 3762 /* 3763 * Even though we are in the critical section of the transaction commit, 3764 * we can still have concurrent tasks adding elements to this 3765 * transaction's list of dirty block groups. These tasks correspond to 3766 * endio free space workers started when writeback finishes for a 3767 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3768 * allocate new block groups as a result of COWing nodes of the root 3769 * tree when updating the free space inode. The writeback for the space 3770 * caches is triggered by an earlier call to 3771 * btrfs_start_dirty_block_groups() and iterations of the following 3772 * loop. 3773 * Also we want to do the cache_save_setup first and then run the 3774 * delayed refs to make sure we have the best chance at doing this all 3775 * in one shot. 3776 */ 3777 spin_lock(&cur_trans->dirty_bgs_lock); 3778 while (!list_empty(&cur_trans->dirty_bgs)) { 3779 cache = list_first_entry(&cur_trans->dirty_bgs, 3780 struct btrfs_block_group_cache, 3781 dirty_list); 3782 3783 /* 3784 * this can happen if cache_save_setup re-dirties a block 3785 * group that is already under IO. Just wait for it to 3786 * finish and then do it all again 3787 */ 3788 if (!list_empty(&cache->io_list)) { 3789 spin_unlock(&cur_trans->dirty_bgs_lock); 3790 list_del_init(&cache->io_list); 3791 btrfs_wait_cache_io(trans, cache, path); 3792 btrfs_put_block_group(cache); 3793 spin_lock(&cur_trans->dirty_bgs_lock); 3794 } 3795 3796 /* 3797 * don't remove from the dirty list until after we've waited 3798 * on any pending IO 3799 */ 3800 list_del_init(&cache->dirty_list); 3801 spin_unlock(&cur_trans->dirty_bgs_lock); 3802 should_put = 1; 3803 3804 cache_save_setup(cache, trans, path); 3805 3806 if (!ret) 3807 ret = btrfs_run_delayed_refs(trans, 3808 (unsigned long) -1); 3809 3810 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3811 cache->io_ctl.inode = NULL; 3812 ret = btrfs_write_out_cache(fs_info, trans, 3813 cache, path); 3814 if (ret == 0 && cache->io_ctl.inode) { 3815 num_started++; 3816 should_put = 0; 3817 list_add_tail(&cache->io_list, io); 3818 } else { 3819 /* 3820 * if we failed to write the cache, the 3821 * generation will be bad and life goes on 3822 */ 3823 ret = 0; 3824 } 3825 } 3826 if (!ret) { 3827 ret = write_one_cache_group(trans, fs_info, 3828 path, cache); 3829 /* 3830 * One of the free space endio workers might have 3831 * created a new block group while updating a free space 3832 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3833 * and hasn't released its transaction handle yet, in 3834 * which case the new block group is still attached to 3835 * its transaction handle and its creation has not 3836 * finished yet (no block group item in the extent tree 3837 * yet, etc). 
If this is the case, wait for all free 3838 * space endio workers to finish and retry. This is a 3839 * a very rare case so no need for a more efficient and 3840 * complex approach. 3841 */ 3842 if (ret == -ENOENT) { 3843 wait_event(cur_trans->writer_wait, 3844 atomic_read(&cur_trans->num_writers) == 1); 3845 ret = write_one_cache_group(trans, fs_info, 3846 path, cache); 3847 } 3848 if (ret) 3849 btrfs_abort_transaction(trans, ret); 3850 } 3851 3852 /* if its not on the io list, we need to put the block group */ 3853 if (should_put) 3854 btrfs_put_block_group(cache); 3855 btrfs_delayed_refs_rsv_release(fs_info, 1); 3856 spin_lock(&cur_trans->dirty_bgs_lock); 3857 } 3858 spin_unlock(&cur_trans->dirty_bgs_lock); 3859 3860 /* 3861 * Refer to the definition of io_bgs member for details why it's safe 3862 * to use it without any locking 3863 */ 3864 while (!list_empty(io)) { 3865 cache = list_first_entry(io, struct btrfs_block_group_cache, 3866 io_list); 3867 list_del_init(&cache->io_list); 3868 btrfs_wait_cache_io(trans, cache, path); 3869 btrfs_put_block_group(cache); 3870 } 3871 3872 btrfs_free_path(path); 3873 return ret; 3874 } 3875 3876 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3877 { 3878 struct btrfs_block_group_cache *block_group; 3879 int readonly = 0; 3880 3881 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3882 if (!block_group || block_group->ro) 3883 readonly = 1; 3884 if (block_group) 3885 btrfs_put_block_group(block_group); 3886 return readonly; 3887 } 3888 3889 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3890 { 3891 struct btrfs_block_group_cache *bg; 3892 bool ret = true; 3893 3894 bg = btrfs_lookup_block_group(fs_info, bytenr); 3895 if (!bg) 3896 return false; 3897 3898 spin_lock(&bg->lock); 3899 if (bg->ro) 3900 ret = false; 3901 else 3902 atomic_inc(&bg->nocow_writers); 3903 spin_unlock(&bg->lock); 3904 3905 /* no put on block group, done by btrfs_dec_nocow_writers */ 3906 if (!ret) 3907 btrfs_put_block_group(bg); 3908 3909 return ret; 3910 3911 } 3912 3913 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3914 { 3915 struct btrfs_block_group_cache *bg; 3916 3917 bg = btrfs_lookup_block_group(fs_info, bytenr); 3918 ASSERT(bg); 3919 if (atomic_dec_and_test(&bg->nocow_writers)) 3920 wake_up_var(&bg->nocow_writers); 3921 /* 3922 * Once for our lookup and once for the lookup done by a previous call 3923 * to btrfs_inc_nocow_writers() 3924 */ 3925 btrfs_put_block_group(bg); 3926 btrfs_put_block_group(bg); 3927 } 3928 3929 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3930 { 3931 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 3932 } 3933 3934 static const char *alloc_name(u64 flags) 3935 { 3936 switch (flags) { 3937 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3938 return "mixed"; 3939 case BTRFS_BLOCK_GROUP_METADATA: 3940 return "metadata"; 3941 case BTRFS_BLOCK_GROUP_DATA: 3942 return "data"; 3943 case BTRFS_BLOCK_GROUP_SYSTEM: 3944 return "system"; 3945 default: 3946 WARN_ON(1); 3947 return "invalid-combination"; 3948 }; 3949 } 3950 3951 static int create_space_info(struct btrfs_fs_info *info, u64 flags) 3952 { 3953 3954 struct btrfs_space_info *space_info; 3955 int i; 3956 int ret; 3957 3958 space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 3959 if (!space_info) 3960 return -ENOMEM; 3961 3962 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, 3963 GFP_KERNEL); 3964 if (ret) { 3965 kfree(space_info); 3966 return ret; 3967 } 
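	/*
	 * Initialize list heads, locks and the default allocation state
	 * before the new space_info is exposed via sysfs and linked into
	 * fs_info->space_info.
	 */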
3968 3969 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3970 INIT_LIST_HEAD(&space_info->block_groups[i]); 3971 init_rwsem(&space_info->groups_sem); 3972 spin_lock_init(&space_info->lock); 3973 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3974 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3975 init_waitqueue_head(&space_info->wait); 3976 INIT_LIST_HEAD(&space_info->ro_bgs); 3977 INIT_LIST_HEAD(&space_info->tickets); 3978 INIT_LIST_HEAD(&space_info->priority_tickets); 3979 3980 ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, 3981 info->space_info_kobj, "%s", 3982 alloc_name(space_info->flags)); 3983 if (ret) { 3984 percpu_counter_destroy(&space_info->total_bytes_pinned); 3985 kfree(space_info); 3986 return ret; 3987 } 3988 3989 list_add_rcu(&space_info->list, &info->space_info); 3990 if (flags & BTRFS_BLOCK_GROUP_DATA) 3991 info->data_sinfo = space_info; 3992 3993 return ret; 3994 } 3995 3996 static void update_space_info(struct btrfs_fs_info *info, u64 flags, 3997 u64 total_bytes, u64 bytes_used, 3998 u64 bytes_readonly, 3999 struct btrfs_space_info **space_info) 4000 { 4001 struct btrfs_space_info *found; 4002 int factor; 4003 4004 factor = btrfs_bg_type_to_factor(flags); 4005 4006 found = __find_space_info(info, flags); 4007 ASSERT(found); 4008 spin_lock(&found->lock); 4009 found->total_bytes += total_bytes; 4010 found->disk_total += total_bytes * factor; 4011 found->bytes_used += bytes_used; 4012 found->disk_used += bytes_used * factor; 4013 found->bytes_readonly += bytes_readonly; 4014 if (total_bytes > 0) 4015 found->full = 0; 4016 space_info_add_new_bytes(info, found, total_bytes - 4017 bytes_used - bytes_readonly); 4018 spin_unlock(&found->lock); 4019 *space_info = found; 4020 } 4021 4022 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 4023 { 4024 u64 extra_flags = chunk_to_extended(flags) & 4025 BTRFS_EXTENDED_PROFILE_MASK; 4026 4027 write_seqlock(&fs_info->profiles_lock); 4028 if (flags & BTRFS_BLOCK_GROUP_DATA) 4029 fs_info->avail_data_alloc_bits |= extra_flags; 4030 if (flags & BTRFS_BLOCK_GROUP_METADATA) 4031 fs_info->avail_metadata_alloc_bits |= extra_flags; 4032 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4033 fs_info->avail_system_alloc_bits |= extra_flags; 4034 write_sequnlock(&fs_info->profiles_lock); 4035 } 4036 4037 /* 4038 * returns target flags in extended format or 0 if restripe for this 4039 * chunk_type is not in progress 4040 * 4041 * should be called with balance_lock held 4042 */ 4043 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4044 { 4045 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4046 u64 target = 0; 4047 4048 if (!bctl) 4049 return 0; 4050 4051 if (flags & BTRFS_BLOCK_GROUP_DATA && 4052 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4053 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4054 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4055 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4056 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4057 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4058 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4059 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4060 } 4061 4062 return target; 4063 } 4064 4065 /* 4066 * @flags: available profiles in extended format (see ctree.h) 4067 * 4068 * Returns reduced profile in chunk format. If profile changing is in 4069 * progress (either running or paused) picks the target profile (if it's 4070 * already available), otherwise falls back to plain reducing. 
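 *
 * Rough illustration (device counts hypothetical): if @flags has both the
 * RAID1 and RAID0 bits set and there are enough rw devices for both, the
 * reduction below keeps only the strongest remaining profile, so the result
 * is RAID1 in chunk format; with a single rw device the striped/mirrored
 * profiles are all masked out and only DUP or single can remain.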
4071 */ 4072 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 4073 { 4074 u64 num_devices = fs_info->fs_devices->rw_devices; 4075 u64 target; 4076 u64 raid_type; 4077 u64 allowed = 0; 4078 4079 /* 4080 * see if restripe for this chunk_type is in progress, if so 4081 * try to reduce to the target profile 4082 */ 4083 spin_lock(&fs_info->balance_lock); 4084 target = get_restripe_target(fs_info, flags); 4085 if (target) { 4086 /* pick target profile only if it's already available */ 4087 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4088 spin_unlock(&fs_info->balance_lock); 4089 return extended_to_chunk(target); 4090 } 4091 } 4092 spin_unlock(&fs_info->balance_lock); 4093 4094 /* First, mask out the RAID levels which aren't possible */ 4095 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4096 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4097 allowed |= btrfs_raid_array[raid_type].bg_flag; 4098 } 4099 allowed &= flags; 4100 4101 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4102 allowed = BTRFS_BLOCK_GROUP_RAID6; 4103 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4104 allowed = BTRFS_BLOCK_GROUP_RAID5; 4105 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4106 allowed = BTRFS_BLOCK_GROUP_RAID10; 4107 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4108 allowed = BTRFS_BLOCK_GROUP_RAID1; 4109 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4110 allowed = BTRFS_BLOCK_GROUP_RAID0; 4111 4112 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4113 4114 return extended_to_chunk(flags | allowed); 4115 } 4116 4117 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 4118 { 4119 unsigned seq; 4120 u64 flags; 4121 4122 do { 4123 flags = orig_flags; 4124 seq = read_seqbegin(&fs_info->profiles_lock); 4125 4126 if (flags & BTRFS_BLOCK_GROUP_DATA) 4127 flags |= fs_info->avail_data_alloc_bits; 4128 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4129 flags |= fs_info->avail_system_alloc_bits; 4130 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4131 flags |= fs_info->avail_metadata_alloc_bits; 4132 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4133 4134 return btrfs_reduce_alloc_profile(fs_info, flags); 4135 } 4136 4137 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 4138 { 4139 struct btrfs_fs_info *fs_info = root->fs_info; 4140 u64 flags; 4141 u64 ret; 4142 4143 if (data) 4144 flags = BTRFS_BLOCK_GROUP_DATA; 4145 else if (root == fs_info->chunk_root) 4146 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4147 else 4148 flags = BTRFS_BLOCK_GROUP_METADATA; 4149 4150 ret = get_alloc_profile(fs_info, flags); 4151 return ret; 4152 } 4153 4154 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) 4155 { 4156 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); 4157 } 4158 4159 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) 4160 { 4161 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4162 } 4163 4164 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) 4165 { 4166 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4167 } 4168 4169 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4170 bool may_use_included) 4171 { 4172 ASSERT(s_info); 4173 return s_info->bytes_used + s_info->bytes_reserved + 4174 s_info->bytes_pinned + s_info->bytes_readonly + 4175 (may_use_included ? 
s_info->bytes_may_use : 0); 4176 } 4177 4178 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 4179 { 4180 struct btrfs_root *root = inode->root; 4181 struct btrfs_fs_info *fs_info = root->fs_info; 4182 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 4183 u64 used; 4184 int ret = 0; 4185 int need_commit = 2; 4186 int have_pinned_space; 4187 4188 /* make sure bytes are sectorsize aligned */ 4189 bytes = ALIGN(bytes, fs_info->sectorsize); 4190 4191 if (btrfs_is_free_space_inode(inode)) { 4192 need_commit = 0; 4193 ASSERT(current->journal_info); 4194 } 4195 4196 again: 4197 /* make sure we have enough space to handle the data first */ 4198 spin_lock(&data_sinfo->lock); 4199 used = btrfs_space_info_used(data_sinfo, true); 4200 4201 if (used + bytes > data_sinfo->total_bytes) { 4202 struct btrfs_trans_handle *trans; 4203 4204 /* 4205 * if we don't have enough free bytes in this space then we need 4206 * to alloc a new chunk. 4207 */ 4208 if (!data_sinfo->full) { 4209 u64 alloc_target; 4210 4211 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4212 spin_unlock(&data_sinfo->lock); 4213 4214 alloc_target = btrfs_data_alloc_profile(fs_info); 4215 /* 4216 * It is ugly that we don't call nolock join 4217 * transaction for the free space inode case here. 4218 * But it is safe because we only do the data space 4219 * reservation for the free space cache in the 4220 * transaction context, the common join transaction 4221 * just increase the counter of the current transaction 4222 * handler, doesn't try to acquire the trans_lock of 4223 * the fs. 4224 */ 4225 trans = btrfs_join_transaction(root); 4226 if (IS_ERR(trans)) 4227 return PTR_ERR(trans); 4228 4229 ret = do_chunk_alloc(trans, alloc_target, 4230 CHUNK_ALLOC_NO_FORCE); 4231 btrfs_end_transaction(trans); 4232 if (ret < 0) { 4233 if (ret != -ENOSPC) 4234 return ret; 4235 else { 4236 have_pinned_space = 1; 4237 goto commit_trans; 4238 } 4239 } 4240 4241 goto again; 4242 } 4243 4244 /* 4245 * If we don't have enough pinned space to deal with this 4246 * allocation, and no removed chunk in current transaction, 4247 * don't bother committing the transaction. 4248 */ 4249 have_pinned_space = __percpu_counter_compare( 4250 &data_sinfo->total_bytes_pinned, 4251 used + bytes - data_sinfo->total_bytes, 4252 BTRFS_TOTAL_BYTES_PINNED_BATCH); 4253 spin_unlock(&data_sinfo->lock); 4254 4255 /* commit the current transaction and try again */ 4256 commit_trans: 4257 if (need_commit) { 4258 need_commit--; 4259 4260 if (need_commit > 0) { 4261 btrfs_start_delalloc_roots(fs_info, -1); 4262 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, 4263 (u64)-1); 4264 } 4265 4266 trans = btrfs_join_transaction(root); 4267 if (IS_ERR(trans)) 4268 return PTR_ERR(trans); 4269 if (have_pinned_space >= 0 || 4270 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4271 &trans->transaction->flags) || 4272 need_commit > 0) { 4273 ret = btrfs_commit_transaction(trans); 4274 if (ret) 4275 return ret; 4276 /* 4277 * The cleaner kthread might still be doing iput 4278 * operations. Wait for it to finish so that 4279 * more space is released. We don't need to 4280 * explicitly run the delayed iputs here because 4281 * the commit_transaction would have woken up 4282 * the cleaner. 
				 */
				ret = btrfs_wait_on_delayed_iputs(fs_info);
				if (ret)
					return ret;
				goto again;
			} else {
				btrfs_end_transaction(trans);
			}
		}

		trace_btrfs_space_reservation(fs_info,
					      "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
		return -ENOSPC;
	}
	update_bytes_may_use(data_sinfo, bytes);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      data_sinfo->flags, bytes, 1);
	spin_unlock(&data_sinfo->lock);

	return 0;
}

int btrfs_check_data_free_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;

	/* Align the range to sectorsize */
	len = round_up(start + len, fs_info->sectorsize) -
	      round_down(start, fs_info->sectorsize);
	start = round_down(start, fs_info->sectorsize);

	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
	if (ret < 0)
		return ret;

	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
	if (ret < 0)
		btrfs_free_reserved_data_space_noquota(inode, start, len);
	else
		ret = 0;
	return ret;
}

/*
 * Called if we need to clear a data reservation for this inode, normally in
 * an error case.
 *
 * This one will *NOT* use the accurate qgroup reserved space API, and is only
 * for callers that cannot sleep and are sure the qgroup reserved space is not
 * affected, like clear_bit_hook().
 */
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
					    u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_space_info *data_sinfo;

	/* Make sure the range is aligned to sectorsize */
	len = round_up(start + len, fs_info->sectorsize) -
	      round_down(start, fs_info->sectorsize);
	start = round_down(start, fs_info->sectorsize);

	data_sinfo = fs_info->data_sinfo;
	spin_lock(&data_sinfo->lock);
	update_bytes_may_use(data_sinfo, -len);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      data_sinfo->flags, len, 0);
	spin_unlock(&data_sinfo->lock);
}

/*
 * Called if we need to clear a data reservation for this inode, normally in
 * an error case.
 *
 * This one will handle the per-inode data rsv map for the accurate reserved
 * space framework.
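 *
 * Minimal usage sketch (hedged; error handling trimmed, and "write_failed"
 * stands in for whatever failure the caller actually hit). The reserve side
 * is btrfs_check_data_free_space() above:
 *
 *	struct extent_changeset *reserved = NULL;
 *
 *	ret = btrfs_check_data_free_space(inode, &reserved, start, len);
 *	if (!ret && write_failed)
 *		btrfs_free_reserved_data_space(inode, reserved, start, len);
 *	extent_changeset_free(reserved);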
4363 */ 4364 void btrfs_free_reserved_data_space(struct inode *inode, 4365 struct extent_changeset *reserved, u64 start, u64 len) 4366 { 4367 struct btrfs_root *root = BTRFS_I(inode)->root; 4368 4369 /* Make sure the range is aligned to sectorsize */ 4370 len = round_up(start + len, root->fs_info->sectorsize) - 4371 round_down(start, root->fs_info->sectorsize); 4372 start = round_down(start, root->fs_info->sectorsize); 4373 4374 btrfs_free_reserved_data_space_noquota(inode, start, len); 4375 btrfs_qgroup_free_data(inode, reserved, start, len); 4376 } 4377 4378 static void force_metadata_allocation(struct btrfs_fs_info *info) 4379 { 4380 struct list_head *head = &info->space_info; 4381 struct btrfs_space_info *found; 4382 4383 rcu_read_lock(); 4384 list_for_each_entry_rcu(found, head, list) { 4385 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4386 found->force_alloc = CHUNK_ALLOC_FORCE; 4387 } 4388 rcu_read_unlock(); 4389 } 4390 4391 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4392 { 4393 return (global->size << 1); 4394 } 4395 4396 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4397 struct btrfs_space_info *sinfo, int force) 4398 { 4399 u64 bytes_used = btrfs_space_info_used(sinfo, false); 4400 u64 thresh; 4401 4402 if (force == CHUNK_ALLOC_FORCE) 4403 return 1; 4404 4405 /* 4406 * in limited mode, we want to have some free space up to 4407 * about 1% of the FS size. 4408 */ 4409 if (force == CHUNK_ALLOC_LIMITED) { 4410 thresh = btrfs_super_total_bytes(fs_info->super_copy); 4411 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4412 4413 if (sinfo->total_bytes - bytes_used < thresh) 4414 return 1; 4415 } 4416 4417 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 4418 return 0; 4419 return 1; 4420 } 4421 4422 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4423 { 4424 u64 num_dev; 4425 4426 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4427 BTRFS_BLOCK_GROUP_RAID0 | 4428 BTRFS_BLOCK_GROUP_RAID5 | 4429 BTRFS_BLOCK_GROUP_RAID6)) 4430 num_dev = fs_info->fs_devices->rw_devices; 4431 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4432 num_dev = 2; 4433 else 4434 num_dev = 1; /* DUP or single */ 4435 4436 return num_dev; 4437 } 4438 4439 /* 4440 * If @is_allocation is true, reserve space in the system space info necessary 4441 * for allocating a chunk, otherwise if it's false, reserve space necessary for 4442 * removing a chunk. 4443 */ 4444 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 4445 { 4446 struct btrfs_fs_info *fs_info = trans->fs_info; 4447 struct btrfs_space_info *info; 4448 u64 left; 4449 u64 thresh; 4450 int ret = 0; 4451 u64 num_devs; 4452 4453 /* 4454 * Needed because we can end up allocating a system chunk and for an 4455 * atomic and race free space reservation in the chunk block reserve. 
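	 *
	 * Rough illustration with made-up numbers: with 4 rw devices and a
	 * 16K nodesize the threshold below (num_devs device item updates
	 * plus one chunk item) works out to a few hundred KiB, i.e. far
	 * less than an actual system chunk.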
4456 */ 4457 lockdep_assert_held(&fs_info->chunk_mutex); 4458 4459 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4460 spin_lock(&info->lock); 4461 left = info->total_bytes - btrfs_space_info_used(info, true); 4462 spin_unlock(&info->lock); 4463 4464 num_devs = get_profile_num_devs(fs_info, type); 4465 4466 /* num_devs device items to update and 1 chunk item to add or remove */ 4467 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 4468 btrfs_calc_trans_metadata_size(fs_info, 1); 4469 4470 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4471 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4472 left, thresh, type); 4473 dump_space_info(fs_info, info, 0, 0); 4474 } 4475 4476 if (left < thresh) { 4477 u64 flags = btrfs_system_alloc_profile(fs_info); 4478 4479 /* 4480 * Ignore failure to create system chunk. We might end up not 4481 * needing it, as we might not need to COW all nodes/leafs from 4482 * the paths we visit in the chunk tree (they were already COWed 4483 * or created in the current transaction for example). 4484 */ 4485 ret = btrfs_alloc_chunk(trans, flags); 4486 } 4487 4488 if (!ret) { 4489 ret = btrfs_block_rsv_add(fs_info->chunk_root, 4490 &fs_info->chunk_block_rsv, 4491 thresh, BTRFS_RESERVE_NO_FLUSH); 4492 if (!ret) 4493 trans->chunk_bytes_reserved += thresh; 4494 } 4495 } 4496 4497 /* 4498 * If force is CHUNK_ALLOC_FORCE: 4499 * - return 1 if it successfully allocates a chunk, 4500 * - return errors including -ENOSPC otherwise. 4501 * If force is NOT CHUNK_ALLOC_FORCE: 4502 * - return 0 if it doesn't need to allocate a new chunk, 4503 * - return 1 if it successfully allocates a chunk, 4504 * - return errors including -ENOSPC otherwise. 4505 */ 4506 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 4507 int force) 4508 { 4509 struct btrfs_fs_info *fs_info = trans->fs_info; 4510 struct btrfs_space_info *space_info; 4511 bool wait_for_alloc = false; 4512 bool should_alloc = false; 4513 int ret = 0; 4514 4515 /* Don't re-enter if we're already allocating a chunk */ 4516 if (trans->allocating_chunk) 4517 return -ENOSPC; 4518 4519 space_info = __find_space_info(fs_info, flags); 4520 ASSERT(space_info); 4521 4522 do { 4523 spin_lock(&space_info->lock); 4524 if (force < space_info->force_alloc) 4525 force = space_info->force_alloc; 4526 should_alloc = should_alloc_chunk(fs_info, space_info, force); 4527 if (space_info->full) { 4528 /* No more free physical space */ 4529 if (should_alloc) 4530 ret = -ENOSPC; 4531 else 4532 ret = 0; 4533 spin_unlock(&space_info->lock); 4534 return ret; 4535 } else if (!should_alloc) { 4536 spin_unlock(&space_info->lock); 4537 return 0; 4538 } else if (space_info->chunk_alloc) { 4539 /* 4540 * Someone is already allocating, so we need to block 4541 * until this someone is finished and then loop to 4542 * recheck if we should continue with our allocation 4543 * attempt. 4544 */ 4545 wait_for_alloc = true; 4546 spin_unlock(&space_info->lock); 4547 mutex_lock(&fs_info->chunk_mutex); 4548 mutex_unlock(&fs_info->chunk_mutex); 4549 } else { 4550 /* Proceed with allocation */ 4551 space_info->chunk_alloc = 1; 4552 wait_for_alloc = false; 4553 spin_unlock(&space_info->lock); 4554 } 4555 4556 cond_resched(); 4557 } while (wait_for_alloc); 4558 4559 mutex_lock(&fs_info->chunk_mutex); 4560 trans->allocating_chunk = true; 4561 4562 /* 4563 * If we have mixed data/metadata chunks we want to make sure we keep 4564 * allocating mixed chunks instead of individual chunks. 
4565 */ 4566 if (btrfs_mixed_space_info(space_info)) 4567 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4568 4569 /* 4570 * if we're doing a data chunk, go ahead and make sure that 4571 * we keep a reasonable number of metadata chunks allocated in the 4572 * FS as well. 4573 */ 4574 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4575 fs_info->data_chunk_allocations++; 4576 if (!(fs_info->data_chunk_allocations % 4577 fs_info->metadata_ratio)) 4578 force_metadata_allocation(fs_info); 4579 } 4580 4581 /* 4582 * Check if we have enough space in SYSTEM chunk because we may need 4583 * to update devices. 4584 */ 4585 check_system_chunk(trans, flags); 4586 4587 ret = btrfs_alloc_chunk(trans, flags); 4588 trans->allocating_chunk = false; 4589 4590 spin_lock(&space_info->lock); 4591 if (ret < 0) { 4592 if (ret == -ENOSPC) 4593 space_info->full = 1; 4594 else 4595 goto out; 4596 } else { 4597 ret = 1; 4598 space_info->max_extent_size = 0; 4599 } 4600 4601 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4602 out: 4603 space_info->chunk_alloc = 0; 4604 spin_unlock(&space_info->lock); 4605 mutex_unlock(&fs_info->chunk_mutex); 4606 /* 4607 * When we allocate a new chunk we reserve space in the chunk block 4608 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4609 * add new nodes/leafs to it if we end up needing to do it when 4610 * inserting the chunk item and updating device items as part of the 4611 * second phase of chunk allocation, performed by 4612 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4613 * large number of new block groups to create in our transaction 4614 * handle's new_bgs list to avoid exhausting the chunk block reserve 4615 * in extreme cases - like having a single transaction create many new 4616 * block groups when starting to write out the free space caches of all 4617 * the block groups that were made dirty during the lifetime of the 4618 * transaction. 4619 */ 4620 if (trans->chunk_bytes_reserved >= (u64)SZ_2M) 4621 btrfs_create_pending_block_groups(trans); 4622 4623 return ret; 4624 } 4625 4626 static int can_overcommit(struct btrfs_fs_info *fs_info, 4627 struct btrfs_space_info *space_info, u64 bytes, 4628 enum btrfs_reserve_flush_enum flush, 4629 bool system_chunk) 4630 { 4631 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4632 u64 profile; 4633 u64 space_size; 4634 u64 avail; 4635 u64 used; 4636 int factor; 4637 4638 /* Don't overcommit when in mixed mode. */ 4639 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4640 return 0; 4641 4642 if (system_chunk) 4643 profile = btrfs_system_alloc_profile(fs_info); 4644 else 4645 profile = btrfs_metadata_alloc_profile(fs_info); 4646 4647 used = btrfs_space_info_used(space_info, false); 4648 4649 /* 4650 * We only want to allow over committing if we have lots of actual space 4651 * free, but if we don't have enough space to handle the global reserve 4652 * space then we could end up having a real enospc problem when trying 4653 * to allocate a chunk or some other such important allocation. 4654 */ 4655 spin_lock(&global_rsv->lock); 4656 space_size = calc_global_rsv_need_space(global_rsv); 4657 spin_unlock(&global_rsv->lock); 4658 if (used + space_size >= space_info->total_bytes) 4659 return 0; 4660 4661 used += space_info->bytes_may_use; 4662 4663 avail = atomic64_read(&fs_info->free_chunk_space); 4664 4665 /* 4666 * If we have dup, raid1 or raid10 then only half of the free 4667 * space is actually usable. 
For raid56, the space info used 4668 * doesn't include the parity drive, so we don't have to 4669 * change the math 4670 */ 4671 factor = btrfs_bg_type_to_factor(profile); 4672 avail = div_u64(avail, factor); 4673 4674 /* 4675 * If we aren't flushing all things, let us overcommit up to 4676 * 1/2th of the space. If we can flush, don't let us overcommit 4677 * too much, let it overcommit up to 1/8 of the space. 4678 */ 4679 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4680 avail >>= 3; 4681 else 4682 avail >>= 1; 4683 4684 if (used + bytes < space_info->total_bytes + avail) 4685 return 1; 4686 return 0; 4687 } 4688 4689 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4690 unsigned long nr_pages, int nr_items) 4691 { 4692 struct super_block *sb = fs_info->sb; 4693 4694 if (down_read_trylock(&sb->s_umount)) { 4695 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4696 up_read(&sb->s_umount); 4697 } else { 4698 /* 4699 * We needn't worry the filesystem going from r/w to r/o though 4700 * we don't acquire ->s_umount mutex, because the filesystem 4701 * should guarantee the delalloc inodes list be empty after 4702 * the filesystem is readonly(all dirty pages are written to 4703 * the disk). 4704 */ 4705 btrfs_start_delalloc_roots(fs_info, nr_items); 4706 if (!current->journal_info) 4707 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4708 } 4709 } 4710 4711 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4712 u64 to_reclaim) 4713 { 4714 u64 bytes; 4715 u64 nr; 4716 4717 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4718 nr = div64_u64(to_reclaim, bytes); 4719 if (!nr) 4720 nr = 1; 4721 return nr; 4722 } 4723 4724 #define EXTENT_SIZE_PER_ITEM SZ_256K 4725 4726 /* 4727 * shrink metadata reservation for delalloc 4728 */ 4729 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 4730 u64 orig, bool wait_ordered) 4731 { 4732 struct btrfs_space_info *space_info; 4733 struct btrfs_trans_handle *trans; 4734 u64 delalloc_bytes; 4735 u64 async_pages; 4736 u64 items; 4737 long time_left; 4738 unsigned long nr_pages; 4739 int loops; 4740 4741 /* Calc the number of the pages we need flush for space reservation */ 4742 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4743 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4744 4745 trans = (struct btrfs_trans_handle *)current->journal_info; 4746 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4747 4748 delalloc_bytes = percpu_counter_sum_positive( 4749 &fs_info->delalloc_bytes); 4750 if (delalloc_bytes == 0) { 4751 if (trans) 4752 return; 4753 if (wait_ordered) 4754 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4755 return; 4756 } 4757 4758 loops = 0; 4759 while (delalloc_bytes && loops < 3) { 4760 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; 4761 4762 /* 4763 * Triggers inode writeback for up to nr_pages. This will invoke 4764 * ->writepages callback and trigger delalloc filling 4765 * (btrfs_run_delalloc_range()). 4766 */ 4767 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4768 4769 /* 4770 * We need to wait for the compressed pages to start before 4771 * we continue. 4772 */ 4773 async_pages = atomic_read(&fs_info->async_delalloc_pages); 4774 if (!async_pages) 4775 goto skip_async; 4776 4777 /* 4778 * Calculate how many compressed pages we want to be written 4779 * before we continue. I.e if there are more async pages than we 4780 * require wait_event will wait until nr_pages are written. 
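		 *
		 * Purely illustrative numbers: if 1000 async delalloc pages
		 * are in flight and nr_pages is 256, we wait below until the
		 * counter drops to 744, i.e. until at least the amount we
		 * just kicked off has been submitted.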
4781 */ 4782 if (async_pages <= nr_pages) 4783 async_pages = 0; 4784 else 4785 async_pages -= nr_pages; 4786 4787 wait_event(fs_info->async_submit_wait, 4788 atomic_read(&fs_info->async_delalloc_pages) <= 4789 (int)async_pages); 4790 skip_async: 4791 spin_lock(&space_info->lock); 4792 if (list_empty(&space_info->tickets) && 4793 list_empty(&space_info->priority_tickets)) { 4794 spin_unlock(&space_info->lock); 4795 break; 4796 } 4797 spin_unlock(&space_info->lock); 4798 4799 loops++; 4800 if (wait_ordered && !trans) { 4801 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4802 } else { 4803 time_left = schedule_timeout_killable(1); 4804 if (time_left) 4805 break; 4806 } 4807 delalloc_bytes = percpu_counter_sum_positive( 4808 &fs_info->delalloc_bytes); 4809 } 4810 } 4811 4812 struct reserve_ticket { 4813 u64 orig_bytes; 4814 u64 bytes; 4815 int error; 4816 struct list_head list; 4817 wait_queue_head_t wait; 4818 }; 4819 4820 /** 4821 * maybe_commit_transaction - possibly commit the transaction if its ok to 4822 * @root - the root we're allocating for 4823 * @bytes - the number of bytes we want to reserve 4824 * @force - force the commit 4825 * 4826 * This will check to make sure that committing the transaction will actually 4827 * get us somewhere and then commit the transaction if it does. Otherwise it 4828 * will return -ENOSPC. 4829 */ 4830 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4831 struct btrfs_space_info *space_info) 4832 { 4833 struct reserve_ticket *ticket = NULL; 4834 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4835 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 4836 struct btrfs_trans_handle *trans; 4837 u64 bytes_needed; 4838 u64 reclaim_bytes = 0; 4839 4840 trans = (struct btrfs_trans_handle *)current->journal_info; 4841 if (trans) 4842 return -EAGAIN; 4843 4844 spin_lock(&space_info->lock); 4845 if (!list_empty(&space_info->priority_tickets)) 4846 ticket = list_first_entry(&space_info->priority_tickets, 4847 struct reserve_ticket, list); 4848 else if (!list_empty(&space_info->tickets)) 4849 ticket = list_first_entry(&space_info->tickets, 4850 struct reserve_ticket, list); 4851 bytes_needed = (ticket) ? ticket->bytes : 0; 4852 spin_unlock(&space_info->lock); 4853 4854 if (!bytes_needed) 4855 return 0; 4856 4857 trans = btrfs_join_transaction(fs_info->extent_root); 4858 if (IS_ERR(trans)) 4859 return PTR_ERR(trans); 4860 4861 /* 4862 * See if there is enough pinned space to make this reservation, or if 4863 * we have block groups that are going to be freed, allowing us to 4864 * possibly do a chunk allocation the next loop through. 4865 */ 4866 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || 4867 __percpu_counter_compare(&space_info->total_bytes_pinned, 4868 bytes_needed, 4869 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 4870 goto commit; 4871 4872 /* 4873 * See if there is some space in the delayed insertion reservation for 4874 * this reservation. 
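	 *
	 * Sketch of the accounting below (numbers hypothetical): if the
	 * ticket needs 4M and the delayed item plus delayed ref rsvs still
	 * hold 3M that a commit would return, only the remaining 1M must be
	 * covered by pinned bytes for the commit to be worthwhile; otherwise
	 * we give up with -ENOSPC.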
4875 */ 4876 if (space_info != delayed_rsv->space_info) 4877 goto enospc; 4878 4879 spin_lock(&delayed_rsv->lock); 4880 reclaim_bytes += delayed_rsv->reserved; 4881 spin_unlock(&delayed_rsv->lock); 4882 4883 spin_lock(&delayed_refs_rsv->lock); 4884 reclaim_bytes += delayed_refs_rsv->reserved; 4885 spin_unlock(&delayed_refs_rsv->lock); 4886 if (reclaim_bytes >= bytes_needed) 4887 goto commit; 4888 bytes_needed -= reclaim_bytes; 4889 4890 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 4891 bytes_needed, 4892 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) 4893 goto enospc; 4894 4895 commit: 4896 return btrfs_commit_transaction(trans); 4897 enospc: 4898 btrfs_end_transaction(trans); 4899 return -ENOSPC; 4900 } 4901 4902 /* 4903 * Try to flush some data based on policy set by @state. This is only advisory 4904 * and may fail for various reasons. The caller is supposed to examine the 4905 * state of @space_info to detect the outcome. 4906 */ 4907 static void flush_space(struct btrfs_fs_info *fs_info, 4908 struct btrfs_space_info *space_info, u64 num_bytes, 4909 int state) 4910 { 4911 struct btrfs_root *root = fs_info->extent_root; 4912 struct btrfs_trans_handle *trans; 4913 int nr; 4914 int ret = 0; 4915 4916 switch (state) { 4917 case FLUSH_DELAYED_ITEMS_NR: 4918 case FLUSH_DELAYED_ITEMS: 4919 if (state == FLUSH_DELAYED_ITEMS_NR) 4920 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 4921 else 4922 nr = -1; 4923 4924 trans = btrfs_join_transaction(root); 4925 if (IS_ERR(trans)) { 4926 ret = PTR_ERR(trans); 4927 break; 4928 } 4929 ret = btrfs_run_delayed_items_nr(trans, nr); 4930 btrfs_end_transaction(trans); 4931 break; 4932 case FLUSH_DELALLOC: 4933 case FLUSH_DELALLOC_WAIT: 4934 shrink_delalloc(fs_info, num_bytes * 2, num_bytes, 4935 state == FLUSH_DELALLOC_WAIT); 4936 break; 4937 case FLUSH_DELAYED_REFS_NR: 4938 case FLUSH_DELAYED_REFS: 4939 trans = btrfs_join_transaction(root); 4940 if (IS_ERR(trans)) { 4941 ret = PTR_ERR(trans); 4942 break; 4943 } 4944 if (state == FLUSH_DELAYED_REFS_NR) 4945 nr = calc_reclaim_items_nr(fs_info, num_bytes); 4946 else 4947 nr = 0; 4948 btrfs_run_delayed_refs(trans, nr); 4949 btrfs_end_transaction(trans); 4950 break; 4951 case ALLOC_CHUNK: 4952 case ALLOC_CHUNK_FORCE: 4953 trans = btrfs_join_transaction(root); 4954 if (IS_ERR(trans)) { 4955 ret = PTR_ERR(trans); 4956 break; 4957 } 4958 ret = do_chunk_alloc(trans, 4959 btrfs_metadata_alloc_profile(fs_info), 4960 (state == ALLOC_CHUNK) ? 4961 CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); 4962 btrfs_end_transaction(trans); 4963 if (ret > 0 || ret == -ENOSPC) 4964 ret = 0; 4965 break; 4966 case COMMIT_TRANS: 4967 /* 4968 * If we have pending delayed iputs then we could free up a 4969 * bunch of pinned space, so make sure we run the iputs before 4970 * we do our pinned bytes check below. 
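		 *
		 * (Pinned space is space freed in the current transaction
		 * that only becomes allocatable again once it commits;
		 * running the pending iputs first lets the extents they
		 * release show up in that pinned accounting.)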
4971 */ 4972 btrfs_run_delayed_iputs(fs_info); 4973 btrfs_wait_on_delayed_iputs(fs_info); 4974 4975 ret = may_commit_transaction(fs_info, space_info); 4976 break; 4977 default: 4978 ret = -ENOSPC; 4979 break; 4980 } 4981 4982 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 4983 ret); 4984 return; 4985 } 4986 4987 static inline u64 4988 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 4989 struct btrfs_space_info *space_info, 4990 bool system_chunk) 4991 { 4992 struct reserve_ticket *ticket; 4993 u64 used; 4994 u64 expected; 4995 u64 to_reclaim = 0; 4996 4997 list_for_each_entry(ticket, &space_info->tickets, list) 4998 to_reclaim += ticket->bytes; 4999 list_for_each_entry(ticket, &space_info->priority_tickets, list) 5000 to_reclaim += ticket->bytes; 5001 if (to_reclaim) 5002 return to_reclaim; 5003 5004 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 5005 if (can_overcommit(fs_info, space_info, to_reclaim, 5006 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 5007 return 0; 5008 5009 used = btrfs_space_info_used(space_info, true); 5010 5011 if (can_overcommit(fs_info, space_info, SZ_1M, 5012 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 5013 expected = div_factor_fine(space_info->total_bytes, 95); 5014 else 5015 expected = div_factor_fine(space_info->total_bytes, 90); 5016 5017 if (used > expected) 5018 to_reclaim = used - expected; 5019 else 5020 to_reclaim = 0; 5021 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 5022 space_info->bytes_reserved); 5023 return to_reclaim; 5024 } 5025 5026 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, 5027 struct btrfs_space_info *space_info, 5028 u64 used, bool system_chunk) 5029 { 5030 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 5031 5032 /* If we're just plain full then async reclaim just slows us down. */ 5033 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 5034 return 0; 5035 5036 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, 5037 system_chunk)) 5038 return 0; 5039 5040 return (used >= thresh && !btrfs_fs_closing(fs_info) && 5041 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 5042 } 5043 5044 static bool wake_all_tickets(struct list_head *head) 5045 { 5046 struct reserve_ticket *ticket; 5047 5048 while (!list_empty(head)) { 5049 ticket = list_first_entry(head, struct reserve_ticket, list); 5050 list_del_init(&ticket->list); 5051 ticket->error = -ENOSPC; 5052 wake_up(&ticket->wait); 5053 if (ticket->bytes != ticket->orig_bytes) 5054 return true; 5055 } 5056 return false; 5057 } 5058 5059 /* 5060 * This is for normal flushers, we can wait all goddamned day if we want to. We 5061 * will loop and continuously try to flush as long as we are making progress. 5062 * We count progress as clearing off tickets each time we have to loop. 
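 *
 * The loop below walks the states handled by flush_space() in order,
 * roughly: delayed items, delalloc, delayed refs, chunk allocation and
 * finally a transaction commit, restarting from the first state whenever a
 * ticket gets satisfied (i.e. tickets_id changes).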
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create
		 * an underutilized metadata chunk.  So if this is our first
		 * run through the flushing state machine, skip
		 * ALLOC_CHUNK_FORCE and commit the transaction.  If nothing
		 * has changed on the next go-around, we can force a chunk
		 * allocation.
5117 */ 5118 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) 5119 flush_state++; 5120 5121 if (flush_state > COMMIT_TRANS) { 5122 commit_cycles++; 5123 if (commit_cycles > 2) { 5124 if (wake_all_tickets(&space_info->tickets)) { 5125 flush_state = FLUSH_DELAYED_ITEMS_NR; 5126 commit_cycles--; 5127 } else { 5128 space_info->flush = 0; 5129 } 5130 } else { 5131 flush_state = FLUSH_DELAYED_ITEMS_NR; 5132 } 5133 } 5134 spin_unlock(&space_info->lock); 5135 } while (flush_state <= COMMIT_TRANS); 5136 } 5137 5138 void btrfs_init_async_reclaim_work(struct work_struct *work) 5139 { 5140 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5141 } 5142 5143 static const enum btrfs_flush_state priority_flush_states[] = { 5144 FLUSH_DELAYED_ITEMS_NR, 5145 FLUSH_DELAYED_ITEMS, 5146 ALLOC_CHUNK, 5147 }; 5148 5149 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5150 struct btrfs_space_info *space_info, 5151 struct reserve_ticket *ticket) 5152 { 5153 u64 to_reclaim; 5154 int flush_state; 5155 5156 spin_lock(&space_info->lock); 5157 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 5158 false); 5159 if (!to_reclaim) { 5160 spin_unlock(&space_info->lock); 5161 return; 5162 } 5163 spin_unlock(&space_info->lock); 5164 5165 flush_state = 0; 5166 do { 5167 flush_space(fs_info, space_info, to_reclaim, 5168 priority_flush_states[flush_state]); 5169 flush_state++; 5170 spin_lock(&space_info->lock); 5171 if (ticket->bytes == 0) { 5172 spin_unlock(&space_info->lock); 5173 return; 5174 } 5175 spin_unlock(&space_info->lock); 5176 } while (flush_state < ARRAY_SIZE(priority_flush_states)); 5177 } 5178 5179 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5180 struct btrfs_space_info *space_info, 5181 struct reserve_ticket *ticket) 5182 5183 { 5184 DEFINE_WAIT(wait); 5185 u64 reclaim_bytes = 0; 5186 int ret = 0; 5187 5188 spin_lock(&space_info->lock); 5189 while (ticket->bytes > 0 && ticket->error == 0) { 5190 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5191 if (ret) { 5192 ret = -EINTR; 5193 break; 5194 } 5195 spin_unlock(&space_info->lock); 5196 5197 schedule(); 5198 5199 finish_wait(&ticket->wait, &wait); 5200 spin_lock(&space_info->lock); 5201 } 5202 if (!ret) 5203 ret = ticket->error; 5204 if (!list_empty(&ticket->list)) 5205 list_del_init(&ticket->list); 5206 if (ticket->bytes && ticket->bytes < ticket->orig_bytes) 5207 reclaim_bytes = ticket->orig_bytes - ticket->bytes; 5208 spin_unlock(&space_info->lock); 5209 5210 if (reclaim_bytes) 5211 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); 5212 return ret; 5213 } 5214 5215 /** 5216 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5217 * @root - the root we're allocating for 5218 * @space_info - the space info we want to allocate from 5219 * @orig_bytes - the number of bytes we want 5220 * @flush - whether or not we can flush to make our reservation 5221 * 5222 * This will reserve orig_bytes number of bytes from the space info associated 5223 * with the block_rsv. If there is not enough space it will make an attempt to 5224 * flush out space to make room. It will do this by flushing delalloc if 5225 * possible or committing the transaction. If flush is 0 then no attempts to 5226 * regain reservations will be made and this will fail if there is not enough 5227 * space already. 
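 *
 * Hedged usage sketch, going through the reserve_metadata_bytes() wrapper
 * further below (error handling trimmed):
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, num_bytes,
 *				     BTRFS_RESERVE_FLUSH_ALL);
 *	if (ret == -ENOSPC)
 *		return ret;	// nothing more could be flushed or committed
 *
 * With BTRFS_RESERVE_NO_FLUSH the call instead fails immediately rather
 * than queueing a ticket and waiting on the async reclaim worker.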
5228 */ 5229 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 5230 struct btrfs_space_info *space_info, 5231 u64 orig_bytes, 5232 enum btrfs_reserve_flush_enum flush, 5233 bool system_chunk) 5234 { 5235 struct reserve_ticket ticket; 5236 u64 used; 5237 u64 reclaim_bytes = 0; 5238 int ret = 0; 5239 5240 ASSERT(orig_bytes); 5241 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5242 5243 spin_lock(&space_info->lock); 5244 ret = -ENOSPC; 5245 used = btrfs_space_info_used(space_info, true); 5246 5247 /* 5248 * If we have enough space then hooray, make our reservation and carry 5249 * on. If not see if we can overcommit, and if we can, hooray carry on. 5250 * If not things get more complicated. 5251 */ 5252 if (used + orig_bytes <= space_info->total_bytes) { 5253 update_bytes_may_use(space_info, orig_bytes); 5254 trace_btrfs_space_reservation(fs_info, "space_info", 5255 space_info->flags, orig_bytes, 1); 5256 ret = 0; 5257 } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, 5258 system_chunk)) { 5259 update_bytes_may_use(space_info, orig_bytes); 5260 trace_btrfs_space_reservation(fs_info, "space_info", 5261 space_info->flags, orig_bytes, 1); 5262 ret = 0; 5263 } 5264 5265 /* 5266 * If we couldn't make a reservation then setup our reservation ticket 5267 * and kick the async worker if it's not already running. 5268 * 5269 * If we are a priority flusher then we just need to add our ticket to 5270 * the list and we will do our own flushing further down. 5271 */ 5272 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5273 ticket.orig_bytes = orig_bytes; 5274 ticket.bytes = orig_bytes; 5275 ticket.error = 0; 5276 init_waitqueue_head(&ticket.wait); 5277 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5278 list_add_tail(&ticket.list, &space_info->tickets); 5279 if (!space_info->flush) { 5280 space_info->flush = 1; 5281 trace_btrfs_trigger_flush(fs_info, 5282 space_info->flags, 5283 orig_bytes, flush, 5284 "enospc"); 5285 queue_work(system_unbound_wq, 5286 &fs_info->async_reclaim_work); 5287 } 5288 } else { 5289 list_add_tail(&ticket.list, 5290 &space_info->priority_tickets); 5291 } 5292 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5293 used += orig_bytes; 5294 /* 5295 * We will do the space reservation dance during log replay, 5296 * which means we won't have fs_info->fs_root set, so don't do 5297 * the async reclaim as we will panic. 
5298 */ 5299 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5300 need_do_async_reclaim(fs_info, space_info, 5301 used, system_chunk) && 5302 !work_busy(&fs_info->async_reclaim_work)) { 5303 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5304 orig_bytes, flush, "preempt"); 5305 queue_work(system_unbound_wq, 5306 &fs_info->async_reclaim_work); 5307 } 5308 } 5309 spin_unlock(&space_info->lock); 5310 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5311 return ret; 5312 5313 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5314 return wait_reserve_ticket(fs_info, space_info, &ticket); 5315 5316 ret = 0; 5317 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5318 spin_lock(&space_info->lock); 5319 if (ticket.bytes) { 5320 if (ticket.bytes < orig_bytes) 5321 reclaim_bytes = orig_bytes - ticket.bytes; 5322 list_del_init(&ticket.list); 5323 ret = -ENOSPC; 5324 } 5325 spin_unlock(&space_info->lock); 5326 5327 if (reclaim_bytes) 5328 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); 5329 ASSERT(list_empty(&ticket.list)); 5330 return ret; 5331 } 5332 5333 /** 5334 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5335 * @root - the root we're allocating for 5336 * @block_rsv - the block_rsv we're allocating for 5337 * @orig_bytes - the number of bytes we want 5338 * @flush - whether or not we can flush to make our reservation 5339 * 5340 * This will reserve orig_bytes number of bytes from the space info associated 5341 * with the block_rsv. If there is not enough space it will make an attempt to 5342 * flush out space to make room. It will do this by flushing delalloc if 5343 * possible or committing the transaction. If flush is 0 then no attempts to 5344 * regain reservations will be made and this will fail if there is not enough 5345 * space already. 
5346 */ 5347 static int reserve_metadata_bytes(struct btrfs_root *root, 5348 struct btrfs_block_rsv *block_rsv, 5349 u64 orig_bytes, 5350 enum btrfs_reserve_flush_enum flush) 5351 { 5352 struct btrfs_fs_info *fs_info = root->fs_info; 5353 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5354 int ret; 5355 bool system_chunk = (root == fs_info->chunk_root); 5356 5357 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, 5358 orig_bytes, flush, system_chunk); 5359 if (ret == -ENOSPC && 5360 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5361 if (block_rsv != global_rsv && 5362 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5363 ret = 0; 5364 } 5365 if (ret == -ENOSPC) { 5366 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 5367 block_rsv->space_info->flags, 5368 orig_bytes, 1); 5369 5370 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 5371 dump_space_info(fs_info, block_rsv->space_info, 5372 orig_bytes, 0); 5373 } 5374 return ret; 5375 } 5376 5377 static struct btrfs_block_rsv *get_block_rsv( 5378 const struct btrfs_trans_handle *trans, 5379 const struct btrfs_root *root) 5380 { 5381 struct btrfs_fs_info *fs_info = root->fs_info; 5382 struct btrfs_block_rsv *block_rsv = NULL; 5383 5384 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5385 (root == fs_info->csum_root && trans->adding_csums) || 5386 (root == fs_info->uuid_root)) 5387 block_rsv = trans->block_rsv; 5388 5389 if (!block_rsv) 5390 block_rsv = root->block_rsv; 5391 5392 if (!block_rsv) 5393 block_rsv = &fs_info->empty_block_rsv; 5394 5395 return block_rsv; 5396 } 5397 5398 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5399 u64 num_bytes) 5400 { 5401 int ret = -ENOSPC; 5402 spin_lock(&block_rsv->lock); 5403 if (block_rsv->reserved >= num_bytes) { 5404 block_rsv->reserved -= num_bytes; 5405 if (block_rsv->reserved < block_rsv->size) 5406 block_rsv->full = 0; 5407 ret = 0; 5408 } 5409 spin_unlock(&block_rsv->lock); 5410 return ret; 5411 } 5412 5413 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5414 u64 num_bytes, bool update_size) 5415 { 5416 spin_lock(&block_rsv->lock); 5417 block_rsv->reserved += num_bytes; 5418 if (update_size) 5419 block_rsv->size += num_bytes; 5420 else if (block_rsv->reserved >= block_rsv->size) 5421 block_rsv->full = 1; 5422 spin_unlock(&block_rsv->lock); 5423 } 5424 5425 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5426 struct btrfs_block_rsv *dest, u64 num_bytes, 5427 int min_factor) 5428 { 5429 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5430 u64 min_bytes; 5431 5432 if (global_rsv->space_info != dest->space_info) 5433 return -ENOSPC; 5434 5435 spin_lock(&global_rsv->lock); 5436 min_bytes = div_factor(global_rsv->size, min_factor); 5437 if (global_rsv->reserved < min_bytes + num_bytes) { 5438 spin_unlock(&global_rsv->lock); 5439 return -ENOSPC; 5440 } 5441 global_rsv->reserved -= num_bytes; 5442 if (global_rsv->reserved < global_rsv->size) 5443 global_rsv->full = 0; 5444 spin_unlock(&global_rsv->lock); 5445 5446 block_rsv_add_bytes(dest, num_bytes, true); 5447 return 0; 5448 } 5449 5450 /** 5451 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. 5452 * @fs_info - the fs info for our fs. 5453 * @src - the source block rsv to transfer from. 5454 * @num_bytes - the number of bytes to transfer. 5455 * 5456 * This transfers up to the num_bytes amount from the src rsv to the 5457 * delayed_refs_rsv. Any extra bytes are returned to the space info. 
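 *
 * Worked example with made-up numbers: if the delayed refs rsv has a size
 * of 10M but only 7M reserved, migrating 5M from @src moves 3M into the
 * delayed refs rsv and hands the remaining 2M back to the space info via
 * space_info_add_old_bytes().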
5458 */ 5459 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, 5460 struct btrfs_block_rsv *src, 5461 u64 num_bytes) 5462 { 5463 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 5464 u64 to_free = 0; 5465 5466 spin_lock(&src->lock); 5467 src->reserved -= num_bytes; 5468 src->size -= num_bytes; 5469 spin_unlock(&src->lock); 5470 5471 spin_lock(&delayed_refs_rsv->lock); 5472 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { 5473 u64 delta = delayed_refs_rsv->size - 5474 delayed_refs_rsv->reserved; 5475 if (num_bytes > delta) { 5476 to_free = num_bytes - delta; 5477 num_bytes = delta; 5478 } 5479 } else { 5480 to_free = num_bytes; 5481 num_bytes = 0; 5482 } 5483 5484 if (num_bytes) 5485 delayed_refs_rsv->reserved += num_bytes; 5486 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) 5487 delayed_refs_rsv->full = 1; 5488 spin_unlock(&delayed_refs_rsv->lock); 5489 5490 if (num_bytes) 5491 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5492 0, num_bytes, 1); 5493 if (to_free) 5494 space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, 5495 to_free); 5496 } 5497 5498 /** 5499 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. 5500 * @fs_info - the fs_info for our fs. 5501 * @flush - control how we can flush for this reservation. 5502 * 5503 * This will refill the delayed block_rsv up to 1 items size worth of space and 5504 * will return -ENOSPC if we can't make the reservation. 5505 */ 5506 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 5507 enum btrfs_reserve_flush_enum flush) 5508 { 5509 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 5510 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); 5511 u64 num_bytes = 0; 5512 int ret = -ENOSPC; 5513 5514 spin_lock(&block_rsv->lock); 5515 if (block_rsv->reserved < block_rsv->size) { 5516 num_bytes = block_rsv->size - block_rsv->reserved; 5517 num_bytes = min(num_bytes, limit); 5518 } 5519 spin_unlock(&block_rsv->lock); 5520 5521 if (!num_bytes) 5522 return 0; 5523 5524 ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, 5525 num_bytes, flush); 5526 if (ret) 5527 return ret; 5528 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5529 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5530 0, num_bytes, 1); 5531 return 0; 5532 } 5533 5534 /* 5535 * This is for space we already have accounted in space_info->bytes_may_use, so 5536 * basically when we're returning space from block_rsv's. 5537 */ 5538 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5539 struct btrfs_space_info *space_info, 5540 u64 num_bytes) 5541 { 5542 struct reserve_ticket *ticket; 5543 struct list_head *head; 5544 u64 used; 5545 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5546 bool check_overcommit = false; 5547 5548 spin_lock(&space_info->lock); 5549 head = &space_info->priority_tickets; 5550 5551 /* 5552 * If we are over our limit then we need to check and see if we can 5553 * overcommit, and if we can't then we just need to free up our space 5554 * and not satisfy any requests. 5555 */ 5556 used = btrfs_space_info_used(space_info, true); 5557 if (used - num_bytes >= space_info->total_bytes) 5558 check_overcommit = true; 5559 again: 5560 while (!list_empty(head) && num_bytes) { 5561 ticket = list_first_entry(head, struct reserve_ticket, 5562 list); 5563 /* 5564 * We use 0 bytes because this space is already reserved, so 5565 * adding the ticket space would be a double count. 
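		 *
		 * For illustration (sizes hypothetical): returning 1M while
		 * the first ticket still wants 600K wakes that ticket and
		 * leaves 400K for the next one; a ticket larger than what is
		 * left simply has its remaining bytes reduced.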
5566 */ 5567 if (check_overcommit && 5568 !can_overcommit(fs_info, space_info, 0, flush, false)) 5569 break; 5570 if (num_bytes >= ticket->bytes) { 5571 list_del_init(&ticket->list); 5572 num_bytes -= ticket->bytes; 5573 ticket->bytes = 0; 5574 space_info->tickets_id++; 5575 wake_up(&ticket->wait); 5576 } else { 5577 ticket->bytes -= num_bytes; 5578 num_bytes = 0; 5579 } 5580 } 5581 5582 if (num_bytes && head == &space_info->priority_tickets) { 5583 head = &space_info->tickets; 5584 flush = BTRFS_RESERVE_FLUSH_ALL; 5585 goto again; 5586 } 5587 update_bytes_may_use(space_info, -num_bytes); 5588 trace_btrfs_space_reservation(fs_info, "space_info", 5589 space_info->flags, num_bytes, 0); 5590 spin_unlock(&space_info->lock); 5591 } 5592 5593 /* 5594 * This is for newly allocated space that isn't accounted in 5595 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5596 * we use this helper. 5597 */ 5598 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5599 struct btrfs_space_info *space_info, 5600 u64 num_bytes) 5601 { 5602 struct reserve_ticket *ticket; 5603 struct list_head *head = &space_info->priority_tickets; 5604 5605 again: 5606 while (!list_empty(head) && num_bytes) { 5607 ticket = list_first_entry(head, struct reserve_ticket, 5608 list); 5609 if (num_bytes >= ticket->bytes) { 5610 trace_btrfs_space_reservation(fs_info, "space_info", 5611 space_info->flags, 5612 ticket->bytes, 1); 5613 list_del_init(&ticket->list); 5614 num_bytes -= ticket->bytes; 5615 update_bytes_may_use(space_info, ticket->bytes); 5616 ticket->bytes = 0; 5617 space_info->tickets_id++; 5618 wake_up(&ticket->wait); 5619 } else { 5620 trace_btrfs_space_reservation(fs_info, "space_info", 5621 space_info->flags, 5622 num_bytes, 1); 5623 update_bytes_may_use(space_info, num_bytes); 5624 ticket->bytes -= num_bytes; 5625 num_bytes = 0; 5626 } 5627 } 5628 5629 if (num_bytes && head == &space_info->priority_tickets) { 5630 head = &space_info->tickets; 5631 goto again; 5632 } 5633 } 5634 5635 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5636 struct btrfs_block_rsv *block_rsv, 5637 struct btrfs_block_rsv *dest, u64 num_bytes, 5638 u64 *qgroup_to_release_ret) 5639 { 5640 struct btrfs_space_info *space_info = block_rsv->space_info; 5641 u64 qgroup_to_release = 0; 5642 u64 ret; 5643 5644 spin_lock(&block_rsv->lock); 5645 if (num_bytes == (u64)-1) { 5646 num_bytes = block_rsv->size; 5647 qgroup_to_release = block_rsv->qgroup_rsv_size; 5648 } 5649 block_rsv->size -= num_bytes; 5650 if (block_rsv->reserved >= block_rsv->size) { 5651 num_bytes = block_rsv->reserved - block_rsv->size; 5652 block_rsv->reserved = block_rsv->size; 5653 block_rsv->full = 1; 5654 } else { 5655 num_bytes = 0; 5656 } 5657 if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { 5658 qgroup_to_release = block_rsv->qgroup_rsv_reserved - 5659 block_rsv->qgroup_rsv_size; 5660 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; 5661 } else { 5662 qgroup_to_release = 0; 5663 } 5664 spin_unlock(&block_rsv->lock); 5665 5666 ret = num_bytes; 5667 if (num_bytes > 0) { 5668 if (dest) { 5669 spin_lock(&dest->lock); 5670 if (!dest->full) { 5671 u64 bytes_to_add; 5672 5673 bytes_to_add = dest->size - dest->reserved; 5674 bytes_to_add = min(num_bytes, bytes_to_add); 5675 dest->reserved += bytes_to_add; 5676 if (dest->reserved >= dest->size) 5677 dest->full = 1; 5678 num_bytes -= bytes_to_add; 5679 } 5680 spin_unlock(&dest->lock); 5681 } 5682 if (num_bytes) 5683 
space_info_add_old_bytes(fs_info, space_info, 5684 num_bytes); 5685 } 5686 if (qgroup_to_release_ret) 5687 *qgroup_to_release_ret = qgroup_to_release; 5688 return ret; 5689 } 5690 5691 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5692 struct btrfs_block_rsv *dst, u64 num_bytes, 5693 bool update_size) 5694 { 5695 int ret; 5696 5697 ret = block_rsv_use_bytes(src, num_bytes); 5698 if (ret) 5699 return ret; 5700 5701 block_rsv_add_bytes(dst, num_bytes, update_size); 5702 return 0; 5703 } 5704 5705 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5706 { 5707 memset(rsv, 0, sizeof(*rsv)); 5708 spin_lock_init(&rsv->lock); 5709 rsv->type = type; 5710 } 5711 5712 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 5713 struct btrfs_block_rsv *rsv, 5714 unsigned short type) 5715 { 5716 btrfs_init_block_rsv(rsv, type); 5717 rsv->space_info = __find_space_info(fs_info, 5718 BTRFS_BLOCK_GROUP_METADATA); 5719 } 5720 5721 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5722 unsigned short type) 5723 { 5724 struct btrfs_block_rsv *block_rsv; 5725 5726 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5727 if (!block_rsv) 5728 return NULL; 5729 5730 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); 5731 return block_rsv; 5732 } 5733 5734 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 5735 struct btrfs_block_rsv *rsv) 5736 { 5737 if (!rsv) 5738 return; 5739 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5740 kfree(rsv); 5741 } 5742 5743 int btrfs_block_rsv_add(struct btrfs_root *root, 5744 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 5745 enum btrfs_reserve_flush_enum flush) 5746 { 5747 int ret; 5748 5749 if (num_bytes == 0) 5750 return 0; 5751 5752 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5753 if (!ret) 5754 block_rsv_add_bytes(block_rsv, num_bytes, true); 5755 5756 return ret; 5757 } 5758 5759 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 5760 { 5761 u64 num_bytes = 0; 5762 int ret = -ENOSPC; 5763 5764 if (!block_rsv) 5765 return 0; 5766 5767 spin_lock(&block_rsv->lock); 5768 num_bytes = div_factor(block_rsv->size, min_factor); 5769 if (block_rsv->reserved >= num_bytes) 5770 ret = 0; 5771 spin_unlock(&block_rsv->lock); 5772 5773 return ret; 5774 } 5775 5776 int btrfs_block_rsv_refill(struct btrfs_root *root, 5777 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5778 enum btrfs_reserve_flush_enum flush) 5779 { 5780 u64 num_bytes = 0; 5781 int ret = -ENOSPC; 5782 5783 if (!block_rsv) 5784 return 0; 5785 5786 spin_lock(&block_rsv->lock); 5787 num_bytes = min_reserved; 5788 if (block_rsv->reserved >= num_bytes) 5789 ret = 0; 5790 else 5791 num_bytes -= block_rsv->reserved; 5792 spin_unlock(&block_rsv->lock); 5793 5794 if (!ret) 5795 return 0; 5796 5797 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5798 if (!ret) { 5799 block_rsv_add_bytes(block_rsv, num_bytes, false); 5800 return 0; 5801 } 5802 5803 return ret; 5804 } 5805 5806 static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv, 5807 u64 *metadata_bytes, u64 *qgroup_bytes) 5808 { 5809 *metadata_bytes = 0; 5810 *qgroup_bytes = 0; 5811 5812 spin_lock(&block_rsv->lock); 5813 if (block_rsv->reserved < block_rsv->size) 5814 *metadata_bytes = block_rsv->size - block_rsv->reserved; 5815 if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) 5816 *qgroup_bytes = block_rsv->qgroup_rsv_size - 5817 block_rsv->qgroup_rsv_reserved; 5818 spin_unlock(&block_rsv->lock); 
5819 } 5820 5821 /** 5822 * btrfs_inode_rsv_refill - refill the inode block rsv. 5823 * @inode - the inode we are refilling. 5824 * @flush - the flushing restriction. 5825 * 5826 * Essentially the same as btrfs_block_rsv_refill, except it uses the 5827 * block_rsv->size as the minimum size. We'll either refill the missing amount 5828 * or return if we already have enough space. This will also handle the reserve 5829 * tracepoint for the reserved amount. 5830 */ 5831 static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, 5832 enum btrfs_reserve_flush_enum flush) 5833 { 5834 struct btrfs_root *root = inode->root; 5835 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5836 u64 num_bytes, last = 0; 5837 u64 qgroup_num_bytes; 5838 int ret = -ENOSPC; 5839 5840 calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes); 5841 if (num_bytes == 0) 5842 return 0; 5843 5844 do { 5845 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, 5846 true); 5847 if (ret) 5848 return ret; 5849 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5850 if (ret) { 5851 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); 5852 last = num_bytes; 5853 /* 5854 * If we are fragmented we can end up with a lot of 5855 * outstanding extents which will make our size be much 5856 * larger than our reserved amount. 5857 * 5858 * If the reservation happens here, it might be very 5859 * big though not needed in the end, if the delalloc 5860 * flushing happens. 5861 * 5862 * If this is the case try and do the reserve again. 5863 */ 5864 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5865 calc_refill_bytes(block_rsv, &num_bytes, 5866 &qgroup_num_bytes); 5867 if (num_bytes == 0) 5868 return 0; 5869 } 5870 } while (ret && last != num_bytes); 5871 5872 if (!ret) { 5873 block_rsv_add_bytes(block_rsv, num_bytes, false); 5874 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5875 btrfs_ino(inode), num_bytes, 1); 5876 5877 /* Don't forget to increase qgroup_rsv_reserved */ 5878 spin_lock(&block_rsv->lock); 5879 block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; 5880 spin_unlock(&block_rsv->lock); 5881 } 5882 return ret; 5883 } 5884 5885 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5886 struct btrfs_block_rsv *block_rsv, 5887 u64 num_bytes, u64 *qgroup_to_release) 5888 { 5889 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5890 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 5891 struct btrfs_block_rsv *target = delayed_rsv; 5892 5893 if (target->full || target == block_rsv) 5894 target = global_rsv; 5895 5896 if (block_rsv->space_info != target->space_info) 5897 target = NULL; 5898 5899 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, 5900 qgroup_to_release); 5901 } 5902 5903 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5904 struct btrfs_block_rsv *block_rsv, 5905 u64 num_bytes) 5906 { 5907 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); 5908 } 5909 5910 /** 5911 * btrfs_inode_rsv_release - release any excessive reservation. 5912 * @inode - the inode we need to release from. 5913 * @qgroup_free - free or convert qgroup meta. 5914 * Unlike normal operation, qgroup meta reservation needs to know if we are 5915 * freeing qgroup reservation or just converting it into per-trans. Normally 5916 * @qgroup_free is true for error handling, and false for normal release. 5917 * 5918 * This is the same as btrfs_block_rsv_release, except that it handles the 5919 * tracepoint for the reservation. 
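 *
 * In this file it is called e.g. from btrfs_delalloc_release_metadata() and
 * btrfs_delalloc_release_extents() after the inode's rsv size has been
 * recalculated, so only the excess over the new size is actually given back.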
5920 */ 5921 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) 5922 { 5923 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5924 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5925 u64 released = 0; 5926 u64 qgroup_to_release = 0; 5927 5928 /* 5929 * Since we statically set the block_rsv->size we just want to say we 5930 * are releasing 0 bytes, and then we'll just get the reservation over 5931 * the size free'd. 5932 */ 5933 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, 5934 &qgroup_to_release); 5935 if (released > 0) 5936 trace_btrfs_space_reservation(fs_info, "delalloc", 5937 btrfs_ino(inode), released, 0); 5938 if (qgroup_free) 5939 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); 5940 else 5941 btrfs_qgroup_convert_reserved_meta(inode->root, 5942 qgroup_to_release); 5943 } 5944 5945 /** 5946 * btrfs_delayed_refs_rsv_release - release a ref head's reservation. 5947 * @fs_info - the fs_info for our fs. 5948 * @nr - the number of items to drop. 5949 * 5950 * This drops the delayed ref head's count from the delayed refs rsv and frees 5951 * any excess reservation we had. 5952 */ 5953 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) 5954 { 5955 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 5956 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5957 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); 5958 u64 released = 0; 5959 5960 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 5961 num_bytes, NULL); 5962 if (released) 5963 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5964 0, released, 0); 5965 } 5966 5967 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5968 { 5969 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5970 struct btrfs_space_info *sinfo = block_rsv->space_info; 5971 u64 num_bytes; 5972 5973 /* 5974 * The global block rsv is based on the size of the extent tree, the 5975 * checksum tree and the root tree. If the fs is empty we want to set 5976 * it to a minimal amount for safety. 
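 *
 * Concretely, matching the calculation below:
 * size = min(max(used(extent root) + used(csum root) + used(tree root), 16M), 512M)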
5977 */ 5978 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5979 btrfs_root_used(&fs_info->csum_root->root_item) + 5980 btrfs_root_used(&fs_info->tree_root->root_item); 5981 num_bytes = max_t(u64, num_bytes, SZ_16M); 5982 5983 spin_lock(&sinfo->lock); 5984 spin_lock(&block_rsv->lock); 5985 5986 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5987 5988 if (block_rsv->reserved < block_rsv->size) { 5989 num_bytes = btrfs_space_info_used(sinfo, true); 5990 if (sinfo->total_bytes > num_bytes) { 5991 num_bytes = sinfo->total_bytes - num_bytes; 5992 num_bytes = min(num_bytes, 5993 block_rsv->size - block_rsv->reserved); 5994 block_rsv->reserved += num_bytes; 5995 update_bytes_may_use(sinfo, num_bytes); 5996 trace_btrfs_space_reservation(fs_info, "space_info", 5997 sinfo->flags, num_bytes, 5998 1); 5999 } 6000 } else if (block_rsv->reserved > block_rsv->size) { 6001 num_bytes = block_rsv->reserved - block_rsv->size; 6002 update_bytes_may_use(sinfo, -num_bytes); 6003 trace_btrfs_space_reservation(fs_info, "space_info", 6004 sinfo->flags, num_bytes, 0); 6005 block_rsv->reserved = block_rsv->size; 6006 } 6007 6008 if (block_rsv->reserved == block_rsv->size) 6009 block_rsv->full = 1; 6010 else 6011 block_rsv->full = 0; 6012 6013 spin_unlock(&block_rsv->lock); 6014 spin_unlock(&sinfo->lock); 6015 } 6016 6017 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 6018 { 6019 struct btrfs_space_info *space_info; 6020 6021 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 6022 fs_info->chunk_block_rsv.space_info = space_info; 6023 6024 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 6025 fs_info->global_block_rsv.space_info = space_info; 6026 fs_info->trans_block_rsv.space_info = space_info; 6027 fs_info->empty_block_rsv.space_info = space_info; 6028 fs_info->delayed_block_rsv.space_info = space_info; 6029 fs_info->delayed_refs_rsv.space_info = space_info; 6030 6031 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; 6032 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; 6033 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 6034 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 6035 if (fs_info->quota_root) 6036 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 6037 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 6038 6039 update_global_block_rsv(fs_info); 6040 } 6041 6042 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 6043 { 6044 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 6045 (u64)-1, NULL); 6046 WARN_ON(fs_info->trans_block_rsv.size > 0); 6047 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 6048 WARN_ON(fs_info->chunk_block_rsv.size > 0); 6049 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 6050 WARN_ON(fs_info->delayed_block_rsv.size > 0); 6051 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 6052 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); 6053 WARN_ON(fs_info->delayed_refs_rsv.size > 0); 6054 } 6055 6056 /* 6057 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv 6058 * @trans - the trans that may have generated delayed refs 6059 * 6060 * This is to be called anytime we may have adjusted trans->delayed_ref_updates, 6061 * it'll calculate the additional size and add it to the delayed_refs_rsv. 
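 *
 * The amount added is btrfs_calc_trans_metadata_size(fs_info,
 * trans->delayed_ref_updates), and delayed_ref_updates is then reset to 0 so
 * the same updates are not accounted twice.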
6062 */ 6063 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) 6064 { 6065 struct btrfs_fs_info *fs_info = trans->fs_info; 6066 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 6067 u64 num_bytes; 6068 6069 if (!trans->delayed_ref_updates) 6070 return; 6071 6072 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 6073 trans->delayed_ref_updates); 6074 spin_lock(&delayed_rsv->lock); 6075 delayed_rsv->size += num_bytes; 6076 delayed_rsv->full = 0; 6077 spin_unlock(&delayed_rsv->lock); 6078 trans->delayed_ref_updates = 0; 6079 } 6080 6081 /* 6082 * To be called after all the new block groups attached to the transaction 6083 * handle have been created (btrfs_create_pending_block_groups()). 6084 */ 6085 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 6086 { 6087 struct btrfs_fs_info *fs_info = trans->fs_info; 6088 6089 if (!trans->chunk_bytes_reserved) 6090 return; 6091 6092 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 6093 6094 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 6095 trans->chunk_bytes_reserved, NULL); 6096 trans->chunk_bytes_reserved = 0; 6097 } 6098 6099 /* 6100 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 6101 * root: the root of the parent directory 6102 * rsv: block reservation 6103 * items: the number of items that we need to reserve 6104 * use_global_rsv: allow fallback to the global block reservation 6105 * 6106 * This function is used to reserve the space for snapshot/subvolume 6107 * creation and deletion. Those operations are different from the 6108 * common file/directory operations: they change two fs/file trees 6109 * and the root tree, and the number of items that the qgroup reserves is 6110 * different from the free space reservation. So we cannot use 6111 * the space reservation mechanism in start_transaction().
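 *
 * In short: with quotas enabled it pre-reserves 3 * nodesize of qgroup
 * metadata (one node for the parent inode, two for the dir entries), then
 * reserves btrfs_calc_trans_metadata_size(fs_info, items) bytes, falling
 * back to migrating that amount out of the global rsv on -ENOSPC when
 * use_global_rsv is set.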
6112 */ 6113 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 6114 struct btrfs_block_rsv *rsv, int items, 6115 bool use_global_rsv) 6116 { 6117 u64 qgroup_num_bytes = 0; 6118 u64 num_bytes; 6119 int ret; 6120 struct btrfs_fs_info *fs_info = root->fs_info; 6121 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6122 6123 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 6124 /* One for parent inode, two for dir entries */ 6125 qgroup_num_bytes = 3 * fs_info->nodesize; 6126 ret = btrfs_qgroup_reserve_meta_prealloc(root, 6127 qgroup_num_bytes, true); 6128 if (ret) 6129 return ret; 6130 } 6131 6132 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 6133 rsv->space_info = __find_space_info(fs_info, 6134 BTRFS_BLOCK_GROUP_METADATA); 6135 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 6136 BTRFS_RESERVE_FLUSH_ALL); 6137 6138 if (ret == -ENOSPC && use_global_rsv) 6139 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); 6140 6141 if (ret && qgroup_num_bytes) 6142 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); 6143 6144 return ret; 6145 } 6146 6147 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 6148 struct btrfs_block_rsv *rsv) 6149 { 6150 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 6151 } 6152 6153 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, 6154 struct btrfs_inode *inode) 6155 { 6156 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 6157 u64 reserve_size = 0; 6158 u64 qgroup_rsv_size = 0; 6159 u64 csum_leaves; 6160 unsigned outstanding_extents; 6161 6162 lockdep_assert_held(&inode->lock); 6163 outstanding_extents = inode->outstanding_extents; 6164 if (outstanding_extents) 6165 reserve_size = btrfs_calc_trans_metadata_size(fs_info, 6166 outstanding_extents + 1); 6167 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, 6168 inode->csum_bytes); 6169 reserve_size += btrfs_calc_trans_metadata_size(fs_info, 6170 csum_leaves); 6171 /* 6172 * For qgroup rsv, the calculation is very simple: 6173 * account one nodesize for each outstanding extent 6174 * 6175 * This is overestimating in most cases. 6176 */ 6177 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; 6178 6179 spin_lock(&block_rsv->lock); 6180 block_rsv->size = reserve_size; 6181 block_rsv->qgroup_rsv_size = qgroup_rsv_size; 6182 spin_unlock(&block_rsv->lock); 6183 } 6184 6185 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 6186 { 6187 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6188 unsigned nr_extents; 6189 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 6190 int ret = 0; 6191 bool delalloc_lock = true; 6192 6193 /* If we are a free space inode we need to not flush since we will be in 6194 * the middle of a transaction commit. We also don't need the delalloc 6195 * mutex since we won't race with anybody. We need this mostly to make 6196 * lockdep shut its filthy mouth. 6197 * 6198 * If we have a transaction open (can happen if we call truncate_block 6199 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
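 *
 * The flush mode therefore ends up as: BTRFS_RESERVE_NO_FLUSH for the free
 * space inode, BTRFS_RESERVE_FLUSH_LIMIT when a transaction is already open
 * (current->journal_info is set), and BTRFS_RESERVE_FLUSH_ALL otherwise.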
6200 */ 6201 if (btrfs_is_free_space_inode(inode)) { 6202 flush = BTRFS_RESERVE_NO_FLUSH; 6203 delalloc_lock = false; 6204 } else { 6205 if (current->journal_info) 6206 flush = BTRFS_RESERVE_FLUSH_LIMIT; 6207 6208 if (btrfs_transaction_in_commit(fs_info)) 6209 schedule_timeout(1); 6210 } 6211 6212 if (delalloc_lock) 6213 mutex_lock(&inode->delalloc_mutex); 6214 6215 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6216 6217 /* Add our new extents and calculate the new rsv size. */ 6218 spin_lock(&inode->lock); 6219 nr_extents = count_max_extents(num_bytes); 6220 btrfs_mod_outstanding_extents(inode, nr_extents); 6221 inode->csum_bytes += num_bytes; 6222 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6223 spin_unlock(&inode->lock); 6224 6225 ret = btrfs_inode_rsv_refill(inode, flush); 6226 if (unlikely(ret)) 6227 goto out_fail; 6228 6229 if (delalloc_lock) 6230 mutex_unlock(&inode->delalloc_mutex); 6231 return 0; 6232 6233 out_fail: 6234 spin_lock(&inode->lock); 6235 nr_extents = count_max_extents(num_bytes); 6236 btrfs_mod_outstanding_extents(inode, -nr_extents); 6237 inode->csum_bytes -= num_bytes; 6238 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6239 spin_unlock(&inode->lock); 6240 6241 btrfs_inode_rsv_release(inode, true); 6242 if (delalloc_lock) 6243 mutex_unlock(&inode->delalloc_mutex); 6244 return ret; 6245 } 6246 6247 /** 6248 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6249 * @inode: the inode to release the reservation for. 6250 * @num_bytes: the number of bytes we are releasing. 6251 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation 6252 * 6253 * This will release the metadata reservation for an inode. This can be called 6254 * once we complete IO for a given set of bytes to release their metadata 6255 * reservations, or on error for the same reason. 6256 */ 6257 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, 6258 bool qgroup_free) 6259 { 6260 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6261 6262 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6263 spin_lock(&inode->lock); 6264 inode->csum_bytes -= num_bytes; 6265 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6266 spin_unlock(&inode->lock); 6267 6268 if (btrfs_is_testing(fs_info)) 6269 return; 6270 6271 btrfs_inode_rsv_release(inode, qgroup_free); 6272 } 6273 6274 /** 6275 * btrfs_delalloc_release_extents - release our outstanding_extents 6276 * @inode: the inode to balance the reservation for. 6277 * @num_bytes: the number of bytes we originally reserved with 6278 * @qgroup_free: do we need to free qgroup meta reservation or convert them. 6279 * 6280 * When we reserve space we increase outstanding_extents for the extents we may 6281 * add. Once we've set the range as delalloc or created our ordered extents we 6282 * have outstanding_extents to track the real usage, so we use this to free our 6283 * temporarily tracked outstanding_extents. This _must_ be used in conjunction 6284 * with btrfs_delalloc_reserve_metadata. 
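 *
 * A minimal sketch of the intended pairing (illustrative only; the
 * qgroup_free argument depends on the caller's error handling):
 *
 *   btrfs_delalloc_reserve_metadata(inode, len);
 *   ... mark the range delalloc / create the ordered extents ...
 *   btrfs_delalloc_release_extents(inode, len, false);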
6285 */ 6286 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, 6287 bool qgroup_free) 6288 { 6289 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6290 unsigned num_extents; 6291 6292 spin_lock(&inode->lock); 6293 num_extents = count_max_extents(num_bytes); 6294 btrfs_mod_outstanding_extents(inode, -num_extents); 6295 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6296 spin_unlock(&inode->lock); 6297 6298 if (btrfs_is_testing(fs_info)) 6299 return; 6300 6301 btrfs_inode_rsv_release(inode, qgroup_free); 6302 } 6303 6304 /** 6305 * btrfs_delalloc_reserve_space - reserve data and metadata space for 6306 * delalloc 6307 * @inode: inode we're writing to 6308 * @start: start range we are writing to 6309 * @len: how long the range we are writing to 6310 * @reserved: mandatory parameter, record actually reserved qgroup ranges of 6311 * current reservation. 6312 * 6313 * This will do the following things 6314 * 6315 * o reserve space in data space info for num bytes 6316 * and reserve precious corresponding qgroup space 6317 * (Done in check_data_free_space) 6318 * 6319 * o reserve space for metadata space, based on the number of outstanding 6320 * extents and how much csums will be needed 6321 * also reserve metadata space in a per root over-reserve method. 6322 * o add to the inodes->delalloc_bytes 6323 * o add it to the fs_info's delalloc inodes list. 6324 * (Above 3 all done in delalloc_reserve_metadata) 6325 * 6326 * Return 0 for success 6327 * Return <0 for error(-ENOSPC or -EQUOT) 6328 */ 6329 int btrfs_delalloc_reserve_space(struct inode *inode, 6330 struct extent_changeset **reserved, u64 start, u64 len) 6331 { 6332 int ret; 6333 6334 ret = btrfs_check_data_free_space(inode, reserved, start, len); 6335 if (ret < 0) 6336 return ret; 6337 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); 6338 if (ret < 0) 6339 btrfs_free_reserved_data_space(inode, *reserved, start, len); 6340 return ret; 6341 } 6342 6343 /** 6344 * btrfs_delalloc_release_space - release data and metadata space for delalloc 6345 * @inode: inode we're releasing space for 6346 * @start: start position of the space already reserved 6347 * @len: the len of the space already reserved 6348 * @release_bytes: the len of the space we consumed or didn't use 6349 * 6350 * This function will release the metadata space that was not used and will 6351 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6352 * list if there are no delalloc bytes left. 6353 * Also it will handle the qgroup reserved space. 
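 *
 * It is the counterpart of btrfs_delalloc_reserve_space() and simply calls
 * btrfs_delalloc_release_metadata() followed by
 * btrfs_free_reserved_data_space() for the given range.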
6354 */ 6355 void btrfs_delalloc_release_space(struct inode *inode, 6356 struct extent_changeset *reserved, 6357 u64 start, u64 len, bool qgroup_free) 6358 { 6359 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); 6360 btrfs_free_reserved_data_space(inode, reserved, start, len); 6361 } 6362 6363 static int update_block_group(struct btrfs_trans_handle *trans, 6364 struct btrfs_fs_info *info, u64 bytenr, 6365 u64 num_bytes, int alloc) 6366 { 6367 struct btrfs_block_group_cache *cache = NULL; 6368 u64 total = num_bytes; 6369 u64 old_val; 6370 u64 byte_in_group; 6371 int factor; 6372 int ret = 0; 6373 6374 /* block accounting for super block */ 6375 spin_lock(&info->delalloc_root_lock); 6376 old_val = btrfs_super_bytes_used(info->super_copy); 6377 if (alloc) 6378 old_val += num_bytes; 6379 else 6380 old_val -= num_bytes; 6381 btrfs_set_super_bytes_used(info->super_copy, old_val); 6382 spin_unlock(&info->delalloc_root_lock); 6383 6384 while (total) { 6385 cache = btrfs_lookup_block_group(info, bytenr); 6386 if (!cache) { 6387 ret = -ENOENT; 6388 break; 6389 } 6390 factor = btrfs_bg_type_to_factor(cache->flags); 6391 6392 /* 6393 * If this block group has free space cache written out, we 6394 * need to make sure to load it if we are removing space. This 6395 * is because we need the unpinning stage to actually add the 6396 * space back to the block group, otherwise we will leak space. 6397 */ 6398 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6399 cache_block_group(cache, 1); 6400 6401 byte_in_group = bytenr - cache->key.objectid; 6402 WARN_ON(byte_in_group > cache->key.offset); 6403 6404 spin_lock(&cache->space_info->lock); 6405 spin_lock(&cache->lock); 6406 6407 if (btrfs_test_opt(info, SPACE_CACHE) && 6408 cache->disk_cache_state < BTRFS_DC_CLEAR) 6409 cache->disk_cache_state = BTRFS_DC_CLEAR; 6410 6411 old_val = btrfs_block_group_used(&cache->item); 6412 num_bytes = min(total, cache->key.offset - byte_in_group); 6413 if (alloc) { 6414 old_val += num_bytes; 6415 btrfs_set_block_group_used(&cache->item, old_val); 6416 cache->reserved -= num_bytes; 6417 cache->space_info->bytes_reserved -= num_bytes; 6418 cache->space_info->bytes_used += num_bytes; 6419 cache->space_info->disk_used += num_bytes * factor; 6420 spin_unlock(&cache->lock); 6421 spin_unlock(&cache->space_info->lock); 6422 } else { 6423 old_val -= num_bytes; 6424 btrfs_set_block_group_used(&cache->item, old_val); 6425 cache->pinned += num_bytes; 6426 update_bytes_pinned(cache->space_info, num_bytes); 6427 cache->space_info->bytes_used -= num_bytes; 6428 cache->space_info->disk_used -= num_bytes * factor; 6429 spin_unlock(&cache->lock); 6430 spin_unlock(&cache->space_info->lock); 6431 6432 trace_btrfs_space_reservation(info, "pinned", 6433 cache->space_info->flags, 6434 num_bytes, 1); 6435 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 6436 num_bytes, 6437 BTRFS_TOTAL_BYTES_PINNED_BATCH); 6438 set_extent_dirty(info->pinned_extents, 6439 bytenr, bytenr + num_bytes - 1, 6440 GFP_NOFS | __GFP_NOFAIL); 6441 } 6442 6443 spin_lock(&trans->transaction->dirty_bgs_lock); 6444 if (list_empty(&cache->dirty_list)) { 6445 list_add_tail(&cache->dirty_list, 6446 &trans->transaction->dirty_bgs); 6447 trans->transaction->num_dirty_bgs++; 6448 trans->delayed_ref_updates++; 6449 btrfs_get_block_group(cache); 6450 } 6451 spin_unlock(&trans->transaction->dirty_bgs_lock); 6452 6453 /* 6454 * No longer have used bytes in this block group, queue it for 6455 * deletion. 
We do this after adding the block group to the 6456 * dirty list to avoid races between cleaner kthread and space 6457 * cache writeout. 6458 */ 6459 if (!alloc && old_val == 0) 6460 btrfs_mark_bg_unused(cache); 6461 6462 btrfs_put_block_group(cache); 6463 total -= num_bytes; 6464 bytenr += num_bytes; 6465 } 6466 6467 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 6468 btrfs_update_delayed_refs_rsv(trans); 6469 return ret; 6470 } 6471 6472 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6473 { 6474 struct btrfs_block_group_cache *cache; 6475 u64 bytenr; 6476 6477 spin_lock(&fs_info->block_group_cache_lock); 6478 bytenr = fs_info->first_logical_byte; 6479 spin_unlock(&fs_info->block_group_cache_lock); 6480 6481 if (bytenr < (u64)-1) 6482 return bytenr; 6483 6484 cache = btrfs_lookup_first_block_group(fs_info, search_start); 6485 if (!cache) 6486 return 0; 6487 6488 bytenr = cache->key.objectid; 6489 btrfs_put_block_group(cache); 6490 6491 return bytenr; 6492 } 6493 6494 static int pin_down_extent(struct btrfs_fs_info *fs_info, 6495 struct btrfs_block_group_cache *cache, 6496 u64 bytenr, u64 num_bytes, int reserved) 6497 { 6498 spin_lock(&cache->space_info->lock); 6499 spin_lock(&cache->lock); 6500 cache->pinned += num_bytes; 6501 update_bytes_pinned(cache->space_info, num_bytes); 6502 if (reserved) { 6503 cache->reserved -= num_bytes; 6504 cache->space_info->bytes_reserved -= num_bytes; 6505 } 6506 spin_unlock(&cache->lock); 6507 spin_unlock(&cache->space_info->lock); 6508 6509 trace_btrfs_space_reservation(fs_info, "pinned", 6510 cache->space_info->flags, num_bytes, 1); 6511 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 6512 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); 6513 set_extent_dirty(fs_info->pinned_extents, bytenr, 6514 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6515 return 0; 6516 } 6517 6518 /* 6519 * this function must be called within transaction 6520 */ 6521 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 6522 u64 bytenr, u64 num_bytes, int reserved) 6523 { 6524 struct btrfs_block_group_cache *cache; 6525 6526 cache = btrfs_lookup_block_group(fs_info, bytenr); 6527 BUG_ON(!cache); /* Logic error */ 6528 6529 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); 6530 6531 btrfs_put_block_group(cache); 6532 return 0; 6533 } 6534 6535 /* 6536 * this function must be called within transaction 6537 */ 6538 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 6539 u64 bytenr, u64 num_bytes) 6540 { 6541 struct btrfs_block_group_cache *cache; 6542 int ret; 6543 6544 cache = btrfs_lookup_block_group(fs_info, bytenr); 6545 if (!cache) 6546 return -EINVAL; 6547 6548 /* 6549 * pull in the free space cache (if any) so that our pin 6550 * removes the free space from the cache. We have load_only set 6551 * to one because the slow code to read in the free extents does check 6552 * the pinned extents. 
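 *
 * Once cached, we pin the range and then drop it from the free space cache,
 * so extents pinned for log replay cannot be handed out again by the
 * allocator.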
6553 */ 6554 cache_block_group(cache, 1); 6555 6556 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); 6557 6558 /* remove us from the free space cache (if we're there at all) */ 6559 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6560 btrfs_put_block_group(cache); 6561 return ret; 6562 } 6563 6564 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 6565 u64 start, u64 num_bytes) 6566 { 6567 int ret; 6568 struct btrfs_block_group_cache *block_group; 6569 struct btrfs_caching_control *caching_ctl; 6570 6571 block_group = btrfs_lookup_block_group(fs_info, start); 6572 if (!block_group) 6573 return -EINVAL; 6574 6575 cache_block_group(block_group, 0); 6576 caching_ctl = get_caching_control(block_group); 6577 6578 if (!caching_ctl) { 6579 /* Logic error */ 6580 BUG_ON(!block_group_cache_done(block_group)); 6581 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6582 } else { 6583 mutex_lock(&caching_ctl->mutex); 6584 6585 if (start >= caching_ctl->progress) { 6586 ret = add_excluded_extent(fs_info, start, num_bytes); 6587 } else if (start + num_bytes <= caching_ctl->progress) { 6588 ret = btrfs_remove_free_space(block_group, 6589 start, num_bytes); 6590 } else { 6591 num_bytes = caching_ctl->progress - start; 6592 ret = btrfs_remove_free_space(block_group, 6593 start, num_bytes); 6594 if (ret) 6595 goto out_lock; 6596 6597 num_bytes = (start + num_bytes) - 6598 caching_ctl->progress; 6599 start = caching_ctl->progress; 6600 ret = add_excluded_extent(fs_info, start, num_bytes); 6601 } 6602 out_lock: 6603 mutex_unlock(&caching_ctl->mutex); 6604 put_caching_control(caching_ctl); 6605 } 6606 btrfs_put_block_group(block_group); 6607 return ret; 6608 } 6609 6610 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, 6611 struct extent_buffer *eb) 6612 { 6613 struct btrfs_file_extent_item *item; 6614 struct btrfs_key key; 6615 int found_type; 6616 int i; 6617 int ret = 0; 6618 6619 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 6620 return 0; 6621 6622 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6623 btrfs_item_key_to_cpu(eb, &key, i); 6624 if (key.type != BTRFS_EXTENT_DATA_KEY) 6625 continue; 6626 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6627 found_type = btrfs_file_extent_type(eb, item); 6628 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6629 continue; 6630 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6631 continue; 6632 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6633 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6634 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 6635 if (ret) 6636 break; 6637 } 6638 6639 return ret; 6640 } 6641 6642 static void 6643 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6644 { 6645 atomic_inc(&bg->reservations); 6646 } 6647 6648 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6649 const u64 start) 6650 { 6651 struct btrfs_block_group_cache *bg; 6652 6653 bg = btrfs_lookup_block_group(fs_info, start); 6654 ASSERT(bg); 6655 if (atomic_dec_and_test(&bg->reservations)) 6656 wake_up_var(&bg->reservations); 6657 btrfs_put_block_group(bg); 6658 } 6659 6660 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6661 { 6662 struct btrfs_space_info *space_info = bg->space_info; 6663 6664 ASSERT(bg->ro); 6665 6666 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6667 return; 6668 6669 /* 6670 * Our block group is read only but before we set it to read only, 6671 * some task might have had allocated an 
extent from it already, but it 6672 * has not yet created a respective ordered extent (and added it to a 6673 * root's list of ordered extents). 6674 * Therefore wait for any task currently allocating extents, since the 6675 * block group's reservations counter is incremented while a read lock 6676 * on the groups' semaphore is held and decremented after releasing 6677 * the read access on that semaphore and creating the ordered extent. 6678 */ 6679 down_write(&space_info->groups_sem); 6680 up_write(&space_info->groups_sem); 6681 6682 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); 6683 } 6684 6685 /** 6686 * btrfs_add_reserved_bytes - update the block_group and space info counters 6687 * @cache: The cache we are manipulating 6688 * @ram_bytes: The number of bytes of file content, and will be same to 6689 * @num_bytes except for the compress path. 6690 * @num_bytes: The number of bytes in question 6691 * @delalloc: The blocks are allocated for the delalloc write 6692 * 6693 * This is called by the allocator when it reserves space. If this is a 6694 * reservation and the block group has become read only we cannot make the 6695 * reservation and return -EAGAIN, otherwise this function always succeeds. 6696 */ 6697 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, 6698 u64 ram_bytes, u64 num_bytes, int delalloc) 6699 { 6700 struct btrfs_space_info *space_info = cache->space_info; 6701 int ret = 0; 6702 6703 spin_lock(&space_info->lock); 6704 spin_lock(&cache->lock); 6705 if (cache->ro) { 6706 ret = -EAGAIN; 6707 } else { 6708 cache->reserved += num_bytes; 6709 space_info->bytes_reserved += num_bytes; 6710 update_bytes_may_use(space_info, -ram_bytes); 6711 if (delalloc) 6712 cache->delalloc_bytes += num_bytes; 6713 } 6714 spin_unlock(&cache->lock); 6715 spin_unlock(&space_info->lock); 6716 return ret; 6717 } 6718 6719 /** 6720 * btrfs_free_reserved_bytes - update the block_group and space info counters 6721 * @cache: The cache we are manipulating 6722 * @num_bytes: The number of bytes in question 6723 * @delalloc: The blocks are allocated for the delalloc write 6724 * 6725 * This is called by somebody who is freeing space that was never actually used 6726 * on disk. For example if you reserve some space for a new leaf in transaction 6727 * A and before transaction A commits you free that leaf, you call this with 6728 * reserve set to 0 in order to clear the reservation. 
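 *
 * The effect is to drop @num_bytes from both cache->reserved and
 * space_info->bytes_reserved; if the block group has gone read only in the
 * meantime, the bytes are accounted as bytes_readonly instead of becoming
 * allocatable again.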
6729 */ 6730 6731 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 6732 u64 num_bytes, int delalloc) 6733 { 6734 struct btrfs_space_info *space_info = cache->space_info; 6735 6736 spin_lock(&space_info->lock); 6737 spin_lock(&cache->lock); 6738 if (cache->ro) 6739 space_info->bytes_readonly += num_bytes; 6740 cache->reserved -= num_bytes; 6741 space_info->bytes_reserved -= num_bytes; 6742 space_info->max_extent_size = 0; 6743 6744 if (delalloc) 6745 cache->delalloc_bytes -= num_bytes; 6746 spin_unlock(&cache->lock); 6747 spin_unlock(&space_info->lock); 6748 } 6749 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 6750 { 6751 struct btrfs_caching_control *next; 6752 struct btrfs_caching_control *caching_ctl; 6753 struct btrfs_block_group_cache *cache; 6754 6755 down_write(&fs_info->commit_root_sem); 6756 6757 list_for_each_entry_safe(caching_ctl, next, 6758 &fs_info->caching_block_groups, list) { 6759 cache = caching_ctl->block_group; 6760 if (block_group_cache_done(cache)) { 6761 cache->last_byte_to_unpin = (u64)-1; 6762 list_del_init(&caching_ctl->list); 6763 put_caching_control(caching_ctl); 6764 } else { 6765 cache->last_byte_to_unpin = caching_ctl->progress; 6766 } 6767 } 6768 6769 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6770 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6771 else 6772 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6773 6774 up_write(&fs_info->commit_root_sem); 6775 6776 update_global_block_rsv(fs_info); 6777 } 6778 6779 /* 6780 * Returns the free cluster for the given space info and sets empty_cluster to 6781 * what it should be based on the mount options. 6782 */ 6783 static struct btrfs_free_cluster * 6784 fetch_cluster_info(struct btrfs_fs_info *fs_info, 6785 struct btrfs_space_info *space_info, u64 *empty_cluster) 6786 { 6787 struct btrfs_free_cluster *ret = NULL; 6788 6789 *empty_cluster = 0; 6790 if (btrfs_mixed_space_info(space_info)) 6791 return ret; 6792 6793 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6794 ret = &fs_info->meta_alloc_cluster; 6795 if (btrfs_test_opt(fs_info, SSD)) 6796 *empty_cluster = SZ_2M; 6797 else 6798 *empty_cluster = SZ_64K; 6799 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 6800 btrfs_test_opt(fs_info, SSD_SPREAD)) { 6801 *empty_cluster = SZ_2M; 6802 ret = &fs_info->data_alloc_cluster; 6803 } 6804 6805 return ret; 6806 } 6807 6808 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 6809 u64 start, u64 end, 6810 const bool return_free_space) 6811 { 6812 struct btrfs_block_group_cache *cache = NULL; 6813 struct btrfs_space_info *space_info; 6814 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6815 struct btrfs_free_cluster *cluster = NULL; 6816 u64 len; 6817 u64 total_unpinned = 0; 6818 u64 empty_cluster = 0; 6819 bool readonly; 6820 6821 while (start <= end) { 6822 readonly = false; 6823 if (!cache || 6824 start >= cache->key.objectid + cache->key.offset) { 6825 if (cache) 6826 btrfs_put_block_group(cache); 6827 total_unpinned = 0; 6828 cache = btrfs_lookup_block_group(fs_info, start); 6829 BUG_ON(!cache); /* Logic error */ 6830 6831 cluster = fetch_cluster_info(fs_info, 6832 cache->space_info, 6833 &empty_cluster); 6834 empty_cluster <<= 1; 6835 } 6836 6837 len = cache->key.objectid + cache->key.offset - start; 6838 len = min(len, end + 1 - start); 6839 6840 if (start < cache->last_byte_to_unpin) { 6841 len = min(len, cache->last_byte_to_unpin - start); 6842 if (return_free_space) 6843 
btrfs_add_free_space(cache, start, len); 6844 } 6845 6846 start += len; 6847 total_unpinned += len; 6848 space_info = cache->space_info; 6849 6850 /* 6851 * If this space cluster has been marked as fragmented and we've 6852 * unpinned enough in this block group to potentially allow a 6853 * cluster to be created inside of it go ahead and clear the 6854 * fragmented check. 6855 */ 6856 if (cluster && cluster->fragmented && 6857 total_unpinned > empty_cluster) { 6858 spin_lock(&cluster->lock); 6859 cluster->fragmented = 0; 6860 spin_unlock(&cluster->lock); 6861 } 6862 6863 spin_lock(&space_info->lock); 6864 spin_lock(&cache->lock); 6865 cache->pinned -= len; 6866 update_bytes_pinned(space_info, -len); 6867 6868 trace_btrfs_space_reservation(fs_info, "pinned", 6869 space_info->flags, len, 0); 6870 space_info->max_extent_size = 0; 6871 percpu_counter_add_batch(&space_info->total_bytes_pinned, 6872 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); 6873 if (cache->ro) { 6874 space_info->bytes_readonly += len; 6875 readonly = true; 6876 } 6877 spin_unlock(&cache->lock); 6878 if (!readonly && return_free_space && 6879 global_rsv->space_info == space_info) { 6880 u64 to_add = len; 6881 6882 spin_lock(&global_rsv->lock); 6883 if (!global_rsv->full) { 6884 to_add = min(len, global_rsv->size - 6885 global_rsv->reserved); 6886 global_rsv->reserved += to_add; 6887 update_bytes_may_use(space_info, to_add); 6888 if (global_rsv->reserved >= global_rsv->size) 6889 global_rsv->full = 1; 6890 trace_btrfs_space_reservation(fs_info, 6891 "space_info", 6892 space_info->flags, 6893 to_add, 1); 6894 len -= to_add; 6895 } 6896 spin_unlock(&global_rsv->lock); 6897 /* Add to any tickets we may have */ 6898 if (len) 6899 space_info_add_new_bytes(fs_info, space_info, 6900 len); 6901 } 6902 spin_unlock(&space_info->lock); 6903 } 6904 6905 if (cache) 6906 btrfs_put_block_group(cache); 6907 return 0; 6908 } 6909 6910 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 6911 { 6912 struct btrfs_fs_info *fs_info = trans->fs_info; 6913 struct btrfs_block_group_cache *block_group, *tmp; 6914 struct list_head *deleted_bgs; 6915 struct extent_io_tree *unpin; 6916 u64 start; 6917 u64 end; 6918 int ret; 6919 6920 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6921 unpin = &fs_info->freed_extents[1]; 6922 else 6923 unpin = &fs_info->freed_extents[0]; 6924 6925 while (!trans->aborted) { 6926 struct extent_state *cached_state = NULL; 6927 6928 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6929 ret = find_first_extent_bit(unpin, 0, &start, &end, 6930 EXTENT_DIRTY, &cached_state); 6931 if (ret) { 6932 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6933 break; 6934 } 6935 6936 if (btrfs_test_opt(fs_info, DISCARD)) 6937 ret = btrfs_discard_extent(fs_info, start, 6938 end + 1 - start, NULL); 6939 6940 clear_extent_dirty(unpin, start, end, &cached_state); 6941 unpin_extent_range(fs_info, start, end, true); 6942 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6943 free_extent_state(cached_state); 6944 cond_resched(); 6945 } 6946 6947 /* 6948 * Transaction is finished. We don't need the lock anymore. We 6949 * do need to clean up the block groups in case of a transaction 6950 * abort. 
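 *
 * Below, each block group deleted in this transaction gets a discard of its
 * entire range (skipped if the transaction aborted) before its trimming hold
 * and reference are dropped.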
6951 */ 6952 deleted_bgs = &trans->transaction->deleted_bgs; 6953 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6954 u64 trimmed = 0; 6955 6956 ret = -EROFS; 6957 if (!trans->aborted) 6958 ret = btrfs_discard_extent(fs_info, 6959 block_group->key.objectid, 6960 block_group->key.offset, 6961 &trimmed); 6962 6963 list_del_init(&block_group->bg_list); 6964 btrfs_put_block_group_trimming(block_group); 6965 btrfs_put_block_group(block_group); 6966 6967 if (ret) { 6968 const char *errstr = btrfs_decode_error(ret); 6969 btrfs_warn(fs_info, 6970 "discard failed while removing blockgroup: errno=%d %s", 6971 ret, errstr); 6972 } 6973 } 6974 6975 return 0; 6976 } 6977 6978 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6979 struct btrfs_delayed_ref_node *node, u64 parent, 6980 u64 root_objectid, u64 owner_objectid, 6981 u64 owner_offset, int refs_to_drop, 6982 struct btrfs_delayed_extent_op *extent_op) 6983 { 6984 struct btrfs_fs_info *info = trans->fs_info; 6985 struct btrfs_key key; 6986 struct btrfs_path *path; 6987 struct btrfs_root *extent_root = info->extent_root; 6988 struct extent_buffer *leaf; 6989 struct btrfs_extent_item *ei; 6990 struct btrfs_extent_inline_ref *iref; 6991 int ret; 6992 int is_data; 6993 int extent_slot = 0; 6994 int found_extent = 0; 6995 int num_to_del = 1; 6996 u32 item_size; 6997 u64 refs; 6998 u64 bytenr = node->bytenr; 6999 u64 num_bytes = node->num_bytes; 7000 int last_ref = 0; 7001 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 7002 7003 path = btrfs_alloc_path(); 7004 if (!path) 7005 return -ENOMEM; 7006 7007 path->reada = READA_FORWARD; 7008 path->leave_spinning = 1; 7009 7010 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 7011 BUG_ON(!is_data && refs_to_drop != 1); 7012 7013 if (is_data) 7014 skinny_metadata = false; 7015 7016 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 7017 parent, root_objectid, owner_objectid, 7018 owner_offset); 7019 if (ret == 0) { 7020 extent_slot = path->slots[0]; 7021 while (extent_slot >= 0) { 7022 btrfs_item_key_to_cpu(path->nodes[0], &key, 7023 extent_slot); 7024 if (key.objectid != bytenr) 7025 break; 7026 if (key.type == BTRFS_EXTENT_ITEM_KEY && 7027 key.offset == num_bytes) { 7028 found_extent = 1; 7029 break; 7030 } 7031 if (key.type == BTRFS_METADATA_ITEM_KEY && 7032 key.offset == owner_objectid) { 7033 found_extent = 1; 7034 break; 7035 } 7036 if (path->slots[0] - extent_slot > 5) 7037 break; 7038 extent_slot--; 7039 } 7040 7041 if (!found_extent) { 7042 BUG_ON(iref); 7043 ret = remove_extent_backref(trans, path, NULL, 7044 refs_to_drop, 7045 is_data, &last_ref); 7046 if (ret) { 7047 btrfs_abort_transaction(trans, ret); 7048 goto out; 7049 } 7050 btrfs_release_path(path); 7051 path->leave_spinning = 1; 7052 7053 key.objectid = bytenr; 7054 key.type = BTRFS_EXTENT_ITEM_KEY; 7055 key.offset = num_bytes; 7056 7057 if (!is_data && skinny_metadata) { 7058 key.type = BTRFS_METADATA_ITEM_KEY; 7059 key.offset = owner_objectid; 7060 } 7061 7062 ret = btrfs_search_slot(trans, extent_root, 7063 &key, path, -1, 1); 7064 if (ret > 0 && skinny_metadata && path->slots[0]) { 7065 /* 7066 * Couldn't find our skinny metadata item, 7067 * see if we have ye olde extent item. 
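 *
 * i.e. step back one slot and check whether the previous item is an
 * EXTENT_ITEM_KEY with matching bytenr/num_bytes.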
7068 */ 7069 path->slots[0]--; 7070 btrfs_item_key_to_cpu(path->nodes[0], &key, 7071 path->slots[0]); 7072 if (key.objectid == bytenr && 7073 key.type == BTRFS_EXTENT_ITEM_KEY && 7074 key.offset == num_bytes) 7075 ret = 0; 7076 } 7077 7078 if (ret > 0 && skinny_metadata) { 7079 skinny_metadata = false; 7080 key.objectid = bytenr; 7081 key.type = BTRFS_EXTENT_ITEM_KEY; 7082 key.offset = num_bytes; 7083 btrfs_release_path(path); 7084 ret = btrfs_search_slot(trans, extent_root, 7085 &key, path, -1, 1); 7086 } 7087 7088 if (ret) { 7089 btrfs_err(info, 7090 "umm, got %d back from search, was looking for %llu", 7091 ret, bytenr); 7092 if (ret > 0) 7093 btrfs_print_leaf(path->nodes[0]); 7094 } 7095 if (ret < 0) { 7096 btrfs_abort_transaction(trans, ret); 7097 goto out; 7098 } 7099 extent_slot = path->slots[0]; 7100 } 7101 } else if (WARN_ON(ret == -ENOENT)) { 7102 btrfs_print_leaf(path->nodes[0]); 7103 btrfs_err(info, 7104 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 7105 bytenr, parent, root_objectid, owner_objectid, 7106 owner_offset); 7107 btrfs_abort_transaction(trans, ret); 7108 goto out; 7109 } else { 7110 btrfs_abort_transaction(trans, ret); 7111 goto out; 7112 } 7113 7114 leaf = path->nodes[0]; 7115 item_size = btrfs_item_size_nr(leaf, extent_slot); 7116 if (unlikely(item_size < sizeof(*ei))) { 7117 ret = -EINVAL; 7118 btrfs_print_v0_err(info); 7119 btrfs_abort_transaction(trans, ret); 7120 goto out; 7121 } 7122 ei = btrfs_item_ptr(leaf, extent_slot, 7123 struct btrfs_extent_item); 7124 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 7125 key.type == BTRFS_EXTENT_ITEM_KEY) { 7126 struct btrfs_tree_block_info *bi; 7127 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 7128 bi = (struct btrfs_tree_block_info *)(ei + 1); 7129 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 7130 } 7131 7132 refs = btrfs_extent_refs(leaf, ei); 7133 if (refs < refs_to_drop) { 7134 btrfs_err(info, 7135 "trying to drop %d refs but we only have %Lu for bytenr %Lu", 7136 refs_to_drop, refs, bytenr); 7137 ret = -EINVAL; 7138 btrfs_abort_transaction(trans, ret); 7139 goto out; 7140 } 7141 refs -= refs_to_drop; 7142 7143 if (refs > 0) { 7144 if (extent_op) 7145 __run_delayed_extent_op(extent_op, leaf, ei); 7146 /* 7147 * In the case of inline back ref, reference count will 7148 * be updated by remove_extent_backref 7149 */ 7150 if (iref) { 7151 BUG_ON(!found_extent); 7152 } else { 7153 btrfs_set_extent_refs(leaf, ei, refs); 7154 btrfs_mark_buffer_dirty(leaf); 7155 } 7156 if (found_extent) { 7157 ret = remove_extent_backref(trans, path, iref, 7158 refs_to_drop, is_data, 7159 &last_ref); 7160 if (ret) { 7161 btrfs_abort_transaction(trans, ret); 7162 goto out; 7163 } 7164 } 7165 } else { 7166 if (found_extent) { 7167 BUG_ON(is_data && refs_to_drop != 7168 extent_data_ref_count(path, iref)); 7169 if (iref) { 7170 BUG_ON(path->slots[0] != extent_slot); 7171 } else { 7172 BUG_ON(path->slots[0] != extent_slot + 1); 7173 path->slots[0] = extent_slot; 7174 num_to_del = 2; 7175 } 7176 } 7177 7178 last_ref = 1; 7179 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 7180 num_to_del); 7181 if (ret) { 7182 btrfs_abort_transaction(trans, ret); 7183 goto out; 7184 } 7185 btrfs_release_path(path); 7186 7187 if (is_data) { 7188 ret = btrfs_del_csums(trans, info, bytenr, num_bytes); 7189 if (ret) { 7190 btrfs_abort_transaction(trans, ret); 7191 goto out; 7192 } 7193 } 7194 7195 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 7196 if (ret) { 7197 
btrfs_abort_transaction(trans, ret); 7198 goto out; 7199 } 7200 7201 ret = update_block_group(trans, info, bytenr, num_bytes, 0); 7202 if (ret) { 7203 btrfs_abort_transaction(trans, ret); 7204 goto out; 7205 } 7206 } 7207 btrfs_release_path(path); 7208 7209 out: 7210 btrfs_free_path(path); 7211 return ret; 7212 } 7213 7214 /* 7215 * when we free an block, it is possible (and likely) that we free the last 7216 * delayed ref for that extent as well. This searches the delayed ref tree for 7217 * a given extent, and if there are no other delayed refs to be processed, it 7218 * removes it from the tree. 7219 */ 7220 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 7221 u64 bytenr) 7222 { 7223 struct btrfs_delayed_ref_head *head; 7224 struct btrfs_delayed_ref_root *delayed_refs; 7225 int ret = 0; 7226 7227 delayed_refs = &trans->transaction->delayed_refs; 7228 spin_lock(&delayed_refs->lock); 7229 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 7230 if (!head) 7231 goto out_delayed_unlock; 7232 7233 spin_lock(&head->lock); 7234 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 7235 goto out; 7236 7237 if (cleanup_extent_op(head) != NULL) 7238 goto out; 7239 7240 /* 7241 * waiting for the lock here would deadlock. If someone else has it 7242 * locked they are already in the process of dropping it anyway 7243 */ 7244 if (!mutex_trylock(&head->mutex)) 7245 goto out; 7246 7247 btrfs_delete_ref_head(delayed_refs, head); 7248 head->processing = 0; 7249 7250 spin_unlock(&head->lock); 7251 spin_unlock(&delayed_refs->lock); 7252 7253 BUG_ON(head->extent_op); 7254 if (head->must_insert_reserved) 7255 ret = 1; 7256 7257 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); 7258 mutex_unlock(&head->mutex); 7259 btrfs_put_delayed_ref_head(head); 7260 return ret; 7261 out: 7262 spin_unlock(&head->lock); 7263 7264 out_delayed_unlock: 7265 spin_unlock(&delayed_refs->lock); 7266 return 0; 7267 } 7268 7269 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 7270 struct btrfs_root *root, 7271 struct extent_buffer *buf, 7272 u64 parent, int last_ref) 7273 { 7274 struct btrfs_fs_info *fs_info = root->fs_info; 7275 int pin = 1; 7276 int ret; 7277 7278 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7279 int old_ref_mod, new_ref_mod; 7280 7281 btrfs_ref_tree_mod(root, buf->start, buf->len, parent, 7282 root->root_key.objectid, 7283 btrfs_header_level(buf), 0, 7284 BTRFS_DROP_DELAYED_REF); 7285 ret = btrfs_add_delayed_tree_ref(trans, buf->start, 7286 buf->len, parent, 7287 root->root_key.objectid, 7288 btrfs_header_level(buf), 7289 BTRFS_DROP_DELAYED_REF, NULL, 7290 &old_ref_mod, &new_ref_mod); 7291 BUG_ON(ret); /* -ENOMEM */ 7292 pin = old_ref_mod >= 0 && new_ref_mod < 0; 7293 } 7294 7295 if (last_ref && btrfs_header_generation(buf) == trans->transid) { 7296 struct btrfs_block_group_cache *cache; 7297 7298 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7299 ret = check_ref_cleanup(trans, buf->start); 7300 if (!ret) 7301 goto out; 7302 } 7303 7304 pin = 0; 7305 cache = btrfs_lookup_block_group(fs_info, buf->start); 7306 7307 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7308 pin_down_extent(fs_info, cache, buf->start, 7309 buf->len, 1); 7310 btrfs_put_block_group(cache); 7311 goto out; 7312 } 7313 7314 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7315 7316 btrfs_add_free_space(cache, buf->start, buf->len); 7317 btrfs_free_reserved_bytes(cache, buf->len, 0); 7318 btrfs_put_block_group(cache); 7319 
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 7320 } 7321 out: 7322 if (pin) 7323 add_pinned_bytes(fs_info, buf->len, true, 7324 root->root_key.objectid); 7325 7326 if (last_ref) { 7327 /* 7328 * Deleting the buffer, clear the corrupt flag since it doesn't 7329 * matter anymore. 7330 */ 7331 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7332 } 7333 } 7334 7335 /* Can return -ENOMEM */ 7336 int btrfs_free_extent(struct btrfs_trans_handle *trans, 7337 struct btrfs_root *root, 7338 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7339 u64 owner, u64 offset) 7340 { 7341 struct btrfs_fs_info *fs_info = root->fs_info; 7342 int old_ref_mod, new_ref_mod; 7343 int ret; 7344 7345 if (btrfs_is_testing(fs_info)) 7346 return 0; 7347 7348 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) 7349 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, 7350 root_objectid, owner, offset, 7351 BTRFS_DROP_DELAYED_REF); 7352 7353 /* 7354 * tree log blocks never actually go into the extent allocation 7355 * tree, just update pinning info and exit early. 7356 */ 7357 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 7358 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7359 /* unlocks the pinned mutex */ 7360 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); 7361 old_ref_mod = new_ref_mod = 0; 7362 ret = 0; 7363 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7364 ret = btrfs_add_delayed_tree_ref(trans, bytenr, 7365 num_bytes, parent, 7366 root_objectid, (int)owner, 7367 BTRFS_DROP_DELAYED_REF, NULL, 7368 &old_ref_mod, &new_ref_mod); 7369 } else { 7370 ret = btrfs_add_delayed_data_ref(trans, bytenr, 7371 num_bytes, parent, 7372 root_objectid, owner, offset, 7373 0, BTRFS_DROP_DELAYED_REF, 7374 &old_ref_mod, &new_ref_mod); 7375 } 7376 7377 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) { 7378 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; 7379 7380 add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid); 7381 } 7382 7383 return ret; 7384 } 7385 7386 /* 7387 * when we wait for progress in the block group caching, its because 7388 * our allocation attempt failed at least once. So, we must sleep 7389 * and let some progress happen before we try again. 7390 * 7391 * This function will sleep at least once waiting for new free space to 7392 * show up, and then it will check the block group free space numbers 7393 * for our min num_bytes. Another option is to have it go ahead 7394 * and look in the rbtree for a free extent of a given size, but this 7395 * is a good start. 7396 * 7397 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7398 * any of the information in this block group. 7399 */ 7400 static noinline void 7401 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7402 u64 num_bytes) 7403 { 7404 struct btrfs_caching_control *caching_ctl; 7405 7406 caching_ctl = get_caching_control(cache); 7407 if (!caching_ctl) 7408 return; 7409 7410 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7411 (cache->free_space_ctl->free_space >= num_bytes)); 7412 7413 put_caching_control(caching_ctl); 7414 } 7415 7416 static noinline int 7417 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7418 { 7419 struct btrfs_caching_control *caching_ctl; 7420 int ret = 0; 7421 7422 caching_ctl = get_caching_control(cache); 7423 if (!caching_ctl) 7424 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 7425 7426 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7427 if (cache->cached == BTRFS_CACHE_ERROR) 7428 ret = -EIO; 7429 put_caching_control(caching_ctl); 7430 return ret; 7431 } 7432 7433 enum btrfs_loop_type { 7434 LOOP_CACHING_NOWAIT = 0, 7435 LOOP_CACHING_WAIT = 1, 7436 LOOP_ALLOC_CHUNK = 2, 7437 LOOP_NO_EMPTY_SIZE = 3, 7438 }; 7439 7440 static inline void 7441 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7442 int delalloc) 7443 { 7444 if (delalloc) 7445 down_read(&cache->data_rwsem); 7446 } 7447 7448 static inline void 7449 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7450 int delalloc) 7451 { 7452 btrfs_get_block_group(cache); 7453 if (delalloc) 7454 down_read(&cache->data_rwsem); 7455 } 7456 7457 static struct btrfs_block_group_cache * 7458 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7459 struct btrfs_free_cluster *cluster, 7460 int delalloc) 7461 { 7462 struct btrfs_block_group_cache *used_bg = NULL; 7463 7464 spin_lock(&cluster->refill_lock); 7465 while (1) { 7466 used_bg = cluster->block_group; 7467 if (!used_bg) 7468 return NULL; 7469 7470 if (used_bg == block_group) 7471 return used_bg; 7472 7473 btrfs_get_block_group(used_bg); 7474 7475 if (!delalloc) 7476 return used_bg; 7477 7478 if (down_read_trylock(&used_bg->data_rwsem)) 7479 return used_bg; 7480 7481 spin_unlock(&cluster->refill_lock); 7482 7483 /* We should only have one-level nested. */ 7484 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 7485 7486 spin_lock(&cluster->refill_lock); 7487 if (used_bg == cluster->block_group) 7488 return used_bg; 7489 7490 up_read(&used_bg->data_rwsem); 7491 btrfs_put_block_group(used_bg); 7492 } 7493 } 7494 7495 static inline void 7496 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7497 int delalloc) 7498 { 7499 if (delalloc) 7500 up_read(&cache->data_rwsem); 7501 btrfs_put_block_group(cache); 7502 } 7503 7504 /* 7505 * Structure used internally for find_free_extent() function. Wraps needed 7506 * parameters. 7507 */ 7508 struct find_free_extent_ctl { 7509 /* Basic allocation info */ 7510 u64 ram_bytes; 7511 u64 num_bytes; 7512 u64 empty_size; 7513 u64 flags; 7514 int delalloc; 7515 7516 /* Where to start the search inside the bg */ 7517 u64 search_start; 7518 7519 /* For clustered allocation */ 7520 u64 empty_cluster; 7521 7522 bool have_caching_bg; 7523 bool orig_have_caching_bg; 7524 7525 /* RAID index, converted from flags */ 7526 int index; 7527 7528 /* 7529 * Current loop number, check find_free_extent_update_loop() for details 7530 */ 7531 int loop; 7532 7533 /* 7534 * Whether we're refilling a cluster, if true we need to re-search 7535 * current block group but don't try to refill the cluster again. 7536 */ 7537 bool retry_clustered; 7538 7539 /* 7540 * Whether we're updating free space cache, if true we need to re-search 7541 * current block group but don't try updating free space cache again. 7542 */ 7543 bool retry_unclustered; 7544 7545 /* If current block group is cached */ 7546 int cached; 7547 7548 /* Max contiguous hole found */ 7549 u64 max_extent_size; 7550 7551 /* Total free space from free space cache, not always contiguous */ 7552 u64 total_free_space; 7553 7554 /* Found result */ 7555 u64 found_offset; 7556 }; 7557 7558 7559 /* 7560 * Helper function for find_free_extent(). 7561 * 7562 * Return -ENOENT to inform caller that we need fallback to unclustered mode. 
7563 * Return -EAGAIN to inform caller that we need to re-search this block group 7564 * Return >0 to inform caller that we found nothing 7565 * Return 0 means we have found a location and set ffe_ctl->found_offset. 7566 */ 7567 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, 7568 struct btrfs_free_cluster *last_ptr, 7569 struct find_free_extent_ctl *ffe_ctl, 7570 struct btrfs_block_group_cache **cluster_bg_ret) 7571 { 7572 struct btrfs_fs_info *fs_info = bg->fs_info; 7573 struct btrfs_block_group_cache *cluster_bg; 7574 u64 aligned_cluster; 7575 u64 offset; 7576 int ret; 7577 7578 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); 7579 if (!cluster_bg) 7580 goto refill_cluster; 7581 if (cluster_bg != bg && (cluster_bg->ro || 7582 !block_group_bits(cluster_bg, ffe_ctl->flags))) 7583 goto release_cluster; 7584 7585 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, 7586 ffe_ctl->num_bytes, cluster_bg->key.objectid, 7587 &ffe_ctl->max_extent_size); 7588 if (offset) { 7589 /* We have a block, we're done */ 7590 spin_unlock(&last_ptr->refill_lock); 7591 trace_btrfs_reserve_extent_cluster(cluster_bg, 7592 ffe_ctl->search_start, ffe_ctl->num_bytes); 7593 *cluster_bg_ret = cluster_bg; 7594 ffe_ctl->found_offset = offset; 7595 return 0; 7596 } 7597 WARN_ON(last_ptr->block_group != cluster_bg); 7598 7599 release_cluster: 7600 /* 7601 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so 7602 * let's just skip it and let the allocator find whatever block it can 7603 * find. If we reach this point, we will have tried the cluster 7604 * allocator plenty of times and not have found anything, so we are 7605 * likely way too fragmented for the clustering stuff to find anything. 7606 * 7607 * However, if the cluster is taken from the current block group, 7608 * release the cluster first, so that we stand a better chance of 7609 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
			ffe_ctl->search_start, ffe_ctl->num_bytes,
			aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster. Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}

/*
 * Return >0 to inform caller that we found nothing
 * Return 0 when we found a free extent and set ffe_ctl->found_offset
 * Return -EAGAIN to inform caller that we need to re-search this block group
 */
static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
					struct btrfs_free_cluster *last_ptr,
					struct find_free_extent_ctl *ffe_ctl)
{
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to set up a cluster again until we get more
	 * space.
7683 */ 7684 if (unlikely(last_ptr)) { 7685 spin_lock(&last_ptr->lock); 7686 last_ptr->fragmented = 1; 7687 spin_unlock(&last_ptr->lock); 7688 } 7689 if (ffe_ctl->cached) { 7690 struct btrfs_free_space_ctl *free_space_ctl; 7691 7692 free_space_ctl = bg->free_space_ctl; 7693 spin_lock(&free_space_ctl->tree_lock); 7694 if (free_space_ctl->free_space < 7695 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 7696 ffe_ctl->empty_size) { 7697 ffe_ctl->total_free_space = max_t(u64, 7698 ffe_ctl->total_free_space, 7699 free_space_ctl->free_space); 7700 spin_unlock(&free_space_ctl->tree_lock); 7701 return 1; 7702 } 7703 spin_unlock(&free_space_ctl->tree_lock); 7704 } 7705 7706 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 7707 ffe_ctl->num_bytes, ffe_ctl->empty_size, 7708 &ffe_ctl->max_extent_size); 7709 7710 /* 7711 * If we didn't find a chunk, and we haven't failed on this block group 7712 * before, and this block group is in the middle of caching and we are 7713 * ok with waiting, then go ahead and wait for progress to be made, and 7714 * set @retry_unclustered to true. 7715 * 7716 * If @retry_unclustered is true then we've already waited on this 7717 * block group once and should move on to the next block group. 7718 */ 7719 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && 7720 ffe_ctl->loop > LOOP_CACHING_NOWAIT) { 7721 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 7722 ffe_ctl->empty_size); 7723 ffe_ctl->retry_unclustered = true; 7724 return -EAGAIN; 7725 } else if (!offset) { 7726 return 1; 7727 } 7728 ffe_ctl->found_offset = offset; 7729 return 0; 7730 } 7731 7732 /* 7733 * Return >0 means caller needs to re-search for free extent 7734 * Return 0 means we have the needed free extent. 7735 * Return <0 means we failed to locate any free extent. 7736 */ 7737 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, 7738 struct btrfs_free_cluster *last_ptr, 7739 struct btrfs_key *ins, 7740 struct find_free_extent_ctl *ffe_ctl, 7741 int full_search, bool use_cluster) 7742 { 7743 struct btrfs_root *root = fs_info->extent_root; 7744 int ret; 7745 7746 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && 7747 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) 7748 ffe_ctl->orig_have_caching_bg = true; 7749 7750 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && 7751 ffe_ctl->have_caching_bg) 7752 return 1; 7753 7754 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) 7755 return 1; 7756 7757 if (ins->objectid) { 7758 if (!use_cluster && last_ptr) { 7759 spin_lock(&last_ptr->lock); 7760 last_ptr->window_start = ins->objectid; 7761 spin_unlock(&last_ptr->lock); 7762 } 7763 return 0; 7764 } 7765 7766 /* 7767 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7768 * caching kthreads as we move along 7769 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7770 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7771 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7772 * again 7773 */ 7774 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { 7775 ffe_ctl->index = 0; 7776 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { 7777 /* 7778 * We want to skip the LOOP_CACHING_WAIT step if we 7779 * don't have any uncached bgs and we've already done a 7780 * full search through. 
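 *
 * A purely illustrative example of the escalation (not a trace of a real
 * allocation): if we already did a full search and never saw an uncached
 * block group, the next stage is LOOP_ALLOC_CHUNK, since LOOP_CACHING_WAIT
 * would buy us nothing; otherwise we go to LOOP_CACHING_WAIT first, and only
 * after that to LOOP_ALLOC_CHUNK and, as a last resort, LOOP_NO_EMPTY_SIZE.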
7781 */ 7782 if (ffe_ctl->orig_have_caching_bg || !full_search) 7783 ffe_ctl->loop = LOOP_CACHING_WAIT; 7784 else 7785 ffe_ctl->loop = LOOP_ALLOC_CHUNK; 7786 } else { 7787 ffe_ctl->loop++; 7788 } 7789 7790 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 7791 struct btrfs_trans_handle *trans; 7792 int exist = 0; 7793 7794 trans = current->journal_info; 7795 if (trans) 7796 exist = 1; 7797 else 7798 trans = btrfs_join_transaction(root); 7799 7800 if (IS_ERR(trans)) { 7801 ret = PTR_ERR(trans); 7802 return ret; 7803 } 7804 7805 ret = do_chunk_alloc(trans, ffe_ctl->flags, 7806 CHUNK_ALLOC_FORCE); 7807 7808 /* 7809 * If we can't allocate a new chunk we've already looped 7810 * through at least once, move on to the NO_EMPTY_SIZE 7811 * case. 7812 */ 7813 if (ret == -ENOSPC) 7814 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; 7815 7816 /* Do not bail out on ENOSPC since we can do more. */ 7817 if (ret < 0 && ret != -ENOSPC) 7818 btrfs_abort_transaction(trans, ret); 7819 else 7820 ret = 0; 7821 if (!exist) 7822 btrfs_end_transaction(trans); 7823 if (ret) 7824 return ret; 7825 } 7826 7827 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 7828 /* 7829 * Don't loop again if we already have no empty_size and 7830 * no empty_cluster. 7831 */ 7832 if (ffe_ctl->empty_size == 0 && 7833 ffe_ctl->empty_cluster == 0) 7834 return -ENOSPC; 7835 ffe_ctl->empty_size = 0; 7836 ffe_ctl->empty_cluster = 0; 7837 } 7838 return 1; 7839 } 7840 return -ENOSPC; 7841 } 7842 7843 /* 7844 * walks the btree of allocated extents and find a hole of a given size. 7845 * The key ins is changed to record the hole: 7846 * ins->objectid == start position 7847 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7848 * ins->offset == the size of the hole. 7849 * Any available blocks before search_start are skipped. 7850 * 7851 * If there is no suitable free space, we will record the max size of 7852 * the free space extent currently. 
7853 * 7854 * The overall logic and call chain: 7855 * 7856 * find_free_extent() 7857 * |- Iterate through all block groups 7858 * | |- Get a valid block group 7859 * | |- Try to do clustered allocation in that block group 7860 * | |- Try to do unclustered allocation in that block group 7861 * | |- Check if the result is valid 7862 * | | |- If valid, then exit 7863 * | |- Jump to next block group 7864 * | 7865 * |- Push harder to find free extents 7866 * |- If not found, re-iterate all block groups 7867 */ 7868 static noinline int find_free_extent(struct btrfs_fs_info *fs_info, 7869 u64 ram_bytes, u64 num_bytes, u64 empty_size, 7870 u64 hint_byte, struct btrfs_key *ins, 7871 u64 flags, int delalloc) 7872 { 7873 int ret = 0; 7874 struct btrfs_free_cluster *last_ptr = NULL; 7875 struct btrfs_block_group_cache *block_group = NULL; 7876 struct find_free_extent_ctl ffe_ctl = {0}; 7877 struct btrfs_space_info *space_info; 7878 bool use_cluster = true; 7879 bool full_search = false; 7880 7881 WARN_ON(num_bytes < fs_info->sectorsize); 7882 7883 ffe_ctl.ram_bytes = ram_bytes; 7884 ffe_ctl.num_bytes = num_bytes; 7885 ffe_ctl.empty_size = empty_size; 7886 ffe_ctl.flags = flags; 7887 ffe_ctl.search_start = 0; 7888 ffe_ctl.retry_clustered = false; 7889 ffe_ctl.retry_unclustered = false; 7890 ffe_ctl.delalloc = delalloc; 7891 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); 7892 ffe_ctl.have_caching_bg = false; 7893 ffe_ctl.orig_have_caching_bg = false; 7894 ffe_ctl.found_offset = 0; 7895 7896 ins->type = BTRFS_EXTENT_ITEM_KEY; 7897 ins->objectid = 0; 7898 ins->offset = 0; 7899 7900 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 7901 7902 space_info = __find_space_info(fs_info, flags); 7903 if (!space_info) { 7904 btrfs_err(fs_info, "No space info for %llu", flags); 7905 return -ENOSPC; 7906 } 7907 7908 /* 7909 * If our free space is heavily fragmented we may not be able to make 7910 * big contiguous allocations, so instead of doing the expensive search 7911 * for free space, simply return ENOSPC with our max_extent_size so we 7912 * can go ahead and search for a more manageable chunk. 7913 * 7914 * If our max_extent_size is large enough for our allocation simply 7915 * disable clustering since we will likely not be able to find enough 7916 * space to create a cluster and induce latency trying. 7917 */ 7918 if (unlikely(space_info->max_extent_size)) { 7919 spin_lock(&space_info->lock); 7920 if (space_info->max_extent_size && 7921 num_bytes > space_info->max_extent_size) { 7922 ins->offset = space_info->max_extent_size; 7923 spin_unlock(&space_info->lock); 7924 return -ENOSPC; 7925 } else if (space_info->max_extent_size) { 7926 use_cluster = false; 7927 } 7928 spin_unlock(&space_info->lock); 7929 } 7930 7931 last_ptr = fetch_cluster_info(fs_info, space_info, 7932 &ffe_ctl.empty_cluster); 7933 if (last_ptr) { 7934 spin_lock(&last_ptr->lock); 7935 if (last_ptr->block_group) 7936 hint_byte = last_ptr->window_start; 7937 if (last_ptr->fragmented) { 7938 /* 7939 * We still set window_start so we can keep track of the 7940 * last place we found an allocation to try and save 7941 * some time. 
7942 */ 7943 hint_byte = last_ptr->window_start; 7944 use_cluster = false; 7945 } 7946 spin_unlock(&last_ptr->lock); 7947 } 7948 7949 ffe_ctl.search_start = max(ffe_ctl.search_start, 7950 first_logical_byte(fs_info, 0)); 7951 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); 7952 if (ffe_ctl.search_start == hint_byte) { 7953 block_group = btrfs_lookup_block_group(fs_info, 7954 ffe_ctl.search_start); 7955 /* 7956 * we don't want to use the block group if it doesn't match our 7957 * allocation bits, or if its not cached. 7958 * 7959 * However if we are re-searching with an ideal block group 7960 * picked out then we don't care that the block group is cached. 7961 */ 7962 if (block_group && block_group_bits(block_group, flags) && 7963 block_group->cached != BTRFS_CACHE_NO) { 7964 down_read(&space_info->groups_sem); 7965 if (list_empty(&block_group->list) || 7966 block_group->ro) { 7967 /* 7968 * someone is removing this block group, 7969 * we can't jump into the have_block_group 7970 * target because our list pointers are not 7971 * valid 7972 */ 7973 btrfs_put_block_group(block_group); 7974 up_read(&space_info->groups_sem); 7975 } else { 7976 ffe_ctl.index = btrfs_bg_flags_to_raid_index( 7977 block_group->flags); 7978 btrfs_lock_block_group(block_group, delalloc); 7979 goto have_block_group; 7980 } 7981 } else if (block_group) { 7982 btrfs_put_block_group(block_group); 7983 } 7984 } 7985 search: 7986 ffe_ctl.have_caching_bg = false; 7987 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || 7988 ffe_ctl.index == 0) 7989 full_search = true; 7990 down_read(&space_info->groups_sem); 7991 list_for_each_entry(block_group, 7992 &space_info->block_groups[ffe_ctl.index], list) { 7993 /* If the block group is read-only, we can skip it entirely. */ 7994 if (unlikely(block_group->ro)) 7995 continue; 7996 7997 btrfs_grab_block_group(block_group, delalloc); 7998 ffe_ctl.search_start = block_group->key.objectid; 7999 8000 /* 8001 * this can happen if we end up cycling through all the 8002 * raid types, but we want to make sure we only allocate 8003 * for the proper type. 8004 */ 8005 if (!block_group_bits(block_group, flags)) { 8006 u64 extra = BTRFS_BLOCK_GROUP_DUP | 8007 BTRFS_BLOCK_GROUP_RAID1 | 8008 BTRFS_BLOCK_GROUP_RAID5 | 8009 BTRFS_BLOCK_GROUP_RAID6 | 8010 BTRFS_BLOCK_GROUP_RAID10; 8011 8012 /* 8013 * if they asked for extra copies and this block group 8014 * doesn't provide them, bail. This does allow us to 8015 * fill raid0 from raid1. 
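 *
 * For example (profiles are illustrative): an allocation asking for RAID1
 * data will skip a RAID0 or single block group, since none of the "extra
 * copies" bits are set there, while an allocation asking for RAID0/single
 * may still be placed in a RAID1 block group.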
8016 */ 8017 if ((flags & extra) && !(block_group->flags & extra)) 8018 goto loop; 8019 } 8020 8021 have_block_group: 8022 ffe_ctl.cached = block_group_cache_done(block_group); 8023 if (unlikely(!ffe_ctl.cached)) { 8024 ffe_ctl.have_caching_bg = true; 8025 ret = cache_block_group(block_group, 0); 8026 BUG_ON(ret < 0); 8027 ret = 0; 8028 } 8029 8030 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 8031 goto loop; 8032 8033 /* 8034 * Ok we want to try and use the cluster allocator, so 8035 * lets look there 8036 */ 8037 if (last_ptr && use_cluster) { 8038 struct btrfs_block_group_cache *cluster_bg = NULL; 8039 8040 ret = find_free_extent_clustered(block_group, last_ptr, 8041 &ffe_ctl, &cluster_bg); 8042 8043 if (ret == 0) { 8044 if (cluster_bg && cluster_bg != block_group) { 8045 btrfs_release_block_group(block_group, 8046 delalloc); 8047 block_group = cluster_bg; 8048 } 8049 goto checks; 8050 } else if (ret == -EAGAIN) { 8051 goto have_block_group; 8052 } else if (ret > 0) { 8053 goto loop; 8054 } 8055 /* ret == -ENOENT case falls through */ 8056 } 8057 8058 ret = find_free_extent_unclustered(block_group, last_ptr, 8059 &ffe_ctl); 8060 if (ret == -EAGAIN) 8061 goto have_block_group; 8062 else if (ret > 0) 8063 goto loop; 8064 /* ret == 0 case falls through */ 8065 checks: 8066 ffe_ctl.search_start = round_up(ffe_ctl.found_offset, 8067 fs_info->stripesize); 8068 8069 /* move on to the next group */ 8070 if (ffe_ctl.search_start + num_bytes > 8071 block_group->key.objectid + block_group->key.offset) { 8072 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 8073 num_bytes); 8074 goto loop; 8075 } 8076 8077 if (ffe_ctl.found_offset < ffe_ctl.search_start) 8078 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 8079 ffe_ctl.search_start - ffe_ctl.found_offset); 8080 8081 ret = btrfs_add_reserved_bytes(block_group, ram_bytes, 8082 num_bytes, delalloc); 8083 if (ret == -EAGAIN) { 8084 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 8085 num_bytes); 8086 goto loop; 8087 } 8088 btrfs_inc_block_group_reservations(block_group); 8089 8090 /* we are all good, lets return */ 8091 ins->objectid = ffe_ctl.search_start; 8092 ins->offset = num_bytes; 8093 8094 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, 8095 num_bytes); 8096 btrfs_release_block_group(block_group, delalloc); 8097 break; 8098 loop: 8099 ffe_ctl.retry_clustered = false; 8100 ffe_ctl.retry_unclustered = false; 8101 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 8102 ffe_ctl.index); 8103 btrfs_release_block_group(block_group, delalloc); 8104 cond_resched(); 8105 } 8106 up_read(&space_info->groups_sem); 8107 8108 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, 8109 full_search, use_cluster); 8110 if (ret > 0) 8111 goto search; 8112 8113 if (ret == -ENOSPC) { 8114 /* 8115 * Use ffe_ctl->total_free_space as fallback if we can't find 8116 * any contiguous hole. 
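 *
 * Illustrative numbers only: if no contiguous hole was ever recorded
 * (max_extent_size == 0) but the free space caches reported, say, 1MiB of
 * scattered free space in total, ins->offset is set to that 1MiB so that
 * callers such as btrfs_reserve_extent() can shrink the request and retry
 * instead of giving up outright.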
8117 */ 8118 if (!ffe_ctl.max_extent_size) 8119 ffe_ctl.max_extent_size = ffe_ctl.total_free_space; 8120 spin_lock(&space_info->lock); 8121 space_info->max_extent_size = ffe_ctl.max_extent_size; 8122 spin_unlock(&space_info->lock); 8123 ins->offset = ffe_ctl.max_extent_size; 8124 } 8125 return ret; 8126 } 8127 8128 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ 8129 do { \ 8130 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ 8131 spin_lock(&__rsv->lock); \ 8132 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ 8133 __rsv->size, __rsv->reserved); \ 8134 spin_unlock(&__rsv->lock); \ 8135 } while (0) 8136 8137 static void dump_space_info(struct btrfs_fs_info *fs_info, 8138 struct btrfs_space_info *info, u64 bytes, 8139 int dump_block_groups) 8140 { 8141 struct btrfs_block_group_cache *cache; 8142 int index = 0; 8143 8144 spin_lock(&info->lock); 8145 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 8146 info->flags, 8147 info->total_bytes - btrfs_space_info_used(info, true), 8148 info->full ? "" : "not "); 8149 btrfs_info(fs_info, 8150 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 8151 info->total_bytes, info->bytes_used, info->bytes_pinned, 8152 info->bytes_reserved, info->bytes_may_use, 8153 info->bytes_readonly); 8154 spin_unlock(&info->lock); 8155 8156 DUMP_BLOCK_RSV(fs_info, global_block_rsv); 8157 DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 8158 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 8159 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 8160 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 8161 8162 if (!dump_block_groups) 8163 return; 8164 8165 down_read(&info->groups_sem); 8166 again: 8167 list_for_each_entry(cache, &info->block_groups[index], list) { 8168 spin_lock(&cache->lock); 8169 btrfs_info(fs_info, 8170 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 8171 cache->key.objectid, cache->key.offset, 8172 btrfs_block_group_used(&cache->item), cache->pinned, 8173 cache->reserved, cache->ro ? "[readonly]" : ""); 8174 btrfs_dump_free_space(cache, bytes); 8175 spin_unlock(&cache->lock); 8176 } 8177 if (++index < BTRFS_NR_RAID_TYPES) 8178 goto again; 8179 up_read(&info->groups_sem); 8180 } 8181 8182 /* 8183 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 8184 * hole that is at least as big as @num_bytes. 8185 * 8186 * @root - The root that will contain this extent 8187 * 8188 * @ram_bytes - The amount of space in ram that @num_bytes take. This 8189 * is used for accounting purposes. This value differs 8190 * from @num_bytes only in the case of compressed extents. 8191 * 8192 * @num_bytes - Number of bytes to allocate on-disk. 8193 * 8194 * @min_alloc_size - Indicates the minimum amount of space that the 8195 * allocator should try to satisfy. In some cases 8196 * @num_bytes may be larger than what is required and if 8197 * the filesystem is fragmented then allocation fails. 8198 * However, the presence of @min_alloc_size gives a 8199 * chance to try and satisfy the smaller allocation. 8200 * 8201 * @empty_size - A hint that you plan on doing more COW. This is the 8202 * size in bytes the allocator should try to find free 8203 * next to the block it returns. This is just a hint and 8204 * may be ignored by the allocator. 8205 * 8206 * @hint_byte - Hint to the allocator to start searching above the byte 8207 * address passed. It might be ignored. 8208 * 8209 * @ins - This key is modified to record the found hole. 
It will 8210 * have the following values: 8211 * ins->objectid == start position 8212 * ins->flags = BTRFS_EXTENT_ITEM_KEY 8213 * ins->offset == the size of the hole. 8214 * 8215 * @is_data - Boolean flag indicating whether an extent is 8216 * allocated for data (true) or metadata (false) 8217 * 8218 * @delalloc - Boolean flag indicating whether this allocation is for 8219 * delalloc or not. If 'true' data_rwsem of block groups 8220 * is going to be acquired. 8221 * 8222 * 8223 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In 8224 * case -ENOSPC is returned then @ins->offset will contain the size of the 8225 * largest available hole the allocator managed to find. 8226 */ 8227 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 8228 u64 num_bytes, u64 min_alloc_size, 8229 u64 empty_size, u64 hint_byte, 8230 struct btrfs_key *ins, int is_data, int delalloc) 8231 { 8232 struct btrfs_fs_info *fs_info = root->fs_info; 8233 bool final_tried = num_bytes == min_alloc_size; 8234 u64 flags; 8235 int ret; 8236 8237 flags = get_alloc_profile_by_root(root, is_data); 8238 again: 8239 WARN_ON(num_bytes < fs_info->sectorsize); 8240 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 8241 hint_byte, ins, flags, delalloc); 8242 if (!ret && !is_data) { 8243 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 8244 } else if (ret == -ENOSPC) { 8245 if (!final_tried && ins->offset) { 8246 num_bytes = min(num_bytes >> 1, ins->offset); 8247 num_bytes = round_down(num_bytes, 8248 fs_info->sectorsize); 8249 num_bytes = max(num_bytes, min_alloc_size); 8250 ram_bytes = num_bytes; 8251 if (num_bytes == min_alloc_size) 8252 final_tried = true; 8253 goto again; 8254 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8255 struct btrfs_space_info *sinfo; 8256 8257 sinfo = __find_space_info(fs_info, flags); 8258 btrfs_err(fs_info, 8259 "allocation failed flags %llu, wanted %llu", 8260 flags, num_bytes); 8261 if (sinfo) 8262 dump_space_info(fs_info, sinfo, num_bytes, 1); 8263 } 8264 } 8265 8266 return ret; 8267 } 8268 8269 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8270 u64 start, u64 len, 8271 int pin, int delalloc) 8272 { 8273 struct btrfs_block_group_cache *cache; 8274 int ret = 0; 8275 8276 cache = btrfs_lookup_block_group(fs_info, start); 8277 if (!cache) { 8278 btrfs_err(fs_info, "Unable to find block group for %llu", 8279 start); 8280 return -ENOSPC; 8281 } 8282 8283 if (pin) 8284 pin_down_extent(fs_info, cache, start, len, 1); 8285 else { 8286 if (btrfs_test_opt(fs_info, DISCARD)) 8287 ret = btrfs_discard_extent(fs_info, start, len, NULL); 8288 btrfs_add_free_space(cache, start, len); 8289 btrfs_free_reserved_bytes(cache, len, delalloc); 8290 trace_btrfs_reserved_extent_free(fs_info, start, len); 8291 } 8292 8293 btrfs_put_block_group(cache); 8294 return ret; 8295 } 8296 8297 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8298 u64 start, u64 len, int delalloc) 8299 { 8300 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 8301 } 8302 8303 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 8304 u64 start, u64 len) 8305 { 8306 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 8307 } 8308 8309 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8310 u64 parent, u64 root_objectid, 8311 u64 flags, u64 owner, u64 offset, 8312 struct btrfs_key *ins, int ref_mod) 8313 { 8314 struct btrfs_fs_info *fs_info = trans->fs_info; 8315 int ret; 8316 struct 
btrfs_extent_item *extent_item; 8317 struct btrfs_extent_inline_ref *iref; 8318 struct btrfs_path *path; 8319 struct extent_buffer *leaf; 8320 int type; 8321 u32 size; 8322 8323 if (parent > 0) 8324 type = BTRFS_SHARED_DATA_REF_KEY; 8325 else 8326 type = BTRFS_EXTENT_DATA_REF_KEY; 8327 8328 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8329 8330 path = btrfs_alloc_path(); 8331 if (!path) 8332 return -ENOMEM; 8333 8334 path->leave_spinning = 1; 8335 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8336 ins, size); 8337 if (ret) { 8338 btrfs_free_path(path); 8339 return ret; 8340 } 8341 8342 leaf = path->nodes[0]; 8343 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8344 struct btrfs_extent_item); 8345 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 8346 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8347 btrfs_set_extent_flags(leaf, extent_item, 8348 flags | BTRFS_EXTENT_FLAG_DATA); 8349 8350 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8351 btrfs_set_extent_inline_ref_type(leaf, iref, type); 8352 if (parent > 0) { 8353 struct btrfs_shared_data_ref *ref; 8354 ref = (struct btrfs_shared_data_ref *)(iref + 1); 8355 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8356 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 8357 } else { 8358 struct btrfs_extent_data_ref *ref; 8359 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 8360 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 8361 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 8362 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 8363 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 8364 } 8365 8366 btrfs_mark_buffer_dirty(path->nodes[0]); 8367 btrfs_free_path(path); 8368 8369 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); 8370 if (ret) 8371 return ret; 8372 8373 ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); 8374 if (ret) { /* -ENOENT, logic error */ 8375 btrfs_err(fs_info, "update block group failed for %llu %llu", 8376 ins->objectid, ins->offset); 8377 BUG(); 8378 } 8379 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 8380 return ret; 8381 } 8382 8383 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 8384 struct btrfs_delayed_ref_node *node, 8385 struct btrfs_delayed_extent_op *extent_op) 8386 { 8387 struct btrfs_fs_info *fs_info = trans->fs_info; 8388 int ret; 8389 struct btrfs_extent_item *extent_item; 8390 struct btrfs_key extent_key; 8391 struct btrfs_tree_block_info *block_info; 8392 struct btrfs_extent_inline_ref *iref; 8393 struct btrfs_path *path; 8394 struct extent_buffer *leaf; 8395 struct btrfs_delayed_tree_ref *ref; 8396 u32 size = sizeof(*extent_item) + sizeof(*iref); 8397 u64 num_bytes; 8398 u64 flags = extent_op->flags_to_set; 8399 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8400 8401 ref = btrfs_delayed_node_to_tree_ref(node); 8402 8403 extent_key.objectid = node->bytenr; 8404 if (skinny_metadata) { 8405 extent_key.offset = ref->level; 8406 extent_key.type = BTRFS_METADATA_ITEM_KEY; 8407 num_bytes = fs_info->nodesize; 8408 } else { 8409 extent_key.offset = node->num_bytes; 8410 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 8411 size += sizeof(*block_info); 8412 num_bytes = node->num_bytes; 8413 } 8414 8415 path = btrfs_alloc_path(); 8416 if (!path) 8417 return -ENOMEM; 8418 8419 path->leave_spinning = 1; 8420 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8421 &extent_key, size); 
8422 if (ret) { 8423 btrfs_free_path(path); 8424 return ret; 8425 } 8426 8427 leaf = path->nodes[0]; 8428 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8429 struct btrfs_extent_item); 8430 btrfs_set_extent_refs(leaf, extent_item, 1); 8431 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8432 btrfs_set_extent_flags(leaf, extent_item, 8433 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 8434 8435 if (skinny_metadata) { 8436 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8437 } else { 8438 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 8439 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 8440 btrfs_set_tree_block_level(leaf, block_info, ref->level); 8441 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 8442 } 8443 8444 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 8445 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 8446 btrfs_set_extent_inline_ref_type(leaf, iref, 8447 BTRFS_SHARED_BLOCK_REF_KEY); 8448 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); 8449 } else { 8450 btrfs_set_extent_inline_ref_type(leaf, iref, 8451 BTRFS_TREE_BLOCK_REF_KEY); 8452 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); 8453 } 8454 8455 btrfs_mark_buffer_dirty(leaf); 8456 btrfs_free_path(path); 8457 8458 ret = remove_from_free_space_tree(trans, extent_key.objectid, 8459 num_bytes); 8460 if (ret) 8461 return ret; 8462 8463 ret = update_block_group(trans, fs_info, extent_key.objectid, 8464 fs_info->nodesize, 1); 8465 if (ret) { /* -ENOENT, logic error */ 8466 btrfs_err(fs_info, "update block group failed for %llu %llu", 8467 extent_key.objectid, extent_key.offset); 8468 BUG(); 8469 } 8470 8471 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, 8472 fs_info->nodesize); 8473 return ret; 8474 } 8475 8476 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8477 struct btrfs_root *root, u64 owner, 8478 u64 offset, u64 ram_bytes, 8479 struct btrfs_key *ins) 8480 { 8481 int ret; 8482 8483 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 8484 8485 btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, 8486 root->root_key.objectid, owner, offset, 8487 BTRFS_ADD_DELAYED_EXTENT); 8488 8489 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, 8490 ins->offset, 0, 8491 root->root_key.objectid, owner, 8492 offset, ram_bytes, 8493 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); 8494 return ret; 8495 } 8496 8497 /* 8498 * this is used by the tree logging recovery code. It records that 8499 * an extent has been allocated and makes sure to clear the free 8500 * space cache bits as well 8501 */ 8502 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 8503 u64 root_objectid, u64 owner, u64 offset, 8504 struct btrfs_key *ins) 8505 { 8506 struct btrfs_fs_info *fs_info = trans->fs_info; 8507 int ret; 8508 struct btrfs_block_group_cache *block_group; 8509 struct btrfs_space_info *space_info; 8510 8511 /* 8512 * Mixed block groups will exclude before processing the log so we only 8513 * need to do the exclude dance if this fs isn't mixed. 
8514 */ 8515 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 8516 ret = __exclude_logged_extent(fs_info, ins->objectid, 8517 ins->offset); 8518 if (ret) 8519 return ret; 8520 } 8521 8522 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 8523 if (!block_group) 8524 return -EINVAL; 8525 8526 space_info = block_group->space_info; 8527 spin_lock(&space_info->lock); 8528 spin_lock(&block_group->lock); 8529 space_info->bytes_reserved += ins->offset; 8530 block_group->reserved += ins->offset; 8531 spin_unlock(&block_group->lock); 8532 spin_unlock(&space_info->lock); 8533 8534 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 8535 offset, ins, 1); 8536 btrfs_put_block_group(block_group); 8537 return ret; 8538 } 8539 8540 static struct extent_buffer * 8541 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8542 u64 bytenr, int level, u64 owner) 8543 { 8544 struct btrfs_fs_info *fs_info = root->fs_info; 8545 struct extent_buffer *buf; 8546 8547 buf = btrfs_find_create_tree_block(fs_info, bytenr); 8548 if (IS_ERR(buf)) 8549 return buf; 8550 8551 /* 8552 * Extra safety check in case the extent tree is corrupted and extent 8553 * allocator chooses to use a tree block which is already used and 8554 * locked. 8555 */ 8556 if (buf->lock_owner == current->pid) { 8557 btrfs_err_rl(fs_info, 8558 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 8559 buf->start, btrfs_header_owner(buf), current->pid); 8560 free_extent_buffer(buf); 8561 return ERR_PTR(-EUCLEAN); 8562 } 8563 8564 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8565 btrfs_tree_lock(buf); 8566 clean_tree_block(fs_info, buf); 8567 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8568 8569 btrfs_set_lock_blocking_write(buf); 8570 set_extent_buffer_uptodate(buf); 8571 8572 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 8573 btrfs_set_header_level(buf, level); 8574 btrfs_set_header_bytenr(buf, buf->start); 8575 btrfs_set_header_generation(buf, trans->transid); 8576 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 8577 btrfs_set_header_owner(buf, owner); 8578 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 8579 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 8580 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8581 buf->log_index = root->log_transid % 2; 8582 /* 8583 * we allow two log transactions at a time, use different 8584 * EXTENT bit to differentiate dirty pages. 
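 *
 * Concretely (values just for illustration): log_transid 4 gives
 * log_index 0 and the buffer range is tracked with EXTENT_DIRTY, while
 * log_transid 5 gives log_index 1 and EXTENT_NEW is used instead, so the
 * two in-flight log transactions never see each other's dirty pages.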
8585 */ 8586 if (buf->log_index == 0) 8587 set_extent_dirty(&root->dirty_log_pages, buf->start, 8588 buf->start + buf->len - 1, GFP_NOFS); 8589 else 8590 set_extent_new(&root->dirty_log_pages, buf->start, 8591 buf->start + buf->len - 1); 8592 } else { 8593 buf->log_index = -1; 8594 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8595 buf->start + buf->len - 1, GFP_NOFS); 8596 } 8597 trans->dirty = true; 8598 /* this returns a buffer locked for blocking */ 8599 return buf; 8600 } 8601 8602 static struct btrfs_block_rsv * 8603 use_block_rsv(struct btrfs_trans_handle *trans, 8604 struct btrfs_root *root, u32 blocksize) 8605 { 8606 struct btrfs_fs_info *fs_info = root->fs_info; 8607 struct btrfs_block_rsv *block_rsv; 8608 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 8609 int ret; 8610 bool global_updated = false; 8611 8612 block_rsv = get_block_rsv(trans, root); 8613 8614 if (unlikely(block_rsv->size == 0)) 8615 goto try_reserve; 8616 again: 8617 ret = block_rsv_use_bytes(block_rsv, blocksize); 8618 if (!ret) 8619 return block_rsv; 8620 8621 if (block_rsv->failfast) 8622 return ERR_PTR(ret); 8623 8624 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8625 global_updated = true; 8626 update_global_block_rsv(fs_info); 8627 goto again; 8628 } 8629 8630 /* 8631 * The global reserve still exists to save us from ourselves, so don't 8632 * warn_on if we are short on our delayed refs reserve. 8633 */ 8634 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && 8635 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8636 static DEFINE_RATELIMIT_STATE(_rs, 8637 DEFAULT_RATELIMIT_INTERVAL * 10, 8638 /*DEFAULT_RATELIMIT_BURST*/ 1); 8639 if (__ratelimit(&_rs)) 8640 WARN(1, KERN_DEBUG 8641 "BTRFS: block rsv returned %d\n", ret); 8642 } 8643 try_reserve: 8644 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8645 BTRFS_RESERVE_NO_FLUSH); 8646 if (!ret) 8647 return block_rsv; 8648 /* 8649 * If we couldn't reserve metadata bytes try and use some from 8650 * the global reserve if its space type is the same as the global 8651 * reservation. 8652 */ 8653 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8654 block_rsv->space_info == global_rsv->space_info) { 8655 ret = block_rsv_use_bytes(global_rsv, blocksize); 8656 if (!ret) 8657 return global_rsv; 8658 } 8659 return ERR_PTR(ret); 8660 } 8661 8662 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8663 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8664 { 8665 block_rsv_add_bytes(block_rsv, blocksize, false); 8666 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); 8667 } 8668 8669 /* 8670 * finds a free extent and does all the dirty work required for allocation 8671 * returns the tree buffer or an ERR_PTR on error. 
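 *
 * A minimal caller sketch (hypothetical call site; trans, root and level are
 * assumed to be set up elsewhere, nothing here is copied from a real caller):
 *
 *	struct extent_buffer *eb;
 *
 *	eb = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
 *				    NULL, level, 0, 0);
 *	if (IS_ERR(eb))
 *		return PTR_ERR(eb);
 *
 * The buffer comes back locked for blocking use and marked dirty for the
 * transaction; the caller is expected to fill it in and unlock it.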
8672 */ 8673 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8674 struct btrfs_root *root, 8675 u64 parent, u64 root_objectid, 8676 const struct btrfs_disk_key *key, 8677 int level, u64 hint, 8678 u64 empty_size) 8679 { 8680 struct btrfs_fs_info *fs_info = root->fs_info; 8681 struct btrfs_key ins; 8682 struct btrfs_block_rsv *block_rsv; 8683 struct extent_buffer *buf; 8684 struct btrfs_delayed_extent_op *extent_op; 8685 u64 flags = 0; 8686 int ret; 8687 u32 blocksize = fs_info->nodesize; 8688 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8689 8690 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8691 if (btrfs_is_testing(fs_info)) { 8692 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8693 level, root_objectid); 8694 if (!IS_ERR(buf)) 8695 root->alloc_bytenr += blocksize; 8696 return buf; 8697 } 8698 #endif 8699 8700 block_rsv = use_block_rsv(trans, root, blocksize); 8701 if (IS_ERR(block_rsv)) 8702 return ERR_CAST(block_rsv); 8703 8704 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 8705 empty_size, hint, &ins, 0, 0); 8706 if (ret) 8707 goto out_unuse; 8708 8709 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 8710 root_objectid); 8711 if (IS_ERR(buf)) { 8712 ret = PTR_ERR(buf); 8713 goto out_free_reserved; 8714 } 8715 8716 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8717 if (parent == 0) 8718 parent = ins.objectid; 8719 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8720 } else 8721 BUG_ON(parent > 0); 8722 8723 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8724 extent_op = btrfs_alloc_delayed_extent_op(); 8725 if (!extent_op) { 8726 ret = -ENOMEM; 8727 goto out_free_buf; 8728 } 8729 if (key) 8730 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8731 else 8732 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8733 extent_op->flags_to_set = flags; 8734 extent_op->update_key = skinny_metadata ? 
false : true; 8735 extent_op->update_flags = true; 8736 extent_op->is_data = false; 8737 extent_op->level = level; 8738 8739 btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, 8740 root_objectid, level, 0, 8741 BTRFS_ADD_DELAYED_EXTENT); 8742 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, 8743 ins.offset, parent, 8744 root_objectid, level, 8745 BTRFS_ADD_DELAYED_EXTENT, 8746 extent_op, NULL, NULL); 8747 if (ret) 8748 goto out_free_delayed; 8749 } 8750 return buf; 8751 8752 out_free_delayed: 8753 btrfs_free_delayed_extent_op(extent_op); 8754 out_free_buf: 8755 free_extent_buffer(buf); 8756 out_free_reserved: 8757 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 8758 out_unuse: 8759 unuse_block_rsv(fs_info, block_rsv, blocksize); 8760 return ERR_PTR(ret); 8761 } 8762 8763 struct walk_control { 8764 u64 refs[BTRFS_MAX_LEVEL]; 8765 u64 flags[BTRFS_MAX_LEVEL]; 8766 struct btrfs_key update_progress; 8767 struct btrfs_key drop_progress; 8768 int drop_level; 8769 int stage; 8770 int level; 8771 int shared_level; 8772 int update_ref; 8773 int keep_locks; 8774 int reada_slot; 8775 int reada_count; 8776 int restarted; 8777 }; 8778 8779 #define DROP_REFERENCE 1 8780 #define UPDATE_BACKREF 2 8781 8782 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8783 struct btrfs_root *root, 8784 struct walk_control *wc, 8785 struct btrfs_path *path) 8786 { 8787 struct btrfs_fs_info *fs_info = root->fs_info; 8788 u64 bytenr; 8789 u64 generation; 8790 u64 refs; 8791 u64 flags; 8792 u32 nritems; 8793 struct btrfs_key key; 8794 struct extent_buffer *eb; 8795 int ret; 8796 int slot; 8797 int nread = 0; 8798 8799 if (path->slots[wc->level] < wc->reada_slot) { 8800 wc->reada_count = wc->reada_count * 2 / 3; 8801 wc->reada_count = max(wc->reada_count, 2); 8802 } else { 8803 wc->reada_count = wc->reada_count * 3 / 2; 8804 wc->reada_count = min_t(int, wc->reada_count, 8805 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 8806 } 8807 8808 eb = path->nodes[wc->level]; 8809 nritems = btrfs_header_nritems(eb); 8810 8811 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8812 if (nread >= wc->reada_count) 8813 break; 8814 8815 cond_resched(); 8816 bytenr = btrfs_node_blockptr(eb, slot); 8817 generation = btrfs_node_ptr_generation(eb, slot); 8818 8819 if (slot == path->slots[wc->level]) 8820 goto reada; 8821 8822 if (wc->stage == UPDATE_BACKREF && 8823 generation <= root->root_key.offset) 8824 continue; 8825 8826 /* We don't lock the tree block, it's OK to be racy here */ 8827 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 8828 wc->level - 1, 1, &refs, 8829 &flags); 8830 /* We don't care about errors in readahead. */ 8831 if (ret < 0) 8832 continue; 8833 BUG_ON(refs == 0); 8834 8835 if (wc->stage == DROP_REFERENCE) { 8836 if (refs == 1) 8837 goto reada; 8838 8839 if (wc->level == 1 && 8840 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8841 continue; 8842 if (!wc->update_ref || 8843 generation <= root->root_key.offset) 8844 continue; 8845 btrfs_node_key_to_cpu(eb, &key, slot); 8846 ret = btrfs_comp_cpu_keys(&key, 8847 &wc->update_progress); 8848 if (ret < 0) 8849 continue; 8850 } else { 8851 if (wc->level == 1 && 8852 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8853 continue; 8854 } 8855 reada: 8856 readahead_tree_block(fs_info, bytenr); 8857 nread++; 8858 } 8859 wc->reada_slot = slot; 8860 } 8861 8862 /* 8863 * helper to process tree block while walking down the tree. 
8864 * 8865 * when wc->stage == UPDATE_BACKREF, this function updates 8866 * back refs for pointers in the block. 8867 * 8868 * NOTE: return value 1 means we should stop walking down. 8869 */ 8870 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8871 struct btrfs_root *root, 8872 struct btrfs_path *path, 8873 struct walk_control *wc, int lookup_info) 8874 { 8875 struct btrfs_fs_info *fs_info = root->fs_info; 8876 int level = wc->level; 8877 struct extent_buffer *eb = path->nodes[level]; 8878 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8879 int ret; 8880 8881 if (wc->stage == UPDATE_BACKREF && 8882 btrfs_header_owner(eb) != root->root_key.objectid) 8883 return 1; 8884 8885 /* 8886 * when reference count of tree block is 1, it won't increase 8887 * again. once full backref flag is set, we never clear it. 8888 */ 8889 if (lookup_info && 8890 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8891 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8892 BUG_ON(!path->locks[level]); 8893 ret = btrfs_lookup_extent_info(trans, fs_info, 8894 eb->start, level, 1, 8895 &wc->refs[level], 8896 &wc->flags[level]); 8897 BUG_ON(ret == -ENOMEM); 8898 if (ret) 8899 return ret; 8900 BUG_ON(wc->refs[level] == 0); 8901 } 8902 8903 if (wc->stage == DROP_REFERENCE) { 8904 if (wc->refs[level] > 1) 8905 return 1; 8906 8907 if (path->locks[level] && !wc->keep_locks) { 8908 btrfs_tree_unlock_rw(eb, path->locks[level]); 8909 path->locks[level] = 0; 8910 } 8911 return 0; 8912 } 8913 8914 /* wc->stage == UPDATE_BACKREF */ 8915 if (!(wc->flags[level] & flag)) { 8916 BUG_ON(!path->locks[level]); 8917 ret = btrfs_inc_ref(trans, root, eb, 1); 8918 BUG_ON(ret); /* -ENOMEM */ 8919 ret = btrfs_dec_ref(trans, root, eb, 0); 8920 BUG_ON(ret); /* -ENOMEM */ 8921 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, 8922 eb->len, flag, 8923 btrfs_header_level(eb), 0); 8924 BUG_ON(ret); /* -ENOMEM */ 8925 wc->flags[level] |= flag; 8926 } 8927 8928 /* 8929 * the block is shared by multiple trees, so it's not good to 8930 * keep the tree lock 8931 */ 8932 if (path->locks[level] && level > 0) { 8933 btrfs_tree_unlock_rw(eb, path->locks[level]); 8934 path->locks[level] = 0; 8935 } 8936 return 0; 8937 } 8938 8939 /* 8940 * This is used to verify a ref exists for this root to deal with a bug where we 8941 * would have a drop_progress key that hadn't been updated properly. 8942 */ 8943 static int check_ref_exists(struct btrfs_trans_handle *trans, 8944 struct btrfs_root *root, u64 bytenr, u64 parent, 8945 int level) 8946 { 8947 struct btrfs_path *path; 8948 struct btrfs_extent_inline_ref *iref; 8949 int ret; 8950 8951 path = btrfs_alloc_path(); 8952 if (!path) 8953 return -ENOMEM; 8954 8955 ret = lookup_extent_backref(trans, path, &iref, bytenr, 8956 root->fs_info->nodesize, parent, 8957 root->root_key.objectid, level, 0); 8958 btrfs_free_path(path); 8959 if (ret == -ENOENT) 8960 return 0; 8961 if (ret < 0) 8962 return ret; 8963 return 1; 8964 } 8965 8966 /* 8967 * helper to process tree block pointer. 8968 * 8969 * when wc->stage == DROP_REFERENCE, this function checks 8970 * reference count of the block pointed to. if the block 8971 * is shared and we need update back refs for the subtree 8972 * rooted at the block, this function changes wc->stage to 8973 * UPDATE_BACKREF. if the block is shared and there is no 8974 * need to update back, this function drops the reference 8975 * to the block. 8976 * 8977 * NOTE: return value 1 means we should stop walking down. 
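 *
 * The caller, walk_down_tree(), reacts to these return values roughly as
 * follows: > 0 means advance to the next slot in the current node, < 0 is an
 * error that is propagated, and 0 means we descended one level and the walk
 * continues from wc->level.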
8978 */ 8979 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8980 struct btrfs_root *root, 8981 struct btrfs_path *path, 8982 struct walk_control *wc, int *lookup_info) 8983 { 8984 struct btrfs_fs_info *fs_info = root->fs_info; 8985 u64 bytenr; 8986 u64 generation; 8987 u64 parent; 8988 struct btrfs_key key; 8989 struct btrfs_key first_key; 8990 struct extent_buffer *next; 8991 int level = wc->level; 8992 int reada = 0; 8993 int ret = 0; 8994 bool need_account = false; 8995 8996 generation = btrfs_node_ptr_generation(path->nodes[level], 8997 path->slots[level]); 8998 /* 8999 * if the lower level block was created before the snapshot 9000 * was created, we know there is no need to update back refs 9001 * for the subtree 9002 */ 9003 if (wc->stage == UPDATE_BACKREF && 9004 generation <= root->root_key.offset) { 9005 *lookup_info = 1; 9006 return 1; 9007 } 9008 9009 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 9010 btrfs_node_key_to_cpu(path->nodes[level], &first_key, 9011 path->slots[level]); 9012 9013 next = find_extent_buffer(fs_info, bytenr); 9014 if (!next) { 9015 next = btrfs_find_create_tree_block(fs_info, bytenr); 9016 if (IS_ERR(next)) 9017 return PTR_ERR(next); 9018 9019 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 9020 level - 1); 9021 reada = 1; 9022 } 9023 btrfs_tree_lock(next); 9024 btrfs_set_lock_blocking_write(next); 9025 9026 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 9027 &wc->refs[level - 1], 9028 &wc->flags[level - 1]); 9029 if (ret < 0) 9030 goto out_unlock; 9031 9032 if (unlikely(wc->refs[level - 1] == 0)) { 9033 btrfs_err(fs_info, "Missing references."); 9034 ret = -EIO; 9035 goto out_unlock; 9036 } 9037 *lookup_info = 0; 9038 9039 if (wc->stage == DROP_REFERENCE) { 9040 if (wc->refs[level - 1] > 1) { 9041 need_account = true; 9042 if (level == 1 && 9043 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 9044 goto skip; 9045 9046 if (!wc->update_ref || 9047 generation <= root->root_key.offset) 9048 goto skip; 9049 9050 btrfs_node_key_to_cpu(path->nodes[level], &key, 9051 path->slots[level]); 9052 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 9053 if (ret < 0) 9054 goto skip; 9055 9056 wc->stage = UPDATE_BACKREF; 9057 wc->shared_level = level - 1; 9058 } 9059 } else { 9060 if (level == 1 && 9061 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 9062 goto skip; 9063 } 9064 9065 if (!btrfs_buffer_uptodate(next, generation, 0)) { 9066 btrfs_tree_unlock(next); 9067 free_extent_buffer(next); 9068 next = NULL; 9069 *lookup_info = 1; 9070 } 9071 9072 if (!next) { 9073 if (reada && level == 1) 9074 reada_walk_down(trans, root, wc, path); 9075 next = read_tree_block(fs_info, bytenr, generation, level - 1, 9076 &first_key); 9077 if (IS_ERR(next)) { 9078 return PTR_ERR(next); 9079 } else if (!extent_buffer_uptodate(next)) { 9080 free_extent_buffer(next); 9081 return -EIO; 9082 } 9083 btrfs_tree_lock(next); 9084 btrfs_set_lock_blocking_write(next); 9085 } 9086 9087 level--; 9088 ASSERT(level == btrfs_header_level(next)); 9089 if (level != btrfs_header_level(next)) { 9090 btrfs_err(root->fs_info, "mismatched level"); 9091 ret = -EIO; 9092 goto out_unlock; 9093 } 9094 path->nodes[level] = next; 9095 path->slots[level] = 0; 9096 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9097 wc->level = level; 9098 if (wc->level == 1) 9099 wc->reada_slot = 0; 9100 return 0; 9101 skip: 9102 wc->refs[level - 1] = 0; 9103 wc->flags[level - 1] = 0; 9104 if (wc->stage == DROP_REFERENCE) { 9105 if 
(wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 9106 parent = path->nodes[level]->start; 9107 } else { 9108 ASSERT(root->root_key.objectid == 9109 btrfs_header_owner(path->nodes[level])); 9110 if (root->root_key.objectid != 9111 btrfs_header_owner(path->nodes[level])) { 9112 btrfs_err(root->fs_info, 9113 "mismatched block owner"); 9114 ret = -EIO; 9115 goto out_unlock; 9116 } 9117 parent = 0; 9118 } 9119 9120 /* 9121 * If we had a drop_progress we need to verify the refs are set 9122 * as expected. If we find our ref then we know that from here 9123 * on out everything should be correct, and we can clear the 9124 * ->restarted flag. 9125 */ 9126 if (wc->restarted) { 9127 ret = check_ref_exists(trans, root, bytenr, parent, 9128 level - 1); 9129 if (ret < 0) 9130 goto out_unlock; 9131 if (ret == 0) 9132 goto no_delete; 9133 ret = 0; 9134 wc->restarted = 0; 9135 } 9136 9137 /* 9138 * Reloc tree doesn't contribute to qgroup numbers, and we have 9139 * already accounted them at merge time (replace_path), 9140 * thus we could skip expensive subtree trace here. 9141 */ 9142 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 9143 need_account) { 9144 ret = btrfs_qgroup_trace_subtree(trans, next, 9145 generation, level - 1); 9146 if (ret) { 9147 btrfs_err_rl(fs_info, 9148 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 9149 ret); 9150 } 9151 } 9152 9153 /* 9154 * We need to update the next key in our walk control so we can 9155 * update the drop_progress key accordingly. We don't care if 9156 * find_next_key doesn't find a key because that means we're at 9157 * the end and are going to clean up now. 9158 */ 9159 wc->drop_level = level; 9160 find_next_key(path, level, &wc->drop_progress); 9161 9162 ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, 9163 parent, root->root_key.objectid, 9164 level - 1, 0); 9165 if (ret) 9166 goto out_unlock; 9167 } 9168 no_delete: 9169 *lookup_info = 1; 9170 ret = 1; 9171 9172 out_unlock: 9173 btrfs_tree_unlock(next); 9174 free_extent_buffer(next); 9175 9176 return ret; 9177 } 9178 9179 /* 9180 * helper to process tree block while walking up the tree. 9181 * 9182 * when wc->stage == DROP_REFERENCE, this function drops 9183 * reference count on the block. 9184 * 9185 * when wc->stage == UPDATE_BACKREF, this function changes 9186 * wc->stage back to DROP_REFERENCE if we changed wc->stage 9187 * to UPDATE_BACKREF previously while processing the block. 9188 * 9189 * NOTE: return value 1 means we should stop walking up. 9190 */ 9191 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 9192 struct btrfs_root *root, 9193 struct btrfs_path *path, 9194 struct walk_control *wc) 9195 { 9196 struct btrfs_fs_info *fs_info = root->fs_info; 9197 int ret; 9198 int level = wc->level; 9199 struct extent_buffer *eb = path->nodes[level]; 9200 u64 parent = 0; 9201 9202 if (wc->stage == UPDATE_BACKREF) { 9203 BUG_ON(wc->shared_level < level); 9204 if (level < wc->shared_level) 9205 goto out; 9206 9207 ret = find_next_key(path, level + 1, &wc->update_progress); 9208 if (ret > 0) 9209 wc->update_ref = 0; 9210 9211 wc->stage = DROP_REFERENCE; 9212 wc->shared_level = -1; 9213 path->slots[level] = 0; 9214 9215 /* 9216 * check reference count again if the block isn't locked. 9217 * we should start walking down the tree again if reference 9218 * count is one. 
9219 */ 9220 if (!path->locks[level]) { 9221 BUG_ON(level == 0); 9222 btrfs_tree_lock(eb); 9223 btrfs_set_lock_blocking_write(eb); 9224 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9225 9226 ret = btrfs_lookup_extent_info(trans, fs_info, 9227 eb->start, level, 1, 9228 &wc->refs[level], 9229 &wc->flags[level]); 9230 if (ret < 0) { 9231 btrfs_tree_unlock_rw(eb, path->locks[level]); 9232 path->locks[level] = 0; 9233 return ret; 9234 } 9235 BUG_ON(wc->refs[level] == 0); 9236 if (wc->refs[level] == 1) { 9237 btrfs_tree_unlock_rw(eb, path->locks[level]); 9238 path->locks[level] = 0; 9239 return 1; 9240 } 9241 } 9242 } 9243 9244 /* wc->stage == DROP_REFERENCE */ 9245 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 9246 9247 if (wc->refs[level] == 1) { 9248 if (level == 0) { 9249 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9250 ret = btrfs_dec_ref(trans, root, eb, 1); 9251 else 9252 ret = btrfs_dec_ref(trans, root, eb, 0); 9253 BUG_ON(ret); /* -ENOMEM */ 9254 ret = btrfs_qgroup_trace_leaf_items(trans, eb); 9255 if (ret) { 9256 btrfs_err_rl(fs_info, 9257 "error %d accounting leaf items. Quota is out of sync, rescan required.", 9258 ret); 9259 } 9260 } 9261 /* make block locked assertion in clean_tree_block happy */ 9262 if (!path->locks[level] && 9263 btrfs_header_generation(eb) == trans->transid) { 9264 btrfs_tree_lock(eb); 9265 btrfs_set_lock_blocking_write(eb); 9266 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9267 } 9268 clean_tree_block(fs_info, eb); 9269 } 9270 9271 if (eb == root->node) { 9272 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9273 parent = eb->start; 9274 else if (root->root_key.objectid != btrfs_header_owner(eb)) 9275 goto owner_mismatch; 9276 } else { 9277 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9278 parent = path->nodes[level + 1]->start; 9279 else if (root->root_key.objectid != 9280 btrfs_header_owner(path->nodes[level + 1])) 9281 goto owner_mismatch; 9282 } 9283 9284 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 9285 out: 9286 wc->refs[level] = 0; 9287 wc->flags[level] = 0; 9288 return 0; 9289 9290 owner_mismatch: 9291 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", 9292 btrfs_header_owner(eb), root->root_key.objectid); 9293 return -EUCLEAN; 9294 } 9295 9296 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 9297 struct btrfs_root *root, 9298 struct btrfs_path *path, 9299 struct walk_control *wc) 9300 { 9301 int level = wc->level; 9302 int lookup_info = 1; 9303 int ret; 9304 9305 while (level >= 0) { 9306 ret = walk_down_proc(trans, root, path, wc, lookup_info); 9307 if (ret > 0) 9308 break; 9309 9310 if (level == 0) 9311 break; 9312 9313 if (path->slots[level] >= 9314 btrfs_header_nritems(path->nodes[level])) 9315 break; 9316 9317 ret = do_walk_down(trans, root, path, wc, &lookup_info); 9318 if (ret > 0) { 9319 path->slots[level]++; 9320 continue; 9321 } else if (ret < 0) 9322 return ret; 9323 level = wc->level; 9324 } 9325 return 0; 9326 } 9327 9328 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 9329 struct btrfs_root *root, 9330 struct btrfs_path *path, 9331 struct walk_control *wc, int max_level) 9332 { 9333 int level = wc->level; 9334 int ret; 9335 9336 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 9337 while (level < max_level && path->nodes[level]) { 9338 wc->level = level; 9339 if (path->slots[level] + 1 < 9340 btrfs_header_nritems(path->nodes[level])) { 9341 path->slots[level]++; 9342 return 0; 9343 } else { 9344 ret = 
walk_up_proc(trans, root, path, wc); 9345 if (ret > 0) 9346 return 0; 9347 if (ret < 0) 9348 return ret; 9349 9350 if (path->locks[level]) { 9351 btrfs_tree_unlock_rw(path->nodes[level], 9352 path->locks[level]); 9353 path->locks[level] = 0; 9354 } 9355 free_extent_buffer(path->nodes[level]); 9356 path->nodes[level] = NULL; 9357 level++; 9358 } 9359 } 9360 return 1; 9361 } 9362 9363 /* 9364 * drop a subvolume tree. 9365 * 9366 * this function traverses the tree freeing any blocks that only 9367 * referenced by the tree. 9368 * 9369 * when a shared tree block is found. this function decreases its 9370 * reference count by one. if update_ref is true, this function 9371 * also make sure backrefs for the shared block and all lower level 9372 * blocks are properly updated. 9373 * 9374 * If called with for_reloc == 0, may exit early with -EAGAIN 9375 */ 9376 int btrfs_drop_snapshot(struct btrfs_root *root, 9377 struct btrfs_block_rsv *block_rsv, int update_ref, 9378 int for_reloc) 9379 { 9380 struct btrfs_fs_info *fs_info = root->fs_info; 9381 struct btrfs_path *path; 9382 struct btrfs_trans_handle *trans; 9383 struct btrfs_root *tree_root = fs_info->tree_root; 9384 struct btrfs_root_item *root_item = &root->root_item; 9385 struct walk_control *wc; 9386 struct btrfs_key key; 9387 int err = 0; 9388 int ret; 9389 int level; 9390 bool root_dropped = false; 9391 9392 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); 9393 9394 path = btrfs_alloc_path(); 9395 if (!path) { 9396 err = -ENOMEM; 9397 goto out; 9398 } 9399 9400 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9401 if (!wc) { 9402 btrfs_free_path(path); 9403 err = -ENOMEM; 9404 goto out; 9405 } 9406 9407 trans = btrfs_start_transaction(tree_root, 0); 9408 if (IS_ERR(trans)) { 9409 err = PTR_ERR(trans); 9410 goto out_free; 9411 } 9412 9413 err = btrfs_run_delayed_items(trans); 9414 if (err) 9415 goto out_end_trans; 9416 9417 if (block_rsv) 9418 trans->block_rsv = block_rsv; 9419 9420 /* 9421 * This will help us catch people modifying the fs tree while we're 9422 * dropping it. It is unsafe to mess with the fs tree while it's being 9423 * dropped as we unlock the root node and parent nodes as we walk down 9424 * the tree, assuming nothing will change. If something does change 9425 * then we'll have stale information and drop references to blocks we've 9426 * already dropped. 
9427 */ 9428 set_bit(BTRFS_ROOT_DELETING, &root->state); 9429 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9430 level = btrfs_header_level(root->node); 9431 path->nodes[level] = btrfs_lock_root_node(root); 9432 btrfs_set_lock_blocking_write(path->nodes[level]); 9433 path->slots[level] = 0; 9434 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9435 memset(&wc->update_progress, 0, 9436 sizeof(wc->update_progress)); 9437 } else { 9438 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9439 memcpy(&wc->update_progress, &key, 9440 sizeof(wc->update_progress)); 9441 9442 level = root_item->drop_level; 9443 BUG_ON(level == 0); 9444 path->lowest_level = level; 9445 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9446 path->lowest_level = 0; 9447 if (ret < 0) { 9448 err = ret; 9449 goto out_end_trans; 9450 } 9451 WARN_ON(ret > 0); 9452 9453 /* 9454 * unlock our path, this is safe because only this 9455 * function is allowed to delete this snapshot 9456 */ 9457 btrfs_unlock_up_safe(path, 0); 9458 9459 level = btrfs_header_level(root->node); 9460 while (1) { 9461 btrfs_tree_lock(path->nodes[level]); 9462 btrfs_set_lock_blocking_write(path->nodes[level]); 9463 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9464 9465 ret = btrfs_lookup_extent_info(trans, fs_info, 9466 path->nodes[level]->start, 9467 level, 1, &wc->refs[level], 9468 &wc->flags[level]); 9469 if (ret < 0) { 9470 err = ret; 9471 goto out_end_trans; 9472 } 9473 BUG_ON(wc->refs[level] == 0); 9474 9475 if (level == root_item->drop_level) 9476 break; 9477 9478 btrfs_tree_unlock(path->nodes[level]); 9479 path->locks[level] = 0; 9480 WARN_ON(wc->refs[level] != 1); 9481 level--; 9482 } 9483 } 9484 9485 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); 9486 wc->level = level; 9487 wc->shared_level = -1; 9488 wc->stage = DROP_REFERENCE; 9489 wc->update_ref = update_ref; 9490 wc->keep_locks = 0; 9491 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9492 9493 while (1) { 9494 9495 ret = walk_down_tree(trans, root, path, wc); 9496 if (ret < 0) { 9497 err = ret; 9498 break; 9499 } 9500 9501 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9502 if (ret < 0) { 9503 err = ret; 9504 break; 9505 } 9506 9507 if (ret > 0) { 9508 BUG_ON(wc->stage != DROP_REFERENCE); 9509 break; 9510 } 9511 9512 if (wc->stage == DROP_REFERENCE) { 9513 wc->drop_level = wc->level; 9514 btrfs_node_key_to_cpu(path->nodes[wc->drop_level], 9515 &wc->drop_progress, 9516 path->slots[wc->drop_level]); 9517 } 9518 btrfs_cpu_key_to_disk(&root_item->drop_progress, 9519 &wc->drop_progress); 9520 root_item->drop_level = wc->drop_level; 9521 9522 BUG_ON(wc->level == 0); 9523 if (btrfs_should_end_transaction(trans) || 9524 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 9525 ret = btrfs_update_root(trans, tree_root, 9526 &root->root_key, 9527 root_item); 9528 if (ret) { 9529 btrfs_abort_transaction(trans, ret); 9530 err = ret; 9531 goto out_end_trans; 9532 } 9533 9534 btrfs_end_transaction_throttle(trans); 9535 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 9536 btrfs_debug(fs_info, 9537 "drop snapshot early exit"); 9538 err = -EAGAIN; 9539 goto out_free; 9540 } 9541 9542 trans = btrfs_start_transaction(tree_root, 0); 9543 if (IS_ERR(trans)) { 9544 err = PTR_ERR(trans); 9545 goto out_free; 9546 } 9547 if (block_rsv) 9548 trans->block_rsv = block_rsv; 9549 } 9550 } 9551 btrfs_release_path(path); 9552 if (err) 9553 goto out_end_trans; 9554 9555 ret = btrfs_del_root(trans, &root->root_key); 9556 if (ret) { 9557 
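/* Failing to delete the root item leaves the tree half dropped, so the whole transaction has to be aborted. */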
btrfs_abort_transaction(trans, ret); 9558 err = ret; 9559 goto out_end_trans; 9560 } 9561 9562 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9563 ret = btrfs_find_root(tree_root, &root->root_key, path, 9564 NULL, NULL); 9565 if (ret < 0) { 9566 btrfs_abort_transaction(trans, ret); 9567 err = ret; 9568 goto out_end_trans; 9569 } else if (ret > 0) { 9570 /* if we fail to delete the orphan item this time 9571 * around, it'll get picked up the next time. 9572 * 9573 * The most common failure here is just -ENOENT. 9574 */ 9575 btrfs_del_orphan_item(trans, tree_root, 9576 root->root_key.objectid); 9577 } 9578 } 9579 9580 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9581 btrfs_add_dropped_root(trans, root); 9582 } else { 9583 free_extent_buffer(root->node); 9584 free_extent_buffer(root->commit_root); 9585 btrfs_put_fs_root(root); 9586 } 9587 root_dropped = true; 9588 out_end_trans: 9589 btrfs_end_transaction_throttle(trans); 9590 out_free: 9591 kfree(wc); 9592 btrfs_free_path(path); 9593 out: 9594 /* 9595 * So if we need to stop dropping the snapshot for whatever reason we 9596 * need to make sure to add it back to the dead root list so that we 9597 * keep trying to do the work later. This also cleans up roots if we 9598 * don't have it in the radix (like when we recover after a power fail 9599 * or unmount) so we don't leak memory. 9600 */ 9601 if (!for_reloc && !root_dropped) 9602 btrfs_add_dead_root(root); 9603 if (err && err != -EAGAIN) 9604 btrfs_handle_fs_error(fs_info, err, NULL); 9605 return err; 9606 } 9607 9608 /* 9609 * drop subtree rooted at tree block 'node'. 9610 * 9611 * NOTE: this function will unlock and release tree block 'node' 9612 * only used by relocation code 9613 */ 9614 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9615 struct btrfs_root *root, 9616 struct extent_buffer *node, 9617 struct extent_buffer *parent) 9618 { 9619 struct btrfs_fs_info *fs_info = root->fs_info; 9620 struct btrfs_path *path; 9621 struct walk_control *wc; 9622 int level; 9623 int parent_level; 9624 int ret = 0; 9625 int wret; 9626 9627 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9628 9629 path = btrfs_alloc_path(); 9630 if (!path) 9631 return -ENOMEM; 9632 9633 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9634 if (!wc) { 9635 btrfs_free_path(path); 9636 return -ENOMEM; 9637 } 9638 9639 btrfs_assert_tree_locked(parent); 9640 parent_level = btrfs_header_level(parent); 9641 extent_buffer_get(parent); 9642 path->nodes[parent_level] = parent; 9643 path->slots[parent_level] = btrfs_header_nritems(parent); 9644 9645 btrfs_assert_tree_locked(node); 9646 level = btrfs_header_level(node); 9647 path->nodes[level] = node; 9648 path->slots[level] = 0; 9649 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9650 9651 wc->refs[parent_level] = 1; 9652 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9653 wc->level = level; 9654 wc->shared_level = -1; 9655 wc->stage = DROP_REFERENCE; 9656 wc->update_ref = 0; 9657 wc->keep_locks = 1; 9658 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9659 9660 while (1) { 9661 wret = walk_down_tree(trans, root, path, wc); 9662 if (wret < 0) { 9663 ret = wret; 9664 break; 9665 } 9666 9667 wret = walk_up_tree(trans, root, path, wc, parent_level); 9668 if (wret < 0) 9669 ret = wret; 9670 if (wret != 0) 9671 break; 9672 } 9673 9674 kfree(wc); 9675 btrfs_free_path(path); 9676 return ret; 9677 } 9678 9679 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 9680 { 9681 u64 num_devices; 9682 u64 stripped; 9683 
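/* Work out which profile this block group's chunk type should be converted to, based on how many writable devices are left: with a single rw device, RAID1/RAID10 is rewritten as DUP and RAID0 as single; with more devices, DUP is upgraded to RAID1 and existing RAID profiles are kept as they are. */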
9684 /* 9685 * if restripe for this chunk_type is on pick target profile and 9686 * return, otherwise do the usual balance 9687 */ 9688 stripped = get_restripe_target(fs_info, flags); 9689 if (stripped) 9690 return extended_to_chunk(stripped); 9691 9692 num_devices = fs_info->fs_devices->rw_devices; 9693 9694 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9695 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9696 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9697 9698 if (num_devices == 1) { 9699 stripped |= BTRFS_BLOCK_GROUP_DUP; 9700 stripped = flags & ~stripped; 9701 9702 /* turn raid0 into single device chunks */ 9703 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9704 return stripped; 9705 9706 /* turn mirroring into duplication */ 9707 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9708 BTRFS_BLOCK_GROUP_RAID10)) 9709 return stripped | BTRFS_BLOCK_GROUP_DUP; 9710 } else { 9711 /* they already had raid on here, just return */ 9712 if (flags & stripped) 9713 return flags; 9714 9715 stripped |= BTRFS_BLOCK_GROUP_DUP; 9716 stripped = flags & ~stripped; 9717 9718 /* switch duplicated blocks with raid1 */ 9719 if (flags & BTRFS_BLOCK_GROUP_DUP) 9720 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9721 9722 /* this is drive concat, leave it alone */ 9723 } 9724 9725 return flags; 9726 } 9727 9728 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9729 { 9730 struct btrfs_space_info *sinfo = cache->space_info; 9731 u64 num_bytes; 9732 u64 sinfo_used; 9733 u64 min_allocable_bytes; 9734 int ret = -ENOSPC; 9735 9736 /* 9737 * We need some metadata space and system metadata space for 9738 * allocating chunks in some corner cases until we force to set 9739 * it to be readonly. 9740 */ 9741 if ((sinfo->flags & 9742 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9743 !force) 9744 min_allocable_bytes = SZ_1M; 9745 else 9746 min_allocable_bytes = 0; 9747 9748 spin_lock(&sinfo->lock); 9749 spin_lock(&cache->lock); 9750 9751 if (cache->ro) { 9752 cache->ro++; 9753 ret = 0; 9754 goto out; 9755 } 9756 9757 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9758 cache->bytes_super - btrfs_block_group_used(&cache->item); 9759 sinfo_used = btrfs_space_info_used(sinfo, true); 9760 9761 if (sinfo_used + num_bytes + min_allocable_bytes <= 9762 sinfo->total_bytes) { 9763 sinfo->bytes_readonly += num_bytes; 9764 cache->ro++; 9765 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9766 ret = 0; 9767 } 9768 out: 9769 spin_unlock(&cache->lock); 9770 spin_unlock(&sinfo->lock); 9771 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { 9772 btrfs_info(cache->fs_info, 9773 "unable to make block group %llu ro", 9774 cache->key.objectid); 9775 btrfs_info(cache->fs_info, 9776 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", 9777 sinfo_used, num_bytes, min_allocable_bytes); 9778 dump_space_info(cache->fs_info, cache->space_info, 0, 0); 9779 } 9780 return ret; 9781 } 9782 9783 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) 9784 9785 { 9786 struct btrfs_fs_info *fs_info = cache->fs_info; 9787 struct btrfs_trans_handle *trans; 9788 u64 alloc_flags; 9789 int ret; 9790 9791 again: 9792 trans = btrfs_join_transaction(fs_info->extent_root); 9793 if (IS_ERR(trans)) 9794 return PTR_ERR(trans); 9795 9796 /* 9797 * we're not allowed to set block groups readonly after the dirty 9798 * block groups cache has started writing. 
If it already started, 9799 * back off and let this transaction commit 9800 */ 9801 mutex_lock(&fs_info->ro_block_group_mutex); 9802 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9803 u64 transid = trans->transid; 9804 9805 mutex_unlock(&fs_info->ro_block_group_mutex); 9806 btrfs_end_transaction(trans); 9807 9808 ret = btrfs_wait_for_commit(fs_info, transid); 9809 if (ret) 9810 return ret; 9811 goto again; 9812 } 9813 9814 /* 9815 * if we are changing raid levels, try to allocate a corresponding 9816 * block group with the new raid level. 9817 */ 9818 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9819 if (alloc_flags != cache->flags) { 9820 ret = do_chunk_alloc(trans, alloc_flags, 9821 CHUNK_ALLOC_FORCE); 9822 /* 9823 * ENOSPC is allowed here, we may have enough space 9824 * already allocated at the new raid level to 9825 * carry on 9826 */ 9827 if (ret == -ENOSPC) 9828 ret = 0; 9829 if (ret < 0) 9830 goto out; 9831 } 9832 9833 ret = inc_block_group_ro(cache, 0); 9834 if (!ret) 9835 goto out; 9836 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 9837 ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9838 if (ret < 0) 9839 goto out; 9840 ret = inc_block_group_ro(cache, 0); 9841 out: 9842 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9843 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9844 mutex_lock(&fs_info->chunk_mutex); 9845 check_system_chunk(trans, alloc_flags); 9846 mutex_unlock(&fs_info->chunk_mutex); 9847 } 9848 mutex_unlock(&fs_info->ro_block_group_mutex); 9849 9850 btrfs_end_transaction(trans); 9851 return ret; 9852 } 9853 9854 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 9855 { 9856 u64 alloc_flags = get_alloc_profile(trans->fs_info, type); 9857 9858 return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9859 } 9860 9861 /* 9862 * helper to account the unused space of all the readonly block group in the 9863 * space_info. takes mirrors into account. 9864 */ 9865 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9866 { 9867 struct btrfs_block_group_cache *block_group; 9868 u64 free_bytes = 0; 9869 int factor; 9870 9871 /* It's df, we don't care if it's racy */ 9872 if (list_empty(&sinfo->ro_bgs)) 9873 return 0; 9874 9875 spin_lock(&sinfo->lock); 9876 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9877 spin_lock(&block_group->lock); 9878 9879 if (!block_group->ro) { 9880 spin_unlock(&block_group->lock); 9881 continue; 9882 } 9883 9884 factor = btrfs_bg_type_to_factor(block_group->flags); 9885 free_bytes += (block_group->key.offset - 9886 btrfs_block_group_used(&block_group->item)) * 9887 factor; 9888 9889 spin_unlock(&block_group->lock); 9890 } 9891 spin_unlock(&sinfo->lock); 9892 9893 return free_bytes; 9894 } 9895 9896 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) 9897 { 9898 struct btrfs_space_info *sinfo = cache->space_info; 9899 u64 num_bytes; 9900 9901 BUG_ON(!cache->ro); 9902 9903 spin_lock(&sinfo->lock); 9904 spin_lock(&cache->lock); 9905 if (!--cache->ro) { 9906 num_bytes = cache->key.offset - cache->reserved - 9907 cache->pinned - cache->bytes_super - 9908 btrfs_block_group_used(&cache->item); 9909 sinfo->bytes_readonly -= num_bytes; 9910 list_del_init(&cache->ro_list); 9911 } 9912 spin_unlock(&cache->lock); 9913 spin_unlock(&sinfo->lock); 9914 } 9915 9916 /* 9917 * Checks to see if it's even possible to relocate this block group. 
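 * @fs_info - the filesystem the block group lives in, @bytenr - logical start of the block group we are asking about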
9918 * 9919 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9920 * ok to go ahead and try. 9921 */ 9922 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) 9923 { 9924 struct btrfs_root *root = fs_info->extent_root; 9925 struct btrfs_block_group_cache *block_group; 9926 struct btrfs_space_info *space_info; 9927 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 9928 struct btrfs_device *device; 9929 struct btrfs_trans_handle *trans; 9930 u64 min_free; 9931 u64 dev_min = 1; 9932 u64 dev_nr = 0; 9933 u64 target; 9934 int debug; 9935 int index; 9936 int full = 0; 9937 int ret = 0; 9938 9939 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); 9940 9941 block_group = btrfs_lookup_block_group(fs_info, bytenr); 9942 9943 /* odd, couldn't find the block group, leave it alone */ 9944 if (!block_group) { 9945 if (debug) 9946 btrfs_warn(fs_info, 9947 "can't find block group for bytenr %llu", 9948 bytenr); 9949 return -1; 9950 } 9951 9952 min_free = btrfs_block_group_used(&block_group->item); 9953 9954 /* no bytes used, we're good */ 9955 if (!min_free) 9956 goto out; 9957 9958 space_info = block_group->space_info; 9959 spin_lock(&space_info->lock); 9960 9961 full = space_info->full; 9962 9963 /* 9964 * if this is the last block group we have in this space, we can't 9965 * relocate it unless we're able to allocate a new chunk below. 9966 * 9967 * Otherwise, we need to make sure we have room in the space to handle 9968 * all of the extents from this block group. If we can, we're good 9969 */ 9970 if ((space_info->total_bytes != block_group->key.offset) && 9971 (btrfs_space_info_used(space_info, false) + min_free < 9972 space_info->total_bytes)) { 9973 spin_unlock(&space_info->lock); 9974 goto out; 9975 } 9976 spin_unlock(&space_info->lock); 9977 9978 /* 9979 * ok we don't have enough space, but maybe we have free space on our 9980 * devices to allocate new chunks for relocation, so loop through our 9981 * alloc devices and guess if we have enough space. if this block 9982 * group is going to be restriped, run checks against the target 9983 * profile instead of the current one. 
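 * For example, relocating a RAID1 block group with 4GiB used needs two devices with 4GiB unallocated each, RAID10 needs four devices with 2GiB each, and DUP needs a single device with 8GiB free.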
9984 */ 9985 ret = -1; 9986 9987 /* 9988 * index: 9989 * 0: raid10 9990 * 1: raid1 9991 * 2: dup 9992 * 3: raid0 9993 * 4: single 9994 */ 9995 target = get_restripe_target(fs_info, block_group->flags); 9996 if (target) { 9997 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); 9998 } else { 9999 /* 10000 * this is just a balance, so if we were marked as full 10001 * we know there is no space for a new chunk 10002 */ 10003 if (full) { 10004 if (debug) 10005 btrfs_warn(fs_info, 10006 "no space to alloc new chunk for block group %llu", 10007 block_group->key.objectid); 10008 goto out; 10009 } 10010 10011 index = btrfs_bg_flags_to_raid_index(block_group->flags); 10012 } 10013 10014 if (index == BTRFS_RAID_RAID10) { 10015 dev_min = 4; 10016 /* Divide by 2 */ 10017 min_free >>= 1; 10018 } else if (index == BTRFS_RAID_RAID1) { 10019 dev_min = 2; 10020 } else if (index == BTRFS_RAID_DUP) { 10021 /* Multiply by 2 */ 10022 min_free <<= 1; 10023 } else if (index == BTRFS_RAID_RAID0) { 10024 dev_min = fs_devices->rw_devices; 10025 min_free = div64_u64(min_free, dev_min); 10026 } 10027 10028 /* We need to do this so that we can look at pending chunks */ 10029 trans = btrfs_join_transaction(root); 10030 if (IS_ERR(trans)) { 10031 ret = PTR_ERR(trans); 10032 goto out; 10033 } 10034 10035 mutex_lock(&fs_info->chunk_mutex); 10036 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 10037 u64 dev_offset; 10038 10039 /* 10040 * check to make sure we can actually find a chunk with enough 10041 * space to fit our block group in. 10042 */ 10043 if (device->total_bytes > device->bytes_used + min_free && 10044 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 10045 ret = find_free_dev_extent(trans, device, min_free, 10046 &dev_offset, NULL); 10047 if (!ret) 10048 dev_nr++; 10049 10050 if (dev_nr >= dev_min) 10051 break; 10052 10053 ret = -1; 10054 } 10055 } 10056 if (debug && ret == -1) 10057 btrfs_warn(fs_info, 10058 "no space to allocate a new chunk for block group %llu", 10059 block_group->key.objectid); 10060 mutex_unlock(&fs_info->chunk_mutex); 10061 btrfs_end_transaction(trans); 10062 out: 10063 btrfs_put_block_group(block_group); 10064 return ret; 10065 } 10066 10067 static int find_first_block_group(struct btrfs_fs_info *fs_info, 10068 struct btrfs_path *path, 10069 struct btrfs_key *key) 10070 { 10071 struct btrfs_root *root = fs_info->extent_root; 10072 int ret = 0; 10073 struct btrfs_key found_key; 10074 struct extent_buffer *leaf; 10075 struct btrfs_block_group_item bg; 10076 u64 flags; 10077 int slot; 10078 10079 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 10080 if (ret < 0) 10081 goto out; 10082 10083 while (1) { 10084 slot = path->slots[0]; 10085 leaf = path->nodes[0]; 10086 if (slot >= btrfs_header_nritems(leaf)) { 10087 ret = btrfs_next_leaf(root, path); 10088 if (ret == 0) 10089 continue; 10090 if (ret < 0) 10091 goto out; 10092 break; 10093 } 10094 btrfs_item_key_to_cpu(leaf, &found_key, slot); 10095 10096 if (found_key.objectid >= key->objectid && 10097 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 10098 struct extent_map_tree *em_tree; 10099 struct extent_map *em; 10100 10101 em_tree = &root->fs_info->mapping_tree.map_tree; 10102 read_lock(&em_tree->lock); 10103 em = lookup_extent_mapping(em_tree, found_key.objectid, 10104 found_key.offset); 10105 read_unlock(&em_tree->lock); 10106 if (!em) { 10107 btrfs_err(fs_info, 10108 "logical %llu len %llu found bg but no related chunk", 10109 found_key.objectid, found_key.offset); 10110 ret = 
-ENOENT; 10111 } else if (em->start != found_key.objectid || 10112 em->len != found_key.offset) { 10113 btrfs_err(fs_info, 10114 "block group %llu len %llu mismatch with chunk %llu len %llu", 10115 found_key.objectid, found_key.offset, 10116 em->start, em->len); 10117 ret = -EUCLEAN; 10118 } else { 10119 read_extent_buffer(leaf, &bg, 10120 btrfs_item_ptr_offset(leaf, slot), 10121 sizeof(bg)); 10122 flags = btrfs_block_group_flags(&bg) & 10123 BTRFS_BLOCK_GROUP_TYPE_MASK; 10124 10125 if (flags != (em->map_lookup->type & 10126 BTRFS_BLOCK_GROUP_TYPE_MASK)) { 10127 btrfs_err(fs_info, 10128 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", 10129 found_key.objectid, 10130 found_key.offset, flags, 10131 (BTRFS_BLOCK_GROUP_TYPE_MASK & 10132 em->map_lookup->type)); 10133 ret = -EUCLEAN; 10134 } else { 10135 ret = 0; 10136 } 10137 } 10138 free_extent_map(em); 10139 goto out; 10140 } 10141 path->slots[0]++; 10142 } 10143 out: 10144 return ret; 10145 } 10146 10147 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 10148 { 10149 struct btrfs_block_group_cache *block_group; 10150 u64 last = 0; 10151 10152 while (1) { 10153 struct inode *inode; 10154 10155 block_group = btrfs_lookup_first_block_group(info, last); 10156 while (block_group) { 10157 wait_block_group_cache_done(block_group); 10158 spin_lock(&block_group->lock); 10159 if (block_group->iref) 10160 break; 10161 spin_unlock(&block_group->lock); 10162 block_group = next_block_group(info, block_group); 10163 } 10164 if (!block_group) { 10165 if (last == 0) 10166 break; 10167 last = 0; 10168 continue; 10169 } 10170 10171 inode = block_group->inode; 10172 block_group->iref = 0; 10173 block_group->inode = NULL; 10174 spin_unlock(&block_group->lock); 10175 ASSERT(block_group->io_ctl.inode == NULL); 10176 iput(inode); 10177 last = block_group->key.objectid + block_group->key.offset; 10178 btrfs_put_block_group(block_group); 10179 } 10180 } 10181 10182 /* 10183 * Must be called only after stopping all workers, since we could have block 10184 * group caching kthreads running, and therefore they could race with us if we 10185 * freed the block groups before stopping them. 
10186 */ 10187 int btrfs_free_block_groups(struct btrfs_fs_info *info) 10188 { 10189 struct btrfs_block_group_cache *block_group; 10190 struct btrfs_space_info *space_info; 10191 struct btrfs_caching_control *caching_ctl; 10192 struct rb_node *n; 10193 10194 down_write(&info->commit_root_sem); 10195 while (!list_empty(&info->caching_block_groups)) { 10196 caching_ctl = list_entry(info->caching_block_groups.next, 10197 struct btrfs_caching_control, list); 10198 list_del(&caching_ctl->list); 10199 put_caching_control(caching_ctl); 10200 } 10201 up_write(&info->commit_root_sem); 10202 10203 spin_lock(&info->unused_bgs_lock); 10204 while (!list_empty(&info->unused_bgs)) { 10205 block_group = list_first_entry(&info->unused_bgs, 10206 struct btrfs_block_group_cache, 10207 bg_list); 10208 list_del_init(&block_group->bg_list); 10209 btrfs_put_block_group(block_group); 10210 } 10211 spin_unlock(&info->unused_bgs_lock); 10212 10213 spin_lock(&info->block_group_cache_lock); 10214 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 10215 block_group = rb_entry(n, struct btrfs_block_group_cache, 10216 cache_node); 10217 rb_erase(&block_group->cache_node, 10218 &info->block_group_cache_tree); 10219 RB_CLEAR_NODE(&block_group->cache_node); 10220 spin_unlock(&info->block_group_cache_lock); 10221 10222 down_write(&block_group->space_info->groups_sem); 10223 list_del(&block_group->list); 10224 up_write(&block_group->space_info->groups_sem); 10225 10226 /* 10227 * We haven't cached this block group, which means we could 10228 * possibly have excluded extents on this block group. 10229 */ 10230 if (block_group->cached == BTRFS_CACHE_NO || 10231 block_group->cached == BTRFS_CACHE_ERROR) 10232 free_excluded_extents(block_group); 10233 10234 btrfs_remove_free_space_cache(block_group); 10235 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 10236 ASSERT(list_empty(&block_group->dirty_list)); 10237 ASSERT(list_empty(&block_group->io_list)); 10238 ASSERT(list_empty(&block_group->bg_list)); 10239 ASSERT(atomic_read(&block_group->count) == 1); 10240 btrfs_put_block_group(block_group); 10241 10242 spin_lock(&info->block_group_cache_lock); 10243 } 10244 spin_unlock(&info->block_group_cache_lock); 10245 10246 /* now that all the block groups are freed, go through and 10247 * free all the space_info structs. This is only called during 10248 * the final stages of unmount, and so we know nobody is 10249 * using them. We call synchronize_rcu() once before we start, 10250 * just to be on the safe side. 10251 */ 10252 synchronize_rcu(); 10253 10254 release_global_block_rsv(info); 10255 10256 while (!list_empty(&info->space_info)) { 10257 int i; 10258 10259 space_info = list_entry(info->space_info.next, 10260 struct btrfs_space_info, 10261 list); 10262 10263 /* 10264 * Do not hide this behind enospc_debug, this is actually 10265 * important and indicates a real bug if this happens. 
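 * Any bytes_pinned, bytes_reserved or bytes_may_use left over at this point means a space reservation was leaked somewhere while the filesystem was mounted.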
10266 */ 10267 if (WARN_ON(space_info->bytes_pinned > 0 || 10268 space_info->bytes_reserved > 0 || 10269 space_info->bytes_may_use > 0)) 10270 dump_space_info(info, space_info, 0, 0); 10271 list_del(&space_info->list); 10272 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 10273 struct kobject *kobj; 10274 kobj = space_info->block_group_kobjs[i]; 10275 space_info->block_group_kobjs[i] = NULL; 10276 if (kobj) { 10277 kobject_del(kobj); 10278 kobject_put(kobj); 10279 } 10280 } 10281 kobject_del(&space_info->kobj); 10282 kobject_put(&space_info->kobj); 10283 } 10284 return 0; 10285 } 10286 10287 /* link_block_group will queue up kobjects to add when we're reclaim-safe */ 10288 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) 10289 { 10290 struct btrfs_space_info *space_info; 10291 struct raid_kobject *rkobj; 10292 LIST_HEAD(list); 10293 int index; 10294 int ret = 0; 10295 10296 spin_lock(&fs_info->pending_raid_kobjs_lock); 10297 list_splice_init(&fs_info->pending_raid_kobjs, &list); 10298 spin_unlock(&fs_info->pending_raid_kobjs_lock); 10299 10300 list_for_each_entry(rkobj, &list, list) { 10301 space_info = __find_space_info(fs_info, rkobj->flags); 10302 index = btrfs_bg_flags_to_raid_index(rkobj->flags); 10303 10304 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 10305 "%s", get_raid_name(index)); 10306 if (ret) { 10307 kobject_put(&rkobj->kobj); 10308 break; 10309 } 10310 } 10311 if (ret) 10312 btrfs_warn(fs_info, 10313 "failed to add kobject for block cache, ignoring"); 10314 } 10315 10316 static void link_block_group(struct btrfs_block_group_cache *cache) 10317 { 10318 struct btrfs_space_info *space_info = cache->space_info; 10319 struct btrfs_fs_info *fs_info = cache->fs_info; 10320 int index = btrfs_bg_flags_to_raid_index(cache->flags); 10321 bool first = false; 10322 10323 down_write(&space_info->groups_sem); 10324 if (list_empty(&space_info->block_groups[index])) 10325 first = true; 10326 list_add_tail(&cache->list, &space_info->block_groups[index]); 10327 up_write(&space_info->groups_sem); 10328 10329 if (first) { 10330 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 10331 if (!rkobj) { 10332 btrfs_warn(cache->fs_info, 10333 "couldn't alloc memory for raid level kobject"); 10334 return; 10335 } 10336 rkobj->flags = cache->flags; 10337 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 10338 10339 spin_lock(&fs_info->pending_raid_kobjs_lock); 10340 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); 10341 spin_unlock(&fs_info->pending_raid_kobjs_lock); 10342 space_info->block_group_kobjs[index] = &rkobj->kobj; 10343 } 10344 } 10345 10346 static struct btrfs_block_group_cache * 10347 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, 10348 u64 start, u64 size) 10349 { 10350 struct btrfs_block_group_cache *cache; 10351 10352 cache = kzalloc(sizeof(*cache), GFP_NOFS); 10353 if (!cache) 10354 return NULL; 10355 10356 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 10357 GFP_NOFS); 10358 if (!cache->free_space_ctl) { 10359 kfree(cache); 10360 return NULL; 10361 } 10362 10363 cache->key.objectid = start; 10364 cache->key.offset = size; 10365 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10366 10367 cache->fs_info = fs_info; 10368 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 10369 set_free_space_tree_thresholds(cache); 10370 10371 atomic_set(&cache->count, 1); 10372 spin_lock_init(&cache->lock); 10373 init_rwsem(&cache->data_rwsem); 10374 INIT_LIST_HEAD(&cache->list); 10375 INIT_LIST_HEAD(&cache->cluster_list); 10376 
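/* The remaining list heads tie this block group into the per-transaction (new/dirty/writeback) and per-space_info (read-only) bookkeeping lists. */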
INIT_LIST_HEAD(&cache->bg_list); 10377 INIT_LIST_HEAD(&cache->ro_list); 10378 INIT_LIST_HEAD(&cache->dirty_list); 10379 INIT_LIST_HEAD(&cache->io_list); 10380 btrfs_init_free_space_ctl(cache); 10381 atomic_set(&cache->trimming, 0); 10382 mutex_init(&cache->free_space_lock); 10383 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); 10384 10385 return cache; 10386 } 10387 10388 10389 /* 10390 * Iterate all chunks and verify that each of them has the corresponding block 10391 * group 10392 */ 10393 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 10394 { 10395 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 10396 struct extent_map *em; 10397 struct btrfs_block_group_cache *bg; 10398 u64 start = 0; 10399 int ret = 0; 10400 10401 while (1) { 10402 read_lock(&map_tree->map_tree.lock); 10403 /* 10404 * lookup_extent_mapping will return the first extent map 10405 * intersecting the range, so setting @len to 1 is enough to 10406 * get the first chunk. 10407 */ 10408 em = lookup_extent_mapping(&map_tree->map_tree, start, 1); 10409 read_unlock(&map_tree->map_tree.lock); 10410 if (!em) 10411 break; 10412 10413 bg = btrfs_lookup_block_group(fs_info, em->start); 10414 if (!bg) { 10415 btrfs_err(fs_info, 10416 "chunk start=%llu len=%llu doesn't have corresponding block group", 10417 em->start, em->len); 10418 ret = -EUCLEAN; 10419 free_extent_map(em); 10420 break; 10421 } 10422 if (bg->key.objectid != em->start || 10423 bg->key.offset != em->len || 10424 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 10425 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 10426 btrfs_err(fs_info, 10427 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 10428 em->start, em->len, 10429 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 10430 bg->key.objectid, bg->key.offset, 10431 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 10432 ret = -EUCLEAN; 10433 free_extent_map(em); 10434 btrfs_put_block_group(bg); 10435 break; 10436 } 10437 start = em->start + em->len; 10438 free_extent_map(em); 10439 btrfs_put_block_group(bg); 10440 } 10441 return ret; 10442 } 10443 10444 int btrfs_read_block_groups(struct btrfs_fs_info *info) 10445 { 10446 struct btrfs_path *path; 10447 int ret; 10448 struct btrfs_block_group_cache *cache; 10449 struct btrfs_space_info *space_info; 10450 struct btrfs_key key; 10451 struct btrfs_key found_key; 10452 struct extent_buffer *leaf; 10453 int need_clear = 0; 10454 u64 cache_gen; 10455 u64 feature; 10456 int mixed; 10457 10458 feature = btrfs_super_incompat_flags(info->super_copy); 10459 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); 10460 10461 key.objectid = 0; 10462 key.offset = 0; 10463 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10464 path = btrfs_alloc_path(); 10465 if (!path) 10466 return -ENOMEM; 10467 path->reada = READA_FORWARD; 10468 10469 cache_gen = btrfs_super_cache_generation(info->super_copy); 10470 if (btrfs_test_opt(info, SPACE_CACHE) && 10471 btrfs_super_generation(info->super_copy) != cache_gen) 10472 need_clear = 1; 10473 if (btrfs_test_opt(info, CLEAR_CACHE)) 10474 need_clear = 1; 10475 10476 while (1) { 10477 ret = find_first_block_group(info, path, &key); 10478 if (ret > 0) 10479 break; 10480 if (ret != 0) 10481 goto error; 10482 10483 leaf = path->nodes[0]; 10484 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 10485 10486 cache = btrfs_create_block_group_cache(info, found_key.objectid, 10487 found_key.offset); 10488 if (!cache) { 10489 ret = -ENOMEM; 10490 
goto error; 10491 } 10492 10493 if (need_clear) { 10494 /* 10495 * When we mount with old space cache, we need to 10496 * set BTRFS_DC_CLEAR and set dirty flag. 10497 * 10498 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 10499 * truncate the old free space cache inode and 10500 * setup a new one. 10501 * b) Setting 'dirty flag' makes sure that we flush 10502 * the new space cache info onto disk. 10503 */ 10504 if (btrfs_test_opt(info, SPACE_CACHE)) 10505 cache->disk_cache_state = BTRFS_DC_CLEAR; 10506 } 10507 10508 read_extent_buffer(leaf, &cache->item, 10509 btrfs_item_ptr_offset(leaf, path->slots[0]), 10510 sizeof(cache->item)); 10511 cache->flags = btrfs_block_group_flags(&cache->item); 10512 if (!mixed && 10513 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 10514 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 10515 btrfs_err(info, 10516 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 10517 cache->key.objectid); 10518 ret = -EINVAL; 10519 goto error; 10520 } 10521 10522 key.objectid = found_key.objectid + found_key.offset; 10523 btrfs_release_path(path); 10524 10525 /* 10526 * We need to exclude the super stripes now so that the space 10527 * info has super bytes accounted for, otherwise we'll think 10528 * we have more space than we actually do. 10529 */ 10530 ret = exclude_super_stripes(cache); 10531 if (ret) { 10532 /* 10533 * We may have excluded something, so call this just in 10534 * case. 10535 */ 10536 free_excluded_extents(cache); 10537 btrfs_put_block_group(cache); 10538 goto error; 10539 } 10540 10541 /* 10542 * check for two cases, either we are full, and therefore 10543 * don't need to bother with the caching work since we won't 10544 * find any space, or we are empty, and we can just add all 10545 * the space in and be done with it. This saves us _a_lot_ of 10546 * time, particularly in the full case. 
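 * A block group whose used bytes equal its size is completely full, and one with zero used bytes is completely empty, so in both cases the free space is known without scanning the extent tree.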
10547 */ 10548 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 10549 cache->last_byte_to_unpin = (u64)-1; 10550 cache->cached = BTRFS_CACHE_FINISHED; 10551 free_excluded_extents(cache); 10552 } else if (btrfs_block_group_used(&cache->item) == 0) { 10553 cache->last_byte_to_unpin = (u64)-1; 10554 cache->cached = BTRFS_CACHE_FINISHED; 10555 add_new_free_space(cache, found_key.objectid, 10556 found_key.objectid + 10557 found_key.offset); 10558 free_excluded_extents(cache); 10559 } 10560 10561 ret = btrfs_add_block_group_cache(info, cache); 10562 if (ret) { 10563 btrfs_remove_free_space_cache(cache); 10564 btrfs_put_block_group(cache); 10565 goto error; 10566 } 10567 10568 trace_btrfs_add_block_group(info, cache, 0); 10569 update_space_info(info, cache->flags, found_key.offset, 10570 btrfs_block_group_used(&cache->item), 10571 cache->bytes_super, &space_info); 10572 10573 cache->space_info = space_info; 10574 10575 link_block_group(cache); 10576 10577 set_avail_alloc_bits(info, cache->flags); 10578 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 10579 inc_block_group_ro(cache, 1); 10580 } else if (btrfs_block_group_used(&cache->item) == 0) { 10581 ASSERT(list_empty(&cache->bg_list)); 10582 btrfs_mark_bg_unused(cache); 10583 } 10584 } 10585 10586 list_for_each_entry_rcu(space_info, &info->space_info, list) { 10587 if (!(get_alloc_profile(info, space_info->flags) & 10588 (BTRFS_BLOCK_GROUP_RAID10 | 10589 BTRFS_BLOCK_GROUP_RAID1 | 10590 BTRFS_BLOCK_GROUP_RAID5 | 10591 BTRFS_BLOCK_GROUP_RAID6 | 10592 BTRFS_BLOCK_GROUP_DUP))) 10593 continue; 10594 /* 10595 * avoid allocating from un-mirrored block group if there are 10596 * mirrored block groups. 10597 */ 10598 list_for_each_entry(cache, 10599 &space_info->block_groups[BTRFS_RAID_RAID0], 10600 list) 10601 inc_block_group_ro(cache, 1); 10602 list_for_each_entry(cache, 10603 &space_info->block_groups[BTRFS_RAID_SINGLE], 10604 list) 10605 inc_block_group_ro(cache, 1); 10606 } 10607 10608 btrfs_add_raid_kobjects(info); 10609 init_global_block_rsv(info); 10610 ret = check_chunk_block_group_mappings(info); 10611 error: 10612 btrfs_free_path(path); 10613 return ret; 10614 } 10615 10616 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 10617 { 10618 struct btrfs_fs_info *fs_info = trans->fs_info; 10619 struct btrfs_block_group_cache *block_group; 10620 struct btrfs_root *extent_root = fs_info->extent_root; 10621 struct btrfs_block_group_item item; 10622 struct btrfs_key key; 10623 int ret = 0; 10624 10625 if (!trans->can_flush_pending_bgs) 10626 return; 10627 10628 while (!list_empty(&trans->new_bgs)) { 10629 block_group = list_first_entry(&trans->new_bgs, 10630 struct btrfs_block_group_cache, 10631 bg_list); 10632 if (ret) 10633 goto next; 10634 10635 spin_lock(&block_group->lock); 10636 memcpy(&item, &block_group->item, sizeof(item)); 10637 memcpy(&key, &block_group->key, sizeof(key)); 10638 spin_unlock(&block_group->lock); 10639 10640 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10641 sizeof(item)); 10642 if (ret) 10643 btrfs_abort_transaction(trans, ret); 10644 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); 10645 if (ret) 10646 btrfs_abort_transaction(trans, ret); 10647 add_block_group_free_space(trans, block_group); 10648 /* already aborted the transaction if it failed. 
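 * Either the free space entry was added or the transaction is already aborted, so just release the reservation and move on to the next block group.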
*/ 10649 next: 10650 btrfs_delayed_refs_rsv_release(fs_info, 1); 10651 list_del_init(&block_group->bg_list); 10652 } 10653 btrfs_trans_release_chunk_metadata(trans); 10654 } 10655 10656 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, 10657 u64 type, u64 chunk_offset, u64 size) 10658 { 10659 struct btrfs_fs_info *fs_info = trans->fs_info; 10660 struct btrfs_block_group_cache *cache; 10661 int ret; 10662 10663 btrfs_set_log_full_commit(fs_info, trans); 10664 10665 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 10666 if (!cache) 10667 return -ENOMEM; 10668 10669 btrfs_set_block_group_used(&cache->item, bytes_used); 10670 btrfs_set_block_group_chunk_objectid(&cache->item, 10671 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 10672 btrfs_set_block_group_flags(&cache->item, type); 10673 10674 cache->flags = type; 10675 cache->last_byte_to_unpin = (u64)-1; 10676 cache->cached = BTRFS_CACHE_FINISHED; 10677 cache->needs_free_space = 1; 10678 ret = exclude_super_stripes(cache); 10679 if (ret) { 10680 /* 10681 * We may have excluded something, so call this just in 10682 * case. 10683 */ 10684 free_excluded_extents(cache); 10685 btrfs_put_block_group(cache); 10686 return ret; 10687 } 10688 10689 add_new_free_space(cache, chunk_offset, chunk_offset + size); 10690 10691 free_excluded_extents(cache); 10692 10693 #ifdef CONFIG_BTRFS_DEBUG 10694 if (btrfs_should_fragment_free_space(cache)) { 10695 u64 new_bytes_used = size - bytes_used; 10696 10697 bytes_used += new_bytes_used >> 1; 10698 fragment_free_space(cache); 10699 } 10700 #endif 10701 /* 10702 * Ensure the corresponding space_info object is created and 10703 * assigned to our block group. We want our bg to be added to the rbtree 10704 * with its ->space_info set. 10705 */ 10706 cache->space_info = __find_space_info(fs_info, cache->flags); 10707 ASSERT(cache->space_info); 10708 10709 ret = btrfs_add_block_group_cache(fs_info, cache); 10710 if (ret) { 10711 btrfs_remove_free_space_cache(cache); 10712 btrfs_put_block_group(cache); 10713 return ret; 10714 } 10715 10716 /* 10717 * Now that our block group has its ->space_info set and is inserted in 10718 * the rbtree, update the space info's counters. 
10719 */ 10720 trace_btrfs_add_block_group(fs_info, cache, 1); 10721 update_space_info(fs_info, cache->flags, size, bytes_used, 10722 cache->bytes_super, &cache->space_info); 10723 update_global_block_rsv(fs_info); 10724 10725 link_block_group(cache); 10726 10727 list_add_tail(&cache->bg_list, &trans->new_bgs); 10728 trans->delayed_ref_updates++; 10729 btrfs_update_delayed_refs_rsv(trans); 10730 10731 set_avail_alloc_bits(fs_info, type); 10732 return 0; 10733 } 10734 10735 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 10736 { 10737 u64 extra_flags = chunk_to_extended(flags) & 10738 BTRFS_EXTENDED_PROFILE_MASK; 10739 10740 write_seqlock(&fs_info->profiles_lock); 10741 if (flags & BTRFS_BLOCK_GROUP_DATA) 10742 fs_info->avail_data_alloc_bits &= ~extra_flags; 10743 if (flags & BTRFS_BLOCK_GROUP_METADATA) 10744 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 10745 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 10746 fs_info->avail_system_alloc_bits &= ~extra_flags; 10747 write_sequnlock(&fs_info->profiles_lock); 10748 } 10749 10750 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 10751 u64 group_start, struct extent_map *em) 10752 { 10753 struct btrfs_fs_info *fs_info = trans->fs_info; 10754 struct btrfs_root *root = fs_info->extent_root; 10755 struct btrfs_path *path; 10756 struct btrfs_block_group_cache *block_group; 10757 struct btrfs_free_cluster *cluster; 10758 struct btrfs_root *tree_root = fs_info->tree_root; 10759 struct btrfs_key key; 10760 struct inode *inode; 10761 struct kobject *kobj = NULL; 10762 int ret; 10763 int index; 10764 int factor; 10765 struct btrfs_caching_control *caching_ctl = NULL; 10766 bool remove_em; 10767 bool remove_rsv = false; 10768 10769 block_group = btrfs_lookup_block_group(fs_info, group_start); 10770 BUG_ON(!block_group); 10771 BUG_ON(!block_group->ro); 10772 10773 trace_btrfs_remove_block_group(block_group); 10774 /* 10775 * Free the reserved super bytes from this block group before 10776 * removing it.
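 * (these are the ranges that exclude_super_stripes() set aside when the block group was created or first read in)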
10777 */ 10778 free_excluded_extents(block_group); 10779 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, 10780 block_group->key.offset); 10781 10782 memcpy(&key, &block_group->key, sizeof(key)); 10783 index = btrfs_bg_flags_to_raid_index(block_group->flags); 10784 factor = btrfs_bg_type_to_factor(block_group->flags); 10785 10786 /* make sure this block group isn't part of an allocation cluster */ 10787 cluster = &fs_info->data_alloc_cluster; 10788 spin_lock(&cluster->refill_lock); 10789 btrfs_return_cluster_to_free_space(block_group, cluster); 10790 spin_unlock(&cluster->refill_lock); 10791 10792 /* 10793 * make sure this block group isn't part of a metadata 10794 * allocation cluster 10795 */ 10796 cluster = &fs_info->meta_alloc_cluster; 10797 spin_lock(&cluster->refill_lock); 10798 btrfs_return_cluster_to_free_space(block_group, cluster); 10799 spin_unlock(&cluster->refill_lock); 10800 10801 path = btrfs_alloc_path(); 10802 if (!path) { 10803 ret = -ENOMEM; 10804 goto out; 10805 } 10806 10807 /* 10808 * get the inode first so any iput calls done for the io_list 10809 * aren't the final iput (no unlinks allowed now) 10810 */ 10811 inode = lookup_free_space_inode(fs_info, block_group, path); 10812 10813 mutex_lock(&trans->transaction->cache_write_mutex); 10814 /* 10815 * Make sure our free space cache IO is done before removing the 10816 * free space inode 10817 */ 10818 spin_lock(&trans->transaction->dirty_bgs_lock); 10819 if (!list_empty(&block_group->io_list)) { 10820 list_del_init(&block_group->io_list); 10821 10822 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 10823 10824 spin_unlock(&trans->transaction->dirty_bgs_lock); 10825 btrfs_wait_cache_io(trans, block_group, path); 10826 btrfs_put_block_group(block_group); 10827 spin_lock(&trans->transaction->dirty_bgs_lock); 10828 } 10829 10830 if (!list_empty(&block_group->dirty_list)) { 10831 list_del_init(&block_group->dirty_list); 10832 remove_rsv = true; 10833 btrfs_put_block_group(block_group); 10834 } 10835 spin_unlock(&trans->transaction->dirty_bgs_lock); 10836 mutex_unlock(&trans->transaction->cache_write_mutex); 10837 10838 if (!IS_ERR(inode)) { 10839 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10840 if (ret) { 10841 btrfs_add_delayed_iput(inode); 10842 goto out; 10843 } 10844 clear_nlink(inode); 10845 /* One for the block groups ref */ 10846 spin_lock(&block_group->lock); 10847 if (block_group->iref) { 10848 block_group->iref = 0; 10849 block_group->inode = NULL; 10850 spin_unlock(&block_group->lock); 10851 iput(inode); 10852 } else { 10853 spin_unlock(&block_group->lock); 10854 } 10855 /* One for our lookup ref */ 10856 btrfs_add_delayed_iput(inode); 10857 } 10858 10859 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 10860 key.offset = block_group->key.objectid; 10861 key.type = 0; 10862 10863 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 10864 if (ret < 0) 10865 goto out; 10866 if (ret > 0) 10867 btrfs_release_path(path); 10868 if (ret == 0) { 10869 ret = btrfs_del_item(trans, tree_root, path); 10870 if (ret) 10871 goto out; 10872 btrfs_release_path(path); 10873 } 10874 10875 spin_lock(&fs_info->block_group_cache_lock); 10876 rb_erase(&block_group->cache_node, 10877 &fs_info->block_group_cache_tree); 10878 RB_CLEAR_NODE(&block_group->cache_node); 10879 10880 if (fs_info->first_logical_byte == block_group->key.objectid) 10881 fs_info->first_logical_byte = (u64)-1; 10882 spin_unlock(&fs_info->block_group_cache_lock); 10883 10884 down_write(&block_group->space_info->groups_sem); 10885 /* 
10886 * we must use list_del_init so people can check to see if they 10887 * are still on the list after taking the semaphore 10888 */ 10889 list_del_init(&block_group->list); 10890 if (list_empty(&block_group->space_info->block_groups[index])) { 10891 kobj = block_group->space_info->block_group_kobjs[index]; 10892 block_group->space_info->block_group_kobjs[index] = NULL; 10893 clear_avail_alloc_bits(fs_info, block_group->flags); 10894 } 10895 up_write(&block_group->space_info->groups_sem); 10896 if (kobj) { 10897 kobject_del(kobj); 10898 kobject_put(kobj); 10899 } 10900 10901 if (block_group->has_caching_ctl) 10902 caching_ctl = get_caching_control(block_group); 10903 if (block_group->cached == BTRFS_CACHE_STARTED) 10904 wait_block_group_cache_done(block_group); 10905 if (block_group->has_caching_ctl) { 10906 down_write(&fs_info->commit_root_sem); 10907 if (!caching_ctl) { 10908 struct btrfs_caching_control *ctl; 10909 10910 list_for_each_entry(ctl, 10911 &fs_info->caching_block_groups, list) 10912 if (ctl->block_group == block_group) { 10913 caching_ctl = ctl; 10914 refcount_inc(&caching_ctl->count); 10915 break; 10916 } 10917 } 10918 if (caching_ctl) 10919 list_del_init(&caching_ctl->list); 10920 up_write(&fs_info->commit_root_sem); 10921 if (caching_ctl) { 10922 /* Once for the caching bgs list and once for us. */ 10923 put_caching_control(caching_ctl); 10924 put_caching_control(caching_ctl); 10925 } 10926 } 10927 10928 spin_lock(&trans->transaction->dirty_bgs_lock); 10929 WARN_ON(!list_empty(&block_group->dirty_list)); 10930 WARN_ON(!list_empty(&block_group->io_list)); 10931 spin_unlock(&trans->transaction->dirty_bgs_lock); 10932 10933 btrfs_remove_free_space_cache(block_group); 10934 10935 spin_lock(&block_group->space_info->lock); 10936 list_del_init(&block_group->ro_list); 10937 10938 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 10939 WARN_ON(block_group->space_info->total_bytes 10940 < block_group->key.offset); 10941 WARN_ON(block_group->space_info->bytes_readonly 10942 < block_group->key.offset); 10943 WARN_ON(block_group->space_info->disk_total 10944 < block_group->key.offset * factor); 10945 } 10946 block_group->space_info->total_bytes -= block_group->key.offset; 10947 block_group->space_info->bytes_readonly -= block_group->key.offset; 10948 block_group->space_info->disk_total -= block_group->key.offset * factor; 10949 10950 spin_unlock(&block_group->space_info->lock); 10951 10952 memcpy(&key, &block_group->key, sizeof(key)); 10953 10954 mutex_lock(&fs_info->chunk_mutex); 10955 if (!list_empty(&em->list)) { 10956 /* We're in the transaction->pending_chunks list. */ 10957 free_extent_map(em); 10958 } 10959 spin_lock(&block_group->lock); 10960 block_group->removed = 1; 10961 /* 10962 * At this point trimming can't start on this block group, because we 10963 * removed the block group from the tree fs_info->block_group_cache_tree 10964 * so no one can't find it anymore and even if someone already got this 10965 * block group before we removed it from the rbtree, they have already 10966 * incremented block_group->trimming - if they didn't, they won't find 10967 * any free space entries because we already removed them all when we 10968 * called btrfs_remove_free_space_cache(). 10969 * 10970 * And we must not remove the extent map from the fs_info->mapping_tree 10971 * to prevent the same logical address range and physical device space 10972 * ranges from being reused for a new block group. 
This is because our 10973 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10974 * completely transactionless, so while it is trimming a range the 10975 * currently running transaction might finish and a new one start, 10976 * allowing for new block groups to be created that can reuse the same 10977 * physical device locations unless we take this special care. 10978 * 10979 * There may also be an implicit trim operation if the file system 10980 * is mounted with -odiscard. The same protections must remain 10981 * in place until the extents have been discarded completely when 10982 * the transaction commit has completed. 10983 */ 10984 remove_em = (atomic_read(&block_group->trimming) == 0); 10985 /* 10986 * Make sure a trimmer task always sees the em in the pinned_chunks list 10987 * if it sees block_group->removed == 1 (needs to lock block_group->lock 10988 * before checking block_group->removed). 10989 */ 10990 if (!remove_em) { 10991 /* 10992 * Our em might be in trans->transaction->pending_chunks which 10993 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 10994 * and so is the fs_info->pinned_chunks list. 10995 * 10996 * So at this point we must be holding the chunk_mutex to avoid 10997 * any races with chunk allocation (more specifically at 10998 * volumes.c:contains_pending_extent()), to ensure it always 10999 * sees the em, either in the pending_chunks list or in the 11000 * pinned_chunks list. 11001 */ 11002 list_move_tail(&em->list, &fs_info->pinned_chunks); 11003 } 11004 spin_unlock(&block_group->lock); 11005 11006 if (remove_em) { 11007 struct extent_map_tree *em_tree; 11008 11009 em_tree = &fs_info->mapping_tree.map_tree; 11010 write_lock(&em_tree->lock); 11011 /* 11012 * The em might be in the pending_chunks list, so make sure the 11013 * chunk mutex is locked, since remove_extent_mapping() will 11014 * delete us from that list. 11015 */ 11016 remove_extent_mapping(em_tree, em); 11017 write_unlock(&em_tree->lock); 11018 /* once for the tree */ 11019 free_extent_map(em); 11020 } 11021 11022 mutex_unlock(&fs_info->chunk_mutex); 11023 11024 ret = remove_block_group_free_space(trans, block_group); 11025 if (ret) 11026 goto out; 11027 11028 btrfs_put_block_group(block_group); 11029 btrfs_put_block_group(block_group); 11030 11031 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 11032 if (ret > 0) 11033 ret = -EIO; 11034 if (ret < 0) 11035 goto out; 11036 11037 ret = btrfs_del_item(trans, root, path); 11038 out: 11039 if (remove_rsv) 11040 btrfs_delayed_refs_rsv_release(fs_info, 1); 11041 btrfs_free_path(path); 11042 return ret; 11043 } 11044 11045 struct btrfs_trans_handle * 11046 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 11047 const u64 chunk_offset) 11048 { 11049 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 11050 struct extent_map *em; 11051 struct map_lookup *map; 11052 unsigned int num_items; 11053 11054 read_lock(&em_tree->lock); 11055 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 11056 read_unlock(&em_tree->lock); 11057 ASSERT(em && em->start == chunk_offset); 11058 11059 /* 11060 * We need to reserve 3 + N units from the metadata space info in order 11061 * to remove a block group (done at btrfs_remove_chunk() and at 11062 * btrfs_remove_block_group()), which are used for: 11063 * 11064 * 1 unit for adding the free space inode's orphan (located in the tree 11065 * of tree roots). 11066 * 1 unit for deleting the block group item (located in the extent 11067 * tree). 
11068 * 1 unit for deleting the free space item (located in tree of tree 11069 * roots). 11070 * N units for deleting N device extent items corresponding to each 11071 * stripe (located in the device tree). 11072 * 11073 * In order to remove a block group we also need to reserve units in the 11074 * system space info in order to update the chunk tree (update one or 11075 * more device items and remove one chunk item), but this is done at 11076 * btrfs_remove_chunk() through a call to check_system_chunk(). 11077 */ 11078 map = em->map_lookup; 11079 num_items = 3 + map->num_stripes; 11080 free_extent_map(em); 11081 11082 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 11083 num_items, 1); 11084 } 11085 11086 /* 11087 * Process the unused_bgs list and remove any that don't have any allocated 11088 * space inside of them. 11089 */ 11090 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 11091 { 11092 struct btrfs_block_group_cache *block_group; 11093 struct btrfs_space_info *space_info; 11094 struct btrfs_trans_handle *trans; 11095 int ret = 0; 11096 11097 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 11098 return; 11099 11100 spin_lock(&fs_info->unused_bgs_lock); 11101 while (!list_empty(&fs_info->unused_bgs)) { 11102 u64 start, end; 11103 int trimming; 11104 11105 block_group = list_first_entry(&fs_info->unused_bgs, 11106 struct btrfs_block_group_cache, 11107 bg_list); 11108 list_del_init(&block_group->bg_list); 11109 11110 space_info = block_group->space_info; 11111 11112 if (ret || btrfs_mixed_space_info(space_info)) { 11113 btrfs_put_block_group(block_group); 11114 continue; 11115 } 11116 spin_unlock(&fs_info->unused_bgs_lock); 11117 11118 mutex_lock(&fs_info->delete_unused_bgs_mutex); 11119 11120 /* Don't want to race with allocators so take the groups_sem */ 11121 down_write(&space_info->groups_sem); 11122 spin_lock(&block_group->lock); 11123 if (block_group->reserved || block_group->pinned || 11124 btrfs_block_group_used(&block_group->item) || 11125 block_group->ro || 11126 list_is_singular(&block_group->list)) { 11127 /* 11128 * We want to bail if we made new allocations or have 11129 * outstanding allocations in this block group. We do 11130 * the ro check in case balance is currently acting on 11131 * this block group. 11132 */ 11133 trace_btrfs_skip_unused_block_group(block_group); 11134 spin_unlock(&block_group->lock); 11135 up_write(&space_info->groups_sem); 11136 goto next; 11137 } 11138 spin_unlock(&block_group->lock); 11139 11140 /* We don't want to force the issue, only flip if it's ok. */ 11141 ret = inc_block_group_ro(block_group, 0); 11142 up_write(&space_info->groups_sem); 11143 if (ret < 0) { 11144 ret = 0; 11145 goto next; 11146 } 11147 11148 /* 11149 * Want to do this before we do anything else so we can recover 11150 * properly if we fail to join the transaction. 11151 */ 11152 trans = btrfs_start_trans_remove_block_group(fs_info, 11153 block_group->key.objectid); 11154 if (IS_ERR(trans)) { 11155 btrfs_dec_block_group_ro(block_group); 11156 ret = PTR_ERR(trans); 11157 goto next; 11158 } 11159 11160 /* 11161 * We could have pending pinned extents for this block group, 11162 * just delete them, we don't care about them anymore. 11163 */ 11164 start = block_group->key.objectid; 11165 end = start + block_group->key.offset - 1; 11166 /* 11167 * Hold the unused_bg_unpin_mutex lock to avoid racing with 11168 * btrfs_finish_extent_commit(). 
If we are at transaction N, 11169 * another task might be running finish_extent_commit() for the 11170 * previous transaction N - 1, and have seen a range belonging 11171 * to the block group in freed_extents[] before we were able to 11172 * clear the whole block group range from freed_extents[]. This 11173 * means that task can lookup for the block group after we 11174 * unpinned it from freed_extents[] and removed it, leading to 11175 * a BUG_ON() at btrfs_unpin_extent_range(). 11176 */ 11177 mutex_lock(&fs_info->unused_bg_unpin_mutex); 11178 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 11179 EXTENT_DIRTY); 11180 if (ret) { 11181 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11182 btrfs_dec_block_group_ro(block_group); 11183 goto end_trans; 11184 } 11185 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 11186 EXTENT_DIRTY); 11187 if (ret) { 11188 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11189 btrfs_dec_block_group_ro(block_group); 11190 goto end_trans; 11191 } 11192 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11193 11194 /* Reset pinned so btrfs_put_block_group doesn't complain */ 11195 spin_lock(&space_info->lock); 11196 spin_lock(&block_group->lock); 11197 11198 update_bytes_pinned(space_info, -block_group->pinned); 11199 space_info->bytes_readonly += block_group->pinned; 11200 percpu_counter_add_batch(&space_info->total_bytes_pinned, 11201 -block_group->pinned, 11202 BTRFS_TOTAL_BYTES_PINNED_BATCH); 11203 block_group->pinned = 0; 11204 11205 spin_unlock(&block_group->lock); 11206 spin_unlock(&space_info->lock); 11207 11208 /* DISCARD can flip during remount */ 11209 trimming = btrfs_test_opt(fs_info, DISCARD); 11210 11211 /* Implicit trim during transaction commit. */ 11212 if (trimming) 11213 btrfs_get_block_group_trimming(block_group); 11214 11215 /* 11216 * Btrfs_remove_chunk will abort the transaction if things go 11217 * horribly wrong. 11218 */ 11219 ret = btrfs_remove_chunk(trans, block_group->key.objectid); 11220 11221 if (ret) { 11222 if (trimming) 11223 btrfs_put_block_group_trimming(block_group); 11224 goto end_trans; 11225 } 11226 11227 /* 11228 * If we're not mounted with -odiscard, we can just forget 11229 * about this block group. Otherwise we'll need to wait 11230 * until transaction commit to do the actual discard. 11231 */ 11232 if (trimming) { 11233 spin_lock(&fs_info->unused_bgs_lock); 11234 /* 11235 * A concurrent scrub might have added us to the list 11236 * fs_info->unused_bgs, so use a list_move operation 11237 * to add the block group to the deleted_bgs list. 

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
        int mixed = 0;
        int ret;

        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return -EINVAL;

        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        flags = BTRFS_BLOCK_GROUP_SYSTEM;
        ret = create_space_info(fs_info, flags);
        if (ret)
                goto out;

        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
                ret = create_space_info(fs_info, flags);
                if (ret)
                        goto out;

                flags = BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        }
out:
        return ret;
}
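
/*
 * For example, a filesystem with the MIXED_GROUPS incompat feature set ends
 * up with two space infos here (SYSTEM and METADATA|DATA), while a regular
 * filesystem gets three (SYSTEM, METADATA and DATA).
 */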

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
                                   u64 start, u64 end)
{
        return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device,
                                   u64 minlen, u64 *trimmed)
{
        u64 start = 0, len = 0;
        int ret;

        *trimmed = 0;

        /* Discard not supported = nothing to do. */
        if (!blk_queue_discard(bdev_get_queue(device->bdev)))
                return 0;

        /* Not writable = nothing to do. */
        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
                return 0;

        /* No free space = nothing to do. */
        if (device->total_bytes <= device->bytes_used)
                return 0;

        ret = 0;

        while (1) {
                struct btrfs_fs_info *fs_info = device->fs_info;
                struct btrfs_transaction *trans;
                u64 bytes;

                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
                if (ret)
                        break;

                ret = down_read_killable(&fs_info->commit_root_sem);
                if (ret) {
                        mutex_unlock(&fs_info->chunk_mutex);
                        break;
                }

                spin_lock(&fs_info->trans_lock);
                trans = fs_info->running_transaction;
                if (trans)
                        refcount_inc(&trans->use_count);
                spin_unlock(&fs_info->trans_lock);

                if (!trans)
                        up_read(&fs_info->commit_root_sem);

                ret = find_free_dev_extent_start(trans, device, minlen, start,
                                                 &start, &len);
                if (trans) {
                        up_read(&fs_info->commit_root_sem);
                        btrfs_put_transaction(trans);
                }

                if (ret) {
                        mutex_unlock(&fs_info->chunk_mutex);
                        if (ret == -ENOSPC)
                                ret = 0;
                        break;
                }

                ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
                mutex_unlock(&fs_info->chunk_mutex);

                if (ret)
                        break;

                start += len;
                *trimmed += bytes;

                if (fatal_signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }

                cond_resched();
        }

        return ret;
}
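
/*
 * Note that this only discards space that is not allocated to any chunk;
 * free space inside block groups is trimmed by btrfs_trim_block_group() from
 * btrfs_trim_fs() below.  An -ENOSPC from find_free_dev_extent_start() just
 * means no suitable unallocated extent is left, which is why the loop above
 * treats it as success.
 */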

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_device *device;
        struct list_head *devices;
        u64 group_trimmed;
        u64 start;
        u64 end;
        u64 trimmed = 0;
        u64 bg_failed = 0;
        u64 dev_failed = 0;
        int bg_ret = 0;
        int dev_ret = 0;
        int ret = 0;

        cache = btrfs_lookup_first_block_group(fs_info, range->start);
        for (; cache; cache = next_block_group(fs_info, cache)) {
                if (cache->key.objectid >= (range->start + range->len)) {
                        btrfs_put_block_group(cache);
                        break;
                }

                start = max(range->start, cache->key.objectid);
                end = min(range->start + range->len,
                          cache->key.objectid + cache->key.offset);

                if (end - start >= range->minlen) {
                        if (!block_group_cache_done(cache)) {
                                ret = cache_block_group(cache, 0);
                                if (ret) {
                                        bg_failed++;
                                        bg_ret = ret;
                                        continue;
                                }
                                ret = wait_block_group_cache_done(cache);
                                if (ret) {
                                        bg_failed++;
                                        bg_ret = ret;
                                        continue;
                                }
                        }
                        ret = btrfs_trim_block_group(cache,
                                                     &group_trimmed,
                                                     start,
                                                     end,
                                                     range->minlen);

                        trimmed += group_trimmed;
                        if (ret) {
                                bg_failed++;
                                bg_ret = ret;
                                continue;
                        }
                }
        }

        if (bg_failed)
                btrfs_warn(fs_info,
                        "failed to trim %llu block group(s), last error %d",
                        bg_failed, bg_ret);
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        devices = &fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
                ret = btrfs_trim_free_extents(device, range->minlen,
                                              &group_trimmed);
                if (ret) {
                        dev_failed++;
                        dev_ret = ret;
                        break;
                }

                trimmed += group_trimmed;
        }
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        if (dev_failed)
                btrfs_warn(fs_info,
                        "failed to trim %llu device(s), last error %d",
                        dev_failed, dev_ret);
        range->len = trimmed;
        if (bg_ret)
                return bg_ret;
        return dev_ret;
}

/*
 * btrfs_{start,end}_write_no_snapshotting() are similar to
 * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted
 * and then flushing it to disk only after the snapshot creation, or to
 * prevent operations while snapshotting is ongoing that would cause the
 * snapshot to be inconsistent (writes followed by expanding truncates, for
 * example).
 */
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
        percpu_counter_dec(&root->subv_writers->counter);
        cond_wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
        if (atomic_read(&root->will_be_snapshotted))
                return 0;

        percpu_counter_inc(&root->subv_writers->counter);
        /*
         * Make sure counter is updated before we check for snapshot creation.
         */
        smp_mb();
        if (atomic_read(&root->will_be_snapshotted)) {
                btrfs_end_write_no_snapshotting(root);
                return 0;
        }
        return 1;
}

void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
        while (true) {
                int ret;

                ret = btrfs_start_write_no_snapshotting(root);
                if (ret)
                        break;
                wait_var_event(&root->will_be_snapshotted,
                               !atomic_read(&root->will_be_snapshotted));
        }
}

void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;

        spin_lock(&fs_info->unused_bgs_lock);
        if (list_empty(&bg->bg_list)) {
                btrfs_get_block_group(bg);
                trace_btrfs_add_unused_block_group(bg);
                list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
}
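
/*
 * btrfs_mark_bg_unused() above is how block groups normally end up on
 * fs_info->unused_bgs (typically when their used byte count drops to zero);
 * btrfs_delete_unused_bgs() then re-checks each candidate under the proper
 * locks before removing it, since the group may have gained new allocations
 * in the meantime.
 */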