// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Declare a helper function to detect underflow of various space info members
 */
#define DECLARE_SPACE_INFO_UPDATE(name)					\
static inline void update_##name(struct btrfs_space_info *sinfo,	\
				 s64 bytes)				\
{									\
	if (bytes < 0 && sinfo->name < -bytes) {			\
		WARN_ON(1);						\
		sinfo->name = 0;					\
		return;							\
	}								\
	sinfo->name += bytes;						\
}

DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
DECLARE_SPACE_INFO_UPDATE(bytes_pinned);

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int
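/*
 * For reference, DECLARE_SPACE_INFO_UPDATE(bytes_may_use) above expands to
 * roughly the following underflow-checking helper (sketch only, with the
 * member name substituted by the preprocessor):
 *
 *	static inline void update_bytes_may_use(struct btrfs_space_info *sinfo,
 *						s64 bytes)
 *	{
 *		if (bytes < 0 && sinfo->bytes_may_use < -bytes) {
 *			WARN_ON(1);
 *			sinfo->bytes_may_use = 0;
 *			return;
 *		}
 *		sinfo->bytes_may_use += bytes;
 *	}
 *
 * A caller releasing a reservation would then do
 * update_bytes_may_use(sinfo, -num_bytes); a negative delta larger than the
 * current value trips the WARN_ON() and clamps the counter to zero instead
 * of letting the u64 member wrap.
 */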
block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 112 { 113 return (cache->flags & bits) == bits; 114 } 115 116 void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 117 { 118 atomic_inc(&cache->count); 119 } 120 121 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 122 { 123 if (atomic_dec_and_test(&cache->count)) { 124 WARN_ON(cache->pinned > 0); 125 WARN_ON(cache->reserved > 0); 126 127 /* 128 * If not empty, someone is still holding mutex of 129 * full_stripe_lock, which can only be released by caller. 130 * And it will definitely cause use-after-free when caller 131 * tries to release full stripe lock. 132 * 133 * No better way to resolve, but only to warn. 134 */ 135 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); 136 kfree(cache->free_space_ctl); 137 kfree(cache); 138 } 139 } 140 141 /* 142 * this adds the block group to the fs_info rb tree for the block group 143 * cache 144 */ 145 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 146 struct btrfs_block_group_cache *block_group) 147 { 148 struct rb_node **p; 149 struct rb_node *parent = NULL; 150 struct btrfs_block_group_cache *cache; 151 152 spin_lock(&info->block_group_cache_lock); 153 p = &info->block_group_cache_tree.rb_node; 154 155 while (*p) { 156 parent = *p; 157 cache = rb_entry(parent, struct btrfs_block_group_cache, 158 cache_node); 159 if (block_group->key.objectid < cache->key.objectid) { 160 p = &(*p)->rb_left; 161 } else if (block_group->key.objectid > cache->key.objectid) { 162 p = &(*p)->rb_right; 163 } else { 164 spin_unlock(&info->block_group_cache_lock); 165 return -EEXIST; 166 } 167 } 168 169 rb_link_node(&block_group->cache_node, parent, p); 170 rb_insert_color(&block_group->cache_node, 171 &info->block_group_cache_tree); 172 173 if (info->first_logical_byte > block_group->key.objectid) 174 info->first_logical_byte = block_group->key.objectid; 175 176 spin_unlock(&info->block_group_cache_lock); 177 178 return 0; 179 } 180 181 /* 182 * This will return the block group at or after bytenr if contains is 0, else 183 * it will return the block group that contains the bytenr 184 */ 185 static struct btrfs_block_group_cache * 186 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 187 int contains) 188 { 189 struct btrfs_block_group_cache *cache, *ret = NULL; 190 struct rb_node *n; 191 u64 end, start; 192 193 spin_lock(&info->block_group_cache_lock); 194 n = info->block_group_cache_tree.rb_node; 195 196 while (n) { 197 cache = rb_entry(n, struct btrfs_block_group_cache, 198 cache_node); 199 end = cache->key.objectid + cache->key.offset - 1; 200 start = cache->key.objectid; 201 202 if (bytenr < start) { 203 if (!contains && (!ret || start < ret->key.objectid)) 204 ret = cache; 205 n = n->rb_left; 206 } else if (bytenr > start) { 207 if (contains && bytenr <= end) { 208 ret = cache; 209 break; 210 } 211 n = n->rb_right; 212 } else { 213 ret = cache; 214 break; 215 } 216 } 217 if (ret) { 218 btrfs_get_block_group(ret); 219 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 220 info->first_logical_byte = ret->key.objectid; 221 } 222 spin_unlock(&info->block_group_cache_lock); 223 224 return ret; 225 } 226 227 static int add_excluded_extent(struct btrfs_fs_info *fs_info, 228 u64 start, u64 num_bytes) 229 { 230 u64 end = start + num_bytes - 1; 231 set_extent_bits(&fs_info->freed_extents[0], 232 start, end, EXTENT_UPTODATE); 233 set_extent_bits(&fs_info->freed_extents[1], 234 start, end, EXTENT_UPTODATE); 235 return 0; 
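/*
 * Typical use of the block group cache above (illustrative sketch only):
 * the tree search helpers take a reference on the group they return, so a
 * caller pairs the lookup (btrfs_lookup_block_group() further down wraps
 * block_group_cache_tree_search()) with btrfs_put_block_group():
 *
 *	struct btrfs_block_group_cache *bg;
 *
 *	bg = btrfs_lookup_block_group(fs_info, bytenr);
 *	if (bg) {
 *		... use bg->key.objectid and bg->key.offset ...
 *		btrfs_put_block_group(bg);
 *	}
 */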
236 } 237 238 static void free_excluded_extents(struct btrfs_block_group_cache *cache) 239 { 240 struct btrfs_fs_info *fs_info = cache->fs_info; 241 u64 start, end; 242 243 start = cache->key.objectid; 244 end = start + cache->key.offset - 1; 245 246 clear_extent_bits(&fs_info->freed_extents[0], 247 start, end, EXTENT_UPTODATE); 248 clear_extent_bits(&fs_info->freed_extents[1], 249 start, end, EXTENT_UPTODATE); 250 } 251 252 static int exclude_super_stripes(struct btrfs_block_group_cache *cache) 253 { 254 struct btrfs_fs_info *fs_info = cache->fs_info; 255 u64 bytenr; 256 u64 *logical; 257 int stripe_len; 258 int i, nr, ret; 259 260 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 261 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 262 cache->bytes_super += stripe_len; 263 ret = add_excluded_extent(fs_info, cache->key.objectid, 264 stripe_len); 265 if (ret) 266 return ret; 267 } 268 269 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 270 bytenr = btrfs_sb_offset(i); 271 ret = btrfs_rmap_block(fs_info, cache->key.objectid, 272 bytenr, &logical, &nr, &stripe_len); 273 if (ret) 274 return ret; 275 276 while (nr--) { 277 u64 start, len; 278 279 if (logical[nr] > cache->key.objectid + 280 cache->key.offset) 281 continue; 282 283 if (logical[nr] + stripe_len <= cache->key.objectid) 284 continue; 285 286 start = logical[nr]; 287 if (start < cache->key.objectid) { 288 start = cache->key.objectid; 289 len = (logical[nr] + stripe_len) - start; 290 } else { 291 len = min_t(u64, stripe_len, 292 cache->key.objectid + 293 cache->key.offset - start); 294 } 295 296 cache->bytes_super += len; 297 ret = add_excluded_extent(fs_info, start, len); 298 if (ret) { 299 kfree(logical); 300 return ret; 301 } 302 } 303 304 kfree(logical); 305 } 306 return 0; 307 } 308 309 static struct btrfs_caching_control * 310 get_caching_control(struct btrfs_block_group_cache *cache) 311 { 312 struct btrfs_caching_control *ctl; 313 314 spin_lock(&cache->lock); 315 if (!cache->caching_ctl) { 316 spin_unlock(&cache->lock); 317 return NULL; 318 } 319 320 ctl = cache->caching_ctl; 321 refcount_inc(&ctl->count); 322 spin_unlock(&cache->lock); 323 return ctl; 324 } 325 326 static void put_caching_control(struct btrfs_caching_control *ctl) 327 { 328 if (refcount_dec_and_test(&ctl->count)) 329 kfree(ctl); 330 } 331 332 #ifdef CONFIG_BTRFS_DEBUG 333 static void fragment_free_space(struct btrfs_block_group_cache *block_group) 334 { 335 struct btrfs_fs_info *fs_info = block_group->fs_info; 336 u64 start = block_group->key.objectid; 337 u64 len = block_group->key.offset; 338 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 339 fs_info->nodesize : fs_info->sectorsize; 340 u64 step = chunk << 1; 341 342 while (len > chunk) { 343 btrfs_remove_free_space(block_group, start, chunk); 344 start += step; 345 if (len < step) 346 len = 0; 347 else 348 len -= step; 349 } 350 } 351 #endif 352 353 /* 354 * this is only called by cache_block_group, since we could have freed extents 355 * we need to check the pinned_extents for any extents that can't be used yet 356 * since their free space will be released as soon as the transaction commits. 
357 */ 358 u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 359 u64 start, u64 end) 360 { 361 struct btrfs_fs_info *info = block_group->fs_info; 362 u64 extent_start, extent_end, size, total_added = 0; 363 int ret; 364 365 while (start < end) { 366 ret = find_first_extent_bit(info->pinned_extents, start, 367 &extent_start, &extent_end, 368 EXTENT_DIRTY | EXTENT_UPTODATE, 369 NULL); 370 if (ret) 371 break; 372 373 if (extent_start <= start) { 374 start = extent_end + 1; 375 } else if (extent_start > start && extent_start < end) { 376 size = extent_start - start; 377 total_added += size; 378 ret = btrfs_add_free_space(block_group, start, 379 size); 380 BUG_ON(ret); /* -ENOMEM or logic error */ 381 start = extent_end + 1; 382 } else { 383 break; 384 } 385 } 386 387 if (start < end) { 388 size = end - start; 389 total_added += size; 390 ret = btrfs_add_free_space(block_group, start, size); 391 BUG_ON(ret); /* -ENOMEM or logic error */ 392 } 393 394 return total_added; 395 } 396 397 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 398 { 399 struct btrfs_block_group_cache *block_group = caching_ctl->block_group; 400 struct btrfs_fs_info *fs_info = block_group->fs_info; 401 struct btrfs_root *extent_root = fs_info->extent_root; 402 struct btrfs_path *path; 403 struct extent_buffer *leaf; 404 struct btrfs_key key; 405 u64 total_found = 0; 406 u64 last = 0; 407 u32 nritems; 408 int ret; 409 bool wakeup = true; 410 411 path = btrfs_alloc_path(); 412 if (!path) 413 return -ENOMEM; 414 415 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 416 417 #ifdef CONFIG_BTRFS_DEBUG 418 /* 419 * If we're fragmenting we don't want to make anybody think we can 420 * allocate from this block group until we've had a chance to fragment 421 * the free space. 422 */ 423 if (btrfs_should_fragment_free_space(block_group)) 424 wakeup = false; 425 #endif 426 /* 427 * We don't want to deadlock with somebody trying to allocate a new 428 * extent for the extent root while also trying to search the extent 429 * root to add free space. 
So we skip locking and search the commit 430 * root, since its read-only 431 */ 432 path->skip_locking = 1; 433 path->search_commit_root = 1; 434 path->reada = READA_FORWARD; 435 436 key.objectid = last; 437 key.offset = 0; 438 key.type = BTRFS_EXTENT_ITEM_KEY; 439 440 next: 441 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 442 if (ret < 0) 443 goto out; 444 445 leaf = path->nodes[0]; 446 nritems = btrfs_header_nritems(leaf); 447 448 while (1) { 449 if (btrfs_fs_closing(fs_info) > 1) { 450 last = (u64)-1; 451 break; 452 } 453 454 if (path->slots[0] < nritems) { 455 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 456 } else { 457 ret = find_next_key(path, 0, &key); 458 if (ret) 459 break; 460 461 if (need_resched() || 462 rwsem_is_contended(&fs_info->commit_root_sem)) { 463 if (wakeup) 464 caching_ctl->progress = last; 465 btrfs_release_path(path); 466 up_read(&fs_info->commit_root_sem); 467 mutex_unlock(&caching_ctl->mutex); 468 cond_resched(); 469 mutex_lock(&caching_ctl->mutex); 470 down_read(&fs_info->commit_root_sem); 471 goto next; 472 } 473 474 ret = btrfs_next_leaf(extent_root, path); 475 if (ret < 0) 476 goto out; 477 if (ret) 478 break; 479 leaf = path->nodes[0]; 480 nritems = btrfs_header_nritems(leaf); 481 continue; 482 } 483 484 if (key.objectid < last) { 485 key.objectid = last; 486 key.offset = 0; 487 key.type = BTRFS_EXTENT_ITEM_KEY; 488 489 if (wakeup) 490 caching_ctl->progress = last; 491 btrfs_release_path(path); 492 goto next; 493 } 494 495 if (key.objectid < block_group->key.objectid) { 496 path->slots[0]++; 497 continue; 498 } 499 500 if (key.objectid >= block_group->key.objectid + 501 block_group->key.offset) 502 break; 503 504 if (key.type == BTRFS_EXTENT_ITEM_KEY || 505 key.type == BTRFS_METADATA_ITEM_KEY) { 506 total_found += add_new_free_space(block_group, last, 507 key.objectid); 508 if (key.type == BTRFS_METADATA_ITEM_KEY) 509 last = key.objectid + 510 fs_info->nodesize; 511 else 512 last = key.objectid + key.offset; 513 514 if (total_found > CACHING_CTL_WAKE_UP) { 515 total_found = 0; 516 if (wakeup) 517 wake_up(&caching_ctl->wait); 518 } 519 } 520 path->slots[0]++; 521 } 522 ret = 0; 523 524 total_found += add_new_free_space(block_group, last, 525 block_group->key.objectid + 526 block_group->key.offset); 527 caching_ctl->progress = (u64)-1; 528 529 out: 530 btrfs_free_path(path); 531 return ret; 532 } 533 534 static noinline void caching_thread(struct btrfs_work *work) 535 { 536 struct btrfs_block_group_cache *block_group; 537 struct btrfs_fs_info *fs_info; 538 struct btrfs_caching_control *caching_ctl; 539 int ret; 540 541 caching_ctl = container_of(work, struct btrfs_caching_control, work); 542 block_group = caching_ctl->block_group; 543 fs_info = block_group->fs_info; 544 545 mutex_lock(&caching_ctl->mutex); 546 down_read(&fs_info->commit_root_sem); 547 548 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 549 ret = load_free_space_tree(caching_ctl); 550 else 551 ret = load_extent_tree_free(caching_ctl); 552 553 spin_lock(&block_group->lock); 554 block_group->caching_ctl = NULL; 555 block_group->cached = ret ? 
					BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info. The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
618 */ 619 while (cache->cached == BTRFS_CACHE_FAST) { 620 struct btrfs_caching_control *ctl; 621 622 ctl = cache->caching_ctl; 623 refcount_inc(&ctl->count); 624 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 625 spin_unlock(&cache->lock); 626 627 schedule(); 628 629 finish_wait(&ctl->wait, &wait); 630 put_caching_control(ctl); 631 spin_lock(&cache->lock); 632 } 633 634 if (cache->cached != BTRFS_CACHE_NO) { 635 spin_unlock(&cache->lock); 636 kfree(caching_ctl); 637 return 0; 638 } 639 WARN_ON(cache->caching_ctl); 640 cache->caching_ctl = caching_ctl; 641 cache->cached = BTRFS_CACHE_FAST; 642 spin_unlock(&cache->lock); 643 644 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 645 mutex_lock(&caching_ctl->mutex); 646 ret = load_free_space_cache(fs_info, cache); 647 648 spin_lock(&cache->lock); 649 if (ret == 1) { 650 cache->caching_ctl = NULL; 651 cache->cached = BTRFS_CACHE_FINISHED; 652 cache->last_byte_to_unpin = (u64)-1; 653 caching_ctl->progress = (u64)-1; 654 } else { 655 if (load_cache_only) { 656 cache->caching_ctl = NULL; 657 cache->cached = BTRFS_CACHE_NO; 658 } else { 659 cache->cached = BTRFS_CACHE_STARTED; 660 cache->has_caching_ctl = 1; 661 } 662 } 663 spin_unlock(&cache->lock); 664 #ifdef CONFIG_BTRFS_DEBUG 665 if (ret == 1 && 666 btrfs_should_fragment_free_space(cache)) { 667 u64 bytes_used; 668 669 spin_lock(&cache->space_info->lock); 670 spin_lock(&cache->lock); 671 bytes_used = cache->key.offset - 672 btrfs_block_group_used(&cache->item); 673 cache->space_info->bytes_used += bytes_used >> 1; 674 spin_unlock(&cache->lock); 675 spin_unlock(&cache->space_info->lock); 676 fragment_free_space(cache); 677 } 678 #endif 679 mutex_unlock(&caching_ctl->mutex); 680 681 wake_up(&caching_ctl->wait); 682 if (ret == 1) { 683 put_caching_control(caching_ctl); 684 free_excluded_extents(cache); 685 return 0; 686 } 687 } else { 688 /* 689 * We're either using the free space tree or no caching at all. 690 * Set cached to the appropriate value and wakeup any waiters. 
691 */ 692 spin_lock(&cache->lock); 693 if (load_cache_only) { 694 cache->caching_ctl = NULL; 695 cache->cached = BTRFS_CACHE_NO; 696 } else { 697 cache->cached = BTRFS_CACHE_STARTED; 698 cache->has_caching_ctl = 1; 699 } 700 spin_unlock(&cache->lock); 701 wake_up(&caching_ctl->wait); 702 } 703 704 if (load_cache_only) { 705 put_caching_control(caching_ctl); 706 return 0; 707 } 708 709 down_write(&fs_info->commit_root_sem); 710 refcount_inc(&caching_ctl->count); 711 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 712 up_write(&fs_info->commit_root_sem); 713 714 btrfs_get_block_group(cache); 715 716 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 717 718 return ret; 719 } 720 721 /* 722 * return the block group that starts at or after bytenr 723 */ 724 static struct btrfs_block_group_cache * 725 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 726 { 727 return block_group_cache_tree_search(info, bytenr, 0); 728 } 729 730 /* 731 * return the block group that contains the given bytenr 732 */ 733 struct btrfs_block_group_cache *btrfs_lookup_block_group( 734 struct btrfs_fs_info *info, 735 u64 bytenr) 736 { 737 return block_group_cache_tree_search(info, bytenr, 1); 738 } 739 740 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 741 u64 flags) 742 { 743 struct list_head *head = &info->space_info; 744 struct btrfs_space_info *found; 745 746 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 747 748 rcu_read_lock(); 749 list_for_each_entry_rcu(found, head, list) { 750 if (found->flags & flags) { 751 rcu_read_unlock(); 752 return found; 753 } 754 } 755 rcu_read_unlock(); 756 return NULL; 757 } 758 759 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, 760 bool metadata, u64 root_objectid) 761 { 762 struct btrfs_space_info *space_info; 763 u64 flags; 764 765 if (metadata) { 766 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 767 flags = BTRFS_BLOCK_GROUP_SYSTEM; 768 else 769 flags = BTRFS_BLOCK_GROUP_METADATA; 770 } else { 771 flags = BTRFS_BLOCK_GROUP_DATA; 772 } 773 774 space_info = __find_space_info(fs_info, flags); 775 ASSERT(space_info); 776 percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, 777 BTRFS_TOTAL_BYTES_PINNED_BATCH); 778 } 779 780 /* 781 * after adding space to the filesystem, we need to clear the full flags 782 * on all the space infos. 783 */ 784 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 785 { 786 struct list_head *head = &info->space_info; 787 struct btrfs_space_info *found; 788 789 rcu_read_lock(); 790 list_for_each_entry_rcu(found, head, list) 791 found->full = 0; 792 rcu_read_unlock(); 793 } 794 795 /* simple helper to search for an existing data extent at a given offset */ 796 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) 797 { 798 int ret; 799 struct btrfs_key key; 800 struct btrfs_path *path; 801 802 path = btrfs_alloc_path(); 803 if (!path) 804 return -ENOMEM; 805 806 key.objectid = start; 807 key.offset = len; 808 key.type = BTRFS_EXTENT_ITEM_KEY; 809 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 810 btrfs_free_path(path); 811 return ret; 812 } 813 814 /* 815 * helper function to lookup reference count and flags of a tree block. 816 * 817 * the head node for delayed ref is used to store the sum of all the 818 * reference count modifications queued up in the rbtree. the head 819 * node may also store the extent flags to set. 
This way you can check 820 * to see what the reference count and extent flags would be if all of 821 * the delayed refs are not processed. 822 */ 823 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 824 struct btrfs_fs_info *fs_info, u64 bytenr, 825 u64 offset, int metadata, u64 *refs, u64 *flags) 826 { 827 struct btrfs_delayed_ref_head *head; 828 struct btrfs_delayed_ref_root *delayed_refs; 829 struct btrfs_path *path; 830 struct btrfs_extent_item *ei; 831 struct extent_buffer *leaf; 832 struct btrfs_key key; 833 u32 item_size; 834 u64 num_refs; 835 u64 extent_flags; 836 int ret; 837 838 /* 839 * If we don't have skinny metadata, don't bother doing anything 840 * different 841 */ 842 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 843 offset = fs_info->nodesize; 844 metadata = 0; 845 } 846 847 path = btrfs_alloc_path(); 848 if (!path) 849 return -ENOMEM; 850 851 if (!trans) { 852 path->skip_locking = 1; 853 path->search_commit_root = 1; 854 } 855 856 search_again: 857 key.objectid = bytenr; 858 key.offset = offset; 859 if (metadata) 860 key.type = BTRFS_METADATA_ITEM_KEY; 861 else 862 key.type = BTRFS_EXTENT_ITEM_KEY; 863 864 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); 865 if (ret < 0) 866 goto out_free; 867 868 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 869 if (path->slots[0]) { 870 path->slots[0]--; 871 btrfs_item_key_to_cpu(path->nodes[0], &key, 872 path->slots[0]); 873 if (key.objectid == bytenr && 874 key.type == BTRFS_EXTENT_ITEM_KEY && 875 key.offset == fs_info->nodesize) 876 ret = 0; 877 } 878 } 879 880 if (ret == 0) { 881 leaf = path->nodes[0]; 882 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 883 if (item_size >= sizeof(*ei)) { 884 ei = btrfs_item_ptr(leaf, path->slots[0], 885 struct btrfs_extent_item); 886 num_refs = btrfs_extent_refs(leaf, ei); 887 extent_flags = btrfs_extent_flags(leaf, ei); 888 } else { 889 ret = -EINVAL; 890 btrfs_print_v0_err(fs_info); 891 if (trans) 892 btrfs_abort_transaction(trans, ret); 893 else 894 btrfs_handle_fs_error(fs_info, ret, NULL); 895 896 goto out_free; 897 } 898 899 BUG_ON(num_refs == 0); 900 } else { 901 num_refs = 0; 902 extent_flags = 0; 903 ret = 0; 904 } 905 906 if (!trans) 907 goto out; 908 909 delayed_refs = &trans->transaction->delayed_refs; 910 spin_lock(&delayed_refs->lock); 911 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 912 if (head) { 913 if (!mutex_trylock(&head->mutex)) { 914 refcount_inc(&head->refs); 915 spin_unlock(&delayed_refs->lock); 916 917 btrfs_release_path(path); 918 919 /* 920 * Mutex was contended, block until it's released and try 921 * again 922 */ 923 mutex_lock(&head->mutex); 924 mutex_unlock(&head->mutex); 925 btrfs_put_delayed_ref_head(head); 926 goto search_again; 927 } 928 spin_lock(&head->lock); 929 if (head->extent_op && head->extent_op->update_flags) 930 extent_flags |= head->extent_op->flags_to_set; 931 else 932 BUG_ON(num_refs == 0); 933 934 num_refs += head->ref_mod; 935 spin_unlock(&head->lock); 936 mutex_unlock(&head->mutex); 937 } 938 spin_unlock(&delayed_refs->lock); 939 out: 940 WARN_ON(num_refs == 0); 941 if (refs) 942 *refs = num_refs; 943 if (flags) 944 *flags = extent_flags; 945 out_free: 946 btrfs_free_path(path); 947 return ret; 948 } 949 950 /* 951 * Back reference rules. 
Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree search. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back ref. Full back refs are generic and can be used in
 * all cases where implicit back refs are used. The major shortcoming of
 * full back refs is their overhead: every time a tree block gets COWed,
 * we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs and add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts. The
 * original implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block and increase the lower level extents' reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, implicit back refs are used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * consist only of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
1095 */ 1096 if (offset && 1097 IS_ALIGNED(offset, eb->fs_info->nodesize)) 1098 return type; 1099 } 1100 } else { 1101 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 1102 return type; 1103 } 1104 } 1105 1106 btrfs_print_leaf((struct extent_buffer *)eb); 1107 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", 1108 eb->start, type); 1109 WARN_ON(1); 1110 1111 return BTRFS_REF_TYPE_INVALID; 1112 } 1113 1114 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1115 { 1116 u32 high_crc = ~(u32)0; 1117 u32 low_crc = ~(u32)0; 1118 __le64 lenum; 1119 1120 lenum = cpu_to_le64(root_objectid); 1121 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 1122 lenum = cpu_to_le64(owner); 1123 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1124 lenum = cpu_to_le64(offset); 1125 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1126 1127 return ((u64)high_crc << 31) ^ (u64)low_crc; 1128 } 1129 1130 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1131 struct btrfs_extent_data_ref *ref) 1132 { 1133 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1134 btrfs_extent_data_ref_objectid(leaf, ref), 1135 btrfs_extent_data_ref_offset(leaf, ref)); 1136 } 1137 1138 static int match_extent_data_ref(struct extent_buffer *leaf, 1139 struct btrfs_extent_data_ref *ref, 1140 u64 root_objectid, u64 owner, u64 offset) 1141 { 1142 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1143 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1144 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1145 return 0; 1146 return 1; 1147 } 1148 1149 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1150 struct btrfs_path *path, 1151 u64 bytenr, u64 parent, 1152 u64 root_objectid, 1153 u64 owner, u64 offset) 1154 { 1155 struct btrfs_root *root = trans->fs_info->extent_root; 1156 struct btrfs_key key; 1157 struct btrfs_extent_data_ref *ref; 1158 struct extent_buffer *leaf; 1159 u32 nritems; 1160 int ret; 1161 int recow; 1162 int err = -ENOENT; 1163 1164 key.objectid = bytenr; 1165 if (parent) { 1166 key.type = BTRFS_SHARED_DATA_REF_KEY; 1167 key.offset = parent; 1168 } else { 1169 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1170 key.offset = hash_extent_data_ref(root_objectid, 1171 owner, offset); 1172 } 1173 again: 1174 recow = 0; 1175 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1176 if (ret < 0) { 1177 err = ret; 1178 goto fail; 1179 } 1180 1181 if (parent) { 1182 if (!ret) 1183 return 0; 1184 goto fail; 1185 } 1186 1187 leaf = path->nodes[0]; 1188 nritems = btrfs_header_nritems(leaf); 1189 while (1) { 1190 if (path->slots[0] >= nritems) { 1191 ret = btrfs_next_leaf(root, path); 1192 if (ret < 0) 1193 err = ret; 1194 if (ret) 1195 goto fail; 1196 1197 leaf = path->nodes[0]; 1198 nritems = btrfs_header_nritems(leaf); 1199 recow = 1; 1200 } 1201 1202 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1203 if (key.objectid != bytenr || 1204 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1205 goto fail; 1206 1207 ref = btrfs_item_ptr(leaf, path->slots[0], 1208 struct btrfs_extent_data_ref); 1209 1210 if (match_extent_data_ref(leaf, ref, root_objectid, 1211 owner, offset)) { 1212 if (recow) { 1213 btrfs_release_path(path); 1214 goto again; 1215 } 1216 err = 0; 1217 break; 1218 } 1219 path->slots[0]++; 1220 } 1221 fail: 1222 return err; 1223 } 1224 1225 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1226 struct btrfs_path *path, 1227 u64 bytenr, u64 parent, 1228 u64 root_objectid, u64 owner, 1229 u64 offset, 
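/*
 * Keying sketch for the extent data ref items handled by the helpers here
 * (illustrative only, the numbers are made up): an implicit data back ref
 * for root 257, inode 261, original file offset 0 on the extent at bytenr
 * 13631488 is stored at
 *
 *	(13631488, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(257, 261, 0))
 *
 * lookup_extent_data_ref() builds the same key for its search, and
 * insert_extent_data_ref() steps key.offset forward on -EEXIST until it
 * either finds a matching (root, objectid, offset) triple or an unused slot.
 */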
int refs_to_add) 1230 { 1231 struct btrfs_root *root = trans->fs_info->extent_root; 1232 struct btrfs_key key; 1233 struct extent_buffer *leaf; 1234 u32 size; 1235 u32 num_refs; 1236 int ret; 1237 1238 key.objectid = bytenr; 1239 if (parent) { 1240 key.type = BTRFS_SHARED_DATA_REF_KEY; 1241 key.offset = parent; 1242 size = sizeof(struct btrfs_shared_data_ref); 1243 } else { 1244 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1245 key.offset = hash_extent_data_ref(root_objectid, 1246 owner, offset); 1247 size = sizeof(struct btrfs_extent_data_ref); 1248 } 1249 1250 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1251 if (ret && ret != -EEXIST) 1252 goto fail; 1253 1254 leaf = path->nodes[0]; 1255 if (parent) { 1256 struct btrfs_shared_data_ref *ref; 1257 ref = btrfs_item_ptr(leaf, path->slots[0], 1258 struct btrfs_shared_data_ref); 1259 if (ret == 0) { 1260 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1261 } else { 1262 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1263 num_refs += refs_to_add; 1264 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1265 } 1266 } else { 1267 struct btrfs_extent_data_ref *ref; 1268 while (ret == -EEXIST) { 1269 ref = btrfs_item_ptr(leaf, path->slots[0], 1270 struct btrfs_extent_data_ref); 1271 if (match_extent_data_ref(leaf, ref, root_objectid, 1272 owner, offset)) 1273 break; 1274 btrfs_release_path(path); 1275 key.offset++; 1276 ret = btrfs_insert_empty_item(trans, root, path, &key, 1277 size); 1278 if (ret && ret != -EEXIST) 1279 goto fail; 1280 1281 leaf = path->nodes[0]; 1282 } 1283 ref = btrfs_item_ptr(leaf, path->slots[0], 1284 struct btrfs_extent_data_ref); 1285 if (ret == 0) { 1286 btrfs_set_extent_data_ref_root(leaf, ref, 1287 root_objectid); 1288 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1289 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1290 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1291 } else { 1292 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1293 num_refs += refs_to_add; 1294 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1295 } 1296 } 1297 btrfs_mark_buffer_dirty(leaf); 1298 ret = 0; 1299 fail: 1300 btrfs_release_path(path); 1301 return ret; 1302 } 1303 1304 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1305 struct btrfs_path *path, 1306 int refs_to_drop, int *last_ref) 1307 { 1308 struct btrfs_key key; 1309 struct btrfs_extent_data_ref *ref1 = NULL; 1310 struct btrfs_shared_data_ref *ref2 = NULL; 1311 struct extent_buffer *leaf; 1312 u32 num_refs = 0; 1313 int ret = 0; 1314 1315 leaf = path->nodes[0]; 1316 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1317 1318 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1319 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1320 struct btrfs_extent_data_ref); 1321 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1322 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1323 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1324 struct btrfs_shared_data_ref); 1325 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1326 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 1327 btrfs_print_v0_err(trans->fs_info); 1328 btrfs_abort_transaction(trans, -EINVAL); 1329 return -EINVAL; 1330 } else { 1331 BUG(); 1332 } 1333 1334 BUG_ON(num_refs < refs_to_drop); 1335 num_refs -= refs_to_drop; 1336 1337 if (num_refs == 0) { 1338 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1339 *last_ref = 1; 1340 } else { 1341 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1342 btrfs_set_extent_data_ref_count(leaf, 
ref1, num_refs); 1343 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1344 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1345 btrfs_mark_buffer_dirty(leaf); 1346 } 1347 return ret; 1348 } 1349 1350 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 1351 struct btrfs_extent_inline_ref *iref) 1352 { 1353 struct btrfs_key key; 1354 struct extent_buffer *leaf; 1355 struct btrfs_extent_data_ref *ref1; 1356 struct btrfs_shared_data_ref *ref2; 1357 u32 num_refs = 0; 1358 int type; 1359 1360 leaf = path->nodes[0]; 1361 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1362 1363 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 1364 if (iref) { 1365 /* 1366 * If type is invalid, we should have bailed out earlier than 1367 * this call. 1368 */ 1369 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 1370 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1371 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1372 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1373 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1374 } else { 1375 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1376 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1377 } 1378 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1379 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1380 struct btrfs_extent_data_ref); 1381 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1382 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1383 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1384 struct btrfs_shared_data_ref); 1385 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1386 } else { 1387 WARN_ON(1); 1388 } 1389 return num_refs; 1390 } 1391 1392 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1393 struct btrfs_path *path, 1394 u64 bytenr, u64 parent, 1395 u64 root_objectid) 1396 { 1397 struct btrfs_root *root = trans->fs_info->extent_root; 1398 struct btrfs_key key; 1399 int ret; 1400 1401 key.objectid = bytenr; 1402 if (parent) { 1403 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1404 key.offset = parent; 1405 } else { 1406 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1407 key.offset = root_objectid; 1408 } 1409 1410 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1411 if (ret > 0) 1412 ret = -ENOENT; 1413 return ret; 1414 } 1415 1416 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1417 struct btrfs_path *path, 1418 u64 bytenr, u64 parent, 1419 u64 root_objectid) 1420 { 1421 struct btrfs_key key; 1422 int ret; 1423 1424 key.objectid = bytenr; 1425 if (parent) { 1426 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1427 key.offset = parent; 1428 } else { 1429 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1430 key.offset = root_objectid; 1431 } 1432 1433 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root, 1434 path, &key, 0); 1435 btrfs_release_path(path); 1436 return ret; 1437 } 1438 1439 static inline int extent_ref_type(u64 parent, u64 owner) 1440 { 1441 int type; 1442 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1443 if (parent > 0) 1444 type = BTRFS_SHARED_BLOCK_REF_KEY; 1445 else 1446 type = BTRFS_TREE_BLOCK_REF_KEY; 1447 } else { 1448 if (parent > 0) 1449 type = BTRFS_SHARED_DATA_REF_KEY; 1450 else 1451 type = BTRFS_EXTENT_DATA_REF_KEY; 1452 } 1453 return type; 1454 } 1455 1456 static int find_next_key(struct btrfs_path *path, int level, 1457 struct btrfs_key *key) 1458 1459 { 1460 for (; level < BTRFS_MAX_LEVEL; level++) { 1461 if (!path->nodes[level]) 1462 break; 1463 if (path->slots[level] + 1 >= 1464 btrfs_header_nritems(path->nodes[level])) 1465 
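/*
 * How extent_ref_type() above chooses a back ref key type (sketch; the
 * example owner values are made up):
 *
 *	parent == 0, owner == 1 (tree level)	-> BTRFS_TREE_BLOCK_REF_KEY
 *	parent != 0, owner == 1			-> BTRFS_SHARED_BLOCK_REF_KEY
 *	parent == 0, owner == 261 (inode)	-> BTRFS_EXTENT_DATA_REF_KEY
 *	parent != 0, owner == 261		-> BTRFS_SHARED_DATA_REF_KEY
 *
 * The only inputs are whether owner is below BTRFS_FIRST_FREE_OBJECTID
 * (i.e. a tree block, where owner is the level) and whether a parent block
 * was supplied.
 */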
continue; 1466 if (level == 0) 1467 btrfs_item_key_to_cpu(path->nodes[level], key, 1468 path->slots[level] + 1); 1469 else 1470 btrfs_node_key_to_cpu(path->nodes[level], key, 1471 path->slots[level] + 1); 1472 return 0; 1473 } 1474 return 1; 1475 } 1476 1477 /* 1478 * look for inline back ref. if back ref is found, *ref_ret is set 1479 * to the address of inline back ref, and 0 is returned. 1480 * 1481 * if back ref isn't found, *ref_ret is set to the address where it 1482 * should be inserted, and -ENOENT is returned. 1483 * 1484 * if insert is true and there are too many inline back refs, the path 1485 * points to the extent item, and -EAGAIN is returned. 1486 * 1487 * NOTE: inline back refs are ordered in the same way that back ref 1488 * items in the tree are ordered. 1489 */ 1490 static noinline_for_stack 1491 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1492 struct btrfs_path *path, 1493 struct btrfs_extent_inline_ref **ref_ret, 1494 u64 bytenr, u64 num_bytes, 1495 u64 parent, u64 root_objectid, 1496 u64 owner, u64 offset, int insert) 1497 { 1498 struct btrfs_fs_info *fs_info = trans->fs_info; 1499 struct btrfs_root *root = fs_info->extent_root; 1500 struct btrfs_key key; 1501 struct extent_buffer *leaf; 1502 struct btrfs_extent_item *ei; 1503 struct btrfs_extent_inline_ref *iref; 1504 u64 flags; 1505 u64 item_size; 1506 unsigned long ptr; 1507 unsigned long end; 1508 int extra_size; 1509 int type; 1510 int want; 1511 int ret; 1512 int err = 0; 1513 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 1514 int needed; 1515 1516 key.objectid = bytenr; 1517 key.type = BTRFS_EXTENT_ITEM_KEY; 1518 key.offset = num_bytes; 1519 1520 want = extent_ref_type(parent, owner); 1521 if (insert) { 1522 extra_size = btrfs_extent_inline_ref_size(want); 1523 path->keep_locks = 1; 1524 } else 1525 extra_size = -1; 1526 1527 /* 1528 * Owner is our level, so we can just add one to get the level for the 1529 * block we are interested in. 1530 */ 1531 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1532 key.type = BTRFS_METADATA_ITEM_KEY; 1533 key.offset = owner; 1534 } 1535 1536 again: 1537 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1538 if (ret < 0) { 1539 err = ret; 1540 goto out; 1541 } 1542 1543 /* 1544 * We may be a newly converted file system which still has the old fat 1545 * extent entries for metadata, so try and see if we have one of those. 
1546 */ 1547 if (ret > 0 && skinny_metadata) { 1548 skinny_metadata = false; 1549 if (path->slots[0]) { 1550 path->slots[0]--; 1551 btrfs_item_key_to_cpu(path->nodes[0], &key, 1552 path->slots[0]); 1553 if (key.objectid == bytenr && 1554 key.type == BTRFS_EXTENT_ITEM_KEY && 1555 key.offset == num_bytes) 1556 ret = 0; 1557 } 1558 if (ret) { 1559 key.objectid = bytenr; 1560 key.type = BTRFS_EXTENT_ITEM_KEY; 1561 key.offset = num_bytes; 1562 btrfs_release_path(path); 1563 goto again; 1564 } 1565 } 1566 1567 if (ret && !insert) { 1568 err = -ENOENT; 1569 goto out; 1570 } else if (WARN_ON(ret)) { 1571 err = -EIO; 1572 goto out; 1573 } 1574 1575 leaf = path->nodes[0]; 1576 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1577 if (unlikely(item_size < sizeof(*ei))) { 1578 err = -EINVAL; 1579 btrfs_print_v0_err(fs_info); 1580 btrfs_abort_transaction(trans, err); 1581 goto out; 1582 } 1583 1584 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1585 flags = btrfs_extent_flags(leaf, ei); 1586 1587 ptr = (unsigned long)(ei + 1); 1588 end = (unsigned long)ei + item_size; 1589 1590 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1591 ptr += sizeof(struct btrfs_tree_block_info); 1592 BUG_ON(ptr > end); 1593 } 1594 1595 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 1596 needed = BTRFS_REF_TYPE_DATA; 1597 else 1598 needed = BTRFS_REF_TYPE_BLOCK; 1599 1600 err = -ENOENT; 1601 while (1) { 1602 if (ptr >= end) { 1603 WARN_ON(ptr > end); 1604 break; 1605 } 1606 iref = (struct btrfs_extent_inline_ref *)ptr; 1607 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 1608 if (type == BTRFS_REF_TYPE_INVALID) { 1609 err = -EUCLEAN; 1610 goto out; 1611 } 1612 1613 if (want < type) 1614 break; 1615 if (want > type) { 1616 ptr += btrfs_extent_inline_ref_size(type); 1617 continue; 1618 } 1619 1620 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1621 struct btrfs_extent_data_ref *dref; 1622 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1623 if (match_extent_data_ref(leaf, dref, root_objectid, 1624 owner, offset)) { 1625 err = 0; 1626 break; 1627 } 1628 if (hash_extent_data_ref_item(leaf, dref) < 1629 hash_extent_data_ref(root_objectid, owner, offset)) 1630 break; 1631 } else { 1632 u64 ref_offset; 1633 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1634 if (parent > 0) { 1635 if (parent == ref_offset) { 1636 err = 0; 1637 break; 1638 } 1639 if (ref_offset < parent) 1640 break; 1641 } else { 1642 if (root_objectid == ref_offset) { 1643 err = 0; 1644 break; 1645 } 1646 if (ref_offset < root_objectid) 1647 break; 1648 } 1649 } 1650 ptr += btrfs_extent_inline_ref_size(type); 1651 } 1652 if (err == -ENOENT && insert) { 1653 if (item_size + extra_size >= 1654 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1655 err = -EAGAIN; 1656 goto out; 1657 } 1658 /* 1659 * To add new inline back ref, we have to make sure 1660 * there is no corresponding back ref item. 
1661 * For simplicity, we just do not add new inline back 1662 * ref if there is any kind of item for this block 1663 */ 1664 if (find_next_key(path, 0, &key) == 0 && 1665 key.objectid == bytenr && 1666 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1667 err = -EAGAIN; 1668 goto out; 1669 } 1670 } 1671 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1672 out: 1673 if (insert) { 1674 path->keep_locks = 0; 1675 btrfs_unlock_up_safe(path, 1); 1676 } 1677 return err; 1678 } 1679 1680 /* 1681 * helper to add new inline back ref 1682 */ 1683 static noinline_for_stack 1684 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 1685 struct btrfs_path *path, 1686 struct btrfs_extent_inline_ref *iref, 1687 u64 parent, u64 root_objectid, 1688 u64 owner, u64 offset, int refs_to_add, 1689 struct btrfs_delayed_extent_op *extent_op) 1690 { 1691 struct extent_buffer *leaf; 1692 struct btrfs_extent_item *ei; 1693 unsigned long ptr; 1694 unsigned long end; 1695 unsigned long item_offset; 1696 u64 refs; 1697 int size; 1698 int type; 1699 1700 leaf = path->nodes[0]; 1701 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1702 item_offset = (unsigned long)iref - (unsigned long)ei; 1703 1704 type = extent_ref_type(parent, owner); 1705 size = btrfs_extent_inline_ref_size(type); 1706 1707 btrfs_extend_item(fs_info, path, size); 1708 1709 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1710 refs = btrfs_extent_refs(leaf, ei); 1711 refs += refs_to_add; 1712 btrfs_set_extent_refs(leaf, ei, refs); 1713 if (extent_op) 1714 __run_delayed_extent_op(extent_op, leaf, ei); 1715 1716 ptr = (unsigned long)ei + item_offset; 1717 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1718 if (ptr < end - size) 1719 memmove_extent_buffer(leaf, ptr + size, ptr, 1720 end - size - ptr); 1721 1722 iref = (struct btrfs_extent_inline_ref *)ptr; 1723 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1724 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1725 struct btrfs_extent_data_ref *dref; 1726 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1727 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1728 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1729 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1730 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1731 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1732 struct btrfs_shared_data_ref *sref; 1733 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1734 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1735 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1736 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1737 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1738 } else { 1739 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1740 } 1741 btrfs_mark_buffer_dirty(leaf); 1742 } 1743 1744 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1745 struct btrfs_path *path, 1746 struct btrfs_extent_inline_ref **ref_ret, 1747 u64 bytenr, u64 num_bytes, u64 parent, 1748 u64 root_objectid, u64 owner, u64 offset) 1749 { 1750 int ret; 1751 1752 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1753 num_bytes, parent, root_objectid, 1754 owner, offset, 0); 1755 if (ret != -ENOENT) 1756 return ret; 1757 1758 btrfs_release_path(path); 1759 *ref_ret = NULL; 1760 1761 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1762 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1763 root_objectid); 1764 } else { 1765 ret = lookup_extent_data_ref(trans, 
path, bytenr, parent, 1766 root_objectid, owner, offset); 1767 } 1768 return ret; 1769 } 1770 1771 /* 1772 * helper to update/remove inline back ref 1773 */ 1774 static noinline_for_stack 1775 void update_inline_extent_backref(struct btrfs_path *path, 1776 struct btrfs_extent_inline_ref *iref, 1777 int refs_to_mod, 1778 struct btrfs_delayed_extent_op *extent_op, 1779 int *last_ref) 1780 { 1781 struct extent_buffer *leaf = path->nodes[0]; 1782 struct btrfs_fs_info *fs_info = leaf->fs_info; 1783 struct btrfs_extent_item *ei; 1784 struct btrfs_extent_data_ref *dref = NULL; 1785 struct btrfs_shared_data_ref *sref = NULL; 1786 unsigned long ptr; 1787 unsigned long end; 1788 u32 item_size; 1789 int size; 1790 int type; 1791 u64 refs; 1792 1793 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1794 refs = btrfs_extent_refs(leaf, ei); 1795 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1796 refs += refs_to_mod; 1797 btrfs_set_extent_refs(leaf, ei, refs); 1798 if (extent_op) 1799 __run_delayed_extent_op(extent_op, leaf, ei); 1800 1801 /* 1802 * If type is invalid, we should have bailed out after 1803 * lookup_inline_extent_backref(). 1804 */ 1805 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1806 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1807 1808 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1809 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1810 refs = btrfs_extent_data_ref_count(leaf, dref); 1811 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1812 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1813 refs = btrfs_shared_data_ref_count(leaf, sref); 1814 } else { 1815 refs = 1; 1816 BUG_ON(refs_to_mod != -1); 1817 } 1818 1819 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1820 refs += refs_to_mod; 1821 1822 if (refs > 0) { 1823 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1824 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1825 else 1826 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1827 } else { 1828 *last_ref = 1; 1829 size = btrfs_extent_inline_ref_size(type); 1830 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1831 ptr = (unsigned long)iref; 1832 end = (unsigned long)ei + item_size; 1833 if (ptr + size < end) 1834 memmove_extent_buffer(leaf, ptr, ptr + size, 1835 end - ptr - size); 1836 item_size -= size; 1837 btrfs_truncate_item(fs_info, path, item_size, 1); 1838 } 1839 btrfs_mark_buffer_dirty(leaf); 1840 } 1841 1842 static noinline_for_stack 1843 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1844 struct btrfs_path *path, 1845 u64 bytenr, u64 num_bytes, u64 parent, 1846 u64 root_objectid, u64 owner, 1847 u64 offset, int refs_to_add, 1848 struct btrfs_delayed_extent_op *extent_op) 1849 { 1850 struct btrfs_extent_inline_ref *iref; 1851 int ret; 1852 1853 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1854 num_bytes, parent, root_objectid, 1855 owner, offset, 1); 1856 if (ret == 0) { 1857 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1858 update_inline_extent_backref(path, iref, refs_to_add, 1859 extent_op, NULL); 1860 } else if (ret == -ENOENT) { 1861 setup_inline_extent_backref(trans->fs_info, path, iref, parent, 1862 root_objectid, owner, offset, 1863 refs_to_add, extent_op); 1864 ret = 0; 1865 } 1866 return ret; 1867 } 1868 1869 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1870 struct btrfs_path *path, 1871 u64 bytenr, u64 parent, u64 root_objectid, 1872 u64 owner, u64 offset, int refs_to_add) 1873 { 1874 int ret; 1875 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1876 
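		/*
		 * An owner below BTRFS_FIRST_FREE_OBJECTID means this is a
		 * tree block ref (owner is the level), which is only ever
		 * added one at a time.  Data back refs carry a count (bookend
		 * extents sharing one original offset), so the data path
		 * below can take refs_to_add greater than one.
		 */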
BUG_ON(refs_to_add != 1); 1877 ret = insert_tree_block_ref(trans, path, bytenr, parent, 1878 root_objectid); 1879 } else { 1880 ret = insert_extent_data_ref(trans, path, bytenr, parent, 1881 root_objectid, owner, offset, 1882 refs_to_add); 1883 } 1884 return ret; 1885 } 1886 1887 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1888 struct btrfs_path *path, 1889 struct btrfs_extent_inline_ref *iref, 1890 int refs_to_drop, int is_data, int *last_ref) 1891 { 1892 int ret = 0; 1893 1894 BUG_ON(!is_data && refs_to_drop != 1); 1895 if (iref) { 1896 update_inline_extent_backref(path, iref, -refs_to_drop, NULL, 1897 last_ref); 1898 } else if (is_data) { 1899 ret = remove_extent_data_ref(trans, path, refs_to_drop, 1900 last_ref); 1901 } else { 1902 *last_ref = 1; 1903 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path); 1904 } 1905 return ret; 1906 } 1907 1908 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1909 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1910 u64 *discarded_bytes) 1911 { 1912 int j, ret = 0; 1913 u64 bytes_left, end; 1914 u64 aligned_start = ALIGN(start, 1 << 9); 1915 1916 if (WARN_ON(start != aligned_start)) { 1917 len -= aligned_start - start; 1918 len = round_down(len, 1 << 9); 1919 start = aligned_start; 1920 } 1921 1922 *discarded_bytes = 0; 1923 1924 if (!len) 1925 return 0; 1926 1927 end = start + len; 1928 bytes_left = len; 1929 1930 /* Skip any superblocks on this device. */ 1931 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1932 u64 sb_start = btrfs_sb_offset(j); 1933 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1934 u64 size = sb_start - start; 1935 1936 if (!in_range(sb_start, start, bytes_left) && 1937 !in_range(sb_end, start, bytes_left) && 1938 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1939 continue; 1940 1941 /* 1942 * Superblock spans beginning of range. Adjust start and 1943 * try again. 1944 */ 1945 if (sb_start <= start) { 1946 start += sb_end - start; 1947 if (start > end) { 1948 bytes_left = 0; 1949 break; 1950 } 1951 bytes_left = end - start; 1952 continue; 1953 } 1954 1955 if (size) { 1956 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 1957 GFP_NOFS, 0); 1958 if (!ret) 1959 *discarded_bytes += size; 1960 else if (ret != -EOPNOTSUPP) 1961 return ret; 1962 } 1963 1964 start = sb_end; 1965 if (start > end) { 1966 bytes_left = 0; 1967 break; 1968 } 1969 bytes_left = end - start; 1970 } 1971 1972 if (bytes_left) { 1973 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 1974 GFP_NOFS, 0); 1975 if (!ret) 1976 *discarded_bytes += bytes_left; 1977 } 1978 return ret; 1979 } 1980 1981 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1982 u64 num_bytes, u64 *actual_bytes) 1983 { 1984 int ret; 1985 u64 discarded_bytes = 0; 1986 struct btrfs_bio *bbio = NULL; 1987 1988 1989 /* 1990 * Avoid races with device replace and make sure our bbio has devices 1991 * associated to its stripes that don't go away while we are discarding. 
1992 */ 1993 btrfs_bio_counter_inc_blocked(fs_info); 1994 /* Tell the block device(s) that the sectors can be discarded */ 1995 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 1996 &bbio, 0); 1997 /* Error condition is -ENOMEM */ 1998 if (!ret) { 1999 struct btrfs_bio_stripe *stripe = bbio->stripes; 2000 int i; 2001 2002 2003 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2004 u64 bytes; 2005 struct request_queue *req_q; 2006 2007 if (!stripe->dev->bdev) { 2008 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 2009 continue; 2010 } 2011 req_q = bdev_get_queue(stripe->dev->bdev); 2012 if (!blk_queue_discard(req_q)) 2013 continue; 2014 2015 ret = btrfs_issue_discard(stripe->dev->bdev, 2016 stripe->physical, 2017 stripe->length, 2018 &bytes); 2019 if (!ret) 2020 discarded_bytes += bytes; 2021 else if (ret != -EOPNOTSUPP) 2022 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 2023 2024 /* 2025 * Just in case we get back EOPNOTSUPP for some reason, 2026 * just ignore the return value so we don't screw up 2027 * people calling discard_extent. 2028 */ 2029 ret = 0; 2030 } 2031 btrfs_put_bbio(bbio); 2032 } 2033 btrfs_bio_counter_dec(fs_info); 2034 2035 if (actual_bytes) 2036 *actual_bytes = discarded_bytes; 2037 2038 2039 if (ret == -EOPNOTSUPP) 2040 ret = 0; 2041 return ret; 2042 } 2043 2044 /* Can return -ENOMEM */ 2045 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2046 struct btrfs_root *root, 2047 u64 bytenr, u64 num_bytes, u64 parent, 2048 u64 root_objectid, u64 owner, u64 offset) 2049 { 2050 struct btrfs_fs_info *fs_info = root->fs_info; 2051 int old_ref_mod, new_ref_mod; 2052 int ret; 2053 2054 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2055 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2056 2057 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, 2058 owner, offset, BTRFS_ADD_DELAYED_REF); 2059 2060 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2061 ret = btrfs_add_delayed_tree_ref(trans, bytenr, 2062 num_bytes, parent, 2063 root_objectid, (int)owner, 2064 BTRFS_ADD_DELAYED_REF, NULL, 2065 &old_ref_mod, &new_ref_mod); 2066 } else { 2067 ret = btrfs_add_delayed_data_ref(trans, bytenr, 2068 num_bytes, parent, 2069 root_objectid, owner, offset, 2070 0, BTRFS_ADD_DELAYED_REF, 2071 &old_ref_mod, &new_ref_mod); 2072 } 2073 2074 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) { 2075 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; 2076 2077 add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid); 2078 } 2079 2080 return ret; 2081 } 2082 2083 /* 2084 * __btrfs_inc_extent_ref - insert backreference for a given extent 2085 * 2086 * @trans: Handle of transaction 2087 * 2088 * @node: The delayed ref node used to get the bytenr/length for 2089 * extent whose references are incremented. 2090 * 2091 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ 2092 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical 2093 * bytenr of the parent block. Since new extents are always 2094 * created with indirect references, this will only be the case 2095 * when relocating a shared extent. In that case, root_objectid 2096 * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must 2097 * be 0 2098 * 2099 * @root_objectid: The id of the root where this modification has originated, 2100 * this can be either one of the well-known metadata trees or 2101 * the subvolume id which references this extent. 2102 * 2103 * @owner: For data extents it is the inode number of the owning file.
2104 * For metadata extents this parameter holds the level in the 2105 * tree of the extent. 2106 * 2107 * @offset: For metadata extents the offset is ignored and is currently 2108 * always passed as 0. For data extents it is the file offset 2109 * this extent belongs to. 2110 * 2111 * @refs_to_add: Number of references to add 2112 * 2113 * @extent_op: Pointer to a structure, holding information necessary when 2114 * updating a tree block's flags 2115 * 2116 */ 2117 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2118 struct btrfs_delayed_ref_node *node, 2119 u64 parent, u64 root_objectid, 2120 u64 owner, u64 offset, int refs_to_add, 2121 struct btrfs_delayed_extent_op *extent_op) 2122 { 2123 struct btrfs_path *path; 2124 struct extent_buffer *leaf; 2125 struct btrfs_extent_item *item; 2126 struct btrfs_key key; 2127 u64 bytenr = node->bytenr; 2128 u64 num_bytes = node->num_bytes; 2129 u64 refs; 2130 int ret; 2131 2132 path = btrfs_alloc_path(); 2133 if (!path) 2134 return -ENOMEM; 2135 2136 path->reada = READA_FORWARD; 2137 path->leave_spinning = 1; 2138 /* this will setup the path even if it fails to insert the back ref */ 2139 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, 2140 parent, root_objectid, owner, 2141 offset, refs_to_add, extent_op); 2142 if ((ret < 0 && ret != -EAGAIN) || !ret) 2143 goto out; 2144 2145 /* 2146 * Ok we had -EAGAIN which means we didn't have space to insert an 2147 * inline extent ref, so just update the reference count and add a 2148 * normal backref. 2149 */ 2150 leaf = path->nodes[0]; 2151 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2152 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2153 refs = btrfs_extent_refs(leaf, item); 2154 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2155 if (extent_op) 2156 __run_delayed_extent_op(extent_op, leaf, item); 2157 2158 btrfs_mark_buffer_dirty(leaf); 2159 btrfs_release_path(path); 2160 2161 path->reada = READA_FORWARD; 2162 path->leave_spinning = 1; 2163 /* now insert the actual backref */ 2164 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid, 2165 owner, offset, refs_to_add); 2166 if (ret) 2167 btrfs_abort_transaction(trans, ret); 2168 out: 2169 btrfs_free_path(path); 2170 return ret; 2171 } 2172 2173 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2174 struct btrfs_delayed_ref_node *node, 2175 struct btrfs_delayed_extent_op *extent_op, 2176 int insert_reserved) 2177 { 2178 int ret = 0; 2179 struct btrfs_delayed_data_ref *ref; 2180 struct btrfs_key ins; 2181 u64 parent = 0; 2182 u64 ref_root = 0; 2183 u64 flags = 0; 2184 2185 ins.objectid = node->bytenr; 2186 ins.offset = node->num_bytes; 2187 ins.type = BTRFS_EXTENT_ITEM_KEY; 2188 2189 ref = btrfs_delayed_node_to_data_ref(node); 2190 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); 2191 2192 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2193 parent = ref->parent; 2194 ref_root = ref->root; 2195 2196 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2197 if (extent_op) 2198 flags |= extent_op->flags_to_set; 2199 ret = alloc_reserved_file_extent(trans, parent, ref_root, 2200 flags, ref->objectid, 2201 ref->offset, &ins, 2202 node->ref_mod); 2203 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2204 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2205 ref->objectid, ref->offset, 2206 node->ref_mod, extent_op); 2207 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2208 ret = __btrfs_free_extent(trans,
node, parent, 2209 ref_root, ref->objectid, 2210 ref->offset, node->ref_mod, 2211 extent_op); 2212 } else { 2213 BUG(); 2214 } 2215 return ret; 2216 } 2217 2218 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2219 struct extent_buffer *leaf, 2220 struct btrfs_extent_item *ei) 2221 { 2222 u64 flags = btrfs_extent_flags(leaf, ei); 2223 if (extent_op->update_flags) { 2224 flags |= extent_op->flags_to_set; 2225 btrfs_set_extent_flags(leaf, ei, flags); 2226 } 2227 2228 if (extent_op->update_key) { 2229 struct btrfs_tree_block_info *bi; 2230 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2231 bi = (struct btrfs_tree_block_info *)(ei + 1); 2232 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2233 } 2234 } 2235 2236 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2237 struct btrfs_delayed_ref_head *head, 2238 struct btrfs_delayed_extent_op *extent_op) 2239 { 2240 struct btrfs_fs_info *fs_info = trans->fs_info; 2241 struct btrfs_key key; 2242 struct btrfs_path *path; 2243 struct btrfs_extent_item *ei; 2244 struct extent_buffer *leaf; 2245 u32 item_size; 2246 int ret; 2247 int err = 0; 2248 int metadata = !extent_op->is_data; 2249 2250 if (trans->aborted) 2251 return 0; 2252 2253 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2254 metadata = 0; 2255 2256 path = btrfs_alloc_path(); 2257 if (!path) 2258 return -ENOMEM; 2259 2260 key.objectid = head->bytenr; 2261 2262 if (metadata) { 2263 key.type = BTRFS_METADATA_ITEM_KEY; 2264 key.offset = extent_op->level; 2265 } else { 2266 key.type = BTRFS_EXTENT_ITEM_KEY; 2267 key.offset = head->num_bytes; 2268 } 2269 2270 again: 2271 path->reada = READA_FORWARD; 2272 path->leave_spinning = 1; 2273 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2274 if (ret < 0) { 2275 err = ret; 2276 goto out; 2277 } 2278 if (ret > 0) { 2279 if (metadata) { 2280 if (path->slots[0] > 0) { 2281 path->slots[0]--; 2282 btrfs_item_key_to_cpu(path->nodes[0], &key, 2283 path->slots[0]); 2284 if (key.objectid == head->bytenr && 2285 key.type == BTRFS_EXTENT_ITEM_KEY && 2286 key.offset == head->num_bytes) 2287 ret = 0; 2288 } 2289 if (ret > 0) { 2290 btrfs_release_path(path); 2291 metadata = 0; 2292 2293 key.objectid = head->bytenr; 2294 key.offset = head->num_bytes; 2295 key.type = BTRFS_EXTENT_ITEM_KEY; 2296 goto again; 2297 } 2298 } else { 2299 err = -EIO; 2300 goto out; 2301 } 2302 } 2303 2304 leaf = path->nodes[0]; 2305 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2306 2307 if (unlikely(item_size < sizeof(*ei))) { 2308 err = -EINVAL; 2309 btrfs_print_v0_err(fs_info); 2310 btrfs_abort_transaction(trans, err); 2311 goto out; 2312 } 2313 2314 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2315 __run_delayed_extent_op(extent_op, leaf, ei); 2316 2317 btrfs_mark_buffer_dirty(leaf); 2318 out: 2319 btrfs_free_path(path); 2320 return err; 2321 } 2322 2323 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2324 struct btrfs_delayed_ref_node *node, 2325 struct btrfs_delayed_extent_op *extent_op, 2326 int insert_reserved) 2327 { 2328 int ret = 0; 2329 struct btrfs_delayed_tree_ref *ref; 2330 u64 parent = 0; 2331 u64 ref_root = 0; 2332 2333 ref = btrfs_delayed_node_to_tree_ref(node); 2334 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); 2335 2336 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2337 parent = ref->parent; 2338 ref_root = ref->root; 2339 2340 if (node->ref_mod != 1) { 2341 btrfs_err(trans->fs_info, 2342 "btree block(%llu) 
has %d references rather than 1: action %d ref_root %llu parent %llu", 2343 node->bytenr, node->ref_mod, node->action, ref_root, 2344 parent); 2345 return -EIO; 2346 } 2347 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2348 BUG_ON(!extent_op || !extent_op->update_flags); 2349 ret = alloc_reserved_tree_block(trans, node, extent_op); 2350 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2351 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 2352 ref->level, 0, 1, extent_op); 2353 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2354 ret = __btrfs_free_extent(trans, node, parent, ref_root, 2355 ref->level, 0, 1, extent_op); 2356 } else { 2357 BUG(); 2358 } 2359 return ret; 2360 } 2361 2362 /* helper function to actually process a single delayed ref entry */ 2363 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2364 struct btrfs_delayed_ref_node *node, 2365 struct btrfs_delayed_extent_op *extent_op, 2366 int insert_reserved) 2367 { 2368 int ret = 0; 2369 2370 if (trans->aborted) { 2371 if (insert_reserved) 2372 btrfs_pin_extent(trans->fs_info, node->bytenr, 2373 node->num_bytes, 1); 2374 return 0; 2375 } 2376 2377 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2378 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2379 ret = run_delayed_tree_ref(trans, node, extent_op, 2380 insert_reserved); 2381 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2382 node->type == BTRFS_SHARED_DATA_REF_KEY) 2383 ret = run_delayed_data_ref(trans, node, extent_op, 2384 insert_reserved); 2385 else 2386 BUG(); 2387 if (ret && insert_reserved) 2388 btrfs_pin_extent(trans->fs_info, node->bytenr, 2389 node->num_bytes, 1); 2390 return ret; 2391 } 2392 2393 static inline struct btrfs_delayed_ref_node * 2394 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2395 { 2396 struct btrfs_delayed_ref_node *ref; 2397 2398 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 2399 return NULL; 2400 2401 /* 2402 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2403 * This is to prevent a ref count from going down to zero, which deletes 2404 * the extent item from the extent tree, when there still are references 2405 * to add, which would fail because they would not find the extent item. 
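 * If the add_list is empty we simply take the first ref from the rbtree,
 * which is then asserted not to be on the add_list.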
2406 */ 2407 if (!list_empty(&head->ref_add_list)) 2408 return list_first_entry(&head->ref_add_list, 2409 struct btrfs_delayed_ref_node, add_list); 2410 2411 ref = rb_entry(rb_first_cached(&head->ref_tree), 2412 struct btrfs_delayed_ref_node, ref_node); 2413 ASSERT(list_empty(&ref->add_list)); 2414 return ref; 2415 } 2416 2417 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 2418 struct btrfs_delayed_ref_head *head) 2419 { 2420 spin_lock(&delayed_refs->lock); 2421 head->processing = 0; 2422 delayed_refs->num_heads_ready++; 2423 spin_unlock(&delayed_refs->lock); 2424 btrfs_delayed_ref_unlock(head); 2425 } 2426 2427 static struct btrfs_delayed_extent_op *cleanup_extent_op( 2428 struct btrfs_delayed_ref_head *head) 2429 { 2430 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 2431 2432 if (!extent_op) 2433 return NULL; 2434 2435 if (head->must_insert_reserved) { 2436 head->extent_op = NULL; 2437 btrfs_free_delayed_extent_op(extent_op); 2438 return NULL; 2439 } 2440 return extent_op; 2441 } 2442 2443 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 2444 struct btrfs_delayed_ref_head *head) 2445 { 2446 struct btrfs_delayed_extent_op *extent_op; 2447 int ret; 2448 2449 extent_op = cleanup_extent_op(head); 2450 if (!extent_op) 2451 return 0; 2452 head->extent_op = NULL; 2453 spin_unlock(&head->lock); 2454 ret = run_delayed_extent_op(trans, head, extent_op); 2455 btrfs_free_delayed_extent_op(extent_op); 2456 return ret ? ret : 1; 2457 } 2458 2459 static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, 2460 struct btrfs_delayed_ref_head *head) 2461 { 2462 struct btrfs_fs_info *fs_info = trans->fs_info; 2463 struct btrfs_delayed_ref_root *delayed_refs = 2464 &trans->transaction->delayed_refs; 2465 int nr_items = 1; /* Dropping this ref head update. */ 2466 2467 if (head->total_ref_mod < 0) { 2468 struct btrfs_space_info *space_info; 2469 u64 flags; 2470 2471 if (head->is_data) 2472 flags = BTRFS_BLOCK_GROUP_DATA; 2473 else if (head->is_system) 2474 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2475 else 2476 flags = BTRFS_BLOCK_GROUP_METADATA; 2477 space_info = __find_space_info(fs_info, flags); 2478 ASSERT(space_info); 2479 percpu_counter_add_batch(&space_info->total_bytes_pinned, 2480 -head->num_bytes, 2481 BTRFS_TOTAL_BYTES_PINNED_BATCH); 2482 2483 /* 2484 * We had csum deletions accounted for in our delayed refs rsv, 2485 * we need to drop the csum leaves for this update from our 2486 * delayed_refs_rsv. 
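 * nr_items then covers this head update plus those csum leaves, and is
 * released from the delayed refs rsv at the end of this function.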
2487 */ 2488 if (head->is_data) { 2489 spin_lock(&delayed_refs->lock); 2490 delayed_refs->pending_csums -= head->num_bytes; 2491 spin_unlock(&delayed_refs->lock); 2492 nr_items += btrfs_csum_bytes_to_leaves(fs_info, 2493 head->num_bytes); 2494 } 2495 } 2496 2497 /* Also free its reserved qgroup space */ 2498 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2499 head->qgroup_reserved); 2500 btrfs_delayed_refs_rsv_release(fs_info, nr_items); 2501 } 2502 2503 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 2504 struct btrfs_delayed_ref_head *head) 2505 { 2506 2507 struct btrfs_fs_info *fs_info = trans->fs_info; 2508 struct btrfs_delayed_ref_root *delayed_refs; 2509 int ret; 2510 2511 delayed_refs = &trans->transaction->delayed_refs; 2512 2513 ret = run_and_cleanup_extent_op(trans, head); 2514 if (ret < 0) { 2515 unselect_delayed_ref_head(delayed_refs, head); 2516 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2517 return ret; 2518 } else if (ret) { 2519 return ret; 2520 } 2521 2522 /* 2523 * Need to drop our head ref lock and re-acquire the delayed ref lock 2524 * and then re-check to make sure nobody got added. 2525 */ 2526 spin_unlock(&head->lock); 2527 spin_lock(&delayed_refs->lock); 2528 spin_lock(&head->lock); 2529 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 2530 spin_unlock(&head->lock); 2531 spin_unlock(&delayed_refs->lock); 2532 return 1; 2533 } 2534 btrfs_delete_ref_head(delayed_refs, head); 2535 spin_unlock(&head->lock); 2536 spin_unlock(&delayed_refs->lock); 2537 2538 if (head->must_insert_reserved) { 2539 btrfs_pin_extent(fs_info, head->bytenr, 2540 head->num_bytes, 1); 2541 if (head->is_data) { 2542 ret = btrfs_del_csums(trans, fs_info, head->bytenr, 2543 head->num_bytes); 2544 } 2545 } 2546 2547 cleanup_ref_head_accounting(trans, head); 2548 2549 trace_run_delayed_ref_head(fs_info, head, 0); 2550 btrfs_delayed_ref_unlock(head); 2551 btrfs_put_delayed_ref_head(head); 2552 return 0; 2553 } 2554 2555 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( 2556 struct btrfs_trans_handle *trans) 2557 { 2558 struct btrfs_delayed_ref_root *delayed_refs = 2559 &trans->transaction->delayed_refs; 2560 struct btrfs_delayed_ref_head *head = NULL; 2561 int ret; 2562 2563 spin_lock(&delayed_refs->lock); 2564 head = btrfs_select_ref_head(delayed_refs); 2565 if (!head) { 2566 spin_unlock(&delayed_refs->lock); 2567 return head; 2568 } 2569 2570 /* 2571 * Grab the lock that says we are going to process all the refs for 2572 * this head 2573 */ 2574 ret = btrfs_delayed_ref_lock(delayed_refs, head); 2575 spin_unlock(&delayed_refs->lock); 2576 2577 /* 2578 * We may have dropped the spin lock to get the head mutex lock, and 2579 * that might have given someone else time to free the head. If that's 2580 * true, it has been removed from our list and we can move on. 
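 * btrfs_delayed_ref_lock() reports that case as -EAGAIN, which we return as
 * ERR_PTR(-EAGAIN) so the caller can simply retry with another head.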
2581 */ 2582 if (ret == -EAGAIN) 2583 head = ERR_PTR(-EAGAIN); 2584 2585 return head; 2586 } 2587 2588 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 2589 struct btrfs_delayed_ref_head *locked_ref, 2590 unsigned long *run_refs) 2591 { 2592 struct btrfs_fs_info *fs_info = trans->fs_info; 2593 struct btrfs_delayed_ref_root *delayed_refs; 2594 struct btrfs_delayed_extent_op *extent_op; 2595 struct btrfs_delayed_ref_node *ref; 2596 int must_insert_reserved = 0; 2597 int ret; 2598 2599 delayed_refs = &trans->transaction->delayed_refs; 2600 2601 lockdep_assert_held(&locked_ref->mutex); 2602 lockdep_assert_held(&locked_ref->lock); 2603 2604 while ((ref = select_delayed_ref(locked_ref))) { 2605 if (ref->seq && 2606 btrfs_check_delayed_seq(fs_info, ref->seq)) { 2607 spin_unlock(&locked_ref->lock); 2608 unselect_delayed_ref_head(delayed_refs, locked_ref); 2609 return -EAGAIN; 2610 } 2611 2612 (*run_refs)++; 2613 ref->in_tree = 0; 2614 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 2615 RB_CLEAR_NODE(&ref->ref_node); 2616 if (!list_empty(&ref->add_list)) 2617 list_del(&ref->add_list); 2618 /* 2619 * When we play the delayed ref, also correct the ref_mod on 2620 * head 2621 */ 2622 switch (ref->action) { 2623 case BTRFS_ADD_DELAYED_REF: 2624 case BTRFS_ADD_DELAYED_EXTENT: 2625 locked_ref->ref_mod -= ref->ref_mod; 2626 break; 2627 case BTRFS_DROP_DELAYED_REF: 2628 locked_ref->ref_mod += ref->ref_mod; 2629 break; 2630 default: 2631 WARN_ON(1); 2632 } 2633 atomic_dec(&delayed_refs->num_entries); 2634 2635 /* 2636 * Record the must_insert_reserved flag before we drop the 2637 * spin lock. 2638 */ 2639 must_insert_reserved = locked_ref->must_insert_reserved; 2640 locked_ref->must_insert_reserved = 0; 2641 2642 extent_op = locked_ref->extent_op; 2643 locked_ref->extent_op = NULL; 2644 spin_unlock(&locked_ref->lock); 2645 2646 ret = run_one_delayed_ref(trans, ref, extent_op, 2647 must_insert_reserved); 2648 2649 btrfs_free_delayed_extent_op(extent_op); 2650 if (ret) { 2651 unselect_delayed_ref_head(delayed_refs, locked_ref); 2652 btrfs_put_delayed_ref(ref); 2653 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2654 ret); 2655 return ret; 2656 } 2657 2658 btrfs_put_delayed_ref(ref); 2659 cond_resched(); 2660 2661 spin_lock(&locked_ref->lock); 2662 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2663 } 2664 2665 return 0; 2666 } 2667 2668 /* 2669 * Returns 0 on success or if called with an already aborted transaction. 2670 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2671 */ 2672 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2673 unsigned long nr) 2674 { 2675 struct btrfs_fs_info *fs_info = trans->fs_info; 2676 struct btrfs_delayed_ref_root *delayed_refs; 2677 struct btrfs_delayed_ref_head *locked_ref = NULL; 2678 ktime_t start = ktime_get(); 2679 int ret; 2680 unsigned long count = 0; 2681 unsigned long actual_count = 0; 2682 2683 delayed_refs = &trans->transaction->delayed_refs; 2684 do { 2685 if (!locked_ref) { 2686 locked_ref = btrfs_obtain_ref_head(trans); 2687 if (IS_ERR_OR_NULL(locked_ref)) { 2688 if (PTR_ERR(locked_ref) == -EAGAIN) { 2689 continue; 2690 } else { 2691 break; 2692 } 2693 } 2694 count++; 2695 } 2696 /* 2697 * We need to try and merge add/drops of the same ref since we 2698 * can run into issues with relocate dropping the implicit ref 2699 * and then it being added back again before the drop can 2700 * finish. 
If we merged anything we need to re-loop so we can 2701 * get a good ref. 2702 * Or we can get node references of the same type that weren't 2703 * merged when created due to bumps in the tree mod seq, and 2704 * we need to merge them to prevent adding an inline extent 2705 * backref before dropping it (triggering a BUG_ON at 2706 * insert_inline_extent_backref()). 2707 */ 2708 spin_lock(&locked_ref->lock); 2709 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref); 2710 2711 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, 2712 &actual_count); 2713 if (ret < 0 && ret != -EAGAIN) { 2714 /* 2715 * Error, btrfs_run_delayed_refs_for_head already 2716 * unlocked everything so just bail out 2717 */ 2718 return ret; 2719 } else if (!ret) { 2720 /* 2721 * Success, perform the usual cleanup of a processed 2722 * head 2723 */ 2724 ret = cleanup_ref_head(trans, locked_ref); 2725 if (ret > 0 ) { 2726 /* We dropped our lock, we need to loop. */ 2727 ret = 0; 2728 continue; 2729 } else if (ret) { 2730 return ret; 2731 } 2732 } 2733 2734 /* 2735 * Either success case or btrfs_run_delayed_refs_for_head 2736 * returned -EAGAIN, meaning we need to select another head 2737 */ 2738 2739 locked_ref = NULL; 2740 cond_resched(); 2741 } while ((nr != -1 && count < nr) || locked_ref); 2742 2743 /* 2744 * We don't want to include ref heads since we can have empty ref heads 2745 * and those will drastically skew our runtime down since we just do 2746 * accounting, no actual extent tree updates. 2747 */ 2748 if (actual_count > 0) { 2749 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2750 u64 avg; 2751 2752 /* 2753 * We weigh the current average higher than our current runtime 2754 * to avoid large swings in the average. 2755 */ 2756 spin_lock(&delayed_refs->lock); 2757 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2758 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2759 spin_unlock(&delayed_refs->lock); 2760 } 2761 return 0; 2762 } 2763 2764 #ifdef SCRAMBLE_DELAYED_REFS 2765 /* 2766 * Normally delayed refs get processed in ascending bytenr order. This 2767 * correlates in most cases to the order added. 
To expose dependencies on this 2768 * order, we start to process the tree in the middle instead of the beginning 2769 */ 2770 static u64 find_middle(struct rb_root *root) 2771 { 2772 struct rb_node *n = root->rb_node; 2773 struct btrfs_delayed_ref_node *entry; 2774 int alt = 1; 2775 u64 middle; 2776 u64 first = 0, last = 0; 2777 2778 n = rb_first(root); 2779 if (n) { 2780 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2781 first = entry->bytenr; 2782 } 2783 n = rb_last(root); 2784 if (n) { 2785 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2786 last = entry->bytenr; 2787 } 2788 n = root->rb_node; 2789 2790 while (n) { 2791 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2792 WARN_ON(!entry->in_tree); 2793 2794 middle = entry->bytenr; 2795 2796 if (alt) 2797 n = n->rb_left; 2798 else 2799 n = n->rb_right; 2800 2801 alt = 1 - alt; 2802 } 2803 return middle; 2804 } 2805 #endif 2806 2807 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2808 { 2809 u64 num_bytes; 2810 2811 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2812 sizeof(struct btrfs_extent_inline_ref)); 2813 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2814 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2815 2816 /* 2817 * We don't ever fill up leaves all the way so multiply by 2 just to be 2818 * closer to what we're really going to want to use. 2819 */ 2820 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2821 } 2822 2823 /* 2824 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2825 * would require to store the csums for that many bytes. 2826 */ 2827 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2828 { 2829 u64 csum_size; 2830 u64 num_csums_per_leaf; 2831 u64 num_csums; 2832 2833 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2834 num_csums_per_leaf = div64_u64(csum_size, 2835 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2836 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2837 num_csums += num_csums_per_leaf - 1; 2838 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2839 return num_csums; 2840 } 2841 2842 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) 2843 { 2844 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 2845 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2846 bool ret = false; 2847 u64 reserved; 2848 2849 spin_lock(&global_rsv->lock); 2850 reserved = global_rsv->reserved; 2851 spin_unlock(&global_rsv->lock); 2852 2853 /* 2854 * Since the global reserve is just kind of magic we don't really want 2855 * to rely on it to save our bacon, so if our size is more than the 2856 * delayed_refs_rsv and the global rsv then it's time to think about 2857 * bailing. 
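 * That is, return true only once the delayed refs rsv's size has reached the
 * combined reserved bytes of both reserves.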
2858 */ 2859 spin_lock(&delayed_refs_rsv->lock); 2860 reserved += delayed_refs_rsv->reserved; 2861 if (delayed_refs_rsv->size >= reserved) 2862 ret = true; 2863 spin_unlock(&delayed_refs_rsv->lock); 2864 return ret; 2865 } 2866 2867 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) 2868 { 2869 u64 num_entries = 2870 atomic_read(&trans->transaction->delayed_refs.num_entries); 2871 u64 avg_runtime; 2872 u64 val; 2873 2874 smp_mb(); 2875 avg_runtime = trans->fs_info->avg_delayed_ref_runtime; 2876 val = num_entries * avg_runtime; 2877 if (val >= NSEC_PER_SEC) 2878 return 1; 2879 if (val >= NSEC_PER_SEC / 2) 2880 return 2; 2881 2882 return btrfs_check_space_for_delayed_refs(trans->fs_info); 2883 } 2884 2885 struct async_delayed_refs { 2886 struct btrfs_root *root; 2887 u64 transid; 2888 int count; 2889 int error; 2890 int sync; 2891 struct completion wait; 2892 struct btrfs_work work; 2893 }; 2894 2895 static inline struct async_delayed_refs * 2896 to_async_delayed_refs(struct btrfs_work *work) 2897 { 2898 return container_of(work, struct async_delayed_refs, work); 2899 } 2900 2901 static void delayed_ref_async_start(struct btrfs_work *work) 2902 { 2903 struct async_delayed_refs *async = to_async_delayed_refs(work); 2904 struct btrfs_trans_handle *trans; 2905 struct btrfs_fs_info *fs_info = async->root->fs_info; 2906 int ret; 2907 2908 /* if the commit is already started, we don't need to wait here */ 2909 if (btrfs_transaction_blocked(fs_info)) 2910 goto done; 2911 2912 trans = btrfs_join_transaction(async->root); 2913 if (IS_ERR(trans)) { 2914 async->error = PTR_ERR(trans); 2915 goto done; 2916 } 2917 2918 /* 2919 * trans->sync means that when we call end_transaction, we won't 2920 * wait on delayed refs 2921 */ 2922 trans->sync = true; 2923 2924 /* Don't bother flushing if we got into a different transaction */ 2925 if (trans->transid > async->transid) 2926 goto end; 2927 2928 ret = btrfs_run_delayed_refs(trans, async->count); 2929 if (ret) 2930 async->error = ret; 2931 end: 2932 ret = btrfs_end_transaction(trans); 2933 if (ret && !async->error) 2934 async->error = ret; 2935 done: 2936 if (async->sync) 2937 complete(&async->wait); 2938 else 2939 kfree(async); 2940 } 2941 2942 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, 2943 unsigned long count, u64 transid, int wait) 2944 { 2945 struct async_delayed_refs *async; 2946 int ret; 2947 2948 async = kmalloc(sizeof(*async), GFP_NOFS); 2949 if (!async) 2950 return -ENOMEM; 2951 2952 async->root = fs_info->tree_root; 2953 async->count = count; 2954 async->error = 0; 2955 async->transid = transid; 2956 if (wait) 2957 async->sync = 1; 2958 else 2959 async->sync = 0; 2960 init_completion(&async->wait); 2961 2962 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2963 delayed_ref_async_start, NULL, NULL); 2964 2965 btrfs_queue_work(fs_info->extent_workers, &async->work); 2966 2967 if (wait) { 2968 wait_for_completion(&async->wait); 2969 ret = async->error; 2970 kfree(async); 2971 return ret; 2972 } 2973 return 0; 2974 } 2975 2976 /* 2977 * this starts processing the delayed reference count updates and 2978 * extent insertions we have queued up so far. count can be 2979 * 0, which means to process everything in the tree at the start 2980 * of the run (but not newly added entries), or it can be some target 2981 * number you'd like to process. 
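 * Passing (unsigned long)-1 keeps looping until no delayed ref heads remain,
 * including heads added while we run.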
2982 * 2983 * Returns 0 on success or if called with an aborted transaction 2984 * Returns <0 on error and aborts the transaction 2985 */ 2986 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2987 unsigned long count) 2988 { 2989 struct btrfs_fs_info *fs_info = trans->fs_info; 2990 struct rb_node *node; 2991 struct btrfs_delayed_ref_root *delayed_refs; 2992 struct btrfs_delayed_ref_head *head; 2993 int ret; 2994 int run_all = count == (unsigned long)-1; 2995 2996 /* We'll clean this up in btrfs_cleanup_transaction */ 2997 if (trans->aborted) 2998 return 0; 2999 3000 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 3001 return 0; 3002 3003 delayed_refs = &trans->transaction->delayed_refs; 3004 if (count == 0) 3005 count = atomic_read(&delayed_refs->num_entries) * 2; 3006 3007 again: 3008 #ifdef SCRAMBLE_DELAYED_REFS 3009 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 3010 #endif 3011 ret = __btrfs_run_delayed_refs(trans, count); 3012 if (ret < 0) { 3013 btrfs_abort_transaction(trans, ret); 3014 return ret; 3015 } 3016 3017 if (run_all) { 3018 if (!list_empty(&trans->new_bgs)) 3019 btrfs_create_pending_block_groups(trans); 3020 3021 spin_lock(&delayed_refs->lock); 3022 node = rb_first_cached(&delayed_refs->href_root); 3023 if (!node) { 3024 spin_unlock(&delayed_refs->lock); 3025 goto out; 3026 } 3027 head = rb_entry(node, struct btrfs_delayed_ref_head, 3028 href_node); 3029 refcount_inc(&head->refs); 3030 spin_unlock(&delayed_refs->lock); 3031 3032 /* Mutex was contended, block until it's released and retry. */ 3033 mutex_lock(&head->mutex); 3034 mutex_unlock(&head->mutex); 3035 3036 btrfs_put_delayed_ref_head(head); 3037 cond_resched(); 3038 goto again; 3039 } 3040 out: 3041 return 0; 3042 } 3043 3044 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3045 struct btrfs_fs_info *fs_info, 3046 u64 bytenr, u64 num_bytes, u64 flags, 3047 int level, int is_data) 3048 { 3049 struct btrfs_delayed_extent_op *extent_op; 3050 int ret; 3051 3052 extent_op = btrfs_alloc_delayed_extent_op(); 3053 if (!extent_op) 3054 return -ENOMEM; 3055 3056 extent_op->flags_to_set = flags; 3057 extent_op->update_flags = true; 3058 extent_op->update_key = false; 3059 extent_op->is_data = is_data ? 
true : false; 3060 extent_op->level = level; 3061 3062 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, 3063 num_bytes, extent_op); 3064 if (ret) 3065 btrfs_free_delayed_extent_op(extent_op); 3066 return ret; 3067 } 3068 3069 static noinline int check_delayed_ref(struct btrfs_root *root, 3070 struct btrfs_path *path, 3071 u64 objectid, u64 offset, u64 bytenr) 3072 { 3073 struct btrfs_delayed_ref_head *head; 3074 struct btrfs_delayed_ref_node *ref; 3075 struct btrfs_delayed_data_ref *data_ref; 3076 struct btrfs_delayed_ref_root *delayed_refs; 3077 struct btrfs_transaction *cur_trans; 3078 struct rb_node *node; 3079 int ret = 0; 3080 3081 spin_lock(&root->fs_info->trans_lock); 3082 cur_trans = root->fs_info->running_transaction; 3083 if (cur_trans) 3084 refcount_inc(&cur_trans->use_count); 3085 spin_unlock(&root->fs_info->trans_lock); 3086 if (!cur_trans) 3087 return 0; 3088 3089 delayed_refs = &cur_trans->delayed_refs; 3090 spin_lock(&delayed_refs->lock); 3091 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3092 if (!head) { 3093 spin_unlock(&delayed_refs->lock); 3094 btrfs_put_transaction(cur_trans); 3095 return 0; 3096 } 3097 3098 if (!mutex_trylock(&head->mutex)) { 3099 refcount_inc(&head->refs); 3100 spin_unlock(&delayed_refs->lock); 3101 3102 btrfs_release_path(path); 3103 3104 /* 3105 * Mutex was contended, block until it's released and let 3106 * caller try again 3107 */ 3108 mutex_lock(&head->mutex); 3109 mutex_unlock(&head->mutex); 3110 btrfs_put_delayed_ref_head(head); 3111 btrfs_put_transaction(cur_trans); 3112 return -EAGAIN; 3113 } 3114 spin_unlock(&delayed_refs->lock); 3115 3116 spin_lock(&head->lock); 3117 /* 3118 * XXX: We should replace this with a proper search function in the 3119 * future. 3120 */ 3121 for (node = rb_first_cached(&head->ref_tree); node; 3122 node = rb_next(node)) { 3123 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 3124 /* If it's a shared ref we know a cross reference exists */ 3125 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3126 ret = 1; 3127 break; 3128 } 3129 3130 data_ref = btrfs_delayed_node_to_data_ref(ref); 3131 3132 /* 3133 * If our ref doesn't match the one we're currently looking at 3134 * then we have a cross reference. 
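 * Any mismatch on the root, objectid or offset is treated as a cross
 * reference, i.e. the extent is considered shared.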
3135 */ 3136 if (data_ref->root != root->root_key.objectid || 3137 data_ref->objectid != objectid || 3138 data_ref->offset != offset) { 3139 ret = 1; 3140 break; 3141 } 3142 } 3143 spin_unlock(&head->lock); 3144 mutex_unlock(&head->mutex); 3145 btrfs_put_transaction(cur_trans); 3146 return ret; 3147 } 3148 3149 static noinline int check_committed_ref(struct btrfs_root *root, 3150 struct btrfs_path *path, 3151 u64 objectid, u64 offset, u64 bytenr) 3152 { 3153 struct btrfs_fs_info *fs_info = root->fs_info; 3154 struct btrfs_root *extent_root = fs_info->extent_root; 3155 struct extent_buffer *leaf; 3156 struct btrfs_extent_data_ref *ref; 3157 struct btrfs_extent_inline_ref *iref; 3158 struct btrfs_extent_item *ei; 3159 struct btrfs_key key; 3160 u32 item_size; 3161 int type; 3162 int ret; 3163 3164 key.objectid = bytenr; 3165 key.offset = (u64)-1; 3166 key.type = BTRFS_EXTENT_ITEM_KEY; 3167 3168 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3169 if (ret < 0) 3170 goto out; 3171 BUG_ON(ret == 0); /* Corruption */ 3172 3173 ret = -ENOENT; 3174 if (path->slots[0] == 0) 3175 goto out; 3176 3177 path->slots[0]--; 3178 leaf = path->nodes[0]; 3179 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3180 3181 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3182 goto out; 3183 3184 ret = 1; 3185 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3186 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3187 3188 if (item_size != sizeof(*ei) + 3189 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3190 goto out; 3191 3192 if (btrfs_extent_generation(leaf, ei) <= 3193 btrfs_root_last_snapshot(&root->root_item)) 3194 goto out; 3195 3196 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3197 3198 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 3199 if (type != BTRFS_EXTENT_DATA_REF_KEY) 3200 goto out; 3201 3202 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3203 if (btrfs_extent_refs(leaf, ei) != 3204 btrfs_extent_data_ref_count(leaf, ref) || 3205 btrfs_extent_data_ref_root(leaf, ref) != 3206 root->root_key.objectid || 3207 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3208 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3209 goto out; 3210 3211 ret = 0; 3212 out: 3213 return ret; 3214 } 3215 3216 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 3217 u64 bytenr) 3218 { 3219 struct btrfs_path *path; 3220 int ret; 3221 3222 path = btrfs_alloc_path(); 3223 if (!path) 3224 return -ENOMEM; 3225 3226 do { 3227 ret = check_committed_ref(root, path, objectid, 3228 offset, bytenr); 3229 if (ret && ret != -ENOENT) 3230 goto out; 3231 3232 ret = check_delayed_ref(root, path, objectid, offset, bytenr); 3233 } while (ret == -EAGAIN); 3234 3235 out: 3236 btrfs_free_path(path); 3237 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3238 WARN_ON(ret > 0); 3239 return ret; 3240 } 3241 3242 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3243 struct btrfs_root *root, 3244 struct extent_buffer *buf, 3245 int full_backref, int inc) 3246 { 3247 struct btrfs_fs_info *fs_info = root->fs_info; 3248 u64 bytenr; 3249 u64 num_bytes; 3250 u64 parent; 3251 u64 ref_root; 3252 u32 nritems; 3253 struct btrfs_key key; 3254 struct btrfs_file_extent_item *fi; 3255 int i; 3256 int level; 3257 int ret = 0; 3258 int (*process_func)(struct btrfs_trans_handle *, 3259 struct btrfs_root *, 3260 u64, u64, u64, u64, u64, u64); 3261 3262 3263 if (btrfs_is_testing(fs_info)) 3264 return 0; 
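/*
 * Walk every item in this buffer and add or drop one reference for each
 * extent it points to, using buf->start as the parent when full_backref
 * is set.
 */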
3265 3266 ref_root = btrfs_header_owner(buf); 3267 nritems = btrfs_header_nritems(buf); 3268 level = btrfs_header_level(buf); 3269 3270 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3271 return 0; 3272 3273 if (inc) 3274 process_func = btrfs_inc_extent_ref; 3275 else 3276 process_func = btrfs_free_extent; 3277 3278 if (full_backref) 3279 parent = buf->start; 3280 else 3281 parent = 0; 3282 3283 for (i = 0; i < nritems; i++) { 3284 if (level == 0) { 3285 btrfs_item_key_to_cpu(buf, &key, i); 3286 if (key.type != BTRFS_EXTENT_DATA_KEY) 3287 continue; 3288 fi = btrfs_item_ptr(buf, i, 3289 struct btrfs_file_extent_item); 3290 if (btrfs_file_extent_type(buf, fi) == 3291 BTRFS_FILE_EXTENT_INLINE) 3292 continue; 3293 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3294 if (bytenr == 0) 3295 continue; 3296 3297 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3298 key.offset -= btrfs_file_extent_offset(buf, fi); 3299 ret = process_func(trans, root, bytenr, num_bytes, 3300 parent, ref_root, key.objectid, 3301 key.offset); 3302 if (ret) 3303 goto fail; 3304 } else { 3305 bytenr = btrfs_node_blockptr(buf, i); 3306 num_bytes = fs_info->nodesize; 3307 ret = process_func(trans, root, bytenr, num_bytes, 3308 parent, ref_root, level - 1, 0); 3309 if (ret) 3310 goto fail; 3311 } 3312 } 3313 return 0; 3314 fail: 3315 return ret; 3316 } 3317 3318 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3319 struct extent_buffer *buf, int full_backref) 3320 { 3321 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3322 } 3323 3324 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3325 struct extent_buffer *buf, int full_backref) 3326 { 3327 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3328 } 3329 3330 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3331 struct btrfs_fs_info *fs_info, 3332 struct btrfs_path *path, 3333 struct btrfs_block_group_cache *cache) 3334 { 3335 int ret; 3336 struct btrfs_root *extent_root = fs_info->extent_root; 3337 unsigned long bi; 3338 struct extent_buffer *leaf; 3339 3340 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3341 if (ret) { 3342 if (ret > 0) 3343 ret = -ENOENT; 3344 goto fail; 3345 } 3346 3347 leaf = path->nodes[0]; 3348 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3349 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3350 btrfs_mark_buffer_dirty(leaf); 3351 fail: 3352 btrfs_release_path(path); 3353 return ret; 3354 3355 } 3356 3357 static struct btrfs_block_group_cache * 3358 next_block_group(struct btrfs_fs_info *fs_info, 3359 struct btrfs_block_group_cache *cache) 3360 { 3361 struct rb_node *node; 3362 3363 spin_lock(&fs_info->block_group_cache_lock); 3364 3365 /* If our block group was removed, we need a full search. 
*/ 3366 if (RB_EMPTY_NODE(&cache->cache_node)) { 3367 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3368 3369 spin_unlock(&fs_info->block_group_cache_lock); 3370 btrfs_put_block_group(cache); 3371 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3372 } 3373 node = rb_next(&cache->cache_node); 3374 btrfs_put_block_group(cache); 3375 if (node) { 3376 cache = rb_entry(node, struct btrfs_block_group_cache, 3377 cache_node); 3378 btrfs_get_block_group(cache); 3379 } else 3380 cache = NULL; 3381 spin_unlock(&fs_info->block_group_cache_lock); 3382 return cache; 3383 } 3384 3385 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3386 struct btrfs_trans_handle *trans, 3387 struct btrfs_path *path) 3388 { 3389 struct btrfs_fs_info *fs_info = block_group->fs_info; 3390 struct btrfs_root *root = fs_info->tree_root; 3391 struct inode *inode = NULL; 3392 struct extent_changeset *data_reserved = NULL; 3393 u64 alloc_hint = 0; 3394 int dcs = BTRFS_DC_ERROR; 3395 u64 num_pages = 0; 3396 int retries = 0; 3397 int ret = 0; 3398 3399 /* 3400 * If this block group is smaller than 100 megs don't bother caching the 3401 * block group. 3402 */ 3403 if (block_group->key.offset < (100 * SZ_1M)) { 3404 spin_lock(&block_group->lock); 3405 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3406 spin_unlock(&block_group->lock); 3407 return 0; 3408 } 3409 3410 if (trans->aborted) 3411 return 0; 3412 again: 3413 inode = lookup_free_space_inode(fs_info, block_group, path); 3414 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3415 ret = PTR_ERR(inode); 3416 btrfs_release_path(path); 3417 goto out; 3418 } 3419 3420 if (IS_ERR(inode)) { 3421 BUG_ON(retries); 3422 retries++; 3423 3424 if (block_group->ro) 3425 goto out_free; 3426 3427 ret = create_free_space_inode(fs_info, trans, block_group, 3428 path); 3429 if (ret) 3430 goto out_free; 3431 goto again; 3432 } 3433 3434 /* 3435 * We want to set the generation to 0, that way if anything goes wrong 3436 * from here on out we know not to trust this cache when we load up next 3437 * time. 3438 */ 3439 BTRFS_I(inode)->generation = 0; 3440 ret = btrfs_update_inode(trans, root, inode); 3441 if (ret) { 3442 /* 3443 * So theoretically we could recover from this, simply set the 3444 * super cache generation to 0 so we know to invalidate the 3445 * cache, but then we'd have to keep track of the block groups 3446 * that fail this way so we know we _have_ to reset this cache 3447 * before the next commit or risk reading stale cache. So to 3448 * limit our exposure to horrible edge cases lets just abort the 3449 * transaction, this only happens in really bad situations 3450 * anyway. 
3451 */ 3452 btrfs_abort_transaction(trans, ret); 3453 goto out_put; 3454 } 3455 WARN_ON(ret); 3456 3457 /* We've already setup this transaction, go ahead and exit */ 3458 if (block_group->cache_generation == trans->transid && 3459 i_size_read(inode)) { 3460 dcs = BTRFS_DC_SETUP; 3461 goto out_put; 3462 } 3463 3464 if (i_size_read(inode) > 0) { 3465 ret = btrfs_check_trunc_cache_free_space(fs_info, 3466 &fs_info->global_block_rsv); 3467 if (ret) 3468 goto out_put; 3469 3470 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3471 if (ret) 3472 goto out_put; 3473 } 3474 3475 spin_lock(&block_group->lock); 3476 if (block_group->cached != BTRFS_CACHE_FINISHED || 3477 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3478 /* 3479 * don't bother trying to write stuff out _if_ 3480 * a) we're not cached, 3481 * b) we're with nospace_cache mount option, 3482 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3483 */ 3484 dcs = BTRFS_DC_WRITTEN; 3485 spin_unlock(&block_group->lock); 3486 goto out_put; 3487 } 3488 spin_unlock(&block_group->lock); 3489 3490 /* 3491 * We hit an ENOSPC when setting up the cache in this transaction, just 3492 * skip doing the setup, we've already cleared the cache so we're safe. 3493 */ 3494 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3495 ret = -ENOSPC; 3496 goto out_put; 3497 } 3498 3499 /* 3500 * Try to preallocate enough space based on how big the block group is. 3501 * Keep in mind this has to include any pinned space which could end up 3502 * taking up quite a bit since it's not folded into the other space 3503 * cache. 3504 */ 3505 num_pages = div_u64(block_group->key.offset, SZ_256M); 3506 if (!num_pages) 3507 num_pages = 1; 3508 3509 num_pages *= 16; 3510 num_pages *= PAGE_SIZE; 3511 3512 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); 3513 if (ret) 3514 goto out_put; 3515 3516 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3517 num_pages, num_pages, 3518 &alloc_hint); 3519 /* 3520 * Our cache requires contiguous chunks so that we don't modify a bunch 3521 * of metadata or split extents when writing the cache out, which means 3522 * we can enospc if we are heavily fragmented in addition to just normal 3523 * out of space conditions. So if we hit this just skip setting up any 3524 * other block groups for this transaction, maybe we'll unpin enough 3525 * space the next time around. 
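 * The -ENOSPC case is remembered by setting BTRFS_TRANS_CACHE_ENOSPC on the
 * transaction, which is the bit tested before the preallocation above.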
3526 */ 3527 if (!ret) 3528 dcs = BTRFS_DC_SETUP; 3529 else if (ret == -ENOSPC) 3530 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3531 3532 out_put: 3533 iput(inode); 3534 out_free: 3535 btrfs_release_path(path); 3536 out: 3537 spin_lock(&block_group->lock); 3538 if (!ret && dcs == BTRFS_DC_SETUP) 3539 block_group->cache_generation = trans->transid; 3540 block_group->disk_cache_state = dcs; 3541 spin_unlock(&block_group->lock); 3542 3543 extent_changeset_free(data_reserved); 3544 return ret; 3545 } 3546 3547 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3548 struct btrfs_fs_info *fs_info) 3549 { 3550 struct btrfs_block_group_cache *cache, *tmp; 3551 struct btrfs_transaction *cur_trans = trans->transaction; 3552 struct btrfs_path *path; 3553 3554 if (list_empty(&cur_trans->dirty_bgs) || 3555 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3556 return 0; 3557 3558 path = btrfs_alloc_path(); 3559 if (!path) 3560 return -ENOMEM; 3561 3562 /* Could add new block groups, use _safe just in case */ 3563 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3564 dirty_list) { 3565 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3566 cache_save_setup(cache, trans, path); 3567 } 3568 3569 btrfs_free_path(path); 3570 return 0; 3571 } 3572 3573 /* 3574 * transaction commit does final block group cache writeback during a 3575 * critical section where nothing is allowed to change the FS. This is 3576 * required in order for the cache to actually match the block group, 3577 * but can introduce a lot of latency into the commit. 3578 * 3579 * So, btrfs_start_dirty_block_groups is here to kick off block group 3580 * cache IO. There's a chance we'll have to redo some of it if the 3581 * block group changes again during the commit, but it greatly reduces 3582 * the commit latency by getting rid of the easy block groups while 3583 * we're still allowing others to join the commit. 3584 */ 3585 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) 3586 { 3587 struct btrfs_fs_info *fs_info = trans->fs_info; 3588 struct btrfs_block_group_cache *cache; 3589 struct btrfs_transaction *cur_trans = trans->transaction; 3590 int ret = 0; 3591 int should_put; 3592 struct btrfs_path *path = NULL; 3593 LIST_HEAD(dirty); 3594 struct list_head *io = &cur_trans->io_bgs; 3595 int num_started = 0; 3596 int loops = 0; 3597 3598 spin_lock(&cur_trans->dirty_bgs_lock); 3599 if (list_empty(&cur_trans->dirty_bgs)) { 3600 spin_unlock(&cur_trans->dirty_bgs_lock); 3601 return 0; 3602 } 3603 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3604 spin_unlock(&cur_trans->dirty_bgs_lock); 3605 3606 again: 3607 /* 3608 * make sure all the block groups on our dirty list actually 3609 * exist 3610 */ 3611 btrfs_create_pending_block_groups(trans); 3612 3613 if (!path) { 3614 path = btrfs_alloc_path(); 3615 if (!path) 3616 return -ENOMEM; 3617 } 3618 3619 /* 3620 * cache_write_mutex is here only to save us from balance or automatic 3621 * removal of empty block groups deleting this block group while we are 3622 * writing out the cache 3623 */ 3624 mutex_lock(&trans->transaction->cache_write_mutex); 3625 while (!list_empty(&dirty)) { 3626 bool drop_reserve = true; 3627 3628 cache = list_first_entry(&dirty, 3629 struct btrfs_block_group_cache, 3630 dirty_list); 3631 /* 3632 * this can happen if something re-dirties a block 3633 * group that is already under IO. 
Just wait for it to 3634 * finish and then do it all again 3635 */ 3636 if (!list_empty(&cache->io_list)) { 3637 list_del_init(&cache->io_list); 3638 btrfs_wait_cache_io(trans, cache, path); 3639 btrfs_put_block_group(cache); 3640 } 3641 3642 3643 /* 3644 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3645 * if it should update the cache_state. Don't delete 3646 * until after we wait. 3647 * 3648 * Since we're not running in the commit critical section 3649 * we need the dirty_bgs_lock to protect from update_block_group 3650 */ 3651 spin_lock(&cur_trans->dirty_bgs_lock); 3652 list_del_init(&cache->dirty_list); 3653 spin_unlock(&cur_trans->dirty_bgs_lock); 3654 3655 should_put = 1; 3656 3657 cache_save_setup(cache, trans, path); 3658 3659 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3660 cache->io_ctl.inode = NULL; 3661 ret = btrfs_write_out_cache(fs_info, trans, 3662 cache, path); 3663 if (ret == 0 && cache->io_ctl.inode) { 3664 num_started++; 3665 should_put = 0; 3666 3667 /* 3668 * The cache_write_mutex is protecting the 3669 * io_list, also refer to the definition of 3670 * btrfs_transaction::io_bgs for more details 3671 */ 3672 list_add_tail(&cache->io_list, io); 3673 } else { 3674 /* 3675 * if we failed to write the cache, the 3676 * generation will be bad and life goes on 3677 */ 3678 ret = 0; 3679 } 3680 } 3681 if (!ret) { 3682 ret = write_one_cache_group(trans, fs_info, 3683 path, cache); 3684 /* 3685 * Our block group might still be attached to the list 3686 * of new block groups in the transaction handle of some 3687 * other task (struct btrfs_trans_handle->new_bgs). This 3688 * means its block group item isn't yet in the extent 3689 * tree. If this happens ignore the error, as we will 3690 * try again later in the critical section of the 3691 * transaction commit. 3692 */ 3693 if (ret == -ENOENT) { 3694 ret = 0; 3695 spin_lock(&cur_trans->dirty_bgs_lock); 3696 if (list_empty(&cache->dirty_list)) { 3697 list_add_tail(&cache->dirty_list, 3698 &cur_trans->dirty_bgs); 3699 btrfs_get_block_group(cache); 3700 drop_reserve = false; 3701 } 3702 spin_unlock(&cur_trans->dirty_bgs_lock); 3703 } else if (ret) { 3704 btrfs_abort_transaction(trans, ret); 3705 } 3706 } 3707 3708 /* if it's not on the io list, we need to put the block group */ 3709 if (should_put) 3710 btrfs_put_block_group(cache); 3711 if (drop_reserve) 3712 btrfs_delayed_refs_rsv_release(fs_info, 1); 3713 3714 if (ret) 3715 break; 3716 3717 /* 3718 * Avoid blocking other tasks for too long. It might even save 3719 * us from writing caches for block groups that are going to be 3720 * removed. 3721 */ 3722 mutex_unlock(&trans->transaction->cache_write_mutex); 3723 mutex_lock(&trans->transaction->cache_write_mutex); 3724 } 3725 mutex_unlock(&trans->transaction->cache_write_mutex); 3726 3727 /* 3728 * go through delayed refs for all the stuff we've just kicked off 3729 * and then loop back (just once) 3730 */ 3731 ret = btrfs_run_delayed_refs(trans, 0); 3732 if (!ret && loops == 0) { 3733 loops++; 3734 spin_lock(&cur_trans->dirty_bgs_lock); 3735 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3736 /* 3737 * dirty_bgs_lock protects us from concurrent block group 3738 * deletes too (not just cache_write_mutex). 
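 * If more block groups were dirtied while we were writing, they are spliced
 * back onto our local list and the whole pass is repeated at most once
 * (the loops == 0 check above).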
3739 */ 3740 if (!list_empty(&dirty)) { 3741 spin_unlock(&cur_trans->dirty_bgs_lock); 3742 goto again; 3743 } 3744 spin_unlock(&cur_trans->dirty_bgs_lock); 3745 } else if (ret < 0) { 3746 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3747 } 3748 3749 btrfs_free_path(path); 3750 return ret; 3751 } 3752 3753 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3754 struct btrfs_fs_info *fs_info) 3755 { 3756 struct btrfs_block_group_cache *cache; 3757 struct btrfs_transaction *cur_trans = trans->transaction; 3758 int ret = 0; 3759 int should_put; 3760 struct btrfs_path *path; 3761 struct list_head *io = &cur_trans->io_bgs; 3762 int num_started = 0; 3763 3764 path = btrfs_alloc_path(); 3765 if (!path) 3766 return -ENOMEM; 3767 3768 /* 3769 * Even though we are in the critical section of the transaction commit, 3770 * we can still have concurrent tasks adding elements to this 3771 * transaction's list of dirty block groups. These tasks correspond to 3772 * endio free space workers started when writeback finishes for a 3773 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3774 * allocate new block groups as a result of COWing nodes of the root 3775 * tree when updating the free space inode. The writeback for the space 3776 * caches is triggered by an earlier call to 3777 * btrfs_start_dirty_block_groups() and iterations of the following 3778 * loop. 3779 * Also we want to do the cache_save_setup first and then run the 3780 * delayed refs to make sure we have the best chance at doing this all 3781 * in one shot. 3782 */ 3783 spin_lock(&cur_trans->dirty_bgs_lock); 3784 while (!list_empty(&cur_trans->dirty_bgs)) { 3785 cache = list_first_entry(&cur_trans->dirty_bgs, 3786 struct btrfs_block_group_cache, 3787 dirty_list); 3788 3789 /* 3790 * this can happen if cache_save_setup re-dirties a block 3791 * group that is already under IO. Just wait for it to 3792 * finish and then do it all again 3793 */ 3794 if (!list_empty(&cache->io_list)) { 3795 spin_unlock(&cur_trans->dirty_bgs_lock); 3796 list_del_init(&cache->io_list); 3797 btrfs_wait_cache_io(trans, cache, path); 3798 btrfs_put_block_group(cache); 3799 spin_lock(&cur_trans->dirty_bgs_lock); 3800 } 3801 3802 /* 3803 * don't remove from the dirty list until after we've waited 3804 * on any pending IO 3805 */ 3806 list_del_init(&cache->dirty_list); 3807 spin_unlock(&cur_trans->dirty_bgs_lock); 3808 should_put = 1; 3809 3810 cache_save_setup(cache, trans, path); 3811 3812 if (!ret) 3813 ret = btrfs_run_delayed_refs(trans, 3814 (unsigned long) -1); 3815 3816 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3817 cache->io_ctl.inode = NULL; 3818 ret = btrfs_write_out_cache(fs_info, trans, 3819 cache, path); 3820 if (ret == 0 && cache->io_ctl.inode) { 3821 num_started++; 3822 should_put = 0; 3823 list_add_tail(&cache->io_list, io); 3824 } else { 3825 /* 3826 * if we failed to write the cache, the 3827 * generation will be bad and life goes on 3828 */ 3829 ret = 0; 3830 } 3831 } 3832 if (!ret) { 3833 ret = write_one_cache_group(trans, fs_info, 3834 path, cache); 3835 /* 3836 * One of the free space endio workers might have 3837 * created a new block group while updating a free space 3838 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3839 * and hasn't released its transaction handle yet, in 3840 * which case the new block group is still attached to 3841 * its transaction handle and its creation has not 3842 * finished yet (no block group item in the extent tree 3843 * yet, etc). 
If this is the case, wait for all free 3844 * space endio workers to finish and retry. This is a 3845 * very rare case so there is no need for a more efficient and 3846 * complex approach. 3847 */ 3848 if (ret == -ENOENT) { 3849 wait_event(cur_trans->writer_wait, 3850 atomic_read(&cur_trans->num_writers) == 1); 3851 ret = write_one_cache_group(trans, fs_info, 3852 path, cache); 3853 } 3854 if (ret) 3855 btrfs_abort_transaction(trans, ret); 3856 } 3857 3858 /* if it's not on the io list, we need to put the block group */ 3859 if (should_put) 3860 btrfs_put_block_group(cache); 3861 btrfs_delayed_refs_rsv_release(fs_info, 1); 3862 spin_lock(&cur_trans->dirty_bgs_lock); 3863 } 3864 spin_unlock(&cur_trans->dirty_bgs_lock); 3865 3866 /* 3867 * Refer to the definition of the io_bgs member for details on why it's safe 3868 * to use it without any locking 3869 */ 3870 while (!list_empty(io)) { 3871 cache = list_first_entry(io, struct btrfs_block_group_cache, 3872 io_list); 3873 list_del_init(&cache->io_list); 3874 btrfs_wait_cache_io(trans, cache, path); 3875 btrfs_put_block_group(cache); 3876 } 3877 3878 btrfs_free_path(path); 3879 return ret; 3880 } 3881 3882 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3883 { 3884 struct btrfs_block_group_cache *block_group; 3885 int readonly = 0; 3886 3887 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3888 if (!block_group || block_group->ro) 3889 readonly = 1; 3890 if (block_group) 3891 btrfs_put_block_group(block_group); 3892 return readonly; 3893 } 3894 3895 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3896 { 3897 struct btrfs_block_group_cache *bg; 3898 bool ret = true; 3899 3900 bg = btrfs_lookup_block_group(fs_info, bytenr); 3901 if (!bg) 3902 return false; 3903 3904 spin_lock(&bg->lock); 3905 if (bg->ro) 3906 ret = false; 3907 else 3908 atomic_inc(&bg->nocow_writers); 3909 spin_unlock(&bg->lock); 3910 3911 /* no put on block group, done by btrfs_dec_nocow_writers */ 3912 if (!ret) 3913 btrfs_put_block_group(bg); 3914 3915 return ret; 3916 3917 } 3918 3919 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3920 { 3921 struct btrfs_block_group_cache *bg; 3922 3923 bg = btrfs_lookup_block_group(fs_info, bytenr); 3924 ASSERT(bg); 3925 if (atomic_dec_and_test(&bg->nocow_writers)) 3926 wake_up_var(&bg->nocow_writers); 3927 /* 3928 * Once for our lookup and once for the lookup done by a previous call 3929 * to btrfs_inc_nocow_writers() 3930 */ 3931 btrfs_put_block_group(bg); 3932 btrfs_put_block_group(bg); 3933 } 3934 3935 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3936 { 3937 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 3938 } 3939 3940 static const char *alloc_name(u64 flags) 3941 { 3942 switch (flags) { 3943 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3944 return "mixed"; 3945 case BTRFS_BLOCK_GROUP_METADATA: 3946 return "metadata"; 3947 case BTRFS_BLOCK_GROUP_DATA: 3948 return "data"; 3949 case BTRFS_BLOCK_GROUP_SYSTEM: 3950 return "system"; 3951 default: 3952 WARN_ON(1); 3953 return "invalid-combination"; 3954 }; 3955 } 3956 3957 static int create_space_info(struct btrfs_fs_info *info, u64 flags) 3958 { 3959 3960 struct btrfs_space_info *space_info; 3961 int i; 3962 int ret; 3963 3964 space_info = kzalloc(sizeof(*space_info), GFP_NOFS); 3965 if (!space_info) 3966 return -ENOMEM; 3967 3968 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, 3969 GFP_KERNEL); 3970 if (ret) { 3971 kfree(space_info); 3972 return ret; 3973 }
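/*
 * The percpu counter is set up; initialize the per-RAID-type block group
 * lists, locks and ticket lists, then add the kobject and publish the new
 * space_info on the fs_info list.
 */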
3974 3975 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3976 INIT_LIST_HEAD(&space_info->block_groups[i]); 3977 init_rwsem(&space_info->groups_sem); 3978 spin_lock_init(&space_info->lock); 3979 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3980 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3981 init_waitqueue_head(&space_info->wait); 3982 INIT_LIST_HEAD(&space_info->ro_bgs); 3983 INIT_LIST_HEAD(&space_info->tickets); 3984 INIT_LIST_HEAD(&space_info->priority_tickets); 3985 3986 ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, 3987 info->space_info_kobj, "%s", 3988 alloc_name(space_info->flags)); 3989 if (ret) { 3990 percpu_counter_destroy(&space_info->total_bytes_pinned); 3991 kfree(space_info); 3992 return ret; 3993 } 3994 3995 list_add_rcu(&space_info->list, &info->space_info); 3996 if (flags & BTRFS_BLOCK_GROUP_DATA) 3997 info->data_sinfo = space_info; 3998 3999 return ret; 4000 } 4001 4002 static void update_space_info(struct btrfs_fs_info *info, u64 flags, 4003 u64 total_bytes, u64 bytes_used, 4004 u64 bytes_readonly, 4005 struct btrfs_space_info **space_info) 4006 { 4007 struct btrfs_space_info *found; 4008 int factor; 4009 4010 factor = btrfs_bg_type_to_factor(flags); 4011 4012 found = __find_space_info(info, flags); 4013 ASSERT(found); 4014 spin_lock(&found->lock); 4015 found->total_bytes += total_bytes; 4016 found->disk_total += total_bytes * factor; 4017 found->bytes_used += bytes_used; 4018 found->disk_used += bytes_used * factor; 4019 found->bytes_readonly += bytes_readonly; 4020 if (total_bytes > 0) 4021 found->full = 0; 4022 space_info_add_new_bytes(info, found, total_bytes - 4023 bytes_used - bytes_readonly); 4024 spin_unlock(&found->lock); 4025 *space_info = found; 4026 } 4027 4028 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 4029 { 4030 u64 extra_flags = chunk_to_extended(flags) & 4031 BTRFS_EXTENDED_PROFILE_MASK; 4032 4033 write_seqlock(&fs_info->profiles_lock); 4034 if (flags & BTRFS_BLOCK_GROUP_DATA) 4035 fs_info->avail_data_alloc_bits |= extra_flags; 4036 if (flags & BTRFS_BLOCK_GROUP_METADATA) 4037 fs_info->avail_metadata_alloc_bits |= extra_flags; 4038 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4039 fs_info->avail_system_alloc_bits |= extra_flags; 4040 write_sequnlock(&fs_info->profiles_lock); 4041 } 4042 4043 /* 4044 * returns target flags in extended format or 0 if restripe for this 4045 * chunk_type is not in progress 4046 * 4047 * should be called with balance_lock held 4048 */ 4049 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4050 { 4051 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4052 u64 target = 0; 4053 4054 if (!bctl) 4055 return 0; 4056 4057 if (flags & BTRFS_BLOCK_GROUP_DATA && 4058 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4059 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4060 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4061 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4062 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4063 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4064 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4065 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4066 } 4067 4068 return target; 4069 } 4070 4071 /* 4072 * @flags: available profiles in extended format (see ctree.h) 4073 * 4074 * Returns reduced profile in chunk format. If profile changing is in 4075 * progress (either running or paused) picks the target profile (if it's 4076 * already available), otherwise falls back to plain reducing. 
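 * Plain reducing keeps only one of the still-possible redundancy profiles,
 * preferred in the order RAID6, RAID5, RAID10, RAID1, RAID0.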
4077 */ 4078 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 4079 { 4080 u64 num_devices = fs_info->fs_devices->rw_devices; 4081 u64 target; 4082 u64 raid_type; 4083 u64 allowed = 0; 4084 4085 /* 4086 * see if restripe for this chunk_type is in progress, if so 4087 * try to reduce to the target profile 4088 */ 4089 spin_lock(&fs_info->balance_lock); 4090 target = get_restripe_target(fs_info, flags); 4091 if (target) { 4092 /* pick target profile only if it's already available */ 4093 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4094 spin_unlock(&fs_info->balance_lock); 4095 return extended_to_chunk(target); 4096 } 4097 } 4098 spin_unlock(&fs_info->balance_lock); 4099 4100 /* First, mask out the RAID levels which aren't possible */ 4101 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4102 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4103 allowed |= btrfs_raid_array[raid_type].bg_flag; 4104 } 4105 allowed &= flags; 4106 4107 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4108 allowed = BTRFS_BLOCK_GROUP_RAID6; 4109 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4110 allowed = BTRFS_BLOCK_GROUP_RAID5; 4111 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4112 allowed = BTRFS_BLOCK_GROUP_RAID10; 4113 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4114 allowed = BTRFS_BLOCK_GROUP_RAID1; 4115 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4116 allowed = BTRFS_BLOCK_GROUP_RAID0; 4117 4118 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4119 4120 return extended_to_chunk(flags | allowed); 4121 } 4122 4123 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 4124 { 4125 unsigned seq; 4126 u64 flags; 4127 4128 do { 4129 flags = orig_flags; 4130 seq = read_seqbegin(&fs_info->profiles_lock); 4131 4132 if (flags & BTRFS_BLOCK_GROUP_DATA) 4133 flags |= fs_info->avail_data_alloc_bits; 4134 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4135 flags |= fs_info->avail_system_alloc_bits; 4136 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4137 flags |= fs_info->avail_metadata_alloc_bits; 4138 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4139 4140 return btrfs_reduce_alloc_profile(fs_info, flags); 4141 } 4142 4143 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 4144 { 4145 struct btrfs_fs_info *fs_info = root->fs_info; 4146 u64 flags; 4147 u64 ret; 4148 4149 if (data) 4150 flags = BTRFS_BLOCK_GROUP_DATA; 4151 else if (root == fs_info->chunk_root) 4152 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4153 else 4154 flags = BTRFS_BLOCK_GROUP_METADATA; 4155 4156 ret = get_alloc_profile(fs_info, flags); 4157 return ret; 4158 } 4159 4160 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) 4161 { 4162 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA); 4163 } 4164 4165 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info) 4166 { 4167 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4168 } 4169 4170 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) 4171 { 4172 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4173 } 4174 4175 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4176 bool may_use_included) 4177 { 4178 ASSERT(s_info); 4179 return s_info->bytes_used + s_info->bytes_reserved + 4180 s_info->bytes_pinned + s_info->bytes_readonly + 4181 (may_use_included ? 
s_info->bytes_may_use : 0); 4182 } 4183 4184 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 4185 { 4186 struct btrfs_root *root = inode->root; 4187 struct btrfs_fs_info *fs_info = root->fs_info; 4188 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 4189 u64 used; 4190 int ret = 0; 4191 int need_commit = 2; 4192 int have_pinned_space; 4193 4194 /* make sure bytes are sectorsize aligned */ 4195 bytes = ALIGN(bytes, fs_info->sectorsize); 4196 4197 if (btrfs_is_free_space_inode(inode)) { 4198 need_commit = 0; 4199 ASSERT(current->journal_info); 4200 } 4201 4202 again: 4203 /* make sure we have enough space to handle the data first */ 4204 spin_lock(&data_sinfo->lock); 4205 used = btrfs_space_info_used(data_sinfo, true); 4206 4207 if (used + bytes > data_sinfo->total_bytes) { 4208 struct btrfs_trans_handle *trans; 4209 4210 /* 4211 * if we don't have enough free bytes in this space then we need 4212 * to alloc a new chunk. 4213 */ 4214 if (!data_sinfo->full) { 4215 u64 alloc_target; 4216 4217 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4218 spin_unlock(&data_sinfo->lock); 4219 4220 alloc_target = btrfs_data_alloc_profile(fs_info); 4221 /* 4222 * It is ugly that we don't call nolock join 4223 * transaction for the free space inode case here. 4224 * But it is safe because we only do the data space 4225 * reservation for the free space cache in the 4226 * transaction context, the common join transaction 4227 * just increase the counter of the current transaction 4228 * handler, doesn't try to acquire the trans_lock of 4229 * the fs. 4230 */ 4231 trans = btrfs_join_transaction(root); 4232 if (IS_ERR(trans)) 4233 return PTR_ERR(trans); 4234 4235 ret = do_chunk_alloc(trans, alloc_target, 4236 CHUNK_ALLOC_NO_FORCE); 4237 btrfs_end_transaction(trans); 4238 if (ret < 0) { 4239 if (ret != -ENOSPC) 4240 return ret; 4241 else { 4242 have_pinned_space = 1; 4243 goto commit_trans; 4244 } 4245 } 4246 4247 goto again; 4248 } 4249 4250 /* 4251 * If we don't have enough pinned space to deal with this 4252 * allocation, and no removed chunk in current transaction, 4253 * don't bother committing the transaction. 4254 */ 4255 have_pinned_space = __percpu_counter_compare( 4256 &data_sinfo->total_bytes_pinned, 4257 used + bytes - data_sinfo->total_bytes, 4258 BTRFS_TOTAL_BYTES_PINNED_BATCH); 4259 spin_unlock(&data_sinfo->lock); 4260 4261 /* commit the current transaction and try again */ 4262 commit_trans: 4263 if (need_commit) { 4264 need_commit--; 4265 4266 if (need_commit > 0) { 4267 btrfs_start_delalloc_roots(fs_info, -1); 4268 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, 4269 (u64)-1); 4270 } 4271 4272 trans = btrfs_join_transaction(root); 4273 if (IS_ERR(trans)) 4274 return PTR_ERR(trans); 4275 if (have_pinned_space >= 0 || 4276 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4277 &trans->transaction->flags) || 4278 need_commit > 0) { 4279 ret = btrfs_commit_transaction(trans); 4280 if (ret) 4281 return ret; 4282 /* 4283 * The cleaner kthread might still be doing iput 4284 * operations. Wait for it to finish so that 4285 * more space is released. 
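 * Taking and immediately releasing the mutex below is simply a way to
 * wait for any in-flight delayed iput processing to finish.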
4286 */ 4287 mutex_lock(&fs_info->cleaner_delayed_iput_mutex); 4288 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); 4289 goto again; 4290 } else { 4291 btrfs_end_transaction(trans); 4292 } 4293 } 4294 4295 trace_btrfs_space_reservation(fs_info, 4296 "space_info:enospc", 4297 data_sinfo->flags, bytes, 1); 4298 return -ENOSPC; 4299 } 4300 update_bytes_may_use(data_sinfo, bytes); 4301 trace_btrfs_space_reservation(fs_info, "space_info", 4302 data_sinfo->flags, bytes, 1); 4303 spin_unlock(&data_sinfo->lock); 4304 4305 return 0; 4306 } 4307 4308 int btrfs_check_data_free_space(struct inode *inode, 4309 struct extent_changeset **reserved, u64 start, u64 len) 4310 { 4311 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4312 int ret; 4313 4314 /* align the range */ 4315 len = round_up(start + len, fs_info->sectorsize) - 4316 round_down(start, fs_info->sectorsize); 4317 start = round_down(start, fs_info->sectorsize); 4318 4319 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); 4320 if (ret < 0) 4321 return ret; 4322 4323 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ 4324 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); 4325 if (ret < 0) 4326 btrfs_free_reserved_data_space_noquota(inode, start, len); 4327 else 4328 ret = 0; 4329 return ret; 4330 } 4331 4332 /* 4333 * Called if we need to clear a data reservation for this inode, 4334 * normally in an error case. 4335 * 4336 * This one will *NOT* use the accurate qgroup reserved space API, just for 4337 * cases where we can't sleep and are sure it won't affect qgroup reserved space. 4338 * Like clear_bit_hook(). 4339 */ 4340 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4341 u64 len) 4342 { 4343 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4344 struct btrfs_space_info *data_sinfo; 4345 4346 /* Make sure the range is aligned to sectorsize */ 4347 len = round_up(start + len, fs_info->sectorsize) - 4348 round_down(start, fs_info->sectorsize); 4349 start = round_down(start, fs_info->sectorsize); 4350 4351 data_sinfo = fs_info->data_sinfo; 4352 spin_lock(&data_sinfo->lock); 4353 update_bytes_may_use(data_sinfo, -len); 4354 trace_btrfs_space_reservation(fs_info, "space_info", 4355 data_sinfo->flags, len, 0); 4356 spin_unlock(&data_sinfo->lock); 4357 } 4358 4359 /* 4360 * Called if we need to clear a data reservation for this inode, 4361 * normally in an error case. 4362 * 4363 * This one will handle the per-inode data rsv map for the accurate reserved 4364 * space framework.
4365 */ 4366 void btrfs_free_reserved_data_space(struct inode *inode, 4367 struct extent_changeset *reserved, u64 start, u64 len) 4368 { 4369 struct btrfs_root *root = BTRFS_I(inode)->root; 4370 4371 /* Make sure the range is aligned to sectorsize */ 4372 len = round_up(start + len, root->fs_info->sectorsize) - 4373 round_down(start, root->fs_info->sectorsize); 4374 start = round_down(start, root->fs_info->sectorsize); 4375 4376 btrfs_free_reserved_data_space_noquota(inode, start, len); 4377 btrfs_qgroup_free_data(inode, reserved, start, len); 4378 } 4379 4380 static void force_metadata_allocation(struct btrfs_fs_info *info) 4381 { 4382 struct list_head *head = &info->space_info; 4383 struct btrfs_space_info *found; 4384 4385 rcu_read_lock(); 4386 list_for_each_entry_rcu(found, head, list) { 4387 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4388 found->force_alloc = CHUNK_ALLOC_FORCE; 4389 } 4390 rcu_read_unlock(); 4391 } 4392 4393 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4394 { 4395 return (global->size << 1); 4396 } 4397 4398 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4399 struct btrfs_space_info *sinfo, int force) 4400 { 4401 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4402 u64 bytes_used = btrfs_space_info_used(sinfo, false); 4403 u64 thresh; 4404 4405 if (force == CHUNK_ALLOC_FORCE) 4406 return 1; 4407 4408 /* 4409 * We need to take into account the global rsv because for all intents 4410 * and purposes it's used space. Don't worry about locking the 4411 * global_rsv, it doesn't change except when the transaction commits. 4412 */ 4413 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 4414 bytes_used += calc_global_rsv_need_space(global_rsv); 4415 4416 /* 4417 * in limited mode, we want to have some free space up to 4418 * about 1% of the FS size. 4419 */ 4420 if (force == CHUNK_ALLOC_LIMITED) { 4421 thresh = btrfs_super_total_bytes(fs_info->super_copy); 4422 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4423 4424 if (sinfo->total_bytes - bytes_used < thresh) 4425 return 1; 4426 } 4427 4428 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) 4429 return 0; 4430 return 1; 4431 } 4432 4433 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4434 { 4435 u64 num_dev; 4436 4437 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4438 BTRFS_BLOCK_GROUP_RAID0 | 4439 BTRFS_BLOCK_GROUP_RAID5 | 4440 BTRFS_BLOCK_GROUP_RAID6)) 4441 num_dev = fs_info->fs_devices->rw_devices; 4442 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4443 num_dev = 2; 4444 else 4445 num_dev = 1; /* DUP or single */ 4446 4447 return num_dev; 4448 } 4449 4450 /* 4451 * Reserve space in the system space info necessary for adding or removing a 4452 * chunk of the given profile @type: we may need to update num_devs device 4453 * items and one chunk item. 4454 */ 4455 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) 4456 { 4457 struct btrfs_fs_info *fs_info = trans->fs_info; 4458 struct btrfs_space_info *info; 4459 u64 left; 4460 u64 thresh; 4461 int ret = 0; 4462 u64 num_devs; 4463 4464 /* 4465 * Needed because we can end up allocating a system chunk and for an 4466 * atomic and race free space reservation in the chunk block reserve.
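 * The caller must already hold fs_info->chunk_mutex, asserted right below.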
4467 */ 4468 lockdep_assert_held(&fs_info->chunk_mutex); 4469 4470 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4471 spin_lock(&info->lock); 4472 left = info->total_bytes - btrfs_space_info_used(info, true); 4473 spin_unlock(&info->lock); 4474 4475 num_devs = get_profile_num_devs(fs_info, type); 4476 4477 /* num_devs device items to update and 1 chunk item to add or remove */ 4478 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 4479 btrfs_calc_trans_metadata_size(fs_info, 1); 4480 4481 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4482 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4483 left, thresh, type); 4484 dump_space_info(fs_info, info, 0, 0); 4485 } 4486 4487 if (left < thresh) { 4488 u64 flags = btrfs_system_alloc_profile(fs_info); 4489 4490 /* 4491 * Ignore failure to create system chunk. We might end up not 4492 * needing it, as we might not need to COW all nodes/leafs from 4493 * the paths we visit in the chunk tree (they were already COWed 4494 * or created in the current transaction for example). 4495 */ 4496 ret = btrfs_alloc_chunk(trans, flags); 4497 } 4498 4499 if (!ret) { 4500 ret = btrfs_block_rsv_add(fs_info->chunk_root, 4501 &fs_info->chunk_block_rsv, 4502 thresh, BTRFS_RESERVE_NO_FLUSH); 4503 if (!ret) 4504 trans->chunk_bytes_reserved += thresh; 4505 } 4506 } 4507 4508 /* 4509 * If force is CHUNK_ALLOC_FORCE: 4510 * - return 1 if it successfully allocates a chunk, 4511 * - return errors including -ENOSPC otherwise. 4512 * If force is NOT CHUNK_ALLOC_FORCE: 4513 * - return 0 if it doesn't need to allocate a new chunk, 4514 * - return 1 if it successfully allocates a chunk, 4515 * - return errors including -ENOSPC otherwise. 4516 */ 4517 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 4518 int force) 4519 { 4520 struct btrfs_fs_info *fs_info = trans->fs_info; 4521 struct btrfs_space_info *space_info; 4522 bool wait_for_alloc = false; 4523 bool should_alloc = false; 4524 int ret = 0; 4525 4526 /* Don't re-enter if we're already allocating a chunk */ 4527 if (trans->allocating_chunk) 4528 return -ENOSPC; 4529 4530 space_info = __find_space_info(fs_info, flags); 4531 ASSERT(space_info); 4532 4533 do { 4534 spin_lock(&space_info->lock); 4535 if (force < space_info->force_alloc) 4536 force = space_info->force_alloc; 4537 should_alloc = should_alloc_chunk(fs_info, space_info, force); 4538 if (space_info->full) { 4539 /* No more free physical space */ 4540 if (should_alloc) 4541 ret = -ENOSPC; 4542 else 4543 ret = 0; 4544 spin_unlock(&space_info->lock); 4545 return ret; 4546 } else if (!should_alloc) { 4547 spin_unlock(&space_info->lock); 4548 return 0; 4549 } else if (space_info->chunk_alloc) { 4550 /* 4551 * Someone is already allocating, so we need to block 4552 * until this someone is finished and then loop to 4553 * recheck if we should continue with our allocation 4554 * attempt. 4555 */ 4556 wait_for_alloc = true; 4557 spin_unlock(&space_info->lock); 4558 mutex_lock(&fs_info->chunk_mutex); 4559 mutex_unlock(&fs_info->chunk_mutex); 4560 } else { 4561 /* Proceed with allocation */ 4562 space_info->chunk_alloc = 1; 4563 wait_for_alloc = false; 4564 spin_unlock(&space_info->lock); 4565 } 4566 4567 cond_resched(); 4568 } while (wait_for_alloc); 4569 4570 mutex_lock(&fs_info->chunk_mutex); 4571 trans->allocating_chunk = true; 4572 4573 /* 4574 * If we have mixed data/metadata chunks we want to make sure we keep 4575 * allocating mixed chunks instead of individual chunks. 
4576 */ 4577 if (btrfs_mixed_space_info(space_info)) 4578 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4579 4580 /* 4581 * if we're doing a data chunk, go ahead and make sure that 4582 * we keep a reasonable number of metadata chunks allocated in the 4583 * FS as well. 4584 */ 4585 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4586 fs_info->data_chunk_allocations++; 4587 if (!(fs_info->data_chunk_allocations % 4588 fs_info->metadata_ratio)) 4589 force_metadata_allocation(fs_info); 4590 } 4591 4592 /* 4593 * Check if we have enough space in SYSTEM chunk because we may need 4594 * to update devices. 4595 */ 4596 check_system_chunk(trans, flags); 4597 4598 ret = btrfs_alloc_chunk(trans, flags); 4599 trans->allocating_chunk = false; 4600 4601 spin_lock(&space_info->lock); 4602 if (ret < 0) { 4603 if (ret == -ENOSPC) 4604 space_info->full = 1; 4605 else 4606 goto out; 4607 } else { 4608 ret = 1; 4609 space_info->max_extent_size = 0; 4610 } 4611 4612 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4613 out: 4614 space_info->chunk_alloc = 0; 4615 spin_unlock(&space_info->lock); 4616 mutex_unlock(&fs_info->chunk_mutex); 4617 /* 4618 * When we allocate a new chunk we reserve space in the chunk block 4619 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4620 * add new nodes/leafs to it if we end up needing to do it when 4621 * inserting the chunk item and updating device items as part of the 4622 * second phase of chunk allocation, performed by 4623 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4624 * large number of new block groups to create in our transaction 4625 * handle's new_bgs list to avoid exhausting the chunk block reserve 4626 * in extreme cases - like having a single transaction create many new 4627 * block groups when starting to write out the free space caches of all 4628 * the block groups that were made dirty during the lifetime of the 4629 * transaction. 4630 */ 4631 if (trans->chunk_bytes_reserved >= (u64)SZ_2M) 4632 btrfs_create_pending_block_groups(trans); 4633 4634 return ret; 4635 } 4636 4637 static int can_overcommit(struct btrfs_fs_info *fs_info, 4638 struct btrfs_space_info *space_info, u64 bytes, 4639 enum btrfs_reserve_flush_enum flush, 4640 bool system_chunk) 4641 { 4642 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4643 u64 profile; 4644 u64 space_size; 4645 u64 avail; 4646 u64 used; 4647 int factor; 4648 4649 /* Don't overcommit when in mixed mode. */ 4650 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4651 return 0; 4652 4653 if (system_chunk) 4654 profile = btrfs_system_alloc_profile(fs_info); 4655 else 4656 profile = btrfs_metadata_alloc_profile(fs_info); 4657 4658 used = btrfs_space_info_used(space_info, false); 4659 4660 /* 4661 * We only want to allow over committing if we have lots of actual space 4662 * free, but if we don't have enough space to handle the global reserve 4663 * space then we could end up having a real enospc problem when trying 4664 * to allocate a chunk or some other such important allocation. 4665 */ 4666 spin_lock(&global_rsv->lock); 4667 space_size = calc_global_rsv_need_space(global_rsv); 4668 spin_unlock(&global_rsv->lock); 4669 if (used + space_size >= space_info->total_bytes) 4670 return 0; 4671 4672 used += space_info->bytes_may_use; 4673 4674 avail = atomic64_read(&fs_info->free_chunk_space); 4675 4676 /* 4677 * If we have dup, raid1 or raid10 then only half of the free 4678 * space is actually usable. 
For raid56, the space info used 4679 * doesn't include the parity drive, so we don't have to 4680 * change the math 4681 */ 4682 factor = btrfs_bg_type_to_factor(profile); 4683 avail = div_u64(avail, factor); 4684 4685 /* 4686 * If we aren't flushing all things, let us overcommit up to 4687 * 1/2th of the space. If we can flush, don't let us overcommit 4688 * too much, let it overcommit up to 1/8 of the space. 4689 */ 4690 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4691 avail >>= 3; 4692 else 4693 avail >>= 1; 4694 4695 if (used + bytes < space_info->total_bytes + avail) 4696 return 1; 4697 return 0; 4698 } 4699 4700 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4701 unsigned long nr_pages, int nr_items) 4702 { 4703 struct super_block *sb = fs_info->sb; 4704 4705 if (down_read_trylock(&sb->s_umount)) { 4706 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4707 up_read(&sb->s_umount); 4708 } else { 4709 /* 4710 * We needn't worry the filesystem going from r/w to r/o though 4711 * we don't acquire ->s_umount mutex, because the filesystem 4712 * should guarantee the delalloc inodes list be empty after 4713 * the filesystem is readonly(all dirty pages are written to 4714 * the disk). 4715 */ 4716 btrfs_start_delalloc_roots(fs_info, nr_items); 4717 if (!current->journal_info) 4718 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4719 } 4720 } 4721 4722 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4723 u64 to_reclaim) 4724 { 4725 u64 bytes; 4726 u64 nr; 4727 4728 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4729 nr = div64_u64(to_reclaim, bytes); 4730 if (!nr) 4731 nr = 1; 4732 return nr; 4733 } 4734 4735 #define EXTENT_SIZE_PER_ITEM SZ_256K 4736 4737 /* 4738 * shrink metadata reservation for delalloc 4739 */ 4740 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, 4741 u64 orig, bool wait_ordered) 4742 { 4743 struct btrfs_space_info *space_info; 4744 struct btrfs_trans_handle *trans; 4745 u64 delalloc_bytes; 4746 u64 max_reclaim; 4747 u64 items; 4748 long time_left; 4749 unsigned long nr_pages; 4750 int loops; 4751 4752 /* Calc the number of the pages we need flush for space reservation */ 4753 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4754 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4755 4756 trans = (struct btrfs_trans_handle *)current->journal_info; 4757 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4758 4759 delalloc_bytes = percpu_counter_sum_positive( 4760 &fs_info->delalloc_bytes); 4761 if (delalloc_bytes == 0) { 4762 if (trans) 4763 return; 4764 if (wait_ordered) 4765 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4766 return; 4767 } 4768 4769 loops = 0; 4770 while (delalloc_bytes && loops < 3) { 4771 max_reclaim = min(delalloc_bytes, to_reclaim); 4772 nr_pages = max_reclaim >> PAGE_SHIFT; 4773 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4774 /* 4775 * We need to wait for the async pages to actually start before 4776 * we do anything. 
4777 */ 4778 max_reclaim = atomic_read(&fs_info->async_delalloc_pages); 4779 if (!max_reclaim) 4780 goto skip_async; 4781 4782 if (max_reclaim <= nr_pages) 4783 max_reclaim = 0; 4784 else 4785 max_reclaim -= nr_pages; 4786 4787 wait_event(fs_info->async_submit_wait, 4788 atomic_read(&fs_info->async_delalloc_pages) <= 4789 (int)max_reclaim); 4790 skip_async: 4791 spin_lock(&space_info->lock); 4792 if (list_empty(&space_info->tickets) && 4793 list_empty(&space_info->priority_tickets)) { 4794 spin_unlock(&space_info->lock); 4795 break; 4796 } 4797 spin_unlock(&space_info->lock); 4798 4799 loops++; 4800 if (wait_ordered && !trans) { 4801 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4802 } else { 4803 time_left = schedule_timeout_killable(1); 4804 if (time_left) 4805 break; 4806 } 4807 delalloc_bytes = percpu_counter_sum_positive( 4808 &fs_info->delalloc_bytes); 4809 } 4810 } 4811 4812 struct reserve_ticket { 4813 u64 bytes; 4814 int error; 4815 struct list_head list; 4816 wait_queue_head_t wait; 4817 }; 4818 4819 /** 4820 * may_commit_transaction - possibly commit the transaction if it's ok to 4821 * @fs_info - the fs_info for our fs 4822 * @space_info - the space_info we are trying to flush to make our 4823 * reservation in 4824 * 4825 * This will check to make sure that committing the transaction will actually 4826 * get us somewhere and then commit the transaction if it does. Otherwise it 4827 * will return -ENOSPC. 4828 */ 4829 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4830 struct btrfs_space_info *space_info) 4831 { 4832 struct reserve_ticket *ticket = NULL; 4833 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4834 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 4835 struct btrfs_trans_handle *trans; 4836 u64 bytes_needed; 4837 u64 reclaim_bytes = 0; 4838 4839 trans = (struct btrfs_trans_handle *)current->journal_info; 4840 if (trans) 4841 return -EAGAIN; 4842 4843 spin_lock(&space_info->lock); 4844 if (!list_empty(&space_info->priority_tickets)) 4845 ticket = list_first_entry(&space_info->priority_tickets, 4846 struct reserve_ticket, list); 4847 else if (!list_empty(&space_info->tickets)) 4848 ticket = list_first_entry(&space_info->tickets, 4849 struct reserve_ticket, list); 4850 bytes_needed = (ticket) ? ticket->bytes : 0; 4851 spin_unlock(&space_info->lock); 4852 4853 if (!bytes_needed) 4854 return 0; 4855 4856 /* See if there is enough pinned space to make this reservation */ 4857 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 4858 bytes_needed, 4859 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) 4860 goto commit; 4861 4862 /* 4863 * See if there is some space in the delayed insertion reservation for 4864 * this reservation.
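 * Bytes sitting in the delayed item and delayed refs reserves are counted
 * into reclaim_bytes below and reduce what the commit still needs to find
 * in pinned space.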
4865 */ 4866 if (space_info != delayed_rsv->space_info) 4867 return -ENOSPC; 4868 4869 spin_lock(&delayed_rsv->lock); 4870 reclaim_bytes += delayed_rsv->reserved; 4871 spin_unlock(&delayed_rsv->lock); 4872 4873 spin_lock(&delayed_refs_rsv->lock); 4874 reclaim_bytes += delayed_refs_rsv->reserved; 4875 spin_unlock(&delayed_refs_rsv->lock); 4876 if (reclaim_bytes >= bytes_needed) 4877 goto commit; 4878 bytes_needed -= reclaim_bytes; 4879 4880 if (__percpu_counter_compare(&space_info->total_bytes_pinned, 4881 bytes_needed, 4882 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { 4883 return -ENOSPC; 4884 } 4885 4886 commit: 4887 trans = btrfs_join_transaction(fs_info->extent_root); 4888 if (IS_ERR(trans)) 4889 return -ENOSPC; 4890 4891 return btrfs_commit_transaction(trans); 4892 } 4893 4894 /* 4895 * Try to flush some data based on policy set by @state. This is only advisory 4896 * and may fail for various reasons. The caller is supposed to examine the 4897 * state of @space_info to detect the outcome. 4898 */ 4899 static void flush_space(struct btrfs_fs_info *fs_info, 4900 struct btrfs_space_info *space_info, u64 num_bytes, 4901 int state) 4902 { 4903 struct btrfs_root *root = fs_info->extent_root; 4904 struct btrfs_trans_handle *trans; 4905 int nr; 4906 int ret = 0; 4907 4908 switch (state) { 4909 case FLUSH_DELAYED_ITEMS_NR: 4910 case FLUSH_DELAYED_ITEMS: 4911 if (state == FLUSH_DELAYED_ITEMS_NR) 4912 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 4913 else 4914 nr = -1; 4915 4916 trans = btrfs_join_transaction(root); 4917 if (IS_ERR(trans)) { 4918 ret = PTR_ERR(trans); 4919 break; 4920 } 4921 ret = btrfs_run_delayed_items_nr(trans, nr); 4922 btrfs_end_transaction(trans); 4923 break; 4924 case FLUSH_DELALLOC: 4925 case FLUSH_DELALLOC_WAIT: 4926 shrink_delalloc(fs_info, num_bytes * 2, num_bytes, 4927 state == FLUSH_DELALLOC_WAIT); 4928 break; 4929 case FLUSH_DELAYED_REFS_NR: 4930 case FLUSH_DELAYED_REFS: 4931 trans = btrfs_join_transaction(root); 4932 if (IS_ERR(trans)) { 4933 ret = PTR_ERR(trans); 4934 break; 4935 } 4936 if (state == FLUSH_DELAYED_REFS_NR) 4937 nr = calc_reclaim_items_nr(fs_info, num_bytes); 4938 else 4939 nr = 0; 4940 btrfs_run_delayed_refs(trans, nr); 4941 btrfs_end_transaction(trans); 4942 break; 4943 case ALLOC_CHUNK: 4944 trans = btrfs_join_transaction(root); 4945 if (IS_ERR(trans)) { 4946 ret = PTR_ERR(trans); 4947 break; 4948 } 4949 ret = do_chunk_alloc(trans, 4950 btrfs_metadata_alloc_profile(fs_info), 4951 CHUNK_ALLOC_NO_FORCE); 4952 btrfs_end_transaction(trans); 4953 if (ret > 0 || ret == -ENOSPC) 4954 ret = 0; 4955 break; 4956 case COMMIT_TRANS: 4957 ret = may_commit_transaction(fs_info, space_info); 4958 break; 4959 default: 4960 ret = -ENOSPC; 4961 break; 4962 } 4963 4964 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, 4965 ret); 4966 return; 4967 } 4968 4969 static inline u64 4970 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, 4971 struct btrfs_space_info *space_info, 4972 bool system_chunk) 4973 { 4974 struct reserve_ticket *ticket; 4975 u64 used; 4976 u64 expected; 4977 u64 to_reclaim = 0; 4978 4979 list_for_each_entry(ticket, &space_info->tickets, list) 4980 to_reclaim += ticket->bytes; 4981 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4982 to_reclaim += ticket->bytes; 4983 if (to_reclaim) 4984 return to_reclaim; 4985 4986 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4987 if (can_overcommit(fs_info, space_info, to_reclaim, 4988 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 4989 return 0; 4990 4991 
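	/*
	 * No tickets and we can't overcommit the default amount, so aim to
	 * bring usage back below roughly 90-95% of total_bytes.
	 */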
used = btrfs_space_info_used(space_info, true); 4992 4993 if (can_overcommit(fs_info, space_info, SZ_1M, 4994 BTRFS_RESERVE_FLUSH_ALL, system_chunk)) 4995 expected = div_factor_fine(space_info->total_bytes, 95); 4996 else 4997 expected = div_factor_fine(space_info->total_bytes, 90); 4998 4999 if (used > expected) 5000 to_reclaim = used - expected; 5001 else 5002 to_reclaim = 0; 5003 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 5004 space_info->bytes_reserved); 5005 return to_reclaim; 5006 } 5007 5008 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, 5009 struct btrfs_space_info *space_info, 5010 u64 used, bool system_chunk) 5011 { 5012 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 5013 5014 /* If we're just plain full then async reclaim just slows us down. */ 5015 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 5016 return 0; 5017 5018 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, 5019 system_chunk)) 5020 return 0; 5021 5022 return (used >= thresh && !btrfs_fs_closing(fs_info) && 5023 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 5024 } 5025 5026 static void wake_all_tickets(struct list_head *head) 5027 { 5028 struct reserve_ticket *ticket; 5029 5030 while (!list_empty(head)) { 5031 ticket = list_first_entry(head, struct reserve_ticket, list); 5032 list_del_init(&ticket->list); 5033 ticket->error = -ENOSPC; 5034 wake_up(&ticket->wait); 5035 } 5036 } 5037 5038 /* 5039 * This is for normal flushers, we can wait all goddamned day if we want to. We 5040 * will loop and continuously try to flush as long as we are making progress. 5041 * We count progress as clearing off tickets each time we have to loop. 5042 */ 5043 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 5044 { 5045 struct btrfs_fs_info *fs_info; 5046 struct btrfs_space_info *space_info; 5047 u64 to_reclaim; 5048 int flush_state; 5049 int commit_cycles = 0; 5050 u64 last_tickets_id; 5051 5052 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 5053 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5054 5055 spin_lock(&space_info->lock); 5056 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 5057 false); 5058 if (!to_reclaim) { 5059 space_info->flush = 0; 5060 spin_unlock(&space_info->lock); 5061 return; 5062 } 5063 last_tickets_id = space_info->tickets_id; 5064 spin_unlock(&space_info->lock); 5065 5066 flush_state = FLUSH_DELAYED_ITEMS_NR; 5067 do { 5068 flush_space(fs_info, space_info, to_reclaim, flush_state); 5069 spin_lock(&space_info->lock); 5070 if (list_empty(&space_info->tickets)) { 5071 space_info->flush = 0; 5072 spin_unlock(&space_info->lock); 5073 return; 5074 } 5075 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, 5076 space_info, 5077 false); 5078 if (last_tickets_id == space_info->tickets_id) { 5079 flush_state++; 5080 } else { 5081 last_tickets_id = space_info->tickets_id; 5082 flush_state = FLUSH_DELAYED_ITEMS_NR; 5083 if (commit_cycles) 5084 commit_cycles--; 5085 } 5086 5087 if (flush_state > COMMIT_TRANS) { 5088 commit_cycles++; 5089 if (commit_cycles > 2) { 5090 wake_all_tickets(&space_info->tickets); 5091 space_info->flush = 0; 5092 } else { 5093 flush_state = FLUSH_DELAYED_ITEMS_NR; 5094 } 5095 } 5096 spin_unlock(&space_info->lock); 5097 } while (flush_state <= COMMIT_TRANS); 5098 } 5099 5100 void btrfs_init_async_reclaim_work(struct work_struct *work) 5101 { 5102 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5103 } 5104 5105 
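/*
 * Flush space for a priority ticket. Unlike the async reclaim worker above,
 * this runs in the context of the reserving task, walks the flush states in
 * order (skipping the delalloc states, which could deadlock for a priority
 * flusher) and returns as soon as the ticket has been satisfied
 * (ticket->bytes == 0).
 */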
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5106 struct btrfs_space_info *space_info, 5107 struct reserve_ticket *ticket) 5108 { 5109 u64 to_reclaim; 5110 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5111 5112 spin_lock(&space_info->lock); 5113 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, 5114 false); 5115 if (!to_reclaim) { 5116 spin_unlock(&space_info->lock); 5117 return; 5118 } 5119 spin_unlock(&space_info->lock); 5120 5121 do { 5122 flush_space(fs_info, space_info, to_reclaim, flush_state); 5123 flush_state++; 5124 spin_lock(&space_info->lock); 5125 if (ticket->bytes == 0) { 5126 spin_unlock(&space_info->lock); 5127 return; 5128 } 5129 spin_unlock(&space_info->lock); 5130 5131 /* 5132 * Priority flushers can't wait on delalloc without 5133 * deadlocking. 5134 */ 5135 if (flush_state == FLUSH_DELALLOC || 5136 flush_state == FLUSH_DELALLOC_WAIT) 5137 flush_state = ALLOC_CHUNK; 5138 } while (flush_state < COMMIT_TRANS); 5139 } 5140 5141 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5142 struct btrfs_space_info *space_info, 5143 struct reserve_ticket *ticket, u64 orig_bytes) 5144 5145 { 5146 DEFINE_WAIT(wait); 5147 int ret = 0; 5148 5149 spin_lock(&space_info->lock); 5150 while (ticket->bytes > 0 && ticket->error == 0) { 5151 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5152 if (ret) { 5153 ret = -EINTR; 5154 break; 5155 } 5156 spin_unlock(&space_info->lock); 5157 5158 schedule(); 5159 5160 finish_wait(&ticket->wait, &wait); 5161 spin_lock(&space_info->lock); 5162 } 5163 if (!ret) 5164 ret = ticket->error; 5165 if (!list_empty(&ticket->list)) 5166 list_del_init(&ticket->list); 5167 if (ticket->bytes && ticket->bytes < orig_bytes) { 5168 u64 num_bytes = orig_bytes - ticket->bytes; 5169 update_bytes_may_use(space_info, -num_bytes); 5170 trace_btrfs_space_reservation(fs_info, "space_info", 5171 space_info->flags, num_bytes, 0); 5172 } 5173 spin_unlock(&space_info->lock); 5174 5175 return ret; 5176 } 5177 5178 /** 5179 * __reserve_metadata_bytes - try to reserve bytes from a space_info 5180 * @fs_info - the filesystem we're allocating for 5181 * @space_info - the space info we want to allocate from 5182 * @orig_bytes - the number of bytes we want 5183 * @flush - whether or not we can flush to make our reservation 5184 * 5185 * This will reserve orig_bytes number of bytes from the given space info. 5186 * If there is not enough space it will make an attempt to flush out space to 5187 * make room. It will do this by flushing delalloc if possible or committing 5188 * the transaction. If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to 5189 * regain reservations will be made and this will fail if there is not enough 5190 * space already. 5191 */ 5192 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, 5193 struct btrfs_space_info *space_info, 5194 u64 orig_bytes, 5195 enum btrfs_reserve_flush_enum flush, 5196 bool system_chunk) 5197 { 5198 struct reserve_ticket ticket; 5199 u64 used; 5200 int ret = 0; 5201 5202 ASSERT(orig_bytes); 5203 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5204 5205 spin_lock(&space_info->lock); 5206 ret = -ENOSPC; 5207 used = btrfs_space_info_used(space_info, true); 5208 5209 /* 5210 * If we have enough space then hooray, make our reservation and carry 5211 * on. If not see if we can overcommit, and if we can, hooray carry on. 5212 * If not things get more complicated.
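 * We then set up a reserve_ticket: either queue it and kick the async
 * flushing worker, or add it to the priority list and do our own flushing
 * further down.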
5213 */ 5214 if (used + orig_bytes <= space_info->total_bytes) { 5215 update_bytes_may_use(space_info, orig_bytes); 5216 trace_btrfs_space_reservation(fs_info, "space_info", 5217 space_info->flags, orig_bytes, 1); 5218 ret = 0; 5219 } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, 5220 system_chunk)) { 5221 update_bytes_may_use(space_info, orig_bytes); 5222 trace_btrfs_space_reservation(fs_info, "space_info", 5223 space_info->flags, orig_bytes, 1); 5224 ret = 0; 5225 } 5226 5227 /* 5228 * If we couldn't make a reservation then setup our reservation ticket 5229 * and kick the async worker if it's not already running. 5230 * 5231 * If we are a priority flusher then we just need to add our ticket to 5232 * the list and we will do our own flushing further down. 5233 */ 5234 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5235 ticket.bytes = orig_bytes; 5236 ticket.error = 0; 5237 init_waitqueue_head(&ticket.wait); 5238 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5239 list_add_tail(&ticket.list, &space_info->tickets); 5240 if (!space_info->flush) { 5241 space_info->flush = 1; 5242 trace_btrfs_trigger_flush(fs_info, 5243 space_info->flags, 5244 orig_bytes, flush, 5245 "enospc"); 5246 queue_work(system_unbound_wq, 5247 &fs_info->async_reclaim_work); 5248 } 5249 } else { 5250 list_add_tail(&ticket.list, 5251 &space_info->priority_tickets); 5252 } 5253 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5254 used += orig_bytes; 5255 /* 5256 * We will do the space reservation dance during log replay, 5257 * which means we won't have fs_info->fs_root set, so don't do 5258 * the async reclaim as we will panic. 5259 */ 5260 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5261 need_do_async_reclaim(fs_info, space_info, 5262 used, system_chunk) && 5263 !work_busy(&fs_info->async_reclaim_work)) { 5264 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5265 orig_bytes, flush, "preempt"); 5266 queue_work(system_unbound_wq, 5267 &fs_info->async_reclaim_work); 5268 } 5269 } 5270 spin_unlock(&space_info->lock); 5271 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5272 return ret; 5273 5274 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5275 return wait_reserve_ticket(fs_info, space_info, &ticket, 5276 orig_bytes); 5277 5278 ret = 0; 5279 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5280 spin_lock(&space_info->lock); 5281 if (ticket.bytes) { 5282 if (ticket.bytes < orig_bytes) { 5283 u64 num_bytes = orig_bytes - ticket.bytes; 5284 update_bytes_may_use(space_info, -num_bytes); 5285 trace_btrfs_space_reservation(fs_info, "space_info", 5286 space_info->flags, 5287 num_bytes, 0); 5288 5289 } 5290 list_del_init(&ticket.list); 5291 ret = -ENOSPC; 5292 } 5293 spin_unlock(&space_info->lock); 5294 ASSERT(list_empty(&ticket.list)); 5295 return ret; 5296 } 5297 5298 /** 5299 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5300 * @root - the root we're allocating for 5301 * @block_rsv - the block_rsv we're allocating for 5302 * @orig_bytes - the number of bytes we want 5303 * @flush - whether or not we can flush to make our reservation 5304 * 5305 * This will reserve orig_bytes number of bytes from the space info associated 5306 * with the block_rsv. If there is not enough space it will make an attempt to 5307 * flush out space to make room. It will do this by flushing delalloc if 5308 * possible or committing the transaction. 
If flush is 0 then no attempts to 5309 * regain reservations will be made and this will fail if there is not enough 5310 * space already. 5311 */ 5312 static int reserve_metadata_bytes(struct btrfs_root *root, 5313 struct btrfs_block_rsv *block_rsv, 5314 u64 orig_bytes, 5315 enum btrfs_reserve_flush_enum flush) 5316 { 5317 struct btrfs_fs_info *fs_info = root->fs_info; 5318 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5319 int ret; 5320 bool system_chunk = (root == fs_info->chunk_root); 5321 5322 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, 5323 orig_bytes, flush, system_chunk); 5324 if (ret == -ENOSPC && 5325 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5326 if (block_rsv != global_rsv && 5327 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5328 ret = 0; 5329 } 5330 if (ret == -ENOSPC) { 5331 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 5332 block_rsv->space_info->flags, 5333 orig_bytes, 1); 5334 5335 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) 5336 dump_space_info(fs_info, block_rsv->space_info, 5337 orig_bytes, 0); 5338 } 5339 return ret; 5340 } 5341 5342 static struct btrfs_block_rsv *get_block_rsv( 5343 const struct btrfs_trans_handle *trans, 5344 const struct btrfs_root *root) 5345 { 5346 struct btrfs_fs_info *fs_info = root->fs_info; 5347 struct btrfs_block_rsv *block_rsv = NULL; 5348 5349 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5350 (root == fs_info->csum_root && trans->adding_csums) || 5351 (root == fs_info->uuid_root)) 5352 block_rsv = trans->block_rsv; 5353 5354 if (!block_rsv) 5355 block_rsv = root->block_rsv; 5356 5357 if (!block_rsv) 5358 block_rsv = &fs_info->empty_block_rsv; 5359 5360 return block_rsv; 5361 } 5362 5363 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5364 u64 num_bytes) 5365 { 5366 int ret = -ENOSPC; 5367 spin_lock(&block_rsv->lock); 5368 if (block_rsv->reserved >= num_bytes) { 5369 block_rsv->reserved -= num_bytes; 5370 if (block_rsv->reserved < block_rsv->size) 5371 block_rsv->full = 0; 5372 ret = 0; 5373 } 5374 spin_unlock(&block_rsv->lock); 5375 return ret; 5376 } 5377 5378 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5379 u64 num_bytes, bool update_size) 5380 { 5381 spin_lock(&block_rsv->lock); 5382 block_rsv->reserved += num_bytes; 5383 if (update_size) 5384 block_rsv->size += num_bytes; 5385 else if (block_rsv->reserved >= block_rsv->size) 5386 block_rsv->full = 1; 5387 spin_unlock(&block_rsv->lock); 5388 } 5389 5390 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5391 struct btrfs_block_rsv *dest, u64 num_bytes, 5392 int min_factor) 5393 { 5394 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5395 u64 min_bytes; 5396 5397 if (global_rsv->space_info != dest->space_info) 5398 return -ENOSPC; 5399 5400 spin_lock(&global_rsv->lock); 5401 min_bytes = div_factor(global_rsv->size, min_factor); 5402 if (global_rsv->reserved < min_bytes + num_bytes) { 5403 spin_unlock(&global_rsv->lock); 5404 return -ENOSPC; 5405 } 5406 global_rsv->reserved -= num_bytes; 5407 if (global_rsv->reserved < global_rsv->size) 5408 global_rsv->full = 0; 5409 spin_unlock(&global_rsv->lock); 5410 5411 block_rsv_add_bytes(dest, num_bytes, true); 5412 return 0; 5413 } 5414 5415 /** 5416 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. 5417 * @fs_info - the fs info for our fs. 5418 * @src - the source block rsv to transfer from. 5419 * @num_bytes - the number of bytes to transfer. 
5420 * 5421 * This transfers up to the num_bytes amount from the src rsv to the 5422 * delayed_refs_rsv. Any extra bytes are returned to the space info. 5423 */ 5424 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, 5425 struct btrfs_block_rsv *src, 5426 u64 num_bytes) 5427 { 5428 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 5429 u64 to_free = 0; 5430 5431 spin_lock(&src->lock); 5432 src->reserved -= num_bytes; 5433 src->size -= num_bytes; 5434 spin_unlock(&src->lock); 5435 5436 spin_lock(&delayed_refs_rsv->lock); 5437 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { 5438 u64 delta = delayed_refs_rsv->size - 5439 delayed_refs_rsv->reserved; 5440 if (num_bytes > delta) { 5441 to_free = num_bytes - delta; 5442 num_bytes = delta; 5443 } 5444 } else { 5445 to_free = num_bytes; 5446 num_bytes = 0; 5447 } 5448 5449 if (num_bytes) 5450 delayed_refs_rsv->reserved += num_bytes; 5451 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) 5452 delayed_refs_rsv->full = 1; 5453 spin_unlock(&delayed_refs_rsv->lock); 5454 5455 if (num_bytes) 5456 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5457 0, num_bytes, 1); 5458 if (to_free) 5459 space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, 5460 to_free); 5461 } 5462 5463 /** 5464 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. 5465 * @fs_info - the fs_info for our fs. 5466 * @flush - control how we can flush for this reservation. 5467 * 5468 * This will refill the delayed block_rsv up to 1 items size worth of space and 5469 * will return -ENOSPC if we can't make the reservation. 5470 */ 5471 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 5472 enum btrfs_reserve_flush_enum flush) 5473 { 5474 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 5475 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); 5476 u64 num_bytes = 0; 5477 int ret = -ENOSPC; 5478 5479 spin_lock(&block_rsv->lock); 5480 if (block_rsv->reserved < block_rsv->size) { 5481 num_bytes = block_rsv->size - block_rsv->reserved; 5482 num_bytes = min(num_bytes, limit); 5483 } 5484 spin_unlock(&block_rsv->lock); 5485 5486 if (!num_bytes) 5487 return 0; 5488 5489 ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, 5490 num_bytes, flush); 5491 if (ret) 5492 return ret; 5493 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5494 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5495 0, num_bytes, 1); 5496 return 0; 5497 } 5498 5499 /* 5500 * This is for space we already have accounted in space_info->bytes_may_use, so 5501 * basically when we're returning space from block_rsv's. 5502 */ 5503 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5504 struct btrfs_space_info *space_info, 5505 u64 num_bytes) 5506 { 5507 struct reserve_ticket *ticket; 5508 struct list_head *head; 5509 u64 used; 5510 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5511 bool check_overcommit = false; 5512 5513 spin_lock(&space_info->lock); 5514 head = &space_info->priority_tickets; 5515 5516 /* 5517 * If we are over our limit then we need to check and see if we can 5518 * overcommit, and if we can't then we just need to free up our space 5519 * and not satisfy any requests. 
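 * Priority tickets are served first; only when those are drained do we move
 * on to the normal ticket list (see the 'again' loop below).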
5520 */ 5521 used = btrfs_space_info_used(space_info, true); 5522 if (used - num_bytes >= space_info->total_bytes) 5523 check_overcommit = true; 5524 again: 5525 while (!list_empty(head) && num_bytes) { 5526 ticket = list_first_entry(head, struct reserve_ticket, 5527 list); 5528 /* 5529 * We use 0 bytes because this space is already reserved, so 5530 * adding the ticket space would be a double count. 5531 */ 5532 if (check_overcommit && 5533 !can_overcommit(fs_info, space_info, 0, flush, false)) 5534 break; 5535 if (num_bytes >= ticket->bytes) { 5536 list_del_init(&ticket->list); 5537 num_bytes -= ticket->bytes; 5538 ticket->bytes = 0; 5539 space_info->tickets_id++; 5540 wake_up(&ticket->wait); 5541 } else { 5542 ticket->bytes -= num_bytes; 5543 num_bytes = 0; 5544 } 5545 } 5546 5547 if (num_bytes && head == &space_info->priority_tickets) { 5548 head = &space_info->tickets; 5549 flush = BTRFS_RESERVE_FLUSH_ALL; 5550 goto again; 5551 } 5552 update_bytes_may_use(space_info, -num_bytes); 5553 trace_btrfs_space_reservation(fs_info, "space_info", 5554 space_info->flags, num_bytes, 0); 5555 spin_unlock(&space_info->lock); 5556 } 5557 5558 /* 5559 * This is for newly allocated space that isn't accounted in 5560 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5561 * we use this helper. 5562 */ 5563 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5564 struct btrfs_space_info *space_info, 5565 u64 num_bytes) 5566 { 5567 struct reserve_ticket *ticket; 5568 struct list_head *head = &space_info->priority_tickets; 5569 5570 again: 5571 while (!list_empty(head) && num_bytes) { 5572 ticket = list_first_entry(head, struct reserve_ticket, 5573 list); 5574 if (num_bytes >= ticket->bytes) { 5575 trace_btrfs_space_reservation(fs_info, "space_info", 5576 space_info->flags, 5577 ticket->bytes, 1); 5578 list_del_init(&ticket->list); 5579 num_bytes -= ticket->bytes; 5580 update_bytes_may_use(space_info, ticket->bytes); 5581 ticket->bytes = 0; 5582 space_info->tickets_id++; 5583 wake_up(&ticket->wait); 5584 } else { 5585 trace_btrfs_space_reservation(fs_info, "space_info", 5586 space_info->flags, 5587 num_bytes, 1); 5588 update_bytes_may_use(space_info, num_bytes); 5589 ticket->bytes -= num_bytes; 5590 num_bytes = 0; 5591 } 5592 } 5593 5594 if (num_bytes && head == &space_info->priority_tickets) { 5595 head = &space_info->tickets; 5596 goto again; 5597 } 5598 } 5599 5600 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5601 struct btrfs_block_rsv *block_rsv, 5602 struct btrfs_block_rsv *dest, u64 num_bytes, 5603 u64 *qgroup_to_release_ret) 5604 { 5605 struct btrfs_space_info *space_info = block_rsv->space_info; 5606 u64 qgroup_to_release = 0; 5607 u64 ret; 5608 5609 spin_lock(&block_rsv->lock); 5610 if (num_bytes == (u64)-1) { 5611 num_bytes = block_rsv->size; 5612 qgroup_to_release = block_rsv->qgroup_rsv_size; 5613 } 5614 block_rsv->size -= num_bytes; 5615 if (block_rsv->reserved >= block_rsv->size) { 5616 num_bytes = block_rsv->reserved - block_rsv->size; 5617 block_rsv->reserved = block_rsv->size; 5618 block_rsv->full = 1; 5619 } else { 5620 num_bytes = 0; 5621 } 5622 if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { 5623 qgroup_to_release = block_rsv->qgroup_rsv_reserved - 5624 block_rsv->qgroup_rsv_size; 5625 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; 5626 } else { 5627 qgroup_to_release = 0; 5628 } 5629 spin_unlock(&block_rsv->lock); 5630 5631 ret = num_bytes; 5632 if (num_bytes > 0) { 5633 if (dest) { 5634 
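			/*
			 * First try to top up the destination rsv with the
			 * freed bytes; whatever doesn't fit is handed back to
			 * the space_info below.
			 */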
spin_lock(&dest->lock); 5635 if (!dest->full) { 5636 u64 bytes_to_add; 5637 5638 bytes_to_add = dest->size - dest->reserved; 5639 bytes_to_add = min(num_bytes, bytes_to_add); 5640 dest->reserved += bytes_to_add; 5641 if (dest->reserved >= dest->size) 5642 dest->full = 1; 5643 num_bytes -= bytes_to_add; 5644 } 5645 spin_unlock(&dest->lock); 5646 } 5647 if (num_bytes) 5648 space_info_add_old_bytes(fs_info, space_info, 5649 num_bytes); 5650 } 5651 if (qgroup_to_release_ret) 5652 *qgroup_to_release_ret = qgroup_to_release; 5653 return ret; 5654 } 5655 5656 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5657 struct btrfs_block_rsv *dst, u64 num_bytes, 5658 bool update_size) 5659 { 5660 int ret; 5661 5662 ret = block_rsv_use_bytes(src, num_bytes); 5663 if (ret) 5664 return ret; 5665 5666 block_rsv_add_bytes(dst, num_bytes, update_size); 5667 return 0; 5668 } 5669 5670 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5671 { 5672 memset(rsv, 0, sizeof(*rsv)); 5673 spin_lock_init(&rsv->lock); 5674 rsv->type = type; 5675 } 5676 5677 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, 5678 struct btrfs_block_rsv *rsv, 5679 unsigned short type) 5680 { 5681 btrfs_init_block_rsv(rsv, type); 5682 rsv->space_info = __find_space_info(fs_info, 5683 BTRFS_BLOCK_GROUP_METADATA); 5684 } 5685 5686 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5687 unsigned short type) 5688 { 5689 struct btrfs_block_rsv *block_rsv; 5690 5691 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5692 if (!block_rsv) 5693 return NULL; 5694 5695 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); 5696 return block_rsv; 5697 } 5698 5699 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 5700 struct btrfs_block_rsv *rsv) 5701 { 5702 if (!rsv) 5703 return; 5704 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5705 kfree(rsv); 5706 } 5707 5708 int btrfs_block_rsv_add(struct btrfs_root *root, 5709 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 5710 enum btrfs_reserve_flush_enum flush) 5711 { 5712 int ret; 5713 5714 if (num_bytes == 0) 5715 return 0; 5716 5717 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5718 if (!ret) 5719 block_rsv_add_bytes(block_rsv, num_bytes, true); 5720 5721 return ret; 5722 } 5723 5724 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 5725 { 5726 u64 num_bytes = 0; 5727 int ret = -ENOSPC; 5728 5729 if (!block_rsv) 5730 return 0; 5731 5732 spin_lock(&block_rsv->lock); 5733 num_bytes = div_factor(block_rsv->size, min_factor); 5734 if (block_rsv->reserved >= num_bytes) 5735 ret = 0; 5736 spin_unlock(&block_rsv->lock); 5737 5738 return ret; 5739 } 5740 5741 int btrfs_block_rsv_refill(struct btrfs_root *root, 5742 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5743 enum btrfs_reserve_flush_enum flush) 5744 { 5745 u64 num_bytes = 0; 5746 int ret = -ENOSPC; 5747 5748 if (!block_rsv) 5749 return 0; 5750 5751 spin_lock(&block_rsv->lock); 5752 num_bytes = min_reserved; 5753 if (block_rsv->reserved >= num_bytes) 5754 ret = 0; 5755 else 5756 num_bytes -= block_rsv->reserved; 5757 spin_unlock(&block_rsv->lock); 5758 5759 if (!ret) 5760 return 0; 5761 5762 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5763 if (!ret) { 5764 block_rsv_add_bytes(block_rsv, num_bytes, false); 5765 return 0; 5766 } 5767 5768 return ret; 5769 } 5770 5771 /** 5772 * btrfs_inode_rsv_refill - refill the inode block rsv. 5773 * @inode - the inode we are refilling. 
5774 * @flush - the flushing restriction. 5775 * 5776 * Essentially the same as btrfs_block_rsv_refill, except it uses the 5777 * block_rsv->size as the minimum size. We'll either refill the missing amount 5778 * or return if we already have enough space. This will also handle the reserve 5779 * tracepoint for the reserved amount. 5780 */ 5781 static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, 5782 enum btrfs_reserve_flush_enum flush) 5783 { 5784 struct btrfs_root *root = inode->root; 5785 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5786 u64 num_bytes = 0; 5787 u64 qgroup_num_bytes = 0; 5788 int ret = -ENOSPC; 5789 5790 spin_lock(&block_rsv->lock); 5791 if (block_rsv->reserved < block_rsv->size) 5792 num_bytes = block_rsv->size - block_rsv->reserved; 5793 if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) 5794 qgroup_num_bytes = block_rsv->qgroup_rsv_size - 5795 block_rsv->qgroup_rsv_reserved; 5796 spin_unlock(&block_rsv->lock); 5797 5798 if (num_bytes == 0) 5799 return 0; 5800 5801 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true); 5802 if (ret) 5803 return ret; 5804 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5805 if (!ret) { 5806 block_rsv_add_bytes(block_rsv, num_bytes, false); 5807 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5808 btrfs_ino(inode), num_bytes, 1); 5809 5810 /* Don't forget to increase qgroup_rsv_reserved */ 5811 spin_lock(&block_rsv->lock); 5812 block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; 5813 spin_unlock(&block_rsv->lock); 5814 } else 5815 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); 5816 return ret; 5817 } 5818 5819 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5820 struct btrfs_block_rsv *block_rsv, 5821 u64 num_bytes, u64 *qgroup_to_release) 5822 { 5823 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5824 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 5825 struct btrfs_block_rsv *target = delayed_rsv; 5826 5827 if (target->full || target == block_rsv) 5828 target = global_rsv; 5829 5830 if (block_rsv->space_info != target->space_info) 5831 target = NULL; 5832 5833 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, 5834 qgroup_to_release); 5835 } 5836 5837 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5838 struct btrfs_block_rsv *block_rsv, 5839 u64 num_bytes) 5840 { 5841 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); 5842 } 5843 5844 /** 5845 * btrfs_inode_rsv_release - release any excessive reservation. 5846 * @inode - the inode we need to release from. 5847 * @qgroup_free - free or convert qgroup meta. 5848 * Unlike normal operation, qgroup meta reservation needs to know if we are 5849 * freeing qgroup reservation or just converting it into per-trans. Normally 5850 * @qgroup_free is true for error handling, and false for normal release. 5851 * 5852 * This is the same as btrfs_block_rsv_release, except that it handles the 5853 * tracepoint for the reservation. 5854 */ 5855 static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) 5856 { 5857 struct btrfs_fs_info *fs_info = inode->root->fs_info; 5858 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 5859 u64 released = 0; 5860 u64 qgroup_to_release = 0; 5861 5862 /* 5863 * Since we statically set the block_rsv->size we just want to say we 5864 * are releasing 0 bytes, and then we'll just get the reservation over 5865 * the size free'd. 
5866 */ 5867 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, 5868 &qgroup_to_release); 5869 if (released > 0) 5870 trace_btrfs_space_reservation(fs_info, "delalloc", 5871 btrfs_ino(inode), released, 0); 5872 if (qgroup_free) 5873 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); 5874 else 5875 btrfs_qgroup_convert_reserved_meta(inode->root, 5876 qgroup_to_release); 5877 } 5878 5879 /** 5880 * btrfs_delayed_refs_rsv_release - release a ref head's reservation. 5881 * @fs_info - the fs_info for our fs. 5882 * @nr - the number of items to drop. 5883 * 5884 * This drops the delayed ref head's count from the delayed refs rsv and frees 5885 * any excess reservation we had. 5886 */ 5887 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) 5888 { 5889 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 5890 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5891 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); 5892 u64 released = 0; 5893 5894 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 5895 num_bytes, NULL); 5896 if (released) 5897 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 5898 0, released, 0); 5899 } 5900 5901 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5902 { 5903 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5904 struct btrfs_space_info *sinfo = block_rsv->space_info; 5905 u64 num_bytes; 5906 5907 /* 5908 * The global block rsv is based on the size of the extent tree, the 5909 * checksum tree and the root tree. If the fs is empty we want to set 5910 * it to a minimal amount for safety. 5911 */ 5912 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5913 btrfs_root_used(&fs_info->csum_root->root_item) + 5914 btrfs_root_used(&fs_info->tree_root->root_item); 5915 num_bytes = max_t(u64, num_bytes, SZ_16M); 5916 5917 spin_lock(&sinfo->lock); 5918 spin_lock(&block_rsv->lock); 5919 5920 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5921 5922 if (block_rsv->reserved < block_rsv->size) { 5923 num_bytes = btrfs_space_info_used(sinfo, true); 5924 if (sinfo->total_bytes > num_bytes) { 5925 num_bytes = sinfo->total_bytes - num_bytes; 5926 num_bytes = min(num_bytes, 5927 block_rsv->size - block_rsv->reserved); 5928 block_rsv->reserved += num_bytes; 5929 update_bytes_may_use(sinfo, num_bytes); 5930 trace_btrfs_space_reservation(fs_info, "space_info", 5931 sinfo->flags, num_bytes, 5932 1); 5933 } 5934 } else if (block_rsv->reserved > block_rsv->size) { 5935 num_bytes = block_rsv->reserved - block_rsv->size; 5936 update_bytes_may_use(sinfo, -num_bytes); 5937 trace_btrfs_space_reservation(fs_info, "space_info", 5938 sinfo->flags, num_bytes, 0); 5939 block_rsv->reserved = block_rsv->size; 5940 } 5941 5942 if (block_rsv->reserved == block_rsv->size) 5943 block_rsv->full = 1; 5944 else 5945 block_rsv->full = 0; 5946 5947 spin_unlock(&block_rsv->lock); 5948 spin_unlock(&sinfo->lock); 5949 } 5950 5951 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5952 { 5953 struct btrfs_space_info *space_info; 5954 5955 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5956 fs_info->chunk_block_rsv.space_info = space_info; 5957 5958 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5959 fs_info->global_block_rsv.space_info = space_info; 5960 fs_info->trans_block_rsv.space_info = space_info; 5961 fs_info->empty_block_rsv.space_info = space_info; 5962 fs_info->delayed_block_rsv.space_info = 
space_info; 5963 fs_info->delayed_refs_rsv.space_info = space_info; 5964 5965 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; 5966 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; 5967 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5968 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5969 if (fs_info->quota_root) 5970 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5971 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5972 5973 update_global_block_rsv(fs_info); 5974 } 5975 5976 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5977 { 5978 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5979 (u64)-1, NULL); 5980 WARN_ON(fs_info->trans_block_rsv.size > 0); 5981 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5982 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5983 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5984 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5985 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5986 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); 5987 WARN_ON(fs_info->delayed_refs_rsv.size > 0); 5988 } 5989 5990 /* 5991 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv 5992 * @trans - the trans that may have generated delayed refs 5993 * 5994 * This is to be called anytime we may have adjusted trans->delayed_ref_updates, 5995 * it'll calculate the additional size and add it to the delayed_refs_rsv. 5996 */ 5997 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) 5998 { 5999 struct btrfs_fs_info *fs_info = trans->fs_info; 6000 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 6001 u64 num_bytes; 6002 6003 if (!trans->delayed_ref_updates) 6004 return; 6005 6006 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 6007 trans->delayed_ref_updates); 6008 spin_lock(&delayed_rsv->lock); 6009 delayed_rsv->size += num_bytes; 6010 delayed_rsv->full = 0; 6011 spin_unlock(&delayed_rsv->lock); 6012 trans->delayed_ref_updates = 0; 6013 } 6014 6015 /* 6016 * To be called after all the new block groups attached to the transaction 6017 * handle have been created (btrfs_create_pending_block_groups()). 6018 */ 6019 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 6020 { 6021 struct btrfs_fs_info *fs_info = trans->fs_info; 6022 6023 if (!trans->chunk_bytes_reserved) 6024 return; 6025 6026 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 6027 6028 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 6029 trans->chunk_bytes_reserved, NULL); 6030 trans->chunk_bytes_reserved = 0; 6031 } 6032 6033 /* 6034 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 6035 * root: the root of the parent directory 6036 * rsv: block reservation 6037 * items: the number of items that we need do reservation 6038 * use_global_rsv: allow fallback to the global block reservation 6039 * 6040 * This function is used to reserve the space for snapshot/subvolume 6041 * creation and deletion. Those operations are different with the 6042 * common file/directory operations, they change two fs/file trees 6043 * and root tree, the number of items that the qgroup reserves is 6044 * different with the free space reservation. So we can not use 6045 * the space reservation mechanism in start_transaction(). 
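 *
 * Illustrative sketch of a caller (not taken from this file; the item count
 * and the BTRFS_BLOCK_RSV_TEMP type are only for demonstration):
 *
 *	struct btrfs_block_rsv rsv;
 *	int ret;
 *
 *	btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);
 *	ret = btrfs_subvolume_reserve_metadata(root, &rsv, 8, true);
 *	if (ret)
 *		return ret;
 *	...create the snapshot/subvolume items, using rsv as the block_rsv...
 *	btrfs_subvolume_release_metadata(root->fs_info, &rsv);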
6046 */ 6047 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 6048 struct btrfs_block_rsv *rsv, int items, 6049 bool use_global_rsv) 6050 { 6051 u64 qgroup_num_bytes = 0; 6052 u64 num_bytes; 6053 int ret; 6054 struct btrfs_fs_info *fs_info = root->fs_info; 6055 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6056 6057 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 6058 /* One for parent inode, two for dir entries */ 6059 qgroup_num_bytes = 3 * fs_info->nodesize; 6060 ret = btrfs_qgroup_reserve_meta_prealloc(root, 6061 qgroup_num_bytes, true); 6062 if (ret) 6063 return ret; 6064 } 6065 6066 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 6067 rsv->space_info = __find_space_info(fs_info, 6068 BTRFS_BLOCK_GROUP_METADATA); 6069 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 6070 BTRFS_RESERVE_FLUSH_ALL); 6071 6072 if (ret == -ENOSPC && use_global_rsv) 6073 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); 6074 6075 if (ret && qgroup_num_bytes) 6076 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); 6077 6078 return ret; 6079 } 6080 6081 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 6082 struct btrfs_block_rsv *rsv) 6083 { 6084 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 6085 } 6086 6087 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, 6088 struct btrfs_inode *inode) 6089 { 6090 struct btrfs_block_rsv *block_rsv = &inode->block_rsv; 6091 u64 reserve_size = 0; 6092 u64 qgroup_rsv_size = 0; 6093 u64 csum_leaves; 6094 unsigned outstanding_extents; 6095 6096 lockdep_assert_held(&inode->lock); 6097 outstanding_extents = inode->outstanding_extents; 6098 if (outstanding_extents) 6099 reserve_size = btrfs_calc_trans_metadata_size(fs_info, 6100 outstanding_extents + 1); 6101 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, 6102 inode->csum_bytes); 6103 reserve_size += btrfs_calc_trans_metadata_size(fs_info, 6104 csum_leaves); 6105 /* 6106 * For qgroup rsv, the calculation is very simple: 6107 * account one nodesize for each outstanding extent 6108 * 6109 * This is overestimating in most cases. 6110 */ 6111 qgroup_rsv_size = outstanding_extents * fs_info->nodesize; 6112 6113 spin_lock(&block_rsv->lock); 6114 block_rsv->size = reserve_size; 6115 block_rsv->qgroup_rsv_size = qgroup_rsv_size; 6116 spin_unlock(&block_rsv->lock); 6117 } 6118 6119 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 6120 { 6121 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6122 unsigned nr_extents; 6123 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 6124 int ret = 0; 6125 bool delalloc_lock = true; 6126 6127 /* If we are a free space inode we need to not flush since we will be in 6128 * the middle of a transaction commit. We also don't need the delalloc 6129 * mutex since we won't race with anybody. We need this mostly to make 6130 * lockdep shut its filthy mouth. 6131 * 6132 * If we have a transaction open (can happen if we call truncate_block 6133 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
6134 */ 6135 if (btrfs_is_free_space_inode(inode)) { 6136 flush = BTRFS_RESERVE_NO_FLUSH; 6137 delalloc_lock = false; 6138 } else { 6139 if (current->journal_info) 6140 flush = BTRFS_RESERVE_FLUSH_LIMIT; 6141 6142 if (btrfs_transaction_in_commit(fs_info)) 6143 schedule_timeout(1); 6144 } 6145 6146 if (delalloc_lock) 6147 mutex_lock(&inode->delalloc_mutex); 6148 6149 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6150 6151 /* Add our new extents and calculate the new rsv size. */ 6152 spin_lock(&inode->lock); 6153 nr_extents = count_max_extents(num_bytes); 6154 btrfs_mod_outstanding_extents(inode, nr_extents); 6155 inode->csum_bytes += num_bytes; 6156 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6157 spin_unlock(&inode->lock); 6158 6159 ret = btrfs_inode_rsv_refill(inode, flush); 6160 if (unlikely(ret)) 6161 goto out_fail; 6162 6163 if (delalloc_lock) 6164 mutex_unlock(&inode->delalloc_mutex); 6165 return 0; 6166 6167 out_fail: 6168 spin_lock(&inode->lock); 6169 nr_extents = count_max_extents(num_bytes); 6170 btrfs_mod_outstanding_extents(inode, -nr_extents); 6171 inode->csum_bytes -= num_bytes; 6172 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6173 spin_unlock(&inode->lock); 6174 6175 btrfs_inode_rsv_release(inode, true); 6176 if (delalloc_lock) 6177 mutex_unlock(&inode->delalloc_mutex); 6178 return ret; 6179 } 6180 6181 /** 6182 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6183 * @inode: the inode to release the reservation for. 6184 * @num_bytes: the number of bytes we are releasing. 6185 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation 6186 * 6187 * This will release the metadata reservation for an inode. This can be called 6188 * once we complete IO for a given set of bytes to release their metadata 6189 * reservations, or on error for the same reason. 6190 */ 6191 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, 6192 bool qgroup_free) 6193 { 6194 struct btrfs_fs_info *fs_info = inode->root->fs_info; 6195 6196 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6197 spin_lock(&inode->lock); 6198 inode->csum_bytes -= num_bytes; 6199 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 6200 spin_unlock(&inode->lock); 6201 6202 if (btrfs_is_testing(fs_info)) 6203 return; 6204 6205 btrfs_inode_rsv_release(inode, qgroup_free); 6206 } 6207 6208 /** 6209 * btrfs_delalloc_release_extents - release our outstanding_extents 6210 * @inode: the inode to balance the reservation for. 6211 * @num_bytes: the number of bytes we originally reserved with 6212 * @qgroup_free: do we need to free qgroup meta reservation or convert them. 6213 * 6214 * When we reserve space we increase outstanding_extents for the extents we may 6215 * add. Once we've set the range as delalloc or created our ordered extents we 6216 * have outstanding_extents to track the real usage, so we use this to free our 6217 * temporarily tracked outstanding_extents. This _must_ be used in conjunction 6218 * with btrfs_delalloc_reserve_metadata. 
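 *
 * Rough usage sketch (not a real caller; "inode" is a struct btrfs_inode and
 * dirty_the_range() is a made-up stand-in for whatever marks the range
 * delalloc or creates the ordered extents; the last argument follows the
 * "true on error, false on normal release" convention documented above):
 *
 *	ret = btrfs_delalloc_reserve_metadata(inode, len);
 *	if (ret)
 *		return ret;
 *	ret = dirty_the_range(inode, pos, len);
 *	btrfs_delalloc_release_extents(inode, len, ret != 0);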
 */
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
				    bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned num_extents;

	spin_lock(&inode->lock);
	num_extents = count_max_extents(num_bytes);
	btrfs_mod_outstanding_extents(inode, -num_extents);
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	if (btrfs_is_testing(fs_info))
		return;

	btrfs_inode_rsv_release(inode, qgroup_free);
}

/**
 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 * delalloc
 * @inode: inode we're writing to
 * @start: start of the range we are writing to
 * @len: length of the range we are writing to
 * @reserved: mandatory parameter, records the qgroup ranges actually reserved
 *	      for the current reservation
 *
 * This will do the following things:
 *
 * o reserve space in the data space info for num bytes
 *   and reserve the corresponding qgroup space
 *   (done in check_data_free_space)
 *
 * o reserve metadata space, based on the number of outstanding
 *   extents and how many csums will be needed;
 *   also reserve metadata space in a per-root over-reserve manner.
 * o add to the inode's delalloc_bytes
 * o add it to the fs_info's delalloc inodes list.
 *   (the above 3 are all done in delalloc_reserve_metadata)
 *
 * Return 0 for success
 * Return <0 for error (-ENOSPC or -EDQUOT)
 */
int btrfs_delalloc_reserve_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	int ret;

	ret = btrfs_check_data_free_space(inode, reserved, start, len);
	if (ret < 0)
		return ret;
	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
	if (ret < 0)
		btrfs_free_reserved_data_space(inode, *reserved, start, len);
	return ret;
}

/**
 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 * @inode: inode we're releasing space for
 * @start: start position of the space already reserved
 * @len: length of the space already reserved
 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 *
 * This function will release the metadata space that was not used and will
 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 * list if there are no delalloc bytes left.
 * Also it will handle the qgroup reserved space.
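 *
 * Minimal sketch of the reserve/release pairing (not a real caller; the
 * "set up delalloc" step is a placeholder for locking pages, setting the
 * extent bits, etc.):
 *
 *	struct extent_changeset *data_reserved = NULL;
 *
 *	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
 *	if (ret)
 *		return ret;
 *	ret = ...set up delalloc for [start, start + len)...;
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, data_reserved, start,
 *					     len, true);
 *	extent_changeset_free(data_reserved);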
6288 */ 6289 void btrfs_delalloc_release_space(struct inode *inode, 6290 struct extent_changeset *reserved, 6291 u64 start, u64 len, bool qgroup_free) 6292 { 6293 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); 6294 btrfs_free_reserved_data_space(inode, reserved, start, len); 6295 } 6296 6297 static int update_block_group(struct btrfs_trans_handle *trans, 6298 struct btrfs_fs_info *info, u64 bytenr, 6299 u64 num_bytes, int alloc) 6300 { 6301 struct btrfs_block_group_cache *cache = NULL; 6302 u64 total = num_bytes; 6303 u64 old_val; 6304 u64 byte_in_group; 6305 int factor; 6306 int ret = 0; 6307 6308 /* block accounting for super block */ 6309 spin_lock(&info->delalloc_root_lock); 6310 old_val = btrfs_super_bytes_used(info->super_copy); 6311 if (alloc) 6312 old_val += num_bytes; 6313 else 6314 old_val -= num_bytes; 6315 btrfs_set_super_bytes_used(info->super_copy, old_val); 6316 spin_unlock(&info->delalloc_root_lock); 6317 6318 while (total) { 6319 cache = btrfs_lookup_block_group(info, bytenr); 6320 if (!cache) { 6321 ret = -ENOENT; 6322 break; 6323 } 6324 factor = btrfs_bg_type_to_factor(cache->flags); 6325 6326 /* 6327 * If this block group has free space cache written out, we 6328 * need to make sure to load it if we are removing space. This 6329 * is because we need the unpinning stage to actually add the 6330 * space back to the block group, otherwise we will leak space. 6331 */ 6332 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6333 cache_block_group(cache, 1); 6334 6335 byte_in_group = bytenr - cache->key.objectid; 6336 WARN_ON(byte_in_group > cache->key.offset); 6337 6338 spin_lock(&cache->space_info->lock); 6339 spin_lock(&cache->lock); 6340 6341 if (btrfs_test_opt(info, SPACE_CACHE) && 6342 cache->disk_cache_state < BTRFS_DC_CLEAR) 6343 cache->disk_cache_state = BTRFS_DC_CLEAR; 6344 6345 old_val = btrfs_block_group_used(&cache->item); 6346 num_bytes = min(total, cache->key.offset - byte_in_group); 6347 if (alloc) { 6348 old_val += num_bytes; 6349 btrfs_set_block_group_used(&cache->item, old_val); 6350 cache->reserved -= num_bytes; 6351 cache->space_info->bytes_reserved -= num_bytes; 6352 cache->space_info->bytes_used += num_bytes; 6353 cache->space_info->disk_used += num_bytes * factor; 6354 spin_unlock(&cache->lock); 6355 spin_unlock(&cache->space_info->lock); 6356 } else { 6357 old_val -= num_bytes; 6358 btrfs_set_block_group_used(&cache->item, old_val); 6359 cache->pinned += num_bytes; 6360 update_bytes_pinned(cache->space_info, num_bytes); 6361 cache->space_info->bytes_used -= num_bytes; 6362 cache->space_info->disk_used -= num_bytes * factor; 6363 spin_unlock(&cache->lock); 6364 spin_unlock(&cache->space_info->lock); 6365 6366 trace_btrfs_space_reservation(info, "pinned", 6367 cache->space_info->flags, 6368 num_bytes, 1); 6369 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 6370 num_bytes, 6371 BTRFS_TOTAL_BYTES_PINNED_BATCH); 6372 set_extent_dirty(info->pinned_extents, 6373 bytenr, bytenr + num_bytes - 1, 6374 GFP_NOFS | __GFP_NOFAIL); 6375 } 6376 6377 spin_lock(&trans->transaction->dirty_bgs_lock); 6378 if (list_empty(&cache->dirty_list)) { 6379 list_add_tail(&cache->dirty_list, 6380 &trans->transaction->dirty_bgs); 6381 trans->transaction->num_dirty_bgs++; 6382 trans->delayed_ref_updates++; 6383 btrfs_get_block_group(cache); 6384 } 6385 spin_unlock(&trans->transaction->dirty_bgs_lock); 6386 6387 /* 6388 * No longer have used bytes in this block group, queue it for 6389 * deletion. 
We do this after adding the block group to the 6390 * dirty list to avoid races between cleaner kthread and space 6391 * cache writeout. 6392 */ 6393 if (!alloc && old_val == 0) 6394 btrfs_mark_bg_unused(cache); 6395 6396 btrfs_put_block_group(cache); 6397 total -= num_bytes; 6398 bytenr += num_bytes; 6399 } 6400 6401 /* Modified block groups are accounted for in the delayed_refs_rsv. */ 6402 btrfs_update_delayed_refs_rsv(trans); 6403 return ret; 6404 } 6405 6406 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6407 { 6408 struct btrfs_block_group_cache *cache; 6409 u64 bytenr; 6410 6411 spin_lock(&fs_info->block_group_cache_lock); 6412 bytenr = fs_info->first_logical_byte; 6413 spin_unlock(&fs_info->block_group_cache_lock); 6414 6415 if (bytenr < (u64)-1) 6416 return bytenr; 6417 6418 cache = btrfs_lookup_first_block_group(fs_info, search_start); 6419 if (!cache) 6420 return 0; 6421 6422 bytenr = cache->key.objectid; 6423 btrfs_put_block_group(cache); 6424 6425 return bytenr; 6426 } 6427 6428 static int pin_down_extent(struct btrfs_fs_info *fs_info, 6429 struct btrfs_block_group_cache *cache, 6430 u64 bytenr, u64 num_bytes, int reserved) 6431 { 6432 spin_lock(&cache->space_info->lock); 6433 spin_lock(&cache->lock); 6434 cache->pinned += num_bytes; 6435 update_bytes_pinned(cache->space_info, num_bytes); 6436 if (reserved) { 6437 cache->reserved -= num_bytes; 6438 cache->space_info->bytes_reserved -= num_bytes; 6439 } 6440 spin_unlock(&cache->lock); 6441 spin_unlock(&cache->space_info->lock); 6442 6443 trace_btrfs_space_reservation(fs_info, "pinned", 6444 cache->space_info->flags, num_bytes, 1); 6445 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, 6446 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); 6447 set_extent_dirty(fs_info->pinned_extents, bytenr, 6448 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6449 return 0; 6450 } 6451 6452 /* 6453 * this function must be called within transaction 6454 */ 6455 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 6456 u64 bytenr, u64 num_bytes, int reserved) 6457 { 6458 struct btrfs_block_group_cache *cache; 6459 6460 cache = btrfs_lookup_block_group(fs_info, bytenr); 6461 BUG_ON(!cache); /* Logic error */ 6462 6463 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); 6464 6465 btrfs_put_block_group(cache); 6466 return 0; 6467 } 6468 6469 /* 6470 * this function must be called within transaction 6471 */ 6472 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 6473 u64 bytenr, u64 num_bytes) 6474 { 6475 struct btrfs_block_group_cache *cache; 6476 int ret; 6477 6478 cache = btrfs_lookup_block_group(fs_info, bytenr); 6479 if (!cache) 6480 return -EINVAL; 6481 6482 /* 6483 * pull in the free space cache (if any) so that our pin 6484 * removes the free space from the cache. We have load_only set 6485 * to one because the slow code to read in the free extents does check 6486 * the pinned extents. 
6487 */ 6488 cache_block_group(cache, 1); 6489 6490 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); 6491 6492 /* remove us from the free space cache (if we're there at all) */ 6493 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6494 btrfs_put_block_group(cache); 6495 return ret; 6496 } 6497 6498 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 6499 u64 start, u64 num_bytes) 6500 { 6501 int ret; 6502 struct btrfs_block_group_cache *block_group; 6503 struct btrfs_caching_control *caching_ctl; 6504 6505 block_group = btrfs_lookup_block_group(fs_info, start); 6506 if (!block_group) 6507 return -EINVAL; 6508 6509 cache_block_group(block_group, 0); 6510 caching_ctl = get_caching_control(block_group); 6511 6512 if (!caching_ctl) { 6513 /* Logic error */ 6514 BUG_ON(!block_group_cache_done(block_group)); 6515 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6516 } else { 6517 mutex_lock(&caching_ctl->mutex); 6518 6519 if (start >= caching_ctl->progress) { 6520 ret = add_excluded_extent(fs_info, start, num_bytes); 6521 } else if (start + num_bytes <= caching_ctl->progress) { 6522 ret = btrfs_remove_free_space(block_group, 6523 start, num_bytes); 6524 } else { 6525 num_bytes = caching_ctl->progress - start; 6526 ret = btrfs_remove_free_space(block_group, 6527 start, num_bytes); 6528 if (ret) 6529 goto out_lock; 6530 6531 num_bytes = (start + num_bytes) - 6532 caching_ctl->progress; 6533 start = caching_ctl->progress; 6534 ret = add_excluded_extent(fs_info, start, num_bytes); 6535 } 6536 out_lock: 6537 mutex_unlock(&caching_ctl->mutex); 6538 put_caching_control(caching_ctl); 6539 } 6540 btrfs_put_block_group(block_group); 6541 return ret; 6542 } 6543 6544 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, 6545 struct extent_buffer *eb) 6546 { 6547 struct btrfs_file_extent_item *item; 6548 struct btrfs_key key; 6549 int found_type; 6550 int i; 6551 int ret = 0; 6552 6553 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 6554 return 0; 6555 6556 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6557 btrfs_item_key_to_cpu(eb, &key, i); 6558 if (key.type != BTRFS_EXTENT_DATA_KEY) 6559 continue; 6560 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6561 found_type = btrfs_file_extent_type(eb, item); 6562 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6563 continue; 6564 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6565 continue; 6566 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6567 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6568 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 6569 if (ret) 6570 break; 6571 } 6572 6573 return ret; 6574 } 6575 6576 static void 6577 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6578 { 6579 atomic_inc(&bg->reservations); 6580 } 6581 6582 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6583 const u64 start) 6584 { 6585 struct btrfs_block_group_cache *bg; 6586 6587 bg = btrfs_lookup_block_group(fs_info, start); 6588 ASSERT(bg); 6589 if (atomic_dec_and_test(&bg->reservations)) 6590 wake_up_var(&bg->reservations); 6591 btrfs_put_block_group(bg); 6592 } 6593 6594 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6595 { 6596 struct btrfs_space_info *space_info = bg->space_info; 6597 6598 ASSERT(bg->ro); 6599 6600 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6601 return; 6602 6603 /* 6604 * Our block group is read only but before we set it to read only, 6605 * some task might have had allocated an 
extent from it already, but it 6606 * has not yet created a respective ordered extent (and added it to a 6607 * root's list of ordered extents). 6608 * Therefore wait for any task currently allocating extents, since the 6609 * block group's reservations counter is incremented while a read lock 6610 * on the groups' semaphore is held and decremented after releasing 6611 * the read access on that semaphore and creating the ordered extent. 6612 */ 6613 down_write(&space_info->groups_sem); 6614 up_write(&space_info->groups_sem); 6615 6616 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); 6617 } 6618 6619 /** 6620 * btrfs_add_reserved_bytes - update the block_group and space info counters 6621 * @cache: The cache we are manipulating 6622 * @ram_bytes: The number of bytes of file content, and will be same to 6623 * @num_bytes except for the compress path. 6624 * @num_bytes: The number of bytes in question 6625 * @delalloc: The blocks are allocated for the delalloc write 6626 * 6627 * This is called by the allocator when it reserves space. If this is a 6628 * reservation and the block group has become read only we cannot make the 6629 * reservation and return -EAGAIN, otherwise this function always succeeds. 6630 */ 6631 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, 6632 u64 ram_bytes, u64 num_bytes, int delalloc) 6633 { 6634 struct btrfs_space_info *space_info = cache->space_info; 6635 int ret = 0; 6636 6637 spin_lock(&space_info->lock); 6638 spin_lock(&cache->lock); 6639 if (cache->ro) { 6640 ret = -EAGAIN; 6641 } else { 6642 cache->reserved += num_bytes; 6643 space_info->bytes_reserved += num_bytes; 6644 update_bytes_may_use(space_info, -ram_bytes); 6645 if (delalloc) 6646 cache->delalloc_bytes += num_bytes; 6647 } 6648 spin_unlock(&cache->lock); 6649 spin_unlock(&space_info->lock); 6650 return ret; 6651 } 6652 6653 /** 6654 * btrfs_free_reserved_bytes - update the block_group and space info counters 6655 * @cache: The cache we are manipulating 6656 * @num_bytes: The number of bytes in question 6657 * @delalloc: The blocks are allocated for the delalloc write 6658 * 6659 * This is called by somebody who is freeing space that was never actually used 6660 * on disk. For example if you reserve some space for a new leaf in transaction 6661 * A and before transaction A commits you free that leaf, you call this with 6662 * reserve set to 0 in order to clear the reservation. 
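 *
 * Roughly what the "free an unused reservation" path in this file does
 * (sketch only, error handling and the pinned case omitted):
 *
 *	cache = btrfs_lookup_block_group(fs_info, start);
 *	btrfs_add_free_space(cache, start, len);
 *	btrfs_free_reserved_bytes(cache, len, 0);
 *	btrfs_put_block_group(cache);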
6663 */ 6664 6665 static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 6666 u64 num_bytes, int delalloc) 6667 { 6668 struct btrfs_space_info *space_info = cache->space_info; 6669 6670 spin_lock(&space_info->lock); 6671 spin_lock(&cache->lock); 6672 if (cache->ro) 6673 space_info->bytes_readonly += num_bytes; 6674 cache->reserved -= num_bytes; 6675 space_info->bytes_reserved -= num_bytes; 6676 space_info->max_extent_size = 0; 6677 6678 if (delalloc) 6679 cache->delalloc_bytes -= num_bytes; 6680 spin_unlock(&cache->lock); 6681 spin_unlock(&space_info->lock); 6682 } 6683 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 6684 { 6685 struct btrfs_caching_control *next; 6686 struct btrfs_caching_control *caching_ctl; 6687 struct btrfs_block_group_cache *cache; 6688 6689 down_write(&fs_info->commit_root_sem); 6690 6691 list_for_each_entry_safe(caching_ctl, next, 6692 &fs_info->caching_block_groups, list) { 6693 cache = caching_ctl->block_group; 6694 if (block_group_cache_done(cache)) { 6695 cache->last_byte_to_unpin = (u64)-1; 6696 list_del_init(&caching_ctl->list); 6697 put_caching_control(caching_ctl); 6698 } else { 6699 cache->last_byte_to_unpin = caching_ctl->progress; 6700 } 6701 } 6702 6703 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6704 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6705 else 6706 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6707 6708 up_write(&fs_info->commit_root_sem); 6709 6710 update_global_block_rsv(fs_info); 6711 } 6712 6713 /* 6714 * Returns the free cluster for the given space info and sets empty_cluster to 6715 * what it should be based on the mount options. 6716 */ 6717 static struct btrfs_free_cluster * 6718 fetch_cluster_info(struct btrfs_fs_info *fs_info, 6719 struct btrfs_space_info *space_info, u64 *empty_cluster) 6720 { 6721 struct btrfs_free_cluster *ret = NULL; 6722 6723 *empty_cluster = 0; 6724 if (btrfs_mixed_space_info(space_info)) 6725 return ret; 6726 6727 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6728 ret = &fs_info->meta_alloc_cluster; 6729 if (btrfs_test_opt(fs_info, SSD)) 6730 *empty_cluster = SZ_2M; 6731 else 6732 *empty_cluster = SZ_64K; 6733 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 6734 btrfs_test_opt(fs_info, SSD_SPREAD)) { 6735 *empty_cluster = SZ_2M; 6736 ret = &fs_info->data_alloc_cluster; 6737 } 6738 6739 return ret; 6740 } 6741 6742 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 6743 u64 start, u64 end, 6744 const bool return_free_space) 6745 { 6746 struct btrfs_block_group_cache *cache = NULL; 6747 struct btrfs_space_info *space_info; 6748 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6749 struct btrfs_free_cluster *cluster = NULL; 6750 u64 len; 6751 u64 total_unpinned = 0; 6752 u64 empty_cluster = 0; 6753 bool readonly; 6754 6755 while (start <= end) { 6756 readonly = false; 6757 if (!cache || 6758 start >= cache->key.objectid + cache->key.offset) { 6759 if (cache) 6760 btrfs_put_block_group(cache); 6761 total_unpinned = 0; 6762 cache = btrfs_lookup_block_group(fs_info, start); 6763 BUG_ON(!cache); /* Logic error */ 6764 6765 cluster = fetch_cluster_info(fs_info, 6766 cache->space_info, 6767 &empty_cluster); 6768 empty_cluster <<= 1; 6769 } 6770 6771 len = cache->key.objectid + cache->key.offset - start; 6772 len = min(len, end + 1 - start); 6773 6774 if (start < cache->last_byte_to_unpin) { 6775 len = min(len, cache->last_byte_to_unpin - start); 6776 if (return_free_space) 6777 
btrfs_add_free_space(cache, start, len); 6778 } 6779 6780 start += len; 6781 total_unpinned += len; 6782 space_info = cache->space_info; 6783 6784 /* 6785 * If this space cluster has been marked as fragmented and we've 6786 * unpinned enough in this block group to potentially allow a 6787 * cluster to be created inside of it go ahead and clear the 6788 * fragmented check. 6789 */ 6790 if (cluster && cluster->fragmented && 6791 total_unpinned > empty_cluster) { 6792 spin_lock(&cluster->lock); 6793 cluster->fragmented = 0; 6794 spin_unlock(&cluster->lock); 6795 } 6796 6797 spin_lock(&space_info->lock); 6798 spin_lock(&cache->lock); 6799 cache->pinned -= len; 6800 update_bytes_pinned(space_info, -len); 6801 6802 trace_btrfs_space_reservation(fs_info, "pinned", 6803 space_info->flags, len, 0); 6804 space_info->max_extent_size = 0; 6805 percpu_counter_add_batch(&space_info->total_bytes_pinned, 6806 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); 6807 if (cache->ro) { 6808 space_info->bytes_readonly += len; 6809 readonly = true; 6810 } 6811 spin_unlock(&cache->lock); 6812 if (!readonly && return_free_space && 6813 global_rsv->space_info == space_info) { 6814 u64 to_add = len; 6815 6816 spin_lock(&global_rsv->lock); 6817 if (!global_rsv->full) { 6818 to_add = min(len, global_rsv->size - 6819 global_rsv->reserved); 6820 global_rsv->reserved += to_add; 6821 update_bytes_may_use(space_info, to_add); 6822 if (global_rsv->reserved >= global_rsv->size) 6823 global_rsv->full = 1; 6824 trace_btrfs_space_reservation(fs_info, 6825 "space_info", 6826 space_info->flags, 6827 to_add, 1); 6828 len -= to_add; 6829 } 6830 spin_unlock(&global_rsv->lock); 6831 /* Add to any tickets we may have */ 6832 if (len) 6833 space_info_add_new_bytes(fs_info, space_info, 6834 len); 6835 } 6836 spin_unlock(&space_info->lock); 6837 } 6838 6839 if (cache) 6840 btrfs_put_block_group(cache); 6841 return 0; 6842 } 6843 6844 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 6845 { 6846 struct btrfs_fs_info *fs_info = trans->fs_info; 6847 struct btrfs_block_group_cache *block_group, *tmp; 6848 struct list_head *deleted_bgs; 6849 struct extent_io_tree *unpin; 6850 u64 start; 6851 u64 end; 6852 int ret; 6853 6854 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6855 unpin = &fs_info->freed_extents[1]; 6856 else 6857 unpin = &fs_info->freed_extents[0]; 6858 6859 while (!trans->aborted) { 6860 struct extent_state *cached_state = NULL; 6861 6862 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6863 ret = find_first_extent_bit(unpin, 0, &start, &end, 6864 EXTENT_DIRTY, &cached_state); 6865 if (ret) { 6866 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6867 break; 6868 } 6869 6870 if (btrfs_test_opt(fs_info, DISCARD)) 6871 ret = btrfs_discard_extent(fs_info, start, 6872 end + 1 - start, NULL); 6873 6874 clear_extent_dirty(unpin, start, end, &cached_state); 6875 unpin_extent_range(fs_info, start, end, true); 6876 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6877 free_extent_state(cached_state); 6878 cond_resched(); 6879 } 6880 6881 /* 6882 * Transaction is finished. We don't need the lock anymore. We 6883 * do need to clean up the block groups in case of a transaction 6884 * abort. 
6885 */ 6886 deleted_bgs = &trans->transaction->deleted_bgs; 6887 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6888 u64 trimmed = 0; 6889 6890 ret = -EROFS; 6891 if (!trans->aborted) 6892 ret = btrfs_discard_extent(fs_info, 6893 block_group->key.objectid, 6894 block_group->key.offset, 6895 &trimmed); 6896 6897 list_del_init(&block_group->bg_list); 6898 btrfs_put_block_group_trimming(block_group); 6899 btrfs_put_block_group(block_group); 6900 6901 if (ret) { 6902 const char *errstr = btrfs_decode_error(ret); 6903 btrfs_warn(fs_info, 6904 "discard failed while removing blockgroup: errno=%d %s", 6905 ret, errstr); 6906 } 6907 } 6908 6909 return 0; 6910 } 6911 6912 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6913 struct btrfs_delayed_ref_node *node, u64 parent, 6914 u64 root_objectid, u64 owner_objectid, 6915 u64 owner_offset, int refs_to_drop, 6916 struct btrfs_delayed_extent_op *extent_op) 6917 { 6918 struct btrfs_fs_info *info = trans->fs_info; 6919 struct btrfs_key key; 6920 struct btrfs_path *path; 6921 struct btrfs_root *extent_root = info->extent_root; 6922 struct extent_buffer *leaf; 6923 struct btrfs_extent_item *ei; 6924 struct btrfs_extent_inline_ref *iref; 6925 int ret; 6926 int is_data; 6927 int extent_slot = 0; 6928 int found_extent = 0; 6929 int num_to_del = 1; 6930 u32 item_size; 6931 u64 refs; 6932 u64 bytenr = node->bytenr; 6933 u64 num_bytes = node->num_bytes; 6934 int last_ref = 0; 6935 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 6936 6937 path = btrfs_alloc_path(); 6938 if (!path) 6939 return -ENOMEM; 6940 6941 path->reada = READA_FORWARD; 6942 path->leave_spinning = 1; 6943 6944 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 6945 BUG_ON(!is_data && refs_to_drop != 1); 6946 6947 if (is_data) 6948 skinny_metadata = false; 6949 6950 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 6951 parent, root_objectid, owner_objectid, 6952 owner_offset); 6953 if (ret == 0) { 6954 extent_slot = path->slots[0]; 6955 while (extent_slot >= 0) { 6956 btrfs_item_key_to_cpu(path->nodes[0], &key, 6957 extent_slot); 6958 if (key.objectid != bytenr) 6959 break; 6960 if (key.type == BTRFS_EXTENT_ITEM_KEY && 6961 key.offset == num_bytes) { 6962 found_extent = 1; 6963 break; 6964 } 6965 if (key.type == BTRFS_METADATA_ITEM_KEY && 6966 key.offset == owner_objectid) { 6967 found_extent = 1; 6968 break; 6969 } 6970 if (path->slots[0] - extent_slot > 5) 6971 break; 6972 extent_slot--; 6973 } 6974 6975 if (!found_extent) { 6976 BUG_ON(iref); 6977 ret = remove_extent_backref(trans, path, NULL, 6978 refs_to_drop, 6979 is_data, &last_ref); 6980 if (ret) { 6981 btrfs_abort_transaction(trans, ret); 6982 goto out; 6983 } 6984 btrfs_release_path(path); 6985 path->leave_spinning = 1; 6986 6987 key.objectid = bytenr; 6988 key.type = BTRFS_EXTENT_ITEM_KEY; 6989 key.offset = num_bytes; 6990 6991 if (!is_data && skinny_metadata) { 6992 key.type = BTRFS_METADATA_ITEM_KEY; 6993 key.offset = owner_objectid; 6994 } 6995 6996 ret = btrfs_search_slot(trans, extent_root, 6997 &key, path, -1, 1); 6998 if (ret > 0 && skinny_metadata && path->slots[0]) { 6999 /* 7000 * Couldn't find our skinny metadata item, 7001 * see if we have ye olde extent item. 
7002 */ 7003 path->slots[0]--; 7004 btrfs_item_key_to_cpu(path->nodes[0], &key, 7005 path->slots[0]); 7006 if (key.objectid == bytenr && 7007 key.type == BTRFS_EXTENT_ITEM_KEY && 7008 key.offset == num_bytes) 7009 ret = 0; 7010 } 7011 7012 if (ret > 0 && skinny_metadata) { 7013 skinny_metadata = false; 7014 key.objectid = bytenr; 7015 key.type = BTRFS_EXTENT_ITEM_KEY; 7016 key.offset = num_bytes; 7017 btrfs_release_path(path); 7018 ret = btrfs_search_slot(trans, extent_root, 7019 &key, path, -1, 1); 7020 } 7021 7022 if (ret) { 7023 btrfs_err(info, 7024 "umm, got %d back from search, was looking for %llu", 7025 ret, bytenr); 7026 if (ret > 0) 7027 btrfs_print_leaf(path->nodes[0]); 7028 } 7029 if (ret < 0) { 7030 btrfs_abort_transaction(trans, ret); 7031 goto out; 7032 } 7033 extent_slot = path->slots[0]; 7034 } 7035 } else if (WARN_ON(ret == -ENOENT)) { 7036 btrfs_print_leaf(path->nodes[0]); 7037 btrfs_err(info, 7038 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 7039 bytenr, parent, root_objectid, owner_objectid, 7040 owner_offset); 7041 btrfs_abort_transaction(trans, ret); 7042 goto out; 7043 } else { 7044 btrfs_abort_transaction(trans, ret); 7045 goto out; 7046 } 7047 7048 leaf = path->nodes[0]; 7049 item_size = btrfs_item_size_nr(leaf, extent_slot); 7050 if (unlikely(item_size < sizeof(*ei))) { 7051 ret = -EINVAL; 7052 btrfs_print_v0_err(info); 7053 btrfs_abort_transaction(trans, ret); 7054 goto out; 7055 } 7056 ei = btrfs_item_ptr(leaf, extent_slot, 7057 struct btrfs_extent_item); 7058 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 7059 key.type == BTRFS_EXTENT_ITEM_KEY) { 7060 struct btrfs_tree_block_info *bi; 7061 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 7062 bi = (struct btrfs_tree_block_info *)(ei + 1); 7063 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 7064 } 7065 7066 refs = btrfs_extent_refs(leaf, ei); 7067 if (refs < refs_to_drop) { 7068 btrfs_err(info, 7069 "trying to drop %d refs but we only have %Lu for bytenr %Lu", 7070 refs_to_drop, refs, bytenr); 7071 ret = -EINVAL; 7072 btrfs_abort_transaction(trans, ret); 7073 goto out; 7074 } 7075 refs -= refs_to_drop; 7076 7077 if (refs > 0) { 7078 if (extent_op) 7079 __run_delayed_extent_op(extent_op, leaf, ei); 7080 /* 7081 * In the case of inline back ref, reference count will 7082 * be updated by remove_extent_backref 7083 */ 7084 if (iref) { 7085 BUG_ON(!found_extent); 7086 } else { 7087 btrfs_set_extent_refs(leaf, ei, refs); 7088 btrfs_mark_buffer_dirty(leaf); 7089 } 7090 if (found_extent) { 7091 ret = remove_extent_backref(trans, path, iref, 7092 refs_to_drop, is_data, 7093 &last_ref); 7094 if (ret) { 7095 btrfs_abort_transaction(trans, ret); 7096 goto out; 7097 } 7098 } 7099 } else { 7100 if (found_extent) { 7101 BUG_ON(is_data && refs_to_drop != 7102 extent_data_ref_count(path, iref)); 7103 if (iref) { 7104 BUG_ON(path->slots[0] != extent_slot); 7105 } else { 7106 BUG_ON(path->slots[0] != extent_slot + 1); 7107 path->slots[0] = extent_slot; 7108 num_to_del = 2; 7109 } 7110 } 7111 7112 last_ref = 1; 7113 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 7114 num_to_del); 7115 if (ret) { 7116 btrfs_abort_transaction(trans, ret); 7117 goto out; 7118 } 7119 btrfs_release_path(path); 7120 7121 if (is_data) { 7122 ret = btrfs_del_csums(trans, info, bytenr, num_bytes); 7123 if (ret) { 7124 btrfs_abort_transaction(trans, ret); 7125 goto out; 7126 } 7127 } 7128 7129 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 7130 if (ret) { 7131 
btrfs_abort_transaction(trans, ret); 7132 goto out; 7133 } 7134 7135 ret = update_block_group(trans, info, bytenr, num_bytes, 0); 7136 if (ret) { 7137 btrfs_abort_transaction(trans, ret); 7138 goto out; 7139 } 7140 } 7141 btrfs_release_path(path); 7142 7143 out: 7144 btrfs_free_path(path); 7145 return ret; 7146 } 7147 7148 /* 7149 * when we free an block, it is possible (and likely) that we free the last 7150 * delayed ref for that extent as well. This searches the delayed ref tree for 7151 * a given extent, and if there are no other delayed refs to be processed, it 7152 * removes it from the tree. 7153 */ 7154 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 7155 u64 bytenr) 7156 { 7157 struct btrfs_delayed_ref_head *head; 7158 struct btrfs_delayed_ref_root *delayed_refs; 7159 int ret = 0; 7160 7161 delayed_refs = &trans->transaction->delayed_refs; 7162 spin_lock(&delayed_refs->lock); 7163 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 7164 if (!head) 7165 goto out_delayed_unlock; 7166 7167 spin_lock(&head->lock); 7168 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 7169 goto out; 7170 7171 if (cleanup_extent_op(head) != NULL) 7172 goto out; 7173 7174 /* 7175 * waiting for the lock here would deadlock. If someone else has it 7176 * locked they are already in the process of dropping it anyway 7177 */ 7178 if (!mutex_trylock(&head->mutex)) 7179 goto out; 7180 7181 btrfs_delete_ref_head(delayed_refs, head); 7182 head->processing = 0; 7183 7184 spin_unlock(&head->lock); 7185 spin_unlock(&delayed_refs->lock); 7186 7187 BUG_ON(head->extent_op); 7188 if (head->must_insert_reserved) 7189 ret = 1; 7190 7191 cleanup_ref_head_accounting(trans, head); 7192 mutex_unlock(&head->mutex); 7193 btrfs_put_delayed_ref_head(head); 7194 return ret; 7195 out: 7196 spin_unlock(&head->lock); 7197 7198 out_delayed_unlock: 7199 spin_unlock(&delayed_refs->lock); 7200 return 0; 7201 } 7202 7203 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 7204 struct btrfs_root *root, 7205 struct extent_buffer *buf, 7206 u64 parent, int last_ref) 7207 { 7208 struct btrfs_fs_info *fs_info = root->fs_info; 7209 int pin = 1; 7210 int ret; 7211 7212 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7213 int old_ref_mod, new_ref_mod; 7214 7215 btrfs_ref_tree_mod(root, buf->start, buf->len, parent, 7216 root->root_key.objectid, 7217 btrfs_header_level(buf), 0, 7218 BTRFS_DROP_DELAYED_REF); 7219 ret = btrfs_add_delayed_tree_ref(trans, buf->start, 7220 buf->len, parent, 7221 root->root_key.objectid, 7222 btrfs_header_level(buf), 7223 BTRFS_DROP_DELAYED_REF, NULL, 7224 &old_ref_mod, &new_ref_mod); 7225 BUG_ON(ret); /* -ENOMEM */ 7226 pin = old_ref_mod >= 0 && new_ref_mod < 0; 7227 } 7228 7229 if (last_ref && btrfs_header_generation(buf) == trans->transid) { 7230 struct btrfs_block_group_cache *cache; 7231 7232 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7233 ret = check_ref_cleanup(trans, buf->start); 7234 if (!ret) 7235 goto out; 7236 } 7237 7238 pin = 0; 7239 cache = btrfs_lookup_block_group(fs_info, buf->start); 7240 7241 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7242 pin_down_extent(fs_info, cache, buf->start, 7243 buf->len, 1); 7244 btrfs_put_block_group(cache); 7245 goto out; 7246 } 7247 7248 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7249 7250 btrfs_add_free_space(cache, buf->start, buf->len); 7251 btrfs_free_reserved_bytes(cache, buf->len, 0); 7252 btrfs_put_block_group(cache); 7253 trace_btrfs_reserved_extent_free(fs_info, buf->start, 
buf->len); 7254 } 7255 out: 7256 if (pin) 7257 add_pinned_bytes(fs_info, buf->len, true, 7258 root->root_key.objectid); 7259 7260 if (last_ref) { 7261 /* 7262 * Deleting the buffer, clear the corrupt flag since it doesn't 7263 * matter anymore. 7264 */ 7265 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7266 } 7267 } 7268 7269 /* Can return -ENOMEM */ 7270 int btrfs_free_extent(struct btrfs_trans_handle *trans, 7271 struct btrfs_root *root, 7272 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7273 u64 owner, u64 offset) 7274 { 7275 struct btrfs_fs_info *fs_info = root->fs_info; 7276 int old_ref_mod, new_ref_mod; 7277 int ret; 7278 7279 if (btrfs_is_testing(fs_info)) 7280 return 0; 7281 7282 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) 7283 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, 7284 root_objectid, owner, offset, 7285 BTRFS_DROP_DELAYED_REF); 7286 7287 /* 7288 * tree log blocks never actually go into the extent allocation 7289 * tree, just update pinning info and exit early. 7290 */ 7291 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 7292 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7293 /* unlocks the pinned mutex */ 7294 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); 7295 old_ref_mod = new_ref_mod = 0; 7296 ret = 0; 7297 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7298 ret = btrfs_add_delayed_tree_ref(trans, bytenr, 7299 num_bytes, parent, 7300 root_objectid, (int)owner, 7301 BTRFS_DROP_DELAYED_REF, NULL, 7302 &old_ref_mod, &new_ref_mod); 7303 } else { 7304 ret = btrfs_add_delayed_data_ref(trans, bytenr, 7305 num_bytes, parent, 7306 root_objectid, owner, offset, 7307 0, BTRFS_DROP_DELAYED_REF, 7308 &old_ref_mod, &new_ref_mod); 7309 } 7310 7311 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) { 7312 bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; 7313 7314 add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid); 7315 } 7316 7317 return ret; 7318 } 7319 7320 /* 7321 * when we wait for progress in the block group caching, its because 7322 * our allocation attempt failed at least once. So, we must sleep 7323 * and let some progress happen before we try again. 7324 * 7325 * This function will sleep at least once waiting for new free space to 7326 * show up, and then it will check the block group free space numbers 7327 * for our min num_bytes. Another option is to have it go ahead 7328 * and look in the rbtree for a free extent of a given size, but this 7329 * is a good start. 7330 * 7331 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7332 * any of the information in this block group. 7333 */ 7334 static noinline void 7335 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7336 u64 num_bytes) 7337 { 7338 struct btrfs_caching_control *caching_ctl; 7339 7340 caching_ctl = get_caching_control(cache); 7341 if (!caching_ctl) 7342 return; 7343 7344 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7345 (cache->free_space_ctl->free_space >= num_bytes)); 7346 7347 put_caching_control(caching_ctl); 7348 } 7349 7350 static noinline int 7351 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7352 { 7353 struct btrfs_caching_control *caching_ctl; 7354 int ret = 0; 7355 7356 caching_ctl = get_caching_control(cache); 7357 if (!caching_ctl) 7358 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 7359 7360 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7361 if (cache->cached == BTRFS_CACHE_ERROR) 7362 ret = -EIO; 7363 put_caching_control(caching_ctl); 7364 return ret; 7365 } 7366 7367 enum btrfs_loop_type { 7368 LOOP_CACHING_NOWAIT = 0, 7369 LOOP_CACHING_WAIT = 1, 7370 LOOP_ALLOC_CHUNK = 2, 7371 LOOP_NO_EMPTY_SIZE = 3, 7372 }; 7373 7374 static inline void 7375 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7376 int delalloc) 7377 { 7378 if (delalloc) 7379 down_read(&cache->data_rwsem); 7380 } 7381 7382 static inline void 7383 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7384 int delalloc) 7385 { 7386 btrfs_get_block_group(cache); 7387 if (delalloc) 7388 down_read(&cache->data_rwsem); 7389 } 7390 7391 static struct btrfs_block_group_cache * 7392 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7393 struct btrfs_free_cluster *cluster, 7394 int delalloc) 7395 { 7396 struct btrfs_block_group_cache *used_bg = NULL; 7397 7398 spin_lock(&cluster->refill_lock); 7399 while (1) { 7400 used_bg = cluster->block_group; 7401 if (!used_bg) 7402 return NULL; 7403 7404 if (used_bg == block_group) 7405 return used_bg; 7406 7407 btrfs_get_block_group(used_bg); 7408 7409 if (!delalloc) 7410 return used_bg; 7411 7412 if (down_read_trylock(&used_bg->data_rwsem)) 7413 return used_bg; 7414 7415 spin_unlock(&cluster->refill_lock); 7416 7417 /* We should only have one-level nested. */ 7418 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 7419 7420 spin_lock(&cluster->refill_lock); 7421 if (used_bg == cluster->block_group) 7422 return used_bg; 7423 7424 up_read(&used_bg->data_rwsem); 7425 btrfs_put_block_group(used_bg); 7426 } 7427 } 7428 7429 static inline void 7430 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7431 int delalloc) 7432 { 7433 if (delalloc) 7434 up_read(&cache->data_rwsem); 7435 btrfs_put_block_group(cache); 7436 } 7437 7438 /* 7439 * Structure used internally for find_free_extent() function. Wraps needed 7440 * parameters. 7441 */ 7442 struct find_free_extent_ctl { 7443 /* Basic allocation info */ 7444 u64 ram_bytes; 7445 u64 num_bytes; 7446 u64 empty_size; 7447 u64 flags; 7448 int delalloc; 7449 7450 /* Where to start the search inside the bg */ 7451 u64 search_start; 7452 7453 /* For clustered allocation */ 7454 u64 empty_cluster; 7455 7456 bool have_caching_bg; 7457 bool orig_have_caching_bg; 7458 7459 /* RAID index, converted from flags */ 7460 int index; 7461 7462 /* 7463 * Current loop number, check find_free_extent_update_loop() for details 7464 */ 7465 int loop; 7466 7467 /* 7468 * Whether we're refilling a cluster, if true we need to re-search 7469 * current block group but don't try to refill the cluster again. 7470 */ 7471 bool retry_clustered; 7472 7473 /* 7474 * Whether we're updating free space cache, if true we need to re-search 7475 * current block group but don't try updating free space cache again. 7476 */ 7477 bool retry_unclustered; 7478 7479 /* If current block group is cached */ 7480 int cached; 7481 7482 /* Max contiguous hole found */ 7483 u64 max_extent_size; 7484 7485 /* Total free space from free space cache, not always contiguous */ 7486 u64 total_free_space; 7487 7488 /* Found result */ 7489 u64 found_offset; 7490 }; 7491 7492 7493 /* 7494 * Helper function for find_free_extent(). 7495 * 7496 * Return -ENOENT to inform caller that we need fallback to unclustered mode. 
 * Return -EAGAIN to inform caller that we need to re-search this block group
 * Return >0 to inform caller that we found nothing
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl,
		struct btrfs_block_group_cache **cluster_bg_ret)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_block_group_cache *cluster_bg;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->key.objectid,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
7544 */ 7545 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { 7546 spin_unlock(&last_ptr->refill_lock); 7547 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 7548 return -ENOENT; 7549 } 7550 7551 /* This cluster didn't work out, free it and start over */ 7552 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7553 7554 if (cluster_bg != bg) 7555 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 7556 7557 refill_cluster: 7558 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { 7559 spin_unlock(&last_ptr->refill_lock); 7560 return -ENOENT; 7561 } 7562 7563 aligned_cluster = max_t(u64, 7564 ffe_ctl->empty_cluster + ffe_ctl->empty_size, 7565 bg->full_stripe_len); 7566 ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, 7567 ffe_ctl->search_start, ffe_ctl->num_bytes, 7568 aligned_cluster); 7569 if (ret == 0) { 7570 /* Now pull our allocation out of this cluster */ 7571 offset = btrfs_alloc_from_cluster(bg, last_ptr, 7572 ffe_ctl->num_bytes, ffe_ctl->search_start, 7573 &ffe_ctl->max_extent_size); 7574 if (offset) { 7575 /* We found one, proceed */ 7576 spin_unlock(&last_ptr->refill_lock); 7577 trace_btrfs_reserve_extent_cluster(bg, 7578 ffe_ctl->search_start, 7579 ffe_ctl->num_bytes); 7580 ffe_ctl->found_offset = offset; 7581 return 0; 7582 } 7583 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && 7584 !ffe_ctl->retry_clustered) { 7585 spin_unlock(&last_ptr->refill_lock); 7586 7587 ffe_ctl->retry_clustered = true; 7588 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 7589 ffe_ctl->empty_cluster + ffe_ctl->empty_size); 7590 return -EAGAIN; 7591 } 7592 /* 7593 * At this point we either didn't find a cluster or we weren't able to 7594 * allocate a block from our cluster. Free the cluster we've been 7595 * trying to use, and go to the next block group. 7596 */ 7597 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7598 spin_unlock(&last_ptr->refill_lock); 7599 return 1; 7600 } 7601 7602 /* 7603 * Return >0 to inform caller that we find nothing 7604 * Return 0 when we found an free extent and set ffe_ctrl->found_offset 7605 * Return -EAGAIN to inform caller that we need to re-search this block group 7606 */ 7607 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, 7608 struct btrfs_free_cluster *last_ptr, 7609 struct find_free_extent_ctl *ffe_ctl) 7610 { 7611 u64 offset; 7612 7613 /* 7614 * We are doing an unclustered allocation, set the fragmented flag so 7615 * we don't bother trying to setup a cluster again until we get more 7616 * space. 
7617 */ 7618 if (unlikely(last_ptr)) { 7619 spin_lock(&last_ptr->lock); 7620 last_ptr->fragmented = 1; 7621 spin_unlock(&last_ptr->lock); 7622 } 7623 if (ffe_ctl->cached) { 7624 struct btrfs_free_space_ctl *free_space_ctl; 7625 7626 free_space_ctl = bg->free_space_ctl; 7627 spin_lock(&free_space_ctl->tree_lock); 7628 if (free_space_ctl->free_space < 7629 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 7630 ffe_ctl->empty_size) { 7631 ffe_ctl->total_free_space = max_t(u64, 7632 ffe_ctl->total_free_space, 7633 free_space_ctl->free_space); 7634 spin_unlock(&free_space_ctl->tree_lock); 7635 return 1; 7636 } 7637 spin_unlock(&free_space_ctl->tree_lock); 7638 } 7639 7640 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 7641 ffe_ctl->num_bytes, ffe_ctl->empty_size, 7642 &ffe_ctl->max_extent_size); 7643 7644 /* 7645 * If we didn't find a chunk, and we haven't failed on this block group 7646 * before, and this block group is in the middle of caching and we are 7647 * ok with waiting, then go ahead and wait for progress to be made, and 7648 * set @retry_unclustered to true. 7649 * 7650 * If @retry_unclustered is true then we've already waited on this 7651 * block group once and should move on to the next block group. 7652 */ 7653 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && 7654 ffe_ctl->loop > LOOP_CACHING_NOWAIT) { 7655 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 7656 ffe_ctl->empty_size); 7657 ffe_ctl->retry_unclustered = true; 7658 return -EAGAIN; 7659 } else if (!offset) { 7660 return 1; 7661 } 7662 ffe_ctl->found_offset = offset; 7663 return 0; 7664 } 7665 7666 /* 7667 * Return >0 means caller needs to re-search for free extent 7668 * Return 0 means we have the needed free extent. 7669 * Return <0 means we failed to locate any free extent. 7670 */ 7671 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, 7672 struct btrfs_free_cluster *last_ptr, 7673 struct btrfs_key *ins, 7674 struct find_free_extent_ctl *ffe_ctl, 7675 int full_search, bool use_cluster) 7676 { 7677 struct btrfs_root *root = fs_info->extent_root; 7678 int ret; 7679 7680 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && 7681 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) 7682 ffe_ctl->orig_have_caching_bg = true; 7683 7684 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && 7685 ffe_ctl->have_caching_bg) 7686 return 1; 7687 7688 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) 7689 return 1; 7690 7691 if (ins->objectid) { 7692 if (!use_cluster && last_ptr) { 7693 spin_lock(&last_ptr->lock); 7694 last_ptr->window_start = ins->objectid; 7695 spin_unlock(&last_ptr->lock); 7696 } 7697 return 0; 7698 } 7699 7700 /* 7701 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7702 * caching kthreads as we move along 7703 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7704 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7705 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7706 * again 7707 */ 7708 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { 7709 ffe_ctl->index = 0; 7710 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { 7711 /* 7712 * We want to skip the LOOP_CACHING_WAIT step if we 7713 * don't have any uncached bgs and we've already done a 7714 * full search through. 
7715 */ 7716 if (ffe_ctl->orig_have_caching_bg || !full_search) 7717 ffe_ctl->loop = LOOP_CACHING_WAIT; 7718 else 7719 ffe_ctl->loop = LOOP_ALLOC_CHUNK; 7720 } else { 7721 ffe_ctl->loop++; 7722 } 7723 7724 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 7725 struct btrfs_trans_handle *trans; 7726 int exist = 0; 7727 7728 trans = current->journal_info; 7729 if (trans) 7730 exist = 1; 7731 else 7732 trans = btrfs_join_transaction(root); 7733 7734 if (IS_ERR(trans)) { 7735 ret = PTR_ERR(trans); 7736 return ret; 7737 } 7738 7739 ret = do_chunk_alloc(trans, ffe_ctl->flags, 7740 CHUNK_ALLOC_FORCE); 7741 7742 /* 7743 * If we can't allocate a new chunk we've already looped 7744 * through at least once, move on to the NO_EMPTY_SIZE 7745 * case. 7746 */ 7747 if (ret == -ENOSPC) 7748 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; 7749 7750 /* Do not bail out on ENOSPC since we can do more. */ 7751 if (ret < 0 && ret != -ENOSPC) 7752 btrfs_abort_transaction(trans, ret); 7753 else 7754 ret = 0; 7755 if (!exist) 7756 btrfs_end_transaction(trans); 7757 if (ret) 7758 return ret; 7759 } 7760 7761 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 7762 /* 7763 * Don't loop again if we already have no empty_size and 7764 * no empty_cluster. 7765 */ 7766 if (ffe_ctl->empty_size == 0 && 7767 ffe_ctl->empty_cluster == 0) 7768 return -ENOSPC; 7769 ffe_ctl->empty_size = 0; 7770 ffe_ctl->empty_cluster = 0; 7771 } 7772 return 1; 7773 } 7774 return -ENOSPC; 7775 } 7776 7777 /* 7778 * walks the btree of allocated extents and find a hole of a given size. 7779 * The key ins is changed to record the hole: 7780 * ins->objectid == start position 7781 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7782 * ins->offset == the size of the hole. 7783 * Any available blocks before search_start are skipped. 7784 * 7785 * If there is no suitable free space, we will record the max size of 7786 * the free space extent currently. 
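 * That recorded size is handed back through ins->offset (and cached in
 * space_info->max_extent_size) when -ENOSPC is returned, so that
 * btrfs_reserve_extent() can retry with a smaller allocation.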
7787 * 7788 * The overall logic and call chain: 7789 * 7790 * find_free_extent() 7791 * |- Iterate through all block groups 7792 * | |- Get a valid block group 7793 * | |- Try to do clustered allocation in that block group 7794 * | |- Try to do unclustered allocation in that block group 7795 * | |- Check if the result is valid 7796 * | | |- If valid, then exit 7797 * | |- Jump to next block group 7798 * | 7799 * |- Push harder to find free extents 7800 * |- If not found, re-iterate all block groups 7801 */ 7802 static noinline int find_free_extent(struct btrfs_fs_info *fs_info, 7803 u64 ram_bytes, u64 num_bytes, u64 empty_size, 7804 u64 hint_byte, struct btrfs_key *ins, 7805 u64 flags, int delalloc) 7806 { 7807 int ret = 0; 7808 struct btrfs_free_cluster *last_ptr = NULL; 7809 struct btrfs_block_group_cache *block_group = NULL; 7810 struct find_free_extent_ctl ffe_ctl = {0}; 7811 struct btrfs_space_info *space_info; 7812 bool use_cluster = true; 7813 bool full_search = false; 7814 7815 WARN_ON(num_bytes < fs_info->sectorsize); 7816 7817 ffe_ctl.ram_bytes = ram_bytes; 7818 ffe_ctl.num_bytes = num_bytes; 7819 ffe_ctl.empty_size = empty_size; 7820 ffe_ctl.flags = flags; 7821 ffe_ctl.search_start = 0; 7822 ffe_ctl.retry_clustered = false; 7823 ffe_ctl.retry_unclustered = false; 7824 ffe_ctl.delalloc = delalloc; 7825 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); 7826 ffe_ctl.have_caching_bg = false; 7827 ffe_ctl.orig_have_caching_bg = false; 7828 ffe_ctl.found_offset = 0; 7829 7830 ins->type = BTRFS_EXTENT_ITEM_KEY; 7831 ins->objectid = 0; 7832 ins->offset = 0; 7833 7834 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 7835 7836 space_info = __find_space_info(fs_info, flags); 7837 if (!space_info) { 7838 btrfs_err(fs_info, "No space info for %llu", flags); 7839 return -ENOSPC; 7840 } 7841 7842 /* 7843 * If our free space is heavily fragmented we may not be able to make 7844 * big contiguous allocations, so instead of doing the expensive search 7845 * for free space, simply return ENOSPC with our max_extent_size so we 7846 * can go ahead and search for a more manageable chunk. 7847 * 7848 * If our max_extent_size is large enough for our allocation simply 7849 * disable clustering since we will likely not be able to find enough 7850 * space to create a cluster and induce latency trying. 7851 */ 7852 if (unlikely(space_info->max_extent_size)) { 7853 spin_lock(&space_info->lock); 7854 if (space_info->max_extent_size && 7855 num_bytes > space_info->max_extent_size) { 7856 ins->offset = space_info->max_extent_size; 7857 spin_unlock(&space_info->lock); 7858 return -ENOSPC; 7859 } else if (space_info->max_extent_size) { 7860 use_cluster = false; 7861 } 7862 spin_unlock(&space_info->lock); 7863 } 7864 7865 last_ptr = fetch_cluster_info(fs_info, space_info, 7866 &ffe_ctl.empty_cluster); 7867 if (last_ptr) { 7868 spin_lock(&last_ptr->lock); 7869 if (last_ptr->block_group) 7870 hint_byte = last_ptr->window_start; 7871 if (last_ptr->fragmented) { 7872 /* 7873 * We still set window_start so we can keep track of the 7874 * last place we found an allocation to try and save 7875 * some time. 
7876 */ 7877 hint_byte = last_ptr->window_start; 7878 use_cluster = false; 7879 } 7880 spin_unlock(&last_ptr->lock); 7881 } 7882 7883 ffe_ctl.search_start = max(ffe_ctl.search_start, 7884 first_logical_byte(fs_info, 0)); 7885 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); 7886 if (ffe_ctl.search_start == hint_byte) { 7887 block_group = btrfs_lookup_block_group(fs_info, 7888 ffe_ctl.search_start); 7889 /* 7890 * we don't want to use the block group if it doesn't match our 7891 * allocation bits, or if its not cached. 7892 * 7893 * However if we are re-searching with an ideal block group 7894 * picked out then we don't care that the block group is cached. 7895 */ 7896 if (block_group && block_group_bits(block_group, flags) && 7897 block_group->cached != BTRFS_CACHE_NO) { 7898 down_read(&space_info->groups_sem); 7899 if (list_empty(&block_group->list) || 7900 block_group->ro) { 7901 /* 7902 * someone is removing this block group, 7903 * we can't jump into the have_block_group 7904 * target because our list pointers are not 7905 * valid 7906 */ 7907 btrfs_put_block_group(block_group); 7908 up_read(&space_info->groups_sem); 7909 } else { 7910 ffe_ctl.index = btrfs_bg_flags_to_raid_index( 7911 block_group->flags); 7912 btrfs_lock_block_group(block_group, delalloc); 7913 goto have_block_group; 7914 } 7915 } else if (block_group) { 7916 btrfs_put_block_group(block_group); 7917 } 7918 } 7919 search: 7920 ffe_ctl.have_caching_bg = false; 7921 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || 7922 ffe_ctl.index == 0) 7923 full_search = true; 7924 down_read(&space_info->groups_sem); 7925 list_for_each_entry(block_group, 7926 &space_info->block_groups[ffe_ctl.index], list) { 7927 /* If the block group is read-only, we can skip it entirely. */ 7928 if (unlikely(block_group->ro)) 7929 continue; 7930 7931 btrfs_grab_block_group(block_group, delalloc); 7932 ffe_ctl.search_start = block_group->key.objectid; 7933 7934 /* 7935 * this can happen if we end up cycling through all the 7936 * raid types, but we want to make sure we only allocate 7937 * for the proper type. 7938 */ 7939 if (!block_group_bits(block_group, flags)) { 7940 u64 extra = BTRFS_BLOCK_GROUP_DUP | 7941 BTRFS_BLOCK_GROUP_RAID1 | 7942 BTRFS_BLOCK_GROUP_RAID5 | 7943 BTRFS_BLOCK_GROUP_RAID6 | 7944 BTRFS_BLOCK_GROUP_RAID10; 7945 7946 /* 7947 * if they asked for extra copies and this block group 7948 * doesn't provide them, bail. This does allow us to 7949 * fill raid0 from raid1. 
7950 */ 7951 if ((flags & extra) && !(block_group->flags & extra)) 7952 goto loop; 7953 } 7954 7955 have_block_group: 7956 ffe_ctl.cached = block_group_cache_done(block_group); 7957 if (unlikely(!ffe_ctl.cached)) { 7958 ffe_ctl.have_caching_bg = true; 7959 ret = cache_block_group(block_group, 0); 7960 BUG_ON(ret < 0); 7961 ret = 0; 7962 } 7963 7964 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 7965 goto loop; 7966 7967 /* 7968 * Ok we want to try and use the cluster allocator, so 7969 * lets look there 7970 */ 7971 if (last_ptr && use_cluster) { 7972 struct btrfs_block_group_cache *cluster_bg = NULL; 7973 7974 ret = find_free_extent_clustered(block_group, last_ptr, 7975 &ffe_ctl, &cluster_bg); 7976 7977 if (ret == 0) { 7978 if (cluster_bg && cluster_bg != block_group) { 7979 btrfs_release_block_group(block_group, 7980 delalloc); 7981 block_group = cluster_bg; 7982 } 7983 goto checks; 7984 } else if (ret == -EAGAIN) { 7985 goto have_block_group; 7986 } else if (ret > 0) { 7987 goto loop; 7988 } 7989 /* ret == -ENOENT case falls through */ 7990 } 7991 7992 ret = find_free_extent_unclustered(block_group, last_ptr, 7993 &ffe_ctl); 7994 if (ret == -EAGAIN) 7995 goto have_block_group; 7996 else if (ret > 0) 7997 goto loop; 7998 /* ret == 0 case falls through */ 7999 checks: 8000 ffe_ctl.search_start = round_up(ffe_ctl.found_offset, 8001 fs_info->stripesize); 8002 8003 /* move on to the next group */ 8004 if (ffe_ctl.search_start + num_bytes > 8005 block_group->key.objectid + block_group->key.offset) { 8006 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 8007 num_bytes); 8008 goto loop; 8009 } 8010 8011 if (ffe_ctl.found_offset < ffe_ctl.search_start) 8012 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 8013 ffe_ctl.search_start - ffe_ctl.found_offset); 8014 8015 ret = btrfs_add_reserved_bytes(block_group, ram_bytes, 8016 num_bytes, delalloc); 8017 if (ret == -EAGAIN) { 8018 btrfs_add_free_space(block_group, ffe_ctl.found_offset, 8019 num_bytes); 8020 goto loop; 8021 } 8022 btrfs_inc_block_group_reservations(block_group); 8023 8024 /* we are all good, lets return */ 8025 ins->objectid = ffe_ctl.search_start; 8026 ins->offset = num_bytes; 8027 8028 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, 8029 num_bytes); 8030 btrfs_release_block_group(block_group, delalloc); 8031 break; 8032 loop: 8033 ffe_ctl.retry_clustered = false; 8034 ffe_ctl.retry_unclustered = false; 8035 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 8036 ffe_ctl.index); 8037 btrfs_release_block_group(block_group, delalloc); 8038 cond_resched(); 8039 } 8040 up_read(&space_info->groups_sem); 8041 8042 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, 8043 full_search, use_cluster); 8044 if (ret > 0) 8045 goto search; 8046 8047 if (ret == -ENOSPC) { 8048 /* 8049 * Use ffe_ctl->total_free_space as fallback if we can't find 8050 * any contiguous hole. 
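 * (total_free_space is the largest per block group free space we have
 * seen, not a contiguous extent, so this fallback may still overestimate
 * the hole we can actually allocate.)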
8051 */ 8052 if (!ffe_ctl.max_extent_size) 8053 ffe_ctl.max_extent_size = ffe_ctl.total_free_space; 8054 spin_lock(&space_info->lock); 8055 space_info->max_extent_size = ffe_ctl.max_extent_size; 8056 spin_unlock(&space_info->lock); 8057 ins->offset = ffe_ctl.max_extent_size; 8058 } 8059 return ret; 8060 } 8061 8062 static void dump_space_info(struct btrfs_fs_info *fs_info, 8063 struct btrfs_space_info *info, u64 bytes, 8064 int dump_block_groups) 8065 { 8066 struct btrfs_block_group_cache *cache; 8067 int index = 0; 8068 8069 spin_lock(&info->lock); 8070 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 8071 info->flags, 8072 info->total_bytes - btrfs_space_info_used(info, true), 8073 info->full ? "" : "not "); 8074 btrfs_info(fs_info, 8075 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 8076 info->total_bytes, info->bytes_used, info->bytes_pinned, 8077 info->bytes_reserved, info->bytes_may_use, 8078 info->bytes_readonly); 8079 spin_unlock(&info->lock); 8080 8081 if (!dump_block_groups) 8082 return; 8083 8084 down_read(&info->groups_sem); 8085 again: 8086 list_for_each_entry(cache, &info->block_groups[index], list) { 8087 spin_lock(&cache->lock); 8088 btrfs_info(fs_info, 8089 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 8090 cache->key.objectid, cache->key.offset, 8091 btrfs_block_group_used(&cache->item), cache->pinned, 8092 cache->reserved, cache->ro ? "[readonly]" : ""); 8093 btrfs_dump_free_space(cache, bytes); 8094 spin_unlock(&cache->lock); 8095 } 8096 if (++index < BTRFS_NR_RAID_TYPES) 8097 goto again; 8098 up_read(&info->groups_sem); 8099 } 8100 8101 /* 8102 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 8103 * hole that is at least as big as @num_bytes. 8104 * 8105 * @root - The root that will contain this extent 8106 * 8107 * @ram_bytes - The amount of space in ram that @num_bytes take. This 8108 * is used for accounting purposes. This value differs 8109 * from @num_bytes only in the case of compressed extents. 8110 * 8111 * @num_bytes - Number of bytes to allocate on-disk. 8112 * 8113 * @min_alloc_size - Indicates the minimum amount of space that the 8114 * allocator should try to satisfy. In some cases 8115 * @num_bytes may be larger than what is required and if 8116 * the filesystem is fragmented then allocation fails. 8117 * However, the presence of @min_alloc_size gives a 8118 * chance to try and satisfy the smaller allocation. 8119 * 8120 * @empty_size - A hint that you plan on doing more COW. This is the 8121 * size in bytes the allocator should try to find free 8122 * next to the block it returns. This is just a hint and 8123 * may be ignored by the allocator. 8124 * 8125 * @hint_byte - Hint to the allocator to start searching above the byte 8126 * address passed. It might be ignored. 8127 * 8128 * @ins - This key is modified to record the found hole. It will 8129 * have the following values: 8130 * ins->objectid == start position 8131 * ins->flags = BTRFS_EXTENT_ITEM_KEY 8132 * ins->offset == the size of the hole. 8133 * 8134 * @is_data - Boolean flag indicating whether an extent is 8135 * allocated for data (true) or metadata (false) 8136 * 8137 * @delalloc - Boolean flag indicating whether this allocation is for 8138 * delalloc or not. If 'true' data_rwsem of block groups 8139 * is going to be acquired. 8140 * 8141 * 8142 * Returns 0 when an allocation succeeded or < 0 when an error occurred. 
In 8143 * case -ENOSPC is returned then @ins->offset will contain the size of the 8144 * largest available hole the allocator managed to find. 8145 */ 8146 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 8147 u64 num_bytes, u64 min_alloc_size, 8148 u64 empty_size, u64 hint_byte, 8149 struct btrfs_key *ins, int is_data, int delalloc) 8150 { 8151 struct btrfs_fs_info *fs_info = root->fs_info; 8152 bool final_tried = num_bytes == min_alloc_size; 8153 u64 flags; 8154 int ret; 8155 8156 flags = get_alloc_profile_by_root(root, is_data); 8157 again: 8158 WARN_ON(num_bytes < fs_info->sectorsize); 8159 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 8160 hint_byte, ins, flags, delalloc); 8161 if (!ret && !is_data) { 8162 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 8163 } else if (ret == -ENOSPC) { 8164 if (!final_tried && ins->offset) { 8165 num_bytes = min(num_bytes >> 1, ins->offset); 8166 num_bytes = round_down(num_bytes, 8167 fs_info->sectorsize); 8168 num_bytes = max(num_bytes, min_alloc_size); 8169 ram_bytes = num_bytes; 8170 if (num_bytes == min_alloc_size) 8171 final_tried = true; 8172 goto again; 8173 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8174 struct btrfs_space_info *sinfo; 8175 8176 sinfo = __find_space_info(fs_info, flags); 8177 btrfs_err(fs_info, 8178 "allocation failed flags %llu, wanted %llu", 8179 flags, num_bytes); 8180 if (sinfo) 8181 dump_space_info(fs_info, sinfo, num_bytes, 1); 8182 } 8183 } 8184 8185 return ret; 8186 } 8187 8188 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8189 u64 start, u64 len, 8190 int pin, int delalloc) 8191 { 8192 struct btrfs_block_group_cache *cache; 8193 int ret = 0; 8194 8195 cache = btrfs_lookup_block_group(fs_info, start); 8196 if (!cache) { 8197 btrfs_err(fs_info, "Unable to find block group for %llu", 8198 start); 8199 return -ENOSPC; 8200 } 8201 8202 if (pin) 8203 pin_down_extent(fs_info, cache, start, len, 1); 8204 else { 8205 if (btrfs_test_opt(fs_info, DISCARD)) 8206 ret = btrfs_discard_extent(fs_info, start, len, NULL); 8207 btrfs_add_free_space(cache, start, len); 8208 btrfs_free_reserved_bytes(cache, len, delalloc); 8209 trace_btrfs_reserved_extent_free(fs_info, start, len); 8210 } 8211 8212 btrfs_put_block_group(cache); 8213 return ret; 8214 } 8215 8216 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8217 u64 start, u64 len, int delalloc) 8218 { 8219 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 8220 } 8221 8222 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 8223 u64 start, u64 len) 8224 { 8225 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 8226 } 8227 8228 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8229 u64 parent, u64 root_objectid, 8230 u64 flags, u64 owner, u64 offset, 8231 struct btrfs_key *ins, int ref_mod) 8232 { 8233 struct btrfs_fs_info *fs_info = trans->fs_info; 8234 int ret; 8235 struct btrfs_extent_item *extent_item; 8236 struct btrfs_extent_inline_ref *iref; 8237 struct btrfs_path *path; 8238 struct extent_buffer *leaf; 8239 int type; 8240 u32 size; 8241 8242 if (parent > 0) 8243 type = BTRFS_SHARED_DATA_REF_KEY; 8244 else 8245 type = BTRFS_EXTENT_DATA_REF_KEY; 8246 8247 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8248 8249 path = btrfs_alloc_path(); 8250 if (!path) 8251 return -ENOMEM; 8252 8253 path->leave_spinning = 1; 8254 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8255 ins, size); 
8256 if (ret) { 8257 btrfs_free_path(path); 8258 return ret; 8259 } 8260 8261 leaf = path->nodes[0]; 8262 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8263 struct btrfs_extent_item); 8264 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 8265 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8266 btrfs_set_extent_flags(leaf, extent_item, 8267 flags | BTRFS_EXTENT_FLAG_DATA); 8268 8269 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8270 btrfs_set_extent_inline_ref_type(leaf, iref, type); 8271 if (parent > 0) { 8272 struct btrfs_shared_data_ref *ref; 8273 ref = (struct btrfs_shared_data_ref *)(iref + 1); 8274 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8275 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 8276 } else { 8277 struct btrfs_extent_data_ref *ref; 8278 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 8279 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 8280 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 8281 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 8282 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 8283 } 8284 8285 btrfs_mark_buffer_dirty(path->nodes[0]); 8286 btrfs_free_path(path); 8287 8288 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); 8289 if (ret) 8290 return ret; 8291 8292 ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); 8293 if (ret) { /* -ENOENT, logic error */ 8294 btrfs_err(fs_info, "update block group failed for %llu %llu", 8295 ins->objectid, ins->offset); 8296 BUG(); 8297 } 8298 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 8299 return ret; 8300 } 8301 8302 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 8303 struct btrfs_delayed_ref_node *node, 8304 struct btrfs_delayed_extent_op *extent_op) 8305 { 8306 struct btrfs_fs_info *fs_info = trans->fs_info; 8307 int ret; 8308 struct btrfs_extent_item *extent_item; 8309 struct btrfs_key extent_key; 8310 struct btrfs_tree_block_info *block_info; 8311 struct btrfs_extent_inline_ref *iref; 8312 struct btrfs_path *path; 8313 struct extent_buffer *leaf; 8314 struct btrfs_delayed_tree_ref *ref; 8315 u32 size = sizeof(*extent_item) + sizeof(*iref); 8316 u64 num_bytes; 8317 u64 flags = extent_op->flags_to_set; 8318 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8319 8320 ref = btrfs_delayed_node_to_tree_ref(node); 8321 8322 extent_key.objectid = node->bytenr; 8323 if (skinny_metadata) { 8324 extent_key.offset = ref->level; 8325 extent_key.type = BTRFS_METADATA_ITEM_KEY; 8326 num_bytes = fs_info->nodesize; 8327 } else { 8328 extent_key.offset = node->num_bytes; 8329 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 8330 size += sizeof(*block_info); 8331 num_bytes = node->num_bytes; 8332 } 8333 8334 path = btrfs_alloc_path(); 8335 if (!path) 8336 return -ENOMEM; 8337 8338 path->leave_spinning = 1; 8339 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8340 &extent_key, size); 8341 if (ret) { 8342 btrfs_free_path(path); 8343 return ret; 8344 } 8345 8346 leaf = path->nodes[0]; 8347 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8348 struct btrfs_extent_item); 8349 btrfs_set_extent_refs(leaf, extent_item, 1); 8350 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8351 btrfs_set_extent_flags(leaf, extent_item, 8352 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 8353 8354 if (skinny_metadata) { 8355 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8356 } else { 8357 block_info = (struct btrfs_tree_block_info 
*)(extent_item + 1); 8358 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 8359 btrfs_set_tree_block_level(leaf, block_info, ref->level); 8360 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 8361 } 8362 8363 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 8364 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 8365 btrfs_set_extent_inline_ref_type(leaf, iref, 8366 BTRFS_SHARED_BLOCK_REF_KEY); 8367 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); 8368 } else { 8369 btrfs_set_extent_inline_ref_type(leaf, iref, 8370 BTRFS_TREE_BLOCK_REF_KEY); 8371 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); 8372 } 8373 8374 btrfs_mark_buffer_dirty(leaf); 8375 btrfs_free_path(path); 8376 8377 ret = remove_from_free_space_tree(trans, extent_key.objectid, 8378 num_bytes); 8379 if (ret) 8380 return ret; 8381 8382 ret = update_block_group(trans, fs_info, extent_key.objectid, 8383 fs_info->nodesize, 1); 8384 if (ret) { /* -ENOENT, logic error */ 8385 btrfs_err(fs_info, "update block group failed for %llu %llu", 8386 extent_key.objectid, extent_key.offset); 8387 BUG(); 8388 } 8389 8390 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, 8391 fs_info->nodesize); 8392 return ret; 8393 } 8394 8395 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8396 struct btrfs_root *root, u64 owner, 8397 u64 offset, u64 ram_bytes, 8398 struct btrfs_key *ins) 8399 { 8400 int ret; 8401 8402 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 8403 8404 btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, 8405 root->root_key.objectid, owner, offset, 8406 BTRFS_ADD_DELAYED_EXTENT); 8407 8408 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, 8409 ins->offset, 0, 8410 root->root_key.objectid, owner, 8411 offset, ram_bytes, 8412 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); 8413 return ret; 8414 } 8415 8416 /* 8417 * this is used by the tree logging recovery code. It records that 8418 * an extent has been allocated and makes sure to clear the free 8419 * space cache bits as well 8420 */ 8421 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 8422 u64 root_objectid, u64 owner, u64 offset, 8423 struct btrfs_key *ins) 8424 { 8425 struct btrfs_fs_info *fs_info = trans->fs_info; 8426 int ret; 8427 struct btrfs_block_group_cache *block_group; 8428 struct btrfs_space_info *space_info; 8429 8430 /* 8431 * Mixed block groups will exclude before processing the log so we only 8432 * need to do the exclude dance if this fs isn't mixed. 
8433 */ 8434 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 8435 ret = __exclude_logged_extent(fs_info, ins->objectid, 8436 ins->offset); 8437 if (ret) 8438 return ret; 8439 } 8440 8441 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 8442 if (!block_group) 8443 return -EINVAL; 8444 8445 space_info = block_group->space_info; 8446 spin_lock(&space_info->lock); 8447 spin_lock(&block_group->lock); 8448 space_info->bytes_reserved += ins->offset; 8449 block_group->reserved += ins->offset; 8450 spin_unlock(&block_group->lock); 8451 spin_unlock(&space_info->lock); 8452 8453 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 8454 offset, ins, 1); 8455 btrfs_put_block_group(block_group); 8456 return ret; 8457 } 8458 8459 static struct extent_buffer * 8460 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8461 u64 bytenr, int level, u64 owner) 8462 { 8463 struct btrfs_fs_info *fs_info = root->fs_info; 8464 struct extent_buffer *buf; 8465 8466 buf = btrfs_find_create_tree_block(fs_info, bytenr); 8467 if (IS_ERR(buf)) 8468 return buf; 8469 8470 /* 8471 * Extra safety check in case the extent tree is corrupted and extent 8472 * allocator chooses to use a tree block which is already used and 8473 * locked. 8474 */ 8475 if (buf->lock_owner == current->pid) { 8476 btrfs_err_rl(fs_info, 8477 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 8478 buf->start, btrfs_header_owner(buf), current->pid); 8479 free_extent_buffer(buf); 8480 return ERR_PTR(-EUCLEAN); 8481 } 8482 8483 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8484 btrfs_tree_lock(buf); 8485 clean_tree_block(fs_info, buf); 8486 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8487 8488 btrfs_set_lock_blocking(buf); 8489 set_extent_buffer_uptodate(buf); 8490 8491 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 8492 btrfs_set_header_level(buf, level); 8493 btrfs_set_header_bytenr(buf, buf->start); 8494 btrfs_set_header_generation(buf, trans->transid); 8495 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 8496 btrfs_set_header_owner(buf, owner); 8497 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 8498 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 8499 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8500 buf->log_index = root->log_transid % 2; 8501 /* 8502 * we allow two log transactions at a time, use different 8503 * EXTENT bit to differentiate dirty pages. 
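 * (log_index 0 marks its pages via set_extent_dirty(), log_index 1 via
 * set_extent_new(), see below.)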
8504 */ 8505 if (buf->log_index == 0) 8506 set_extent_dirty(&root->dirty_log_pages, buf->start, 8507 buf->start + buf->len - 1, GFP_NOFS); 8508 else 8509 set_extent_new(&root->dirty_log_pages, buf->start, 8510 buf->start + buf->len - 1); 8511 } else { 8512 buf->log_index = -1; 8513 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8514 buf->start + buf->len - 1, GFP_NOFS); 8515 } 8516 trans->dirty = true; 8517 /* this returns a buffer locked for blocking */ 8518 return buf; 8519 } 8520 8521 static struct btrfs_block_rsv * 8522 use_block_rsv(struct btrfs_trans_handle *trans, 8523 struct btrfs_root *root, u32 blocksize) 8524 { 8525 struct btrfs_fs_info *fs_info = root->fs_info; 8526 struct btrfs_block_rsv *block_rsv; 8527 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 8528 int ret; 8529 bool global_updated = false; 8530 8531 block_rsv = get_block_rsv(trans, root); 8532 8533 if (unlikely(block_rsv->size == 0)) 8534 goto try_reserve; 8535 again: 8536 ret = block_rsv_use_bytes(block_rsv, blocksize); 8537 if (!ret) 8538 return block_rsv; 8539 8540 if (block_rsv->failfast) 8541 return ERR_PTR(ret); 8542 8543 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8544 global_updated = true; 8545 update_global_block_rsv(fs_info); 8546 goto again; 8547 } 8548 8549 /* 8550 * The global reserve still exists to save us from ourselves, so don't 8551 * warn_on if we are short on our delayed refs reserve. 8552 */ 8553 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && 8554 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8555 static DEFINE_RATELIMIT_STATE(_rs, 8556 DEFAULT_RATELIMIT_INTERVAL * 10, 8557 /*DEFAULT_RATELIMIT_BURST*/ 1); 8558 if (__ratelimit(&_rs)) 8559 WARN(1, KERN_DEBUG 8560 "BTRFS: block rsv returned %d\n", ret); 8561 } 8562 try_reserve: 8563 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8564 BTRFS_RESERVE_NO_FLUSH); 8565 if (!ret) 8566 return block_rsv; 8567 /* 8568 * If we couldn't reserve metadata bytes try and use some from 8569 * the global reserve if its space type is the same as the global 8570 * reservation. 8571 */ 8572 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8573 block_rsv->space_info == global_rsv->space_info) { 8574 ret = block_rsv_use_bytes(global_rsv, blocksize); 8575 if (!ret) 8576 return global_rsv; 8577 } 8578 return ERR_PTR(ret); 8579 } 8580 8581 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8582 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8583 { 8584 block_rsv_add_bytes(block_rsv, blocksize, false); 8585 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); 8586 } 8587 8588 /* 8589 * finds a free extent and does all the dirty work required for allocation 8590 * returns the tree buffer or an ERR_PTR on error. 
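 * On failure the error paths below unwind in reverse order of setup: the
 * delayed extent op, the new buffer, the reserved extent and finally the
 * block reservation.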
8591 */ 8592 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8593 struct btrfs_root *root, 8594 u64 parent, u64 root_objectid, 8595 const struct btrfs_disk_key *key, 8596 int level, u64 hint, 8597 u64 empty_size) 8598 { 8599 struct btrfs_fs_info *fs_info = root->fs_info; 8600 struct btrfs_key ins; 8601 struct btrfs_block_rsv *block_rsv; 8602 struct extent_buffer *buf; 8603 struct btrfs_delayed_extent_op *extent_op; 8604 u64 flags = 0; 8605 int ret; 8606 u32 blocksize = fs_info->nodesize; 8607 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8608 8609 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8610 if (btrfs_is_testing(fs_info)) { 8611 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8612 level, root_objectid); 8613 if (!IS_ERR(buf)) 8614 root->alloc_bytenr += blocksize; 8615 return buf; 8616 } 8617 #endif 8618 8619 block_rsv = use_block_rsv(trans, root, blocksize); 8620 if (IS_ERR(block_rsv)) 8621 return ERR_CAST(block_rsv); 8622 8623 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 8624 empty_size, hint, &ins, 0, 0); 8625 if (ret) 8626 goto out_unuse; 8627 8628 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 8629 root_objectid); 8630 if (IS_ERR(buf)) { 8631 ret = PTR_ERR(buf); 8632 goto out_free_reserved; 8633 } 8634 8635 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8636 if (parent == 0) 8637 parent = ins.objectid; 8638 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8639 } else 8640 BUG_ON(parent > 0); 8641 8642 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8643 extent_op = btrfs_alloc_delayed_extent_op(); 8644 if (!extent_op) { 8645 ret = -ENOMEM; 8646 goto out_free_buf; 8647 } 8648 if (key) 8649 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8650 else 8651 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8652 extent_op->flags_to_set = flags; 8653 extent_op->update_key = skinny_metadata ? 
false : true; 8654 extent_op->update_flags = true; 8655 extent_op->is_data = false; 8656 extent_op->level = level; 8657 8658 btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, 8659 root_objectid, level, 0, 8660 BTRFS_ADD_DELAYED_EXTENT); 8661 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, 8662 ins.offset, parent, 8663 root_objectid, level, 8664 BTRFS_ADD_DELAYED_EXTENT, 8665 extent_op, NULL, NULL); 8666 if (ret) 8667 goto out_free_delayed; 8668 } 8669 return buf; 8670 8671 out_free_delayed: 8672 btrfs_free_delayed_extent_op(extent_op); 8673 out_free_buf: 8674 free_extent_buffer(buf); 8675 out_free_reserved: 8676 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 8677 out_unuse: 8678 unuse_block_rsv(fs_info, block_rsv, blocksize); 8679 return ERR_PTR(ret); 8680 } 8681 8682 struct walk_control { 8683 u64 refs[BTRFS_MAX_LEVEL]; 8684 u64 flags[BTRFS_MAX_LEVEL]; 8685 struct btrfs_key update_progress; 8686 int stage; 8687 int level; 8688 int shared_level; 8689 int update_ref; 8690 int keep_locks; 8691 int reada_slot; 8692 int reada_count; 8693 }; 8694 8695 #define DROP_REFERENCE 1 8696 #define UPDATE_BACKREF 2 8697 8698 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8699 struct btrfs_root *root, 8700 struct walk_control *wc, 8701 struct btrfs_path *path) 8702 { 8703 struct btrfs_fs_info *fs_info = root->fs_info; 8704 u64 bytenr; 8705 u64 generation; 8706 u64 refs; 8707 u64 flags; 8708 u32 nritems; 8709 struct btrfs_key key; 8710 struct extent_buffer *eb; 8711 int ret; 8712 int slot; 8713 int nread = 0; 8714 8715 if (path->slots[wc->level] < wc->reada_slot) { 8716 wc->reada_count = wc->reada_count * 2 / 3; 8717 wc->reada_count = max(wc->reada_count, 2); 8718 } else { 8719 wc->reada_count = wc->reada_count * 3 / 2; 8720 wc->reada_count = min_t(int, wc->reada_count, 8721 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 8722 } 8723 8724 eb = path->nodes[wc->level]; 8725 nritems = btrfs_header_nritems(eb); 8726 8727 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8728 if (nread >= wc->reada_count) 8729 break; 8730 8731 cond_resched(); 8732 bytenr = btrfs_node_blockptr(eb, slot); 8733 generation = btrfs_node_ptr_generation(eb, slot); 8734 8735 if (slot == path->slots[wc->level]) 8736 goto reada; 8737 8738 if (wc->stage == UPDATE_BACKREF && 8739 generation <= root->root_key.offset) 8740 continue; 8741 8742 /* We don't lock the tree block, it's OK to be racy here */ 8743 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 8744 wc->level - 1, 1, &refs, 8745 &flags); 8746 /* We don't care about errors in readahead. */ 8747 if (ret < 0) 8748 continue; 8749 BUG_ON(refs == 0); 8750 8751 if (wc->stage == DROP_REFERENCE) { 8752 if (refs == 1) 8753 goto reada; 8754 8755 if (wc->level == 1 && 8756 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8757 continue; 8758 if (!wc->update_ref || 8759 generation <= root->root_key.offset) 8760 continue; 8761 btrfs_node_key_to_cpu(eb, &key, slot); 8762 ret = btrfs_comp_cpu_keys(&key, 8763 &wc->update_progress); 8764 if (ret < 0) 8765 continue; 8766 } else { 8767 if (wc->level == 1 && 8768 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8769 continue; 8770 } 8771 reada: 8772 readahead_tree_block(fs_info, bytenr); 8773 nread++; 8774 } 8775 wc->reada_slot = slot; 8776 } 8777 8778 /* 8779 * helper to process tree block while walking down the tree. 8780 * 8781 * when wc->stage == UPDATE_BACKREF, this function updates 8782 * back refs for pointers in the block. 8783 * 8784 * NOTE: return value 1 means we should stop walking down. 
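 * (that is the case when the block is shared (refs > 1) during
 * DROP_REFERENCE, or not owned by this root during UPDATE_BACKREF.)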
8785 */ 8786 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8787 struct btrfs_root *root, 8788 struct btrfs_path *path, 8789 struct walk_control *wc, int lookup_info) 8790 { 8791 struct btrfs_fs_info *fs_info = root->fs_info; 8792 int level = wc->level; 8793 struct extent_buffer *eb = path->nodes[level]; 8794 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8795 int ret; 8796 8797 if (wc->stage == UPDATE_BACKREF && 8798 btrfs_header_owner(eb) != root->root_key.objectid) 8799 return 1; 8800 8801 /* 8802 * when reference count of tree block is 1, it won't increase 8803 * again. once full backref flag is set, we never clear it. 8804 */ 8805 if (lookup_info && 8806 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8807 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8808 BUG_ON(!path->locks[level]); 8809 ret = btrfs_lookup_extent_info(trans, fs_info, 8810 eb->start, level, 1, 8811 &wc->refs[level], 8812 &wc->flags[level]); 8813 BUG_ON(ret == -ENOMEM); 8814 if (ret) 8815 return ret; 8816 BUG_ON(wc->refs[level] == 0); 8817 } 8818 8819 if (wc->stage == DROP_REFERENCE) { 8820 if (wc->refs[level] > 1) 8821 return 1; 8822 8823 if (path->locks[level] && !wc->keep_locks) { 8824 btrfs_tree_unlock_rw(eb, path->locks[level]); 8825 path->locks[level] = 0; 8826 } 8827 return 0; 8828 } 8829 8830 /* wc->stage == UPDATE_BACKREF */ 8831 if (!(wc->flags[level] & flag)) { 8832 BUG_ON(!path->locks[level]); 8833 ret = btrfs_inc_ref(trans, root, eb, 1); 8834 BUG_ON(ret); /* -ENOMEM */ 8835 ret = btrfs_dec_ref(trans, root, eb, 0); 8836 BUG_ON(ret); /* -ENOMEM */ 8837 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, 8838 eb->len, flag, 8839 btrfs_header_level(eb), 0); 8840 BUG_ON(ret); /* -ENOMEM */ 8841 wc->flags[level] |= flag; 8842 } 8843 8844 /* 8845 * the block is shared by multiple trees, so it's not good to 8846 * keep the tree lock 8847 */ 8848 if (path->locks[level] && level > 0) { 8849 btrfs_tree_unlock_rw(eb, path->locks[level]); 8850 path->locks[level] = 0; 8851 } 8852 return 0; 8853 } 8854 8855 /* 8856 * helper to process tree block pointer. 8857 * 8858 * when wc->stage == DROP_REFERENCE, this function checks 8859 * reference count of the block pointed to. if the block 8860 * is shared and we need update back refs for the subtree 8861 * rooted at the block, this function changes wc->stage to 8862 * UPDATE_BACKREF. if the block is shared and there is no 8863 * need to update back, this function drops the reference 8864 * to the block. 8865 * 8866 * NOTE: return value 1 means we should stop walking down. 
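 * When the reference to the lower block is dropped instead of walked
 * into, shared subtrees are also traced for qgroup accounting, except
 * for reloc trees which were already accounted at merge time.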
8867 */ 8868 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8869 struct btrfs_root *root, 8870 struct btrfs_path *path, 8871 struct walk_control *wc, int *lookup_info) 8872 { 8873 struct btrfs_fs_info *fs_info = root->fs_info; 8874 u64 bytenr; 8875 u64 generation; 8876 u64 parent; 8877 struct btrfs_key key; 8878 struct btrfs_key first_key; 8879 struct extent_buffer *next; 8880 int level = wc->level; 8881 int reada = 0; 8882 int ret = 0; 8883 bool need_account = false; 8884 8885 generation = btrfs_node_ptr_generation(path->nodes[level], 8886 path->slots[level]); 8887 /* 8888 * if the lower level block was created before the snapshot 8889 * was created, we know there is no need to update back refs 8890 * for the subtree 8891 */ 8892 if (wc->stage == UPDATE_BACKREF && 8893 generation <= root->root_key.offset) { 8894 *lookup_info = 1; 8895 return 1; 8896 } 8897 8898 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8899 btrfs_node_key_to_cpu(path->nodes[level], &first_key, 8900 path->slots[level]); 8901 8902 next = find_extent_buffer(fs_info, bytenr); 8903 if (!next) { 8904 next = btrfs_find_create_tree_block(fs_info, bytenr); 8905 if (IS_ERR(next)) 8906 return PTR_ERR(next); 8907 8908 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 8909 level - 1); 8910 reada = 1; 8911 } 8912 btrfs_tree_lock(next); 8913 btrfs_set_lock_blocking(next); 8914 8915 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 8916 &wc->refs[level - 1], 8917 &wc->flags[level - 1]); 8918 if (ret < 0) 8919 goto out_unlock; 8920 8921 if (unlikely(wc->refs[level - 1] == 0)) { 8922 btrfs_err(fs_info, "Missing references."); 8923 ret = -EIO; 8924 goto out_unlock; 8925 } 8926 *lookup_info = 0; 8927 8928 if (wc->stage == DROP_REFERENCE) { 8929 if (wc->refs[level - 1] > 1) { 8930 need_account = true; 8931 if (level == 1 && 8932 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8933 goto skip; 8934 8935 if (!wc->update_ref || 8936 generation <= root->root_key.offset) 8937 goto skip; 8938 8939 btrfs_node_key_to_cpu(path->nodes[level], &key, 8940 path->slots[level]); 8941 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 8942 if (ret < 0) 8943 goto skip; 8944 8945 wc->stage = UPDATE_BACKREF; 8946 wc->shared_level = level - 1; 8947 } 8948 } else { 8949 if (level == 1 && 8950 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8951 goto skip; 8952 } 8953 8954 if (!btrfs_buffer_uptodate(next, generation, 0)) { 8955 btrfs_tree_unlock(next); 8956 free_extent_buffer(next); 8957 next = NULL; 8958 *lookup_info = 1; 8959 } 8960 8961 if (!next) { 8962 if (reada && level == 1) 8963 reada_walk_down(trans, root, wc, path); 8964 next = read_tree_block(fs_info, bytenr, generation, level - 1, 8965 &first_key); 8966 if (IS_ERR(next)) { 8967 return PTR_ERR(next); 8968 } else if (!extent_buffer_uptodate(next)) { 8969 free_extent_buffer(next); 8970 return -EIO; 8971 } 8972 btrfs_tree_lock(next); 8973 btrfs_set_lock_blocking(next); 8974 } 8975 8976 level--; 8977 ASSERT(level == btrfs_header_level(next)); 8978 if (level != btrfs_header_level(next)) { 8979 btrfs_err(root->fs_info, "mismatched level"); 8980 ret = -EIO; 8981 goto out_unlock; 8982 } 8983 path->nodes[level] = next; 8984 path->slots[level] = 0; 8985 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8986 wc->level = level; 8987 if (wc->level == 1) 8988 wc->reada_slot = 0; 8989 return 0; 8990 skip: 8991 wc->refs[level - 1] = 0; 8992 wc->flags[level - 1] = 0; 8993 if (wc->stage == DROP_REFERENCE) { 8994 if (wc->flags[level] & 
BTRFS_BLOCK_FLAG_FULL_BACKREF) { 8995 parent = path->nodes[level]->start; 8996 } else { 8997 ASSERT(root->root_key.objectid == 8998 btrfs_header_owner(path->nodes[level])); 8999 if (root->root_key.objectid != 9000 btrfs_header_owner(path->nodes[level])) { 9001 btrfs_err(root->fs_info, 9002 "mismatched block owner"); 9003 ret = -EIO; 9004 goto out_unlock; 9005 } 9006 parent = 0; 9007 } 9008 9009 /* 9010 * Reloc tree doesn't contribute to qgroup numbers, and we have 9011 * already accounted them at merge time (replace_path), 9012 * thus we could skip expensive subtree trace here. 9013 */ 9014 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 9015 need_account) { 9016 ret = btrfs_qgroup_trace_subtree(trans, next, 9017 generation, level - 1); 9018 if (ret) { 9019 btrfs_err_rl(fs_info, 9020 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 9021 ret); 9022 } 9023 } 9024 ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, 9025 parent, root->root_key.objectid, 9026 level - 1, 0); 9027 if (ret) 9028 goto out_unlock; 9029 } 9030 9031 *lookup_info = 1; 9032 ret = 1; 9033 9034 out_unlock: 9035 btrfs_tree_unlock(next); 9036 free_extent_buffer(next); 9037 9038 return ret; 9039 } 9040 9041 /* 9042 * helper to process tree block while walking up the tree. 9043 * 9044 * when wc->stage == DROP_REFERENCE, this function drops 9045 * reference count on the block. 9046 * 9047 * when wc->stage == UPDATE_BACKREF, this function changes 9048 * wc->stage back to DROP_REFERENCE if we changed wc->stage 9049 * to UPDATE_BACKREF previously while processing the block. 9050 * 9051 * NOTE: return value 1 means we should stop walking up. 9052 */ 9053 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 9054 struct btrfs_root *root, 9055 struct btrfs_path *path, 9056 struct walk_control *wc) 9057 { 9058 struct btrfs_fs_info *fs_info = root->fs_info; 9059 int ret; 9060 int level = wc->level; 9061 struct extent_buffer *eb = path->nodes[level]; 9062 u64 parent = 0; 9063 9064 if (wc->stage == UPDATE_BACKREF) { 9065 BUG_ON(wc->shared_level < level); 9066 if (level < wc->shared_level) 9067 goto out; 9068 9069 ret = find_next_key(path, level + 1, &wc->update_progress); 9070 if (ret > 0) 9071 wc->update_ref = 0; 9072 9073 wc->stage = DROP_REFERENCE; 9074 wc->shared_level = -1; 9075 path->slots[level] = 0; 9076 9077 /* 9078 * check reference count again if the block isn't locked. 9079 * we should start walking down the tree again if reference 9080 * count is one. 
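 * (if it is one, we return 1 so the caller walks back down and frees the
 * no longer shared subtree under DROP_REFERENCE.)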
9081 */ 9082 if (!path->locks[level]) { 9083 BUG_ON(level == 0); 9084 btrfs_tree_lock(eb); 9085 btrfs_set_lock_blocking(eb); 9086 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9087 9088 ret = btrfs_lookup_extent_info(trans, fs_info, 9089 eb->start, level, 1, 9090 &wc->refs[level], 9091 &wc->flags[level]); 9092 if (ret < 0) { 9093 btrfs_tree_unlock_rw(eb, path->locks[level]); 9094 path->locks[level] = 0; 9095 return ret; 9096 } 9097 BUG_ON(wc->refs[level] == 0); 9098 if (wc->refs[level] == 1) { 9099 btrfs_tree_unlock_rw(eb, path->locks[level]); 9100 path->locks[level] = 0; 9101 return 1; 9102 } 9103 } 9104 } 9105 9106 /* wc->stage == DROP_REFERENCE */ 9107 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 9108 9109 if (wc->refs[level] == 1) { 9110 if (level == 0) { 9111 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9112 ret = btrfs_dec_ref(trans, root, eb, 1); 9113 else 9114 ret = btrfs_dec_ref(trans, root, eb, 0); 9115 BUG_ON(ret); /* -ENOMEM */ 9116 ret = btrfs_qgroup_trace_leaf_items(trans, eb); 9117 if (ret) { 9118 btrfs_err_rl(fs_info, 9119 "error %d accounting leaf items. Quota is out of sync, rescan required.", 9120 ret); 9121 } 9122 } 9123 /* make block locked assertion in clean_tree_block happy */ 9124 if (!path->locks[level] && 9125 btrfs_header_generation(eb) == trans->transid) { 9126 btrfs_tree_lock(eb); 9127 btrfs_set_lock_blocking(eb); 9128 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9129 } 9130 clean_tree_block(fs_info, eb); 9131 } 9132 9133 if (eb == root->node) { 9134 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9135 parent = eb->start; 9136 else if (root->root_key.objectid != btrfs_header_owner(eb)) 9137 goto owner_mismatch; 9138 } else { 9139 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 9140 parent = path->nodes[level + 1]->start; 9141 else if (root->root_key.objectid != 9142 btrfs_header_owner(path->nodes[level + 1])) 9143 goto owner_mismatch; 9144 } 9145 9146 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 9147 out: 9148 wc->refs[level] = 0; 9149 wc->flags[level] = 0; 9150 return 0; 9151 9152 owner_mismatch: 9153 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", 9154 btrfs_header_owner(eb), root->root_key.objectid); 9155 return -EUCLEAN; 9156 } 9157 9158 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 9159 struct btrfs_root *root, 9160 struct btrfs_path *path, 9161 struct walk_control *wc) 9162 { 9163 int level = wc->level; 9164 int lookup_info = 1; 9165 int ret; 9166 9167 while (level >= 0) { 9168 ret = walk_down_proc(trans, root, path, wc, lookup_info); 9169 if (ret > 0) 9170 break; 9171 9172 if (level == 0) 9173 break; 9174 9175 if (path->slots[level] >= 9176 btrfs_header_nritems(path->nodes[level])) 9177 break; 9178 9179 ret = do_walk_down(trans, root, path, wc, &lookup_info); 9180 if (ret > 0) { 9181 path->slots[level]++; 9182 continue; 9183 } else if (ret < 0) 9184 return ret; 9185 level = wc->level; 9186 } 9187 return 0; 9188 } 9189 9190 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 9191 struct btrfs_root *root, 9192 struct btrfs_path *path, 9193 struct walk_control *wc, int max_level) 9194 { 9195 int level = wc->level; 9196 int ret; 9197 9198 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 9199 while (level < max_level && path->nodes[level]) { 9200 wc->level = level; 9201 if (path->slots[level] + 1 < 9202 btrfs_header_nritems(path->nodes[level])) { 9203 path->slots[level]++; 9204 return 0; 9205 } else { 9206 ret = 
walk_up_proc(trans, root, path, wc); 9207 if (ret > 0) 9208 return 0; 9209 if (ret < 0) 9210 return ret; 9211 9212 if (path->locks[level]) { 9213 btrfs_tree_unlock_rw(path->nodes[level], 9214 path->locks[level]); 9215 path->locks[level] = 0; 9216 } 9217 free_extent_buffer(path->nodes[level]); 9218 path->nodes[level] = NULL; 9219 level++; 9220 } 9221 } 9222 return 1; 9223 } 9224 9225 /* 9226 * drop a subvolume tree. 9227 * 9228 * this function traverses the tree freeing any blocks that are only 9229 * referenced by the tree. 9230 * 9231 * when a shared tree block is found, this function decreases its 9232 * reference count by one. if update_ref is true, this function 9233 * also makes sure backrefs for the shared block and all lower level 9234 * blocks are properly updated. 9235 * 9236 * If called with for_reloc == 0, may exit early with -EAGAIN 9237 */ 9238 int btrfs_drop_snapshot(struct btrfs_root *root, 9239 struct btrfs_block_rsv *block_rsv, int update_ref, 9240 int for_reloc) 9241 { 9242 struct btrfs_fs_info *fs_info = root->fs_info; 9243 struct btrfs_path *path; 9244 struct btrfs_trans_handle *trans; 9245 struct btrfs_root *tree_root = fs_info->tree_root; 9246 struct btrfs_root_item *root_item = &root->root_item; 9247 struct walk_control *wc; 9248 struct btrfs_key key; 9249 int err = 0; 9250 int ret; 9251 int level; 9252 bool root_dropped = false; 9253 9254 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); 9255 9256 path = btrfs_alloc_path(); 9257 if (!path) { 9258 err = -ENOMEM; 9259 goto out; 9260 } 9261 9262 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9263 if (!wc) { 9264 btrfs_free_path(path); 9265 err = -ENOMEM; 9266 goto out; 9267 } 9268 9269 trans = btrfs_start_transaction(tree_root, 0); 9270 if (IS_ERR(trans)) { 9271 err = PTR_ERR(trans); 9272 goto out_free; 9273 } 9274 9275 err = btrfs_run_delayed_items(trans); 9276 if (err) 9277 goto out_end_trans; 9278 9279 if (block_rsv) 9280 trans->block_rsv = block_rsv; 9281 9282 /* 9283 * This will help us catch people modifying the fs tree while we're 9284 * dropping it. It is unsafe to mess with the fs tree while it's being 9285 * dropped as we unlock the root node and parent nodes as we walk down 9286 * the tree, assuming nothing will change. If something does change 9287 * then we'll have stale information and drop references to blocks we've 9288 * already dropped.
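 *
 * The walk below also stores its position in root_item->drop_progress
 * and root_item->drop_level, which is what allows an interrupted drop to
 * be resumed from the same key later.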
9289 */ 9290 set_bit(BTRFS_ROOT_DELETING, &root->state); 9291 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9292 level = btrfs_header_level(root->node); 9293 path->nodes[level] = btrfs_lock_root_node(root); 9294 btrfs_set_lock_blocking(path->nodes[level]); 9295 path->slots[level] = 0; 9296 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9297 memset(&wc->update_progress, 0, 9298 sizeof(wc->update_progress)); 9299 } else { 9300 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9301 memcpy(&wc->update_progress, &key, 9302 sizeof(wc->update_progress)); 9303 9304 level = root_item->drop_level; 9305 BUG_ON(level == 0); 9306 path->lowest_level = level; 9307 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9308 path->lowest_level = 0; 9309 if (ret < 0) { 9310 err = ret; 9311 goto out_end_trans; 9312 } 9313 WARN_ON(ret > 0); 9314 9315 /* 9316 * unlock our path, this is safe because only this 9317 * function is allowed to delete this snapshot 9318 */ 9319 btrfs_unlock_up_safe(path, 0); 9320 9321 level = btrfs_header_level(root->node); 9322 while (1) { 9323 btrfs_tree_lock(path->nodes[level]); 9324 btrfs_set_lock_blocking(path->nodes[level]); 9325 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9326 9327 ret = btrfs_lookup_extent_info(trans, fs_info, 9328 path->nodes[level]->start, 9329 level, 1, &wc->refs[level], 9330 &wc->flags[level]); 9331 if (ret < 0) { 9332 err = ret; 9333 goto out_end_trans; 9334 } 9335 BUG_ON(wc->refs[level] == 0); 9336 9337 if (level == root_item->drop_level) 9338 break; 9339 9340 btrfs_tree_unlock(path->nodes[level]); 9341 path->locks[level] = 0; 9342 WARN_ON(wc->refs[level] != 1); 9343 level--; 9344 } 9345 } 9346 9347 wc->level = level; 9348 wc->shared_level = -1; 9349 wc->stage = DROP_REFERENCE; 9350 wc->update_ref = update_ref; 9351 wc->keep_locks = 0; 9352 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9353 9354 while (1) { 9355 9356 ret = walk_down_tree(trans, root, path, wc); 9357 if (ret < 0) { 9358 err = ret; 9359 break; 9360 } 9361 9362 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9363 if (ret < 0) { 9364 err = ret; 9365 break; 9366 } 9367 9368 if (ret > 0) { 9369 BUG_ON(wc->stage != DROP_REFERENCE); 9370 break; 9371 } 9372 9373 if (wc->stage == DROP_REFERENCE) { 9374 level = wc->level; 9375 btrfs_node_key(path->nodes[level], 9376 &root_item->drop_progress, 9377 path->slots[level]); 9378 root_item->drop_level = level; 9379 } 9380 9381 BUG_ON(wc->level == 0); 9382 if (btrfs_should_end_transaction(trans) || 9383 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 9384 ret = btrfs_update_root(trans, tree_root, 9385 &root->root_key, 9386 root_item); 9387 if (ret) { 9388 btrfs_abort_transaction(trans, ret); 9389 err = ret; 9390 goto out_end_trans; 9391 } 9392 9393 btrfs_end_transaction_throttle(trans); 9394 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 9395 btrfs_debug(fs_info, 9396 "drop snapshot early exit"); 9397 err = -EAGAIN; 9398 goto out_free; 9399 } 9400 9401 trans = btrfs_start_transaction(tree_root, 0); 9402 if (IS_ERR(trans)) { 9403 err = PTR_ERR(trans); 9404 goto out_free; 9405 } 9406 if (block_rsv) 9407 trans->block_rsv = block_rsv; 9408 } 9409 } 9410 btrfs_release_path(path); 9411 if (err) 9412 goto out_end_trans; 9413 9414 ret = btrfs_del_root(trans, &root->root_key); 9415 if (ret) { 9416 btrfs_abort_transaction(trans, ret); 9417 err = ret; 9418 goto out_end_trans; 9419 } 9420 9421 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9422 ret = btrfs_find_root(tree_root, &root->root_key, 
path, 9423 NULL, NULL); 9424 if (ret < 0) { 9425 btrfs_abort_transaction(trans, ret); 9426 err = ret; 9427 goto out_end_trans; 9428 } else if (ret > 0) { 9429 /* if we fail to delete the orphan item this time 9430 * around, it'll get picked up the next time. 9431 * 9432 * The most common failure here is just -ENOENT. 9433 */ 9434 btrfs_del_orphan_item(trans, tree_root, 9435 root->root_key.objectid); 9436 } 9437 } 9438 9439 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9440 btrfs_add_dropped_root(trans, root); 9441 } else { 9442 free_extent_buffer(root->node); 9443 free_extent_buffer(root->commit_root); 9444 btrfs_put_fs_root(root); 9445 } 9446 root_dropped = true; 9447 out_end_trans: 9448 btrfs_end_transaction_throttle(trans); 9449 out_free: 9450 kfree(wc); 9451 btrfs_free_path(path); 9452 out: 9453 /* 9454 * So if we need to stop dropping the snapshot for whatever reason we 9455 * need to make sure to add it back to the dead root list so that we 9456 * keep trying to do the work later. This also cleans up roots if we 9457 * don't have it in the radix (like when we recover after a power fail 9458 * or unmount) so we don't leak memory. 9459 */ 9460 if (!for_reloc && !root_dropped) 9461 btrfs_add_dead_root(root); 9462 if (err && err != -EAGAIN) 9463 btrfs_handle_fs_error(fs_info, err, NULL); 9464 return err; 9465 } 9466 9467 /* 9468 * drop subtree rooted at tree block 'node'. 9469 * 9470 * NOTE: this function will unlock and release tree block 'node' 9471 * only used by relocation code 9472 */ 9473 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9474 struct btrfs_root *root, 9475 struct extent_buffer *node, 9476 struct extent_buffer *parent) 9477 { 9478 struct btrfs_fs_info *fs_info = root->fs_info; 9479 struct btrfs_path *path; 9480 struct walk_control *wc; 9481 int level; 9482 int parent_level; 9483 int ret = 0; 9484 int wret; 9485 9486 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9487 9488 path = btrfs_alloc_path(); 9489 if (!path) 9490 return -ENOMEM; 9491 9492 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9493 if (!wc) { 9494 btrfs_free_path(path); 9495 return -ENOMEM; 9496 } 9497 9498 btrfs_assert_tree_locked(parent); 9499 parent_level = btrfs_header_level(parent); 9500 extent_buffer_get(parent); 9501 path->nodes[parent_level] = parent; 9502 path->slots[parent_level] = btrfs_header_nritems(parent); 9503 9504 btrfs_assert_tree_locked(node); 9505 level = btrfs_header_level(node); 9506 path->nodes[level] = node; 9507 path->slots[level] = 0; 9508 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9509 9510 wc->refs[parent_level] = 1; 9511 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9512 wc->level = level; 9513 wc->shared_level = -1; 9514 wc->stage = DROP_REFERENCE; 9515 wc->update_ref = 0; 9516 wc->keep_locks = 1; 9517 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9518 9519 while (1) { 9520 wret = walk_down_tree(trans, root, path, wc); 9521 if (wret < 0) { 9522 ret = wret; 9523 break; 9524 } 9525 9526 wret = walk_up_tree(trans, root, path, wc, parent_level); 9527 if (wret < 0) 9528 ret = wret; 9529 if (wret != 0) 9530 break; 9531 } 9532 9533 kfree(wc); 9534 btrfs_free_path(path); 9535 return ret; 9536 } 9537 9538 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 9539 { 9540 u64 num_devices; 9541 u64 stripped; 9542 9543 /* 9544 * if restripe for this chunk_type is on pick target profile and 9545 * return, otherwise do the usual balance 9546 */ 9547 stripped = get_restripe_target(fs_info, flags); 9548 if (stripped) 9549 
return extended_to_chunk(stripped); 9550 9551 num_devices = fs_info->fs_devices->rw_devices; 9552 9553 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9554 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9555 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9556 9557 if (num_devices == 1) { 9558 stripped |= BTRFS_BLOCK_GROUP_DUP; 9559 stripped = flags & ~stripped; 9560 9561 /* turn raid0 into single device chunks */ 9562 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9563 return stripped; 9564 9565 /* turn mirroring into duplication */ 9566 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9567 BTRFS_BLOCK_GROUP_RAID10)) 9568 return stripped | BTRFS_BLOCK_GROUP_DUP; 9569 } else { 9570 /* they already had raid on here, just return */ 9571 if (flags & stripped) 9572 return flags; 9573 9574 stripped |= BTRFS_BLOCK_GROUP_DUP; 9575 stripped = flags & ~stripped; 9576 9577 /* switch duplicated blocks with raid1 */ 9578 if (flags & BTRFS_BLOCK_GROUP_DUP) 9579 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9580 9581 /* this is drive concat, leave it alone */ 9582 } 9583 9584 return flags; 9585 } 9586 9587 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9588 { 9589 struct btrfs_space_info *sinfo = cache->space_info; 9590 u64 num_bytes; 9591 u64 min_allocable_bytes; 9592 int ret = -ENOSPC; 9593 9594 /* 9595 * We need some metadata space and system metadata space for 9596 * allocating chunks in some corner cases until we force to set 9597 * it to be readonly. 9598 */ 9599 if ((sinfo->flags & 9600 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9601 !force) 9602 min_allocable_bytes = SZ_1M; 9603 else 9604 min_allocable_bytes = 0; 9605 9606 spin_lock(&sinfo->lock); 9607 spin_lock(&cache->lock); 9608 9609 if (cache->ro) { 9610 cache->ro++; 9611 ret = 0; 9612 goto out; 9613 } 9614 9615 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9616 cache->bytes_super - btrfs_block_group_used(&cache->item); 9617 9618 if (btrfs_space_info_used(sinfo, true) + num_bytes + 9619 min_allocable_bytes <= sinfo->total_bytes) { 9620 sinfo->bytes_readonly += num_bytes; 9621 cache->ro++; 9622 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9623 ret = 0; 9624 } 9625 out: 9626 spin_unlock(&cache->lock); 9627 spin_unlock(&sinfo->lock); 9628 return ret; 9629 } 9630 9631 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) 9632 9633 { 9634 struct btrfs_fs_info *fs_info = cache->fs_info; 9635 struct btrfs_trans_handle *trans; 9636 u64 alloc_flags; 9637 int ret; 9638 9639 again: 9640 trans = btrfs_join_transaction(fs_info->extent_root); 9641 if (IS_ERR(trans)) 9642 return PTR_ERR(trans); 9643 9644 /* 9645 * we're not allowed to set block groups readonly after the dirty 9646 * block groups cache has started writing. If it already started, 9647 * back off and let this transaction commit 9648 */ 9649 mutex_lock(&fs_info->ro_block_group_mutex); 9650 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9651 u64 transid = trans->transid; 9652 9653 mutex_unlock(&fs_info->ro_block_group_mutex); 9654 btrfs_end_transaction(trans); 9655 9656 ret = btrfs_wait_for_commit(fs_info, transid); 9657 if (ret) 9658 return ret; 9659 goto again; 9660 } 9661 9662 /* 9663 * if we are changing raid levels, try to allocate a corresponding 9664 * block group with the new raid level. 
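 * (update_block_group_flags() may also pick a reduced profile here, e.g.
 * DUP instead of RAID1 once only a single rw device is left.)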
9665 */ 9666 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9667 if (alloc_flags != cache->flags) { 9668 ret = do_chunk_alloc(trans, alloc_flags, 9669 CHUNK_ALLOC_FORCE); 9670 /* 9671 * ENOSPC is allowed here, we may have enough space 9672 * already allocated at the new raid level to 9673 * carry on 9674 */ 9675 if (ret == -ENOSPC) 9676 ret = 0; 9677 if (ret < 0) 9678 goto out; 9679 } 9680 9681 ret = inc_block_group_ro(cache, 0); 9682 if (!ret) 9683 goto out; 9684 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 9685 ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9686 if (ret < 0) 9687 goto out; 9688 ret = inc_block_group_ro(cache, 0); 9689 out: 9690 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9691 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9692 mutex_lock(&fs_info->chunk_mutex); 9693 check_system_chunk(trans, alloc_flags); 9694 mutex_unlock(&fs_info->chunk_mutex); 9695 } 9696 mutex_unlock(&fs_info->ro_block_group_mutex); 9697 9698 btrfs_end_transaction(trans); 9699 return ret; 9700 } 9701 9702 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) 9703 { 9704 u64 alloc_flags = get_alloc_profile(trans->fs_info, type); 9705 9706 return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 9707 } 9708 9709 /* 9710 * helper to account the unused space of all the readonly block group in the 9711 * space_info. takes mirrors into account. 9712 */ 9713 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9714 { 9715 struct btrfs_block_group_cache *block_group; 9716 u64 free_bytes = 0; 9717 int factor; 9718 9719 /* It's df, we don't care if it's racy */ 9720 if (list_empty(&sinfo->ro_bgs)) 9721 return 0; 9722 9723 spin_lock(&sinfo->lock); 9724 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9725 spin_lock(&block_group->lock); 9726 9727 if (!block_group->ro) { 9728 spin_unlock(&block_group->lock); 9729 continue; 9730 } 9731 9732 factor = btrfs_bg_type_to_factor(block_group->flags); 9733 free_bytes += (block_group->key.offset - 9734 btrfs_block_group_used(&block_group->item)) * 9735 factor; 9736 9737 spin_unlock(&block_group->lock); 9738 } 9739 spin_unlock(&sinfo->lock); 9740 9741 return free_bytes; 9742 } 9743 9744 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) 9745 { 9746 struct btrfs_space_info *sinfo = cache->space_info; 9747 u64 num_bytes; 9748 9749 BUG_ON(!cache->ro); 9750 9751 spin_lock(&sinfo->lock); 9752 spin_lock(&cache->lock); 9753 if (!--cache->ro) { 9754 num_bytes = cache->key.offset - cache->reserved - 9755 cache->pinned - cache->bytes_super - 9756 btrfs_block_group_used(&cache->item); 9757 sinfo->bytes_readonly -= num_bytes; 9758 list_del_init(&cache->ro_list); 9759 } 9760 spin_unlock(&cache->lock); 9761 spin_unlock(&sinfo->lock); 9762 } 9763 9764 /* 9765 * Checks to see if it's even possible to relocate this block group. 9766 * 9767 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9768 * ok to go ahead and try. 
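 *
 * The check below is conservative: it succeeds either if the rest of the
 * space_info can already absorb the used bytes of this block group, or if
 * enough devices have unallocated space for a new chunk at the (possibly
 * restriped) target profile.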
9769 */ 9770 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) 9771 { 9772 struct btrfs_root *root = fs_info->extent_root; 9773 struct btrfs_block_group_cache *block_group; 9774 struct btrfs_space_info *space_info; 9775 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 9776 struct btrfs_device *device; 9777 struct btrfs_trans_handle *trans; 9778 u64 min_free; 9779 u64 dev_min = 1; 9780 u64 dev_nr = 0; 9781 u64 target; 9782 int debug; 9783 int index; 9784 int full = 0; 9785 int ret = 0; 9786 9787 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); 9788 9789 block_group = btrfs_lookup_block_group(fs_info, bytenr); 9790 9791 /* odd, couldn't find the block group, leave it alone */ 9792 if (!block_group) { 9793 if (debug) 9794 btrfs_warn(fs_info, 9795 "can't find block group for bytenr %llu", 9796 bytenr); 9797 return -1; 9798 } 9799 9800 min_free = btrfs_block_group_used(&block_group->item); 9801 9802 /* no bytes used, we're good */ 9803 if (!min_free) 9804 goto out; 9805 9806 space_info = block_group->space_info; 9807 spin_lock(&space_info->lock); 9808 9809 full = space_info->full; 9810 9811 /* 9812 * if this is the last block group we have in this space, we can't 9813 * relocate it unless we're able to allocate a new chunk below. 9814 * 9815 * Otherwise, we need to make sure we have room in the space to handle 9816 * all of the extents from this block group. If we can, we're good 9817 */ 9818 if ((space_info->total_bytes != block_group->key.offset) && 9819 (btrfs_space_info_used(space_info, false) + min_free < 9820 space_info->total_bytes)) { 9821 spin_unlock(&space_info->lock); 9822 goto out; 9823 } 9824 spin_unlock(&space_info->lock); 9825 9826 /* 9827 * ok we don't have enough space, but maybe we have free space on our 9828 * devices to allocate new chunks for relocation, so loop through our 9829 * alloc devices and guess if we have enough space. if this block 9830 * group is going to be restriped, run checks against the target 9831 * profile instead of the current one. 9832 */ 9833 ret = -1; 9834 9835 /* 9836 * index: 9837 * 0: raid10 9838 * 1: raid1 9839 * 2: dup 9840 * 3: raid0 9841 * 4: single 9842 */ 9843 target = get_restripe_target(fs_info, block_group->flags); 9844 if (target) { 9845 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); 9846 } else { 9847 /* 9848 * this is just a balance, so if we were marked as full 9849 * we know there is no space for a new chunk 9850 */ 9851 if (full) { 9852 if (debug) 9853 btrfs_warn(fs_info, 9854 "no space to alloc new chunk for block group %llu", 9855 block_group->key.objectid); 9856 goto out; 9857 } 9858 9859 index = btrfs_bg_flags_to_raid_index(block_group->flags); 9860 } 9861 9862 if (index == BTRFS_RAID_RAID10) { 9863 dev_min = 4; 9864 /* Divide by 2 */ 9865 min_free >>= 1; 9866 } else if (index == BTRFS_RAID_RAID1) { 9867 dev_min = 2; 9868 } else if (index == BTRFS_RAID_DUP) { 9869 /* Multiply by 2 */ 9870 min_free <<= 1; 9871 } else if (index == BTRFS_RAID_RAID0) { 9872 dev_min = fs_devices->rw_devices; 9873 min_free = div64_u64(min_free, dev_min); 9874 } 9875 9876 /* We need to do this so that we can look at pending chunks */ 9877 trans = btrfs_join_transaction(root); 9878 if (IS_ERR(trans)) { 9879 ret = PTR_ERR(trans); 9880 goto out; 9881 } 9882 9883 mutex_lock(&fs_info->chunk_mutex); 9884 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 9885 u64 dev_offset; 9886 9887 /* 9888 * check to make sure we can actually find a chunk with enough 9889 * space to fit our block group in. 
9890 */ 9891 if (device->total_bytes > device->bytes_used + min_free && 9892 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 9893 ret = find_free_dev_extent(trans, device, min_free, 9894 &dev_offset, NULL); 9895 if (!ret) 9896 dev_nr++; 9897 9898 if (dev_nr >= dev_min) 9899 break; 9900 9901 ret = -1; 9902 } 9903 } 9904 if (debug && ret == -1) 9905 btrfs_warn(fs_info, 9906 "no space to allocate a new chunk for block group %llu", 9907 block_group->key.objectid); 9908 mutex_unlock(&fs_info->chunk_mutex); 9909 btrfs_end_transaction(trans); 9910 out: 9911 btrfs_put_block_group(block_group); 9912 return ret; 9913 } 9914 9915 static int find_first_block_group(struct btrfs_fs_info *fs_info, 9916 struct btrfs_path *path, 9917 struct btrfs_key *key) 9918 { 9919 struct btrfs_root *root = fs_info->extent_root; 9920 int ret = 0; 9921 struct btrfs_key found_key; 9922 struct extent_buffer *leaf; 9923 struct btrfs_block_group_item bg; 9924 u64 flags; 9925 int slot; 9926 9927 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 9928 if (ret < 0) 9929 goto out; 9930 9931 while (1) { 9932 slot = path->slots[0]; 9933 leaf = path->nodes[0]; 9934 if (slot >= btrfs_header_nritems(leaf)) { 9935 ret = btrfs_next_leaf(root, path); 9936 if (ret == 0) 9937 continue; 9938 if (ret < 0) 9939 goto out; 9940 break; 9941 } 9942 btrfs_item_key_to_cpu(leaf, &found_key, slot); 9943 9944 if (found_key.objectid >= key->objectid && 9945 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 9946 struct extent_map_tree *em_tree; 9947 struct extent_map *em; 9948 9949 em_tree = &root->fs_info->mapping_tree.map_tree; 9950 read_lock(&em_tree->lock); 9951 em = lookup_extent_mapping(em_tree, found_key.objectid, 9952 found_key.offset); 9953 read_unlock(&em_tree->lock); 9954 if (!em) { 9955 btrfs_err(fs_info, 9956 "logical %llu len %llu found bg but no related chunk", 9957 found_key.objectid, found_key.offset); 9958 ret = -ENOENT; 9959 } else if (em->start != found_key.objectid || 9960 em->len != found_key.offset) { 9961 btrfs_err(fs_info, 9962 "block group %llu len %llu mismatch with chunk %llu len %llu", 9963 found_key.objectid, found_key.offset, 9964 em->start, em->len); 9965 ret = -EUCLEAN; 9966 } else { 9967 read_extent_buffer(leaf, &bg, 9968 btrfs_item_ptr_offset(leaf, slot), 9969 sizeof(bg)); 9970 flags = btrfs_block_group_flags(&bg) & 9971 BTRFS_BLOCK_GROUP_TYPE_MASK; 9972 9973 if (flags != (em->map_lookup->type & 9974 BTRFS_BLOCK_GROUP_TYPE_MASK)) { 9975 btrfs_err(fs_info, 9976 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", 9977 found_key.objectid, 9978 found_key.offset, flags, 9979 (BTRFS_BLOCK_GROUP_TYPE_MASK & 9980 em->map_lookup->type)); 9981 ret = -EUCLEAN; 9982 } else { 9983 ret = 0; 9984 } 9985 } 9986 free_extent_map(em); 9987 goto out; 9988 } 9989 path->slots[0]++; 9990 } 9991 out: 9992 return ret; 9993 } 9994 9995 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 9996 { 9997 struct btrfs_block_group_cache *block_group; 9998 u64 last = 0; 9999 10000 while (1) { 10001 struct inode *inode; 10002 10003 block_group = btrfs_lookup_first_block_group(info, last); 10004 while (block_group) { 10005 wait_block_group_cache_done(block_group); 10006 spin_lock(&block_group->lock); 10007 if (block_group->iref) 10008 break; 10009 spin_unlock(&block_group->lock); 10010 block_group = next_block_group(info, block_group); 10011 } 10012 if (!block_group) { 10013 if (last == 0) 10014 break; 10015 last = 0; 10016 continue; 10017 } 10018 10019 inode = block_group->inode; 10020 
block_group->iref = 0; 10021 block_group->inode = NULL; 10022 spin_unlock(&block_group->lock); 10023 ASSERT(block_group->io_ctl.inode == NULL); 10024 iput(inode); 10025 last = block_group->key.objectid + block_group->key.offset; 10026 btrfs_put_block_group(block_group); 10027 } 10028 } 10029 10030 /* 10031 * Must be called only after stopping all workers, since we could have block 10032 * group caching kthreads running, and therefore they could race with us if we 10033 * freed the block groups before stopping them. 10034 */ 10035 int btrfs_free_block_groups(struct btrfs_fs_info *info) 10036 { 10037 struct btrfs_block_group_cache *block_group; 10038 struct btrfs_space_info *space_info; 10039 struct btrfs_caching_control *caching_ctl; 10040 struct rb_node *n; 10041 10042 down_write(&info->commit_root_sem); 10043 while (!list_empty(&info->caching_block_groups)) { 10044 caching_ctl = list_entry(info->caching_block_groups.next, 10045 struct btrfs_caching_control, list); 10046 list_del(&caching_ctl->list); 10047 put_caching_control(caching_ctl); 10048 } 10049 up_write(&info->commit_root_sem); 10050 10051 spin_lock(&info->unused_bgs_lock); 10052 while (!list_empty(&info->unused_bgs)) { 10053 block_group = list_first_entry(&info->unused_bgs, 10054 struct btrfs_block_group_cache, 10055 bg_list); 10056 list_del_init(&block_group->bg_list); 10057 btrfs_put_block_group(block_group); 10058 } 10059 spin_unlock(&info->unused_bgs_lock); 10060 10061 spin_lock(&info->block_group_cache_lock); 10062 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 10063 block_group = rb_entry(n, struct btrfs_block_group_cache, 10064 cache_node); 10065 rb_erase(&block_group->cache_node, 10066 &info->block_group_cache_tree); 10067 RB_CLEAR_NODE(&block_group->cache_node); 10068 spin_unlock(&info->block_group_cache_lock); 10069 10070 down_write(&block_group->space_info->groups_sem); 10071 list_del(&block_group->list); 10072 up_write(&block_group->space_info->groups_sem); 10073 10074 /* 10075 * We haven't cached this block group, which means we could 10076 * possibly have excluded extents on this block group. 10077 */ 10078 if (block_group->cached == BTRFS_CACHE_NO || 10079 block_group->cached == BTRFS_CACHE_ERROR) 10080 free_excluded_extents(block_group); 10081 10082 btrfs_remove_free_space_cache(block_group); 10083 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 10084 ASSERT(list_empty(&block_group->dirty_list)); 10085 ASSERT(list_empty(&block_group->io_list)); 10086 ASSERT(list_empty(&block_group->bg_list)); 10087 ASSERT(atomic_read(&block_group->count) == 1); 10088 btrfs_put_block_group(block_group); 10089 10090 spin_lock(&info->block_group_cache_lock); 10091 } 10092 spin_unlock(&info->block_group_cache_lock); 10093 10094 /* now that all the block groups are freed, go through and 10095 * free all the space_info structs. This is only called during 10096 * the final stages of unmount, and so we know nobody is 10097 * using them. We call synchronize_rcu() once before we start, 10098 * just to be on the safe side. 10099 */ 10100 synchronize_rcu(); 10101 10102 release_global_block_rsv(info); 10103 10104 while (!list_empty(&info->space_info)) { 10105 int i; 10106 10107 space_info = list_entry(info->space_info.next, 10108 struct btrfs_space_info, 10109 list); 10110 10111 /* 10112 * Do not hide this behind enospc_debug, this is actually 10113 * important and indicates a real bug if this happens. 
10114 */ 10115 if (WARN_ON(space_info->bytes_pinned > 0 || 10116 space_info->bytes_reserved > 0 || 10117 space_info->bytes_may_use > 0)) 10118 dump_space_info(info, space_info, 0, 0); 10119 list_del(&space_info->list); 10120 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 10121 struct kobject *kobj; 10122 kobj = space_info->block_group_kobjs[i]; 10123 space_info->block_group_kobjs[i] = NULL; 10124 if (kobj) { 10125 kobject_del(kobj); 10126 kobject_put(kobj); 10127 } 10128 } 10129 kobject_del(&space_info->kobj); 10130 kobject_put(&space_info->kobj); 10131 } 10132 return 0; 10133 } 10134 10135 /* link_block_group will queue up kobjects to add when we're reclaim-safe */ 10136 void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) 10137 { 10138 struct btrfs_space_info *space_info; 10139 struct raid_kobject *rkobj; 10140 LIST_HEAD(list); 10141 int index; 10142 int ret = 0; 10143 10144 spin_lock(&fs_info->pending_raid_kobjs_lock); 10145 list_splice_init(&fs_info->pending_raid_kobjs, &list); 10146 spin_unlock(&fs_info->pending_raid_kobjs_lock); 10147 10148 list_for_each_entry(rkobj, &list, list) { 10149 space_info = __find_space_info(fs_info, rkobj->flags); 10150 index = btrfs_bg_flags_to_raid_index(rkobj->flags); 10151 10152 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 10153 "%s", get_raid_name(index)); 10154 if (ret) { 10155 kobject_put(&rkobj->kobj); 10156 break; 10157 } 10158 } 10159 if (ret) 10160 btrfs_warn(fs_info, 10161 "failed to add kobject for block cache, ignoring"); 10162 } 10163 10164 static void link_block_group(struct btrfs_block_group_cache *cache) 10165 { 10166 struct btrfs_space_info *space_info = cache->space_info; 10167 struct btrfs_fs_info *fs_info = cache->fs_info; 10168 int index = btrfs_bg_flags_to_raid_index(cache->flags); 10169 bool first = false; 10170 10171 down_write(&space_info->groups_sem); 10172 if (list_empty(&space_info->block_groups[index])) 10173 first = true; 10174 list_add_tail(&cache->list, &space_info->block_groups[index]); 10175 up_write(&space_info->groups_sem); 10176 10177 if (first) { 10178 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 10179 if (!rkobj) { 10180 btrfs_warn(cache->fs_info, 10181 "couldn't alloc memory for raid level kobject"); 10182 return; 10183 } 10184 rkobj->flags = cache->flags; 10185 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 10186 10187 spin_lock(&fs_info->pending_raid_kobjs_lock); 10188 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); 10189 spin_unlock(&fs_info->pending_raid_kobjs_lock); 10190 space_info->block_group_kobjs[index] = &rkobj->kobj; 10191 } 10192 } 10193 10194 static struct btrfs_block_group_cache * 10195 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, 10196 u64 start, u64 size) 10197 { 10198 struct btrfs_block_group_cache *cache; 10199 10200 cache = kzalloc(sizeof(*cache), GFP_NOFS); 10201 if (!cache) 10202 return NULL; 10203 10204 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 10205 GFP_NOFS); 10206 if (!cache->free_space_ctl) { 10207 kfree(cache); 10208 return NULL; 10209 } 10210 10211 cache->key.objectid = start; 10212 cache->key.offset = size; 10213 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10214 10215 cache->fs_info = fs_info; 10216 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); 10217 set_free_space_tree_thresholds(cache); 10218 10219 atomic_set(&cache->count, 1); 10220 spin_lock_init(&cache->lock); 10221 init_rwsem(&cache->data_rwsem); 10222 INIT_LIST_HEAD(&cache->list); 10223 INIT_LIST_HEAD(&cache->cluster_list); 10224 
INIT_LIST_HEAD(&cache->bg_list); 10225 INIT_LIST_HEAD(&cache->ro_list); 10226 INIT_LIST_HEAD(&cache->dirty_list); 10227 INIT_LIST_HEAD(&cache->io_list); 10228 btrfs_init_free_space_ctl(cache); 10229 atomic_set(&cache->trimming, 0); 10230 mutex_init(&cache->free_space_lock); 10231 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); 10232 10233 return cache; 10234 } 10235 10236 10237 /* 10238 * Iterate all chunks and verify that each of them has the corresponding block 10239 * group 10240 */ 10241 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 10242 { 10243 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 10244 struct extent_map *em; 10245 struct btrfs_block_group_cache *bg; 10246 u64 start = 0; 10247 int ret = 0; 10248 10249 while (1) { 10250 read_lock(&map_tree->map_tree.lock); 10251 /* 10252 * lookup_extent_mapping will return the first extent map 10253 * intersecting the range, so setting @len to 1 is enough to 10254 * get the first chunk. 10255 */ 10256 em = lookup_extent_mapping(&map_tree->map_tree, start, 1); 10257 read_unlock(&map_tree->map_tree.lock); 10258 if (!em) 10259 break; 10260 10261 bg = btrfs_lookup_block_group(fs_info, em->start); 10262 if (!bg) { 10263 btrfs_err(fs_info, 10264 "chunk start=%llu len=%llu doesn't have corresponding block group", 10265 em->start, em->len); 10266 ret = -EUCLEAN; 10267 free_extent_map(em); 10268 break; 10269 } 10270 if (bg->key.objectid != em->start || 10271 bg->key.offset != em->len || 10272 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != 10273 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 10274 btrfs_err(fs_info, 10275 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", 10276 em->start, em->len, 10277 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, 10278 bg->key.objectid, bg->key.offset, 10279 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); 10280 ret = -EUCLEAN; 10281 free_extent_map(em); 10282 btrfs_put_block_group(bg); 10283 break; 10284 } 10285 start = em->start + em->len; 10286 free_extent_map(em); 10287 btrfs_put_block_group(bg); 10288 } 10289 return ret; 10290 } 10291 10292 int btrfs_read_block_groups(struct btrfs_fs_info *info) 10293 { 10294 struct btrfs_path *path; 10295 int ret; 10296 struct btrfs_block_group_cache *cache; 10297 struct btrfs_space_info *space_info; 10298 struct btrfs_key key; 10299 struct btrfs_key found_key; 10300 struct extent_buffer *leaf; 10301 int need_clear = 0; 10302 u64 cache_gen; 10303 u64 feature; 10304 int mixed; 10305 10306 feature = btrfs_super_incompat_flags(info->super_copy); 10307 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); 10308 10309 key.objectid = 0; 10310 key.offset = 0; 10311 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 10312 path = btrfs_alloc_path(); 10313 if (!path) 10314 return -ENOMEM; 10315 path->reada = READA_FORWARD; 10316 10317 cache_gen = btrfs_super_cache_generation(info->super_copy); 10318 if (btrfs_test_opt(info, SPACE_CACHE) && 10319 btrfs_super_generation(info->super_copy) != cache_gen) 10320 need_clear = 1; 10321 if (btrfs_test_opt(info, CLEAR_CACHE)) 10322 need_clear = 1; 10323 10324 while (1) { 10325 ret = find_first_block_group(info, path, &key); 10326 if (ret > 0) 10327 break; 10328 if (ret != 0) 10329 goto error; 10330 10331 leaf = path->nodes[0]; 10332 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 10333 10334 cache = btrfs_create_block_group_cache(info, found_key.objectid, 10335 found_key.offset); 10336 if (!cache) { 10337 ret = -ENOMEM; 10338 
goto error; 10339 } 10340 10341 if (need_clear) { 10342 /* 10343 * When we mount with old space cache, we need to 10344 * set BTRFS_DC_CLEAR and set dirty flag. 10345 * 10346 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 10347 * truncate the old free space cache inode and 10348 * setup a new one. 10349 * b) Setting 'dirty flag' makes sure that we flush 10350 * the new space cache info onto disk. 10351 */ 10352 if (btrfs_test_opt(info, SPACE_CACHE)) 10353 cache->disk_cache_state = BTRFS_DC_CLEAR; 10354 } 10355 10356 read_extent_buffer(leaf, &cache->item, 10357 btrfs_item_ptr_offset(leaf, path->slots[0]), 10358 sizeof(cache->item)); 10359 cache->flags = btrfs_block_group_flags(&cache->item); 10360 if (!mixed && 10361 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 10362 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 10363 btrfs_err(info, 10364 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 10365 cache->key.objectid); 10366 ret = -EINVAL; 10367 goto error; 10368 } 10369 10370 key.objectid = found_key.objectid + found_key.offset; 10371 btrfs_release_path(path); 10372 10373 /* 10374 * We need to exclude the super stripes now so that the space 10375 * info has super bytes accounted for, otherwise we'll think 10376 * we have more space than we actually do. 10377 */ 10378 ret = exclude_super_stripes(cache); 10379 if (ret) { 10380 /* 10381 * We may have excluded something, so call this just in 10382 * case. 10383 */ 10384 free_excluded_extents(cache); 10385 btrfs_put_block_group(cache); 10386 goto error; 10387 } 10388 10389 /* 10390 * check for two cases, either we are full, and therefore 10391 * don't need to bother with the caching work since we won't 10392 * find any space, or we are empty, and we can just add all 10393 * the space in and be done with it. This saves us _a_lot_ of 10394 * time, particularly in the full case. 
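		 *
		 * (A completely full group needs no free space entries at
		 * all, and a completely empty one is covered by the single
		 * add_new_free_space() call below.)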
10395 */ 10396 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 10397 cache->last_byte_to_unpin = (u64)-1; 10398 cache->cached = BTRFS_CACHE_FINISHED; 10399 free_excluded_extents(cache); 10400 } else if (btrfs_block_group_used(&cache->item) == 0) { 10401 cache->last_byte_to_unpin = (u64)-1; 10402 cache->cached = BTRFS_CACHE_FINISHED; 10403 add_new_free_space(cache, found_key.objectid, 10404 found_key.objectid + 10405 found_key.offset); 10406 free_excluded_extents(cache); 10407 } 10408 10409 ret = btrfs_add_block_group_cache(info, cache); 10410 if (ret) { 10411 btrfs_remove_free_space_cache(cache); 10412 btrfs_put_block_group(cache); 10413 goto error; 10414 } 10415 10416 trace_btrfs_add_block_group(info, cache, 0); 10417 update_space_info(info, cache->flags, found_key.offset, 10418 btrfs_block_group_used(&cache->item), 10419 cache->bytes_super, &space_info); 10420 10421 cache->space_info = space_info; 10422 10423 link_block_group(cache); 10424 10425 set_avail_alloc_bits(info, cache->flags); 10426 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 10427 inc_block_group_ro(cache, 1); 10428 } else if (btrfs_block_group_used(&cache->item) == 0) { 10429 ASSERT(list_empty(&cache->bg_list)); 10430 btrfs_mark_bg_unused(cache); 10431 } 10432 } 10433 10434 list_for_each_entry_rcu(space_info, &info->space_info, list) { 10435 if (!(get_alloc_profile(info, space_info->flags) & 10436 (BTRFS_BLOCK_GROUP_RAID10 | 10437 BTRFS_BLOCK_GROUP_RAID1 | 10438 BTRFS_BLOCK_GROUP_RAID5 | 10439 BTRFS_BLOCK_GROUP_RAID6 | 10440 BTRFS_BLOCK_GROUP_DUP))) 10441 continue; 10442 /* 10443 * avoid allocating from un-mirrored block group if there are 10444 * mirrored block groups. 10445 */ 10446 list_for_each_entry(cache, 10447 &space_info->block_groups[BTRFS_RAID_RAID0], 10448 list) 10449 inc_block_group_ro(cache, 1); 10450 list_for_each_entry(cache, 10451 &space_info->block_groups[BTRFS_RAID_SINGLE], 10452 list) 10453 inc_block_group_ro(cache, 1); 10454 } 10455 10456 btrfs_add_raid_kobjects(info); 10457 init_global_block_rsv(info); 10458 ret = check_chunk_block_group_mappings(info); 10459 error: 10460 btrfs_free_path(path); 10461 return ret; 10462 } 10463 10464 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) 10465 { 10466 struct btrfs_fs_info *fs_info = trans->fs_info; 10467 struct btrfs_block_group_cache *block_group; 10468 struct btrfs_root *extent_root = fs_info->extent_root; 10469 struct btrfs_block_group_item item; 10470 struct btrfs_key key; 10471 int ret = 0; 10472 10473 if (!trans->can_flush_pending_bgs) 10474 return; 10475 10476 while (!list_empty(&trans->new_bgs)) { 10477 block_group = list_first_entry(&trans->new_bgs, 10478 struct btrfs_block_group_cache, 10479 bg_list); 10480 if (ret) 10481 goto next; 10482 10483 spin_lock(&block_group->lock); 10484 memcpy(&item, &block_group->item, sizeof(item)); 10485 memcpy(&key, &block_group->key, sizeof(key)); 10486 spin_unlock(&block_group->lock); 10487 10488 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10489 sizeof(item)); 10490 if (ret) 10491 btrfs_abort_transaction(trans, ret); 10492 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); 10493 if (ret) 10494 btrfs_abort_transaction(trans, ret); 10495 add_block_group_free_space(trans, block_group); 10496 /* already aborted the transaction if it failed. 
*/ 10497 next: 10498 btrfs_delayed_refs_rsv_release(fs_info, 1); 10499 list_del_init(&block_group->bg_list); 10500 } 10501 btrfs_trans_release_chunk_metadata(trans); 10502 } 10503 10504 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, 10505 u64 type, u64 chunk_offset, u64 size) 10506 { 10507 struct btrfs_fs_info *fs_info = trans->fs_info; 10508 struct btrfs_block_group_cache *cache; 10509 int ret; 10510 10511 btrfs_set_log_full_commit(fs_info, trans); 10512 10513 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 10514 if (!cache) 10515 return -ENOMEM; 10516 10517 btrfs_set_block_group_used(&cache->item, bytes_used); 10518 btrfs_set_block_group_chunk_objectid(&cache->item, 10519 BTRFS_FIRST_CHUNK_TREE_OBJECTID); 10520 btrfs_set_block_group_flags(&cache->item, type); 10521 10522 cache->flags = type; 10523 cache->last_byte_to_unpin = (u64)-1; 10524 cache->cached = BTRFS_CACHE_FINISHED; 10525 cache->needs_free_space = 1; 10526 ret = exclude_super_stripes(cache); 10527 if (ret) { 10528 /* 10529 * We may have excluded something, so call this just in 10530 * case. 10531 */ 10532 free_excluded_extents(cache); 10533 btrfs_put_block_group(cache); 10534 return ret; 10535 } 10536 10537 add_new_free_space(cache, chunk_offset, chunk_offset + size); 10538 10539 free_excluded_extents(cache); 10540 10541 #ifdef CONFIG_BTRFS_DEBUG 10542 if (btrfs_should_fragment_free_space(cache)) { 10543 u64 new_bytes_used = size - bytes_used; 10544 10545 bytes_used += new_bytes_used >> 1; 10546 fragment_free_space(cache); 10547 } 10548 #endif 10549 /* 10550 * Ensure the corresponding space_info object is created and 10551 * assigned to our block group. We want our bg to be added to the rbtree 10552 * with its ->space_info set. 10553 */ 10554 cache->space_info = __find_space_info(fs_info, cache->flags); 10555 ASSERT(cache->space_info); 10556 10557 ret = btrfs_add_block_group_cache(fs_info, cache); 10558 if (ret) { 10559 btrfs_remove_free_space_cache(cache); 10560 btrfs_put_block_group(cache); 10561 return ret; 10562 } 10563 10564 /* 10565 * Now that our block group has its ->space_info set and is inserted in 10566 * the rbtree, update the space info's counters. 
 */
	trace_btrfs_add_block_group(fs_info, cache, 1);
	update_space_info(fs_info, cache->flags, size, bytes_used,
				cache->bytes_super, &cache->space_info);
	update_global_block_rsv(fs_info);

	link_block_group(cache);

	list_add_tail(&cache->bg_list, &trans->new_bgs);
	trans->delayed_ref_updates++;
	btrfs_update_delayed_refs_rsv(trans);

	set_avail_alloc_bits(fs_info, type);
	return 0;
}

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     u64 group_start, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_key key;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;
	bool remove_rsv = false;

	block_group = btrfs_lookup_block_group(fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
10625 */ 10626 free_excluded_extents(block_group); 10627 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, 10628 block_group->key.offset); 10629 10630 memcpy(&key, &block_group->key, sizeof(key)); 10631 index = btrfs_bg_flags_to_raid_index(block_group->flags); 10632 factor = btrfs_bg_type_to_factor(block_group->flags); 10633 10634 /* make sure this block group isn't part of an allocation cluster */ 10635 cluster = &fs_info->data_alloc_cluster; 10636 spin_lock(&cluster->refill_lock); 10637 btrfs_return_cluster_to_free_space(block_group, cluster); 10638 spin_unlock(&cluster->refill_lock); 10639 10640 /* 10641 * make sure this block group isn't part of a metadata 10642 * allocation cluster 10643 */ 10644 cluster = &fs_info->meta_alloc_cluster; 10645 spin_lock(&cluster->refill_lock); 10646 btrfs_return_cluster_to_free_space(block_group, cluster); 10647 spin_unlock(&cluster->refill_lock); 10648 10649 path = btrfs_alloc_path(); 10650 if (!path) { 10651 ret = -ENOMEM; 10652 goto out; 10653 } 10654 10655 /* 10656 * get the inode first so any iput calls done for the io_list 10657 * aren't the final iput (no unlinks allowed now) 10658 */ 10659 inode = lookup_free_space_inode(fs_info, block_group, path); 10660 10661 mutex_lock(&trans->transaction->cache_write_mutex); 10662 /* 10663 * Make sure our free space cache IO is done before removing the 10664 * free space inode 10665 */ 10666 spin_lock(&trans->transaction->dirty_bgs_lock); 10667 if (!list_empty(&block_group->io_list)) { 10668 list_del_init(&block_group->io_list); 10669 10670 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 10671 10672 spin_unlock(&trans->transaction->dirty_bgs_lock); 10673 btrfs_wait_cache_io(trans, block_group, path); 10674 btrfs_put_block_group(block_group); 10675 spin_lock(&trans->transaction->dirty_bgs_lock); 10676 } 10677 10678 if (!list_empty(&block_group->dirty_list)) { 10679 list_del_init(&block_group->dirty_list); 10680 remove_rsv = true; 10681 btrfs_put_block_group(block_group); 10682 } 10683 spin_unlock(&trans->transaction->dirty_bgs_lock); 10684 mutex_unlock(&trans->transaction->cache_write_mutex); 10685 10686 if (!IS_ERR(inode)) { 10687 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10688 if (ret) { 10689 btrfs_add_delayed_iput(inode); 10690 goto out; 10691 } 10692 clear_nlink(inode); 10693 /* One for the block groups ref */ 10694 spin_lock(&block_group->lock); 10695 if (block_group->iref) { 10696 block_group->iref = 0; 10697 block_group->inode = NULL; 10698 spin_unlock(&block_group->lock); 10699 iput(inode); 10700 } else { 10701 spin_unlock(&block_group->lock); 10702 } 10703 /* One for our lookup ref */ 10704 btrfs_add_delayed_iput(inode); 10705 } 10706 10707 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 10708 key.offset = block_group->key.objectid; 10709 key.type = 0; 10710 10711 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 10712 if (ret < 0) 10713 goto out; 10714 if (ret > 0) 10715 btrfs_release_path(path); 10716 if (ret == 0) { 10717 ret = btrfs_del_item(trans, tree_root, path); 10718 if (ret) 10719 goto out; 10720 btrfs_release_path(path); 10721 } 10722 10723 spin_lock(&fs_info->block_group_cache_lock); 10724 rb_erase(&block_group->cache_node, 10725 &fs_info->block_group_cache_tree); 10726 RB_CLEAR_NODE(&block_group->cache_node); 10727 10728 if (fs_info->first_logical_byte == block_group->key.objectid) 10729 fs_info->first_logical_byte = (u64)-1; 10730 spin_unlock(&fs_info->block_group_cache_lock); 10731 10732 down_write(&block_group->space_info->groups_sem); 10733 /* 
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		down_write(&fs_info->commit_root_sem);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
				    &fs_info->caching_block_groups, list)
				if (ctl->block_group == block_group) {
					caching_ctl = ctl;
					refcount_inc(&caching_ctl->count);
					break;
				}
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		up_write(&fs_info->commit_root_sem);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			put_caching_control(caching_ctl);
			put_caching_control(caching_ctl);
		}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->dirty_list)) {
		WARN_ON(1);
	}
	if (!list_empty(&block_group->io_list)) {
		WARN_ON(1);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->key.offset);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->key.offset);
		WARN_ON(block_group->space_info->disk_total
			< block_group->key.offset * factor);
	}
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	block_group->space_info->disk_total -= block_group->key.offset * factor;

	spin_unlock(&block_group->space_info->lock);

	memcpy(&key, &block_group->key, sizeof(key));

	mutex_lock(&fs_info->chunk_mutex);
	if (!list_empty(&em->list)) {
		/* We're in the transaction->pending_chunks list. */
		free_extent_map(em);
	}
	spin_lock(&block_group->lock);
	block_group->removed = 1;
	/*
	 * At this point trimming can't start on this block group, because we
	 * removed the block group from the tree fs_info->block_group_cache_tree
	 * so no one can find it anymore, and even if someone already got this
	 * block group before we removed it from the rbtree, they have already
	 * incremented block_group->trimming - if they didn't, they won't find
	 * any free space entries because we already removed them all when we
	 * called btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group.
This is because our 10824 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10825 * completely transactionless, so while it is trimming a range the 10826 * currently running transaction might finish and a new one start, 10827 * allowing for new block groups to be created that can reuse the same 10828 * physical device locations unless we take this special care. 10829 * 10830 * There may also be an implicit trim operation if the file system 10831 * is mounted with -odiscard. The same protections must remain 10832 * in place until the extents have been discarded completely when 10833 * the transaction commit has completed. 10834 */ 10835 remove_em = (atomic_read(&block_group->trimming) == 0); 10836 /* 10837 * Make sure a trimmer task always sees the em in the pinned_chunks list 10838 * if it sees block_group->removed == 1 (needs to lock block_group->lock 10839 * before checking block_group->removed). 10840 */ 10841 if (!remove_em) { 10842 /* 10843 * Our em might be in trans->transaction->pending_chunks which 10844 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 10845 * and so is the fs_info->pinned_chunks list. 10846 * 10847 * So at this point we must be holding the chunk_mutex to avoid 10848 * any races with chunk allocation (more specifically at 10849 * volumes.c:contains_pending_extent()), to ensure it always 10850 * sees the em, either in the pending_chunks list or in the 10851 * pinned_chunks list. 10852 */ 10853 list_move_tail(&em->list, &fs_info->pinned_chunks); 10854 } 10855 spin_unlock(&block_group->lock); 10856 10857 if (remove_em) { 10858 struct extent_map_tree *em_tree; 10859 10860 em_tree = &fs_info->mapping_tree.map_tree; 10861 write_lock(&em_tree->lock); 10862 /* 10863 * The em might be in the pending_chunks list, so make sure the 10864 * chunk mutex is locked, since remove_extent_mapping() will 10865 * delete us from that list. 10866 */ 10867 remove_extent_mapping(em_tree, em); 10868 write_unlock(&em_tree->lock); 10869 /* once for the tree */ 10870 free_extent_map(em); 10871 } 10872 10873 mutex_unlock(&fs_info->chunk_mutex); 10874 10875 ret = remove_block_group_free_space(trans, block_group); 10876 if (ret) 10877 goto out; 10878 10879 btrfs_put_block_group(block_group); 10880 btrfs_put_block_group(block_group); 10881 10882 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 10883 if (ret > 0) 10884 ret = -EIO; 10885 if (ret < 0) 10886 goto out; 10887 10888 ret = btrfs_del_item(trans, root, path); 10889 out: 10890 if (remove_rsv) 10891 btrfs_delayed_refs_rsv_release(fs_info, 1); 10892 btrfs_free_path(path); 10893 return ret; 10894 } 10895 10896 struct btrfs_trans_handle * 10897 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 10898 const u64 chunk_offset) 10899 { 10900 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10901 struct extent_map *em; 10902 struct map_lookup *map; 10903 unsigned int num_items; 10904 10905 read_lock(&em_tree->lock); 10906 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 10907 read_unlock(&em_tree->lock); 10908 ASSERT(em && em->start == chunk_offset); 10909 10910 /* 10911 * We need to reserve 3 + N units from the metadata space info in order 10912 * to remove a block group (done at btrfs_remove_chunk() and at 10913 * btrfs_remove_block_group()), which are used for: 10914 * 10915 * 1 unit for adding the free space inode's orphan (located in the tree 10916 * of tree roots). 10917 * 1 unit for deleting the block group item (located in the extent 10918 * tree). 
10919 * 1 unit for deleting the free space item (located in tree of tree 10920 * roots). 10921 * N units for deleting N device extent items corresponding to each 10922 * stripe (located in the device tree). 10923 * 10924 * In order to remove a block group we also need to reserve units in the 10925 * system space info in order to update the chunk tree (update one or 10926 * more device items and remove one chunk item), but this is done at 10927 * btrfs_remove_chunk() through a call to check_system_chunk(). 10928 */ 10929 map = em->map_lookup; 10930 num_items = 3 + map->num_stripes; 10931 free_extent_map(em); 10932 10933 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10934 num_items, 1); 10935 } 10936 10937 /* 10938 * Process the unused_bgs list and remove any that don't have any allocated 10939 * space inside of them. 10940 */ 10941 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 10942 { 10943 struct btrfs_block_group_cache *block_group; 10944 struct btrfs_space_info *space_info; 10945 struct btrfs_trans_handle *trans; 10946 int ret = 0; 10947 10948 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 10949 return; 10950 10951 spin_lock(&fs_info->unused_bgs_lock); 10952 while (!list_empty(&fs_info->unused_bgs)) { 10953 u64 start, end; 10954 int trimming; 10955 10956 block_group = list_first_entry(&fs_info->unused_bgs, 10957 struct btrfs_block_group_cache, 10958 bg_list); 10959 list_del_init(&block_group->bg_list); 10960 10961 space_info = block_group->space_info; 10962 10963 if (ret || btrfs_mixed_space_info(space_info)) { 10964 btrfs_put_block_group(block_group); 10965 continue; 10966 } 10967 spin_unlock(&fs_info->unused_bgs_lock); 10968 10969 mutex_lock(&fs_info->delete_unused_bgs_mutex); 10970 10971 /* Don't want to race with allocators so take the groups_sem */ 10972 down_write(&space_info->groups_sem); 10973 spin_lock(&block_group->lock); 10974 if (block_group->reserved || block_group->pinned || 10975 btrfs_block_group_used(&block_group->item) || 10976 block_group->ro || 10977 list_is_singular(&block_group->list)) { 10978 /* 10979 * We want to bail if we made new allocations or have 10980 * outstanding allocations in this block group. We do 10981 * the ro check in case balance is currently acting on 10982 * this block group. 10983 */ 10984 trace_btrfs_skip_unused_block_group(block_group); 10985 spin_unlock(&block_group->lock); 10986 up_write(&space_info->groups_sem); 10987 goto next; 10988 } 10989 spin_unlock(&block_group->lock); 10990 10991 /* We don't want to force the issue, only flip if it's ok. */ 10992 ret = inc_block_group_ro(block_group, 0); 10993 up_write(&space_info->groups_sem); 10994 if (ret < 0) { 10995 ret = 0; 10996 goto next; 10997 } 10998 10999 /* 11000 * Want to do this before we do anything else so we can recover 11001 * properly if we fail to join the transaction. 11002 */ 11003 trans = btrfs_start_trans_remove_block_group(fs_info, 11004 block_group->key.objectid); 11005 if (IS_ERR(trans)) { 11006 btrfs_dec_block_group_ro(block_group); 11007 ret = PTR_ERR(trans); 11008 goto next; 11009 } 11010 11011 /* 11012 * We could have pending pinned extents for this block group, 11013 * just delete them, we don't care about them anymore. 11014 */ 11015 start = block_group->key.objectid; 11016 end = start + block_group->key.offset - 1; 11017 /* 11018 * Hold the unused_bg_unpin_mutex lock to avoid racing with 11019 * btrfs_finish_extent_commit(). 
If we are at transaction N, 11020 * another task might be running finish_extent_commit() for the 11021 * previous transaction N - 1, and have seen a range belonging 11022 * to the block group in freed_extents[] before we were able to 11023 * clear the whole block group range from freed_extents[]. This 11024 * means that task can lookup for the block group after we 11025 * unpinned it from freed_extents[] and removed it, leading to 11026 * a BUG_ON() at btrfs_unpin_extent_range(). 11027 */ 11028 mutex_lock(&fs_info->unused_bg_unpin_mutex); 11029 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 11030 EXTENT_DIRTY); 11031 if (ret) { 11032 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11033 btrfs_dec_block_group_ro(block_group); 11034 goto end_trans; 11035 } 11036 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 11037 EXTENT_DIRTY); 11038 if (ret) { 11039 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11040 btrfs_dec_block_group_ro(block_group); 11041 goto end_trans; 11042 } 11043 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 11044 11045 /* Reset pinned so btrfs_put_block_group doesn't complain */ 11046 spin_lock(&space_info->lock); 11047 spin_lock(&block_group->lock); 11048 11049 update_bytes_pinned(space_info, -block_group->pinned); 11050 space_info->bytes_readonly += block_group->pinned; 11051 percpu_counter_add_batch(&space_info->total_bytes_pinned, 11052 -block_group->pinned, 11053 BTRFS_TOTAL_BYTES_PINNED_BATCH); 11054 block_group->pinned = 0; 11055 11056 spin_unlock(&block_group->lock); 11057 spin_unlock(&space_info->lock); 11058 11059 /* DISCARD can flip during remount */ 11060 trimming = btrfs_test_opt(fs_info, DISCARD); 11061 11062 /* Implicit trim during transaction commit. */ 11063 if (trimming) 11064 btrfs_get_block_group_trimming(block_group); 11065 11066 /* 11067 * Btrfs_remove_chunk will abort the transaction if things go 11068 * horribly wrong. 11069 */ 11070 ret = btrfs_remove_chunk(trans, block_group->key.objectid); 11071 11072 if (ret) { 11073 if (trimming) 11074 btrfs_put_block_group_trimming(block_group); 11075 goto end_trans; 11076 } 11077 11078 /* 11079 * If we're not mounted with -odiscard, we can just forget 11080 * about this block group. Otherwise we'll need to wait 11081 * until transaction commit to do the actual discard. 11082 */ 11083 if (trimming) { 11084 spin_lock(&fs_info->unused_bgs_lock); 11085 /* 11086 * A concurrent scrub might have added us to the list 11087 * fs_info->unused_bgs, so use a list_move operation 11088 * to add the block group to the deleted_bgs list. 
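			 * The extra reference taken just below keeps the
			 * block group alive until the deferred discard is
			 * processed at transaction commit time.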
11089 */ 11090 list_move(&block_group->bg_list, 11091 &trans->transaction->deleted_bgs); 11092 spin_unlock(&fs_info->unused_bgs_lock); 11093 btrfs_get_block_group(block_group); 11094 } 11095 end_trans: 11096 btrfs_end_transaction(trans); 11097 next: 11098 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 11099 btrfs_put_block_group(block_group); 11100 spin_lock(&fs_info->unused_bgs_lock); 11101 } 11102 spin_unlock(&fs_info->unused_bgs_lock); 11103 } 11104 11105 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 11106 { 11107 struct btrfs_super_block *disk_super; 11108 u64 features; 11109 u64 flags; 11110 int mixed = 0; 11111 int ret; 11112 11113 disk_super = fs_info->super_copy; 11114 if (!btrfs_super_root(disk_super)) 11115 return -EINVAL; 11116 11117 features = btrfs_super_incompat_flags(disk_super); 11118 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 11119 mixed = 1; 11120 11121 flags = BTRFS_BLOCK_GROUP_SYSTEM; 11122 ret = create_space_info(fs_info, flags); 11123 if (ret) 11124 goto out; 11125 11126 if (mixed) { 11127 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 11128 ret = create_space_info(fs_info, flags); 11129 } else { 11130 flags = BTRFS_BLOCK_GROUP_METADATA; 11131 ret = create_space_info(fs_info, flags); 11132 if (ret) 11133 goto out; 11134 11135 flags = BTRFS_BLOCK_GROUP_DATA; 11136 ret = create_space_info(fs_info, flags); 11137 } 11138 out: 11139 return ret; 11140 } 11141 11142 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 11143 u64 start, u64 end) 11144 { 11145 return unpin_extent_range(fs_info, start, end, false); 11146 } 11147 11148 /* 11149 * It used to be that old block groups would be left around forever. 11150 * Iterating over them would be enough to trim unused space. Since we 11151 * now automatically remove them, we also need to iterate over unallocated 11152 * space. 11153 * 11154 * We don't want a transaction for this since the discard may take a 11155 * substantial amount of time. We don't require that a transaction be 11156 * running, but we do need to take a running transaction into account 11157 * to ensure that we're not discarding chunks that were released or 11158 * allocated in the current transaction. 11159 * 11160 * Holding the chunks lock will prevent other threads from allocating 11161 * or releasing chunks, but it won't prevent a running transaction 11162 * from committing and releasing the memory that the pending chunks 11163 * list head uses. For that, we need to take a reference to the 11164 * transaction and hold the commit root sem. We only need to hold 11165 * it while performing the free space search since we have already 11166 * held back allocations. 11167 */ 11168 static int btrfs_trim_free_extents(struct btrfs_device *device, 11169 u64 minlen, u64 *trimmed) 11170 { 11171 u64 start = 0, len = 0; 11172 int ret; 11173 11174 *trimmed = 0; 11175 11176 /* Discard not supported = nothing to do. */ 11177 if (!blk_queue_discard(bdev_get_queue(device->bdev))) 11178 return 0; 11179 11180 /* Not writable = nothing to do. */ 11181 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 11182 return 0; 11183 11184 /* No free space = nothing to do. 
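	 * (device->bytes_used counts the space already allocated to chunks,
	 * so once it reaches total_bytes there is no unallocated space left
	 * to discard.)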
*/ 11185 if (device->total_bytes <= device->bytes_used) 11186 return 0; 11187 11188 ret = 0; 11189 11190 while (1) { 11191 struct btrfs_fs_info *fs_info = device->fs_info; 11192 struct btrfs_transaction *trans; 11193 u64 bytes; 11194 11195 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 11196 if (ret) 11197 break; 11198 11199 ret = down_read_killable(&fs_info->commit_root_sem); 11200 if (ret) { 11201 mutex_unlock(&fs_info->chunk_mutex); 11202 break; 11203 } 11204 11205 spin_lock(&fs_info->trans_lock); 11206 trans = fs_info->running_transaction; 11207 if (trans) 11208 refcount_inc(&trans->use_count); 11209 spin_unlock(&fs_info->trans_lock); 11210 11211 if (!trans) 11212 up_read(&fs_info->commit_root_sem); 11213 11214 ret = find_free_dev_extent_start(trans, device, minlen, start, 11215 &start, &len); 11216 if (trans) { 11217 up_read(&fs_info->commit_root_sem); 11218 btrfs_put_transaction(trans); 11219 } 11220 11221 if (ret) { 11222 mutex_unlock(&fs_info->chunk_mutex); 11223 if (ret == -ENOSPC) 11224 ret = 0; 11225 break; 11226 } 11227 11228 ret = btrfs_issue_discard(device->bdev, start, len, &bytes); 11229 mutex_unlock(&fs_info->chunk_mutex); 11230 11231 if (ret) 11232 break; 11233 11234 start += len; 11235 *trimmed += bytes; 11236 11237 if (fatal_signal_pending(current)) { 11238 ret = -ERESTARTSYS; 11239 break; 11240 } 11241 11242 cond_resched(); 11243 } 11244 11245 return ret; 11246 } 11247 11248 /* 11249 * Trim the whole filesystem by: 11250 * 1) trimming the free space in each block group 11251 * 2) trimming the unallocated space on each device 11252 * 11253 * This will also continue trimming even if a block group or device encounters 11254 * an error. The return value will be the last error, or 0 if nothing bad 11255 * happens. 11256 */ 11257 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 11258 { 11259 struct btrfs_block_group_cache *cache = NULL; 11260 struct btrfs_device *device; 11261 struct list_head *devices; 11262 u64 group_trimmed; 11263 u64 start; 11264 u64 end; 11265 u64 trimmed = 0; 11266 u64 bg_failed = 0; 11267 u64 dev_failed = 0; 11268 int bg_ret = 0; 11269 int dev_ret = 0; 11270 int ret = 0; 11271 11272 cache = btrfs_lookup_first_block_group(fs_info, range->start); 11273 for (; cache; cache = next_block_group(fs_info, cache)) { 11274 if (cache->key.objectid >= (range->start + range->len)) { 11275 btrfs_put_block_group(cache); 11276 break; 11277 } 11278 11279 start = max(range->start, cache->key.objectid); 11280 end = min(range->start + range->len, 11281 cache->key.objectid + cache->key.offset); 11282 11283 if (end - start >= range->minlen) { 11284 if (!block_group_cache_done(cache)) { 11285 ret = cache_block_group(cache, 0); 11286 if (ret) { 11287 bg_failed++; 11288 bg_ret = ret; 11289 continue; 11290 } 11291 ret = wait_block_group_cache_done(cache); 11292 if (ret) { 11293 bg_failed++; 11294 bg_ret = ret; 11295 continue; 11296 } 11297 } 11298 ret = btrfs_trim_block_group(cache, 11299 &group_trimmed, 11300 start, 11301 end, 11302 range->minlen); 11303 11304 trimmed += group_trimmed; 11305 if (ret) { 11306 bg_failed++; 11307 bg_ret = ret; 11308 continue; 11309 } 11310 } 11311 } 11312 11313 if (bg_failed) 11314 btrfs_warn(fs_info, 11315 "failed to trim %llu block group(s), last error %d", 11316 bg_failed, bg_ret); 11317 mutex_lock(&fs_info->fs_devices->device_list_mutex); 11318 devices = &fs_info->fs_devices->devices; 11319 list_for_each_entry(device, devices, dev_list) { 11320 ret = btrfs_trim_free_extents(device, range->minlen, 11321 
					      &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}

/*
 * btrfs_{start,end}_write_no_snapshotting() are similar to
 * mnt_{want,drop}_write(). They are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted
 * and then flushing it to disk only after the snapshot has been created, and
 * to prevent operations that would make the snapshot inconsistent while it is
 * being taken (writes followed by expanding truncates, for example).
 */
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	cond_wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshotted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshotted)) {
		btrfs_end_write_no_snapshotting(root);
		return 0;
	}
	return 1;
}

void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
	while (true) {
		int ret;

		ret = btrfs_start_write_no_snapshotting(root);
		if (ret)
			break;
		wait_var_event(&root->will_be_snapshotted,
			       !atomic_read(&root->will_be_snapshotted));
	}
}

void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		btrfs_get_block_group(bg);
		trace_btrfs_add_unused_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}
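
/*
 * Illustrative sketch only (not part of the original file): how a writer is
 * expected to pair btrfs_start_write_no_snapshotting() with
 * btrfs_end_write_no_snapshotting(). The function name below is hypothetical
 * and the write itself is elided.
 */
#if 0
static bool example_try_nocow_write(struct btrfs_root *root)
{
	/* Returns 0 when a snapshot is pending; caller falls back to COW. */
	if (!btrfs_start_write_no_snapshotting(root))
		return false;

	/* ... perform the nocow write and dirty the pages here ... */

	/*
	 * Drop the subv_writers count and wake a snapshot creation that may
	 * be waiting for writers to finish.
	 */
	btrfs_end_write_no_snapshotting(root);
	return true;
}
#endif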